Coverage Report

Created: 2026-06-30 06:18

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/vcf.c
Line
Count
Source
1
/*  vcf.c -- VCF/BCF API functions.
2
3
    Copyright (C) 2012, 2013 Broad Institute.
4
    Copyright (C) 2012-2026 Genome Research Ltd.
5
    Portions copyright (C) 2014 Intel Corporation.
6
7
    Author: Heng Li <lh3@sanger.ac.uk>
8
9
Permission is hereby granted, free of charge, to any person obtaining a copy
10
of this software and associated documentation files (the "Software"), to deal
11
in the Software without restriction, including without limitation the rights
12
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13
copies of the Software, and to permit persons to whom the Software is
14
furnished to do so, subject to the following conditions:
15
16
The above copyright notice and this permission notice shall be included in
17
all copies or substantial portions of the Software.
18
19
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25
DEALINGS IN THE SOFTWARE.  */
26
27
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
28
#include <config.h>
29
30
#include <stdio.h>
31
#include <assert.h>
32
#include <string.h>
33
#include <strings.h>
34
#include <stdlib.h>
35
#include <limits.h>
36
#include <stdint.h>
37
#include <inttypes.h>
38
#include <errno.h>
39
40
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
41
#include "fuzz_settings.h"
42
#endif
43
44
#include "htslib/vcf.h"
45
#include "htslib/bgzf.h"
46
#include "htslib/tbx.h"
47
#include "htslib/hfile.h"
48
#include "hts_internal.h"
49
#include "htslib/hts_alloc.h"
50
#include "htslib/hts_endian.h"
51
#include "htslib/khash_str2int.h"
52
#include "htslib/kstring.h"
53
#include "htslib/sam.h"
54
#include "htslib/khash.h"
55
#include "bgzf_internal.h"
56
57
#if 0
58
// This helps on Intel a bit, often 6-7% faster VCF parsing.
59
// Conversely sometimes harms AMD Zen4 as ~9% slower.
60
// Possibly related to IPC differences.  However for now it's just a
61
// curiousity we ignore and stick with the simpler code.
62
//
63
// Left here as a hint for future explorers.
64
static inline int xstreq(const char *a, const char *b) {
65
    while (*a && *a == *b)
66
        a++, b++;
67
    return *a == *b;
68
}
69
70
#define KHASH_MAP_INIT_XSTR(name, khval_t) \
71
  KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, xstreq)
72
73
KHASH_MAP_INIT_XSTR(vdict, bcf_idinfo_t)
74
#else
75
KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t)
76
#endif
77
78
typedef khash_t(vdict) vdict_t;
79
80
KHASH_MAP_INIT_STR(hdict, bcf_hrec_t*)
81
typedef khash_t(hdict) hdict_t;
82
83
84
#include "htslib/kseq.h"
85
HTSLIB_EXPORT
86
uint32_t bcf_float_missing    = 0x7F800001;
87
88
HTSLIB_EXPORT
89
uint32_t bcf_float_vector_end = 0x7F800002;
90
91
HTSLIB_EXPORT
92
uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
93
94
static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, NULL, NULL}, .id = -1 };
95
96
/*
97
    Partial support for 64-bit POS and Number=1 INFO tags.
98
    Notes:
99
     - the support for 64-bit values is motivated by POS and INFO/END for large genomes
100
     - the use of 64-bit values does not conform to the specification
101
     - cannot output 64-bit BCF and if it does, it is not compatible with anything
102
     - experimental, use at your risk
103
*/
104
#ifdef VCF_ALLOW_INT64
105
    #define BCF_MAX_BT_INT64 (0x7fffffffffffffff)       /* INT64_MAX, for internal use only */
106
    #define BCF_MIN_BT_INT64 -9223372036854775800LL     /* INT64_MIN + 8, for internal use only */
107
#endif
108
109
817
#define BCF_IS_64BIT (1<<30)
110
111
112
// Opaque structure with auxilary data which allows to extend bcf_hdr_t without breaking ABI.
113
// Note that this preserving API and ABI requires that the first element is vdict_t struct
114
// rather than a pointer, as user programs may (and in some cases do) access the dictionary
115
// directly as (vdict_t*)hdr->dict.
116
typedef struct
117
{
118
    vdict_t dict;   // bcf_hdr_t.dict[0] vdict_t dictionary which keeps bcf_idinfo_t for BCF_HL_FLT,BCF_HL_INFO,BCF_HL_FMT
119
    hdict_t *gen;   // hdict_t dictionary which keeps bcf_hrec_t* pointers for generic and structured fields
120
    size_t *key_len;// length of h->id[BCF_DT_ID] strings
121
    int version;    //cached version
122
    uint32_t ref_count; // reference count, low bit indicates bcf_hdr_destroy() has been called
123
}
124
bcf_hdr_aux_t;
125
126
static inline bcf_hdr_aux_t *get_hdr_aux(const bcf_hdr_t *hdr)
127
320k
{
128
320k
    return (bcf_hdr_aux_t *)hdr->dict[0];
129
320k
}
130
131
//version macros
132
183k
#define VCF_DEF 4002000
133
43.2k
#define VCF44   4004000
134
37.4k
#define VCF45   4005000
135
136
#define VCF_MAJOR_VER(x) ( (x) / 10000 / 100 )
137
#define VCF_MINOR_VER(x) ( ((x) % 1000000) / 1000 )
138
139
/**
140
 *  bcf_get_version - get the version as int
141
 *  @param hdr   - bcf header, to get version
142
 *  @param verstr- version string, which is already available
143
 *  Returns version on success and default version on failure
144
 *  version = major * 100 * 10000 + minor * 1000
145
 */
146
static int bcf_get_version(const bcf_hdr_t *hdr, const char *verstr)
147
22.0k
{
148
22.0k
    const char *version = NULL, vcf[] = "VCFv";
149
22.0k
    char *major = NULL, *minor = NULL;
150
22.0k
    int ver = -1;
151
22.0k
    long tmp = 0;
152
22.0k
    bcf_hdr_aux_t *aux = NULL;
153
154
22.0k
    if (!hdr && !verstr) {  //invalid input
155
0
        goto fail;
156
0
    }
157
158
22.0k
    if (hdr) {
159
15.6k
        if ((aux = get_hdr_aux(hdr)) && aux->version != 0) {    //use cached version
160
14.5k
            return aux->version;
161
14.5k
        }
162
        //get from header
163
1.08k
        version = bcf_hdr_get_version(hdr);
164
6.38k
    } else {
165
        //get from version string
166
6.38k
        version = verstr;
167
6.38k
    }
168
7.47k
    if (!(major = strstr(version, vcf))) {  //bad format
169
5.63k
        goto fail;
170
5.63k
    }
171
1.84k
    major += sizeof(vcf) - 1;
172
1.84k
    if (!(minor = strchr(major, '.'))) {    //bad format
173
200
        goto fail;
174
200
    }
175
1.64k
    tmp = strtol(major, NULL, 10);
176
1.64k
    if ((!tmp && errno == EINVAL) ||
177
1.57k
        ((tmp == LONG_MIN || tmp == LONG_MAX) && errno == ERANGE)) {    //failed
178
78
        goto fail;
179
78
    }
180
1.56k
    ver = tmp * 100 * 10000;
181
1.56k
    tmp = strtol(++minor, NULL, 10);
182
1.56k
    if ((!tmp && errno == EINVAL) ||
183
1.48k
        ((tmp == LONG_MIN || tmp == LONG_MAX) && errno == ERANGE)) {    //failed
184
350
        goto fail;
185
350
    }
186
1.21k
    ver += tmp * 1000;
187
1.21k
    return ver;
188
189
6.26k
fail:
190
6.26k
    hts_log_warning("Couldn't get VCF version, considering as %d.%d",
191
6.26k
        VCF_MAJOR_VER(VCF_DEF), VCF_MINOR_VER(VCF_DEF));
192
6.26k
    return VCF_DEF;
193
1.56k
}
194
195
// Header reference counting
196
197
static void bcf_hdr_incr_ref(bcf_hdr_t *h)
198
2.76k
{
199
2.76k
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
200
2.76k
    aux->ref_count += 2;
201
2.76k
}
202
203
static void bcf_hdr_decr_ref(bcf_hdr_t *h)
204
2.76k
{
205
2.76k
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
206
2.76k
    if (aux->ref_count >= 2)
207
2.76k
        aux->ref_count -= 2;
208
209
2.76k
    if (aux->ref_count == 0)
210
2.55k
        bcf_hdr_destroy(h);
211
2.76k
}
212
213
static void hdr_bgzf_private_data_cleanup(void *data)
214
2.76k
{
215
2.76k
    bcf_hdr_t *h = (bcf_hdr_t *) data;
216
2.76k
    bcf_hdr_decr_ref(h);
217
2.76k
}
218
219
static char *find_chrom_header_line(char *s)
220
0
{
221
0
    char *nl;
222
0
    if (strncmp(s, "#CHROM\t", 7) == 0) return s;
223
0
    else if ((nl = strstr(s, "\n#CHROM\t")) != NULL) return nl+1;
224
0
    else return NULL;
225
0
}
226
227
static int64_t get_rlen(const bcf_hdr_t *h, bcf1_t *v);
228
229
/*************************
230
 *** VCF header parser ***
231
 *************************/
232
233
static int bcf_hdr_add_sample_len(bcf_hdr_t *h, const char *s, size_t len)
234
7.63k
{
235
7.63k
    const char *ss = s;
236
8.06k
    while ( *ss && isspace_c(*ss) && ss - s < len) ss++;
237
7.63k
    if ( !*ss || ss - s == len)
238
10
    {
239
10
        hts_log_error("Empty sample name: trailing spaces/tabs in the header line?");
240
10
        return -1;
241
10
    }
242
243
7.62k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_SAMPLE];
244
7.62k
    int ret;
245
7.62k
    char *sdup = malloc(len + 1);
246
7.62k
    if (!sdup) return -1;
247
7.62k
    memcpy(sdup, s, len);
248
7.62k
    sdup[len] = 0;
249
250
    // Ensure space is available in h->samples
251
7.62k
    size_t n = kh_size(d);
252
7.62k
    char **new_samples = hts_realloc_ps(h->samples, sizeof(*h->samples), n, 1);
253
7.62k
    if (!new_samples) {
254
0
        free(sdup);
255
0
        return -1;
256
0
    }
257
7.62k
    h->samples = new_samples;
258
259
7.62k
    int k = kh_put(vdict, d, sdup, &ret);
260
7.62k
    if (ret < 0) {
261
0
        free(sdup);
262
0
        return -1;
263
0
    }
264
7.62k
    if (ret) { // absent
265
7.62k
        kh_val(d, k) = bcf_idinfo_def;
266
7.62k
        kh_val(d, k).id = n;
267
7.62k
    } else {
268
6
        hts_log_error("Duplicated sample name '%s'", sdup);
269
6
        free(sdup);
270
6
        return -1;
271
6
    }
272
7.62k
    h->samples[n] = sdup;
273
7.62k
    h->dirty = 1;
274
7.62k
    return 0;
275
7.62k
}
276
277
int bcf_hdr_add_sample(bcf_hdr_t *h, const char *s)
278
0
{
279
0
    if (!s) {
280
        // Allowed for backwards-compatibility, calling with s == NULL
281
        // used to trigger bcf_hdr_sync(h);
282
0
        return 0;
283
0
    }
284
0
    return bcf_hdr_add_sample_len(h, s, strlen(s));
285
0
}
286
287
int HTS_RESULT_USED bcf_hdr_parse_sample_line(bcf_hdr_t *hdr, const char *str)
288
4.89k
{
289
4.89k
    const char *mandatory = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO";
290
4.89k
    if ( strncmp(str,mandatory,strlen(mandatory)) )
291
184
    {
292
184
        hts_log_error("Could not parse the \"#CHROM..\" line, either the fields are incorrect or spaces are present instead of tabs:\n\t%s",str);
293
184
        return -1;
294
184
    }
295
296
4.71k
    const char *beg = str + strlen(mandatory), *end;
297
4.71k
    if ( !*beg || *beg=='\n' ) return 0;
298
1.59k
    if ( strncmp(beg,"\tFORMAT\t",8) )
299
22
    {
300
22
        hts_log_error("Could not parse the \"#CHROM..\" line, either FORMAT is missing or spaces are present instead of tabs:\n\t%s",str);
301
22
        return -1;
302
22
    }
303
1.57k
    beg += 8;
304
305
1.57k
    int ret = 0;
306
7.75k
    while ( *beg )
307
7.63k
    {
308
7.63k
        end = beg;
309
169M
        while ( *end && *end!='\t' && *end!='\n' ) end++;
310
7.63k
        if ( bcf_hdr_add_sample_len(hdr, beg, end-beg) < 0 ) ret = -1;
311
7.63k
        if ( !*end || *end=='\n' || ret<0 ) break;
312
6.18k
        beg = end + 1;
313
6.18k
    }
314
1.57k
    return ret;
315
1.59k
}
316
317
int bcf_hdr_sync(bcf_hdr_t *h)
318
85.2k
{
319
85.2k
    int i;
320
341k
    for (i = 0; i < 3; i++)
321
255k
    {
322
255k
        vdict_t *d = (vdict_t*)h->dict[i];
323
255k
        khint_t k;
324
255k
        if ( h->n[i] < kh_size(d) )
325
1.44k
        {
326
1.44k
            bcf_idpair_t *new_idpair;
327
            // this should be true only for i=2, BCF_DT_SAMPLE
328
1.44k
            new_idpair = hts_realloc_p(h->id[i], sizeof(bcf_idpair_t), kh_size(d));
329
1.44k
            if (!new_idpair) return -1;
330
1.44k
            h->n[i] = kh_size(d);
331
1.44k
            h->id[i] = new_idpair;
332
1.44k
        }
333
2.96G
        for (k=kh_begin(d); k<kh_end(d); k++)
334
2.96G
        {
335
2.96G
            if (!kh_exist(d,k)) continue;
336
22.9M
            h->id[i][kh_val(d,k).id].key = kh_key(d,k);
337
22.9M
            h->id[i][kh_val(d,k).id].val = &kh_val(d,k);
338
22.9M
        }
339
255k
    }
340
341
    // Invalidate key length cache
342
85.2k
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
343
85.2k
    if (aux && aux->key_len) {
344
3.05k
        free(aux->key_len);
345
3.05k
        aux->key_len = NULL;
346
3.05k
    }
347
348
85.2k
    h->dirty = 0;
349
85.2k
    return 0;
350
85.2k
}
351
352
void bcf_hrec_destroy(bcf_hrec_t *hrec)
353
174k
{
354
174k
    if (!hrec) return;
355
168k
    free(hrec->key);
356
168k
    if ( hrec->value ) free(hrec->value);
357
168k
    int i;
358
547k
    for (i=0; i<hrec->nkeys; i++)
359
378k
    {
360
378k
        free(hrec->keys[i]);
361
378k
        free(hrec->vals[i]);
362
378k
    }
363
168k
    free(hrec->keys);
364
168k
    free(hrec->vals);
365
168k
    free(hrec);
366
168k
}
367
368
// Copies all fields except IDX.
369
bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec)
370
0
{
371
0
    int save_errno;
372
0
    bcf_hrec_t *out = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
373
0
    if (!out) return NULL;
374
375
0
    out->type = hrec->type;
376
0
    if ( hrec->key ) {
377
0
        out->key = strdup(hrec->key);
378
0
        if (!out->key) goto fail;
379
0
    }
380
0
    if ( hrec->value ) {
381
0
        out->value = strdup(hrec->value);
382
0
        if (!out->value) goto fail;
383
0
    }
384
0
    out->nkeys = hrec->nkeys;
385
0
    out->keys = hts_malloc_p(sizeof(char*), hrec->nkeys);
386
0
    if (!out->keys) goto fail;
387
0
    out->vals = hts_malloc_p(sizeof(char*), hrec->nkeys);
388
0
    if (!out->vals) goto fail;
389
0
    int i, j = 0;
390
0
    for (i=0; i<hrec->nkeys; i++)
391
0
    {
392
0
        if ( hrec->keys[i] && !strcmp("IDX",hrec->keys[i]) ) continue;
393
0
        if ( hrec->keys[i] ) {
394
0
            out->keys[j] = strdup(hrec->keys[i]);
395
0
            if (!out->keys[j]) goto fail;
396
0
        }
397
0
        if ( hrec->vals[i] ) {
398
0
            out->vals[j] = strdup(hrec->vals[i]);
399
0
            if (!out->vals[j]) goto fail;
400
0
        }
401
0
        j++;
402
0
    }
403
0
    if ( i!=j ) out->nkeys -= i-j;   // IDX was omitted
404
0
    return out;
405
406
0
 fail:
407
0
    save_errno = errno;
408
0
    hts_log_error("%s", strerror(errno));
409
0
    bcf_hrec_destroy(out);
410
0
    errno = save_errno;
411
0
    return NULL;
412
0
}
413
414
void bcf_hrec_debug(FILE *fp, bcf_hrec_t *hrec)
415
0
{
416
0
    fprintf(fp, "key=[%s] value=[%s]", hrec->key, hrec->value?hrec->value:"");
417
0
    int i;
418
0
    for (i=0; i<hrec->nkeys; i++)
419
0
        fprintf(fp, "\t[%s]=[%s]", hrec->keys[i],hrec->vals[i]);
420
0
    fprintf(fp, "\n");
421
0
}
422
423
void bcf_header_debug(bcf_hdr_t *hdr)
424
0
{
425
0
    int i, j;
426
0
    for (i=0; i<hdr->nhrec; i++)
427
0
    {
428
0
        if ( !hdr->hrec[i]->value )
429
0
        {
430
0
            fprintf(stderr, "##%s=<", hdr->hrec[i]->key);
431
0
            fprintf(stderr,"%s=%s", hdr->hrec[i]->keys[0], hdr->hrec[i]->vals[0]);
432
0
            for (j=1; j<hdr->hrec[i]->nkeys; j++)
433
0
                fprintf(stderr,",%s=%s", hdr->hrec[i]->keys[j], hdr->hrec[i]->vals[j]);
434
0
            fprintf(stderr,">\n");
435
0
        }
436
0
        else
437
0
            fprintf(stderr,"##%s=%s\n", hdr->hrec[i]->key,hdr->hrec[i]->value);
438
0
    }
439
0
}
440
441
int bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, size_t len)
442
288k
{
443
288k
    char **tmp;
444
288k
    size_t n = hrec->nkeys + 1;
445
288k
    assert(len > 0 && len < SIZE_MAX);
446
288k
    tmp = hts_realloc_p(hrec->keys, sizeof(char*), n);
447
288k
    if (!tmp) return -1;
448
288k
    hrec->keys = tmp;
449
288k
    tmp = hts_realloc_p(hrec->vals, sizeof(char*), n);
450
288k
    if (!tmp) return -1;
451
288k
    hrec->vals = tmp;
452
453
288k
    hrec->keys[hrec->nkeys] = hts_malloc_ps(sizeof(char), len, 1);
454
288k
    if (!hrec->keys[hrec->nkeys]) return -1;
455
288k
    memcpy(hrec->keys[hrec->nkeys],str,len);
456
288k
    hrec->keys[hrec->nkeys][len] = 0;
457
288k
    hrec->vals[hrec->nkeys] = NULL;
458
288k
    hrec->nkeys = n;
459
288k
    return 0;
460
288k
}
461
462
int bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, size_t len, int is_quoted)
463
288k
{
464
288k
    if ( hrec->vals[i] ) {
465
0
        free(hrec->vals[i]);
466
0
        hrec->vals[i] = NULL;
467
0
    }
468
288k
    if ( !str ) return 0;
469
288k
    if ( is_quoted )
470
84.1k
    {
471
84.1k
        if (len >= SIZE_MAX - 3) {
472
0
            errno = ENOMEM;
473
0
            return -1;
474
0
        }
475
84.1k
        hrec->vals[i] = hts_malloc_ps(sizeof(char), len, 3);
476
84.1k
        if (!hrec->vals[i]) return -1;
477
84.1k
        hrec->vals[i][0] = '"';
478
84.1k
        memcpy(&hrec->vals[i][1],str,len);
479
84.1k
        hrec->vals[i][len+1] = '"';
480
84.1k
        hrec->vals[i][len+2] = 0;
481
84.1k
    }
482
204k
    else
483
204k
    {
484
204k
        if (len == SIZE_MAX) {
485
0
            errno = ENOMEM;
486
0
            return -1;
487
0
        }
488
204k
        hrec->vals[i] = hts_malloc_ps(sizeof(char), len, 1);
489
204k
        if (!hrec->vals[i]) return -1;
490
204k
        memcpy(hrec->vals[i],str,len);
491
204k
        hrec->vals[i][len] = 0;
492
204k
    }
493
288k
    return 0;
494
288k
}
495
496
int hrec_add_idx(bcf_hrec_t *hrec, int idx)
497
90.1k
{
498
90.1k
    int n = hrec->nkeys + 1;
499
90.1k
    char **tmp = hts_realloc_p(hrec->keys, sizeof(char*), n);
500
90.1k
    if (!tmp) return -1;
501
90.1k
    hrec->keys = tmp;
502
503
90.1k
    tmp = hts_realloc_p(hrec->vals, sizeof(char*), n);
504
90.1k
    if (!tmp) return -1;
505
90.1k
    hrec->vals = tmp;
506
507
90.1k
    hrec->keys[hrec->nkeys] = strdup("IDX");
508
90.1k
    if (!hrec->keys[hrec->nkeys]) return -1;
509
510
90.1k
    kstring_t str = {0,0,0};
511
90.1k
    if (kputw(idx, &str) < 0) {
512
0
        free(hrec->keys[hrec->nkeys]);
513
0
        return -1;
514
0
    }
515
90.1k
    hrec->vals[hrec->nkeys] = str.s;
516
90.1k
    hrec->nkeys = n;
517
90.1k
    return 0;
518
90.1k
}
519
520
int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key)
521
114k
{
522
114k
    int i;
523
175k
    for (i=0; i<hrec->nkeys; i++)
524
129k
        if ( !strcasecmp(key,hrec->keys[i]) ) return i;
525
45.6k
    return -1;
526
114k
}
527
528
static void bcf_hrec_set_type(bcf_hrec_t *hrec)
529
316k
{
530
316k
    if ( !strcmp(hrec->key, "contig") ) hrec->type = BCF_HL_CTG;
531
286k
    else if ( !strcmp(hrec->key, "INFO") ) hrec->type = BCF_HL_INFO;
532
193k
    else if ( !strcmp(hrec->key, "FILTER") ) hrec->type = BCF_HL_FLT;
533
99.2k
    else if ( !strcmp(hrec->key, "FORMAT") ) hrec->type = BCF_HL_FMT;
534
79.1k
    else if ( hrec->nkeys>0 ) hrec->type = BCF_HL_STR;
535
63.1k
    else hrec->type = BCF_HL_GEN;
536
316k
}
537
538
539
/**
540
    The arrays were generated with
541
542
    valid_ctg:
543
        perl -le '@v = (split(//,q[!#$%&*+./:;=?@^_|~-]),"a"..."z","A"..."Z","0"..."9"); @a = (0) x 256; foreach $c (@v) { $a[ord($c)] = 1; } print join(", ",@a)' | fold -w 48
544
545
    valid_tag:
546
        perl -le '@v = (split(//,q[_.]),"a"..."z","A"..."Z","0"..."9"); @a = (0) x 256; foreach $c (@v) { $a[ord($c)] = 1; } print join(", ",@a)' | fold -w 48
547
*/
548
static const uint8_t valid_ctg[256] =
549
{
550
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
551
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
552
    0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1,
553
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
554
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
555
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
556
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
557
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,
558
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
559
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
560
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
561
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
562
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
563
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
564
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
565
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
566
};
567
static const uint8_t valid_tag[256] =
568
{
569
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
570
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
571
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
572
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
573
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
574
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
575
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
576
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
577
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
578
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
579
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
580
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
581
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
582
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
583
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
584
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
585
};
586
587
/**
588
    bcf_hrec_check() - check the validity of structured header lines
589
590
    Returns 0 on success or negative value on error.
591
592
    Currently the return status is not checked by the caller
593
    and only a warning is printed on stderr. This should be improved
594
    to propagate the error all the way up to the caller and let it
595
    decide what to do: throw an error or proceed anyway.
596
 */
597
static int bcf_hrec_check(bcf_hrec_t *hrec)
598
158k
{
599
158k
    int i;
600
158k
    bcf_hrec_set_type(hrec);
601
602
158k
    if ( hrec->type==BCF_HL_CTG )
603
14.7k
    {
604
14.7k
        i = bcf_hrec_find_key(hrec,"ID");
605
14.7k
        if ( i<0 ) goto err_missing_id;
606
10.1k
        char *val = hrec->vals[i];
607
10.1k
        if ( val[0]=='*' || val[0]=='=' || !valid_ctg[(uint8_t)val[0]] ) goto err_invalid_ctg;
608
55.2k
        while ( *(++val) )
609
54.9k
            if ( !valid_ctg[(uint8_t)*val] ) goto err_invalid_ctg;
610
353
        return 0;
611
1.48k
    }
612
143k
    if ( hrec->type==BCF_HL_INFO )
613
46.3k
    {
614
46.3k
        i = bcf_hrec_find_key(hrec,"ID");
615
46.3k
        if ( i<0 ) goto err_missing_id;
616
32.5k
        char *val = hrec->vals[i];
617
32.5k
        if ( !strcmp(val,"1000G") ) return 0;
618
32.5k
        if ( val[0]=='.' || (val[0]>='0' && val[0]<='9') || !valid_tag[(uint8_t)val[0]] ) goto err_invalid_tag;
619
15.3k
        while ( *(++val) )
620
13.3k
            if ( !valid_tag[(uint8_t)*val] ) goto err_invalid_tag;
621
1.99k
        return 0;
622
3.95k
    }
623
96.9k
    if ( hrec->type==BCF_HL_FMT )
624
10.0k
    {
625
10.0k
        i = bcf_hrec_find_key(hrec,"ID");
626
10.0k
        if ( i<0 ) goto err_missing_id;
627
8.82k
        char *val = hrec->vals[i];
628
8.82k
        if ( val[0]=='.' || (val[0]>='0' && val[0]<='9') || !valid_tag[(uint8_t)val[0]] ) goto err_invalid_tag;
629
10.3k
        while ( *(++val) )
630
7.84k
            if ( !valid_tag[(uint8_t)*val] ) goto err_invalid_tag;
631
2.47k
        return 0;
632
4.37k
    }
633
86.9k
    return 0;
634
635
19.6k
  err_missing_id:
636
19.6k
    hts_log_warning("Missing ID attribute in one or more header lines");
637
19.6k
    return -1;
638
639
9.76k
  err_invalid_ctg:
640
9.76k
    hts_log_warning("Invalid contig name: \"%s\"", hrec->vals[i]);
641
9.76k
    return -1;
642
643
36.8k
  err_invalid_tag:
644
36.8k
    hts_log_warning("Invalid tag name: \"%s\"", hrec->vals[i]);
645
36.8k
    return -1;
646
96.9k
}
647
648
static inline int is_escaped(const char *min, const char *str)
649
84.2k
{
650
84.2k
    int n = 0;
651
84.5k
    while ( --str>=min && *str=='\\' ) n++;
652
84.2k
    return n%2;
653
84.2k
}
654
655
bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len)
656
182k
{
657
182k
    bcf_hrec_t *hrec = NULL;
658
182k
    const char *p = line;
659
182k
    if (p[0] != '#' || p[1] != '#') { *len = 0; return NULL; }
660
174k
    p += 2;
661
662
174k
    const char *q = p;
663
1.61M
    while ( *q && *q!='=' && *q != '\n' ) q++;
664
174k
    ptrdiff_t n = q-p;
665
174k
    if ( *q!='=' || !n ) // wrong format
666
5.91k
        goto malformed_line;
667
668
168k
    hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
669
168k
    if (!hrec) { *len = -1; return NULL; }
670
168k
    hrec->key = hts_malloc_ps(sizeof(char), n, 1);
671
168k
    if (!hrec->key) goto fail;
672
168k
    memcpy(hrec->key,p,n);
673
168k
    hrec->key[n] = 0;
674
168k
    hrec->type = -1;
675
676
168k
    p = ++q;
677
168k
    if ( *p!='<' ) // generic field, e.g. ##samtoolsVersion=0.1.18-r579
678
37.1k
    {
679
8.83M
        while ( *q && *q!='\n' ) q++;
680
37.1k
        hrec->value = hts_malloc_p(sizeof(char), (q-p+1));
681
37.1k
        if (!hrec->value) goto fail;
682
37.1k
        memcpy(hrec->value, p, q-p);
683
37.1k
        hrec->value[q-p] = 0;
684
37.1k
        *len = q - line + (*q ? 1 : 0); // Skip \n but not \0
685
37.1k
        return hrec;
686
37.1k
    }
687
688
    // structured line, e.g.
689
    // ##INFO=<ID=PV1,Number=1,Type=Float,Description="P-value for baseQ bias">
690
    // ##PEDIGREE=<Name_0=G0-ID,Name_1=G1-ID,Name_3=GN-ID>
691
131k
    int nopen = 1;
692
420k
    while ( *q && *q!='\n' && nopen>0 )
693
299k
    {
694
299k
        p = ++q;
695
302k
        while ( *q && *q==' ' ) { p++; q++; }
696
        // ^[A-Za-z_][0-9A-Za-z_.]*$
697
299k
        if (p==q && *q && (isalpha_c(*q) || *q=='_'))
698
295k
        {
699
295k
            q++;
700
1.72M
            while ( *q && (isalnum_c(*q) || *q=='_' || *q=='.') ) q++;
701
295k
        }
702
299k
        n = q-p;
703
299k
        int m = 0;
704
299k
        while ( *q && *q==' ' ) { q++; m++; }
705
299k
        if ( *q!='=' || !n )
706
10.4k
            goto malformed_line;
707
708
288k
        if (bcf_hrec_add_key(hrec, p, q-p-m) < 0) goto fail;
709
288k
        p = ++q;
710
291k
        while ( *q && *q==' ' ) { p++; q++; }
711
712
288k
        int quoted = 0;
713
288k
        char ending = '\0';
714
288k
        switch (*p) {
715
84.1k
        case '"':
716
84.1k
            quoted = 1;
717
84.1k
            ending = '"';
718
84.1k
            p++;
719
84.1k
            break;
720
285
        case '[':
721
285
            quoted = 1;
722
285
            ending = ']';
723
285
            break;
724
288k
        }
725
288k
        if ( quoted ) q++;
726
220M
        while ( *q && *q != '\n' )
727
220M
        {
728
220M
            if ( quoted ) { if ( *q==ending && !is_escaped(p,q) ) break; }
729
219M
            else
730
219M
            {
731
219M
                if ( *q=='<' ) nopen++;
732
219M
                if ( *q=='>' ) nopen--;
733
219M
                if ( !nopen ) break;
734
219M
                if ( *q==',' && nopen==1 ) break;
735
219M
            }
736
220M
            q++;
737
220M
        }
738
288k
        const char *r = q;
739
288k
        if (quoted && ending == ']') {
740
285
            if (*q == ending) {
741
230
                r++;
742
230
                q++;
743
230
                quoted = 0;
744
230
            } else {
745
55
                char buffer[320];
746
55
                hts_log_error("Missing ']' in header line %s",
747
55
                              hts_strprint(buffer, sizeof(buffer), '"',
748
55
                                           line, q-line));
749
55
                goto fail;
750
55
            }
751
285
        }
752
289k
        while ( r > p && r[-1] == ' ' ) r--;
753
288k
        if (bcf_hrec_set_val(hrec, hrec->nkeys-1, p, r-p, quoted) < 0)
754
0
            goto fail;
755
288k
        if ( quoted && *q==ending ) q++;
756
288k
        if ( *q=='>' )
757
93.0k
        {
758
93.0k
            if (nopen) nopen--;     // this can happen with nested angle brackets <>
759
93.0k
            q++;
760
93.0k
        }
761
288k
    }
762
120k
    if ( nopen )
763
27.8k
        hts_log_warning("Incomplete header line, trying to proceed anyway:\n\t[%s]\n\t[%d]",line,q[0]);
764
765
    // Skip to end of line
766
120k
    int nonspace = 0;
767
120k
    p = q;
768
998k
    while ( *q && *q!='\n' ) { nonspace |= !isspace_c(*q); q++; }
769
120k
    if (nonspace) {
770
1.17k
        char buffer[320];
771
1.17k
        hts_log_warning("Dropped trailing junk from header line '%s'",
772
1.17k
                        hts_strprint(buffer, sizeof(buffer),
773
1.17k
                                     '"', line, q - line));
774
1.17k
    }
775
776
120k
    *len = q - line + (*q ? 1 : 0);
777
120k
    return hrec;
778
779
55
 fail:
780
55
    *len = -1;
781
55
    bcf_hrec_destroy(hrec);
782
55
    return NULL;
783
784
16.3k
 malformed_line:
785
16.3k
    {
786
16.3k
        char buffer[320];
787
6.80M
        while ( *q && *q!='\n' ) q++;  // Ensure *len includes full line
788
16.3k
        hts_log_error("Could not parse the header line: %s",
789
16.3k
                      hts_strprint(buffer, sizeof(buffer),
790
16.3k
                                   '"', line, q - line));
791
16.3k
        *len = q - line + (*q ? 1 : 0);
792
16.3k
        bcf_hrec_destroy(hrec);
793
16.3k
        return NULL;
794
131k
    }
795
131k
}
796
797
static int bcf_hdr_set_idx(bcf_hdr_t *hdr, int dict_type, const char *tag, bcf_idinfo_t *idinfo)
798
89.1k
{
799
89.1k
    size_t new_n;
800
801
    // If available, preserve existing IDX
802
89.1k
    if ( idinfo->id==-1 )
803
88.5k
        idinfo->id = hdr->n[dict_type];
804
555
    else if ( idinfo->id < hdr->n[dict_type] && hdr->id[dict_type][idinfo->id].key )
805
9
    {
806
9
        hts_log_error("Conflicting IDX=%d lines in the header dictionary, the new tag is %s",
807
9
            idinfo->id, tag);
808
9
        errno = EINVAL;
809
9
        return -1;
810
9
    }
811
812
89.0k
    new_n = idinfo->id >= hdr->n[dict_type] ? idinfo->id+1 : hdr->n[dict_type];
813
89.0k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
814
    // hts_resize() can attempt to allocate up to 2 * requested items
815
89.0k
    if (new_n > FUZZ_ALLOC_LIMIT/(2 * sizeof(bcf_idpair_t)))
816
28
        return -1;
817
89.0k
#endif
818
89.0k
    if (hts_resize(bcf_idpair_t, new_n, &hdr->m[dict_type],
819
89.0k
                   &hdr->id[dict_type], HTS_RESIZE_CLEAR)) {
820
0
        return -1;
821
0
    }
822
89.0k
    hdr->n[dict_type] = new_n;
823
824
    // NB: the next kh_put call can invalidate the idinfo pointer, therefore
825
    // we leave it unassigned here. It must be set explicitly in bcf_hdr_sync.
826
89.0k
    hdr->id[dict_type][idinfo->id].key = tag;
827
828
89.0k
    return 0;
829
89.0k
}
830
831
// returns: 1 when hdr needs to be synced, -1 on error, 0 otherwise
832
static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
833
158k
{
834
    // contig
835
158k
    int i, ret, replacing = 0;
836
158k
    khint_t k;
837
158k
    char *str = NULL;
838
839
158k
    bcf_hrec_set_type(hrec);
840
841
158k
    if ( hrec->type==BCF_HL_CTG )
842
14.7k
    {
843
14.7k
        hts_pos_t len = 0;
844
845
        // Get the contig ID ($str) and length ($j)
846
14.7k
        i = bcf_hrec_find_key(hrec,"length");
847
14.7k
        if ( i<0 ) len = 0;
848
1.68k
        else {
849
1.68k
            char *end = hrec->vals[i];
850
1.68k
            len = strtoll(hrec->vals[i], &end, 10);
851
1.68k
            if (end == hrec->vals[i] || len < 0) return 0;
852
1.68k
        }
853
854
14.1k
        i = bcf_hrec_find_key(hrec,"ID");
855
14.1k
        if ( i<0 ) return 0;
856
10.1k
        str = strdup(hrec->vals[i]);
857
10.1k
        if (!str) return -1;
858
859
        // Register in the dictionary
860
10.1k
        vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_CTG];
861
10.1k
        khint_t k = kh_get(vdict, d, str);
862
10.1k
        if ( k != kh_end(d) ) { // already present
863
3.52k
            free(str); str=NULL;
864
3.52k
            if (kh_val(d, k).hrec[0] != NULL) // and not removed
865
3.52k
                return 0;
866
0
            replacing = 1;
867
6.60k
        } else {
868
6.60k
            k = kh_put(vdict, d, str, &ret);
869
6.60k
            if (ret < 0) { free(str); return -1; }
870
6.60k
        }
871
872
6.60k
        int idx = bcf_hrec_find_key(hrec,"IDX");
873
6.60k
        if ( idx!=-1 )
874
1.24k
        {
875
1.24k
            char *tmp = hrec->vals[idx];
876
1.24k
            idx = strtol(hrec->vals[idx], &tmp, 10);
877
1.24k
            if ( *tmp || idx < 0 || idx >= INT_MAX - 1)
878
1.18k
            {
879
1.18k
                if (!replacing) {
880
1.18k
                    kh_del(vdict, d, k);
881
1.18k
                    free(str);
882
1.18k
                }
883
1.18k
                hts_log_warning("Error parsing the IDX tag, skipping");
884
1.18k
                return 0;
885
1.18k
            }
886
1.24k
        }
887
888
5.41k
        kh_val(d, k) = bcf_idinfo_def;
889
5.41k
        kh_val(d, k).id = idx;
890
5.41k
        kh_val(d, k).info[0] = len;
891
5.41k
        kh_val(d, k).hrec[0] = hrec;
892
5.41k
        if (bcf_hdr_set_idx(hdr, BCF_DT_CTG, kh_key(d,k), &kh_val(d,k)) < 0) {
893
21
            if (!replacing) {
894
21
                kh_del(vdict, d, k);
895
21
                free(str);
896
21
            }
897
21
            return -1;
898
21
        }
899
5.39k
        if ( idx==-1 ) {
900
5.36k
            if (hrec_add_idx(hrec, kh_val(d,k).id) < 0) {
901
0
               return -1;
902
0
            }
903
5.36k
        }
904
905
5.39k
        return 1;
906
5.39k
    }
907
908
143k
    if ( hrec->type==BCF_HL_STR ) return 1;
909
135k
    if ( hrec->type!=BCF_HL_INFO && hrec->type!=BCF_HL_FLT && hrec->type!=BCF_HL_FMT ) return 0;
910
911
    // INFO/FILTER/FORMAT
912
103k
    char *id = NULL;
913
103k
    uint32_t type = UINT32_MAX, var = UINT32_MAX;
914
103k
    int num = -1, idx = -1;
915
361k
    for (i=0; i<hrec->nkeys; i++)
916
258k
    {
917
258k
        if ( !strcmp(hrec->keys[i], "ID") ) id = hrec->vals[i];
918
169k
        else if ( !strcmp(hrec->keys[i], "IDX") )
919
1.24k
        {
920
1.24k
            char *tmp = hrec->vals[i];
921
1.24k
            idx = strtol(hrec->vals[i], &tmp, 10);
922
1.24k
            if ( *tmp || idx < 0 || idx >= INT_MAX - 1)
923
264
            {
924
264
                hts_log_warning("Error parsing the IDX tag, skipping");
925
264
                return 0;
926
264
            }
927
1.24k
        }
928
168k
        else if ( !strcmp(hrec->keys[i], "Type") )
929
42.3k
        {
930
42.3k
            if ( !strcmp(hrec->vals[i], "Integer") ) type = BCF_HT_INT;
931
40.6k
            else if ( !strcmp(hrec->vals[i], "Float") ) type = BCF_HT_REAL;
932
39.7k
            else if ( !strcmp(hrec->vals[i], "String") ) type = BCF_HT_STR;
933
5.17k
            else if ( !strcmp(hrec->vals[i], "Character") ) type = BCF_HT_STR;
934
5.01k
            else if ( !strcmp(hrec->vals[i], "Flag") ) type = BCF_HT_FLAG;
935
3.84k
            else
936
3.84k
            {
937
3.84k
                hts_log_warning("The type \"%s\" is not supported, assuming \"String\"", hrec->vals[i]);
938
3.84k
                type = BCF_HT_STR;
939
3.84k
            }
940
42.3k
        }
941
126k
        else if ( !strcmp(hrec->keys[i], "Number") )
942
39.4k
        {
943
39.4k
            int is_fmt = hrec->type == BCF_HL_FMT;
944
39.4k
            if ( !strcmp(hrec->vals[i],"A") ) var = BCF_VL_A;
945
36.7k
            else if ( !strcmp(hrec->vals[i],"R") ) var = BCF_VL_R;
946
36.6k
            else if ( !strcmp(hrec->vals[i],"G") ) var = BCF_VL_G;
947
35.7k
            else if ( !strcmp(hrec->vals[i],".") ) var = BCF_VL_VAR;
948
35.7k
            else if ( is_fmt && !strcmp(hrec->vals[i],"P") )  var = BCF_VL_P;
949
35.7k
            else if ( is_fmt && !strcmp(hrec->vals[i],"LA") ) var = BCF_VL_LA;
950
35.7k
            else if ( is_fmt && !strcmp(hrec->vals[i],"LR") ) var = BCF_VL_LR;
951
35.7k
            else if ( is_fmt && !strcmp(hrec->vals[i],"LG") ) var = BCF_VL_LG;
952
35.7k
            else if ( is_fmt && !strcmp(hrec->vals[i],"M") )  var = BCF_VL_M;
953
35.7k
            else
954
35.7k
            {
955
35.7k
                if (sscanf(hrec->vals[i],"%d",&num) == 1)
956
35.2k
                    var = BCF_VL_FIXED;
957
35.7k
            }
958
39.4k
            if (var != BCF_VL_FIXED) num = 0xfffff;
959
39.4k
        }
960
258k
    }
961
103k
    if (hrec->type == BCF_HL_INFO || hrec->type == BCF_HL_FMT) {
962
56.1k
        if (type == -1) {
963
14.7k
            hts_log_warning("%s %s field has no Type defined. Assuming String",
964
14.7k
                *hrec->key == 'I' ? "An" : "A", hrec->key);
965
14.7k
            type = BCF_HT_STR;
966
14.7k
        }
967
56.1k
        if (var == UINT32_MAX) {
968
17.2k
            hts_log_warning("%s %s field has no Number defined. Assuming '.'",
969
17.2k
                *hrec->key == 'I' ? "An" : "A", hrec->key);
970
17.2k
            var = BCF_VL_VAR;
971
17.2k
        }
972
56.1k
        if ( type==BCF_HT_FLAG && (var!=BCF_VL_FIXED || num!=0) )
973
856
        {
974
856
            hts_log_warning("The definition of Flag \"%s/%s\" is invalid, forcing Number=0", hrec->key,id);
975
856
            var = BCF_VL_FIXED;
976
856
            num = 0;
977
856
        }
978
56.1k
    }
979
103k
    uint32_t info = ((((uint32_t)num) & 0xfffff)<<12 |
980
103k
                     (var & 0xf) << 8 |
981
103k
                     (type & 0xf) << 4 |
982
103k
                     (((uint32_t) hrec->type) & 0xf));
983
984
103k
    if ( !id ) return 0;
985
88.5k
    str = strdup(id);
986
88.5k
    if (!str) return -1;
987
988
88.5k
    vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_ID];
989
88.5k
    k = kh_get(vdict, d, str);
990
88.5k
    if ( k != kh_end(d) )
991
4.83k
    {
992
        // already present
993
4.83k
        free(str);
994
4.83k
        if ( kh_val(d, k).hrec[info&0xf] ) return 0;
995
1.58k
        kh_val(d, k).info[info&0xf] = info;
996
1.58k
        kh_val(d, k).hrec[info&0xf] = hrec;
997
1.58k
        if ( idx==-1 ) {
998
1.58k
            if (hrec_add_idx(hrec, kh_val(d, k).id) < 0) {
999
0
                return -1;
1000
0
            }
1001
1.58k
        }
1002
1.58k
        return 1;
1003
1.58k
    }
1004
83.6k
    k = kh_put(vdict, d, str, &ret);
1005
83.6k
    if (ret < 0) {
1006
0
        free(str);
1007
0
        return -1;
1008
0
    }
1009
83.6k
    kh_val(d, k) = bcf_idinfo_def;
1010
83.6k
    kh_val(d, k).info[info&0xf] = info;
1011
83.6k
    kh_val(d, k).hrec[info&0xf] = hrec;
1012
83.6k
    kh_val(d, k).id = idx;
1013
83.6k
    if (bcf_hdr_set_idx(hdr, BCF_DT_ID, kh_key(d,k), &kh_val(d,k)) < 0) {
1014
16
        kh_del(vdict, d, k);
1015
16
        free(str);
1016
16
        return -1;
1017
16
    }
1018
83.6k
    if ( idx==-1 ) {
1019
83.1k
        if (hrec_add_idx(hrec, kh_val(d,k).id) < 0) {
1020
0
            return -1;
1021
0
        }
1022
83.1k
    }
1023
1024
83.6k
    return 1;
1025
83.6k
}
1026
1027
static void bcf_hdr_unregister_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
1028
0
{
1029
0
    if (hrec->type == BCF_HL_FLT ||
1030
0
        hrec->type == BCF_HL_INFO ||
1031
0
        hrec->type == BCF_HL_FMT ||
1032
0
        hrec->type == BCF_HL_CTG) {
1033
0
        int id = bcf_hrec_find_key(hrec, "ID");
1034
0
        if (id < 0 || !hrec->vals[id])
1035
0
            return;
1036
0
        vdict_t *dict = (hrec->type == BCF_HL_CTG
1037
0
                         ? (vdict_t*)hdr->dict[BCF_DT_CTG]
1038
0
                         : (vdict_t*)hdr->dict[BCF_DT_ID]);
1039
0
        khint_t k = kh_get(vdict, dict, hrec->vals[id]);
1040
0
        if (k != kh_end(dict))
1041
0
            kh_val(dict, k).hrec[hrec->type==BCF_HL_CTG ? 0 : hrec->type] = NULL;
1042
0
    }
1043
0
}
1044
1045
static void bcf_hdr_remove_from_hdict(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
1046
0
{
1047
0
    kstring_t str = KS_INITIALIZE;
1048
0
    bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1049
0
    khint_t k;
1050
0
    int id;
1051
1052
0
    switch (hrec->type) {
1053
0
    case BCF_HL_GEN:
1054
0
        if (ksprintf(&str, "##%s=%s", hrec->key,hrec->value) < 0)
1055
0
            str.l = 0;
1056
0
        break;
1057
0
    case BCF_HL_STR:
1058
0
        id = bcf_hrec_find_key(hrec, "ID");
1059
0
        if (id < 0)
1060
0
            return;
1061
0
        if (!hrec->vals[id] ||
1062
0
            ksprintf(&str, "##%s=<ID=%s>", hrec->key, hrec->vals[id]) < 0)
1063
0
            str.l = 0;
1064
0
        break;
1065
0
    default:
1066
0
        return;
1067
0
    }
1068
0
    if (str.l) {
1069
0
        k = kh_get(hdict, aux->gen, str.s);
1070
0
    } else {
1071
        // Couldn't get a string for some reason, so try the hard way...
1072
0
        for (k = kh_begin(aux->gen); k < kh_end(aux->gen); k++) {
1073
0
            if (kh_exist(aux->gen, k) && kh_val(aux->gen, k) == hrec)
1074
0
                break;
1075
0
        }
1076
0
    }
1077
0
    if (k != kh_end(aux->gen) && kh_val(aux->gen, k) == hrec) {
1078
0
        kh_val(aux->gen, k) = NULL;
1079
0
        free((char *) kh_key(aux->gen, k));
1080
0
        kh_key(aux->gen, k) = NULL;
1081
0
        kh_del(hdict, aux->gen, k);
1082
0
    }
1083
0
    free(str.s);
1084
0
}
1085
1086
int bcf_hdr_update_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec, const bcf_hrec_t *tmp)
1087
0
{
1088
0
    assert( hrec->type==BCF_HL_GEN );
1089
0
    int ret;
1090
0
    khint_t k;
1091
0
    bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1092
0
    for (k=kh_begin(aux->gen); k<kh_end(aux->gen); k++)
1093
0
    {
1094
0
        if ( !kh_exist(aux->gen,k) ) continue;
1095
0
        if ( hrec!=(bcf_hrec_t*)kh_val(aux->gen,k) ) continue;
1096
0
        break;
1097
0
    }
1098
0
    assert( k<kh_end(aux->gen) );   // something went wrong, should never happen
1099
0
    free((char*)kh_key(aux->gen,k));
1100
0
    kh_del(hdict,aux->gen,k);
1101
0
    kstring_t str = {0,0,0};
1102
0
    if ( ksprintf(&str, "##%s=%s", tmp->key,tmp->value) < 0 )
1103
0
    {
1104
0
        free(str.s);
1105
0
        return -1;
1106
0
    }
1107
0
    k = kh_put(hdict, aux->gen, str.s, &ret);
1108
0
    if ( ret<0 )
1109
0
    {
1110
0
        free(str.s);
1111
0
        return -1;
1112
0
    }
1113
0
    free(hrec->value);
1114
0
    hrec->value = strdup(tmp->value);
1115
0
    if ( !hrec->value ) return -1;
1116
0
    kh_val(aux->gen,k) = hrec;
1117
1118
0
    if (!strcmp(hrec->key,"fileformat")) {
1119
        //update version
1120
0
        get_hdr_aux(hdr)->version = bcf_get_version(NULL, hrec->value);
1121
0
    }
1122
0
    return 0;
1123
0
}
1124
1125
int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
1126
158k
{
1127
158k
    kstring_t str = {0,0,0};
1128
158k
    bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1129
1130
158k
    int res;
1131
158k
    if ( !hrec ) return 0;
1132
1133
158k
    bcf_hrec_check(hrec);   // todo: check return status and propagate errors up
1134
1135
158k
    res = bcf_hdr_register_hrec(hdr,hrec);
1136
158k
    if (res < 0) return -1;
1137
158k
    if ( !res )
1138
59.3k
    {
1139
        // If one of the hashed field, then it is already present
1140
59.3k
        if ( hrec->type != BCF_HL_GEN )
1141
27.8k
        {
1142
27.8k
            bcf_hrec_destroy(hrec);
1143
27.8k
            return 0;
1144
27.8k
        }
1145
        // Is one of the generic fields and already present?
1146
31.5k
        if ( ksprintf(&str, "##%s=%s", hrec->key,hrec->value) < 0 )
1147
0
        {
1148
0
            free(str.s);
1149
0
            return -1;
1150
0
        }
1151
31.5k
        khint_t k = kh_get(hdict, aux->gen, str.s);
1152
31.5k
        if ( k != kh_end(aux->gen) )
1153
18.2k
        {
1154
            // duplicate record
1155
18.2k
            bcf_hrec_destroy(hrec);
1156
18.2k
            free(str.s);
1157
18.2k
            return 0;
1158
18.2k
        }
1159
13.3k
        if (!strcmp(hrec->key, "fileformat")) {
1160
6.38k
            aux->version = bcf_get_version(NULL, hrec->value);
1161
6.38k
        }
1162
13.3k
    }
1163
1164
111k
    int i;
1165
111k
    if ( hrec->type==BCF_HL_STR && (i=bcf_hrec_find_key(hrec,"ID"))>=0 )
1166
4.46k
    {
1167
4.46k
        if ( ksprintf(&str, "##%s=<ID=%s>", hrec->key,hrec->vals[i]) < 0 )
1168
0
        {
1169
0
            free(str.s);
1170
0
            return -1;
1171
0
        }
1172
4.46k
        khint_t k = kh_get(hdict, aux->gen, str.s);
1173
4.46k
        if ( k != kh_end(aux->gen) )
1174
3.20k
        {
1175
            // duplicate record
1176
3.20k
            bcf_hrec_destroy(hrec);
1177
3.20k
            free(str.s);
1178
3.20k
            return 0;
1179
3.20k
        }
1180
4.46k
    }
1181
1182
    // New record, needs to be added
1183
108k
    int n = hdr->nhrec + 1;
1184
108k
    bcf_hrec_t **new_hrec = hts_realloc_p(hdr->hrec, sizeof(bcf_hrec_t*), n);
1185
108k
    if (!new_hrec) {
1186
0
        free(str.s);
1187
0
        bcf_hdr_unregister_hrec(hdr, hrec);
1188
0
        return -1;
1189
0
    }
1190
108k
    hdr->hrec = new_hrec;
1191
1192
108k
    if ( str.s )
1193
14.5k
    {
1194
14.5k
        khint_t k = kh_put(hdict, aux->gen, str.s, &res);
1195
14.5k
        if ( res<0 )
1196
0
        {
1197
0
            free(str.s);
1198
0
            return -1;
1199
0
        }
1200
14.5k
        kh_val(aux->gen,k) = hrec;
1201
14.5k
    }
1202
1203
108k
    hdr->hrec[hdr->nhrec] = hrec;
1204
108k
    hdr->dirty = 1;
1205
108k
    hdr->nhrec = n;
1206
1207
108k
    return hrec->type==BCF_HL_GEN ? 0 : 1;
1208
108k
}
1209
1210
bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class)
1211
1.08k
{
1212
1.08k
    int i;
1213
1.08k
    if ( type==BCF_HL_GEN )
1214
1.08k
    {
1215
        // e.g. ##fileformat=VCFv4.2
1216
        //      ##source=GenomicsDBImport
1217
        //      ##bcftools_viewVersion=1.16-80-gdfdb0923+htslib-1.16-34-g215d364
1218
1.08k
        if ( value )
1219
0
        {
1220
0
            kstring_t str = {0,0,0};
1221
0
            ksprintf(&str, "##%s=%s", key,value);
1222
0
            bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1223
0
            khint_t k = kh_get(hdict, aux->gen, str.s);
1224
0
            free(str.s);
1225
0
            if ( k == kh_end(aux->gen) ) return NULL;
1226
0
            return kh_val(aux->gen, k);
1227
0
        }
1228
2.33k
        for (i=0; i<hdr->nhrec; i++)
1229
1.49k
        {
1230
1.49k
            if ( hdr->hrec[i]->type!=type ) continue;
1231
360
            if ( strcmp(hdr->hrec[i]->key,key) ) continue;
1232
254
            return hdr->hrec[i];
1233
360
        }
1234
832
        return NULL;
1235
1.08k
    }
1236
0
    else if ( type==BCF_HL_STR )
1237
0
    {
1238
        // e.g. ##GATKCommandLine=<ID=GenomicsDBImport,CommandLine="GenomicsDBImport....">
1239
        //      ##ALT=<ID=NON_REF,Description="Represents any possible alternative allele not already represented at this location by REF and ALT">
1240
0
        if (!str_class) return NULL;
1241
0
        if ( !strcmp("ID",key) )
1242
0
        {
1243
0
            kstring_t str = {0,0,0};
1244
0
            ksprintf(&str, "##%s=<%s=%s>",str_class,key,value);
1245
0
            bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1246
0
            khint_t k = kh_get(hdict, aux->gen, str.s);
1247
0
            free(str.s);
1248
0
            if ( k == kh_end(aux->gen) ) return NULL;
1249
0
            return kh_val(aux->gen, k);
1250
0
        }
1251
0
        for (i=0; i<hdr->nhrec; i++)
1252
0
        {
1253
0
            if ( hdr->hrec[i]->type!=type ) continue;
1254
0
            if ( strcmp(hdr->hrec[i]->key,str_class) ) continue;
1255
0
            int j = bcf_hrec_find_key(hdr->hrec[i],key);
1256
0
            if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],value) ) return hdr->hrec[i];
1257
0
        }
1258
0
        return NULL;
1259
0
    }
1260
0
    vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
1261
0
    khint_t k = kh_get(vdict, d, value);
1262
0
    if ( k == kh_end(d) ) return NULL;
1263
0
    return kh_val(d, k).hrec[type==BCF_HL_CTG?0:type];
1264
0
}
1265
1266
// Check the VCF header is correctly formatted as per the specification.
1267
// Note the code that calls this doesn't bother to check return values and
1268
// we have so many broken VCFs in the wild that for now we just reprt a
1269
// warning and continue anyway.  So currently this is a void function.
1270
void bcf_hdr_check_sanity(bcf_hdr_t *hdr)
1271
4.67k
{
1272
4.67k
    int version = bcf_get_version(hdr, NULL);
1273
1274
4.67k
    struct tag {
1275
4.67k
        char name[10];
1276
4.67k
        char number_str[3];
1277
4.67k
        int number;
1278
4.67k
        int version;
1279
4.67k
        int type;
1280
4.67k
    };
1281
1282
4.67k
    char type_str[][8] = {"Flag", "Integer", "Float", "String"};
1283
1284
4.67k
    struct tag info_tags[] = {
1285
4.67k
        {"AD",        "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
1286
4.67k
        {"ADF",       "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
1287
4.67k
        {"ADR",       "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
1288
4.67k
        {"AC",        "A",  BCF_VL_A,     VCF_DEF, BCF_HT_INT},
1289
4.67k
        {"AF",        "A",  BCF_VL_A,     VCF_DEF, BCF_HT_REAL},
1290
4.67k
        {"CIGAR",     "A",  BCF_VL_A,     VCF_DEF, BCF_HT_STR},
1291
4.67k
        {"AA",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_STR},
1292
4.67k
        {"AN",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1293
4.67k
        {"BQ",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_REAL},
1294
4.67k
        {"DB",        "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
1295
4.67k
        {"DP",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1296
4.67k
        {"END",       "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1297
4.67k
        {"H2",        "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
1298
4.67k
        {"H3",        "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
1299
4.67k
        {"MQ",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_REAL},
1300
4.67k
        {"MQ0",       "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1301
4.67k
        {"NS",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1302
4.67k
        {"SB",        "4",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1303
4.67k
        {"SOMATIC",   "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
1304
4.67k
        {"VALIDATED", "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
1305
4.67k
        {"1000G",     "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
1306
4.67k
    };
1307
4.67k
    static int info_warned[sizeof(info_tags)/sizeof(*info_tags)] = {0};
1308
1309
4.67k
    struct tag fmt_tags[] = {
1310
4.67k
        {"AD",   "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
1311
4.67k
        {"ADF",  "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
1312
4.67k
        {"ADR",  "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
1313
4.67k
        {"EC",   "A",  BCF_VL_A,     VCF_DEF, BCF_HT_INT},
1314
4.67k
        {"GL",   "G",  BCF_VL_G,     VCF_DEF, BCF_HT_REAL},
1315
4.67k
        {"GP",   "G",  BCF_VL_G,     VCF_DEF, BCF_HT_REAL},
1316
4.67k
        {"PL",   "G",  BCF_VL_G,     VCF_DEF, BCF_HT_INT},
1317
4.67k
        {"PP",   "G",  BCF_VL_G,     VCF_DEF, BCF_HT_INT},
1318
4.67k
        {"DP",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1319
4.67k
        {"LEN",  "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1320
4.67k
        {"FT",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_STR},
1321
4.67k
        {"GQ",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1322
4.67k
        {"GT",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_STR},
1323
4.67k
        {"HQ",   "2",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1324
4.67k
        {"MQ",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1325
4.67k
        {"PQ",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1326
4.67k
        {"PS",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1327
4.67k
        {"PSL",  "P",  BCF_VL_P,     VCF44,   BCF_HT_STR},
1328
4.67k
        {"PSO",  "P",  BCF_VL_P,     VCF44,   BCF_HT_INT},
1329
4.67k
        {"PSQ",  "P",  BCF_VL_P,     VCF44,   BCF_HT_INT},
1330
4.67k
        {"LGL",  "LG", BCF_VL_LG,    VCF45,   BCF_HT_INT},
1331
4.67k
        {"LGP",  "LG", BCF_VL_LG,    VCF45,   BCF_HT_INT},
1332
4.67k
        {"LPL",  "LG", BCF_VL_LG,    VCF45,   BCF_HT_INT},
1333
4.67k
        {"LPP",  "LG", BCF_VL_LG,    VCF45,   BCF_HT_INT},
1334
4.67k
        {"LEC",  "LA", BCF_VL_LA,    VCF45,   BCF_HT_INT},
1335
4.67k
        {"LAD",  "LR", BCF_VL_LR,    VCF45,   BCF_HT_INT},
1336
4.67k
        {"LADF", "LR", BCF_VL_LR,    VCF45,   BCF_HT_INT},
1337
4.67k
        {"LADR", "LR", BCF_VL_LR,    VCF45,   BCF_HT_INT},
1338
4.67k
    };
1339
4.67k
    static int fmt_warned[sizeof(fmt_tags)/sizeof(*fmt_tags)] = {0};
1340
1341
    // Check INFO tag numbers.  We shouldn't really permit ".", but it's
1342
    // commonly misused so we let it slide unless it's a new tag and the
1343
    // file format claims to be new also.  We also cannot distinguish between
1344
    // Number=1 and Number=2, but we at least report the correct term if we
1345
    // get, say, Number=G in its place.
1346
    // Also check the types.
1347
4.67k
    int i;
1348
102k
    for (i = 0; i < sizeof(info_tags)/sizeof(*info_tags); i++) {
1349
98.1k
        if (info_warned[i])
1350
2.49k
            continue;
1351
95.6k
        int id = bcf_hdr_id2int(hdr, BCF_DT_ID, info_tags[i].name);
1352
95.6k
        if (bcf_hdr_idinfo_exists(hdr, BCF_HL_INFO, id)) {
1353
1
            if (bcf_hdr_id2length(hdr, BCF_HL_INFO, id) != info_tags[i].number &&
1354
1
                bcf_hdr_id2length(hdr, BCF_HL_INFO, id) != BCF_VL_VAR) {
1355
0
                info_warned[i] = 1;
1356
1
            } else if (bcf_hdr_id2length(hdr, BCF_HL_INFO, id) == BCF_VL_FIXED &&
1357
0
                       bcf_hdr_id2number(hdr, BCF_HL_INFO, id) != atoi(info_tags[i].number_str)) {
1358
0
                info_warned[i] = 1;
1359
0
            }
1360
1361
1
            if (info_warned[i]) {
1362
0
                hts_log_warning("%s should be declared as Number=%s",
1363
0
                                info_tags[i].name, info_tags[i].number_str);
1364
0
            }
1365
1366
1
            if (bcf_hdr_id2type(hdr, BCF_HL_INFO, id) != info_tags[i].type) {
1367
1
                hts_log_warning("%s should be declared as Type=%s",
1368
1
                                info_tags[i].name, type_str[info_tags[i].type]);
1369
1
                info_warned[i] = 1;
1370
1
            }
1371
1
        }
1372
95.6k
    }
1373
1374
    // Check FORMAT tag numbers and types.
1375
135k
    for (i = 0; i < sizeof(fmt_tags)/sizeof(*fmt_tags); i++) {
1376
130k
        if (fmt_warned[i])
1377
0
            continue;
1378
130k
        int id = bcf_hdr_id2int(hdr, BCF_DT_ID, fmt_tags[i].name);
1379
130k
        if (bcf_hdr_idinfo_exists(hdr, BCF_HL_FMT, id)) {
1380
0
            if (bcf_hdr_id2length(hdr, BCF_HL_FMT, id) != fmt_tags[i].number) {
1381
                // Permit "Number=." if this tag predates the vcf version it is
1382
                // defined within.  This is a common tactic for callers to use
1383
                // new tags with older formats in order to avoid parsing failures
1384
                // with some software.
1385
                // We don't care for 4.3 and earlier as that's more of a wild-west
1386
                // and it's not abnormal to see incorrect usage of Number=. there.
1387
0
                if ((version < VCF44 &&
1388
0
                     bcf_hdr_id2length(hdr, BCF_HL_FMT, id) != BCF_VL_VAR) ||
1389
0
                    (version >= VCF44 && version >= fmt_tags[i].version)) {
1390
0
                    fmt_warned[i] = 1;
1391
0
                }
1392
0
            } else if (bcf_hdr_id2length(hdr, BCF_HL_FMT, id) == BCF_VL_FIXED &&
1393
0
                       bcf_hdr_id2number(hdr, BCF_HL_FMT, id) != atoi(fmt_tags[i].number_str)) {
1394
0
                fmt_warned[i] = 1;
1395
0
            }
1396
1397
0
            if (fmt_warned[i]) {
1398
0
                hts_log_warning("%s should be declared as Number=%s",
1399
0
                                fmt_tags[i].name, fmt_tags[i].number_str);
1400
0
            }
1401
1402
0
            if (bcf_hdr_id2type(hdr, BCF_HL_FMT, id) != fmt_tags[i].type) {
1403
0
                hts_log_warning("%s should be declared as Type=%s",
1404
0
                                fmt_tags[i].name, type_str[fmt_tags[i].type]);
1405
0
                fmt_warned[i] = 1;
1406
0
            }
1407
0
        }
1408
130k
    }
1409
4.67k
}
1410
1411
int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt)
1412
6.33k
{
1413
6.33k
    int len, done = 0;
1414
6.33k
    char *p = htxt;
1415
1416
    // Check sanity: "fileformat" string must come as first
1417
6.33k
    bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,p,&len);
1418
6.33k
    if ( !hrec || !hrec->key || strcasecmp(hrec->key,"fileformat") )
1419
606
        hts_log_warning("The first line should be ##fileformat; is the VCF/BCF header broken?");
1420
6.33k
    if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
1421
0
        bcf_hrec_destroy(hrec);
1422
0
        return -1;
1423
0
    }
1424
1425
    // The filter PASS must appear first in the dictionary
1426
6.33k
    hrec = bcf_hdr_parse_line(hdr,"##FILTER=<ID=PASS,Description=\"All filters passed\">",&len);
1427
6.33k
    if (!hrec || bcf_hdr_add_hrec(hdr, hrec) < 0) {
1428
0
        bcf_hrec_destroy(hrec);
1429
0
        return -1;
1430
0
    }
1431
1432
    // Parse the whole header
1433
23.9k
    do {
1434
89.1k
        while (NULL != (hrec = bcf_hdr_parse_line(hdr, p, &len))) {
1435
65.2k
            if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
1436
36
                bcf_hrec_destroy(hrec);
1437
36
                return -1;
1438
36
            }
1439
65.2k
            p += len;
1440
65.2k
        }
1441
23.9k
        assert(hrec == NULL);
1442
23.8k
        if (len < 0) {
1443
            // len < 0 indicates out-of-memory, or similar error
1444
38
            hts_log_error("Could not parse header line: %s", strerror(errno));
1445
38
            return -1;
1446
23.8k
        } else if (len > 0) {
1447
            // Bad header line.  bcf_hdr_parse_line() will have logged it.
1448
            // Skip and try again on the next line (p + len will be the start
1449
            // of the next one).
1450
16.2k
            p += len;
1451
16.2k
            continue;
1452
16.2k
        }
1453
1454
        // Next should be the sample line.  If not, it was a malformed
1455
        // header, in which case print a warning and skip (many VCF
1456
        // operations do not really care about a few malformed lines).
1457
        // In the future we may want to add a strict mode that errors in
1458
        // this case.
1459
7.57k
        if ( strncmp("#CHROM\t",p,7) && strncmp("#CHROM ",p,7) ) {
1460
2.67k
            char *eol = strchr(p, '\n');
1461
2.67k
            if (*p != '\0') {
1462
1.38k
                char buffer[320];
1463
1.38k
                hts_log_warning("Could not parse header line: %s",
1464
1.38k
                                hts_strprint(buffer, sizeof(buffer),
1465
1.38k
                                               '"', p,
1466
1.38k
                                               eol ? (eol - p) : SIZE_MAX));
1467
1.38k
            }
1468
2.67k
            if (eol) {
1469
1.32k
                p = eol + 1; // Try from the next line.
1470
1.35k
            } else {
1471
1.35k
                done = -1; // No more lines left, give up.
1472
1.35k
            }
1473
4.89k
        } else {
1474
4.89k
            done = 1; // Sample line found
1475
4.89k
        }
1476
23.8k
    } while (!done);
1477
1478
6.25k
    if (done < 0) {
1479
        // No sample line is fatal.
1480
1.35k
        hts_log_error("Could not parse the header, sample line not found");
1481
1.35k
        return -1;
1482
1.35k
    }
1483
1484
4.89k
    if (bcf_hdr_parse_sample_line(hdr,p) < 0)
1485
222
        return -1;
1486
4.67k
    if (bcf_hdr_sync(hdr) < 0)
1487
0
        return -1;
1488
4.67k
    bcf_hdr_check_sanity(hdr);
1489
4.67k
    return 0;
1490
4.67k
}
1491
1492
int bcf_hdr_append(bcf_hdr_t *hdr, const char *line)
1493
0
{
1494
0
    int len;
1495
0
    bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr, (char*) line, &len);
1496
0
    if ( !hrec ) return -1;
1497
0
    if (bcf_hdr_add_hrec(hdr, hrec) < 0)
1498
0
        return -1;
1499
0
    return 0;
1500
0
}
1501
1502
void bcf_hdr_remove(bcf_hdr_t *hdr, int type, const char *key)
1503
0
{
1504
0
    int i = 0;
1505
0
    bcf_hrec_t *hrec;
1506
0
    if ( !key )
1507
0
    {
1508
        // no key, remove all entries of this type
1509
0
        while ( i<hdr->nhrec )
1510
0
        {
1511
0
            if ( hdr->hrec[i]->type!=type ) { i++; continue; }
1512
0
            hrec = hdr->hrec[i];
1513
0
            bcf_hdr_unregister_hrec(hdr, hrec);
1514
0
            bcf_hdr_remove_from_hdict(hdr, hrec);
1515
0
            hdr->dirty = 1;
1516
0
            hdr->nhrec--;
1517
0
            if ( i < hdr->nhrec )
1518
0
                memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
1519
0
            bcf_hrec_destroy(hrec);
1520
0
        }
1521
0
        return;
1522
0
    }
1523
0
    while (1)
1524
0
    {
1525
0
        if ( type==BCF_HL_FLT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG )
1526
0
        {
1527
0
            hrec = bcf_hdr_get_hrec(hdr, type, "ID", key, NULL);
1528
0
            if ( !hrec ) return;
1529
1530
0
            for (i=0; i<hdr->nhrec; i++)
1531
0
                if ( hdr->hrec[i]==hrec ) break;
1532
0
            assert( i<hdr->nhrec );
1533
1534
0
            vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
1535
0
            khint_t k = kh_get(vdict, d, key);
1536
0
            kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL;
1537
0
        }
1538
0
        else
1539
0
        {
1540
0
            for (i=0; i<hdr->nhrec; i++)
1541
0
            {
1542
0
                if ( hdr->hrec[i]->type!=type ) continue;
1543
0
                if ( type==BCF_HL_GEN )
1544
0
                {
1545
0
                    if ( !strcmp(hdr->hrec[i]->key,key) ) break;
1546
0
                }
1547
0
                else
1548
0
                {
1549
                    // not all structured lines have ID, we could be more sophisticated as in bcf_hdr_get_hrec()
1550
0
                    int j = bcf_hrec_find_key(hdr->hrec[i], "ID");
1551
0
                    if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],key) ) break;
1552
0
                }
1553
0
            }
1554
0
            if ( i==hdr->nhrec ) return;
1555
0
            hrec = hdr->hrec[i];
1556
0
            bcf_hdr_remove_from_hdict(hdr, hrec);
1557
0
        }
1558
1559
0
        hdr->nhrec--;
1560
0
        if ( i < hdr->nhrec )
1561
0
            memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
1562
0
        bcf_hrec_destroy(hrec);
1563
0
        hdr->dirty = 1;
1564
0
    }
1565
0
}
1566
1567
int bcf_hdr_printf(bcf_hdr_t *hdr, const char *fmt, ...)
1568
0
{
1569
0
    char tmp[256], *line = tmp;
1570
0
    va_list ap;
1571
0
    va_start(ap, fmt);
1572
0
    int n = vsnprintf(line, sizeof(tmp), fmt, ap);
1573
0
    va_end(ap);
1574
1575
0
    if (n >= sizeof(tmp)) {
1576
0
        n++; // For trailing NUL
1577
0
        line = (char*)malloc(n);
1578
0
        if (!line)
1579
0
            return -1;
1580
1581
0
        va_start(ap, fmt);
1582
0
        vsnprintf(line, n, fmt, ap);
1583
0
        va_end(ap);
1584
0
    }
1585
1586
0
    int ret = bcf_hdr_append(hdr, line);
1587
1588
0
    if (line != tmp) free(line);
1589
0
    return ret;
1590
0
}
1591
1592
1593
/**********************
1594
 *** BCF header I/O ***
1595
 **********************/
1596
1597
const char *bcf_hdr_get_version(const bcf_hdr_t *hdr)
1598
1.08k
{
1599
1.08k
    bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
1600
1.08k
    if ( !hrec )
1601
832
    {
1602
832
        hts_log_warning("No version string found, assuming VCFv4.2");
1603
832
        return "VCFv4.2";
1604
832
    }
1605
254
    return hrec->value;
1606
1.08k
}
1607
1608
int bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version)
1609
0
{
1610
0
    bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
1611
0
    if ( !hrec )
1612
0
    {
1613
0
        int len;
1614
0
        kstring_t str = {0,0,0};
1615
0
        if ( ksprintf(&str,"##fileformat=%s", version) < 0 ) return -1;
1616
0
        hrec = bcf_hdr_parse_line(hdr, str.s, &len);
1617
0
        free(str.s);
1618
1619
0
        get_hdr_aux(hdr)->version = bcf_get_version(NULL, hrec->value);
1620
0
    }
1621
0
    else
1622
0
    {
1623
0
        bcf_hrec_t *tmp = bcf_hrec_dup(hrec);
1624
0
        if ( !tmp ) return -1;
1625
0
        free(tmp->value);
1626
0
        tmp->value = strdup(version);
1627
0
        if ( !tmp->value ) return -1;
1628
0
        bcf_hdr_update_hrec(hdr, hrec, tmp);
1629
0
        bcf_hrec_destroy(tmp);
1630
0
    }
1631
0
    hdr->dirty = 1;
1632
    //TODO rlen may change, deal with it
1633
0
    return 0; // FIXME: check for errs in this function (return < 0 if so)
1634
0
}
1635
1636
bcf_hdr_t *bcf_hdr_init(const char *mode)
1637
6.44k
{
1638
6.44k
    int i;
1639
6.44k
    bcf_hdr_t *h;
1640
6.44k
    h = (bcf_hdr_t*)calloc(1, sizeof(bcf_hdr_t));
1641
6.44k
    if (!h) return NULL;
1642
25.7k
    for (i = 0; i < 3; ++i) {
1643
19.3k
        if ((h->dict[i] = kh_init(vdict)) == NULL) goto fail;
1644
        // Supersize the hash to make collisions very unlikely
1645
19.3k
        static int dsize[3] = {16384,16384,2048}; // info, contig, format
1646
19.3k
        if (kh_resize(vdict, h->dict[i], dsize[i]) < 0) goto fail;
1647
19.3k
    }
1648
1649
6.44k
    bcf_hdr_aux_t *aux = (bcf_hdr_aux_t*)calloc(1,sizeof(bcf_hdr_aux_t));
1650
6.44k
    if ( !aux ) goto fail;
1651
6.44k
    if ( (aux->gen = kh_init(hdict))==NULL ) { free(aux); goto fail; }
1652
6.44k
    aux->key_len = NULL;
1653
6.44k
    aux->dict = *((vdict_t*)h->dict[0]);
1654
6.44k
    aux->version = 0;
1655
6.44k
    aux->ref_count = 1;
1656
6.44k
    free(h->dict[0]);
1657
6.44k
    h->dict[0] = aux;
1658
1659
6.44k
    if ( strchr(mode,'w') )
1660
0
    {
1661
0
        bcf_hdr_append(h, "##fileformat=VCFv4.2");
1662
        // The filter PASS must appear first in the dictionary
1663
0
        bcf_hdr_append(h, "##FILTER=<ID=PASS,Description=\"All filters passed\">");
1664
0
        aux->version = VCF_DEF;
1665
0
    }
1666
6.44k
    return h;
1667
1668
0
 fail:
1669
0
    for (i = 0; i < 3; ++i)
1670
0
        kh_destroy(vdict, h->dict[i]);
1671
0
    free(h);
1672
0
    return NULL;
1673
6.44k
}
1674
1675
void bcf_hdr_destroy(bcf_hdr_t *h)
1676
8.99k
{
1677
8.99k
    int i;
1678
8.99k
    khint_t k;
1679
8.99k
    if (!h) return;
1680
8.99k
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
1681
8.99k
    if (aux->ref_count > 1) // Refs still held, so delay destruction
1682
2.55k
    {
1683
2.55k
        aux->ref_count &= ~1;
1684
2.55k
        return;
1685
2.55k
    }
1686
25.7k
    for (i = 0; i < 3; ++i) {
1687
19.3k
        vdict_t *d = (vdict_t*)h->dict[i];
1688
19.3k
        if (d == 0) continue;
1689
224M
        for (k = kh_begin(d); k != kh_end(d); ++k)
1690
224M
            if (kh_exist(d, k)) free((char*)kh_key(d, k));
1691
19.3k
        if ( i==0 )
1692
6.44k
        {
1693
41.4k
            for (k=kh_begin(aux->gen); k<kh_end(aux->gen); k++)
1694
35.0k
                if ( kh_exist(aux->gen,k) ) free((char*)kh_key(aux->gen,k));
1695
6.44k
            kh_destroy(hdict, aux->gen);
1696
6.44k
            free(aux->key_len); // may exist for dict[0] only
1697
6.44k
        }
1698
19.3k
        kh_destroy(vdict, d);
1699
19.3k
        free(h->id[i]);
1700
19.3k
    }
1701
115k
    for (i=0; i<h->nhrec; i++)
1702
108k
        bcf_hrec_destroy(h->hrec[i]);
1703
6.44k
    if (h->nhrec) free(h->hrec);
1704
6.44k
    if (h->samples) free(h->samples);
1705
6.44k
    free(h->keep_samples);
1706
6.44k
    free(h->transl[0]); free(h->transl[1]);
1707
6.44k
    free(h->mem.s);
1708
6.44k
    free(h);
1709
6.44k
}
1710
1711
bcf_hdr_t *bcf_hdr_read(htsFile *hfp)
1712
6.44k
{
1713
6.44k
    if (hfp->format.format == vcf)
1714
5.75k
        return vcf_hdr_read(hfp);
1715
686
    if (hfp->format.format != bcf) {
1716
0
        hts_log_error("Input is not detected as bcf or vcf format");
1717
0
        return NULL;
1718
0
    }
1719
1720
686
    assert(hfp->is_bgzf);
1721
1722
686
    BGZF *fp = hfp->fp.bgzf;
1723
686
    uint8_t magic[5];
1724
686
    bcf_hdr_t *h;
1725
686
    h = bcf_hdr_init("r");
1726
686
    if (!h) {
1727
0
        hts_log_error("Failed to allocate bcf header");
1728
0
        return NULL;
1729
0
    }
1730
686
    if (bgzf_read(fp, magic, 5) != 5)
1731
6
    {
1732
6
        hts_log_error("Failed to read the header (reading BCF in text mode?)");
1733
6
        bcf_hdr_destroy(h);
1734
6
        return NULL;
1735
6
    }
1736
680
    if (strncmp((char*)magic, "BCF\2\2", 5) != 0)
1737
14
    {
1738
14
        if (!strncmp((char*)magic, "BCF", 3))
1739
14
            hts_log_error("Invalid BCF2 magic string: only BCFv2.2 is supported");
1740
0
        else
1741
0
            hts_log_error("Invalid BCF2 magic string");
1742
14
        bcf_hdr_destroy(h);
1743
14
        return NULL;
1744
14
    }
1745
666
    uint8_t buf[4];
1746
666
    size_t hlen;
1747
666
    char *htxt = NULL;
1748
666
    if (bgzf_read(fp, buf, 4) != 4) goto fail;
1749
662
    hlen = buf[0] | (buf[1] << 8) | (buf[2] << 16) | ((size_t) buf[3] << 24);
1750
662
    if (hlen >= SIZE_MAX) { errno = ENOMEM; goto fail; }
1751
662
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
1752
662
    if (hlen > FUZZ_ALLOC_LIMIT/2) { errno = ENOMEM; goto fail; }
1753
658
#endif
1754
658
    htxt = (char*)malloc(hlen + 1);
1755
658
    if (!htxt) goto fail;
1756
658
    if (bgzf_read(fp, htxt, hlen) != hlen) goto fail;
1757
606
    htxt[hlen] = '\0'; // Ensure htxt is terminated
1758
606
    if ( bcf_hdr_parse(h, htxt) < 0 ) goto fail;
1759
428
    free(htxt);
1760
1761
428
    bcf_hdr_incr_ref(h);
1762
428
    bgzf_set_private_data(fp, h, hdr_bgzf_private_data_cleanup);
1763
1764
428
    return h;
1765
238
 fail:
1766
238
    hts_log_error("Failed to read BCF header");
1767
238
    free(htxt);
1768
238
    bcf_hdr_destroy(h);
1769
238
    return NULL;
1770
606
}
1771
1772
int bcf_hdr_write(htsFile *hfp, bcf_hdr_t *h)
1773
4.67k
{
1774
4.67k
    if (!h) {
1775
0
        errno = EINVAL;
1776
0
        return -1;
1777
0
    }
1778
4.67k
    if ( h->dirty ) {
1779
0
        if (bcf_hdr_sync(h) < 0) return -1;
1780
0
    }
1781
4.67k
    hfp->format.category = variant_data;
1782
4.67k
    if (hfp->format.format == vcf || hfp->format.format == text_format) {
1783
2.33k
        hfp->format.format = vcf;
1784
2.33k
        return vcf_hdr_write(hfp, h);
1785
2.33k
    }
1786
1787
2.33k
    if (hfp->format.format == binary_format)
1788
2.33k
        hfp->format.format = bcf;
1789
1790
2.33k
    kstring_t htxt = {0,0,0};
1791
2.33k
    if (bcf_hdr_format(h, 1, &htxt) < 0) {
1792
0
        free(htxt.s);
1793
0
        return -1;
1794
0
    }
1795
2.33k
    kputc('\0', &htxt); // include the \0 byte
1796
1797
2.33k
    BGZF *fp = hfp->fp.bgzf;
1798
2.33k
    if ( bgzf_write(fp, "BCF\2\2", 5) !=5 ) return -1;
1799
2.33k
    uint8_t hlen[4];
1800
2.33k
    u32_to_le(htxt.l, hlen);
1801
2.33k
    if ( bgzf_write(fp, hlen, 4) !=4 ) return -1;
1802
2.33k
    if ( bgzf_write(fp, htxt.s, htxt.l) != htxt.l ) return -1;
1803
2.33k
    if ( bgzf_flush(fp) < 0) return -1;
1804
1805
2.33k
    bcf_hdr_incr_ref(h);
1806
2.33k
    bgzf_set_private_data(fp, h, hdr_bgzf_private_data_cleanup);
1807
1808
2.33k
    free(htxt.s);
1809
2.33k
    return 0;
1810
2.33k
}
1811
1812
/********************
1813
 *** BCF site I/O ***
1814
 ********************/
1815
1816
bcf1_t *bcf_init(void)
1817
4.67k
{
1818
4.67k
    bcf1_t *v;
1819
4.67k
    v = (bcf1_t*)calloc(1, sizeof(bcf1_t));
1820
4.67k
    return v;
1821
4.67k
}
1822
1823
void bcf_clear(bcf1_t *v)
1824
39.0k
{
1825
39.0k
    int i;
1826
39.0k
    for (i=0; i<v->d.m_info; i++)
1827
0
    {
1828
0
        if ( v->d.info[i].vptr_free )
1829
0
        {
1830
0
            free(v->d.info[i].vptr - v->d.info[i].vptr_off);
1831
0
            v->d.info[i].vptr_free = 0;
1832
0
        }
1833
0
    }
1834
39.0k
    for (i=0; i<v->d.m_fmt; i++)
1835
0
    {
1836
0
        if ( v->d.fmt[i].p_free )
1837
0
        {
1838
0
            free(v->d.fmt[i].p - v->d.fmt[i].p_off);
1839
0
            v->d.fmt[i].p_free = 0;
1840
0
        }
1841
0
    }
1842
39.0k
    v->rid = v->pos = v->rlen = v->unpacked = 0;
1843
39.0k
    bcf_float_set_missing(v->qual);
1844
39.0k
    v->n_info = v->n_allele = v->n_fmt = v->n_sample = 0;
1845
39.0k
    v->shared.l = v->indiv.l = 0;
1846
39.0k
    v->d.var_type = -1;
1847
39.0k
    v->d.shared_dirty = 0;
1848
39.0k
    v->d.indiv_dirty  = 0;
1849
39.0k
    v->d.n_flt = 0;
1850
39.0k
    v->errcode = 0;
1851
39.0k
    if (v->d.m_als) v->d.als[0] = 0;
1852
39.0k
    if (v->d.m_id) v->d.id[0] = 0;
1853
39.0k
}
1854
1855
void bcf_empty(bcf1_t *v)
1856
4.67k
{
1857
4.67k
    bcf_clear1(v);
1858
4.67k
    free(v->d.id);
1859
4.67k
    free(v->d.als);
1860
4.67k
    free(v->d.allele); free(v->d.flt); free(v->d.info); free(v->d.fmt);
1861
4.67k
    if (v->d.var ) free(v->d.var);
1862
4.67k
    free(v->shared.s); free(v->indiv.s);
1863
4.67k
    memset(&v->d,0,sizeof(v->d));
1864
4.67k
    memset(&v->shared,0,sizeof(v->shared));
1865
4.67k
    memset(&v->indiv,0,sizeof(v->indiv));
1866
4.67k
}
1867
1868
void bcf_destroy(bcf1_t *v)
1869
4.67k
{
1870
4.67k
    if (!v) return;
1871
4.67k
    bcf_empty1(v);
1872
4.67k
    free(v);
1873
4.67k
}
1874
1875
static inline int bcf_read1_core(BGZF *fp, bcf1_t *v)
1876
472
{
1877
472
    uint8_t x[32];
1878
472
    ssize_t ret;
1879
472
    uint32_t shared_len, indiv_len;
1880
472
    if ((ret = bgzf_read(fp, x, 32)) != 32) {
1881
22
        if (ret == 0) return -1;
1882
8
        return -2;
1883
22
    }
1884
450
    bcf_clear1(v);
1885
450
    shared_len = le_to_u32(x);
1886
450
    if (shared_len < 24) return -2;
1887
450
    shared_len -= 24; // to exclude six 32-bit integers
1888
450
    indiv_len = le_to_u32(x + 4);
1889
450
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
1890
    // ks_resize() normally allocates 1.5 * requested size to allow for growth
1891
450
    if ((uint64_t) shared_len + indiv_len > FUZZ_ALLOC_LIMIT / 3 * 2) return -2;
1892
440
#endif
1893
440
    if (ks_resize(&v->shared, shared_len ? shared_len : 1) != 0) return -2;
1894
440
    if (ks_resize(&v->indiv, indiv_len ? indiv_len : 1) != 0) return -2;
1895
440
    v->rid  = le_to_i32(x + 8);
1896
440
    v->pos  = le_to_u32(x + 12);
1897
440
    if ( v->pos==UINT32_MAX ) v->pos = -1;  // this is for telomere coordinate, e.g. MT:0
1898
440
    v->rlen = le_to_i32(x + 16);
1899
440
    v->qual = le_to_float(x + 20);
1900
440
    v->n_info = le_to_u16(x + 24);
1901
440
    v->n_allele = le_to_u16(x + 26);
1902
440
    v->n_sample = le_to_u32(x + 28) & 0xffffff;
1903
440
    v->n_fmt = x[31];
1904
440
    v->shared.l = shared_len;
1905
440
    v->indiv.l = indiv_len;
1906
    // silent fix of broken BCFs produced by earlier versions of bcf_subset, prior to and including bd6ed8b4
1907
440
    if ( (!v->indiv.l || !v->n_sample) && v->n_fmt ) v->n_fmt = 0;
1908
1909
440
    if (bgzf_read(fp, v->shared.s, v->shared.l) != v->shared.l) return -2;
1910
406
    if (bgzf_read(fp, v->indiv.s, v->indiv.l) != v->indiv.l) return -2;
1911
404
    return 0;
1912
406
}
1913
1914
0
#define bit_array_size(n) ((n)/8+1)
1915
0
#define bit_array_set(a,i)   ((a)[(i)/8] |=   1 << ((i)%8))
1916
0
#define bit_array_clear(a,i) ((a)[(i)/8] &= ~(1 << ((i)%8)))
1917
0
#define bit_array_test(a,i)  ((a)[(i)/8] &   (1 << ((i)%8)))
1918
1919
static int bcf_dec_typed_int1_safe(uint8_t *p, uint8_t *end, uint8_t **q,
1920
4.82k
                                   int32_t *val) {
1921
4.82k
    uint32_t t;
1922
4.82k
    if (end - p < 2) return -1;
1923
4.80k
    t = *p++ & 0xf;
1924
    /* Use if .. else if ... else instead of switch to force order.  Assumption
1925
       is that small integers are more frequent than big ones. */
1926
4.80k
    if (t == BCF_BT_INT8) {
1927
2.35k
        *val = *(int8_t *) p++;
1928
2.44k
    } else {
1929
2.44k
        if (end - p < (1<<bcf_type_shift[t])) return -1;
1930
2.42k
        if (t == BCF_BT_INT16) {
1931
1.60k
            *val = le_to_i16(p);
1932
1.60k
            p += 2;
1933
1.60k
        } else if (t == BCF_BT_INT32) {
1934
672
            *val = le_to_i32(p);
1935
672
            p += 4;
1936
#ifdef VCF_ALLOW_INT64
1937
        } else if (t == BCF_BT_INT64) {
1938
            // This case should never happen because there should be no
1939
            // 64-bit BCFs at all, definitely not coming from htslib
1940
            *val = le_to_i64(p);
1941
            p += 8;
1942
#endif
1943
672
        } else {
1944
144
            return -1;
1945
144
        }
1946
2.42k
    }
1947
4.63k
    *q = p;
1948
4.63k
    return 0;
1949
4.80k
}
1950
1951
static int bcf_dec_size_safe(uint8_t *p, uint8_t *end, uint8_t **q,
1952
16.8k
                             int *num, int *type) {
1953
16.8k
    int r;
1954
16.8k
    if (p >= end) return -1;
1955
16.7k
    *type = *p & 0xf;
1956
16.7k
    if (*p>>4 != 15) {
1957
16.3k
        *q = p + 1;
1958
16.3k
        *num = *p >> 4;
1959
16.3k
        return 0;
1960
16.3k
    }
1961
404
    r = bcf_dec_typed_int1_safe(p + 1, end, q, num);
1962
404
    if (r) return r;
1963
336
    return *num >= 0 ? 0 : -1;
1964
404
}
1965
1966
556
static const char *get_type_name(int type) {
1967
556
    const char *types[9] = {
1968
556
        "null", "int (8-bit)", "int (16 bit)", "int (32 bit)",
1969
556
        "unknown", "float", "unknown", "char", "unknown"
1970
556
    };
1971
556
    int t = (type >= 0 && type < 8) ? type : 8;
1972
556
    return types[t];
1973
556
}
1974
1975
/**
1976
 *  updatephasing - updates 1st phasing based on other phasing status
1977
 *  @param p - pointer to phase value array
1978
 *  @param end - end of array
1979
 *  @param q - pointer to consumed data
1980
 *  @param samples - no. of samples in array
1981
 *  @param ploidy - no. of phasing values per sample
1982
 *  @param type - value type (one of BCF_BT_...)
1983
 *  Returns 0 on success and 1 on failure
1984
 *  Update for haploids made only if it is not unknown (.)
1985
 */
1986
static int updatephasing(uint8_t *p, uint8_t *end, uint8_t **q, int samples, int ploidy, int type)
1987
0
{
1988
0
    int j, k;
1989
0
    unsigned int inc = 1 << bcf_type_shift[type];
1990
0
    ptrdiff_t bytes = samples * ploidy * inc;
1991
1992
0
    if (samples < 0 || ploidy < 0 || end - p < bytes)
1993
0
        return 1;
1994
1995
    /*
1996
     * This works because phasing is stored in the least-significant bit
1997
     * of the GT encoding, and the data is always stored little-endian.
1998
     * Thus it's possible to get the desired result by doing bit operations
1999
     * on the least-significant byte of each value and ignoring the
2000
     * higher bytes (for 16-bit and 32-bit values).
2001
     */
2002
2003
0
    switch (ploidy) {
2004
0
    case 1:
2005
        // Trivial case - haploid data is phased by default
2006
0
        for (j = 0; j < samples; ++j) {
2007
0
            if (*p) *p |= 1;    //only if not unknown (.)
2008
0
            p += inc;
2009
0
        }
2010
0
        break;
2011
0
    case 2:
2012
        // Mostly trivial case - first is phased if second is.
2013
0
        for (j = 0; j < samples; ++j) {
2014
0
            *p |= (p[inc] & 1);
2015
0
            p += 2 * inc;
2016
0
        }
2017
0
        break;
2018
0
    default:
2019
        // Generic case - first is phased if all other alleles are.
2020
0
        for (j = 0; j < samples; ++j) {
2021
0
            uint8_t allphased = 1;
2022
0
            for (k = 1; k < ploidy; ++k)
2023
0
                allphased &= (p[inc * k]);
2024
0
            *p |= allphased;
2025
0
            p += ploidy * inc;
2026
0
        }
2027
0
    }
2028
0
    *q = p;
2029
0
    return 0;
2030
0
}
2031
2032
static void bcf_record_check_err(const bcf_hdr_t *hdr, bcf1_t *rec,
2033
2.25k
                                 char *type, uint32_t *reports, int i) {
2034
2.25k
    if (*reports == 0 || hts_verbose >= HTS_LOG_DEBUG)
2035
86
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos
2036
2.25k
                        ": Invalid FORMAT %s %d",
2037
2.25k
                        bcf_seqname_safe(hdr,rec), rec->pos+1, type, i);
2038
2.25k
    (*reports)++;
2039
2.25k
}
2040
2041
404
static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) {
2042
404
    uint8_t *ptr, *end;
2043
404
    size_t bytes;
2044
404
    uint32_t err = 0;
2045
404
    int type = 0;
2046
404
    int num  = 0;
2047
404
    uint32_t i, reports;
2048
404
    const uint32_t is_integer = ((1 << BCF_BT_INT8)  |
2049
404
                                 (1 << BCF_BT_INT16) |
2050
#ifdef VCF_ALLOW_INT64
2051
                                 (1 << BCF_BT_INT64) |
2052
#endif
2053
404
                                 (1 << BCF_BT_INT32));
2054
404
    const uint32_t is_valid_type = (is_integer          |
2055
404
                                    (1 << BCF_BT_NULL)  |
2056
404
                                    (1 << BCF_BT_FLOAT) |
2057
404
                                    (1 << BCF_BT_CHAR));
2058
404
    int32_t max_id = hdr ? hdr->n[BCF_DT_ID] : 0;
2059
    /* set phasing for 1st allele as in v44 for versions upto v43, to have
2060
    consistent binary values irrespective of version; not run for v >= v44,
2061
    to retain explicit phasing in v44 and higher */
2062
404
    int idgt = hdr ?
2063
404
                    bcf_get_version(hdr, NULL) < VCF44 ?
2064
404
                        bcf_hdr_id2int(hdr, BCF_DT_ID, "GT") : -1 :
2065
404
                    -1;
2066
2067
    // Check for valid contig ID
2068
404
    if (rec->rid < 0
2069
302
        || (hdr && (rec->rid >= hdr->n[BCF_DT_CTG]
2070
340
                    || hdr->id[BCF_DT_CTG][rec->rid].key == NULL))) {
2071
340
        hts_log_warning("Bad BCF record at %"PRIhts_pos": Invalid %s id %d", rec->pos+1, "CONTIG", rec->rid);
2072
340
        err |= BCF_ERR_CTG_INVALID;
2073
340
    }
2074
2075
    // Check ID
2076
404
    ptr = (uint8_t *) rec->shared.s;
2077
404
    end = ptr + rec->shared.l;
2078
404
    if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
2079
396
    if (type != BCF_BT_CHAR) {
2080
334
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "ID", type, get_type_name(type));
2081
334
        err |= BCF_ERR_TAG_INVALID;
2082
334
    }
2083
396
    bytes = (size_t) num << bcf_type_shift[type];
2084
396
    if (end - ptr < bytes) goto bad_shared;
2085
390
    ptr += bytes;
2086
2087
    // Check REF and ALT
2088
390
    if (rec->n_allele < 1) {
2089
184
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": No REF allele",
2090
184
                        bcf_seqname_safe(hdr,rec), rec->pos+1);
2091
184
        err |= BCF_ERR_TAG_UNDEF;
2092
184
    }
2093
2094
390
    reports = 0;
2095
12.1k
    for (i = 0; i < rec->n_allele; i++) {
2096
11.7k
        if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
2097
11.7k
        if (type != BCF_BT_CHAR) {
2098
11.3k
            if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
2099
142
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "REF/ALT", type, get_type_name(type));
2100
11.3k
            err |= BCF_ERR_CHAR;
2101
11.3k
        }
2102
11.7k
        bytes = (size_t) num << bcf_type_shift[type];
2103
11.7k
        if (end - ptr < bytes) goto bad_shared;
2104
11.7k
        ptr += bytes;
2105
11.7k
    }
2106
2107
    // Check FILTER
2108
322
    reports = 0;
2109
322
    if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
2110
306
    if (num > 0) {
2111
168
        bytes = (size_t) num << bcf_type_shift[type];
2112
168
        if (((1 << type) & is_integer) == 0) {
2113
68
            hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "FILTER", type, get_type_name(type));
2114
68
            err |= BCF_ERR_TAG_INVALID;
2115
68
            if (end - ptr < bytes) goto bad_shared;
2116
62
            ptr += bytes;
2117
100
        } else {
2118
100
            if (end - ptr < bytes) goto bad_shared;
2119
3.72k
            for (i = 0; i < num; i++) {
2120
3.62k
                int32_t key = bcf_dec_int1(ptr, type, &ptr);
2121
3.62k
                if (key < 0
2122
2.77k
                    || (hdr && (key >= max_id
2123
2.51k
                                || hdr->id[BCF_DT_ID][key].key == NULL))) {
2124
2.51k
                    if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
2125
88
                        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s id %d", bcf_seqname_safe(hdr,rec), rec->pos+1, "FILTER", key);
2126
2.51k
                    err |= BCF_ERR_TAG_UNDEF;
2127
2.51k
                }
2128
3.62k
            }
2129
98
        }
2130
168
    }
2131
2132
    // Check INFO
2133
298
    reports = 0;
2134
298
    bcf_idpair_t *id_tmp = hdr ? hdr->id[BCF_DT_ID] : NULL;
2135
2.30k
    for (i = 0; i < rec->n_info; i++) {
2136
2.14k
        int32_t key = -1;
2137
2.14k
        if (bcf_dec_typed_int1_safe(ptr, end, &ptr, &key) != 0) goto bad_shared;
2138
2.04k
        if (key < 0 || (hdr && (key >= max_id
2139
1.60k
                                || id_tmp[key].key == NULL))) {
2140
1.60k
            if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
2141
92
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s id %d", bcf_seqname_safe(hdr,rec), rec->pos+1, "INFO", key);
2142
1.60k
            err |= BCF_ERR_TAG_UNDEF;
2143
1.60k
        }
2144
2.04k
        if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
2145
2.01k
        if (((1 << type) & is_valid_type) == 0
2146
1.95k
            || (type == BCF_BT_NULL && num > 0)) {
2147
80
            if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
2148
12
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "INFO", type, get_type_name(type));
2149
80
            err |= BCF_ERR_TAG_INVALID;
2150
80
        }
2151
2.01k
        bytes = (size_t) num << bcf_type_shift[type];
2152
2.01k
        if (end - ptr < bytes) goto bad_shared;
2153
2.00k
        ptr += bytes;
2154
2.00k
    }
2155
2156
    // Check FORMAT and individual information
2157
162
    ptr = (uint8_t *) rec->indiv.s;
2158
162
    end = ptr + rec->indiv.l;
2159
162
    reports = 0;
2160
2.34k
    for (i = 0; i < rec->n_fmt; i++) {
2161
2.28k
        int32_t key = -1;
2162
2.28k
        if (bcf_dec_typed_int1_safe(ptr, end, &ptr, &key) != 0) goto bad_indiv;
2163
2.25k
        if (key < 0
2164
2.13k
            || (hdr && (key >= max_id
2165
2.09k
                        || id_tmp[key].key == NULL))) {
2166
2.09k
            bcf_record_check_err(hdr, rec, "id", &reports, key);
2167
2.09k
            err |= BCF_ERR_TAG_UNDEF;
2168
2.09k
        }
2169
2.25k
        if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_indiv;
2170
2.22k
        if (((1 << type) & is_valid_type) == 0
2171
2.12k
            || (type == BCF_BT_NULL && num > 0)) {
2172
154
            bcf_record_check_err(hdr, rec, "type", &reports, type);
2173
154
            err |= BCF_ERR_TAG_INVALID;
2174
154
        }
2175
2.22k
        if (idgt >= 0 && idgt == key) {
2176
            // check first GT phasing bit and fix up if necessary
2177
0
            if (updatephasing(ptr, end, &ptr, rec->n_sample, num, type)) {
2178
0
                err |= BCF_ERR_TAG_INVALID;
2179
0
            }
2180
2.22k
        } else {
2181
2.22k
            bytes = ((size_t) num << bcf_type_shift[type]) * rec->n_sample;
2182
2.22k
            if (end - ptr < bytes) goto bad_indiv;
2183
2.18k
            ptr += bytes;
2184
2.18k
        }
2185
2.22k
    }
2186
2187
64
    if (!err && rec->rlen < 0) {
2188
        // Treat bad rlen as a warning instead of an error, and try to
2189
        // fix up by using the length of the stored REF allele.
2190
16
        static int warned = 0;
2191
16
        if (!warned) {
2192
1
            hts_log_warning("BCF record at %s:%"PRIhts_pos" has invalid RLEN (%"PRIhts_pos"). "
2193
1
                            "Only one invalid RLEN will be reported.",
2194
1
                            bcf_seqname_safe(hdr,rec), rec->pos+1, rec->rlen);
2195
1
            warned = 1;
2196
1
        }
2197
        //find rlen considering reflen, END, SVLEN, fmt LEN
2198
16
        hts_pos_t len = get_rlen(hdr, rec);
2199
16
        rec->rlen = len >= 0 ? len : 0;
2200
16
    }
2201
2202
64
    rec->errcode |= err;
2203
2204
64
    return err ? -2 : 0; // Return -2 so bcf_read() reports an error
2205
2206
242
 bad_shared:
2207
242
    hts_log_error("Bad BCF record at %s:%"PRIhts_pos" - shared section malformed or too short", bcf_seqname_safe(hdr,rec), rec->pos+1);
2208
242
    return -2;
2209
2210
98
 bad_indiv:
2211
98
    hts_log_error("Bad BCF record at %s:%"PRIhts_pos" - individuals section malformed or too short", bcf_seqname_safe(hdr,rec), rec->pos+1);
2212
98
    return -2;
2213
162
}
2214
2215
static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt);
2216
int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec)
2217
0
{
2218
0
    if ( !hdr->keep_samples ) return 0;
2219
0
    if ( !bcf_hdr_nsamples(hdr) )
2220
0
    {
2221
0
        rec->indiv.l = rec->n_sample = 0;
2222
0
        return 0;
2223
0
    }
2224
2225
0
    int i, j;
2226
0
    uint8_t *ptr = (uint8_t*)rec->indiv.s, *dst = NULL, *src;
2227
0
    bcf_dec_t *dec = &rec->d;
2228
0
    hts_expand(bcf_fmt_t, rec->n_fmt, dec->m_fmt, dec->fmt);
2229
0
    for (i=0; i<dec->m_fmt; ++i) dec->fmt[i].p_free = 0;
2230
2231
0
    for (i=0; i<rec->n_fmt; i++)
2232
0
    {
2233
0
        ptr = bcf_unpack_fmt_core1(ptr, rec->n_sample, &dec->fmt[i]);
2234
0
        src = dec->fmt[i].p - dec->fmt[i].size;
2235
0
        if ( dst )
2236
0
        {
2237
0
            memmove(dec->fmt[i-1].p + dec->fmt[i-1].p_len, dec->fmt[i].p - dec->fmt[i].p_off, dec->fmt[i].p_off);
2238
0
            dec->fmt[i].p = dec->fmt[i-1].p + dec->fmt[i-1].p_len + dec->fmt[i].p_off;
2239
0
        }
2240
0
        dst = dec->fmt[i].p;
2241
0
        for (j=0; j<hdr->nsamples_ori; j++)
2242
0
        {
2243
0
            src += dec->fmt[i].size;
2244
0
            if ( !bit_array_test(hdr->keep_samples,j) ) continue;
2245
0
            memmove(dst, src, dec->fmt[i].size);
2246
0
            dst += dec->fmt[i].size;
2247
0
        }
2248
0
        rec->indiv.l -= dec->fmt[i].p_len - (dst - dec->fmt[i].p);
2249
0
        dec->fmt[i].p_len = dst - dec->fmt[i].p;
2250
0
    }
2251
0
    rec->unpacked |= BCF_UN_FMT;
2252
2253
0
    rec->n_sample = bcf_hdr_nsamples(hdr);
2254
0
    return 0;
2255
0
}
2256
2257
int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
2258
35.5k
{
2259
35.5k
    if (fp->format.format == vcf) return vcf_read(fp, h, v);
2260
472
    if (!h)
2261
0
        h = (const bcf_hdr_t *) bgzf_get_private_data(fp->fp.bgzf);
2262
472
    int ret = bcf_read1_core(fp->fp.bgzf, v);
2263
472
    if (ret == 0) ret = bcf_record_check(h, v);
2264
472
    if ( ret!=0 || !h->keep_samples ) return ret;
2265
0
    return bcf_subset_format(h,v);
2266
472
}
2267
2268
int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, hts_pos_t *beg, hts_pos_t *end)
2269
0
{
2270
0
    bcf1_t *v = (bcf1_t *) vv;
2271
0
    const bcf_hdr_t *hdr = (const bcf_hdr_t *) bgzf_get_private_data(fp);
2272
0
    int ret = bcf_read1_core(fp, v);
2273
0
    if (ret == 0) ret = bcf_record_check(hdr, v);
2274
0
    if (ret  >= 0)
2275
0
        *tid = v->rid, *beg = v->pos, *end = v->pos + v->rlen;
2276
0
    return ret;
2277
0
}
2278
2279
static inline int bcf1_sync_id(bcf1_t *line, kstring_t *str)
2280
0
{
2281
    // single typed string
2282
0
    if ( line->d.id && strcmp(line->d.id, ".") ) {
2283
0
        return bcf_enc_vchar(str, strlen(line->d.id), line->d.id);
2284
0
    } else {
2285
0
        return bcf_enc_size(str, 0, BCF_BT_CHAR);
2286
0
    }
2287
0
}
2288
static inline int bcf1_sync_alleles(bcf1_t *line, kstring_t *str)
2289
0
{
2290
    // list of typed strings
2291
0
    int i;
2292
0
    for (i=0; i<line->n_allele; i++) {
2293
0
        if (bcf_enc_vchar(str, strlen(line->d.allele[i]), line->d.allele[i]) < 0)
2294
0
            return -1;
2295
0
    }
2296
0
    if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
2297
0
    return 0;
2298
0
}
2299
static inline int bcf1_sync_filter(bcf1_t *line, kstring_t *str)
2300
0
{
2301
    // typed vector of integers
2302
0
    if ( line->d.n_flt ) {
2303
0
        return bcf_enc_vint(str, line->d.n_flt, line->d.flt, -1);
2304
0
    } else {
2305
0
        return bcf_enc_vint(str, 0, 0, -1);
2306
0
    }
2307
0
}
2308
2309
static inline int bcf1_sync_info(bcf1_t *line, kstring_t *str)
2310
0
{
2311
    // pairs of typed vectors
2312
0
    int i, irm = -1, e = 0;
2313
0
    for (i=0; i<line->n_info; i++)
2314
0
    {
2315
0
        bcf_info_t *info = &line->d.info[i];
2316
0
        if ( !info->vptr )
2317
0
        {
2318
            // marked for removal
2319
0
            if ( irm < 0 ) irm = i;
2320
0
            continue;
2321
0
        }
2322
0
        e |= kputsn_(info->vptr - info->vptr_off, info->vptr_len + info->vptr_off, str) < 0;
2323
0
        if ( irm >=0 )
2324
0
        {
2325
0
            bcf_info_t tmp = line->d.info[irm]; line->d.info[irm] = line->d.info[i]; line->d.info[i] = tmp;
2326
0
            while ( irm<=i && line->d.info[irm].vptr ) irm++;
2327
0
        }
2328
0
    }
2329
0
    if ( irm>=0 ) line->n_info = irm;
2330
0
    return e == 0 ? 0 : -1;
2331
0
}
2332
2333
static int bcf1_sync(bcf1_t *line)
2334
23
{
2335
23
    char *shared_ori = line->shared.s;
2336
23
    size_t prev_len;
2337
2338
23
    kstring_t tmp = {0,0,0};
2339
23
    if ( !line->shared.l )
2340
0
    {
2341
        // New line created via API, BCF data blocks do not exist. Get it ready for BCF output
2342
0
        tmp = line->shared;
2343
0
        bcf1_sync_id(line, &tmp);
2344
0
        line->unpack_size[0] = tmp.l; prev_len = tmp.l;
2345
2346
0
        bcf1_sync_alleles(line, &tmp);
2347
0
        line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
2348
2349
0
        bcf1_sync_filter(line, &tmp);
2350
0
        line->unpack_size[2] = tmp.l - prev_len;
2351
2352
0
        bcf1_sync_info(line, &tmp);
2353
0
        line->shared = tmp;
2354
0
    }
2355
23
    else if ( line->d.shared_dirty )
2356
0
    {
2357
        // The line was edited, update the BCF data block.
2358
2359
0
        if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line,BCF_UN_STR);
2360
2361
        // ptr_ori points to the original unchanged BCF data.
2362
0
        uint8_t *ptr_ori = (uint8_t *) line->shared.s;
2363
2364
        // ID: single typed string
2365
0
        if ( line->d.shared_dirty & BCF1_DIRTY_ID )
2366
0
            bcf1_sync_id(line, &tmp);
2367
0
        else
2368
0
            kputsn_(ptr_ori, line->unpack_size[0], &tmp);
2369
0
        ptr_ori += line->unpack_size[0];
2370
0
        line->unpack_size[0] = tmp.l; prev_len = tmp.l;
2371
2372
        // REF+ALT: list of typed strings
2373
0
        if ( line->d.shared_dirty & BCF1_DIRTY_ALS )
2374
0
            bcf1_sync_alleles(line, &tmp);
2375
0
        else
2376
0
        {
2377
0
            kputsn_(ptr_ori, line->unpack_size[1], &tmp);
2378
0
            if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
2379
0
        }
2380
0
        ptr_ori += line->unpack_size[1];
2381
0
        line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
2382
2383
0
        if ( line->unpacked & BCF_UN_FLT )
2384
0
        {
2385
            // FILTER: typed vector of integers
2386
0
            if ( line->d.shared_dirty & BCF1_DIRTY_FLT )
2387
0
                bcf1_sync_filter(line, &tmp);
2388
0
            else if ( line->d.n_flt )
2389
0
                kputsn_(ptr_ori, line->unpack_size[2], &tmp);
2390
0
            else
2391
0
                bcf_enc_vint(&tmp, 0, 0, -1);
2392
0
            ptr_ori += line->unpack_size[2];
2393
0
            line->unpack_size[2] = tmp.l - prev_len;
2394
2395
0
            if ( line->unpacked & BCF_UN_INFO )
2396
0
            {
2397
                // INFO: pairs of typed vectors
2398
0
                if ( line->d.shared_dirty & BCF1_DIRTY_INF )
2399
0
                {
2400
0
                    bcf1_sync_info(line, &tmp);
2401
0
                    ptr_ori = (uint8_t*)line->shared.s + line->shared.l;
2402
0
                }
2403
0
            }
2404
0
        }
2405
2406
0
        int size = line->shared.l - (size_t)ptr_ori + (size_t)line->shared.s;
2407
0
        if ( size ) kputsn_(ptr_ori, size, &tmp);
2408
2409
0
        free(line->shared.s);
2410
0
        line->shared = tmp;
2411
0
    }
2412
23
    if ( line->shared.s != shared_ori && line->unpacked & BCF_UN_INFO )
2413
0
    {
2414
        // Reallocated line->shared.s block invalidated line->d.info[].vptr pointers
2415
0
        size_t off_new = line->unpack_size[0] + line->unpack_size[1] + line->unpack_size[2];
2416
0
        int i;
2417
0
        for (i=0; i<line->n_info; i++)
2418
0
        {
2419
0
            uint8_t *vptr_free = line->d.info[i].vptr_free ? line->d.info[i].vptr - line->d.info[i].vptr_off : NULL;
2420
0
            line->d.info[i].vptr = (uint8_t*) line->shared.s + off_new + line->d.info[i].vptr_off;
2421
0
            off_new += line->d.info[i].vptr_len + line->d.info[i].vptr_off;
2422
0
            if ( vptr_free )
2423
0
            {
2424
0
                free(vptr_free);
2425
0
                line->d.info[i].vptr_free = 0;
2426
0
            }
2427
0
        }
2428
0
    }
2429
2430
23
    if ( line->n_sample && line->n_fmt && (!line->indiv.l || line->d.indiv_dirty) )
2431
0
    {
2432
        // The genotype fields changed or are not present
2433
0
        tmp.l = tmp.m = 0; tmp.s = NULL;
2434
0
        int i, irm = -1;
2435
0
        for (i=0; i<line->n_fmt; i++)
2436
0
        {
2437
0
            bcf_fmt_t *fmt = &line->d.fmt[i];
2438
0
            if ( !fmt->p )
2439
0
            {
2440
                // marked for removal
2441
0
                if ( irm < 0 ) irm = i;
2442
0
                continue;
2443
0
            }
2444
0
            kputsn_(fmt->p - fmt->p_off, fmt->p_len + fmt->p_off, &tmp);
2445
0
            if ( irm >=0 )
2446
0
            {
2447
0
                bcf_fmt_t tfmt = line->d.fmt[irm]; line->d.fmt[irm] = line->d.fmt[i]; line->d.fmt[i] = tfmt;
2448
0
                while ( irm<=i && line->d.fmt[irm].p ) irm++;
2449
0
            }
2450
2451
0
        }
2452
0
        if ( irm>=0 ) line->n_fmt = irm;
2453
0
        free(line->indiv.s);
2454
0
        line->indiv = tmp;
2455
2456
        // Reallocated line->indiv.s block invalidated line->d.fmt[].p pointers
2457
0
        size_t off_new = 0;
2458
0
        for (i=0; i<line->n_fmt; i++)
2459
0
        {
2460
0
            uint8_t *p_free = line->d.fmt[i].p_free ? line->d.fmt[i].p - line->d.fmt[i].p_off : NULL;
2461
0
            line->d.fmt[i].p = (uint8_t*) line->indiv.s + off_new + line->d.fmt[i].p_off;
2462
0
            off_new += line->d.fmt[i].p_len + line->d.fmt[i].p_off;
2463
0
            if ( p_free )
2464
0
            {
2465
0
                free(p_free);
2466
0
                line->d.fmt[i].p_free = 0;
2467
0
            }
2468
0
        }
2469
0
    }
2470
23
    if ( !line->n_sample ) line->n_fmt = 0;
2471
23
    line->d.shared_dirty = line->d.indiv_dirty = 0;
2472
23
    return 0;
2473
23
}
2474
2475
bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src)
2476
0
{
2477
0
    bcf1_sync(src);
2478
2479
0
    bcf_clear(dst);
2480
0
    dst->rid  = src->rid;
2481
0
    dst->pos  = src->pos;
2482
0
    dst->rlen = src->rlen;
2483
0
    dst->qual = src->qual;
2484
0
    dst->n_info = src->n_info; dst->n_allele = src->n_allele;
2485
0
    dst->n_fmt = src->n_fmt; dst->n_sample = src->n_sample;
2486
2487
0
    if ( dst->shared.m < src->shared.l )
2488
0
    {
2489
0
        dst->shared.s = (char*) realloc(dst->shared.s, src->shared.l);
2490
0
        dst->shared.m = src->shared.l;
2491
0
    }
2492
0
    dst->shared.l = src->shared.l;
2493
0
    memcpy(dst->shared.s,src->shared.s,dst->shared.l);
2494
2495
0
    if ( dst->indiv.m < src->indiv.l )
2496
0
    {
2497
0
        dst->indiv.s = (char*) realloc(dst->indiv.s, src->indiv.l);
2498
0
        dst->indiv.m = src->indiv.l;
2499
0
    }
2500
0
    dst->indiv.l = src->indiv.l;
2501
0
    memcpy(dst->indiv.s,src->indiv.s,dst->indiv.l);
2502
2503
0
    return dst;
2504
0
}
2505
bcf1_t *bcf_dup(bcf1_t *src)
2506
0
{
2507
0
    bcf1_t *out = bcf_init1();
2508
0
    return bcf_copy(out, src);
2509
0
}
2510
2511
int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v)
2512
32.8k
{
2513
32.8k
    if ( h->dirty ) {
2514
0
        if (bcf_hdr_sync(h) < 0) return -1;
2515
0
    }
2516
32.8k
    if ( bcf_hdr_nsamples(h)!=v->n_sample )
2517
84
    {
2518
84
        hts_log_error("Broken VCF record, the number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)",
2519
84
            bcf_seqname_safe(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h));
2520
84
        return -1;
2521
84
    }
2522
2523
32.7k
    if ( hfp->format.format == vcf || hfp->format.format == text_format )
2524
30.8k
        return vcf_write(hfp,h,v);
2525
2526
1.90k
    if ( v->errcode & ~BCF_ERR_LIMITS ) // todo: unsure about the other BCF_ERR_LIMITS branches in vcf_parse_format_alloc4()
2527
1.87k
    {
2528
        // vcf_parse1() encountered a new contig or tag, undeclared in the
2529
        // header.  At this point, the header must have been printed,
2530
        // proceeding would lead to a broken BCF file. Errors must be checked
2531
        // and cleared by the caller before we can proceed.
2532
1.87k
        char errdescription[1024] = "";
2533
1.87k
        hts_log_error("Unchecked error (%d %s) at %s:%"PRIhts_pos, v->errcode, bcf_strerror(v->errcode, errdescription, sizeof(errdescription)), bcf_seqname_safe(h,v), v->pos+1);
2534
1.87k
        return -1;
2535
1.87k
    }
2536
23
    bcf1_sync(v);   // check if the BCF record was modified
2537
2538
23
    if ( v->unpacked & BCF_IS_64BIT )
2539
0
    {
2540
0
        hts_log_error("Data at %s:%"PRIhts_pos" contains 64-bit values not representable in BCF. Please use VCF instead", bcf_seqname_safe(h,v), v->pos+1);
2541
0
        return -1;
2542
0
    }
2543
2544
23
    BGZF *fp = hfp->fp.bgzf;
2545
23
    uint8_t x[32];
2546
23
    u32_to_le(v->shared.l + 24, x); // to include six 32-bit integers
2547
23
    u32_to_le(v->indiv.l, x + 4);
2548
23
    i32_to_le(v->rid, x + 8);
2549
23
    u32_to_le(v->pos, x + 12);
2550
23
    u32_to_le(v->rlen, x + 16);
2551
23
    float_to_le(v->qual, x + 20);
2552
23
    u16_to_le(v->n_info, x + 24);
2553
23
    u16_to_le(v->n_allele, x + 26);
2554
23
    u32_to_le((uint32_t)v->n_fmt<<24 | (v->n_sample & 0xffffff), x + 28);
2555
23
    if ( bgzf_write(fp, x, 32) != 32 ) return -1;
2556
23
    if ( bgzf_write(fp, v->shared.s, v->shared.l) != v->shared.l ) return -1;
2557
23
    if ( bgzf_write(fp, v->indiv.s, v->indiv.l) != v->indiv.l ) return -1;
2558
2559
23
    if (hfp->idx) {
2560
0
        if (bgzf_idx_push(fp, hfp->idx, v->rid, v->pos, v->pos + v->rlen,
2561
0
                          bgzf_tell(fp), 1) < 0)
2562
0
            return -1;
2563
0
    }
2564
2565
23
    return 0;
2566
23
}
2567
2568
/**********************
2569
 *** VCF header I/O ***
2570
 **********************/
2571
2572
0
static int add_missing_contig_hrec(bcf_hdr_t *h, const char *name) {
2573
0
    bcf_hrec_t *hrec = calloc(1, sizeof(bcf_hrec_t));
2574
0
    int save_errno;
2575
0
    if (!hrec) goto fail;
2576
2577
0
    hrec->key = strdup("contig");
2578
0
    if (!hrec->key) goto fail;
2579
2580
0
    if (bcf_hrec_add_key(hrec, "ID", strlen("ID")) < 0) goto fail;
2581
0
    if (bcf_hrec_set_val(hrec, hrec->nkeys-1, name, strlen(name), 0) < 0)
2582
0
        goto fail;
2583
0
    if (bcf_hdr_add_hrec(h, hrec) < 0)
2584
0
        goto fail;
2585
0
    return 0;
2586
2587
0
 fail:
2588
0
    save_errno = errno;
2589
0
    hts_log_error("%s", strerror(errno));
2590
0
    if (hrec) bcf_hrec_destroy(hrec);
2591
0
    errno = save_errno;
2592
0
    return -1;
2593
0
}
2594
2595
bcf_hdr_t *vcf_hdr_read(htsFile *fp)
2596
5.75k
{
2597
5.75k
    kstring_t txt, *s = &fp->line;
2598
5.75k
    int ret;
2599
5.75k
    bcf_hdr_t *h;
2600
5.75k
    tbx_t *idx = NULL;
2601
5.75k
    const char **names = NULL;
2602
5.75k
    h = bcf_hdr_init("r");
2603
5.75k
    if (!h) {
2604
0
        hts_log_error("Failed to allocate bcf header");
2605
0
        return NULL;
2606
0
    }
2607
5.75k
    txt.l = txt.m = 0; txt.s = 0;
2608
93.0k
    while ((ret = hts_getline(fp, KS_SEP_LINE, s)) >= 0) {
2609
91.9k
        int e = 0;
2610
91.9k
        if (s->l == 0) continue;
2611
87.8k
        if (s->s[0] != '#') {
2612
18
            hts_log_error("No sample line");
2613
18
            goto error;
2614
18
        }
2615
87.8k
        if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here
2616
0
            kstring_t tmp = { 0, 0, NULL };
2617
0
            hFILE *f = hopen(fp->fn_aux, "r");
2618
0
            if (f == NULL) {
2619
0
                hts_log_error("Couldn't open \"%s\"", fp->fn_aux);
2620
0
                goto error;
2621
0
            }
2622
0
            while (tmp.l = 0, khgetline(&tmp, f) >= 0) {
2623
0
                char *tab = strchr(tmp.s, '\t');
2624
0
                if (tab == NULL) continue;
2625
0
                e |= (kputs("##contig=<ID=", &txt) < 0);
2626
0
                e |= (kputsn(tmp.s, tab - tmp.s, &txt) < 0);
2627
0
                e |= (kputs(",length=", &txt) < 0);
2628
0
                e |= (kputl(atol(tab), &txt) < 0);
2629
0
                e |= (kputsn(">\n", 2, &txt) < 0);
2630
0
            }
2631
0
            free(tmp.s);
2632
0
            if (hclose(f) != 0) {
2633
0
                hts_log_error("Error on closing %s", fp->fn_aux);
2634
0
                goto error;
2635
0
            }
2636
0
            if (e) goto error;
2637
0
        }
2638
87.8k
        if (kputsn(s->s, s->l, &txt) < 0) goto error;
2639
87.8k
        if (kputc('\n', &txt) < 0) goto error;
2640
87.8k
        if (s->s[1] != '#') break;
2641
87.8k
    }
2642
5.73k
    if ( ret < -1 ) goto error;
2643
5.72k
    if ( !txt.s )
2644
0
    {
2645
0
        hts_log_error("Could not read the header");
2646
0
        goto error;
2647
0
    }
2648
5.72k
    if ( bcf_hdr_parse(h, txt.s) < 0 ) goto error;
2649
2650
    // check tabix index, are all contigs listed in the header? add the missing ones
2651
4.24k
    idx = tbx_index_load3(fp->fn, NULL, HTS_IDX_SILENT_FAIL);
2652
4.24k
    if ( idx )
2653
0
    {
2654
0
        int i, n, need_sync = 0;
2655
0
        names = tbx_seqnames(idx, &n);
2656
0
        if (!names) goto error;
2657
0
        for (i=0; i<n; i++)
2658
0
        {
2659
0
            bcf_hrec_t *hrec = bcf_hdr_get_hrec(h, BCF_HL_CTG, "ID", (char*) names[i], NULL);
2660
0
            if ( hrec ) continue;
2661
0
            if (add_missing_contig_hrec(h, names[i]) < 0) goto error;
2662
0
            need_sync = 1;
2663
0
        }
2664
0
        if ( need_sync ) {
2665
0
            if (bcf_hdr_sync(h) < 0) goto error;
2666
0
        }
2667
0
        free(names);
2668
0
        tbx_destroy(idx);
2669
0
    }
2670
4.24k
    free(txt.s);
2671
4.24k
    return h;
2672
2673
1.50k
 error:
2674
1.50k
    if (idx) tbx_destroy(idx);
2675
1.50k
    free(names);
2676
1.50k
    free(txt.s);
2677
1.50k
    if (h) bcf_hdr_destroy(h);
2678
1.50k
    return NULL;
2679
4.24k
}
2680
2681
int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname)
2682
0
{
2683
0
    int i = 0, n = 0, save_errno;
2684
0
    char **lines = hts_readlines(fname, &n);
2685
0
    if ( !lines ) return 1;
2686
0
    for (i=0; i<n-1; i++)
2687
0
    {
2688
0
        int k;
2689
0
        bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,lines[i],&k);
2690
0
        if (!hrec) goto fail;
2691
0
        if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
2692
0
            bcf_hrec_destroy(hrec);
2693
0
            goto fail;
2694
0
        }
2695
0
        free(lines[i]);
2696
0
        lines[i] = NULL;
2697
0
    }
2698
0
    if (bcf_hdr_parse_sample_line(hdr, lines[n-1]) < 0) goto fail;
2699
0
    if (bcf_hdr_sync(hdr) < 0) goto fail;
2700
0
    free(lines[n-1]);
2701
0
    free(lines);
2702
0
    return 0;
2703
2704
0
 fail:
2705
0
    save_errno = errno;
2706
0
    for (; i < n; i++)
2707
0
        free(lines[i]);
2708
0
    free(lines);
2709
0
    errno = save_errno;
2710
0
    return 1;
2711
0
}
2712
2713
static int _bcf_hrec_format(const bcf_hrec_t *hrec, int is_bcf, kstring_t *str)
2714
15.2k
{
2715
15.2k
    uint32_t e = 0;
2716
15.2k
    if ( !hrec->value )
2717
8.42k
    {
2718
8.42k
        int j, nout = 0;
2719
8.42k
        e |= ksprintf(str, "##%s=<", hrec->key) < 0;
2720
33.4k
        for (j=0; j<hrec->nkeys; j++)
2721
24.9k
        {
2722
            // do not output IDX if output is VCF
2723
24.9k
            if ( !is_bcf && !strcmp("IDX",hrec->keys[j]) ) continue;
2724
21.1k
            if ( nout ) e |= kputc(',',str) < 0;
2725
21.1k
            e |= ksprintf(str,"%s=%s", hrec->keys[j], hrec->vals[j]) < 0;
2726
21.1k
            nout++;
2727
21.1k
        }
2728
8.42k
        e |= ksprintf(str,">\n") < 0;
2729
8.42k
    }
2730
6.81k
    else
2731
6.81k
        e |= ksprintf(str,"##%s=%s\n", hrec->key,hrec->value) < 0;
2732
2733
15.2k
    return e == 0 ? 0 : -1;
2734
15.2k
}
2735
2736
int bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str)
2737
0
{
2738
0
    return _bcf_hrec_format(hrec,0,str);
2739
0
}
2740
2741
int bcf_hdr_format(const bcf_hdr_t *hdr, int is_bcf, kstring_t *str)
2742
4.67k
{
2743
4.67k
    int i, r = 0;
2744
19.9k
    for (i=0; i<hdr->nhrec; i++)
2745
15.2k
        r |= _bcf_hrec_format(hdr->hrec[i], is_bcf, str) < 0;
2746
2747
4.67k
    r |= ksprintf(str, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO") < 0;
2748
4.67k
    if ( bcf_hdr_nsamples(hdr) )
2749
1.44k
    {
2750
1.44k
        r |= ksprintf(str, "\tFORMAT") < 0;
2751
8.97k
        for (i=0; i<bcf_hdr_nsamples(hdr); i++)
2752
7.53k
            r |= ksprintf(str, "\t%s", hdr->samples[i]) < 0;
2753
1.44k
    }
2754
4.67k
    r |= ksprintf(str, "\n") < 0;
2755
2756
4.67k
    return r ? -1 : 0;
2757
4.67k
}
2758
2759
char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len)
2760
0
{
2761
0
    kstring_t txt = {0,0,0};
2762
0
    if (bcf_hdr_format(hdr, is_bcf, &txt) < 0)
2763
0
        return NULL;
2764
0
    if ( len ) *len = txt.l;
2765
0
    return txt.s;
2766
0
}
2767
2768
const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *n)
2769
0
{
2770
0
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
2771
0
    int i, tid, m = kh_size(d);
2772
0
    const char **names = (const char**) calloc(m,sizeof(const char*));
2773
0
    if ( !names )
2774
0
    {
2775
0
        hts_log_error("Failed to allocate memory");
2776
0
        *n = 0;
2777
0
        return NULL;
2778
0
    }
2779
0
    khint_t k;
2780
0
    for (k=kh_begin(d); k<kh_end(d); k++)
2781
0
    {
2782
0
        if ( !kh_exist(d,k) ) continue;
2783
0
        if ( !kh_val(d, k).hrec[0] ) continue;  // removed via bcf_hdr_remove
2784
0
        tid = kh_val(d,k).id;
2785
0
        if ( tid >= m )
2786
0
        {
2787
            // This can happen after a contig has been removed from BCF header via bcf_hdr_remove()
2788
0
            if ( hts_resize(const char*, tid + 1, &m, &names, HTS_RESIZE_CLEAR)<0 )
2789
0
            {
2790
0
                hts_log_error("Failed to allocate memory");
2791
0
                *n = 0;
2792
0
                free(names);
2793
0
                return NULL;
2794
0
            }
2795
0
            m = tid + 1;
2796
0
        }
2797
0
        names[tid] = kh_key(d,k);
2798
0
    }
2799
    // ensure there are no gaps
2800
0
    for (i=0,tid=0; tid<m; i++,tid++)
2801
0
    {
2802
0
        while ( tid<m && !names[tid] ) tid++;
2803
0
        if ( tid==m ) break;
2804
0
        if ( i==tid ) continue;
2805
0
        names[i] = names[tid];
2806
0
        names[tid] = 0;
2807
0
    }
2808
0
    *n = i;
2809
0
    return names;
2810
0
}
2811
2812
int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h)
2813
2.33k
{
2814
2.33k
    kstring_t htxt = {0,0,0};
2815
2.33k
    if (bcf_hdr_format(h, 0, &htxt) < 0) {
2816
0
        free(htxt.s);
2817
0
        return -1;
2818
0
    }
2819
2.33k
    while (htxt.l && htxt.s[htxt.l-1] == '\0') --htxt.l; // kill trailing zeros
2820
2.33k
    int ret;
2821
2.33k
    if ( fp->format.compression!=no_compression ) {
2822
0
        ret = bgzf_write(fp->fp.bgzf, htxt.s, htxt.l);
2823
0
        if (bgzf_flush(fp->fp.bgzf) != 0) return -1;
2824
2.33k
    } else {
2825
2.33k
        ret = hwrite(fp->fp.hfile, htxt.s, htxt.l);
2826
2.33k
    }
2827
2.33k
    free(htxt.s);
2828
2.33k
    return ret<0 ? -1 : 0;
2829
2.33k
}
2830
2831
/***********************
2832
 *** Typed value I/O ***
2833
 ***********************/
2834
2835
int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
2836
213k
{
2837
213k
    int32_t max = INT32_MIN, min = INT32_MAX;
2838
213k
    int i;
2839
213k
    if (n <= 0) {
2840
2.75k
        return bcf_enc_size(s, 0, BCF_BT_NULL);
2841
210k
    } else if (n == 1) {
2842
29.9k
        return bcf_enc_int1(s, a[0]);
2843
180k
    } else {
2844
180k
        if (wsize <= 0) wsize = n;
2845
2846
        // Equivalent to:
2847
        // for (i = 0; i < n; ++i) {
2848
        //     if (a[i] == bcf_int32_missing || a[i] == bcf_int32_vector_end )
2849
        //         continue;
2850
        //     if (max < a[i]) max = a[i];
2851
        //     if (min > a[i]) min = a[i];
2852
        // }
2853
180k
        int max4[4] = {INT32_MIN, INT32_MIN, INT32_MIN, INT32_MIN};
2854
180k
        int min4[4] = {INT32_MAX, INT32_MAX, INT32_MAX, INT32_MAX};
2855
24.9M
        for (i = 0; i < (n&~3); i+=4) {
2856
            // bcf_int32_missing    == INT32_MIN and
2857
            // bcf_int32_vector_end == INT32_MIN+1.
2858
            // We skip these, but can mostly avoid explicit checking
2859
24.7M
            if (max4[0] < a[i+0]) max4[0] = a[i+0];
2860
24.7M
            if (max4[1] < a[i+1]) max4[1] = a[i+1];
2861
24.7M
            if (max4[2] < a[i+2]) max4[2] = a[i+2];
2862
24.7M
            if (max4[3] < a[i+3]) max4[3] = a[i+3];
2863
24.7M
            if (min4[0] > a[i+0] && a[i+0] > INT32_MIN+1) min4[0] = a[i+0];
2864
24.7M
            if (min4[1] > a[i+1] && a[i+1] > INT32_MIN+1) min4[1] = a[i+1];
2865
24.7M
            if (min4[2] > a[i+2] && a[i+2] > INT32_MIN+1) min4[2] = a[i+2];
2866
24.7M
            if (min4[3] > a[i+3] && a[i+3] > INT32_MIN+1) min4[3] = a[i+3];
2867
24.7M
        }
2868
180k
        min = min4[0];
2869
180k
        if (min > min4[1]) min = min4[1];
2870
180k
        if (min > min4[2]) min = min4[2];
2871
180k
        if (min > min4[3]) min = min4[3];
2872
180k
        max = max4[0];
2873
180k
        if (max < max4[1]) max = max4[1];
2874
180k
        if (max < max4[2]) max = max4[2];
2875
180k
        if (max < max4[3]) max = max4[3];
2876
430k
        for (; i < n; ++i) {
2877
249k
            if (max < a[i]) max = a[i];
2878
249k
            if (min > a[i] && a[i] > INT32_MIN+1) min = a[i];
2879
249k
        }
2880
2881
180k
        if (max <= BCF_MAX_BT_INT8 && min >= BCF_MIN_BT_INT8) {
2882
20.4k
            if (bcf_enc_size(s, wsize, BCF_BT_INT8) < 0 ||
2883
20.4k
                ks_resize(s, s->l + n) < 0)
2884
0
                return -1;
2885
20.4k
            uint8_t *p = (uint8_t *) s->s + s->l;
2886
4.25M
            for (i = 0; i < n; ++i, p++) {
2887
4.23M
                if ( a[i]==bcf_int32_vector_end )   *p = bcf_int8_vector_end;
2888
4.22M
                else if ( a[i]==bcf_int32_missing ) *p = bcf_int8_missing;
2889
105k
                else *p = a[i];
2890
4.23M
            }
2891
20.4k
            s->l += n;
2892
160k
        } else if (max <= BCF_MAX_BT_INT16 && min >= BCF_MIN_BT_INT16) {
2893
115k
            uint8_t *p;
2894
115k
            if (bcf_enc_size(s, wsize, BCF_BT_INT16) < 0 ||
2895
115k
                ks_resize(s, s->l + n * sizeof(int16_t)) < 0)
2896
0
                return -1;
2897
115k
            p = (uint8_t *) s->s + s->l;
2898
44.0M
            for (i = 0; i < n; ++i)
2899
43.9M
            {
2900
43.9M
                int16_t x;
2901
43.9M
                if ( a[i]==bcf_int32_vector_end ) x = bcf_int16_vector_end;
2902
43.9M
                else if ( a[i]==bcf_int32_missing ) x = bcf_int16_missing;
2903
875k
                else x = a[i];
2904
43.9M
                i16_to_le(x, p);
2905
43.9M
                p += sizeof(int16_t);
2906
43.9M
            }
2907
115k
            s->l += n * sizeof(int16_t);
2908
115k
        } else {
2909
45.2k
            uint8_t *p;
2910
45.2k
            if (bcf_enc_size(s, wsize, BCF_BT_INT32) < 0 ||
2911
45.2k
                ks_resize(s, s->l + n * sizeof(int32_t)) < 0)
2912
0
                return -1;
2913
45.2k
            p = (uint8_t *) s->s + s->l;
2914
51.1M
            for (i = 0; i < n; ++i) {
2915
51.0M
                i32_to_le(a[i], p);
2916
51.0M
                p += sizeof(int32_t);
2917
51.0M
            }
2918
45.2k
            s->l += n * sizeof(int32_t);
2919
45.2k
        }
2920
180k
    }
2921
2922
180k
    return 0;
2923
213k
}
2924
2925
#ifdef VCF_ALLOW_INT64
2926
static int bcf_enc_long1(kstring_t *s, int64_t x) {
2927
    uint32_t e = 0;
2928
    if (x <= BCF_MAX_BT_INT32 && x >= BCF_MIN_BT_INT32)
2929
        return bcf_enc_int1(s, x);
2930
    if (x == bcf_int64_vector_end) {
2931
        e |= bcf_enc_size(s, 1, BCF_BT_INT8);
2932
        e |= kputc(bcf_int8_vector_end, s) < 0;
2933
    } else if (x == bcf_int64_missing) {
2934
        e |= bcf_enc_size(s, 1, BCF_BT_INT8);
2935
        e |= kputc(bcf_int8_missing, s) < 0;
2936
    } else {
2937
        e |= bcf_enc_size(s, 1, BCF_BT_INT64);
2938
        e |= ks_expand(s, 8);
2939
        if (e == 0) { u64_to_le(x, (uint8_t *) s->s + s->l); s->l += 8; }
2940
    }
2941
    return e == 0 ? 0 : -1;
2942
}
2943
#endif
2944
2945
432k
static inline int serialize_float_array(kstring_t *s, size_t n, const float *a) {
2946
432k
    uint8_t *p;
2947
432k
    size_t i;
2948
432k
    size_t bytes = n * sizeof(float);
2949
2950
432k
    if (bytes / sizeof(float) != n) return -1;
2951
432k
    if (ks_resize(s, s->l + bytes) < 0) return -1;
2952
2953
432k
    p = (uint8_t *) s->s + s->l;
2954
81.5M
    for (i = 0; i < n; i++) {
2955
81.0M
        float_to_le(a[i], p);
2956
81.0M
        p += sizeof(float);
2957
81.0M
    }
2958
432k
    s->l += bytes;
2959
2960
432k
    return 0;
2961
432k
}
2962
2963
int bcf_enc_vfloat(kstring_t *s, int n, float *a)
2964
432k
{
2965
432k
    assert(n >= 0);
2966
432k
    bcf_enc_size(s, n, BCF_BT_FLOAT);
2967
432k
    serialize_float_array(s, n, a);
2968
432k
    return 0; // FIXME: check for errs in this function
2969
432k
}
2970
2971
int bcf_enc_vchar(kstring_t *s, int l, const char *a)
2972
2.78M
{
2973
2.78M
    bcf_enc_size(s, l, BCF_BT_CHAR);
2974
2.78M
    kputsn(a, l, s);
2975
2.78M
    return 0; // FIXME: check for errs in this function
2976
2.78M
}
2977
2978
// Special case of n==1 as it also occurs quite often in FORMAT data.
2979
// This version is also small enough to get inlined.
2980
5.87k
static inline int bcf_fmt_array1(kstring_t *s, int type, void *data) {
2981
5.87k
    uint32_t e = 0;
2982
5.87k
    uint8_t *p = (uint8_t *)data;
2983
5.87k
    int32_t v;
2984
2985
    // helps gcc more than clang here. In billions of cycles:
2986
    //          bcf_fmt_array1  bcf_fmt_array
2987
    // gcc7:    23.2            24.3
2988
    // gcc13:   21.6            23.0
2989
    // clang13: 27.1            27.8
2990
5.87k
    switch (type) {
2991
5.87k
    case BCF_BT_CHAR:
2992
5.87k
        e |= kputc_(*p == bcf_str_missing ? '.' : *p, s) < 0;
2993
5.87k
        break;
2994
2995
0
    case BCF_BT_INT8:
2996
0
        if (*(int8_t *)p != bcf_int8_vector_end) {
2997
0
            e |= ((*(int8_t *)p == bcf_int8_missing)
2998
0
                  ? kputc_('.', s)
2999
0
                  : kputw(*(int8_t *)p, s)) < 0;
3000
0
        }
3001
0
        break;
3002
0
    case BCF_BT_INT16:
3003
0
        v = le_to_i16(p);
3004
0
        if (v != bcf_int16_vector_end) {
3005
0
            e |= (v == bcf_int16_missing
3006
0
                  ? kputc_('.', s)
3007
0
                  : kputw(v, s)) < 0;
3008
0
        }
3009
0
        break;
3010
3011
0
    case BCF_BT_INT32:
3012
0
        v = le_to_i32(p);
3013
0
        if (v != bcf_int32_vector_end) {
3014
0
            e |= (v == bcf_int32_missing
3015
0
                  ? kputc_('.', s)
3016
0
                  : kputw(v, s)) < 0;
3017
0
        }
3018
0
        break;
3019
3020
0
    case BCF_BT_FLOAT:
3021
0
        v = le_to_u32(p);
3022
0
        if (v != bcf_float_vector_end) {
3023
0
            e |= (v == bcf_float_missing
3024
0
                  ? kputc_('.', s)
3025
0
                  : kputd(le_to_float(p), s)) < 0;
3026
0
        }
3027
0
        break;
3028
3029
0
    default:
3030
0
        hts_log_error("Unexpected type %d", type);
3031
0
        return -1;
3032
5.87k
    }
3033
3034
5.87k
    return e == 0 ? 0 : -1;
3035
5.87k
}
3036
3037
int bcf_fmt_array(kstring_t *s, int n, int type, void *data)
3038
2.33M
{
3039
2.33M
    int j = 0;
3040
2.33M
    uint32_t e = 0;
3041
2.33M
    if (n == 0) {
3042
1.40M
        return kputc_('.', s) >= 0 ? 0 : -1;
3043
1.40M
    }
3044
3045
925k
    if (type == BCF_BT_CHAR)
3046
284k
    {
3047
284k
        char *p = (char *)data;
3048
3049
        // Note bcf_str_missing is already accounted for in n==0 above.
3050
284k
        if (n >= 8) {
3051
66.7k
            char *p_end = memchr(p, 0, n);
3052
66.7k
            e |= kputsn(p, p_end ? p_end-p : n, s) < 0;
3053
218k
        } else {
3054
771k
            for (j = 0; j < n && *p; ++j, ++p)
3055
553k
               e |= kputc(*p, s) < 0;
3056
218k
        }
3057
284k
    }
3058
640k
    else
3059
640k
    {
3060
640k
        #define BRANCH(type_t, convert, is_missing, is_vector_end, kprint) { \
3061
640k
            uint8_t *p = (uint8_t *) data; \
3062
89.1M
            for (j=0; j<n; j++, p += sizeof(type_t))    \
3063
88.5M
            { \
3064
88.5M
                type_t v = convert(p); \
3065
88.5M
                if ( is_vector_end ) break; \
3066
88.5M
                if ( j ) e |= kputc_(',', s) < 0; \
3067
88.5M
                e |= (is_missing ? kputc('.', s) : kprint) < 0; \
3068
88.5M
            } \
3069
640k
        }
3070
640k
        switch (type) {
3071
180k
            case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8, v==bcf_int8_missing,  v==bcf_int8_vector_end,  kputw(v, s)); break;
3072
116k
            case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, v==bcf_int16_missing, v==bcf_int16_vector_end, kputw(v, s)); break;
3073
128k
            case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, v==bcf_int32_missing, v==bcf_int32_vector_end, kputw(v, s)); break;
3074
215k
            case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, v==bcf_float_missing, v==bcf_float_vector_end, kputd(le_to_float(p), s)); break;
3075
0
            default: hts_log_error("Unexpected type %d", type); exit(1); break;
3076
640k
        }
3077
640k
        #undef BRANCH
3078
640k
    }
3079
925k
    return e == 0 ? 0 : -1;
3080
925k
}
3081
3082
uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr)
3083
1.56M
{
3084
1.56M
    int x, type;
3085
1.56M
    x = bcf_dec_size(ptr, &ptr, &type);
3086
1.56M
    bcf_fmt_array(s, x, type, ptr);
3087
1.56M
    return ptr + (x << bcf_type_shift[type]);
3088
1.56M
}
3089
3090
/********************
3091
 *** VCF site I/O ***
3092
 ********************/
3093
3094
typedef struct {
3095
    int key;            // Key for h->id[BCF_DT_ID][key] vdict
3096
    int max_m;          // number of elements in field array (ie commas)
3097
    int size;           // field size (max_l or max_g*4 if is_gt)
3098
    int offset;         // offset of buf into h->mem
3099
    uint32_t is_gt:1,   // is genotype
3100
             max_g:31;  // maximum number of genotypes
3101
    uint32_t max_l;     // length of field
3102
    uint32_t y;         // h->id[0][fmt[j].key].val->info[BCF_HL_FMT]
3103
    uint8_t *buf;       // Pointer into h->mem
3104
} fmt_aux_t;
3105
3106
// fmt_aux_t field notes:
3107
// max_* are biggest sizes of the various FORMAT fields across all samples.
3108
// We use these after pivoting the data to ensure easy random access
3109
// of a specific sample.
3110
//
3111
// max_m is only used for type BCF_HT_REAL or BCF_HT_INT
3112
// max_g is only used for is_gt == 1 (will be BCF_HT_STR)
3113
// max_l is only used for is_gt == 0 (will be BCF_HT_STR)
3114
//
3115
// These are computed in vcf_parse_format_max3 and used in
3116
// vcf_parse_format_alloc4 to get the size.
3117
//
3118
// size is computed from max_g, max_l, max_m and is_gt.  Once computed
3119
// the max values are never accessed again.
3120
//
3121
// In theory all 4 vars could be coalesced into a single variable, but this
3122
// significantly harms speed (even if done via a union).  It's about 25-30%
3123
// slower.
3124
3125
static inline int align_mem(kstring_t *s)
3126
81.7k
{
3127
81.7k
    int e = 0;
3128
81.7k
    if (s->l&7) {
3129
10.7k
        uint64_t zero = 0;
3130
10.7k
        e = kputsn((char*)&zero, 8 - (s->l&7), s) < 0;
3131
10.7k
    }
3132
81.7k
    return e == 0 ? 0 : -1;
3133
81.7k
}
3134
3135
82.9k
#define MAX_N_FMT 255   /* Limited by size of bcf1_t n_fmt field */
3136
3137
// detect FORMAT "."
3138
static int vcf_parse_format_empty1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3139
10.9k
                                   const char *p, const char *q) {
3140
10.9k
    const char *end = s->s + s->l;
3141
10.9k
    if ( q>=end )
3142
57
    {
3143
57
        hts_log_error("FORMAT column with no sample columns starting at %s:%"PRIhts_pos"", bcf_seqname_safe(h,v), v->pos+1);
3144
57
        v->errcode |= BCF_ERR_NCOLS;
3145
57
        return -1;
3146
57
    }
3147
3148
10.8k
    v->n_fmt = 0;
3149
10.8k
    if ( p[0]=='.' && p[1]==0 ) // FORMAT field is empty "."
3150
242
    {
3151
242
        v->n_sample = bcf_hdr_nsamples(h);
3152
242
        return 1;
3153
242
    }
3154
3155
10.6k
    return 0;
3156
10.8k
}
3157
3158
// get format information from the dictionary
3159
static int vcf_parse_format_dict2(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3160
10.6k
                                  const char *p, const char *q, fmt_aux_t *fmt) {
3161
10.6k
    const vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
3162
10.6k
    char *t;
3163
10.6k
    int j;
3164
10.6k
    ks_tokaux_t aux1;
3165
3166
93.4k
    for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) {
3167
82.9k
        if (j >= MAX_N_FMT) {
3168
3
            v->errcode |= BCF_ERR_LIMITS;
3169
3
            hts_log_error("FORMAT column at %s:%"PRIhts_pos" lists more identifiers than htslib can handle",
3170
3
                bcf_seqname_safe(h,v), v->pos+1);
3171
3
            return -1;
3172
3
        }
3173
3174
82.8k
        *(char*)aux1.p = 0;
3175
82.8k
        khint_t k = kh_get(vdict, d, t);
3176
82.8k
        if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_FMT] == 15) {
3177
6.16k
            if ( t[0]=='.' && t[1]==0 )
3178
3
            {
3179
3
                hts_log_error("Invalid FORMAT tag name '.' at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3180
3
                v->errcode |= BCF_ERR_TAG_INVALID;
3181
3
                return -1;
3182
3
            }
3183
6.16k
            hts_log_warning("FORMAT '%s' at %s:%"PRIhts_pos" is not defined in the header, assuming Type=String", t, bcf_seqname_safe(h,v), v->pos+1);
3184
6.16k
            if ((v->errcode & (BCF_ERR_TAG_UNDEF|BCF_ERR_CTG_UNDEF)) == 0)
3185
386
                hts_log_warning("Missing headers may cause later processing to fail");
3186
6.16k
            kstring_t tmp = {0,0,0};
3187
6.16k
            int l;
3188
6.16k
            ksprintf(&tmp, "##FORMAT=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", t);
3189
6.16k
            bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
3190
6.16k
            free(tmp.s);
3191
6.16k
            int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
3192
6.16k
            if (res < 0) bcf_hrec_destroy(hrec);
3193
6.16k
            if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
3194
3195
6.16k
            k = kh_get(vdict, d, t);
3196
6.16k
            v->errcode |= BCF_ERR_TAG_UNDEF;
3197
6.16k
            if (res || k == kh_end(d)) {
3198
12
                hts_log_error("Could not add dummy header for FORMAT '%s' at %s:%"PRIhts_pos, t, bcf_seqname_safe(h,v), v->pos+1);
3199
12
                v->errcode |= BCF_ERR_TAG_INVALID;
3200
12
                return -1;
3201
12
            }
3202
6.16k
        }
3203
82.8k
        fmt[j].max_l = fmt[j].max_m = fmt[j].max_g = 0;
3204
82.8k
        fmt[j].key = kh_val(d, k).id;
3205
82.8k
        fmt[j].is_gt = (t[0] == 'G' && t[1] == 'T' && !t[2]);
3206
82.8k
        fmt[j].y = h->id[0][fmt[j].key].val->info[BCF_HL_FMT];
3207
82.8k
        v->n_fmt++;
3208
82.8k
    }
3209
10.5k
    return 0;
3210
10.6k
}
3211
3212
// compute max
3213
static int vcf_parse_format_max3(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3214
10.5k
                                 char *p, char *q, fmt_aux_t *fmt) {
3215
10.5k
    int n_sample_ori = -1;
3216
10.5k
    char *r = q + 1;  // r: position in the format string
3217
10.5k
    int l = 0, m = 1, g = 1, j;
3218
10.5k
    v->n_sample = 0;  // m: max vector size, l: max field len, g: max number of alleles
3219
10.5k
    const char *end = s->s + s->l;
3220
3221
27.8k
    while ( r<end )
3222
27.7k
    {
3223
        // can we skip some samples?
3224
27.7k
        if ( h->keep_samples )
3225
0
        {
3226
0
            n_sample_ori++;
3227
0
            if ( !bit_array_test(h->keep_samples,n_sample_ori) )
3228
0
            {
3229
0
                while ( *r!='\t' && r<end ) r++;
3230
0
                if ( *r=='\t' ) { *r = 0; r++; }
3231
0
                continue;
3232
0
            }
3233
0
        }
3234
3235
        // collect fmt stats: max vector size, length, number of alleles
3236
27.7k
        j = 0;  // j-th format field
3237
27.7k
        fmt_aux_t *f = fmt;
3238
27.7k
        static char meta[256] = {
3239
            // \0 \t , / : |
3240
27.7k
            1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3241
27.7k
            0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1, 0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,
3242
27.7k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3243
27.7k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
3244
27.7k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3245
27.7k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3246
27.7k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3247
27.7k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3248
27.7k
        };
3249
3250
27.7k
        char *r_start = r;
3251
2.69M
        for (;;) {
3252
            // Quickly skip ahead to an appropriate meta-character
3253
3.37M
            while (!meta[(unsigned char)*r]) r++;
3254
3255
2.69M
            switch (*r) {
3256
2.63M
            case ',':
3257
2.63M
                m++;
3258
2.63M
                break;
3259
3260
1.62k
            case '|':
3261
18.8k
            case '/':
3262
18.8k
                if (f->is_gt) g++;
3263
18.8k
                break;
3264
3265
13.0k
            case '\t':
3266
13.0k
                *r = 0; // fall through
3267
3268
13.0k
            default: // valid due to while loop above.
3269
27.7k
            case '\0':
3270
44.1k
            case ':':
3271
44.1k
                l = r - r_start; r_start = r;
3272
44.1k
                if (f->max_m < m) f->max_m = m;
3273
44.1k
                if (f->max_l < l) f->max_l = l;
3274
44.1k
                if (f->is_gt && f->max_g < g) f->max_g = g;
3275
44.1k
                l = 0, m = g = 1;
3276
44.1k
                if ( *r==':' ) {
3277
16.4k
                    j++; f++;
3278
16.4k
                    if ( j>=v->n_fmt ) {
3279
24
                        hts_log_error("Incorrect number of FORMAT fields at %s:%"PRIhts_pos"",
3280
24
                                      h->id[BCF_DT_CTG][v->rid].key, v->pos+1);
3281
24
                        v->errcode |= BCF_ERR_NCOLS;
3282
24
                        return -1;
3283
24
                    }
3284
27.7k
                } else goto end_for;
3285
16.3k
                break;
3286
2.69M
            }
3287
2.66M
            if ( r>=end ) break;
3288
2.66M
            r++;
3289
2.66M
        }
3290
27.7k
    end_for:
3291
27.7k
        v->n_sample++;
3292
27.7k
        if ( v->n_sample == bcf_hdr_nsamples(h) ) break;
3293
17.2k
        r++;
3294
17.2k
    }
3295
3296
10.5k
    return 0;
3297
10.5k
}
3298
3299
// allocate memory for arrays
3300
static int vcf_parse_format_alloc4(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3301
                                   const char *p, const char *q,
3302
10.5k
                                   fmt_aux_t *fmt) {
3303
10.5k
    kstring_t *mem = (kstring_t*)&h->mem;
3304
3305
10.5k
    int j;
3306
92.2k
    for (j = 0; j < v->n_fmt; ++j) {
3307
81.7k
        fmt_aux_t *f = &fmt[j];
3308
81.7k
        if ( !f->max_m ) f->max_m = 1;  // omitted trailing format field
3309
3310
81.7k
        if ((f->y>>4&0xf) == BCF_HT_STR) {
3311
81.7k
            f->size = f->is_gt? f->max_g << 2 : f->max_l;
3312
81.7k
        } else if ((f->y>>4&0xf) == BCF_HT_REAL || (f->y>>4&0xf) == BCF_HT_INT) {
3313
0
            f->size = f->max_m << 2;
3314
0
        } else {
3315
0
            hts_log_error("The format type %d at %s:%"PRIhts_pos" is currently not supported", f->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1);
3316
0
            v->errcode |= BCF_ERR_TAG_INVALID;
3317
0
            return -1;
3318
0
        }
3319
3320
81.7k
        if (align_mem(mem) < 0) {
3321
0
            hts_log_error("Memory allocation failure at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3322
0
            v->errcode |= BCF_ERR_LIMITS;
3323
0
            return -1;
3324
0
        }
3325
3326
        // Limit the total memory to ~2Gb per VCF row.  This should mean
3327
        // malformed VCF data is less likely to take excessive memory and/or
3328
        // time.
3329
81.7k
        if ((uint64_t) mem->l + v->n_sample * (uint64_t)f->size > INT_MAX) {
3330
0
            static int warned = 0;
3331
0
            if ( !warned ) hts_log_warning("Excessive memory required by FORMAT fields at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3332
0
            warned = 1;
3333
0
            v->errcode |= BCF_ERR_LIMITS;
3334
0
            f->size = -1;
3335
0
            f->offset = 0;
3336
0
            continue;
3337
0
        }
3338
3339
81.7k
        f->offset = mem->l;
3340
81.7k
        if (ks_resize(mem, mem->l + v->n_sample * (size_t)f->size) < 0) {
3341
0
            hts_log_error("Memory allocation failure at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3342
0
            v->errcode |= BCF_ERR_LIMITS;
3343
0
            return -1;
3344
0
        }
3345
81.7k
        mem->l += v->n_sample * f->size;
3346
81.7k
    }
3347
3348
10.5k
    {
3349
10.5k
        int j;
3350
92.2k
        for (j = 0; j < v->n_fmt; ++j)
3351
81.7k
            fmt[j].buf = (uint8_t*)mem->s + fmt[j].offset;
3352
10.5k
    }
3353
3354
    // check for duplicate tags
3355
10.5k
    int i;
3356
81.7k
    for (i=1; i<v->n_fmt; i++)
3357
71.1k
    {
3358
71.1k
        fmt_aux_t *ifmt = &fmt[i];
3359
71.1k
        if ( ifmt->size==-1 ) continue; // already marked for removal
3360
390k
        for (j=0; j<i; j++)
3361
374k
        {
3362
374k
            fmt_aux_t *jfmt = &fmt[j];
3363
374k
            if ( jfmt->size==-1 ) continue; // already marked for removal
3364
174k
            if ( ifmt->key!=jfmt->key ) continue;
3365
55.2k
            static int warned = 0;
3366
55.2k
            if ( !warned ) hts_log_warning("Duplicate FORMAT tag %s at %s:%"PRIhts_pos, bcf_hdr_int2id(h,BCF_DT_ID,ifmt->key), bcf_seqname_safe(h,v), v->pos+1);
3367
55.2k
            warned = 1;
3368
55.2k
            v->errcode |= BCF_ERR_TAG_INVALID;
3369
55.2k
            ifmt->size = -1;
3370
55.2k
            ifmt->offset = 0;
3371
55.2k
            break;
3372
174k
        }
3373
71.1k
    }
3374
10.5k
    return 0;
3375
10.5k
}
3376
3377
// Fill the sample fields
3378
static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3379
10.5k
                                  const char *p, const char *q, fmt_aux_t *fmt) {
3380
10.5k
    static int extreme_val_warned = 0;
3381
10.5k
    int n_sample_ori = -1;
3382
    // At beginning of the loop t points to the first char of a format
3383
10.5k
    const char *t = q + 1;
3384
10.5k
    int m = 0;   // m: sample id
3385
10.5k
    const int nsamples = bcf_hdr_nsamples(h);
3386
10.5k
    const char *end = s->s + s->l;
3387
3388
10.5k
    int ver = bcf_get_version(h, NULL);
3389
3390
38.0k
    while ( t<end )
3391
36.1k
    {
3392
        // can we skip some samples?
3393
36.1k
        if ( h->keep_samples )
3394
0
        {
3395
0
            n_sample_ori++;
3396
0
            if ( !bit_array_test(h->keep_samples,n_sample_ori) )
3397
0
            {
3398
0
                while ( *t && t<end ) t++;
3399
0
                t++;
3400
0
                continue;
3401
0
            }
3402
0
        }
3403
36.1k
        if ( m == nsamples ) break;
3404
3405
27.5k
        int j = 0; // j-th format field, m-th sample
3406
43.6k
        while ( t < end )
3407
43.1k
        {
3408
43.1k
            fmt_aux_t *z = &fmt[j++];
3409
43.1k
            const int htype = z->y>>4&0xf;
3410
43.1k
            if (!z->buf) {
3411
7
                hts_log_error("Memory allocation failure for FORMAT field type %d at %s:%"PRIhts_pos,
3412
7
                              z->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1);
3413
7
                v->errcode |= BCF_ERR_LIMITS;
3414
7
                return -1;
3415
7
            }
3416
3417
43.1k
            if ( z->size==-1 )
3418
5.42k
            {
3419
                // this field is to be ignored, it's either too big or a duplicate
3420
43.7k
                while ( *t != ':' && *t ) t++;
3421
5.42k
            }
3422
37.7k
            else if (htype == BCF_HT_STR) {
3423
37.7k
                int l;
3424
37.7k
                if (z->is_gt) {
3425
                    // Genotypes.
3426
                    //([/|])?<val>)([|/]<val>)+... where <val> is [0-9]+ or ".".
3427
6.40k
                    int32_t is_phased = 0;
3428
6.40k
                    uint32_t *x = (uint32_t*)(z->buf + z->size * (size_t)m);
3429
6.40k
                    uint32_t unreadable = 0;
3430
6.40k
                    uint32_t max = 0;
3431
6.40k
                    int overflow = 0, ploidy = 0, anyunphased = 0, \
3432
6.40k
                        phasingprfx = 0, unknown1 = 0;
3433
3434
                    /* with prefixed phasing, it is explicitly given for 1st one
3435
                    with non-prefixed, set based on ploidy and phasing of other
3436
                    alleles. */
3437
6.40k
                    if (ver >= VCF44 && (*t == '|' || *t == '/')) {
3438
                        // cache prefix and phasing status
3439
753
                        is_phased = *t++ == '|';
3440
753
                        phasingprfx = 1;
3441
753
                    }
3442
3443
22.0k
                    for (l = 0;; ++t) {
3444
22.0k
                        ploidy++;
3445
22.0k
                        if (*t == '.') {
3446
4.16k
                            ++t, x[l++] = is_phased;
3447
4.16k
                            if (l==1) {   //for 1st allele only
3448
811
                                unknown1 = 1;
3449
811
                            }
3450
17.9k
                        } else {
3451
17.9k
                            const char *tt = t;
3452
17.9k
                            uint32_t val;
3453
                            // Or "v->n_allele < 10", but it doesn't
3454
                            // seem to be any faster and this feels safer.
3455
17.9k
                            if (*t >= '0' && *t <= '9' &&
3456
17.0k
                                !(t[1] >= '0' && t[1] <= '9')) {
3457
9.19k
                                val = *t++ - '0';
3458
9.19k
                            } else {
3459
8.71k
                                val = hts_str2uint(t, (char **)&t,
3460
8.71k
                                                   sizeof(val) * CHAR_MAX - 2,
3461
8.71k
                                                   &overflow);
3462
8.71k
                                unreadable |= tt == t;
3463
8.71k
                            }
3464
17.9k
                            if (max < val) max = val;
3465
17.9k
                            x[l++] = (val + 1) << 1 | is_phased;
3466
17.9k
                        }
3467
22.0k
                        anyunphased |= (ploidy != 1) && !is_phased;
3468
22.0k
                        is_phased = (*t == '|');
3469
22.0k
                        if (*t != '|' && *t != '/') break;
3470
22.0k
                    }
3471
6.40k
                    if (!phasingprfx) { //get GT in v44 way when no prefixed phasing
3472
                        /* no explicit phasing for 1st allele, set based on
3473
                         other alleles and ploidy */
3474
5.65k
                        if (ploidy == 1) {  //implicitly phased
3475
1.31k
                            if (!unknown1) {
3476
810
                                x[0] |= 1;
3477
810
                            }
3478
4.34k
                        } else {            //set by other unphased alleles
3479
4.34k
                            x[0] |= (anyunphased)? 0 : 1;
3480
4.34k
                        }
3481
5.65k
                    }
3482
                    // Possibly check max against v->n_allele instead?
3483
6.40k
                    if (overflow || max > (INT32_MAX >> 1) - 1) {
3484
60
                        hts_log_error("Couldn't read GT data: value too large at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3485
60
                        return -1;
3486
60
                    }
3487
6.34k
                    if (unreadable) {
3488
43
                        hts_log_error("Couldn't read GT data: value not a number or '.' at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3489
43
                        return -1;
3490
43
                    }
3491
6.30k
                    if ( !l ) x[l++] = 0;   // An empty field, insert missing value
3492
8.88k
                    for (; l < z->size>>2; ++l)
3493
2.58k
                        x[l] = bcf_int32_vector_end;
3494
3495
31.2k
                } else {
3496
                    // Otherwise arbitrary strings
3497
31.2k
                    char *x = (char*)z->buf + z->size * (size_t)m;
3498
2.91M
                    for (l = 0; *t != ':' && *t; ++t)
3499
2.88M
                        x[l++] = *t;
3500
31.2k
                    if (z->size > l)
3501
18.2k
                        memset(&x[l], 0, (z->size-l) * sizeof(*x));
3502
31.2k
                }
3503
3504
37.7k
            } else if (htype == BCF_HT_INT) {
3505
                // One or more integers in an array
3506
0
                int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
3507
0
                int l;
3508
0
                for (l = 0;; ++t) {
3509
0
                    if (*t == '.') {
3510
0
                        x[l++] = bcf_int32_missing, ++t; // ++t to skip "."
3511
0
                    } else {
3512
0
                        int overflow = 0;
3513
0
                        char *te;
3514
0
                        long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
3515
0
                        if ( te==t || overflow || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 )
3516
0
                        {
3517
0
                            if ( !extreme_val_warned )
3518
0
                            {
3519
0
                                hts_log_warning("Extreme FORMAT/%s value encountered and set to missing at %s:%"PRIhts_pos,
3520
0
                                                h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname_safe(h,v), v->pos+1);
3521
0
                                extreme_val_warned = 1;
3522
0
                            }
3523
0
                            tmp_val = bcf_int32_missing;
3524
0
                        }
3525
0
                        x[l++] = tmp_val;
3526
0
                        t = te;
3527
0
                    }
3528
0
                    if (*t != ',') break;
3529
0
                }
3530
0
                if ( !l )
3531
0
                    x[l++] = bcf_int32_missing;
3532
0
                for (; l < z->size>>2; ++l)
3533
0
                    x[l] = bcf_int32_vector_end;
3534
3535
0
            } else if (htype == BCF_HT_REAL) {
3536
                // One of more floating point values in an array
3537
0
                float *x = (float*)(z->buf + z->size * (size_t)m);
3538
0
                int l;
3539
0
                for (l = 0;; ++t) {
3540
0
                    if (*t == '.' && !isdigit_c(t[1])) {
3541
0
                        bcf_float_set_missing(x[l++]), ++t; // ++t to skip "."
3542
0
                    } else {
3543
0
                        int overflow = 0;
3544
0
                        char *te;
3545
0
                        float tmp_val = hts_str2dbl(t, &te, &overflow);
3546
0
                        if ( (te==t || overflow) && !extreme_val_warned )
3547
0
                        {
3548
0
                            hts_log_warning("Extreme FORMAT/%s value encountered at %s:%"PRIhts_pos, h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname(h,v), v->pos+1);
3549
0
                            extreme_val_warned = 1;
3550
0
                        }
3551
0
                        x[l++] = tmp_val;
3552
0
                        t = te;
3553
0
                    }
3554
0
                    if (*t != ',') break;
3555
0
                }
3556
0
                if ( !l )
3557
                    // An empty field, insert missing value
3558
0
                    bcf_float_set_missing(x[l++]);
3559
0
                for (; l < z->size>>2; ++l)
3560
0
                    bcf_float_set_vector_end(x[l]);
3561
0
            } else {
3562
0
                hts_log_error("Unknown FORMAT field type %d at %s:%"PRIhts_pos, htype, bcf_seqname_safe(h,v), v->pos+1);
3563
0
                v->errcode |= BCF_ERR_TAG_INVALID;
3564
0
                return -1;
3565
0
            }
3566
3567
43.0k
            if (*t == '\0') {
3568
26.9k
                break;
3569
26.9k
            }
3570
16.1k
            else if (*t == ':') {
3571
16.0k
                t++;
3572
16.0k
            }
3573
25
            else {
3574
25
                char buffer[8];
3575
25
                hts_log_error("Invalid character %s in '%s' FORMAT field at %s:%"PRIhts_pos"",
3576
25
                    hts_strprint(buffer, sizeof buffer, '\'', t, 1),
3577
25
                    h->id[BCF_DT_ID][z->key].key, bcf_seqname_safe(h,v), v->pos+1);
3578
25
                v->errcode |= BCF_ERR_CHAR;
3579
25
                return -1;
3580
25
            }
3581
43.0k
        }
3582
3583
        // fill end-of-vector values
3584
394k
        for (; j < v->n_fmt; ++j) {
3585
367k
            fmt_aux_t *z = &fmt[j];
3586
367k
            const int htype = z->y>>4&0xf;
3587
367k
            int l;
3588
3589
367k
            if (z->size == -1) // this field is to be ignored
3590
300k
                continue;
3591
3592
67.0k
            if (htype == BCF_HT_STR) {
3593
67.0k
                if (z->is_gt) {
3594
11.3k
                    int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
3595
11.3k
                    if (z->size) x[0] = bcf_int32_missing;
3596
33.7k
                    for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
3597
55.7k
                } else {
3598
55.7k
                    char *x = (char*)z->buf + z->size * (size_t)m;
3599
55.7k
                    if ( z->size ) {
3600
9.56k
                        x[0] = '.';
3601
9.56k
                        memset(&x[1], 0, (z->size-1) * sizeof(*x));
3602
9.56k
                    }
3603
55.7k
                }
3604
67.0k
            } else if (htype == BCF_HT_INT) {
3605
0
                int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
3606
0
                x[0] = bcf_int32_missing;
3607
0
                for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
3608
0
            } else if (htype == BCF_HT_REAL) {
3609
0
                float *x = (float*)(z->buf + z->size * (size_t)m);
3610
0
                bcf_float_set_missing(x[0]);
3611
0
                for (l = 1; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]);
3612
0
            }
3613
67.0k
        }
3614
3615
27.4k
        m++; t++;
3616
27.4k
    }
3617
3618
10.4k
    return 0;
3619
10.5k
}
3620
3621
// write individual genotype information
3622
static int vcf_parse_format_gt6(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3623
10.4k
                                const char *p, const char *q, fmt_aux_t *fmt) {
3624
10.4k
    kstring_t *str = &v->indiv;
3625
10.4k
    int i, need_downsize = 0;
3626
10.4k
    if (v->n_sample > 0) {
3627
89.1k
        for (i = 0; i < v->n_fmt; ++i) {
3628
78.7k
            fmt_aux_t *z = &fmt[i];
3629
78.7k
            if ( z->size==-1 ) {
3630
52.7k
                need_downsize = 1;
3631
52.7k
                continue;
3632
52.7k
            }
3633
25.9k
            bcf_enc_int1(str, z->key);
3634
25.9k
            if ((z->y>>4&0xf) == BCF_HT_STR && !z->is_gt) {
3635
20.8k
                bcf_enc_size(str, z->size, BCF_BT_CHAR);
3636
20.8k
                kputsn((char*)z->buf, z->size * (size_t)v->n_sample, str);
3637
20.8k
            } else if ((z->y>>4&0xf) == BCF_HT_INT || z->is_gt) {
3638
5.10k
                bcf_enc_vint(str, (z->size>>2) * v->n_sample, (int32_t*)z->buf, z->size>>2);
3639
5.10k
            } else {
3640
0
                bcf_enc_size(str, z->size>>2, BCF_BT_FLOAT);
3641
0
                if (serialize_float_array(str, (z->size>>2) * (size_t)v->n_sample,
3642
0
                                          (float *) z->buf) != 0) {
3643
0
                    v->errcode |= BCF_ERR_LIMITS;
3644
0
                    hts_log_error("Out of memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3645
0
                    return -1;
3646
0
                }
3647
0
            }
3648
25.9k
        }
3649
3650
10.4k
    }
3651
10.4k
    if ( need_downsize ) {
3652
3.85k
        i = 0;
3653
69.6k
        while ( i < v->n_fmt ) {
3654
65.8k
            if ( fmt[i].size==-1 )
3655
52.7k
            {
3656
52.7k
                v->n_fmt--;
3657
52.7k
                if ( i < v->n_fmt ) memmove(&fmt[i],&fmt[i+1],sizeof(*fmt)*(v->n_fmt-i));
3658
52.7k
            }
3659
13.1k
            else
3660
13.1k
                i++;
3661
65.8k
        }
3662
3.85k
    }
3663
10.4k
    return 0;
3664
10.4k
}
3665
3666
// validity checking
3667
10.4k
static int vcf_parse_format_check7(const bcf_hdr_t *h, bcf1_t *v) {
3668
10.4k
    if ( v->n_sample!=bcf_hdr_nsamples(h) )
3669
89
    {
3670
89
        hts_log_error("Number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)",
3671
89
            bcf_seqname_safe(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h));
3672
89
        v->errcode |= BCF_ERR_NCOLS;
3673
89
        return -1;
3674
89
    }
3675
10.3k
    if ( v->indiv.l > 0xffffffff )
3676
0
    {
3677
0
        hts_log_error("The FORMAT at %s:%"PRIhts_pos" is too long", bcf_seqname_safe(h,v), v->pos+1);
3678
0
        v->errcode |= BCF_ERR_LIMITS;
3679
3680
        // Error recovery: return -1 if this is a critical error or 0 if we want to ignore the FORMAT and proceed
3681
0
        v->n_fmt = 0;
3682
0
        return -1;
3683
0
    }
3684
3685
10.3k
    return 0;
3686
10.3k
}
3687
3688
// p,q is the start and the end of the FORMAT field
3689
static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3690
                            char *p, char *q)
3691
28.7k
{
3692
28.7k
    if ( !bcf_hdr_nsamples(h) ) return 0;
3693
10.9k
    kstring_t *mem = (kstring_t*)&h->mem;
3694
10.9k
    mem->l = 0;
3695
3696
10.9k
    fmt_aux_t fmt[MAX_N_FMT];
3697
3698
    // detect FORMAT "."
3699
10.9k
    int ret; // +ve = ok, -ve = err
3700
10.9k
    if ((ret = vcf_parse_format_empty1(s, h, v, p, q)))
3701
299
        return ret ? 0 : -1;
3702
3703
    // get format information from the dictionary
3704
10.6k
    if (vcf_parse_format_dict2(s, h, v, p, q, fmt) < 0)
3705
18
        return -1;
3706
3707
    // FORMAT data is per-sample A:B:C A:B:C A:B:C ... but in memory it is
3708
    // stored as per-type arrays AAA... BBB... CCC...  This is basically
3709
    // a data rotation or pivot.
3710
3711
    // The size of elements in the array grow to their maximum needed,
3712
    // permitting fast random access.  This means however we have to first
3713
    // scan the whole FORMAT line to find the maximum of each type, and
3714
    // then scan it again to find the store the data.
3715
    // We break this down into compute-max, allocate, fill-out-buffers
3716
3717
    // TODO: ?
3718
    // The alternative would be to pivot on the first pass, with fixed
3719
    // size entries for numerics and concatenated strings otherwise, also
3720
    // tracking maximum sizes.  Then on a second pass we reallocate and
3721
    // copy the data again to a uniformly sized array.  Two passes through
3722
    // memory, but without doubling string parsing.
3723
3724
    // compute max
3725
10.5k
    if (vcf_parse_format_max3(s, h, v, p, q, fmt) < 0)
3726
24
        return -1;
3727
3728
    // allocate memory for arrays
3729
10.5k
    if (vcf_parse_format_alloc4(s, h, v, p, q, fmt) < 0)
3730
0
        return -1;
3731
3732
    // fill the sample fields; at beginning of the loop
3733
10.5k
    if (vcf_parse_format_fill5(s, h, v, p, q, fmt) < 0)
3734
135
        return -1;
3735
3736
    // write individual genotype information
3737
10.4k
    if (vcf_parse_format_gt6(s, h, v, p, q, fmt) < 0)
3738
0
        return -1;
3739
3740
    // validity checking
3741
10.4k
    if (vcf_parse_format_check7(h, v) < 0)
3742
89
        return -1;
3743
3744
10.3k
    return 0;
3745
10.4k
}
3746
3747
5.05k
static khint_t fix_chromosome(const bcf_hdr_t *h, vdict_t *d, const char *p) {
3748
    // Simple error recovery for chromosomes not defined in the header. It will not help when VCF header has
3749
    // been already printed, but will enable tools like vcfcheck to proceed.
3750
3751
5.05k
    kstring_t tmp = {0,0,0};
3752
5.05k
    khint_t k;
3753
5.05k
    int l;
3754
5.05k
    if (ksprintf(&tmp, "##contig=<ID=%s>", p) < 0)
3755
0
        return kh_end(d);
3756
5.05k
    bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
3757
5.05k
    free(tmp.s);
3758
5.05k
    int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
3759
5.05k
    if (res < 0) bcf_hrec_destroy(hrec);
3760
5.05k
    if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
3761
5.05k
    k = kh_get(vdict, d, p);
3762
3763
5.05k
    return k;
3764
5.05k
}
3765
3766
30.9k
static int vcf_parse_filter(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) {
3767
30.9k
    int i, n_flt = 1, max_n_flt = 0;
3768
30.9k
    char *r, *t;
3769
30.9k
    int32_t *a_flt = NULL;
3770
30.9k
    ks_tokaux_t aux1;
3771
30.9k
    khint_t k;
3772
30.9k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
3773
    // count the number of filters
3774
30.9k
    if (*(q-1) == ';') *(q-1) = 0;
3775
341M
    for (r = p; *r; ++r)
3776
341M
        if (*r == ';') ++n_flt;
3777
30.9k
    if (n_flt > max_n_flt) {
3778
30.9k
        a_flt = hts_malloc_p(sizeof(*a_flt), n_flt);
3779
30.9k
        if (!a_flt) {
3780
0
            hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3781
0
            v->errcode |= BCF_ERR_LIMITS; // No appropriate code?
3782
0
            return -1;
3783
0
        }
3784
30.9k
        max_n_flt = n_flt;
3785
30.9k
    }
3786
    // add filters
3787
1.66M
    for (t = kstrtok(p, ";", &aux1), i = 0; t; t = kstrtok(0, 0, &aux1)) {
3788
1.63M
        *(char*)aux1.p = 0;
3789
1.63M
        k = kh_get(vdict, d, t);
3790
1.63M
        if (k == kh_end(d))
3791
41.0k
        {
3792
            // Simple error recovery for FILTERs not defined in the header. It will not help when VCF header has
3793
            // been already printed, but will enable tools like vcfcheck to proceed.
3794
41.0k
            hts_log_warning("FILTER '%s' at %s:%"PRIhts_pos" is not defined in the header",
3795
41.0k
                            t, bcf_seqname_safe(h,v), v->pos+1);
3796
41.0k
            if ((v->errcode & (BCF_ERR_TAG_UNDEF|BCF_ERR_CTG_UNDEF)) == 0)
3797
1.17k
                hts_log_warning("Missing headers may cause later processing to fail");
3798
41.0k
            kstring_t tmp = {0,0,0};
3799
41.0k
            int l;
3800
41.0k
            ksprintf(&tmp, "##FILTER=<ID=%s,Description=\"Dummy\">", t);
3801
41.0k
            bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
3802
41.0k
            free(tmp.s);
3803
41.0k
            int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
3804
41.0k
            if (res < 0) bcf_hrec_destroy(hrec);
3805
41.0k
            if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
3806
41.0k
            k = kh_get(vdict, d, t);
3807
41.0k
            v->errcode |= BCF_ERR_TAG_UNDEF;
3808
41.0k
            if (res || k == kh_end(d)) {
3809
58
                hts_log_error("Could not add dummy header for FILTER '%s' at %s:%"PRIhts_pos, t, bcf_seqname_safe(h,v), v->pos+1);
3810
58
                v->errcode |= BCF_ERR_TAG_INVALID;
3811
58
                free(a_flt);
3812
58
                return -1;
3813
58
            }
3814
41.0k
        }
3815
1.63M
        a_flt[i++] = kh_val(d, k).id;
3816
1.63M
    }
3817
3818
30.8k
    bcf_enc_vint(str, n_flt, a_flt, -1);
3819
30.8k
    free(a_flt);
3820
3821
30.8k
    return 0;
3822
30.9k
}
3823
3824
32.6k
static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) {
3825
32.6k
    static int extreme_int_warned = 0, negative_rlen_warned = 0;
3826
32.6k
    int max_n_val = 0, overflow = 0;
3827
32.6k
    char *r, *key;
3828
32.6k
    khint_t k;
3829
32.6k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
3830
32.6k
    int32_t *a_val = NULL;
3831
3832
32.6k
    v->n_info = 0;
3833
32.6k
    if (*(q-1) == ';') *(q-1) = 0;
3834
3.10M
    for (r = key = p;; ++r) {
3835
3.10M
        int c;
3836
3.10M
        char *val, *end;
3837
231M
        while (*r > '=' || (*r != ';' && *r != '=' && *r != 0)) r++;
3838
3.10M
        if (v->n_info == UINT16_MAX) {
3839
5
            hts_log_error("Too many INFO entries at %s:%"PRIhts_pos,
3840
5
                          bcf_seqname_safe(h,v), v->pos+1);
3841
5
            v->errcode |= BCF_ERR_LIMITS;
3842
5
            goto fail;
3843
5
        }
3844
3.10M
        val = end = NULL;
3845
3.10M
        c = *r; *r = 0;
3846
3.10M
        if (c == '=') {
3847
1.43M
            val = r + 1;
3848
3849
289M
            for (end = val; *end != ';' && *end != 0; ++end);
3850
1.43M
            c = *end; *end = 0;
3851
1.66M
        } else end = r;
3852
3.10M
        if ( !*key ) { if (c==0) break; r = end; key = r + 1; continue; }  // faulty VCF, ";;" in the INFO
3853
3.06M
        k = kh_get(vdict, d, key);
3854
3.06M
        if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_INFO] == 15)
3855
28.5k
        {
3856
28.5k
            hts_log_warning("INFO '%s' at %s:%"PRIhts_pos" is not defined in the header, assuming Type=String",
3857
28.5k
                            key, bcf_seqname_safe(h,v), v->pos+1);
3858
28.5k
            if ((v->errcode & (BCF_ERR_TAG_UNDEF|BCF_ERR_CTG_UNDEF)) == 0)
3859
748
                hts_log_warning("Missing headers may cause later processing to fail");
3860
28.5k
            kstring_t tmp = {0,0,0};
3861
28.5k
            int l;
3862
28.5k
            ksprintf(&tmp, "##INFO=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", key);
3863
28.5k
            bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
3864
28.5k
            free(tmp.s);
3865
28.5k
            int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
3866
28.5k
            if (res < 0) bcf_hrec_destroy(hrec);
3867
28.5k
            if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
3868
28.5k
            k = kh_get(vdict, d, key);
3869
28.5k
            v->errcode |= BCF_ERR_TAG_UNDEF;
3870
28.5k
            if (res || k == kh_end(d)) {
3871
74
                hts_log_error("Could not add dummy header for INFO '%s' at %s:%"PRIhts_pos, key, bcf_seqname_safe(h,v), v->pos+1);
3872
74
                v->errcode |= BCF_ERR_TAG_INVALID;
3873
74
                goto fail;
3874
74
            }
3875
28.5k
        }
3876
3.06M
        uint32_t y = kh_val(d, k).info[BCF_HL_INFO];
3877
3.06M
        ++v->n_info;
3878
3.06M
        bcf_enc_int1(str, kh_val(d, k).id);
3879
3.06M
        if (val == 0) {
3880
1.63M
            bcf_enc_size(str, 0, BCF_BT_NULL);
3881
1.63M
        } else if ((y>>4&0xf) == BCF_HT_FLAG || (y>>4&0xf) == BCF_HT_STR) { // if Flag has a value, treat it as a string
3882
119k
            bcf_enc_vchar(str, end - val, val);
3883
1.31M
        } else { // int/float value/array
3884
1.31M
            int i, n_val;
3885
1.31M
            char *t, *te;
3886
250M
            for (t = val, n_val = 1; *t; ++t) // count the number of values
3887
249M
                if (*t == ',') ++n_val;
3888
            // Check both int and float size in one step for simplicity
3889
1.31M
            if (n_val > max_n_val) {
3890
4.59k
                int32_t *a_tmp = hts_realloc_p(a_val, sizeof(*a_val), n_val);
3891
4.59k
                if (!a_tmp) {
3892
0
                    hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3893
0
                    v->errcode |= BCF_ERR_LIMITS; // No appropriate code?
3894
0
                    goto fail;
3895
0
                }
3896
4.59k
                a_val = a_tmp;
3897
4.59k
                max_n_val = n_val;
3898
4.59k
            }
3899
1.31M
            if ((y>>4&0xf) == BCF_HT_INT) {
3900
882k
                i = 0, t = val;
3901
882k
                int64_t val1;
3902
882k
                int is_int64 = 0;
3903
#ifdef VCF_ALLOW_INT64
3904
                if ( n_val==1 )
3905
                {
3906
                    overflow = 0;
3907
                    long long int tmp_val = hts_str2int(val, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
3908
                    if ( te==val ) tmp_val = bcf_int32_missing;
3909
                    else if ( overflow || tmp_val<BCF_MIN_BT_INT64 || tmp_val>BCF_MAX_BT_INT64 )
3910
                    {
3911
                        if ( !extreme_int_warned )
3912
                        {
3913
                            hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1);
3914
                            extreme_int_warned = 1;
3915
                        }
3916
                        tmp_val = bcf_int32_missing;
3917
                    }
3918
                    else
3919
                        is_int64 = 1;
3920
                    val1 = tmp_val;
3921
                    t = te;
3922
                    i = 1;  // this is just to avoid adding another nested block...
3923
                }
3924
#endif
3925
100M
                for (; i < n_val; ++i, ++t)
3926
99.3M
                {
3927
99.3M
                    overflow = 0;
3928
99.3M
                    long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
3929
99.3M
                    if ( te==t ) tmp_val = bcf_int32_missing;
3930
1.56M
                    else if ( overflow || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 )
3931
185k
                    {
3932
185k
                        if ( !extreme_int_warned )
3933
1
                        {
3934
1
                            hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1);
3935
1
                            extreme_int_warned = 1;
3936
1
                        }
3937
185k
                        tmp_val = bcf_int32_missing;
3938
185k
                    }
3939
99.3M
                    a_val[i] = tmp_val;
3940
131M
                    for (t = te; *t && *t != ','; t++);
3941
99.3M
                }
3942
882k
                if (n_val == 1) {
3943
#ifdef VCF_ALLOW_INT64
3944
                    if ( is_int64 )
3945
                    {
3946
                        v->unpacked |= BCF_IS_64BIT;
3947
                        bcf_enc_long1(str, val1);
3948
                    }
3949
                    else
3950
                        bcf_enc_int1(str, (int32_t)val1);
3951
#else
3952
707k
                    val1 = a_val[0];
3953
707k
                    bcf_enc_int1(str, (int32_t)val1);
3954
707k
#endif
3955
707k
                } else {
3956
175k
                    bcf_enc_vint(str, n_val, a_val, -1);
3957
175k
                }
3958
882k
                if (n_val==1 && (val1!=bcf_int32_missing || is_int64)
3959
588k
                    && memcmp(key, "END", 4) == 0)
3960
0
                {
3961
0
                    if ( val1 <= v->pos )
3962
0
                    {
3963
0
                        if ( !negative_rlen_warned )
3964
0
                        {
3965
0
                            hts_log_warning("INFO/END=%"PRIhts_pos" is smaller than POS at %s:%"PRIhts_pos,val1,bcf_seqname_safe(h,v),v->pos+1);
3966
0
                            negative_rlen_warned = 1;
3967
0
                        }
3968
0
                    }
3969
0
                }
3970
882k
            } else if ((y>>4&0xf) == BCF_HT_REAL) {
3971
432k
                float *val_f = (float *)a_val;
3972
81.5M
                for (i = 0, t = val; i < n_val; ++i, ++t)
3973
81.0M
                {
3974
81.0M
                    overflow = 0;
3975
81.0M
                    val_f[i] = hts_str2dbl(t, &te, &overflow);
3976
81.0M
                    if ( te==t || overflow ) // conversion failed
3977
80.0M
                        bcf_float_set_missing(val_f[i]);
3978
107M
                    for (t = te; *t && *t != ','; t++);
3979
81.0M
                }
3980
432k
                bcf_enc_vfloat(str, n_val, val_f);
3981
432k
            }
3982
1.31M
        }
3983
3.06M
        if (c == 0) break;
3984
3.05M
        r = end;
3985
3.05M
        key = r + 1;
3986
3.05M
    }
3987
3988
32.6k
    free(a_val);
3989
32.6k
    return 0;
3990
3991
79
 fail:
3992
79
    free(a_val);
3993
79
    return -1;
3994
32.6k
}
3995
3996
int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
3997
33.9k
{
3998
33.9k
    int ret = -2, overflow = 0;
3999
33.9k
    char *p, *q, *r, *t;
4000
33.9k
    kstring_t *str;
4001
33.9k
    khint_t k;
4002
33.9k
    ks_tokaux_t aux;
4003
4004
//#define NOT_DOT(p) strcmp((p), ".")
4005
//#define NOT_DOT(p) (!(*p == '.' && !p[1]))
4006
//#define NOT_DOT(p) ((*p) != '.' || (p)[1])
4007
//#define NOT_DOT(p) (q-p != 1 || memcmp(p, ".\0", 2))
4008
166k
#define NOT_DOT(p) (memcmp(p, ".\0", 2))
4009
4010
33.9k
    if (!s || !h || !v || !(s->s))
4011
0
        return ret;
4012
4013
    // Assumed in lots of places, but we may as well spot this early
4014
33.9k
    assert(sizeof(float) == sizeof(int32_t));
4015
4016
    // Ensure string we parse has space to permit some over-flow when during
4017
    // parsing.  Eg to do memcmp(key, "END", 4) in vcf_parse_info over
4018
    // the more straight forward looking strcmp, giving a speed advantage.
4019
33.9k
    if (ks_resize(s, s->l+4) < 0)
4020
0
        return -2;
4021
4022
    // Force our memory to be initialised so we avoid the technicality of
4023
    // undefined behaviour in using a 4-byte memcmp.  (The reality is this
4024
    // almost certainly is never detected by the compiler so has no impact,
4025
    // but equally so this code has minimal (often beneficial) impact on
4026
    // performance too.)
4027
33.9k
    s->s[s->l+0] = 0;
4028
33.9k
    s->s[s->l+1] = 0;
4029
33.9k
    s->s[s->l+2] = 0;
4030
33.9k
    s->s[s->l+3] = 0;
4031
4032
33.9k
    bcf_clear1(v);
4033
33.9k
    str = &v->shared;
4034
33.9k
    memset(&aux, 0, sizeof(ks_tokaux_t));
4035
4036
    // CHROM
4037
33.9k
    if (!(p = kstrtok(s->s, "\t", &aux)))
4038
0
        goto err;
4039
33.9k
    *(q = (char*)aux.p) = 0;
4040
4041
33.9k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
4042
33.9k
    k = kh_get(vdict, d, p);
4043
33.9k
    if (k == kh_end(d)) {
4044
5.05k
        hts_log_warning("Contig '%s' is not defined in the header. (Quick workaround: index the file with tabix.)", p);
4045
5.05k
            if ((v->errcode & (BCF_ERR_TAG_UNDEF|BCF_ERR_CTG_UNDEF)) == 0)
4046
5.05k
                hts_log_warning("Missing headers may cause later processing to fail");
4047
5.05k
        v->errcode = BCF_ERR_CTG_UNDEF;
4048
5.05k
        if ((k = fix_chromosome(h, d, p)) == kh_end(d)) {
4049
63
            hts_log_error("Could not add dummy header for contig '%s'", p);
4050
63
            v->errcode |= BCF_ERR_CTG_INVALID;
4051
63
            goto err;
4052
63
        }
4053
5.05k
    }
4054
33.8k
    v->rid = kh_val(d, k).id;
4055
4056
    // POS
4057
33.8k
    if (!(p = kstrtok(0, 0, &aux)))
4058
219
        goto err;
4059
33.6k
    *(q = (char*)aux.p) = 0;
4060
4061
33.6k
    overflow = 0;
4062
33.6k
    char *tmp = p;
4063
33.6k
    v->pos = hts_str2uint(p, &p, 62, &overflow);
4064
33.6k
    if (overflow) {
4065
37
        hts_log_error("Position value '%s' is too large", tmp);
4066
37
        goto err;
4067
33.6k
    } else if ( *p ) {
4068
72
        hts_log_error("Could not parse the position '%s'", tmp);
4069
72
        goto err;
4070
33.5k
    } else {
4071
33.5k
        v->pos -= 1;
4072
33.5k
    }
4073
33.5k
    if (v->pos >= INT32_MAX)
4074
794
        v->unpacked |= BCF_IS_64BIT;
4075
4076
    // ID
4077
33.5k
    if (!(p = kstrtok(0, 0, &aux)))
4078
49
        goto err;
4079
33.5k
    *(q = (char*)aux.p) = 0;
4080
4081
33.5k
    if (NOT_DOT(p)) bcf_enc_vchar(str, q - p, p);
4082
81
    else bcf_enc_size(str, 0, BCF_BT_CHAR);
4083
4084
    // REF
4085
33.5k
    if (!(p = kstrtok(0, 0, &aux)))
4086
44
        goto err;
4087
33.4k
    *(q = (char*)aux.p) = 0;
4088
4089
33.4k
    bcf_enc_vchar(str, q - p, p);
4090
33.4k
    v->n_allele = 1, v->rlen = q - p;
4091
4092
    // ALT
4093
33.4k
    if (!(p = kstrtok(0, 0, &aux)))
4094
22
        goto err;
4095
33.4k
    *(q = (char*)aux.p) = 0;
4096
4097
33.4k
    if (NOT_DOT(p)) {
4098
67.2M
        for (r = t = p;; ++r) {
4099
67.2M
            if (*r == ',' || *r == 0) {
4100
2.59M
                if (v->n_allele == UINT16_MAX) {
4101
4
                    hts_log_error("Too many ALT alleles at %s:%"PRIhts_pos,
4102
4
                                  bcf_seqname_safe(h,v), v->pos+1);
4103
4
                    v->errcode |= BCF_ERR_LIMITS;
4104
4
                    goto err;
4105
4
                }
4106
2.59M
                bcf_enc_vchar(str, r - t, t);
4107
2.59M
                t = r + 1;
4108
2.59M
                ++v->n_allele;
4109
2.59M
            }
4110
67.2M
            if (r == q) break;
4111
67.2M
        }
4112
32.9k
    }
4113
4114
    // QUAL
4115
33.4k
    if (!(p = kstrtok(0, 0, &aux)))
4116
70
        goto err;
4117
33.3k
    *(q = (char*)aux.p) = 0;
4118
4119
33.3k
    if (NOT_DOT(p)) v->qual = atof(p);
4120
591
    else bcf_float_set_missing(v->qual);
4121
33.3k
    if ( v->max_unpack && !(v->max_unpack>>1) ) goto end; // BCF_UN_STR
4122
4123
    // FILTER
4124
33.3k
    if (!(p = kstrtok(0, 0, &aux)))
4125
62
        goto err;
4126
33.3k
    *(q = (char*)aux.p) = 0;
4127
4128
33.3k
    if (NOT_DOT(p)) {
4129
30.9k
        if (vcf_parse_filter(str, h, v, p, q)) {
4130
58
            goto err;
4131
58
        }
4132
30.9k
    } else bcf_enc_vint(str, 0, 0, -1);
4133
33.2k
    if ( v->max_unpack && !(v->max_unpack>>2) ) goto end; // BCF_UN_FLT
4134
4135
    // INFO
4136
33.2k
    if (!(p = kstrtok(0, 0, &aux)))
4137
94
        goto err;
4138
33.1k
    *(q = (char*)aux.p) = 0;
4139
4140
33.1k
    if (NOT_DOT(p)) {
4141
32.6k
        if (vcf_parse_info(str, h, v, p, q)) {
4142
79
            goto err;
4143
79
        }
4144
32.6k
    }
4145
33.0k
    if ( v->max_unpack && !(v->max_unpack>>3) ) goto end;
4146
4147
    // FORMAT; optional
4148
33.0k
    p = kstrtok(0, 0, &aux);
4149
33.0k
    if (p) {
4150
28.7k
        *(q = (char*)aux.p) = 0;
4151
4152
28.7k
        if (vcf_parse_format(s, h, v, p, q)) {
4153
266
            goto err;
4154
266
        }
4155
28.7k
    }
4156
4157
32.8k
 end:
4158
32.8k
    v->rlen = get_rlen(h, v);    //set rlen based on version
4159
32.8k
    ret = 0;
4160
4161
33.9k
 err:
4162
33.9k
    return ret;
4163
32.8k
}
4164
4165
int vcf_open_mode(char *mode, const char *fn, const char *format)
4166
0
{
4167
0
    if (format == NULL) {
4168
        // Try to pick a format based on the filename extension
4169
0
        char extension[HTS_MAX_EXT_LEN];
4170
0
        if (find_file_extension(fn, extension) < 0) return -1;
4171
0
        return vcf_open_mode(mode, fn, extension);
4172
0
    }
4173
0
    else if (strcasecmp(format, "bcf") == 0) strcpy(mode, "b");
4174
0
    else if (strcasecmp(format, "vcf") == 0) strcpy(mode, "");
4175
0
    else if (strcasecmp(format, "vcf.gz") == 0 || strcasecmp(format, "vcf.bgz") == 0) strcpy(mode, "z");
4176
0
    else return -1;
4177
4178
0
    return 0;
4179
0
}
4180
4181
int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
4182
35.0k
{
4183
35.0k
    int ret;
4184
35.0k
    ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
4185
35.0k
    if (ret < 0) return ret;
4186
33.9k
    return vcf_parse1(&fp->line, h, v);
4187
35.0k
}
4188
4189
static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt)
4190
0
{
4191
0
    uint8_t *ptr_start = ptr;
4192
0
    fmt->id = bcf_dec_typed_int1(ptr, &ptr);
4193
0
    fmt->n = bcf_dec_size(ptr, &ptr, &fmt->type);
4194
0
    fmt->size = fmt->n << bcf_type_shift[fmt->type];
4195
0
    fmt->p = ptr;
4196
0
    fmt->p_off  = ptr - ptr_start;
4197
0
    fmt->p_free = 0;
4198
0
    ptr += n_sample * fmt->size;
4199
0
    fmt->p_len = ptr - fmt->p;
4200
0
    return ptr;
4201
0
}
4202
4203
static inline uint8_t *bcf_unpack_info_core1(uint8_t *ptr, bcf_info_t *info)
4204
1.97k
{
4205
1.97k
    uint8_t *ptr_start = ptr;
4206
1.97k
    int64_t len = 0;
4207
1.97k
    info->key = bcf_dec_typed_int1(ptr, &ptr);
4208
1.97k
    len = info->len = bcf_dec_size(ptr, &ptr, &info->type);
4209
1.97k
    info->vptr = ptr;
4210
1.97k
    info->vptr_off  = ptr - ptr_start;
4211
1.97k
    info->vptr_free = 0;
4212
1.97k
    info->v1.i = 0;
4213
1.97k
    if (info->len == 1) {
4214
920
        switch(info->type) {
4215
0
        case BCF_BT_INT8:
4216
920
        case BCF_BT_CHAR:
4217
920
            info->v1.i = *(int8_t*)ptr;
4218
920
            break;
4219
0
        case BCF_BT_INT16:
4220
0
            info->v1.i = le_to_i16(ptr);
4221
0
            len <<= 1;
4222
0
            break;
4223
0
        case BCF_BT_INT32:
4224
0
            info->v1.i = le_to_i32(ptr);
4225
0
            len <<= 2;
4226
0
            break;
4227
0
        case BCF_BT_FLOAT:
4228
0
            info->v1.f = le_to_float(ptr);
4229
0
            len <<= 2;
4230
0
            break;
4231
0
        case BCF_BT_INT64:
4232
0
            info->v1.i = le_to_i64(ptr);
4233
0
            len <<= 3;
4234
0
            break;
4235
920
        }
4236
1.05k
    } else {
4237
1.05k
        len <<= bcf_type_shift[info->type];
4238
1.05k
    }
4239
1.97k
    ptr += len;
4240
4241
1.97k
    info->vptr_len = ptr - info->vptr;
4242
1.97k
    return ptr;
4243
1.97k
}
4244
4245
int bcf_unpack(bcf1_t *b, int which)
4246
30.8k
{
4247
30.8k
    if ( !b->shared.l ) return 0; // Building a new BCF record from scratch
4248
30.8k
    uint8_t *ptr = (uint8_t*)b->shared.s, *ptr_ori;
4249
30.8k
    int i;
4250
30.8k
    bcf_dec_t *d = &b->d;
4251
30.8k
    if (which & BCF_UN_FLT) which |= BCF_UN_STR;
4252
30.8k
    if (which & BCF_UN_INFO) which |= BCF_UN_SHR;
4253
30.8k
    if ((which&BCF_UN_STR) && !(b->unpacked&BCF_UN_STR))
4254
30.8k
    {
4255
30.8k
        kstring_t tmp;
4256
4257
        // ID
4258
30.8k
        tmp.l = 0; tmp.s = d->id; tmp.m = d->m_id;
4259
30.8k
        ptr_ori = ptr;
4260
30.8k
        ptr = bcf_fmt_sized_array(&tmp, ptr);
4261
30.8k
        b->unpack_size[0] = ptr - ptr_ori;
4262
30.8k
        kputc_('\0', &tmp);
4263
30.8k
        d->id = tmp.s; d->m_id = tmp.m;
4264
4265
        // REF and ALT are in a single block (d->als) and d->alleles are pointers into this block
4266
30.8k
        hts_expand(char*, b->n_allele, d->m_allele, d->allele); // NM: hts_expand() is a macro
4267
30.8k
        tmp.l = 0; tmp.s = d->als; tmp.m = d->m_als;
4268
30.8k
        ptr_ori = ptr;
4269
1.56M
        for (i = 0; i < b->n_allele; ++i) {
4270
            // Use offset within tmp.s as realloc may change pointer
4271
1.53M
            d->allele[i] = (char *)(intptr_t)tmp.l;
4272
1.53M
            ptr = bcf_fmt_sized_array(&tmp, ptr);
4273
1.53M
            kputc_('\0', &tmp);
4274
1.53M
        }
4275
30.8k
        b->unpack_size[1] = ptr - ptr_ori;
4276
30.8k
        d->als = tmp.s; d->m_als = tmp.m;
4277
4278
        // Convert our offsets within tmp.s back to pointers again
4279
1.56M
        for (i = 0; i < b->n_allele; ++i)
4280
1.53M
            d->allele[i] = d->als + (ptrdiff_t)d->allele[i];
4281
30.8k
        b->unpacked |= BCF_UN_STR;
4282
30.8k
    }
4283
30.8k
    if ((which&BCF_UN_FLT) && !(b->unpacked&BCF_UN_FLT)) { // FILTER
4284
30.8k
        ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1];
4285
30.8k
        ptr_ori = ptr;
4286
30.8k
        if (*ptr>>4) {
4287
28.5k
            int type;
4288
28.5k
            d->n_flt = bcf_dec_size(ptr, &ptr, &type);
4289
28.5k
            hts_expand(int, d->n_flt, d->m_flt, d->flt);
4290
366k
            for (i = 0; i < d->n_flt; ++i)
4291
338k
                d->flt[i] = bcf_dec_int1(ptr, type, &ptr);
4292
28.5k
        } else ++ptr, d->n_flt = 0;
4293
30.8k
        b->unpack_size[2] = ptr - ptr_ori;
4294
30.8k
        b->unpacked |= BCF_UN_FLT;
4295
30.8k
    }
4296
30.8k
    if ((which&BCF_UN_INFO) && !(b->unpacked&BCF_UN_INFO)) { // INFO
4297
0
        ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1] + b->unpack_size[2];
4298
0
        hts_expand(bcf_info_t, b->n_info, d->m_info, d->info);
4299
0
        for (i = 0; i < d->m_info; ++i) d->info[i].vptr_free = 0;
4300
0
        for (i = 0; i < b->n_info; ++i)
4301
0
            ptr = bcf_unpack_info_core1(ptr, &d->info[i]);
4302
0
        b->unpacked |= BCF_UN_INFO;
4303
0
    }
4304
30.8k
    if ((which&BCF_UN_FMT) && b->n_sample && !(b->unpacked&BCF_UN_FMT)) { // FORMAT
4305
0
        ptr = (uint8_t*)b->indiv.s;
4306
0
        hts_expand(bcf_fmt_t, b->n_fmt, d->m_fmt, d->fmt);
4307
0
        for (i = 0; i < d->m_fmt; ++i) d->fmt[i].p_free = 0;
4308
0
        for (i = 0; i < b->n_fmt; ++i)
4309
0
            ptr = bcf_unpack_fmt_core1(ptr, b->n_sample, &d->fmt[i]);
4310
0
        b->unpacked |= BCF_UN_FMT;
4311
0
    }
4312
30.8k
    return 0;
4313
30.8k
}
4314
4315
int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
4316
30.8k
{
4317
30.8k
    int i;
4318
30.8k
    int32_t max_dt_id = h->n[BCF_DT_ID];
4319
30.8k
    const char *chrom = bcf_seqname(h, v);
4320
30.8k
    if (!chrom) {
4321
0
        hts_log_error("Invalid BCF, CONTIG id=%d not present in the header",
4322
0
                      v->rid);
4323
0
        errno = EINVAL;
4324
0
        return -1;
4325
0
    }
4326
4327
30.8k
    bcf_unpack((bcf1_t*)v, BCF_UN_ALL & ~(BCF_UN_INFO|BCF_UN_FMT));
4328
4329
    // Cache of key lengths so we don't keep repeatedly using them.
4330
    // This assumes we're not modifying the header between successive calls
4331
    // to vcf_format, but that would lead to many other forms of breakage
4332
    // so it feels like a valid assumption to make.
4333
    //
4334
    // We cannot just do this in bcf_hdr_sync as some code (eg bcftools
4335
    // annotate) manipulates the headers directly without calling sync to
4336
    // refresh the data structures.  So we must do just-in-time length
4337
    // calculation during writes instead.
4338
30.8k
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
4339
30.8k
    if (!aux->key_len) {
4340
4.58k
        if (!(aux->key_len = calloc(h->n[BCF_DT_ID]+1, sizeof(*aux->key_len))))
4341
0
            return -1;
4342
4.58k
    }
4343
30.8k
    size_t *key_len = aux->key_len;
4344
4345
30.8k
    kputs(chrom, s); // CHROM
4346
30.8k
    kputc_('\t', s); kputll(v->pos + 1, s); // POS
4347
30.8k
    kputc_('\t', s); kputs(v->d.id ? v->d.id : ".", s); // ID
4348
30.8k
    kputc_('\t', s); // REF
4349
30.8k
    if (v->n_allele > 0) kputs(v->d.allele[0], s);
4350
0
    else kputc_('.', s);
4351
30.8k
    kputc_('\t', s); // ALT
4352
30.8k
    if (v->n_allele > 1) {
4353
1.53M
        for (i = 1; i < v->n_allele; ++i) {
4354
1.50M
            if (i > 1) kputc_(',', s);
4355
1.50M
            kputs(v->d.allele[i], s);
4356
1.50M
        }
4357
30.4k
    } else kputc_('.', s);
4358
30.8k
    kputc_('\t', s); // QUAL
4359
30.8k
    if ( bcf_float_is_missing(v->qual) ) kputc_('.', s); // QUAL
4360
30.3k
    else kputd(v->qual, s);
4361
30.8k
    kputc_('\t', s); // FILTER
4362
30.8k
    if (v->d.n_flt) {
4363
366k
        for (i = 0; i < v->d.n_flt; ++i) {
4364
338k
            int32_t idx = v->d.flt[i];
4365
338k
            if (idx < 0 || idx >= max_dt_id
4366
338k
                || h->id[BCF_DT_ID][idx].key == NULL) {
4367
0
                hts_log_error("Invalid BCF, the FILTER tag id=%d at %s:%"PRIhts_pos" not present in the header",
4368
0
                              idx, bcf_seqname_safe(h, v), v->pos + 1);
4369
0
                errno = EINVAL;
4370
0
                return -1;
4371
0
            }
4372
338k
            if (i) kputc_(';', s);
4373
338k
            if (!key_len[idx])
4374
49.3k
                key_len[idx] = strlen(h->id[BCF_DT_ID][idx].key);
4375
338k
            kputsn(h->id[BCF_DT_ID][idx].key, key_len[idx], s);
4376
338k
        }
4377
28.5k
    } else kputc_('.', s);
4378
4379
30.8k
    kputc_('\t', s); // INFO
4380
30.8k
    if (v->n_info) {
4381
16.5k
        uint8_t *ptr = v->shared.s
4382
16.5k
            ? (uint8_t *)v->shared.s + v->unpack_size[0] +
4383
16.5k
               v->unpack_size[1] + v->unpack_size[2]
4384
16.5k
            : NULL;
4385
16.5k
        int first = 1;
4386
16.5k
        bcf_info_t *info = v->d.info;
4387
4388
        // Note if we duplicate this code into custom packed and unpacked
4389
        // implementations then we gain a bit more speed, particularly with
4390
        // clang 13 (up to 5%).  Not sure why this is, but code duplication
4391
        // isn't pleasant and it's still faster adding packed support than
4392
        // not so it's a win, just not as good as it should be.
4393
16.5k
        const int info_packed = !(v->unpacked & BCF_UN_INFO) && v->shared.l;
4394
1.50M
        for (i = 0; i < v->n_info; ++i) {
4395
1.48M
            bcf_info_t in, *z;
4396
1.48M
            if (info_packed) {
4397
                // Use a local bcf_info_t when data is packed
4398
1.48M
                z = &in;
4399
1.48M
                z->key  = bcf_dec_typed_int1(ptr, &ptr);
4400
1.48M
                z->len  = bcf_dec_size(ptr, &ptr, &z->type);
4401
1.48M
                z->vptr = ptr;
4402
1.48M
                ptr += z->len << bcf_type_shift[z->type];
4403
1.48M
            } else {
4404
                // Else previously unpacked INFO struct
4405
0
                z = &info[i];
4406
4407
                // Also potentially since deleted
4408
0
                if ( !z->vptr ) continue;
4409
0
            }
4410
4411
1.48M
            bcf_idpair_t *id = z->key >= 0 && z->key < max_dt_id
4412
1.48M
                ? &h->id[BCF_DT_ID][z->key]
4413
1.48M
                : NULL;
4414
4415
1.48M
            if (!id || !id->key) {
4416
0
                hts_log_error("Invalid BCF, the INFO tag id=%d is %s at %s:%"PRIhts_pos,
4417
0
                              z->key,
4418
0
                              z->key < 0 ? "negative"
4419
0
                              : (z->key >= max_dt_id ? "too large" : "not present in the header"),
4420
0
                              bcf_seqname_safe(h, v), v->pos+1);
4421
0
                errno = EINVAL;
4422
0
                return -1;
4423
0
            }
4424
4425
            // KEY
4426
1.48M
            if (!key_len[z->key])
4427
20.2k
                key_len[z->key] = strlen(id->key);
4428
1.48M
            size_t id_len = key_len[z->key];
4429
1.48M
            if (ks_resize(s, s->l + 3 + id_len) < 0)
4430
0
                return -1;
4431
1.48M
            char *sptr = s->s + s->l;
4432
1.48M
            if ( !first ) {
4433
1.47M
                *sptr++ = ';';
4434
1.47M
                s->l++;
4435
1.47M
            }
4436
1.48M
            first = 0;
4437
1.48M
            memcpy(sptr, id->key, id_len);
4438
1.48M
            s->l += id_len;
4439
4440
            // VALUE
4441
1.48M
            if (z->len <= 0) continue;
4442
702k
            sptr[id_len] = '=';
4443
702k
            s->l++;
4444
4445
702k
            if (z->len != 1 || info_packed) {
4446
702k
                bcf_fmt_array(s, z->len, z->type, z->vptr);
4447
702k
            } else {
4448
                // Single length vectors are unpacked into their
4449
                // own info.v1 union and handled separately.
4450
0
                if (z->type == BCF_BT_FLOAT) {
4451
0
                    if ( bcf_float_is_missing(z->v1.f) )
4452
0
                        kputc_('.', s);
4453
0
                    else
4454
0
                        kputd(z->v1.f, s);
4455
0
                } else if (z->type == BCF_BT_CHAR) {
4456
0
                    kputc_(z->v1.i, s);
4457
0
                } else if (z->type < BCF_BT_INT64) {
4458
0
                    int64_t missing[] = {
4459
0
                        0, // BCF_BT_NULL
4460
0
                        bcf_int8_missing,
4461
0
                        bcf_int16_missing,
4462
0
                        bcf_int32_missing,
4463
0
                    };
4464
0
                    if (z->v1.i == missing[z->type])
4465
0
                        kputc_('.', s);
4466
0
                    else
4467
0
                        kputw(z->v1.i, s);
4468
0
                } else if (z->type == BCF_BT_INT64) {
4469
0
                    if (z->v1.i == bcf_int64_missing)
4470
0
                        kputc_('.', s);
4471
0
                    else
4472
0
                        kputll(z->v1.i, s);
4473
0
                } else {
4474
0
                    hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, z->type, bcf_seqname_safe(h, v), v->pos+1);
4475
0
                    errno = EINVAL;
4476
0
                    return -1;
4477
0
                }
4478
0
            }
4479
702k
        }
4480
16.5k
        if ( first ) kputc_('.', s);
4481
16.5k
    } else kputc_('.', s);
4482
4483
    // FORMAT and individual information
4484
30.8k
    if (v->n_sample) {
4485
9.99k
        int i,j;
4486
9.99k
        if ( v->n_fmt) {
4487
9.77k
            uint8_t *ptr = (uint8_t *)v->indiv.s;
4488
9.77k
            int gt_i = -1;
4489
9.77k
            bcf_fmt_t *fmt = v->d.fmt;
4490
9.77k
            int first = 1, ret = 0;
4491
9.77k
            int fmt_packed = !(v->unpacked & BCF_UN_FMT);
4492
4493
9.77k
            if (fmt_packed) {
4494
                // Local fmt as we have an array of num FORMAT keys,
4495
                // each of which points to N.Sample values.
4496
4497
                // No real gain to be had in handling unpacked data here,
4498
                // but it doesn't cost us much in complexity either and
4499
                // it gives us flexibility.
4500
9.77k
                fmt = hts_malloc_p(sizeof(*fmt), v->n_fmt);
4501
9.77k
                if (!fmt)
4502
0
                    return -1;
4503
9.77k
            }
4504
4505
            // KEYS
4506
33.2k
            for (i = 0; i < (int)v->n_fmt; ++i) {
4507
23.4k
                bcf_fmt_t *z;
4508
23.4k
                z = &fmt[i];
4509
23.4k
                if (fmt_packed) {
4510
23.4k
                    z->id   = bcf_dec_typed_int1(ptr, &ptr);
4511
23.4k
                    z->n    = bcf_dec_size(ptr, &ptr, &z->type);
4512
23.4k
                    z->p    = ptr;
4513
23.4k
                    z->size = z->n << bcf_type_shift[z->type];
4514
23.4k
                    ptr += v->n_sample * z->size;
4515
23.4k
                }
4516
23.4k
                if ( !z->p ) continue;
4517
23.4k
                kputc_(!first ? ':' : '\t', s); first = 0;
4518
4519
23.4k
                bcf_idpair_t *id = z->id >= 0 && z->id < max_dt_id
4520
23.4k
                    ? &h->id[BCF_DT_ID][z->id]
4521
23.4k
                    : NULL;
4522
4523
23.4k
                if (!id || !id->key) {
4524
0
                    hts_log_error("Invalid BCF, the FORMAT tag id=%d at %s:%"PRIhts_pos" not present in the header", z->id, bcf_seqname_safe(h, v), v->pos+1);
4525
0
                    errno = EINVAL;
4526
0
                    if (fmt_packed)
4527
0
                        free(fmt);
4528
0
                    return -1;
4529
0
                }
4530
4531
23.4k
                if (!key_len[z->id])
4532
11.4k
                    key_len[z->id] = strlen(id->key);
4533
23.4k
                size_t id_len = key_len[z->id];
4534
23.4k
                kputsn(id->key, id_len, s);
4535
23.4k
                if (id_len == 2 && id->key[0] == 'G' && id->key[1] == 'T')
4536
4.77k
                    gt_i = i;
4537
23.4k
            }
4538
9.77k
            if ( first ) kputsn("\t.", 2, s);
4539
4540
            // VALUES per sample
4541
33.7k
            for (j = 0; j < v->n_sample; ++j) {
4542
23.9k
                kputc_('\t', s);
4543
23.9k
                first = 1;
4544
23.9k
                bcf_fmt_t *f = fmt;
4545
66.3k
                for (i = 0; i < (int)v->n_fmt; i++, f++) {
4546
58.4k
                    if ( !f->p ) continue;
4547
58.4k
                    if (!first) kputc_(':', s);
4548
58.4k
                    first = 0;
4549
58.4k
                    if (gt_i == i) {
4550
16.0k
                        if ((ret = bcf_format_gt_v2(h, f,j,s)) < 0) {
4551
0
                            hts_log_error("Failed to format GT value for sample %d, returned %d", i, ret);
4552
0
                            errno = EINVAL;
4553
0
                            if (fmt_packed)
4554
0
                                free(fmt);
4555
0
                            return -1;
4556
0
                        }
4557
16.0k
                        break;
4558
16.0k
                    }
4559
42.4k
                    else if (f->n == 1)
4560
4.49k
                        bcf_fmt_array1(s, f->type, f->p + j * (size_t)f->size);
4561
37.9k
                    else
4562
37.9k
                        bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size);
4563
58.4k
                }
4564
4565
                // Simpler loop post GT and at least 1 iteration
4566
46.0k
                for (i++, f++; i < (int)v->n_fmt; i++, f++) {
4567
22.1k
                    if ( !f->p ) continue;
4568
22.1k
                    kputc_(':', s);
4569
22.1k
                    if (f->n == 1)
4570
1.38k
                        bcf_fmt_array1(s, f->type, f->p + j * (size_t)f->size);
4571
20.7k
                    else
4572
20.7k
                        bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size);
4573
22.1k
                }
4574
23.9k
                if ( first ) kputc_('.', s);
4575
23.9k
            }
4576
9.77k
            if (fmt_packed)
4577
9.77k
                free(fmt);
4578
9.77k
        }
4579
221
        else
4580
1.06k
            for (j=0; j<=v->n_sample; j++)
4581
839
                kputsn("\t.", 2, s);
4582
9.99k
    }
4583
30.8k
    kputc('\n', s);
4584
30.8k
    return 0;
4585
30.8k
}
4586
4587
int vcf_write_line(htsFile *fp, kstring_t *line)
4588
0
{
4589
0
    int ret;
4590
0
    if ( line->s[line->l-1]!='\n' ) kputc('\n',line);
4591
0
    if ( fp->format.compression!=no_compression )
4592
0
        ret = bgzf_write(fp->fp.bgzf, line->s, line->l);
4593
0
    else
4594
0
        ret = hwrite(fp->fp.hfile, line->s, line->l);
4595
0
    return ret==line->l ? 0 : -1;
4596
0
}
4597
4598
int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
4599
30.8k
{
4600
30.8k
    ssize_t ret;
4601
30.8k
    fp->line.l = 0;
4602
30.8k
    if (vcf_format1(h, v, &fp->line) != 0)
4603
0
        return -1;
4604
30.8k
    if ( fp->format.compression!=no_compression ) {
4605
0
        if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
4606
0
            return -1;
4607
0
        if (fp->idx && !fp->fp.bgzf->mt)
4608
0
            hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf));
4609
0
        ret = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
4610
30.8k
    } else {
4611
30.8k
        ret = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
4612
30.8k
    }
4613
4614
30.8k
    if (fp->idx && fp->format.compression == bgzf) {
4615
0
        int tid;
4616
0
        if ((tid = hts_idx_tbi_name(fp->idx, v->rid, bcf_seqname_safe(h, v))) < 0)
4617
0
            return -1;
4618
4619
0
        if (bgzf_idx_push(fp->fp.bgzf, fp->idx,
4620
0
                          tid, v->pos, v->pos + v->rlen,
4621
0
                          bgzf_tell(fp->fp.bgzf), 1) < 0)
4622
0
            return -1;
4623
0
    }
4624
4625
30.8k
    return ret==fp->line.l ? 0 : -1;
4626
30.8k
}
4627
4628
/************************
4629
 * Data access routines *
4630
 ************************/
4631
4632
int bcf_hdr_id2int(const bcf_hdr_t *h, int which, const char *id)
4633
260k
{
4634
260k
    khint_t k;
4635
260k
    vdict_t *d = (vdict_t*)h->dict[which];
4636
260k
    k = kh_get(vdict, d, id);
4637
260k
    return k == kh_end(d)? -1 : kh_val(d, k).id;
4638
260k
}
4639
4640
4641
/********************
4642
 *** BCF indexing ***
4643
 ********************/
4644
4645
// Calculate number of index levels given min_shift and the header contig
4646
// list.  Also returns number of contigs in *nids_out.
4647
static int idx_calc_n_lvls_ids(const bcf_hdr_t *h, int *min_shift_in_out,
4648
                               int starting_n_lvls, int *nids_out)
4649
0
{
4650
0
    int n_lvls = starting_n_lvls, i, nids = 0;
4651
0
    int64_t max_len = 0;
4652
4653
0
    for (i = 0; i < h->n[BCF_DT_CTG]; ++i)
4654
0
    {
4655
0
        if ( !h->id[BCF_DT_CTG][i].val ) continue;
4656
0
        if ( max_len < h->id[BCF_DT_CTG][i].val->info[0] )
4657
0
            max_len = h->id[BCF_DT_CTG][i].val->info[0];
4658
0
        nids++;
4659
0
    }
4660
0
    if ( !max_len ) max_len = (1LL<<31) - 1;  // In case contig line is broken.
4661
4662
0
    hts_adjust_csi_settings(max_len, min_shift_in_out, &n_lvls);
4663
4664
0
    if (nids_out) *nids_out = nids;
4665
0
    return n_lvls;
4666
0
}
4667
4668
hts_idx_t *bcf_index(htsFile *fp, int min_shift)
4669
0
{
4670
0
    int n_lvls;
4671
0
    bcf1_t *b = NULL;
4672
0
    hts_idx_t *idx = NULL;
4673
0
    bcf_hdr_t *h;
4674
0
    int r;
4675
0
    h = bcf_hdr_read(fp);
4676
0
    if ( !h ) return NULL;
4677
0
    int nids = 0;
4678
0
    n_lvls = idx_calc_n_lvls_ids(h, &min_shift, 0, &nids);
4679
0
    idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
4680
0
    if (!idx) goto fail;
4681
0
    b = bcf_init1();
4682
0
    if (!b) goto fail;
4683
0
    while ((r = bcf_read1(fp,h, b)) >= 0) {
4684
0
        int ret;
4685
0
        ret = hts_idx_push(idx, b->rid, b->pos, b->pos + b->rlen, bgzf_tell(fp->fp.bgzf), 1);
4686
0
        if (ret < 0) goto fail;
4687
0
    }
4688
0
    if (r < -1) goto fail;
4689
0
    hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf));
4690
0
    bcf_destroy1(b);
4691
0
    bcf_hdr_destroy(h);
4692
0
    return idx;
4693
4694
0
 fail:
4695
0
    hts_idx_destroy(idx);
4696
0
    bcf_destroy1(b);
4697
0
    bcf_hdr_destroy(h);
4698
0
    return NULL;
4699
0
}
4700
4701
hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx)
4702
0
{
4703
0
    return fnidx? hts_idx_load2(fn, fnidx) : bcf_index_load(fn);
4704
0
}
4705
4706
hts_idx_t *bcf_index_load3(const char *fn, const char *fnidx, int flags)
4707
0
{
4708
0
    return hts_idx_load3(fn, fnidx, HTS_FMT_CSI, flags);
4709
0
}
4710
4711
int bcf_index_build3(const char *fn, const char *fnidx, int min_shift, int n_threads)
4712
0
{
4713
0
    htsFile *fp;
4714
0
    hts_idx_t *idx;
4715
0
    tbx_t *tbx;
4716
0
    int ret;
4717
0
    if ((fp = hts_open(fn, "rb")) == 0) return -2;
4718
0
    if (n_threads)
4719
0
        hts_set_threads(fp, n_threads);
4720
0
    if ( fp->format.compression!=bgzf ) { hts_close(fp); return -3; }
4721
0
    switch (fp->format.format) {
4722
0
        case bcf:
4723
0
            if (!min_shift) {
4724
0
                hts_log_error("TBI indices for BCF files are not supported");
4725
0
                ret = -1;
4726
0
            } else {
4727
0
                idx = bcf_index(fp, min_shift);
4728
0
                if (idx) {
4729
0
                    ret = hts_idx_save_as(idx, fn, fnidx, HTS_FMT_CSI);
4730
0
                    if (ret < 0) ret = -4;
4731
0
                    hts_idx_destroy(idx);
4732
0
                }
4733
0
                else ret = -1;
4734
0
            }
4735
0
            break;
4736
4737
0
        case vcf:
4738
0
            tbx = tbx_index(hts_get_bgzfp(fp), min_shift, &tbx_conf_vcf);
4739
0
            if (tbx) {
4740
0
                ret = hts_idx_save_as(tbx->idx, fn, fnidx, min_shift > 0 ? HTS_FMT_CSI : HTS_FMT_TBI);
4741
0
                if (ret < 0) ret = -4;
4742
0
                tbx_destroy(tbx);
4743
0
            }
4744
0
            else ret = -1;
4745
0
            break;
4746
4747
0
        default:
4748
0
            ret = -3;
4749
0
            break;
4750
0
    }
4751
0
    hts_close(fp);
4752
0
    return ret;
4753
0
}
4754
4755
int bcf_index_build2(const char *fn, const char *fnidx, int min_shift)
4756
0
{
4757
0
    return bcf_index_build3(fn, fnidx, min_shift, 0);
4758
0
}
4759
4760
int bcf_index_build(const char *fn, int min_shift)
4761
0
{
4762
0
    return bcf_index_build3(fn, NULL, min_shift, 0);
4763
0
}
4764
4765
// Initialise fp->idx for the current format type.
4766
// This must be called after the header has been written but no other data.
4767
0
static int vcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) {
4768
0
    int n_lvls, fmt;
4769
4770
0
    if (min_shift == 0) {
4771
0
        min_shift = 14;
4772
0
        n_lvls = 5;
4773
0
        fmt = HTS_FMT_TBI;
4774
0
    } else {
4775
        // Set initial n_lvls to match tbx_index()
4776
0
        int starting_n_lvls = (TBX_MAX_SHIFT - min_shift + 2) / 3;
4777
        // Increase if necessary
4778
0
        n_lvls = idx_calc_n_lvls_ids(h, &min_shift, starting_n_lvls, NULL);
4779
0
        fmt = HTS_FMT_CSI;
4780
0
    }
4781
4782
0
    fp->idx = hts_idx_init(0, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
4783
0
    if (!fp->idx) return -1;
4784
4785
    // Tabix meta data, added even in CSI for VCF
4786
0
    uint8_t conf[4*7];
4787
0
    u32_to_le(TBX_VCF, conf+0);  // fmt
4788
0
    u32_to_le(1,       conf+4);  // name col
4789
0
    u32_to_le(2,       conf+8);  // beg col
4790
0
    u32_to_le(0,       conf+12); // end col
4791
0
    u32_to_le('#',     conf+16); // comment
4792
0
    u32_to_le(0,       conf+20); // n.skip
4793
0
    u32_to_le(0,       conf+24); // ref name len
4794
0
    if (hts_idx_set_meta(fp->idx, sizeof(conf)*sizeof(*conf), (uint8_t *)conf, 1) < 0) {
4795
0
        hts_idx_destroy(fp->idx);
4796
0
        fp->idx = NULL;
4797
0
        return -1;
4798
0
    }
4799
0
    fp->fnidx = fnidx;
4800
4801
0
    return 0;
4802
0
}
4803
4804
// Initialise fp->idx for the current format type.
4805
// This must be called after the header has been written but no other data.
4806
0
int bcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) {
4807
0
    int n_lvls, nids = 0;
4808
4809
0
    if (fp->format.compression != bgzf) {
4810
0
        hts_log_error("Indexing is only supported on BGZF-compressed files");
4811
0
        return -3; // Matches no-compression return for bcf_index_build3()
4812
0
    }
4813
4814
0
    if (fp->format.format == vcf)
4815
0
        return vcf_idx_init(fp, h, min_shift, fnidx);
4816
4817
0
    if (!min_shift)
4818
0
        min_shift = 14;
4819
4820
0
    n_lvls = idx_calc_n_lvls_ids(h, &min_shift, 0, &nids);
4821
4822
0
    fp->idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
4823
0
    if (!fp->idx) return -1;
4824
0
    fp->fnidx = fnidx;
4825
4826
0
    return 0;
4827
0
}
4828
4829
// Finishes an index. Call after the last record has been written.
4830
// Returns 0 on success, <0 on failure.
4831
//
4832
// NB: same format as SAM/BAM as it uses bgzf.
4833
0
int bcf_idx_save(htsFile *fp) {
4834
0
    return sam_idx_save(fp);
4835
0
}
4836
4837
// Wrap around bcf_hdr_name2id() to get the right signature for hts_name2id_f
4838
0
static int bcf_hdr_name2id_wrapper(void *vhdr, const char *ref) {
4839
0
    return bcf_hdr_name2id((bcf_hdr_t *) vhdr, ref);
4840
0
}
4841
4842
hts_itr_t *bcf_itr_querys1(const hts_idx_t *idx, bcf_hdr_t *hdr,
4843
0
                           const char *region) {
4844
0
    return hts_itr_querys(idx, region, bcf_hdr_name2id_wrapper, hdr,
4845
0
                          hts_itr_query, bcf_readrec);
4846
0
}
4847
4848
hts_itr_t *bcf_itr_regarray(const hts_idx_t *idx, bcf_hdr_t *hdr,
4849
0
                            char **regarray, unsigned int regcount) {
4850
0
    hts_itr_t *itr = NULL;
4851
0
    hts_reglist_t *r_list = NULL;
4852
0
    int r_count = 0;
4853
4854
0
    r_list = hts_reglist_create(regarray, regcount, &r_count, hdr,
4855
0
                                bcf_hdr_name2id_wrapper);
4856
0
    if (!r_list)
4857
0
        return NULL;
4858
4859
0
    itr = hts_itr_regions(idx, r_list, r_count, bcf_hdr_name2id_wrapper, hdr,
4860
0
                          hts_itr_multi_bam, bcf_readrec,
4861
0
                          bgzf_pseek, bgzf_ptell);
4862
0
    if (!itr)
4863
0
        hts_reglist_free(r_list, r_count);
4864
4865
0
    return itr;
4866
0
}
4867
4868
/*****************
4869
 *** Utilities ***
4870
 *****************/
4871
4872
int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src)
4873
0
{
4874
0
    int i, ndst_ori = dst->nhrec, need_sync = 0, ret = 0, res;
4875
0
    for (i=0; i<src->nhrec; i++)
4876
0
    {
4877
0
        if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value )
4878
0
        {
4879
0
            int j;
4880
0
            for (j=0; j<ndst_ori; j++)
4881
0
            {
4882
0
                if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
4883
4884
                // Checking only the key part of generic lines, otherwise
4885
                // the VCFs are too verbose. Should we perhaps add a flag
4886
                // to bcf_hdr_combine() and make this optional?
4887
0
                if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break;
4888
0
            }
4889
0
            if ( j>=ndst_ori ) {
4890
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4891
0
                if (res < 0) return -1;
4892
0
                need_sync += res;
4893
0
            }
4894
0
        }
4895
0
        else if ( src->hrec[i]->type==BCF_HL_STR )
4896
0
        {
4897
            // NB: we are ignoring fields without ID
4898
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4899
0
            if ( j>=0 )
4900
0
            {
4901
0
                bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key);
4902
0
                if ( !rec ) {
4903
0
                    res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4904
0
                    if (res < 0) return -1;
4905
0
                    need_sync += res;
4906
0
                }
4907
0
            }
4908
0
        }
4909
0
        else
4910
0
        {
4911
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4912
0
            assert( j>=0 ); // this should always be true for valid VCFs
4913
4914
0
            bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL);
4915
0
            if ( !rec ) {
4916
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4917
0
                if (res < 0) return -1;
4918
0
                need_sync += res;
4919
0
            } else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT )
4920
0
            {
4921
                // Check that both records are of the same type. The bcf_hdr_id2length
4922
                // macro cannot be used here because dst header is not synced yet.
4923
0
                vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID];
4924
0
                vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID];
4925
0
                khint_t k_src  = kh_get(vdict, d_src, src->hrec[i]->vals[0]);
4926
0
                khint_t k_dst  = kh_get(vdict, d_dst, src->hrec[i]->vals[0]);
4927
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) )
4928
0
                {
4929
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths",
4930
0
                        src->hrec[i]->vals[0]);
4931
0
                    ret |= 1;
4932
0
                }
4933
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) )
4934
0
                {
4935
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different types",
4936
0
                        src->hrec[i]->vals[0]);
4937
0
                    ret |= 1;
4938
0
                }
4939
0
            }
4940
0
        }
4941
0
    }
4942
0
    if ( need_sync ) {
4943
0
        if (bcf_hdr_sync(dst) < 0) return -1;
4944
0
    }
4945
0
    return ret;
4946
0
}
4947
4948
bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src)
4949
0
{
4950
0
    if ( !dst )
4951
0
    {
4952
        // this will effectively strip existing IDX attributes from src to become dst
4953
0
        dst = bcf_hdr_init("r");
4954
0
        kstring_t htxt = {0,0,0};
4955
0
        if (bcf_hdr_format(src, 0, &htxt) < 0) {
4956
0
            free(htxt.s);
4957
0
            return NULL;
4958
0
        }
4959
0
        if ( bcf_hdr_parse(dst, htxt.s) < 0 ) {
4960
0
            bcf_hdr_destroy(dst);
4961
0
            dst = NULL;
4962
0
        }
4963
0
        free(htxt.s);
4964
0
        return dst;
4965
0
    }
4966
4967
0
    int i, ndst_ori = dst->nhrec, need_sync = 0, res;
4968
0
    for (i=0; i<src->nhrec; i++)
4969
0
    {
4970
0
        if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value )
4971
0
        {
4972
0
            int j;
4973
0
            for (j=0; j<ndst_ori; j++)
4974
0
            {
4975
0
                if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
4976
4977
                // Checking only the key part of generic lines, otherwise
4978
                // the VCFs are too verbose. Should we perhaps add a flag
4979
                // to bcf_hdr_combine() and make this optional?
4980
0
                if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break;
4981
0
            }
4982
0
            if ( j>=ndst_ori ) {
4983
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4984
0
                if (res < 0) return NULL;
4985
0
                need_sync += res;
4986
0
            }
4987
0
            else if ( !strcmp(src->hrec[i]->key,"fileformat") )
4988
0
            {
4989
0
                int ver_src = bcf_get_version(src,src->hrec[i]->value);
4990
0
                int ver_dst = bcf_get_version(dst,dst->hrec[j]->value);
4991
0
                if ( ver_src > ver_dst )
4992
0
                {
4993
0
                    if (bcf_hdr_set_version(dst,src->hrec[i]->value) < 0)
4994
0
                        return NULL;
4995
0
                    need_sync = 1;
4996
0
                }
4997
0
            }
4998
0
        }
4999
0
        else if ( src->hrec[i]->type==BCF_HL_STR )
5000
0
        {
5001
            // NB: we are ignoring fields without ID
5002
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
5003
0
            if ( j>=0 )
5004
0
            {
5005
0
                bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key);
5006
0
                if ( !rec ) {
5007
0
                    res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
5008
0
                    if (res < 0) return NULL;
5009
0
                    need_sync += res;
5010
0
                }
5011
0
            }
5012
0
        }
5013
0
        else
5014
0
        {
5015
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
5016
0
            assert( j>=0 ); // this should always be true for valid VCFs
5017
5018
0
            bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL);
5019
0
            if ( !rec ) {
5020
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
5021
0
                if (res < 0) return NULL;
5022
0
                need_sync += res;
5023
0
            } else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT )
5024
0
            {
5025
                // Check that both records are of the same type. The bcf_hdr_id2length
5026
                // macro cannot be used here because dst header is not synced yet.
5027
0
                vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID];
5028
0
                vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID];
5029
0
                khint_t k_src  = kh_get(vdict, d_src, src->hrec[i]->vals[0]);
5030
0
                khint_t k_dst  = kh_get(vdict, d_dst, src->hrec[i]->vals[0]);
5031
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) )
5032
0
                {
5033
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths",
5034
0
                        src->hrec[i]->vals[0]);
5035
0
                }
5036
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) )
5037
0
                {
5038
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different types",
5039
0
                        src->hrec[i]->vals[0]);
5040
0
                }
5041
0
            }
5042
0
        }
5043
0
    }
5044
0
    if ( need_sync ) {
5045
0
        if (bcf_hdr_sync(dst) < 0) return NULL;
5046
0
    }
5047
0
    return dst;
5048
0
}
5049
5050
int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *line)
5051
0
{
5052
0
    int i;
5053
0
    if ( line->errcode )
5054
0
    {
5055
0
        char errordescription[1024] = "";
5056
0
        hts_log_error("Unchecked error (%d %s) at %s:%"PRIhts_pos", exiting", line->errcode, bcf_strerror(line->errcode, errordescription, sizeof(errordescription)),  bcf_seqname_safe(src_hdr,line), line->pos+1);
5057
0
        exit(1);
5058
0
    }
5059
0
    if ( src_hdr->ntransl==-1 ) return 0;    // no need to translate, all tags have the same id
5060
0
    if ( !src_hdr->ntransl )  // called for the first time, see what needs translating
5061
0
    {
5062
0
        int dict;
5063
0
        for (dict=0; dict<2; dict++)    // BCF_DT_ID and BCF_DT_CTG
5064
0
        {
5065
0
            src_hdr->transl[dict] = hts_malloc_p(sizeof(int), src_hdr->n[dict]);
5066
0
            for (i=0; i<src_hdr->n[dict]; i++)
5067
0
            {
5068
0
                if ( !src_hdr->id[dict][i].key ) // gap left after removed BCF header lines
5069
0
                {
5070
0
                    src_hdr->transl[dict][i] = -1;
5071
0
                    continue;
5072
0
                }
5073
0
                src_hdr->transl[dict][i] = bcf_hdr_id2int(dst_hdr,dict,src_hdr->id[dict][i].key);
5074
0
                if ( src_hdr->transl[dict][i]!=-1 && i!=src_hdr->transl[dict][i] ) src_hdr->ntransl++;
5075
0
            }
5076
0
        }
5077
0
        if ( !src_hdr->ntransl )
5078
0
        {
5079
0
            free(src_hdr->transl[0]); src_hdr->transl[0] = NULL;
5080
0
            free(src_hdr->transl[1]); src_hdr->transl[1] = NULL;
5081
0
            src_hdr->ntransl = -1;
5082
0
        }
5083
0
        if ( src_hdr->ntransl==-1 ) return 0;
5084
0
    }
5085
0
    bcf_unpack(line,BCF_UN_ALL);
5086
5087
    // CHROM
5088
0
    if ( src_hdr->transl[BCF_DT_CTG][line->rid] >=0 ) line->rid = src_hdr->transl[BCF_DT_CTG][line->rid];
5089
5090
    // FILTER
5091
0
    for (i=0; i<line->d.n_flt; i++)
5092
0
    {
5093
0
        int src_id = line->d.flt[i];
5094
0
        if ( src_hdr->transl[BCF_DT_ID][src_id] >=0 )
5095
0
            line->d.flt[i] = src_hdr->transl[BCF_DT_ID][src_id];
5096
0
        line->d.shared_dirty |= BCF1_DIRTY_FLT;
5097
0
    }
5098
5099
    // INFO
5100
0
    for (i=0; i<line->n_info; i++)
5101
0
    {
5102
0
        int src_id = line->d.info[i].key;
5103
0
        int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
5104
0
        if ( dst_id<0 ) continue;
5105
0
        line->d.info[i].key = dst_id;
5106
0
        if ( !line->d.info[i].vptr ) continue;  // skip deleted
5107
0
        int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
5108
0
        int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
5109
0
        if ( src_size==dst_size )   // can overwrite
5110
0
        {
5111
0
            uint8_t *vptr = line->d.info[i].vptr - line->d.info[i].vptr_off;
5112
0
            if ( dst_size==BCF_BT_INT8 ) { vptr[1] = (uint8_t)dst_id; }
5113
0
            else if ( dst_size==BCF_BT_INT16 ) { *(uint16_t*)vptr = (uint16_t)dst_id; }
5114
0
            else { *(uint32_t*)vptr = (uint32_t)dst_id; }
5115
0
        }
5116
0
        else    // must realloc
5117
0
        {
5118
0
            bcf_info_t *info = &line->d.info[i];
5119
0
            kstring_t str = {0,0,0};
5120
0
            bcf_enc_int1(&str, dst_id);
5121
0
            bcf_enc_size(&str, info->len,info->type);
5122
0
            uint32_t vptr_off = str.l;
5123
0
            kputsn((char*)info->vptr, info->vptr_len, &str);
5124
0
            if( info->vptr_free ) free(info->vptr - info->vptr_off);
5125
0
            info->vptr_off = vptr_off;
5126
0
            info->vptr = (uint8_t*)str.s + info->vptr_off;
5127
0
            info->vptr_free = 1;
5128
0
            line->d.shared_dirty |= BCF1_DIRTY_INF;
5129
0
        }
5130
0
    }
5131
5132
    // FORMAT
5133
0
    for (i=0; i<line->n_fmt; i++)
5134
0
    {
5135
0
        int src_id = line->d.fmt[i].id;
5136
0
        int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
5137
0
        if ( dst_id<0 ) continue;
5138
0
        line->d.fmt[i].id = dst_id;
5139
0
        if( !line->d.fmt[i].p ) continue;  // skip deleted
5140
0
        int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
5141
0
        int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
5142
0
        if ( src_size==dst_size )   // can overwrite
5143
0
        {
5144
0
            uint8_t *p = line->d.fmt[i].p - line->d.fmt[i].p_off;    // pointer to the vector size (4bits) and BT type (4bits)
5145
0
            if ( dst_size==BCF_BT_INT8 ) { p[1] = dst_id; }
5146
0
            else if ( dst_size==BCF_BT_INT16 ) { i16_to_le(dst_id, p + 1); }
5147
0
            else { i32_to_le(dst_id, p + 1); }
5148
0
        }
5149
0
        else    // must realloc
5150
0
        {
5151
0
            bcf_fmt_t *fmt = &line->d.fmt[i];
5152
0
            kstring_t str = {0,0,0};
5153
0
            bcf_enc_int1(&str, dst_id);
5154
0
            bcf_enc_size(&str, fmt->n, fmt->type);
5155
0
            uint32_t p_off = str.l;
5156
0
            kputsn((char*)fmt->p, fmt->p_len, &str);
5157
0
            if( fmt->p_free ) free(fmt->p - fmt->p_off);
5158
0
            fmt->p_off = p_off;
5159
0
            fmt->p = (uint8_t*)str.s + fmt->p_off;
5160
0
            fmt->p_free = 1;
5161
0
            line->d.indiv_dirty = 1;
5162
0
        }
5163
0
    }
5164
0
    return 0;
5165
0
}
5166
5167
bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr)
5168
0
{
5169
0
    bcf_hdr_t *hout = bcf_hdr_init("r");
5170
0
    if (!hout) {
5171
0
        hts_log_error("Failed to allocate bcf header");
5172
0
        return NULL;
5173
0
    }
5174
0
    kstring_t htxt = {0,0,0};
5175
0
    if (bcf_hdr_format(hdr, 1, &htxt) < 0) {
5176
0
        free(htxt.s);
5177
0
        return NULL;
5178
0
    }
5179
0
    if ( bcf_hdr_parse(hout, htxt.s) < 0 ) {
5180
0
        bcf_hdr_destroy(hout);
5181
0
        hout = NULL;
5182
0
    }
5183
0
    free(htxt.s);
5184
0
    return hout;
5185
0
}
5186
5187
bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap)
5188
0
{
5189
0
    void *names_hash = khash_str2int_init();
5190
0
    kstring_t htxt = {0,0,0};
5191
0
    kstring_t str = {0,0,0};
5192
0
    bcf_hdr_t *h = bcf_hdr_init("w");
5193
0
    int r = 0;
5194
0
    if (!h || !names_hash) {
5195
0
        hts_log_error("Failed to allocate bcf header");
5196
0
        goto err;
5197
0
    }
5198
0
    if (bcf_hdr_format(h0, 1, &htxt) < 0) {
5199
0
        hts_log_error("Failed to get header text");
5200
0
        goto err;
5201
0
    }
5202
0
    bcf_hdr_set_version(h,bcf_hdr_get_version(h0));
5203
0
    int j;
5204
0
    for (j=0; j<n; j++) imap[j] = -1;
5205
0
    if ( bcf_hdr_nsamples(h0) > 0) {
5206
0
        char *p = find_chrom_header_line(htxt.s);
5207
0
        int i = 0, end = n? 8 : 7;
5208
0
        while ((p = strchr(p, '\t')) != 0 && i < end) ++i, ++p;
5209
0
        if (i != end) {
5210
0
            hts_log_error("Wrong number of columns in header #CHROM line");
5211
0
            goto err;
5212
0
        }
5213
0
        r |= kputsn(htxt.s, p - htxt.s, &str) < 0;
5214
0
        for (i = 0; i < n; ++i) {
5215
0
            if ( khash_str2int_has_key(names_hash,samples[i]) )
5216
0
            {
5217
0
                hts_log_error("Duplicate sample name \"%s\"", samples[i]);
5218
0
                goto err;
5219
0
            }
5220
0
            imap[i] = bcf_hdr_id2int(h0, BCF_DT_SAMPLE, samples[i]);
5221
0
            if (imap[i] < 0) continue;
5222
0
            r |= kputc('\t', &str) < 0;
5223
0
            r |= kputs(samples[i], &str) < 0;
5224
0
            r |= khash_str2int_inc(names_hash,samples[i]) < 0;
5225
0
        }
5226
0
    } else r |= kputsn(htxt.s, htxt.l, &str) < 0;
5227
0
    while (str.l && (!str.s[str.l-1] || str.s[str.l-1]=='\n') ) str.l--; // kill trailing zeros and newlines
5228
0
    r |= kputc('\n',&str) < 0;
5229
0
    if (r) {
5230
0
        hts_log_error("%s", strerror(errno));
5231
0
        goto err;
5232
0
    }
5233
0
    if ( bcf_hdr_parse(h, str.s) < 0 ) {
5234
0
        bcf_hdr_destroy(h);
5235
0
        h = NULL;
5236
0
    }
5237
0
    free(str.s);
5238
0
    free(htxt.s);
5239
0
    khash_str2int_destroy(names_hash);
5240
0
    return h;
5241
5242
0
 err:
5243
0
    ks_free(&str);
5244
0
    ks_free(&htxt);
5245
0
    khash_str2int_destroy(names_hash);
5246
0
    bcf_hdr_destroy(h);
5247
0
    return NULL;
5248
0
}
5249
5250
int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file)
5251
0
{
5252
0
    if ( samples && !strcmp("-",samples) ) return 0;            // keep all samples
5253
5254
0
    int i, narr = bit_array_size(bcf_hdr_nsamples(hdr));
5255
0
    hdr->keep_samples = (uint8_t*) calloc(narr,1);
5256
0
    if (!hdr->keep_samples) return -1;
5257
5258
0
    hdr->nsamples_ori = bcf_hdr_nsamples(hdr);
5259
0
    if ( !samples )
5260
0
    {
5261
        // exclude all samples
5262
0
        khint_t k;
5263
0
        vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE], *new_dict;
5264
0
        new_dict = kh_init(vdict);
5265
0
        if (!new_dict) return -1;
5266
5267
0
        bcf_hdr_nsamples(hdr) = 0;
5268
5269
0
        for (k = kh_begin(d); k != kh_end(d); ++k)
5270
0
            if (kh_exist(d, k)) free((char*)kh_key(d, k));
5271
0
        kh_destroy(vdict, d);
5272
0
        hdr->dict[BCF_DT_SAMPLE] = new_dict;
5273
0
        if (bcf_hdr_sync(hdr) < 0) return -1;
5274
5275
0
        return 0;
5276
0
    }
5277
5278
0
    if ( samples[0]=='^' )
5279
0
        for (i=0; i<bcf_hdr_nsamples(hdr); i++) bit_array_set(hdr->keep_samples,i);
5280
5281
0
    int idx, n, ret = 0;
5282
0
    char **smpls = hts_readlist(samples[0]=='^'?samples+1:samples, is_file, &n);
5283
0
    if ( !smpls ) return -1;
5284
0
    for (i=0; i<n; i++)
5285
0
    {
5286
0
        idx = bcf_hdr_id2int(hdr,BCF_DT_SAMPLE,smpls[i]);
5287
0
        if ( idx<0 )
5288
0
        {
5289
0
            if ( !ret ) ret = i+1;
5290
0
            continue;
5291
0
        }
5292
0
        assert( idx<bcf_hdr_nsamples(hdr) );
5293
0
        if (  samples[0]=='^' )
5294
0
            bit_array_clear(hdr->keep_samples, idx);
5295
0
        else
5296
0
            bit_array_set(hdr->keep_samples, idx);
5297
0
    }
5298
0
    for (i=0; i<n; i++) free(smpls[i]);
5299
0
    free(smpls);
5300
5301
0
    bcf_hdr_nsamples(hdr) = 0;
5302
0
    for (i=0; i<hdr->nsamples_ori; i++)
5303
0
        if ( bit_array_test(hdr->keep_samples,i) ) bcf_hdr_nsamples(hdr)++;
5304
5305
0
    if ( !bcf_hdr_nsamples(hdr) ) { free(hdr->keep_samples); hdr->keep_samples=NULL; }
5306
0
    else
5307
0
    {
5308
        // Make new list and dictionary with desired samples
5309
0
        char **samples = hts_malloc_p(sizeof(char*), bcf_hdr_nsamples(hdr));
5310
0
        vdict_t *new_dict, *d;
5311
0
        int k, res;
5312
0
        if (!samples) return -1;
5313
5314
0
        new_dict = kh_init(vdict);
5315
0
        if (!new_dict) {
5316
0
            free(samples);
5317
0
            return -1;
5318
0
        }
5319
0
        idx = 0;
5320
0
        for (i=0; i<hdr->nsamples_ori; i++) {
5321
0
            if ( bit_array_test(hdr->keep_samples,i) ) {
5322
0
                samples[idx] = hdr->samples[i];
5323
0
                k = kh_put(vdict, new_dict, hdr->samples[i], &res);
5324
0
                if (res < 0) {
5325
0
                    free(samples);
5326
0
                    kh_destroy(vdict, new_dict);
5327
0
                    return -1;
5328
0
                }
5329
0
                kh_val(new_dict, k) = bcf_idinfo_def;
5330
0
                kh_val(new_dict, k).id = idx;
5331
0
                idx++;
5332
0
            }
5333
0
        }
5334
5335
        // Delete desired samples from old dictionary, so we don't free them
5336
0
        d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE];
5337
0
        for (i=0; i < idx; i++) {
5338
0
            int k = kh_get(vdict, d, samples[i]);
5339
0
            if (k < kh_end(d)) kh_del(vdict, d, k);
5340
0
        }
5341
5342
        // Free everything else
5343
0
        for (k = kh_begin(d); k != kh_end(d); ++k)
5344
0
            if (kh_exist(d, k)) free((char*)kh_key(d, k));
5345
0
        kh_destroy(vdict, d);
5346
0
        hdr->dict[BCF_DT_SAMPLE] = new_dict;
5347
5348
0
        free(hdr->samples);
5349
0
        hdr->samples = samples;
5350
5351
0
        if (bcf_hdr_sync(hdr) < 0)
5352
0
            return -1;
5353
0
    }
5354
5355
0
    return ret;
5356
0
}
5357
5358
int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap)
5359
0
{
5360
0
    kstring_t ind;
5361
0
    ind.s = 0; ind.l = ind.m = 0;
5362
0
    if (n) {
5363
0
        bcf_fmt_t fmt[MAX_N_FMT];
5364
0
        int i, j;
5365
0
        uint8_t *ptr = (uint8_t*)v->indiv.s;
5366
0
        for (i = 0; i < v->n_fmt; ++i)
5367
0
            ptr = bcf_unpack_fmt_core1(ptr, v->n_sample, &fmt[i]);
5368
0
        for (i = 0; i < (int)v->n_fmt; ++i) {
5369
0
            bcf_fmt_t *f = &fmt[i];
5370
0
            bcf_enc_int1(&ind, f->id);
5371
0
            bcf_enc_size(&ind, f->n, f->type);
5372
0
            for (j = 0; j < n; ++j)
5373
0
                if (imap[j] >= 0) kputsn((char*)(f->p + imap[j] * f->size), f->size, &ind);
5374
0
        }
5375
0
        for (i = j = 0; j < n; ++j) if (imap[j] >= 0) ++i;
5376
0
        v->n_sample = i;
5377
0
    } else v->n_sample = 0;
5378
0
    if ( !v->n_sample ) v->n_fmt = 0;
5379
0
    free(v->indiv.s);
5380
0
    v->indiv = ind;
5381
0
    v->unpacked &= ~BCF_UN_FMT;    // only BCF is ready for output, VCF will need to unpack again
5382
0
    return 0;
5383
0
}
5384
5385
int bcf_is_snp(bcf1_t *v)
5386
0
{
5387
0
    int i;
5388
0
    bcf_unpack(v, BCF_UN_STR);
5389
0
    for (i = 0; i < v->n_allele; ++i)
5390
0
    {
5391
0
        if ( v->d.allele[i][1]==0 && v->d.allele[i][0]!='*' ) continue;
5392
5393
        // mpileup's <X> allele, see also below. This is not completely satisfactory,
5394
        // a general library is here narrowly tailored to fit samtools.
5395
0
        if ( v->d.allele[i][0]=='<' && v->d.allele[i][1]=='X' && v->d.allele[i][2]=='>' ) continue;
5396
0
        if ( v->d.allele[i][0]=='<' && v->d.allele[i][1]=='*' && v->d.allele[i][2]=='>' ) continue;
5397
5398
0
        break;
5399
0
    }
5400
0
    return i == v->n_allele;
5401
0
}
5402
5403
static void bcf_set_variant_type(const char *ref, const char *alt, bcf_variant_t *var)
5404
0
{
5405
0
    if ( *alt == '*' && !alt[1] ) { var->n = 0; var->type = VCF_OVERLAP; return; }  // overlapping variant
5406
5407
    // The most frequent case
5408
0
    if ( !ref[1] && !alt[1] )
5409
0
    {
5410
0
        if ( *alt == '.' || *ref==*alt ) { var->n = 0; var->type = VCF_REF; return; }
5411
0
        if ( *alt == 'X' ) { var->n = 0; var->type = VCF_REF; return; }  // mpileup's X allele shouldn't be treated as variant
5412
0
        var->n = 1; var->type = VCF_SNP; return;
5413
0
    }
5414
0
    if ( alt[0]=='<' )
5415
0
    {
5416
0
        if ( alt[1]=='X' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; }  // mpileup's X allele shouldn't be treated as variant
5417
0
        if ( alt[1]=='*' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; }
5418
0
        if ( !strcmp("NON_REF>",alt+1) ) { var->n = 0; var->type = VCF_REF; return; }
5419
0
        var->type = VCF_OTHER;
5420
0
        return;
5421
0
    }
5422
5423
    // Catch "joined before" breakend case
5424
0
    if ( alt[0]==']' || alt[0] == '[' )
5425
0
    {
5426
0
        var->type = VCF_BND; return;
5427
0
    }
5428
5429
    // Iterate through alt characters that match the reference
5430
0
    const char *r = ref, *a = alt;
5431
0
    while (*r && *a && toupper_c(*r)==toupper_c(*a) ) { r++; a++; }     // unfortunately, matching REF,ALT case is not guaranteed
5432
5433
0
    if ( *a && !*r )
5434
0
    {
5435
0
        while ( *a ) a++;
5436
0
        if ( *(a-1)==']' || *(a-1)=='[' ) { var->type = VCF_BND; return; } // "joined after" breakend
5437
0
        var->n = (a-alt)-(r-ref); var->type = VCF_INDEL | VCF_INS; return;
5438
0
    }
5439
0
    else if ( *r && !*a )
5440
0
    {
5441
0
        while ( *r ) r++;
5442
0
        var->n = (a-alt)-(r-ref); var->type = VCF_INDEL | VCF_DEL; return;
5443
0
    }
5444
0
    else if ( !*r && !*a )
5445
0
    {
5446
0
        var->n = 0; var->type = VCF_REF; return;
5447
0
    }
5448
5449
0
    const char *re = r, *ae = a;
5450
0
    while ( re[1] ) re++;
5451
0
    while ( ae[1] ) ae++;
5452
0
    if ( ae[0]==']' || ae[0]=='[' ) { var->type = VCF_BND; return; }    // "joined after" breakend
5453
0
    while ( re>r && ae>a && toupper_c(*re)==toupper_c(*ae) ) { re--; ae--; }
5454
0
    if ( ae==a )
5455
0
    {
5456
0
        if ( re==r ) { var->n = 1; var->type = VCF_SNP; return; }
5457
0
        var->n = -(re-r);
5458
0
        if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL | VCF_DEL; return; }
5459
0
        var->type = VCF_OTHER; return;
5460
0
    }
5461
0
    else if ( re==r )
5462
0
    {
5463
0
        var->n = ae-a;
5464
0
        if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL | VCF_INS; return; }
5465
0
        var->type = VCF_OTHER; return;
5466
0
    }
5467
5468
0
    var->type = ( re-r == ae-a ) ? VCF_MNP : VCF_OTHER;
5469
0
    var->n = ( re-r > ae-a ) ? -(re-r+1) : ae-a+1;
5470
5471
    // should do also complex events, SVs, etc...
5472
0
}
5473
5474
static int bcf_set_variant_types(bcf1_t *b)
5475
0
{
5476
0
    if ( !(b->unpacked & BCF_UN_STR) ) bcf_unpack(b, BCF_UN_STR);
5477
0
    bcf_dec_t *d = &b->d;
5478
0
    if ( d->n_var < b->n_allele )
5479
0
    {
5480
0
        bcf_variant_t *new_var = hts_realloc_p(d->var, sizeof(bcf_variant_t),
5481
0
                                              b->n_allele);
5482
0
        if (!new_var)
5483
0
            return -1;
5484
0
        d->var = new_var;
5485
0
        d->n_var = b->n_allele;
5486
0
    }
5487
0
    int i;
5488
0
    b->d.var_type = 0;
5489
0
    d->var[0].type = VCF_REF;
5490
0
    d->var[0].n    = 0;
5491
0
    for (i=1; i<b->n_allele; i++)
5492
0
    {
5493
0
        bcf_set_variant_type(d->allele[0],d->allele[i], &d->var[i]);
5494
0
        b->d.var_type |= d->var[i].type;
5495
        //fprintf(stderr,"[set_variant_type] %d   %s %s -> %d %d .. %d\n", b->pos+1,d->allele[0],d->allele[i],d->var[i].type,d->var[i].n, b->d.var_type);
5496
0
    }
5497
0
    return 0;
5498
0
}
5499
5500
// bcf_get_variant_type/bcf_get_variant_types should only return the following,
5501
// to be compatible with callers that are not expecting newer values
5502
// like VCF_INS, VCF_DEL.  The full set is available from the newer
5503
// vcf_has_variant_type* interfaces.
5504
0
#define ORIG_VAR_TYPES (VCF_SNP|VCF_MNP|VCF_INDEL|VCF_OTHER|VCF_BND|VCF_OVERLAP)
5505
int bcf_get_variant_types(bcf1_t *rec)
5506
0
{
5507
0
    if ( rec->d.var_type==-1 ) {
5508
0
        if (bcf_set_variant_types(rec) != 0) {
5509
0
            hts_log_error("Couldn't get variant types: %s", strerror(errno));
5510
0
            exit(1); // Due to legacy API having no way to report failures
5511
0
        }
5512
0
    }
5513
0
    return rec->d.var_type & ORIG_VAR_TYPES;
5514
0
}
5515
5516
int bcf_get_variant_type(bcf1_t *rec, int ith_allele)
5517
0
{
5518
0
    if ( rec->d.var_type==-1 ) {
5519
0
        if (bcf_set_variant_types(rec) != 0) {
5520
0
            hts_log_error("Couldn't get variant types: %s", strerror(errno));
5521
0
            exit(1); // Due to legacy API having no way to report failures
5522
0
        }
5523
0
    }
5524
0
    if (ith_allele < 0 || ith_allele >= rec->n_allele) {
5525
0
        hts_log_error("Requested allele outside valid range");
5526
0
        exit(1);
5527
0
    }
5528
0
    return rec->d.var[ith_allele].type & ORIG_VAR_TYPES;
5529
0
}
5530
#undef ORIG_VAR_TYPES
5531
5532
int bcf_has_variant_type(bcf1_t *rec, int ith_allele, uint32_t bitmask)
5533
0
{
5534
0
    if ( rec->d.var_type==-1 ) {
5535
0
        if (bcf_set_variant_types(rec) != 0) return -1;
5536
0
    }
5537
0
    if (ith_allele < 0 || ith_allele >= rec->n_allele) return -1;
5538
0
    if (bitmask == VCF_REF) {  // VCF_REF is 0, so handled as a special case
5539
0
        return rec->d.var[ith_allele].type == VCF_REF;
5540
0
    }
5541
0
    return bitmask & rec->d.var[ith_allele].type;
5542
0
}
5543
5544
int bcf_variant_length(bcf1_t *rec, int ith_allele)
5545
0
{
5546
0
    if ( rec->d.var_type==-1 ) {
5547
0
        if (bcf_set_variant_types(rec) != 0) return bcf_int32_missing;
5548
0
    }
5549
0
    if (ith_allele < 0 || ith_allele >= rec->n_allele) return bcf_int32_missing;
5550
0
    return rec->d.var[ith_allele].n;
5551
0
}
5552
5553
int bcf_has_variant_types(bcf1_t *rec, uint32_t bitmask,
5554
                          enum bcf_variant_match mode)
5555
0
{
5556
0
    if ( rec->d.var_type==-1 ) {
5557
0
        if (bcf_set_variant_types(rec) != 0) return -1;
5558
0
    }
5559
0
    uint32_t type = rec->d.var_type;
5560
0
    if ( mode==bcf_match_overlap ) return bitmask & type;
5561
5562
    // VCF_INDEL is always set with VCF_INS and VCF_DEL by bcf_set_variant_type[s], but the bitmask may
5563
    // ask for say `VCF_INS` or `VCF_INDEL` only
5564
0
    if ( bitmask&(VCF_INS|VCF_DEL) && !(bitmask&VCF_INDEL) ) type &= ~VCF_INDEL;
5565
0
    else if ( bitmask&VCF_INDEL && !(bitmask&(VCF_INS|VCF_DEL)) ) type &= ~(VCF_INS|VCF_DEL);
5566
5567
0
    if ( mode==bcf_match_subset )
5568
0
    {
5569
0
        if ( ~bitmask & type ) return 0;
5570
0
        else return bitmask & type;
5571
0
    }
5572
    // mode == bcf_match_exact
5573
0
    if ( bitmask==VCF_REF ) return type==bitmask ? 1 : 0;
5574
0
    return type==bitmask ? type : 0;
5575
0
}
5576
5577
int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
5578
0
{
5579
0
    static int negative_rlen_warned = 0;
5580
0
    int is_end_tag, is_svlen_tag = 0;
5581
5582
    // Is the field already present?
5583
0
    int i, inf_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
5584
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,inf_id) ) return -1;    // No such INFO field in the header
5585
0
    if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
5586
5587
0
    is_end_tag = strcmp(key, "END") == 0;
5588
0
    is_svlen_tag = strcmp(key, "SVLEN") == 0;
5589
5590
0
    for (i=0; i<line->n_info; i++)
5591
0
        if ( inf_id==line->d.info[i].key ) break;
5592
0
    bcf_info_t *inf = i==line->n_info ? NULL : &line->d.info[i];
5593
5594
0
    if ( !n || (type==BCF_HT_STR && !values) )
5595
0
    {
5596
0
        if ( inf )
5597
0
        {
5598
            // Mark the tag for removal, free existing memory if necessary
5599
0
            if ( inf->vptr_free )
5600
0
            {
5601
0
                free(inf->vptr - inf->vptr_off);
5602
0
                inf->vptr_free = 0;
5603
0
            }
5604
0
            line->d.shared_dirty |= BCF1_DIRTY_INF;
5605
0
            inf->vptr = NULL;
5606
0
            inf->vptr_off = inf->vptr_len = 0;
5607
0
        }
5608
0
        if ( n==0 && (is_end_tag || is_svlen_tag) ) {
5609
0
            line->rlen = get_rlen(hdr, line);
5610
0
        }
5611
0
        return 0;
5612
0
    }
5613
5614
0
    if (is_end_tag)
5615
0
    {
5616
0
        if (n != 1)
5617
0
        {
5618
0
            hts_log_error("END info tag should only have one value at %s:%"PRIhts_pos, bcf_seqname_safe(hdr,line), line->pos+1);
5619
0
            line->errcode |= BCF_ERR_TAG_INVALID;
5620
0
            return -1;
5621
0
        }
5622
0
        if (type != BCF_HT_INT && type != BCF_HT_LONG)
5623
0
        {
5624
0
            hts_log_error("Wrong type (%d) for END info tag at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
5625
0
            line->errcode |= BCF_ERR_TAG_INVALID;
5626
0
            return -1;
5627
0
        }
5628
0
    }
5629
5630
    // Encode the values and determine the size required to accommodate the values
5631
0
    kstring_t str = {0,0,0};
5632
0
    bcf_enc_int1(&str, inf_id);
5633
0
    if ( type==BCF_HT_INT )
5634
0
        bcf_enc_vint(&str, n, (int32_t*)values, -1);
5635
0
    else if ( type==BCF_HT_REAL )
5636
0
        bcf_enc_vfloat(&str, n, (float*)values);
5637
0
    else if ( type==BCF_HT_FLAG || type==BCF_HT_STR )
5638
0
    {
5639
0
        if ( values==NULL )
5640
0
            bcf_enc_size(&str, 0, BCF_BT_NULL);
5641
0
        else
5642
0
            bcf_enc_vchar(&str, strlen((char*)values), (char*)values);
5643
0
    }
5644
#ifdef VCF_ALLOW_INT64
5645
    else if ( type==BCF_HT_LONG )
5646
    {
5647
        if (n != 1) {
5648
            hts_log_error("Only storing a single BCF_HT_LONG value is supported at %s:%"PRIhts_pos, bcf_seqname_safe(hdr,line), line->pos+1);
5649
            abort();
5650
        }
5651
        bcf_enc_long1(&str, *(int64_t *) values);
5652
    }
5653
#endif
5654
0
    else
5655
0
    {
5656
0
        hts_log_error("The type %d not implemented yet at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
5657
0
        abort();
5658
0
    }
5659
5660
    // Is the INFO tag already present
5661
0
    if ( inf )
5662
0
    {
5663
        // Is it big enough to accommodate new block?
5664
0
        if ( inf->vptr && str.l <= inf->vptr_len + inf->vptr_off )
5665
0
        {
5666
0
            if ( str.l != inf->vptr_len + inf->vptr_off ) line->d.shared_dirty |= BCF1_DIRTY_INF;
5667
0
            uint8_t *ptr = inf->vptr - inf->vptr_off;
5668
0
            memcpy(ptr, str.s, str.l);
5669
0
            free(str.s);
5670
0
            int vptr_free = inf->vptr_free;
5671
0
            bcf_unpack_info_core1(ptr, inf);
5672
0
            inf->vptr_free = vptr_free;
5673
0
        }
5674
0
        else
5675
0
        {
5676
0
            if ( inf->vptr_free )
5677
0
                free(inf->vptr - inf->vptr_off);
5678
0
            bcf_unpack_info_core1((uint8_t*)str.s, inf);
5679
0
            inf->vptr_free = 1;
5680
0
            line->d.shared_dirty |= BCF1_DIRTY_INF;
5681
0
        }
5682
0
    }
5683
0
    else
5684
0
    {
5685
        // The tag is not present, create new one
5686
0
        line->n_info++;
5687
0
        hts_expand0(bcf_info_t, line->n_info, line->d.m_info , line->d.info);
5688
0
        inf = &line->d.info[line->n_info-1];
5689
0
        bcf_unpack_info_core1((uint8_t*)str.s, inf);
5690
0
        inf->vptr_free = 1;
5691
0
        line->d.shared_dirty |= BCF1_DIRTY_INF;
5692
0
    }
5693
0
    line->unpacked |= BCF_UN_INFO;
5694
5695
0
   if ( n==1 && is_end_tag) {
5696
0
        hts_pos_t end = type == BCF_HT_INT ? *(int32_t *) values : *(int64_t *) values;
5697
0
        if ( (type == BCF_HT_INT && end!=bcf_int32_missing) || (type == BCF_HT_LONG && end!=bcf_int64_missing) )
5698
0
        {
5699
0
            if ( end <= line->pos )
5700
0
            {
5701
0
                if ( !negative_rlen_warned )
5702
0
                {
5703
0
                    hts_log_warning("INFO/END=%"PRIhts_pos" is smaller than POS at %s:%"PRIhts_pos,end,bcf_seqname_safe(hdr,line),line->pos+1);
5704
0
                    negative_rlen_warned = 1;
5705
0
                }
5706
0
            }
5707
0
        }
5708
0
    }
5709
0
    if (is_svlen_tag || is_end_tag) {
5710
0
        line->rlen = get_rlen(hdr, line);
5711
0
    }
5712
0
    return 0;
5713
0
}
5714
5715
int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n)
5716
0
{
5717
0
    if ( !n )
5718
0
        return bcf_update_format(hdr,line,key,NULL,0,BCF_HT_STR);
5719
5720
0
    int i, max_len = 0;
5721
0
    for (i=0; i<n; i++)
5722
0
    {
5723
0
        int len = strlen(values[i]);
5724
0
        if ( len > max_len ) max_len = len;
5725
0
    }
5726
0
    char *out = hts_malloc_p(max_len, n);
5727
0
    if ( !out ) return -2;
5728
0
    for (i=0; i<n; i++)
5729
0
    {
5730
0
        char *dst = out+i*max_len;
5731
0
        const char *src = values[i];
5732
0
        int j = 0;
5733
0
        while ( src[j] ) { dst[j] = src[j]; j++; }
5734
0
        for (; j<max_len; j++) dst[j] = 0;
5735
0
    }
5736
0
    int ret = bcf_update_format(hdr,line,key,out,max_len*n,BCF_HT_STR);
5737
0
    free(out);
5738
0
    return ret;
5739
0
}
5740
5741
int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
5742
0
{
5743
    // Is the field already present?
5744
0
    int i, fmt_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
5745
0
    int is_len = 0;
5746
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,fmt_id) )
5747
0
    {
5748
0
        if ( !n ) return 0;
5749
0
        return -1;  // the key not present in the header
5750
0
    }
5751
5752
0
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
5753
5754
0
    for (i=0; i<line->n_fmt; i++)
5755
0
        if ( line->d.fmt[i].id==fmt_id ) break;
5756
0
    bcf_fmt_t *fmt = i==line->n_fmt ? NULL : &line->d.fmt[i];
5757
5758
0
    is_len = strcmp(key, "LEN") == 0;
5759
0
    if ( !n )
5760
0
    {
5761
0
        if ( fmt )
5762
0
        {
5763
            // Mark the tag for removal, free existing memory if necessary
5764
0
            if ( fmt->p_free )
5765
0
            {
5766
0
                free(fmt->p - fmt->p_off);
5767
0
                fmt->p_free = 0;
5768
0
            }
5769
0
            line->d.indiv_dirty = 1;
5770
0
            fmt->p = NULL;
5771
0
        }
5772
0
        if (is_len) {
5773
0
            line->rlen = get_rlen(hdr, line);
5774
0
        }
5775
0
        return 0;
5776
0
    }
5777
5778
0
    line->n_sample = bcf_hdr_nsamples(hdr);
5779
0
    int nps = n / line->n_sample;  // number of values per sample
5780
0
    assert( nps && nps*line->n_sample==n );     // must be divisible by n_sample
5781
5782
    // Encode the values and determine the size required to accommodate the values
5783
0
    kstring_t str = {0,0,0};
5784
0
    bcf_enc_int1(&str, fmt_id);
5785
0
    assert(values != NULL);
5786
0
    if ( type==BCF_HT_INT )
5787
0
        bcf_enc_vint(&str, n, (int32_t*)values, nps);
5788
0
    else if ( type==BCF_HT_REAL )
5789
0
    {
5790
0
        bcf_enc_size(&str, nps, BCF_BT_FLOAT);
5791
0
        serialize_float_array(&str, nps*line->n_sample, (float *) values);
5792
0
    }
5793
0
    else if ( type==BCF_HT_STR )
5794
0
    {
5795
0
        bcf_enc_size(&str, nps, BCF_BT_CHAR);
5796
0
        kputsn((char*)values, nps*line->n_sample, &str);
5797
0
    }
5798
0
    else
5799
0
    {
5800
0
        hts_log_error("The type %d not implemented yet at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
5801
0
        abort();
5802
0
    }
5803
5804
0
    if ( !fmt )
5805
0
    {
5806
        // Not present, new format field
5807
0
        line->n_fmt++;
5808
0
        hts_expand0(bcf_fmt_t, line->n_fmt, line->d.m_fmt, line->d.fmt);
5809
5810
        // Special case: VCF specification requires that GT is always first
5811
0
        if ( line->n_fmt > 1 && key[0]=='G' && key[1]=='T' && !key[2] )
5812
0
        {
5813
0
            for (i=line->n_fmt-1; i>0; i--)
5814
0
                line->d.fmt[i] = line->d.fmt[i-1];
5815
0
            fmt = &line->d.fmt[0];
5816
0
        }
5817
0
        else
5818
0
            fmt = &line->d.fmt[line->n_fmt-1];
5819
0
        bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
5820
0
        line->d.indiv_dirty = 1;
5821
0
        fmt->p_free = 1;
5822
0
    }
5823
0
    else
5824
0
    {
5825
        // The tag is already present, check if it is big enough to accommodate the new block
5826
0
        if ( fmt->p && str.l <= fmt->p_len + fmt->p_off )
5827
0
        {
5828
            // good, the block is big enough
5829
0
            if ( str.l != fmt->p_len + fmt->p_off ) line->d.indiv_dirty = 1;
5830
0
            uint8_t *ptr = fmt->p - fmt->p_off;
5831
0
            memcpy(ptr, str.s, str.l);
5832
0
            free(str.s);
5833
0
            int p_free = fmt->p_free;
5834
0
            bcf_unpack_fmt_core1(ptr, line->n_sample, fmt);
5835
0
            fmt->p_free = p_free;
5836
0
        }
5837
0
        else
5838
0
        {
5839
0
            if ( fmt->p_free )
5840
0
                free(fmt->p - fmt->p_off);
5841
0
            bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
5842
0
            fmt->p_free = 1;
5843
0
            line->d.indiv_dirty = 1;
5844
0
        }
5845
0
    }
5846
0
    line->unpacked |= BCF_UN_FMT;
5847
5848
0
    if (is_len) {
5849
0
        line->rlen = get_rlen(hdr, line);
5850
0
    }
5851
0
    return 0;
5852
0
}
5853
5854
5855
int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n)
5856
0
{
5857
0
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
5858
0
    line->d.shared_dirty |= BCF1_DIRTY_FLT;
5859
0
    line->d.n_flt = n;
5860
0
    if ( !n ) return 0;
5861
0
    hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);
5862
0
    int i;
5863
0
    for (i=0; i<n; i++)
5864
0
        line->d.flt[i] = flt_ids[i];
5865
0
    return 0;
5866
0
}
5867
5868
int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id)
5869
0
{
5870
0
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
5871
0
    int i;
5872
0
    for (i=0; i<line->d.n_flt; i++)
5873
0
        if ( flt_id==line->d.flt[i] ) break;
5874
0
    if ( i<line->d.n_flt ) return 0;    // this filter is already set
5875
0
    line->d.shared_dirty |= BCF1_DIRTY_FLT;
5876
0
    if ( flt_id==0 )    // set to PASS
5877
0
        line->d.n_flt = 1;
5878
0
    else if ( line->d.n_flt==1 && line->d.flt[0]==0 )
5879
0
        line->d.n_flt = 1;
5880
0
    else
5881
0
        line->d.n_flt++;
5882
0
    hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);
5883
0
    line->d.flt[line->d.n_flt-1] = flt_id;
5884
0
    return 1;
5885
0
}
5886
int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int pass)
5887
0
{
5888
0
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
5889
0
    int i;
5890
0
    for (i=0; i<line->d.n_flt; i++)
5891
0
        if ( flt_id==line->d.flt[i] ) break;
5892
0
    if ( i==line->d.n_flt ) return 0;   // the filter is not present
5893
0
    line->d.shared_dirty |= BCF1_DIRTY_FLT;
5894
0
    if ( i!=line->d.n_flt-1 ) memmove(line->d.flt+i,line->d.flt+i+1,(line->d.n_flt-i-1)*sizeof(*line->d.flt));
5895
0
    line->d.n_flt--;
5896
0
    if ( !line->d.n_flt && pass ) bcf_add_filter(hdr,line,0);
5897
0
    return 0;
5898
0
}
5899
5900
int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter)
5901
0
{
5902
0
    if ( filter[0]=='.' && !filter[1] ) filter = "PASS";
5903
0
    int id = bcf_hdr_id2int(hdr, BCF_DT_ID, filter);
5904
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FLT,id) ) return -1;  // not defined in the header
5905
5906
0
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
5907
0
    if ( id==0 && !line->d.n_flt) return 1; // PASS
5908
5909
0
    int i;
5910
0
    for (i=0; i<line->d.n_flt; i++)
5911
0
        if ( line->d.flt[i]==id ) return 1;
5912
0
    return 0;
5913
0
}
5914
5915
static inline int _bcf1_sync_alleles(const bcf_hdr_t *hdr, bcf1_t *line, int nals)
5916
0
{
5917
0
    line->d.shared_dirty |= BCF1_DIRTY_ALS;
5918
0
    line->d.var_type = -1;
5919
5920
0
    line->n_allele = nals;
5921
0
    hts_expand(char*, line->n_allele, line->d.m_allele, line->d.allele);
5922
5923
0
    char *als = line->d.als;
5924
0
    int n = 0;
5925
0
    while (n<nals)
5926
0
    {
5927
0
        line->d.allele[n] = als;
5928
0
        while ( *als ) als++;
5929
0
        als++;
5930
0
        n++;
5931
0
    }
5932
    // Update REF length. Note that END is 1-based while line->pos 0-based
5933
0
    line->rlen = get_rlen(hdr, line);
5934
5935
0
    return 0;
5936
0
}
5937
int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals)
5938
0
{
5939
0
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
5940
0
    char *free_old = NULL;
5941
0
    char buffer[256];
5942
0
    size_t used = 0;
5943
5944
    // The pointers in alleles may point into the existing line->d.als memory,
5945
    // so care needs to be taken not to clobber them while updating.  Usually
5946
    // they will be short so we can copy through an intermediate buffer.
5947
    // If they're longer, or won't fit in the existing allocation we
5948
    // can allocate a new buffer to write into.  Note that in either case
5949
    // pointers to line->d.als memory in alleles may not be valid when we've
5950
    // finished.
5951
0
    int i;
5952
0
    size_t avail = line->d.m_als < sizeof(buffer) ? line->d.m_als : sizeof(buffer);
5953
0
    for (i=0; i<nals; i++) {
5954
0
        size_t sz = strlen(alleles[i]) + 1;
5955
0
        if (avail - used < sz)
5956
0
            break;
5957
0
        memcpy(buffer + used, alleles[i], sz);
5958
0
        used += sz;
5959
0
    }
5960
5961
    // Did we miss anything?
5962
0
    if (i < nals) {
5963
0
        int j;
5964
0
        size_t needed = used;
5965
0
        char *new_als;
5966
0
        for (j = i; j < nals; j++)
5967
0
            needed += strlen(alleles[j]) + 1;
5968
0
        if (needed < line->d.m_als) // Don't shrink the buffer
5969
0
            needed = line->d.m_als;
5970
0
        if (needed > INT_MAX) {
5971
0
            hts_log_error("REF + alleles too long to fit in a BCF record");
5972
0
            return -1;
5973
0
        }
5974
0
        new_als = malloc(needed);
5975
0
        if (!new_als)
5976
0
            return -1;
5977
0
        free_old = line->d.als;
5978
0
        line->d.als = new_als;
5979
0
        line->d.m_als = needed;
5980
0
    }
5981
5982
    // Copy from the temp buffer to the destination
5983
0
    if (used) {
5984
0
        assert(used <= line->d.m_als);
5985
0
        memcpy(line->d.als, buffer, used);
5986
0
    }
5987
5988
    // Add in any remaining entries - if this happens we will always be
5989
    // writing to a newly-allocated buffer.
5990
0
    for (; i < nals; i++) {
5991
0
        size_t sz = strlen(alleles[i]) + 1;
5992
0
        memcpy(line->d.als + used, alleles[i], sz);
5993
0
        used += sz;
5994
0
    }
5995
5996
0
    if (free_old)
5997
0
        free(free_old);
5998
0
    return _bcf1_sync_alleles(hdr,line,nals);
5999
0
}
6000
6001
int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string)
6002
0
{
6003
0
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
6004
0
    kstring_t tmp;
6005
0
    tmp.l = 0; tmp.s = line->d.als; tmp.m = line->d.m_als;
6006
0
    kputs(alleles_string, &tmp);
6007
0
    line->d.als = tmp.s; line->d.m_als = tmp.m;
6008
6009
0
    int nals = 1;
6010
0
    char *t = line->d.als;
6011
0
    while (*t)
6012
0
    {
6013
0
        if ( *t==',' ) { *t = 0; nals++; }
6014
0
        t++;
6015
0
    }
6016
0
    return _bcf1_sync_alleles(hdr, line, nals);
6017
0
}
6018
6019
int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
6020
0
{
6021
0
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
6022
0
    kstring_t tmp;
6023
0
    tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id;
6024
0
    if ( id )
6025
0
        kputs(id, &tmp);
6026
0
    else
6027
0
        kputs(".", &tmp);
6028
0
    line->d.id = tmp.s; line->d.m_id = tmp.m;
6029
0
    line->d.shared_dirty |= BCF1_DIRTY_ID;
6030
0
    return 0;
6031
0
}
6032
6033
int bcf_add_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
6034
0
{
6035
0
    if ( !id ) return 0;
6036
0
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
6037
6038
0
    kstring_t tmp;
6039
0
    tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id;
6040
6041
0
    int len = strlen(id);
6042
0
    char *dst = line->d.id;
6043
0
    while ( *dst && (dst=strstr(dst,id)) )
6044
0
    {
6045
0
        if ( dst[len]!=0 && dst[len]!=';' ) dst++;              // a prefix, not a match
6046
0
        else if ( dst==line->d.id || dst[-1]==';' ) return 0;   // already present
6047
0
        dst++;  // a suffix, not a match
6048
0
    }
6049
0
    if ( line->d.id && (line->d.id[0]!='.' || line->d.id[1]) )
6050
0
    {
6051
0
        tmp.l = strlen(line->d.id);
6052
0
        kputc(';',&tmp);
6053
0
    }
6054
0
    kputs(id,&tmp);
6055
6056
0
    line->d.id = tmp.s; line->d.m_id = tmp.m;
6057
0
    line->d.shared_dirty |= BCF1_DIRTY_ID;
6058
0
    return 0;
6059
6060
0
}
6061
6062
bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
6063
0
{
6064
0
    int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
6065
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) ) return NULL;   // no such FMT field in the header
6066
0
    return bcf_get_fmt_id(line, id);
6067
0
}
6068
6069
bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
6070
0
{
6071
0
    int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
6072
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,id) ) return NULL;   // no such INFO field in the header
6073
0
    return bcf_get_info_id(line, id);
6074
0
}
6075
6076
bcf_fmt_t *bcf_get_fmt_id(bcf1_t *line, const int id)
6077
0
{
6078
0
    int i;
6079
0
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
6080
0
    for (i=0; i<line->n_fmt; i++)
6081
0
    {
6082
0
        if ( line->d.fmt[i].id==id ) return &line->d.fmt[i];
6083
0
    }
6084
0
    return NULL;
6085
0
}
6086
6087
bcf_info_t *bcf_get_info_id(bcf1_t *line, const int id)
6088
0
{
6089
0
    int i;
6090
0
    if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
6091
0
    for (i=0; i<line->n_info; i++)
6092
0
    {
6093
0
        if ( line->d.info[i].key==id ) return &line->d.info[i];
6094
0
    }
6095
0
    return NULL;
6096
0
}
6097
6098
6099
int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
6100
0
{
6101
0
    int i, ret = -4, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
6102
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,tag_id) ) return -1;    // no such INFO field in the header
6103
0
    if ( bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id)!=(type & 0xff) ) return -2;     // expected different type
6104
6105
0
    if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
6106
6107
0
    for (i=0; i<line->n_info; i++)
6108
0
        if ( line->d.info[i].key==tag_id ) break;
6109
0
    if ( i==line->n_info ) return ( type==BCF_HT_FLAG ) ? 0 : -3;       // the tag is not present in this record
6110
0
    if ( type==BCF_HT_FLAG ) return 1;
6111
6112
0
    bcf_info_t *info = &line->d.info[i];
6113
0
    if ( !info->vptr ) return -3;           // the tag was marked for removal
6114
0
    if ( type==BCF_HT_STR )
6115
0
    {
6116
0
        if ( *ndst < info->len+1 )
6117
0
        {
6118
0
            *ndst = info->len + 1;
6119
0
            *dst  = realloc(*dst, *ndst);
6120
0
        }
6121
0
        memcpy(*dst,info->vptr,info->len);
6122
0
        ((uint8_t*)*dst)[info->len] = 0;
6123
0
        return info->len;
6124
0
    }
6125
6126
    // Make sure the buffer is big enough
6127
0
    int size1;
6128
0
    switch (type) {
6129
0
        case BCF_HT_INT:  size1 = sizeof(int32_t); break;
6130
0
        case BCF_HT_LONG: size1 = sizeof(int64_t); break;
6131
0
        case BCF_HT_REAL: size1 = sizeof(float); break;
6132
0
        default:
6133
0
            hts_log_error("Unexpected output type %d at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
6134
0
            return -2;
6135
0
    }
6136
0
    if ( *ndst < info->len )
6137
0
    {
6138
0
        *ndst = info->len;
6139
0
        *dst  = hts_realloc_p(*dst, *ndst, size1);
6140
0
    }
6141
6142
0
    #define BRANCH(type_t, convert, is_missing, is_vector_end, set_missing, set_regular, out_type_t) do { \
6143
0
        out_type_t *tmp = (out_type_t *) *dst; \
6144
0
        int j; \
6145
0
        for (j=0; j<info->len; j++) \
6146
0
        { \
6147
0
            type_t p = convert(info->vptr + j * sizeof(type_t)); \
6148
0
            if ( is_vector_end ) break; \
6149
0
            if ( is_missing ) set_missing; \
6150
0
            else set_regular; \
6151
0
            tmp++; \
6152
0
        } \
6153
0
        ret = j; \
6154
0
    } while (0)
6155
0
    switch (info->type) {
6156
0
        case BCF_BT_INT8:
6157
0
            if (type == BCF_HT_LONG) {
6158
0
                BRANCH(int8_t,  le_to_i8,  p==bcf_int8_missing,  p==bcf_int8_vector_end,  *tmp=bcf_int64_missing, *tmp=p, int64_t);
6159
0
            } else {
6160
0
                BRANCH(int8_t,  le_to_i8,  p==bcf_int8_missing,  p==bcf_int8_vector_end,  *tmp=bcf_int32_missing, *tmp=p, int32_t);
6161
0
            }
6162
0
            break;
6163
0
        case BCF_BT_INT16:
6164
0
            if (type == BCF_HT_LONG) {
6165
0
                BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t);
6166
0
            } else {
6167
0
                BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t);
6168
0
            }
6169
0
            break;
6170
0
        case BCF_BT_INT32:
6171
0
            if (type == BCF_HT_LONG) {
6172
0
                BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t); break;
6173
0
            } else {
6174
0
                BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); break;
6175
0
            }
6176
0
        case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set(tmp, p), float); break;
6177
0
        default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, info->type, bcf_seqname_safe(hdr,line), line->pos+1); return -2;
6178
0
    }
6179
0
    #undef BRANCH
6180
0
    return ret;  // set by BRANCH
6181
0
}
6182
6183
int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst)
6184
0
{
6185
0
    int i,tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
6186
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1;    // no such FORMAT field in the header
6187
0
    if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2;     // expected different type
6188
6189
0
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
6190
6191
0
    for (i=0; i<line->n_fmt; i++)
6192
0
        if ( line->d.fmt[i].id==tag_id ) break;
6193
0
    if ( i==line->n_fmt ) return -3;                               // the tag is not present in this record
6194
0
    bcf_fmt_t *fmt = &line->d.fmt[i];
6195
0
    if ( !fmt->p ) return -3;                                      // the tag was marked for removal
6196
6197
0
    int nsmpl = bcf_hdr_nsamples(hdr);
6198
0
    if ( !*dst )
6199
0
    {
6200
0
        *dst = hts_malloc_p(sizeof(char*), nsmpl);
6201
0
        if ( !*dst ) return -4;     // could not alloc
6202
0
        (*dst)[0] = NULL;
6203
0
    }
6204
0
    int n = (fmt->n+1)*nsmpl;
6205
0
    if ( *ndst < n )
6206
0
    {
6207
0
        (*dst)[0] = realloc((*dst)[0], n);
6208
0
        if ( !(*dst)[0] ) return -4;    // could not alloc
6209
0
        *ndst = n;
6210
0
    }
6211
0
    for (i=0; i<nsmpl; i++)
6212
0
    {
6213
0
        uint8_t *src = fmt->p + i*fmt->n;
6214
0
        uint8_t *tmp = (uint8_t*)(*dst)[0] + i*(fmt->n+1);
6215
0
        memcpy(tmp,src,fmt->n);
6216
0
        tmp[fmt->n] = 0;
6217
0
        (*dst)[i] = (char*) tmp;
6218
0
    }
6219
0
    return n;
6220
0
}
6221
6222
int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
6223
0
{
6224
0
    int i,j, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
6225
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1;    // no such FORMAT field in the header
6226
0
    if ( tag[0]=='G' && tag[1]=='T' && tag[2]==0 )
6227
0
    {
6228
        // Ugly: GT field is considered to be a string by the VCF header but BCF represents it as INT.
6229
0
        if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2;
6230
0
    }
6231
0
    else if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=type ) return -2;     // expected different type
6232
6233
0
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
6234
6235
0
    for (i=0; i<line->n_fmt; i++)
6236
0
        if ( line->d.fmt[i].id==tag_id ) break;
6237
0
    if ( i==line->n_fmt ) return -3;                               // the tag is not present in this record
6238
0
    bcf_fmt_t *fmt = &line->d.fmt[i];
6239
0
    if ( !fmt->p ) return -3;                                      // the tag was marked for removal
6240
6241
0
    if ( type==BCF_HT_STR )
6242
0
    {
6243
0
        int n = fmt->n*bcf_hdr_nsamples(hdr);
6244
0
        if ( *ndst < n )
6245
0
        {
6246
0
            *dst  = realloc(*dst, n);
6247
0
            if ( !*dst ) return -4;     // could not alloc
6248
0
            *ndst = n;
6249
0
        }
6250
0
        memcpy(*dst,fmt->p,n);
6251
0
        return n;
6252
0
    }
6253
6254
    // Make sure the buffer is big enough
6255
0
    int nsmpl = bcf_hdr_nsamples(hdr);
6256
0
    int size1 = type==BCF_HT_INT ? sizeof(int32_t) : sizeof(float);
6257
0
    if ( *ndst < fmt->n*nsmpl )
6258
0
    {
6259
0
        *ndst = fmt->n*nsmpl;
6260
0
        *dst  = hts_realloc_p(*dst, *ndst, size1);
6261
0
        if ( !*dst ) return -4;     // could not alloc
6262
0
    }
6263
6264
0
    #define BRANCH(type_t, convert, is_missing, is_vector_end, set_missing, set_vector_end, set_regular, out_type_t) { \
6265
0
        out_type_t *tmp = (out_type_t *) *dst; \
6266
0
        uint8_t *fmt_p = fmt->p; \
6267
0
        for (i=0; i<nsmpl; i++) \
6268
0
        { \
6269
0
            for (j=0; j<fmt->n; j++) \
6270
0
            { \
6271
0
                type_t p = convert(fmt_p + j * sizeof(type_t)); \
6272
0
                if ( is_missing ) set_missing; \
6273
0
                else if ( is_vector_end ) { set_vector_end; break; } \
6274
0
                else set_regular; \
6275
0
                tmp++; \
6276
0
            } \
6277
0
            for (; j<fmt->n; j++) { set_vector_end; tmp++; } \
6278
0
            fmt_p += fmt->size; \
6279
0
        } \
6280
0
    }
6281
0
    switch (fmt->type) {
6282
0
        case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8, p==bcf_int8_missing,  p==bcf_int8_vector_end,  *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
6283
0
        case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
6284
0
        case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
6285
0
        case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set_vector_end(*tmp), bcf_float_set(tmp, p), float); break;
6286
0
        default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, fmt->type, bcf_seqname_safe(hdr,line), line->pos+1); exit(1);
6287
0
    }
6288
0
    #undef BRANCH
6289
6290
0
    return nsmpl*fmt->n;
6291
0
}
6292
6293
//error description structure definition
6294
typedef struct err_desc {
6295
    int  errorcode;
6296
    const char *description;
6297
}err_desc;
6298
6299
// error descriptions
6300
static const err_desc errdesc_bcf[] = {
6301
    { BCF_ERR_CTG_UNDEF, "Contig not defined in header"},
6302
    { BCF_ERR_TAG_UNDEF, "Tag not defined in header" },
6303
    { BCF_ERR_NCOLS, "Incorrect number of columns" },
6304
    { BCF_ERR_LIMITS, "Limits reached" },
6305
    { BCF_ERR_CHAR, "Invalid character" },
6306
    { BCF_ERR_CTG_INVALID, "Invalid contig" },
6307
    { BCF_ERR_TAG_INVALID, "Invalid tag" },
6308
};
6309
6310
/// append given description to buffer based on available size and add ... when not enough space
6311
    /** @param buffer       buffer to which description to be appended
6312
        @param offset       offset at which to be appended
6313
        @param maxbuffer    maximum size of the buffer
6314
        @param description  the description to be appended
6315
on failure returns -1 - when buffer is not big enough; returns -1 on invalid params and on too small buffer which are improbable due to validation at caller site
6316
on success returns 0
6317
    */
6318
3.93k
static int add_desc_to_buffer(char *buffer, size_t *offset, size_t maxbuffer, const char *description) {
6319
6320
3.93k
    if (!description || !buffer || !offset || (maxbuffer < 4))
6321
0
        return -1;
6322
6323
3.93k
    size_t rembuffer = maxbuffer - *offset;
6324
3.93k
    if (rembuffer > (strlen(description) + (rembuffer == maxbuffer ? 0 : 1))) {    //add description with optionally required ','
6325
3.93k
        *offset += snprintf(buffer + *offset, rembuffer, "%s%s", (rembuffer == maxbuffer)? "": ",", description);
6326
3.93k
    } else {    //not enough space for description, put ...
6327
0
        size_t tmppos = (rembuffer <= 4) ? maxbuffer - 4 : *offset;
6328
0
        snprintf(buffer + tmppos, 4, "...");    //ignore offset update
6329
0
        return -1;
6330
0
    }
6331
3.93k
    return 0;
6332
3.93k
}
6333
6334
//get description for given error code. return NULL on error
6335
1.87k
const char *bcf_strerror(int errorcode, char *buffer, size_t maxbuffer) {
6336
1.87k
    size_t usedup = 0;
6337
1.87k
    int ret = 0;
6338
1.87k
    int idx;
6339
6340
1.87k
    if (!buffer || maxbuffer < 4)
6341
0
        return NULL;           //invalid / insufficient buffer
6342
6343
1.87k
    if (!errorcode) {
6344
0
        buffer[0] = '\0';      //no error, set null
6345
0
        return buffer;
6346
0
    }
6347
6348
15.0k
    for (idx = 0; idx < sizeof(errdesc_bcf) / sizeof(err_desc); ++idx) {
6349
13.1k
        if (errorcode & errdesc_bcf[idx].errorcode) {    //error is set, add description
6350
3.93k
            ret = add_desc_to_buffer(buffer, &usedup, maxbuffer, errdesc_bcf[idx].description);
6351
3.93k
            if (ret < 0)
6352
0
                break;         //not enough space, ... added, no need to continue
6353
6354
3.93k
            errorcode &= ~errdesc_bcf[idx].errorcode;    //reset the error
6355
3.93k
        }
6356
13.1k
    }
6357
6358
1.87k
    if (errorcode && (ret >= 0))  {     //undescribed error is present in error code and had enough buffer, try to add unkonwn error as well§
6359
0
        add_desc_to_buffer(buffer, &usedup, maxbuffer, "Unknown error");
6360
0
    }
6361
1.87k
    return buffer;
6362
1.87k
}
6363
6364
/**
6365
 *  bcf_format_gt_v2 - formats GT information on a string
6366
 *  @param hdr - bcf header, to get version
6367
 *  @param fmt - pointer to bcf format data
6368
 *  @param isample - position of interested sample in data
6369
 *  @param str - pointer to output string
6370
 *  Returns 0 on success and -1 on failure
6371
 *  This method is preferred over bcf_format_gt as this supports vcf4.4 and
6372
 *  prefixed phasing. Explicit / prefixed phasing for 1st allele is used only
6373
 *  when it is a must to correctly express phasing.
6374
 * correctly express phasing.
6375
 */
6376
int bcf_format_gt_v2(const bcf_hdr_t *hdr, bcf_fmt_t *fmt, int isample, kstring_t *str)
6377
16.0k
{
6378
16.0k
    uint32_t e = 0;
6379
16.0k
    int ploidy = 1, anyunphased = 0;
6380
16.0k
    int32_t val0 = 0;
6381
16.0k
    size_t pos = str ? str->l : 0;
6382
6383
16.0k
    #define BRANCH(type_t, convert, missing, vector_end) { \
6384
14.4k
        uint8_t *ptr = fmt->p + isample*fmt->size; \
6385
14.4k
        int i; \
6386
37.5k
        for (i=0; i<fmt->n; i++, ptr += sizeof(type_t)) \
6387
32.4k
        { \
6388
32.4k
            type_t val = convert(ptr); \
6389
32.4k
            if ( val == vector_end ) break; \
6390
32.4k
            if (!i) { val0 = val; } \
6391
23.0k
            if (i) { \
6392
8.66k
                e |= kputc("/|"[val & 1], str) < 0; \
6393
8.66k
                anyunphased |= !(val & 1); \
6394
8.66k
            } \
6395
23.0k
            if (!(val >> 1)) e |= kputc('.', str) < 0; \
6396
23.0k
            else e |= kputw((val >> 1) - 1, str) < 0; \
6397
23.0k
        } \
6398
14.4k
        if (i == 0) e |= kputc('.', str) < 0; \
6399
14.4k
        ploidy = i; \
6400
14.4k
    }
6401
16.0k
    switch (fmt->type) {
6402
8.30k
        case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8,  bcf_int8_missing,
6403
8.30k
            bcf_int8_vector_end); break;
6404
957
        case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, bcf_int16_missing,
6405
957
            bcf_int16_vector_end); break;
6406
5.16k
        case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, bcf_int32_missing,
6407
5.16k
            bcf_int32_vector_end); break;
6408
1.60k
        case BCF_BT_NULL:  e |= kputc('.', str) < 0; break;
6409
0
        default: hts_log_error("Unexpected type %d", fmt->type); return -2;
6410
16.0k
    }
6411
16.0k
    #undef BRANCH
6412
6413
16.0k
    if (hdr && get_hdr_aux(hdr)->version >= VCF44) {
6414
        //output which supports prefixed phasing
6415
6416
        /* update 1st allele's phasing if required and append rest to it.
6417
        use prefixed phasing only when it is a must. i.e. without which the
6418
        inferred value will be incorrect */
6419
9.42k
        if (val0 & 1) {
6420
            /* 1st one is phased, if ploidy is > 1 and an unphased allele exists
6421
             need to specify explicitly */
6422
1.09k
            e |= (ploidy > 1 && anyunphased) ?
6423
88
                    (kinsert_char('|', pos, str) < 0) :
6424
1.09k
                        (ploidy <= 1 && !((val0 >> 1)) ? //|. needs explicit o/p
6425
0
                            (kinsert_char('|', pos, str) < 0) :
6426
1.00k
                            0);
6427
8.33k
        } else {
6428
            /* 1st allele is unphased, if ploidy is = 1 or allele is '.' or
6429
             ploidy > 1 and no other unphased allele exist, need to specify
6430
             explicitly */
6431
8.33k
            e |= ((ploidy <= 1 && val0 != 0) || (ploidy > 1 && !anyunphased)) ?
6432
5.14k
                    (kinsert_char('/', pos, str) < 0) :
6433
8.33k
                    0;
6434
8.33k
        }
6435
9.42k
    }
6436
16.0k
    return e == 0 ? 0 : -1;
6437
16.0k
}
6438
6439
/**
6440
 *  get_rlen - calculates and returns rlen value
6441
 *  @param h - bcf header
6442
 *  @param v - bcf data
6443
 *  Returns rlen calculated on success and -1 on failure.
6444
 *  rlen calculation is dependent on vcf version and a few other field data.
6445
 *  When bcf decoded data is available, refers it. When not available, retrieves
6446
 *  required field data by seeking on the data stream.
6447
 *  Ideally pos & version be set appropriately before any info/format field
6448
 *  update to have proper rlen calculation.
6449
 *  As version is not kept properly updated in practice, it is ignored in calcs.
6450
 */
6451
static int64_t get_rlen(const bcf_hdr_t *h, bcf1_t *v)
6452
32.8k
{
6453
32.8k
    uint8_t *f = (uint8_t*)v->shared.s, *t = NULL,
6454
32.8k
        *e = (uint8_t*)v->shared.s + v->shared.l;
6455
32.8k
    int size, type, id, lenid, endid, svlenid, i, bad, gvcf = 0, use_svlen = 0;
6456
32.8k
    bcf_info_t *endinfo = NULL, *svleninfo = NULL, end_lcl, svlen_lcl;
6457
32.8k
    bcf_fmt_t *lenfmt = NULL, len_lcl;
6458
6459
    //holds SVLEN allele status for the max no of alleles
6460
32.8k
    uint8_t svlenals[8192];
6461
    //pos from info END, fmt LEN, info SVLEN
6462
32.8k
    hts_pos_t end = 0, end_fmtlen = 0, end_svlen = 0, hpos;
6463
32.8k
    int64_t len_ref = 0, len = 0, tmp;
6464
32.8k
    endid = bcf_hdr_id2int(h, BCF_DT_ID, "END");
6465
6466
    //initialise bytes which are to be used
6467
32.8k
    memset(svlenals, 0, 1 + v->n_allele / 8);
6468
6469
    //use decoded data where ever available and where not, get from stream
6470
32.8k
    if (v->unpacked & BCF_UN_STR || v->d.shared_dirty & BCF1_DIRTY_ALS) {
6471
0
        for (i = 1; i < v->n_allele; ++i) {
6472
            // check only symbolic alt alleles
6473
0
            if (v->d.allele[i][0] != '<')
6474
0
                continue;
6475
0
            if (svlen_on_ref_for_vcf_alt(v->d.allele[i], -1)) {
6476
                // del, dup or cnv allele, note to check corresponding svlen val
6477
0
                svlenals[i >> 3] |= 1 << (i & 7);
6478
0
                use_svlen = 1;
6479
0
            } else if (!strcmp(v->d.allele[i], "<*>") ||
6480
0
                         !strcmp(v->d.allele[i], "<NON_REF>")) {
6481
0
                gvcf = 1;   //gvcf present, have to check for LEN field
6482
0
            }
6483
0
        }
6484
0
        f += v->unpack_size[0] + v->unpack_size[1];
6485
0
        len_ref = v->n_allele ? strlen(v->d.allele[0]) : 0;
6486
32.8k
    } else if (f < e) {
6487
        //skip ID
6488
32.8k
        size = bcf_dec_size(f, &f, &type);
6489
32.8k
        f += size << bcf_type_shift[type];
6490
        // REF, ALT
6491
2.37M
        for (i = 0; i < v->n_allele; ++i) {
6492
            //check all alleles, w/o NUL
6493
2.33M
            size = bcf_dec_size(f, &f, &type);
6494
2.33M
            if (!i) {   //REF length
6495
32.8k
                len_ref = size;
6496
2.30M
            } else if (size > 0 && *f == '<') {
6497
22.1k
                if (svlen_on_ref_for_vcf_alt((char *) f, size)) {
6498
                    // del, dup or cnv allele, note to check corresponding svlen val
6499
27
                    svlenals[i >> 3] |= 1 << (i & 7);
6500
27
                    use_svlen = 1;
6501
22.1k
                } else if ((size == 3 && !strncmp((char*)f, "<*>", size)) ||
6502
14.7k
                    (size == 9 && !strncmp((char*)f, "<NON_REF>", size))) {
6503
7.77k
                    gvcf = 1;   //gvcf present, have to check for LEN field
6504
7.77k
                }
6505
22.1k
            }
6506
2.33M
            f += size << bcf_type_shift[type];
6507
2.33M
        }
6508
32.8k
    }
6509
    // FILTER
6510
32.8k
    if (v->unpacked & BCF_UN_FLT) {
6511
0
        f += v->unpack_size[2];
6512
32.8k
    } else if (f < e) {
6513
32.8k
        size = bcf_dec_size(f, &f, &type);
6514
32.8k
        f += size << bcf_type_shift[type];
6515
32.8k
    }
6516
6517
    // Only do SVLEN lookup if there are suitable symbolic alleles
6518
32.8k
    svlenid = use_svlen ? bcf_hdr_id2int(h, BCF_DT_ID, "SVLEN") : -1;
6519
6520
    // INFO
6521
32.8k
    if (svlenid >= 0 || endid >= 0 ) {  //only if end/svlen present
6522
13.2k
        if (v->unpacked & BCF_UN_INFO || v->d.shared_dirty & BCF1_DIRTY_INF) {
6523
0
            endinfo = bcf_get_info(h, v, "END");
6524
0
            svleninfo = bcf_get_info(h, v, "SVLEN");
6525
13.2k
        } else if (f < e) {
6526
21.2k
            for (i = 0; i < v->n_info; ++i) {
6527
16.6k
                id = bcf_dec_typed_int1(f, &t);
6528
16.6k
                if (id == endid) {  //END
6529
1.97k
                    t = bcf_unpack_info_core1(f, &end_lcl);
6530
1.97k
                    endinfo = &end_lcl;
6531
1.97k
                    if (svleninfo || svlenid < 0) {
6532
1.97k
                        break;  //already got svlen or no need to search further
6533
1.97k
                    }
6534
14.7k
                } else if (id == svlenid) { //SVLEN
6535
0
                    t = bcf_unpack_info_core1(f, &svlen_lcl);
6536
0
                    svleninfo = &svlen_lcl;
6537
0
                    if (endinfo || endid < 0 ) {
6538
0
                        break;  //already got end or no need to search further
6539
0
                    }
6540
14.7k
                } else {
6541
14.7k
                    f = t;
6542
14.7k
                    size = bcf_dec_size(f, &t, &type);
6543
14.7k
                    t += size << bcf_type_shift[type];
6544
14.7k
                }
6545
14.7k
                f = t;
6546
14.7k
            }
6547
6.54k
        }
6548
13.2k
    }
6549
6550
    // Only do LEN lookup if a <*> allele was found
6551
32.8k
    lenid = gvcf ? bcf_hdr_id2int(h, BCF_DT_ID, "LEN") : -1;
6552
6553
    // FORMAT
6554
32.8k
    if (lenid >= 0) {
6555
        //with LEN and has gvcf allele
6556
0
        f = (uint8_t*)v->indiv.s; t = NULL; e = (uint8_t*)v->indiv.s + v->indiv.l;
6557
0
        if (v->unpacked & BCF_UN_FMT || v->d.indiv_dirty) {
6558
0
            lenfmt = bcf_get_fmt(h, v, "LEN");
6559
0
        } else if (f < e) {
6560
0
            for (i = 0; i < v->n_fmt; ++i) {
6561
0
                id = bcf_dec_typed_int1(f, &t);
6562
0
                if (id == lenid) {
6563
0
                        t = bcf_unpack_fmt_core1(f, v->n_sample, &len_lcl);
6564
0
                    lenfmt = &len_lcl;
6565
0
                    break;  //that's all needed
6566
0
                } else {
6567
0
                    f = t;
6568
0
                    size = bcf_dec_size(f, &t, &type);
6569
0
                    t += size * v->n_sample << bcf_type_shift[type];
6570
0
                }
6571
0
                f = t;
6572
0
            }
6573
0
        }
6574
0
    }
6575
    //got required data, find end and rlen
6576
32.8k
    if (endinfo && endinfo->vptr) { //end position given by info END
6577
        //end info exists, not being deleted
6578
1.97k
        end = endinfo->v1.i;
6579
1.97k
        switch(endinfo->type) {
6580
0
            case BCF_BT_INT8:  end = end == bcf_int8_missing ? 0 : end;  break;
6581
0
            case BCF_BT_INT16: end = end == bcf_int16_missing ? 0 : end; break;
6582
0
            case BCF_BT_INT32: end = end == bcf_int32_missing ? 0 : end; break;
6583
0
            case BCF_BT_INT64: end = end == bcf_int64_missing ? 0 : end; break;
6584
1.97k
            default: end = 0; break; //invalid
6585
1.97k
        }
6586
1.97k
    }
6587
6588
32.8k
    if (svleninfo && svleninfo->vptr) {
6589
        //svlen info exists, not being deleted
6590
0
        bad = 0;
6591
        //get largest svlen corresponding to a <DEL> symbolic allele
6592
0
        for (i = 0; i < svleninfo->len && i + 1 < v->n_allele; ++i) {
6593
0
            if (!(svlenals[i >> 3] & (1 << ((i + 1) & 7))))
6594
0
                continue;
6595
6596
0
            switch(svleninfo->type) {
6597
0
                case BCF_BT_INT8:
6598
0
                    tmp = le_to_i8(&svleninfo->vptr[i]);
6599
0
                    tmp = tmp == bcf_int8_missing ? 0 : tmp;
6600
0
                break;
6601
0
                case BCF_BT_INT16:
6602
0
                    tmp = le_to_i16(&svleninfo->vptr[i * 2]);
6603
0
                    tmp = tmp == bcf_int16_missing ? 0 : tmp;
6604
0
                break;
6605
0
                case BCF_BT_INT32:
6606
0
                    tmp = le_to_i32(&svleninfo->vptr[i * 4]);
6607
0
                    tmp = tmp == bcf_int32_missing ? 0 : tmp;
6608
0
                break;
6609
0
                case BCF_BT_INT64:
6610
0
                    tmp = le_to_i64(&svleninfo->vptr[i * 8]);
6611
0
                    tmp = tmp == bcf_int64_missing ? 0 : tmp;
6612
0
                break;
6613
0
                default: //invalid
6614
0
                    tmp = 0;
6615
0
                    bad = 1;
6616
0
                break;
6617
0
            }
6618
0
            if (bad) {  //stop svlen check
6619
0
                len = 0;
6620
0
                break;
6621
0
            }
6622
6623
0
            tmp = tmp < 0 ? llabs(tmp) : tmp;
6624
0
            if (len < tmp) len = tmp;
6625
0
        }
6626
0
    }
6627
32.8k
    if ((!svleninfo || !len) && end) { //no svlen, infer from end
6628
0
        len = end > v->pos ? end - v->pos - 1 : 0;
6629
0
    }
6630
32.8k
    end_svlen = v->pos + len + 1;   //end position found from SVLEN
6631
6632
32.8k
    len = 0;
6633
32.8k
    if (lenfmt && lenfmt->p) {
6634
        //fmt len exists, not being deleted, has gvcf and version >= 4.5
6635
0
        int j = 0;
6636
0
        int64_t offset = 0;
6637
0
        bad = 0;
6638
0
        for (i = 0; i < v->n_sample; ++i) {
6639
0
            for (j = 0; j < lenfmt->n; ++j) {
6640
0
                switch(lenfmt->type) {
6641
0
                case BCF_BT_INT8:
6642
0
                    tmp = le_to_i8(lenfmt->p + offset + j);
6643
0
                    tmp = tmp == bcf_int8_missing ? 0 : tmp;
6644
0
                break;
6645
0
                case BCF_BT_INT16:
6646
0
                    tmp = le_to_i16(lenfmt->p + offset + j * 2);
6647
0
                    tmp = tmp == bcf_int16_missing ? 0 : tmp;
6648
0
                break;
6649
0
                case BCF_BT_INT32:
6650
0
                    tmp = le_to_i32(lenfmt->p + offset + j * 4);
6651
0
                    tmp = tmp == bcf_int32_missing ? 0 : tmp;
6652
0
                break;
6653
0
                case BCF_BT_INT64:
6654
0
                    tmp = le_to_i64(lenfmt->p + offset + j * 8);
6655
0
                    tmp = tmp == bcf_int64_missing ? 0 : tmp;
6656
0
                break;
6657
0
                default: //invalid
6658
0
                    bad = 1;
6659
0
                break;
6660
0
                }
6661
0
                if (bad) {  //stop LEN check
6662
0
                    len = 0;
6663
0
                    break;
6664
0
                }
6665
                //assumes only gvcf have valid LEN
6666
0
                if (len < tmp) len = tmp;
6667
0
            }
6668
0
            offset += j << bcf_type_shift[lenfmt->type];
6669
0
        }
6670
0
    }
6671
32.8k
    if ((!lenfmt || !len) && end) { //no fmt len, infer from end
6672
0
        len = end > v->pos ? end - v->pos : 0;
6673
0
    }
6674
32.8k
    end_fmtlen = v->pos + len;  //end position found from LEN
6675
6676
    //get largest pos, based on END, SVLEN, fmt LEN and length using it
6677
32.8k
    hpos = end < end_svlen ?
6678
8.03k
            end_svlen < end_fmtlen ? end_fmtlen : end_svlen :
6679
32.8k
            end < end_fmtlen ? end_fmtlen : end;
6680
32.8k
    len = hpos - v->pos;
6681
6682
    //NOTE: 'end' calculation be in sync with tbx.c:tbx_parse1
6683
6684
    /* rlen to be calculated based on version, END, SVLEN, fmt LEN, ref len.
6685
    Relevance of these fields vary across different vcf versions.
6686
    Many times, these info/fmt fields are used without version updates;
6687
    hence these fields are used for calculation disregarding vcf version */
6688
32.8k
    return len < len_ref ? len_ref : len;
6689
32.8k
}