Coverage Report

Created: 2026-02-11 06:20

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/vcf.c
Line
Count
Source
1
/*  vcf.c -- VCF/BCF API functions.
2
3
    Copyright (C) 2012, 2013 Broad Institute.
4
    Copyright (C) 2012-2025 Genome Research Ltd.
5
    Portions copyright (C) 2014 Intel Corporation.
6
7
    Author: Heng Li <lh3@sanger.ac.uk>
8
9
Permission is hereby granted, free of charge, to any person obtaining a copy
10
of this software and associated documentation files (the "Software"), to deal
11
in the Software without restriction, including without limitation the rights
12
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13
copies of the Software, and to permit persons to whom the Software is
14
furnished to do so, subject to the following conditions:
15
16
The above copyright notice and this permission notice shall be included in
17
all copies or substantial portions of the Software.
18
19
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25
DEALINGS IN THE SOFTWARE.  */
26
27
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
28
#include <config.h>
29
30
#include <stdio.h>
31
#include <assert.h>
32
#include <string.h>
33
#include <strings.h>
34
#include <stdlib.h>
35
#include <limits.h>
36
#include <stdint.h>
37
#include <inttypes.h>
38
#include <errno.h>
39
40
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
41
#include "fuzz_settings.h"
42
#endif
43
44
#include "htslib/vcf.h"
45
#include "htslib/bgzf.h"
46
#include "htslib/tbx.h"
47
#include "htslib/hfile.h"
48
#include "hts_internal.h"
49
#include "htslib/hts_endian.h"
50
#include "htslib/khash_str2int.h"
51
#include "htslib/kstring.h"
52
#include "htslib/sam.h"
53
#include "htslib/khash.h"
54
#include "bgzf_internal.h"
55
56
#if 0
57
// This helps on Intel a bit, often 6-7% faster VCF parsing.
58
// Conversely sometimes harms AMD Zen4 as ~9% slower.
59
// Possibly related to IPC differences.  However for now it's just a
60
// curiousity we ignore and stick with the simpler code.
61
//
62
// Left here as a hint for future explorers.
63
static inline int xstreq(const char *a, const char *b) {
64
    while (*a && *a == *b)
65
        a++, b++;
66
    return *a == *b;
67
}
68
69
#define KHASH_MAP_INIT_XSTR(name, khval_t) \
70
  KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, xstreq)
71
72
KHASH_MAP_INIT_XSTR(vdict, bcf_idinfo_t)
73
#else
74
KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t)
75
#endif
76
77
typedef khash_t(vdict) vdict_t;
78
79
KHASH_MAP_INIT_STR(hdict, bcf_hrec_t*)
80
typedef khash_t(hdict) hdict_t;
81
82
83
#include "htslib/kseq.h"
84
HTSLIB_EXPORT
85
uint32_t bcf_float_missing    = 0x7F800001;
86
87
HTSLIB_EXPORT
88
uint32_t bcf_float_vector_end = 0x7F800002;
89
90
HTSLIB_EXPORT
91
uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
92
93
static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, NULL, NULL}, .id = -1 };
94
95
/*
96
    Partial support for 64-bit POS and Number=1 INFO tags.
97
    Notes:
98
     - the support for 64-bit values is motivated by POS and INFO/END for large genomes
99
     - the use of 64-bit values does not conform to the specification
100
     - cannot output 64-bit BCF and if it does, it is not compatible with anything
101
     - experimental, use at your risk
102
*/
103
#ifdef VCF_ALLOW_INT64
104
    #define BCF_MAX_BT_INT64 (0x7fffffffffffffff)       /* INT64_MAX, for internal use only */
105
    #define BCF_MIN_BT_INT64 -9223372036854775800LL     /* INT64_MIN + 8, for internal use only */
106
#endif
107
108
978
#define BCF_IS_64BIT (1<<30)
109
110
111
// Opaque structure with auxilary data which allows to extend bcf_hdr_t without breaking ABI.
112
// Note that this preserving API and ABI requires that the first element is vdict_t struct
113
// rather than a pointer, as user programs may (and in some cases do) access the dictionary
114
// directly as (vdict_t*)hdr->dict.
115
typedef struct
116
{
117
    vdict_t dict;   // bcf_hdr_t.dict[0] vdict_t dictionary which keeps bcf_idinfo_t for BCF_HL_FLT,BCF_HL_INFO,BCF_HL_FMT
118
    hdict_t *gen;   // hdict_t dictionary which keeps bcf_hrec_t* pointers for generic and structured fields
119
    size_t *key_len;// length of h->id[BCF_DT_ID] strings
120
    int version;    //cached version
121
    uint32_t ref_count; // reference count, low bit indicates bcf_hdr_destroy() has been called
122
}
123
bcf_hdr_aux_t;
124
125
static inline bcf_hdr_aux_t *get_hdr_aux(const bcf_hdr_t *hdr)
126
168k
{
127
168k
    return (bcf_hdr_aux_t *)hdr->dict[0];
128
168k
}
129
130
//version macros
131
46.9k
#define VCF_DEF 4002000
132
19.6k
#define VCF44   4004000
133
9.50k
#define VCF45   4005000
134
135
#define VCF_MAJOR_VER(x) ( (x) / 10000 / 100 )
136
#define VCF_MINOR_VER(x) ( ((x) % 1000000) / 1000 )
137
138
/**
139
 *  bcf_get_version - get the version as int
140
 *  @param hdr   - bcf header, to get version
141
 *  @param verstr- version string, which is already available
142
 *  Returns version on success and default version on failure
143
 *  version = major * 100 * 10000 + minor * 1000
144
 */
145
static int bcf_get_version(const bcf_hdr_t *hdr, const char *verstr)
146
11.3k
{
147
11.3k
    const char *version = NULL, vcf[] = "VCFv";
148
11.3k
    char *major = NULL, *minor = NULL;
149
11.3k
    int ver = -1;
150
11.3k
    long tmp = 0;
151
11.3k
    bcf_hdr_aux_t *aux = NULL;
152
153
11.3k
    if (!hdr && !verstr) {  //invalid input
154
0
        goto fail;
155
0
    }
156
157
11.3k
    if (hdr) {
158
9.68k
        if ((aux = get_hdr_aux(hdr)) && aux->version != 0) {    //use cached version
159
9.25k
            return aux->version;
160
9.25k
        }
161
        //get from header
162
436
        version = bcf_hdr_get_version(hdr);
163
1.67k
    } else {
164
        //get from version string
165
1.67k
        version = verstr;
166
1.67k
    }
167
2.11k
    if (!(major = strstr(version, vcf))) {  //bad format
168
1.34k
        goto fail;
169
1.34k
    }
170
763
    major += sizeof(vcf) - 1;
171
763
    if (!(minor = strchr(major, '.'))) {    //bad format
172
100
        goto fail;
173
100
    }
174
663
    tmp = strtol(major, NULL, 10);
175
663
    if ((!tmp && errno == EINVAL) ||
176
589
        ((tmp == LONG_MIN || tmp == LONG_MAX) && errno == ERANGE)) {    //failed
177
74
        goto fail;
178
74
    }
179
589
    ver = tmp * 100 * 10000;
180
589
    tmp = strtol(++minor, NULL, 10);
181
589
    if ((!tmp && errno == EINVAL) ||
182
542
        ((tmp == LONG_MIN || tmp == LONG_MAX) && errno == ERANGE)) {    //failed
183
243
        goto fail;
184
243
    }
185
346
    ver += tmp * 1000;
186
346
    return ver;
187
188
1.76k
fail:
189
1.76k
    hts_log_warning("Couldn't get VCF version, considering as %d.%d",
190
1.76k
        VCF_MAJOR_VER(VCF_DEF), VCF_MINOR_VER(VCF_DEF));
191
1.76k
    return VCF_DEF;
192
589
}
193
194
// Header reference counting
195
196
static void bcf_hdr_incr_ref(bcf_hdr_t *h)
197
704
{
198
704
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
199
704
    aux->ref_count += 2;
200
704
}
201
202
static void bcf_hdr_decr_ref(bcf_hdr_t *h)
203
704
{
204
704
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
205
704
    if (aux->ref_count >= 2)
206
704
        aux->ref_count -= 2;
207
208
704
    if (aux->ref_count == 0)
209
649
        bcf_hdr_destroy(h);
210
704
}
211
212
static void hdr_bgzf_private_data_cleanup(void *data)
213
704
{
214
704
    bcf_hdr_t *h = (bcf_hdr_t *) data;
215
704
    bcf_hdr_decr_ref(h);
216
704
}
217
218
static char *find_chrom_header_line(char *s)
219
0
{
220
0
    char *nl;
221
0
    if (strncmp(s, "#CHROM\t", 7) == 0) return s;
222
0
    else if ((nl = strstr(s, "\n#CHROM\t")) != NULL) return nl+1;
223
0
    else return NULL;
224
0
}
225
226
static int64_t get_rlen(const bcf_hdr_t *h, bcf1_t *v);
227
228
/*************************
229
 *** VCF header parser ***
230
 *************************/
231
232
static int bcf_hdr_add_sample_len(bcf_hdr_t *h, const char *s, size_t len)
233
2.35k
{
234
2.35k
    const char *ss = s;
235
2.39k
    while ( *ss && isspace_c(*ss) && ss - s < len) ss++;
236
2.35k
    if ( !*ss || ss - s == len)
237
2
    {
238
2
        hts_log_error("Empty sample name: trailing spaces/tabs in the header line?");
239
2
        return -1;
240
2
    }
241
242
2.34k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_SAMPLE];
243
2.34k
    int ret;
244
2.34k
    char *sdup = malloc(len + 1);
245
2.34k
    if (!sdup) return -1;
246
2.34k
    memcpy(sdup, s, len);
247
2.34k
    sdup[len] = 0;
248
249
    // Ensure space is available in h->samples
250
2.34k
    size_t n = kh_size(d);
251
2.34k
    char **new_samples = realloc(h->samples, sizeof(char*) * (n + 1));
252
2.34k
    if (!new_samples) {
253
0
        free(sdup);
254
0
        return -1;
255
0
    }
256
2.34k
    h->samples = new_samples;
257
258
2.34k
    int k = kh_put(vdict, d, sdup, &ret);
259
2.34k
    if (ret < 0) {
260
0
        free(sdup);
261
0
        return -1;
262
0
    }
263
2.34k
    if (ret) { // absent
264
2.34k
        kh_val(d, k) = bcf_idinfo_def;
265
2.34k
        kh_val(d, k).id = n;
266
2.34k
    } else {
267
0
        hts_log_error("Duplicated sample name '%s'", sdup);
268
0
        free(sdup);
269
0
        return -1;
270
0
    }
271
2.34k
    h->samples[n] = sdup;
272
2.34k
    h->dirty = 1;
273
2.34k
    return 0;
274
2.34k
}
275
276
int bcf_hdr_add_sample(bcf_hdr_t *h, const char *s)
277
0
{
278
0
    if (!s) {
279
        // Allowed for backwards-compatibility, calling with s == NULL
280
        // used to trigger bcf_hdr_sync(h);
281
0
        return 0;
282
0
    }
283
0
    return bcf_hdr_add_sample_len(h, s, strlen(s));
284
0
}
285
286
int HTS_RESULT_USED bcf_hdr_parse_sample_line(bcf_hdr_t *hdr, const char *str)
287
1.19k
{
288
1.19k
    const char *mandatory = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO";
289
1.19k
    if ( strncmp(str,mandatory,strlen(mandatory)) )
290
8
    {
291
8
        hts_log_error("Could not parse the \"#CHROM..\" line, either the fields are incorrect or spaces are present instead of tabs:\n\t%s",str);
292
8
        return -1;
293
8
    }
294
295
1.19k
    const char *beg = str + strlen(mandatory), *end;
296
1.19k
    if ( !*beg || *beg=='\n' ) return 0;
297
346
    if ( strncmp(beg,"\tFORMAT\t",8) )
298
0
    {
299
0
        hts_log_error("Could not parse the \"#CHROM..\" line, either FORMAT is missing or spaces are present instead of tabs:\n\t%s",str);
300
0
        return -1;
301
0
    }
302
346
    beg += 8;
303
304
346
    int ret = 0;
305
2.35k
    while ( *beg )
306
2.35k
    {
307
2.35k
        end = beg;
308
36.5M
        while ( *end && *end!='\t' && *end!='\n' ) end++;
309
2.35k
        if ( bcf_hdr_add_sample_len(hdr, beg, end-beg) < 0 ) ret = -1;
310
2.35k
        if ( !*end || *end=='\n' || ret<0 ) break;
311
2.00k
        beg = end + 1;
312
2.00k
    }
313
346
    return ret;
314
346
}
315
316
int bcf_hdr_sync(bcf_hdr_t *h)
317
38.9k
{
318
38.9k
    int i;
319
155k
    for (i = 0; i < 3; i++)
320
116k
    {
321
116k
        vdict_t *d = (vdict_t*)h->dict[i];
322
116k
        khint_t k;
323
116k
        if ( h->n[i] < kh_size(d) )
324
344
        {
325
344
            bcf_idpair_t *new_idpair;
326
            // this should be true only for i=2, BCF_DT_SAMPLE
327
344
            new_idpair = (bcf_idpair_t*) realloc(h->id[i], kh_size(d)*sizeof(bcf_idpair_t));
328
344
            if (!new_idpair) return -1;
329
344
            h->n[i] = kh_size(d);
330
344
            h->id[i] = new_idpair;
331
344
        }
332
1.35G
        for (k=kh_begin(d); k<kh_end(d); k++)
333
1.35G
        {
334
1.35G
            if (!kh_exist(d,k)) continue;
335
10.3M
            h->id[i][kh_val(d,k).id].key = kh_key(d,k);
336
10.3M
            h->id[i][kh_val(d,k).id].val = &kh_val(d,k);
337
10.3M
        }
338
116k
    }
339
340
    // Invalidate key length cache
341
38.9k
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
342
38.9k
    if (aux && aux->key_len) {
343
1.51k
        free(aux->key_len);
344
1.51k
        aux->key_len = NULL;
345
1.51k
    }
346
347
38.9k
    h->dirty = 0;
348
38.9k
    return 0;
349
38.9k
}
350
351
void bcf_hrec_destroy(bcf_hrec_t *hrec)
352
93.8k
{
353
93.8k
    if (!hrec) return;
354
89.2k
    free(hrec->key);
355
89.2k
    if ( hrec->value ) free(hrec->value);
356
89.2k
    int i;
357
272k
    for (i=0; i<hrec->nkeys; i++)
358
183k
    {
359
183k
        free(hrec->keys[i]);
360
183k
        free(hrec->vals[i]);
361
183k
    }
362
89.2k
    free(hrec->keys);
363
89.2k
    free(hrec->vals);
364
89.2k
    free(hrec);
365
89.2k
}
366
367
// Copies all fields except IDX.
368
bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec)
369
0
{
370
0
    int save_errno;
371
0
    bcf_hrec_t *out = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
372
0
    if (!out) return NULL;
373
374
0
    out->type = hrec->type;
375
0
    if ( hrec->key ) {
376
0
        out->key = strdup(hrec->key);
377
0
        if (!out->key) goto fail;
378
0
    }
379
0
    if ( hrec->value ) {
380
0
        out->value = strdup(hrec->value);
381
0
        if (!out->value) goto fail;
382
0
    }
383
0
    out->nkeys = hrec->nkeys;
384
0
    out->keys = (char**) malloc(sizeof(char*)*hrec->nkeys);
385
0
    if (!out->keys) goto fail;
386
0
    out->vals = (char**) malloc(sizeof(char*)*hrec->nkeys);
387
0
    if (!out->vals) goto fail;
388
0
    int i, j = 0;
389
0
    for (i=0; i<hrec->nkeys; i++)
390
0
    {
391
0
        if ( hrec->keys[i] && !strcmp("IDX",hrec->keys[i]) ) continue;
392
0
        if ( hrec->keys[i] ) {
393
0
            out->keys[j] = strdup(hrec->keys[i]);
394
0
            if (!out->keys[j]) goto fail;
395
0
        }
396
0
        if ( hrec->vals[i] ) {
397
0
            out->vals[j] = strdup(hrec->vals[i]);
398
0
            if (!out->vals[j]) goto fail;
399
0
        }
400
0
        j++;
401
0
    }
402
0
    if ( i!=j ) out->nkeys -= i-j;   // IDX was omitted
403
0
    return out;
404
405
0
 fail:
406
0
    save_errno = errno;
407
0
    hts_log_error("%s", strerror(errno));
408
0
    bcf_hrec_destroy(out);
409
0
    errno = save_errno;
410
0
    return NULL;
411
0
}
412
413
void bcf_hrec_debug(FILE *fp, bcf_hrec_t *hrec)
414
0
{
415
0
    fprintf(fp, "key=[%s] value=[%s]", hrec->key, hrec->value?hrec->value:"");
416
0
    int i;
417
0
    for (i=0; i<hrec->nkeys; i++)
418
0
        fprintf(fp, "\t[%s]=[%s]", hrec->keys[i],hrec->vals[i]);
419
0
    fprintf(fp, "\n");
420
0
}
421
422
void bcf_header_debug(bcf_hdr_t *hdr)
423
0
{
424
0
    int i, j;
425
0
    for (i=0; i<hdr->nhrec; i++)
426
0
    {
427
0
        if ( !hdr->hrec[i]->value )
428
0
        {
429
0
            fprintf(stderr, "##%s=<", hdr->hrec[i]->key);
430
0
            fprintf(stderr,"%s=%s", hdr->hrec[i]->keys[0], hdr->hrec[i]->vals[0]);
431
0
            for (j=1; j<hdr->hrec[i]->nkeys; j++)
432
0
                fprintf(stderr,",%s=%s", hdr->hrec[i]->keys[j], hdr->hrec[i]->vals[j]);
433
0
            fprintf(stderr,">\n");
434
0
        }
435
0
        else
436
0
            fprintf(stderr,"##%s=%s\n", hdr->hrec[i]->key,hdr->hrec[i]->value);
437
0
    }
438
0
}
439
440
int bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, size_t len)
441
142k
{
442
142k
    char **tmp;
443
142k
    size_t n = hrec->nkeys + 1;
444
142k
    assert(len > 0 && len < SIZE_MAX);
445
142k
    tmp = realloc(hrec->keys, sizeof(char*)*n);
446
142k
    if (!tmp) return -1;
447
142k
    hrec->keys = tmp;
448
142k
    tmp = realloc(hrec->vals, sizeof(char*)*n);
449
142k
    if (!tmp) return -1;
450
142k
    hrec->vals = tmp;
451
452
142k
    hrec->keys[hrec->nkeys] = (char*) malloc((len+1)*sizeof(char));
453
142k
    if (!hrec->keys[hrec->nkeys]) return -1;
454
142k
    memcpy(hrec->keys[hrec->nkeys],str,len);
455
142k
    hrec->keys[hrec->nkeys][len] = 0;
456
142k
    hrec->vals[hrec->nkeys] = NULL;
457
142k
    hrec->nkeys = n;
458
142k
    return 0;
459
142k
}
460
461
int bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, size_t len, int is_quoted)
462
142k
{
463
142k
    if ( hrec->vals[i] ) {
464
0
        free(hrec->vals[i]);
465
0
        hrec->vals[i] = NULL;
466
0
    }
467
142k
    if ( !str ) return 0;
468
142k
    if ( is_quoted )
469
39.6k
    {
470
39.6k
        if (len >= SIZE_MAX - 3) {
471
0
            errno = ENOMEM;
472
0
            return -1;
473
0
        }
474
39.6k
        hrec->vals[i] = (char*) malloc((len+3)*sizeof(char));
475
39.6k
        if (!hrec->vals[i]) return -1;
476
39.6k
        hrec->vals[i][0] = '"';
477
39.6k
        memcpy(&hrec->vals[i][1],str,len);
478
39.6k
        hrec->vals[i][len+1] = '"';
479
39.6k
        hrec->vals[i][len+2] = 0;
480
39.6k
    }
481
102k
    else
482
102k
    {
483
102k
        if (len == SIZE_MAX) {
484
0
            errno = ENOMEM;
485
0
            return -1;
486
0
        }
487
102k
        hrec->vals[i] = (char*) malloc((len+1)*sizeof(char));
488
102k
        if (!hrec->vals[i]) return -1;
489
102k
        memcpy(hrec->vals[i],str,len);
490
102k
        hrec->vals[i][len] = 0;
491
102k
    }
492
142k
    return 0;
493
142k
}
494
495
int hrec_add_idx(bcf_hrec_t *hrec, int idx)
496
40.4k
{
497
40.4k
    int n = hrec->nkeys + 1;
498
40.4k
    char **tmp = (char**) realloc(hrec->keys, sizeof(char*)*n);
499
40.4k
    if (!tmp) return -1;
500
40.4k
    hrec->keys = tmp;
501
502
40.4k
    tmp = (char**) realloc(hrec->vals, sizeof(char*)*n);
503
40.4k
    if (!tmp) return -1;
504
40.4k
    hrec->vals = tmp;
505
506
40.4k
    hrec->keys[hrec->nkeys] = strdup("IDX");
507
40.4k
    if (!hrec->keys[hrec->nkeys]) return -1;
508
509
40.4k
    kstring_t str = {0,0,0};
510
40.4k
    if (kputw(idx, &str) < 0) {
511
0
        free(hrec->keys[hrec->nkeys]);
512
0
        return -1;
513
0
    }
514
40.4k
    hrec->vals[hrec->nkeys] = str.s;
515
40.4k
    hrec->nkeys = n;
516
40.4k
    return 0;
517
40.4k
}
518
519
int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key)
520
59.8k
{
521
59.8k
    int i;
522
99.0k
    for (i=0; i<hrec->nkeys; i++)
523
71.8k
        if ( !strcasecmp(key,hrec->keys[i]) ) return i;
524
27.2k
    return -1;
525
59.8k
}
526
527
static void bcf_hrec_set_type(bcf_hrec_t *hrec)
528
163k
{
529
163k
    if ( !strcmp(hrec->key, "contig") ) hrec->type = BCF_HL_CTG;
530
148k
    else if ( !strcmp(hrec->key, "INFO") ) hrec->type = BCF_HL_INFO;
531
100k
    else if ( !strcmp(hrec->key, "FILTER") ) hrec->type = BCF_HL_FLT;
532
55.1k
    else if ( !strcmp(hrec->key, "FORMAT") ) hrec->type = BCF_HL_FMT;
533
45.4k
    else if ( hrec->nkeys>0 ) hrec->type = BCF_HL_STR;
534
33.6k
    else hrec->type = BCF_HL_GEN;
535
163k
}
536
537
538
/**
539
    The arrays were generated with
540
541
    valid_ctg:
542
        perl -le '@v = (split(//,q[!#$%&*+./:;=?@^_|~-]),"a"..."z","A"..."Z","0"..."9"); @a = (0) x 256; foreach $c (@v) { $a[ord($c)] = 1; } print join(", ",@a)' | fold -w 48
543
544
    valid_tag:
545
        perl -le '@v = (split(//,q[_.]),"a"..."z","A"..."Z","0"..."9"); @a = (0) x 256; foreach $c (@v) { $a[ord($c)] = 1; } print join(", ",@a)' | fold -w 48
546
*/
547
static const uint8_t valid_ctg[256] =
548
{
549
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
550
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
551
    0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1,
552
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
553
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
554
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
555
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
556
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,
557
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
558
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
559
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
560
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
561
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
562
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
563
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
564
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
565
};
566
static const uint8_t valid_tag[256] =
567
{
568
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
569
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
570
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
571
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
572
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
573
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
574
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
575
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
576
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
577
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
578
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
579
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
580
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
581
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
582
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
583
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
584
};
585
586
/**
587
    bcf_hrec_check() - check the validity of structured header lines
588
589
    Returns 0 on success or negative value on error.
590
591
    Currently the return status is not checked by the caller
592
    and only a warning is printed on stderr. This should be improved
593
    to propagate the error all the way up to the caller and let it
594
    decide what to do: throw an error or proceed anyway.
595
 */
596
static int bcf_hrec_check(bcf_hrec_t *hrec)
597
81.9k
{
598
81.9k
    int i;
599
81.9k
    bcf_hrec_set_type(hrec);
600
601
81.9k
    if ( hrec->type==BCF_HL_CTG )
602
7.70k
    {
603
7.70k
        i = bcf_hrec_find_key(hrec,"ID");
604
7.70k
        if ( i<0 ) goto err_missing_id;
605
3.30k
        char *val = hrec->vals[i];
606
3.30k
        if ( val[0]=='*' || val[0]=='=' || !valid_ctg[(uint8_t)val[0]] ) goto err_invalid_ctg;
607
54.7k
        while ( *(++val) )
608
54.6k
            if ( !valid_ctg[(uint8_t)*val] ) goto err_invalid_ctg;
609
80
        return 0;
610
401
    }
611
74.2k
    if ( hrec->type==BCF_HL_INFO )
612
24.1k
    {
613
24.1k
        i = bcf_hrec_find_key(hrec,"ID");
614
24.1k
        if ( i<0 ) goto err_missing_id;
615
16.3k
        char *val = hrec->vals[i];
616
16.3k
        if ( !strcmp(val,"1000G") ) return 0;
617
16.2k
        if ( val[0]=='.' || (val[0]>='0' && val[0]<='9') || !valid_tag[(uint8_t)val[0]] ) goto err_invalid_tag;
618
9.85k
        while ( *(++val) )
619
8.20k
            if ( !valid_tag[(uint8_t)*val] ) goto err_invalid_tag;
620
1.65k
        return 0;
621
2.41k
    }
622
50.1k
    if ( hrec->type==BCF_HL_FMT )
623
4.84k
    {
624
4.84k
        i = bcf_hrec_find_key(hrec,"ID");
625
4.84k
        if ( i<0 ) goto err_missing_id;
626
3.88k
        char *val = hrec->vals[i];
627
3.88k
        if ( val[0]=='.' || (val[0]>='0' && val[0]<='9') || !valid_tag[(uint8_t)val[0]] ) goto err_invalid_tag;
628
5.26k
        while ( *(++val) )
629
4.41k
            if ( !valid_tag[(uint8_t)*val] ) goto err_invalid_tag;
630
844
        return 0;
631
2.19k
    }
632
45.2k
    return 0;
633
634
13.1k
  err_missing_id:
635
13.1k
    hts_log_warning("Missing ID attribute in one or more header lines");
636
13.1k
    return -1;
637
638
3.22k
  err_invalid_ctg:
639
3.22k
    hts_log_warning("Invalid contig name: \"%s\"", hrec->vals[i]);
640
3.22k
    return -1;
641
642
17.6k
  err_invalid_tag:
643
17.6k
    hts_log_warning("Invalid tag name: \"%s\"", hrec->vals[i]);
644
17.6k
    return -1;
645
50.1k
}
646
647
static inline int is_escaped(const char *min, const char *str)
648
39.2k
{
649
39.2k
    int n = 0;
650
39.2k
    while ( --str>=min && *str=='\\' ) n++;
651
39.2k
    return n%2;
652
39.2k
}
653
654
bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len)
655
95.6k
{
656
95.6k
    bcf_hrec_t *hrec = NULL;
657
95.6k
    const char *p = line;
658
95.6k
    if (p[0] != '#' || p[1] != '#') { *len = 0; return NULL; }
659
93.8k
    p += 2;
660
661
93.8k
    const char *q = p;
662
1.08M
    while ( *q && *q!='=' && *q != '\n' ) q++;
663
93.8k
    ptrdiff_t n = q-p;
664
93.8k
    if ( *q!='=' || !n ) // wrong format
665
4.58k
        goto malformed_line;
666
667
89.2k
    hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
668
89.2k
    if (!hrec) { *len = -1; return NULL; }
669
89.2k
    hrec->key = (char*) malloc(sizeof(char)*(n+1));
670
89.2k
    if (!hrec->key) goto fail;
671
89.2k
    memcpy(hrec->key,p,n);
672
89.2k
    hrec->key[n] = 0;
673
89.2k
    hrec->type = -1;
674
675
89.2k
    p = ++q;
676
89.2k
    if ( *p!='<' ) // generic field, e.g. ##samtoolsVersion=0.1.18-r579
677
20.0k
    {
678
9.93M
        while ( *q && *q!='\n' ) q++;
679
20.0k
        hrec->value = (char*) malloc((q-p+1)*sizeof(char));
680
20.0k
        if (!hrec->value) goto fail;
681
20.0k
        memcpy(hrec->value, p, q-p);
682
20.0k
        hrec->value[q-p] = 0;
683
20.0k
        *len = q - line + (*q ? 1 : 0); // Skip \n but not \0
684
20.0k
        return hrec;
685
20.0k
    }
686
687
    // structured line, e.g.
688
    // ##INFO=<ID=PV1,Number=1,Type=Float,Description="P-value for baseQ bias">
689
    // ##PEDIGREE=<Name_0=G0-ID,Name_1=G1-ID,Name_3=GN-ID>
690
69.2k
    int nopen = 1;
691
211k
    while ( *q && *q!='\n' && nopen>0 )
692
149k
    {
693
149k
        p = ++q;
694
150k
        while ( *q && *q==' ' ) { p++; q++; }
695
        // ^[A-Za-z_][0-9A-Za-z_.]*$
696
149k
        if (p==q && *q && (isalpha_c(*q) || *q=='_'))
697
148k
        {
698
148k
            q++;
699
796k
            while ( *q && (isalnum_c(*q) || *q=='_' || *q=='.') ) q++;
700
148k
        }
701
149k
        n = q-p;
702
149k
        int m = 0;
703
150k
        while ( *q && *q==' ' ) { q++; m++; }
704
149k
        if ( *q!='=' || !n )
705
7.28k
            goto malformed_line;
706
707
142k
        if (bcf_hrec_add_key(hrec, p, q-p-m) < 0) goto fail;
708
142k
        p = ++q;
709
144k
        while ( *q && *q==' ' ) { p++; q++; }
710
711
142k
        int quoted = 0;
712
142k
        char ending = '\0';
713
142k
        switch (*p) {
714
39.6k
        case '"':
715
39.6k
            quoted = 1;
716
39.6k
            ending = '"';
717
39.6k
            p++;
718
39.6k
            break;
719
8
        case '[':
720
8
            quoted = 1;
721
8
            ending = ']';
722
8
            break;
723
142k
        }
724
142k
        if ( quoted ) q++;
725
145M
        while ( *q && *q != '\n' )
726
145M
        {
727
145M
            if ( quoted ) { if ( *q==ending && !is_escaped(p,q) ) break; }
728
145M
            else
729
145M
            {
730
145M
                if ( *q=='<' ) nopen++;
731
145M
                if ( *q=='>' ) nopen--;
732
145M
                if ( !nopen ) break;
733
145M
                if ( *q==',' && nopen==1 ) break;
734
145M
            }
735
145M
            q++;
736
145M
        }
737
142k
        const char *r = q;
738
142k
        if (quoted && ending == ']') {
739
8
            if (*q == ending) {
740
1
                r++;
741
1
                q++;
742
1
                quoted = 0;
743
7
            } else {
744
7
                char buffer[320];
745
7
                hts_log_error("Missing ']' in header line %s",
746
7
                              hts_strprint(buffer, sizeof(buffer), '"',
747
7
                                           line, q-line));
748
7
                goto fail;
749
7
            }
750
8
        }
751
142k
        while ( r > p && r[-1] == ' ' ) r--;
752
142k
        if (bcf_hrec_set_val(hrec, hrec->nkeys-1, p, r-p, quoted) < 0)
753
0
            goto fail;
754
142k
        if ( quoted && *q==ending ) q++;
755
142k
        if ( *q=='>' )
756
41.3k
        {
757
41.3k
            if (nopen) nopen--;     // this can happen with nested angle brackets <>
758
41.3k
            q++;
759
41.3k
        }
760
142k
    }
761
61.9k
    if ( nopen )
762
20.6k
        hts_log_warning("Incomplete header line, trying to proceed anyway:\n\t[%s]\n\t[%d]",line,q[0]);
763
764
    // Skip to end of line
765
61.9k
    int nonspace = 0;
766
61.9k
    p = q;
767
721k
    while ( *q && *q!='\n' ) { nonspace |= !isspace_c(*q); q++; }
768
61.9k
    if (nonspace) {
769
719
        char buffer[320];
770
719
        hts_log_warning("Dropped trailing junk from header line '%s'",
771
719
                        hts_strprint(buffer, sizeof(buffer),
772
719
                                     '"', line, q - line));
773
719
    }
774
775
61.9k
    *len = q - line + (*q ? 1 : 0);
776
61.9k
    return hrec;
777
778
7
 fail:
779
7
    *len = -1;
780
7
    bcf_hrec_destroy(hrec);
781
7
    return NULL;
782
783
11.8k
 malformed_line:
784
11.8k
    {
785
11.8k
        char buffer[320];
786
302k
        while ( *q && *q!='\n' ) q++;  // Ensure *len includes full line
787
11.8k
        hts_log_error("Could not parse the header line: %s",
788
11.8k
                      hts_strprint(buffer, sizeof(buffer),
789
11.8k
                                   '"', line, q - line));
790
11.8k
        *len = q - line + (*q ? 1 : 0);
791
11.8k
        bcf_hrec_destroy(hrec);
792
11.8k
        return NULL;
793
69.2k
    }
794
69.2k
}
795
796
static int bcf_hdr_set_idx(bcf_hdr_t *hdr, int dict_type, const char *tag, bcf_idinfo_t *idinfo)
797
39.8k
{
798
39.8k
    size_t new_n;
799
800
    // If available, preserve existing IDX
801
39.8k
    if ( idinfo->id==-1 )
802
39.6k
        idinfo->id = hdr->n[dict_type];
803
204
    else if ( idinfo->id < hdr->n[dict_type] && hdr->id[dict_type][idinfo->id].key )
804
0
    {
805
0
        hts_log_error("Conflicting IDX=%d lines in the header dictionary, the new tag is %s",
806
0
            idinfo->id, tag);
807
0
        errno = EINVAL;
808
0
        return -1;
809
0
    }
810
811
39.8k
    new_n = idinfo->id >= hdr->n[dict_type] ? idinfo->id+1 : hdr->n[dict_type];
812
39.8k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
813
    // hts_resize() can attempt to allocate up to 2 * requested items
814
39.8k
    if (new_n > FUZZ_ALLOC_LIMIT/(2 * sizeof(bcf_idpair_t)))
815
6
        return -1;
816
39.8k
#endif
817
39.8k
    if (hts_resize(bcf_idpair_t, new_n, &hdr->m[dict_type],
818
39.8k
                   &hdr->id[dict_type], HTS_RESIZE_CLEAR)) {
819
0
        return -1;
820
0
    }
821
39.8k
    hdr->n[dict_type] = new_n;
822
823
    // NB: the next kh_put call can invalidate the idinfo pointer, therefore
824
    // we leave it unassigned here. It must be set explicitly in bcf_hdr_sync.
825
39.8k
    hdr->id[dict_type][idinfo->id].key = tag;
826
827
39.8k
    return 0;
828
39.8k
}
829
830
// returns: 1 when hdr needs to be synced, -1 on error, 0 otherwise
831
static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
832
81.9k
{
833
    // contig
834
81.9k
    int i, ret, replacing = 0;
835
81.9k
    khint_t k;
836
81.9k
    char *str = NULL;
837
838
81.9k
    bcf_hrec_set_type(hrec);
839
840
81.9k
    if ( hrec->type==BCF_HL_CTG )
841
7.70k
    {
842
7.70k
        hts_pos_t len = 0;
843
844
        // Get the contig ID ($str) and length ($j)
845
7.70k
        i = bcf_hrec_find_key(hrec,"length");
846
7.70k
        if ( i<0 ) len = 0;
847
2.04k
        else {
848
2.04k
            char *end = hrec->vals[i];
849
2.04k
            len = strtoll(hrec->vals[i], &end, 10);
850
2.04k
            if (end == hrec->vals[i] || len < 0) return 0;
851
2.04k
        }
852
853
6.97k
        i = bcf_hrec_find_key(hrec,"ID");
854
6.97k
        if ( i<0 ) return 0;
855
3.30k
        str = strdup(hrec->vals[i]);
856
3.30k
        if (!str) return -1;
857
858
        // Register in the dictionary
859
3.30k
        vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_CTG];
860
3.30k
        khint_t k = kh_get(vdict, d, str);
861
3.30k
        if ( k != kh_end(d) ) { // already present
862
808
            free(str); str=NULL;
863
808
            if (kh_val(d, k).hrec[0] != NULL) // and not removed
864
808
                return 0;
865
0
            replacing = 1;
866
2.50k
        } else {
867
2.50k
            k = kh_put(vdict, d, str, &ret);
868
2.50k
            if (ret < 0) { free(str); return -1; }
869
2.50k
        }
870
871
2.50k
        int idx = bcf_hrec_find_key(hrec,"IDX");
872
2.50k
        if ( idx!=-1 )
873
964
        {
874
964
            char *tmp = hrec->vals[idx];
875
964
            idx = strtol(hrec->vals[idx], &tmp, 10);
876
964
            if ( *tmp || idx < 0 || idx >= INT_MAX - 1)
877
950
            {
878
950
                if (!replacing) {
879
950
                    kh_del(vdict, d, k);
880
950
                    free(str);
881
950
                }
882
950
                hts_log_warning("Error parsing the IDX tag, skipping");
883
950
                return 0;
884
950
            }
885
964
        }
886
887
1.55k
        kh_val(d, k) = bcf_idinfo_def;
888
1.55k
        kh_val(d, k).id = idx;
889
1.55k
        kh_val(d, k).info[0] = len;
890
1.55k
        kh_val(d, k).hrec[0] = hrec;
891
1.55k
        if (bcf_hdr_set_idx(hdr, BCF_DT_CTG, kh_key(d,k), &kh_val(d,k)) < 0) {
892
6
            if (!replacing) {
893
6
                kh_del(vdict, d, k);
894
6
                free(str);
895
6
            }
896
6
            return -1;
897
6
        }
898
1.54k
        if ( idx==-1 ) {
899
1.53k
            if (hrec_add_idx(hrec, kh_val(d,k).id) < 0) {
900
0
               return -1;
901
0
            }
902
1.53k
        }
903
904
1.54k
        return 1;
905
1.54k
    }
906
907
74.2k
    if ( hrec->type==BCF_HL_STR ) return 1;
908
68.3k
    if ( hrec->type!=BCF_HL_INFO && hrec->type!=BCF_HL_FLT && hrec->type!=BCF_HL_FMT ) return 0;
909
910
    // INFO/FILTER/FORMAT
911
51.5k
    char *id = NULL;
912
51.5k
    uint32_t type = UINT32_MAX, var = UINT32_MAX;
913
51.5k
    int num = -1, idx = -1;
914
174k
    for (i=0; i<hrec->nkeys; i++)
915
123k
    {
916
123k
        if ( !strcmp(hrec->keys[i], "ID") ) id = hrec->vals[i];
917
80.8k
        else if ( !strcmp(hrec->keys[i], "IDX") )
918
1.43k
        {
919
1.43k
            char *tmp = hrec->vals[i];
920
1.43k
            idx = strtol(hrec->vals[i], &tmp, 10);
921
1.43k
            if ( *tmp || idx < 0 || idx >= INT_MAX - 1)
922
214
            {
923
214
                hts_log_warning("Error parsing the IDX tag, skipping");
924
214
                return 0;
925
214
            }
926
1.43k
        }
927
79.3k
        else if ( !strcmp(hrec->keys[i], "Type") )
928
21.1k
        {
929
21.1k
            if ( !strcmp(hrec->vals[i], "Integer") ) type = BCF_HT_INT;
930
20.3k
            else if ( !strcmp(hrec->vals[i], "Float") ) type = BCF_HT_REAL;
931
19.9k
            else if ( !strcmp(hrec->vals[i], "String") ) type = BCF_HT_STR;
932
4.77k
            else if ( !strcmp(hrec->vals[i], "Character") ) type = BCF_HT_STR;
933
4.69k
            else if ( !strcmp(hrec->vals[i], "Flag") ) type = BCF_HT_FLAG;
934
3.81k
            else
935
3.81k
            {
936
3.81k
                hts_log_warning("The type \"%s\" is not supported, assuming \"String\"", hrec->vals[i]);
937
3.81k
                type = BCF_HT_STR;
938
3.81k
            }
939
21.1k
        }
940
58.2k
        else if ( !strcmp(hrec->keys[i], "Number") )
941
17.0k
        {
942
17.0k
            int is_fmt = hrec->type == BCF_HL_FMT;
943
17.0k
            if ( !strcmp(hrec->vals[i],"A") ) var = BCF_VL_A;
944
16.4k
            else if ( !strcmp(hrec->vals[i],"R") ) var = BCF_VL_R;
945
16.3k
            else if ( !strcmp(hrec->vals[i],"G") ) var = BCF_VL_G;
946
15.9k
            else if ( !strcmp(hrec->vals[i],".") ) var = BCF_VL_VAR;
947
15.9k
            else if ( is_fmt && !strcmp(hrec->vals[i],"P") )  var = BCF_VL_P;
948
15.9k
            else if ( is_fmt && !strcmp(hrec->vals[i],"LA") ) var = BCF_VL_LA;
949
15.9k
            else if ( is_fmt && !strcmp(hrec->vals[i],"LR") ) var = BCF_VL_LR;
950
15.9k
            else if ( is_fmt && !strcmp(hrec->vals[i],"LG") ) var = BCF_VL_LG;
951
15.9k
            else if ( is_fmt && !strcmp(hrec->vals[i],"M") )  var = BCF_VL_M;
952
15.9k
            else
953
15.9k
            {
954
15.9k
                if (sscanf(hrec->vals[i],"%d",&num) == 1)
955
15.6k
                    var = BCF_VL_FIXED;
956
15.9k
            }
957
17.0k
            if (var != BCF_VL_FIXED) num = 0xfffff;
958
17.0k
        }
959
123k
    }
960
51.3k
    if (hrec->type == BCF_HL_INFO || hrec->type == BCF_HL_FMT) {
961
28.7k
        if (type == -1) {
962
8.18k
            hts_log_warning("%s %s field has no Type defined. Assuming String",
963
8.18k
                *hrec->key == 'I' ? "An" : "A", hrec->key);
964
8.18k
            type = BCF_HT_STR;
965
8.18k
        }
966
28.7k
        if (var == UINT32_MAX) {
967
12.0k
            hts_log_warning("%s %s field has no Number defined. Assuming '.'",
968
12.0k
                *hrec->key == 'I' ? "An" : "A", hrec->key);
969
12.0k
            var = BCF_VL_VAR;
970
12.0k
        }
971
28.7k
        if ( type==BCF_HT_FLAG && (var!=BCF_VL_FIXED || num!=0) )
972
676
        {
973
676
            hts_log_warning("The definition of Flag \"%s/%s\" is invalid, forcing Number=0", hrec->key,id);
974
676
            var = BCF_VL_FIXED;
975
676
            num = 0;
976
676
        }
977
28.7k
    }
978
51.3k
    uint32_t info = ((((uint32_t)num) & 0xfffff)<<12 |
979
51.3k
                     (var & 0xf) << 8 |
980
51.3k
                     (type & 0xf) << 4 |
981
51.3k
                     (((uint32_t) hrec->type) & 0xf));
982
983
51.3k
    if ( !id ) return 0;
984
42.5k
    str = strdup(id);
985
42.5k
    if (!str) return -1;
986
987
42.5k
    vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_ID];
988
42.5k
    k = kh_get(vdict, d, str);
989
42.5k
    if ( k != kh_end(d) )
990
4.22k
    {
991
        // already present
992
4.22k
        free(str);
993
4.22k
        if ( kh_val(d, k).hrec[info&0xf] ) return 0;
994
790
        kh_val(d, k).info[info&0xf] = info;
995
790
        kh_val(d, k).hrec[info&0xf] = hrec;
996
790
        if ( idx==-1 ) {
997
790
            if (hrec_add_idx(hrec, kh_val(d, k).id) < 0) {
998
0
                return -1;
999
0
            }
1000
790
        }
1001
790
        return 1;
1002
790
    }
1003
38.3k
    k = kh_put(vdict, d, str, &ret);
1004
38.3k
    if (ret < 0) {
1005
0
        free(str);
1006
0
        return -1;
1007
0
    }
1008
38.3k
    kh_val(d, k) = bcf_idinfo_def;
1009
38.3k
    kh_val(d, k).info[info&0xf] = info;
1010
38.3k
    kh_val(d, k).hrec[info&0xf] = hrec;
1011
38.3k
    kh_val(d, k).id = idx;
1012
38.3k
    if (bcf_hdr_set_idx(hdr, BCF_DT_ID, kh_key(d,k), &kh_val(d,k)) < 0) {
1013
0
        kh_del(vdict, d, k);
1014
0
        free(str);
1015
0
        return -1;
1016
0
    }
1017
38.3k
    if ( idx==-1 ) {
1018
38.1k
        if (hrec_add_idx(hrec, kh_val(d,k).id) < 0) {
1019
0
            return -1;
1020
0
        }
1021
38.1k
    }
1022
1023
38.3k
    return 1;
1024
38.3k
}
1025
1026
static void bcf_hdr_unregister_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
1027
0
{
1028
0
    if (hrec->type == BCF_HL_FLT ||
1029
0
        hrec->type == BCF_HL_INFO ||
1030
0
        hrec->type == BCF_HL_FMT ||
1031
0
        hrec->type == BCF_HL_CTG) {
1032
0
        int id = bcf_hrec_find_key(hrec, "ID");
1033
0
        if (id < 0 || !hrec->vals[id])
1034
0
            return;
1035
0
        vdict_t *dict = (hrec->type == BCF_HL_CTG
1036
0
                         ? (vdict_t*)hdr->dict[BCF_DT_CTG]
1037
0
                         : (vdict_t*)hdr->dict[BCF_DT_ID]);
1038
0
        khint_t k = kh_get(vdict, dict, hrec->vals[id]);
1039
0
        if (k != kh_end(dict))
1040
0
            kh_val(dict, k).hrec[hrec->type==BCF_HL_CTG ? 0 : hrec->type] = NULL;
1041
0
    }
1042
0
}
1043
1044
static void bcf_hdr_remove_from_hdict(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
1045
0
{
1046
0
    kstring_t str = KS_INITIALIZE;
1047
0
    bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1048
0
    khint_t k;
1049
0
    int id;
1050
1051
0
    switch (hrec->type) {
1052
0
    case BCF_HL_GEN:
1053
0
        if (ksprintf(&str, "##%s=%s", hrec->key,hrec->value) < 0)
1054
0
            str.l = 0;
1055
0
        break;
1056
0
    case BCF_HL_STR:
1057
0
        id = bcf_hrec_find_key(hrec, "ID");
1058
0
        if (id < 0)
1059
0
            return;
1060
0
        if (!hrec->vals[id] ||
1061
0
            ksprintf(&str, "##%s=<ID=%s>", hrec->key, hrec->vals[id]) < 0)
1062
0
            str.l = 0;
1063
0
        break;
1064
0
    default:
1065
0
        return;
1066
0
    }
1067
0
    if (str.l) {
1068
0
        k = kh_get(hdict, aux->gen, str.s);
1069
0
    } else {
1070
        // Couldn't get a string for some reason, so try the hard way...
1071
0
        for (k = kh_begin(aux->gen); k < kh_end(aux->gen); k++) {
1072
0
            if (kh_exist(aux->gen, k) && kh_val(aux->gen, k) == hrec)
1073
0
                break;
1074
0
        }
1075
0
    }
1076
0
    if (k != kh_end(aux->gen) && kh_val(aux->gen, k) == hrec) {
1077
0
        kh_val(aux->gen, k) = NULL;
1078
0
        free((char *) kh_key(aux->gen, k));
1079
0
        kh_key(aux->gen, k) = NULL;
1080
0
        kh_del(hdict, aux->gen, k);
1081
0
    }
1082
0
    free(str.s);
1083
0
}
1084
1085
int bcf_hdr_update_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec, const bcf_hrec_t *tmp)
1086
0
{
1087
0
    assert( hrec->type==BCF_HL_GEN );
1088
0
    int ret;
1089
0
    khint_t k;
1090
0
    bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1091
0
    for (k=kh_begin(aux->gen); k<kh_end(aux->gen); k++)
1092
0
    {
1093
0
        if ( !kh_exist(aux->gen,k) ) continue;
1094
0
        if ( hrec!=(bcf_hrec_t*)kh_val(aux->gen,k) ) continue;
1095
0
        break;
1096
0
    }
1097
0
    assert( k<kh_end(aux->gen) );   // something went wrong, should never happen
1098
0
    free((char*)kh_key(aux->gen,k));
1099
0
    kh_del(hdict,aux->gen,k);
1100
0
    kstring_t str = {0,0,0};
1101
0
    if ( ksprintf(&str, "##%s=%s", tmp->key,tmp->value) < 0 )
1102
0
    {
1103
0
        free(str.s);
1104
0
        return -1;
1105
0
    }
1106
0
    k = kh_put(hdict, aux->gen, str.s, &ret);
1107
0
    if ( ret<0 )
1108
0
    {
1109
0
        free(str.s);
1110
0
        return -1;
1111
0
    }
1112
0
    free(hrec->value);
1113
0
    hrec->value = strdup(tmp->value);
1114
0
    if ( !hrec->value ) return -1;
1115
0
    kh_val(aux->gen,k) = hrec;
1116
1117
0
    if (!strcmp(hrec->key,"fileformat")) {
1118
        //update version
1119
0
        get_hdr_aux(hdr)->version = bcf_get_version(NULL, hrec->value);
1120
0
    }
1121
0
    return 0;
1122
0
}
1123
1124
int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
1125
82.0k
{
1126
82.0k
    kstring_t str = {0,0,0};
1127
82.0k
    bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1128
1129
82.0k
    int res;
1130
82.0k
    if ( !hrec ) return 0;
1131
1132
81.9k
    bcf_hrec_check(hrec);   // todo: check return status and propagate errors up
1133
1134
81.9k
    res = bcf_hdr_register_hrec(hdr,hrec);
1135
81.9k
    if (res < 0) return -1;
1136
81.9k
    if ( !res )
1137
35.3k
    {
1138
        // If one of the hashed field, then it is already present
1139
35.3k
        if ( hrec->type != BCF_HL_GEN )
1140
18.5k
        {
1141
18.5k
            bcf_hrec_destroy(hrec);
1142
18.5k
            return 0;
1143
18.5k
        }
1144
        // Is one of the generic fields and already present?
1145
16.8k
        if ( ksprintf(&str, "##%s=%s", hrec->key,hrec->value) < 0 )
1146
0
        {
1147
0
            free(str.s);
1148
0
            return -1;
1149
0
        }
1150
16.8k
        khint_t k = kh_get(hdict, aux->gen, str.s);
1151
16.8k
        if ( k != kh_end(aux->gen) )
1152
11.4k
        {
1153
            // duplicate record
1154
11.4k
            bcf_hrec_destroy(hrec);
1155
11.4k
            free(str.s);
1156
11.4k
            return 0;
1157
11.4k
        }
1158
5.35k
        if (!strcmp(hrec->key, "fileformat")) {
1159
1.67k
            aux->version = bcf_get_version(NULL, hrec->value);
1160
1.67k
        }
1161
5.35k
    }
1162
1163
51.9k
    int i;
1164
51.9k
    if ( hrec->type==BCF_HL_STR && (i=bcf_hrec_find_key(hrec,"ID"))>=0 )
1165
2.75k
    {
1166
2.75k
        if ( ksprintf(&str, "##%s=<ID=%s>", hrec->key,hrec->vals[i]) < 0 )
1167
0
        {
1168
0
            free(str.s);
1169
0
            return -1;
1170
0
        }
1171
2.75k
        khint_t k = kh_get(hdict, aux->gen, str.s);
1172
2.75k
        if ( k != kh_end(aux->gen) )
1173
2.20k
        {
1174
            // duplicate record
1175
2.20k
            bcf_hrec_destroy(hrec);
1176
2.20k
            free(str.s);
1177
2.20k
            return 0;
1178
2.20k
        }
1179
2.75k
    }
1180
1181
    // New record, needs to be added
1182
49.7k
    int n = hdr->nhrec + 1;
1183
49.7k
    bcf_hrec_t **new_hrec = realloc(hdr->hrec, n*sizeof(bcf_hrec_t*));
1184
49.7k
    if (!new_hrec) {
1185
0
        free(str.s);
1186
0
        bcf_hdr_unregister_hrec(hdr, hrec);
1187
0
        return -1;
1188
0
    }
1189
49.7k
    hdr->hrec = new_hrec;
1190
1191
49.7k
    if ( str.s )
1192
5.89k
    {
1193
5.89k
        khint_t k = kh_put(hdict, aux->gen, str.s, &res);
1194
5.89k
        if ( res<0 )
1195
0
        {
1196
0
            free(str.s);
1197
0
            return -1;
1198
0
        }
1199
5.89k
        kh_val(aux->gen,k) = hrec;
1200
5.89k
    }
1201
1202
49.7k
    hdr->hrec[hdr->nhrec] = hrec;
1203
49.7k
    hdr->dirty = 1;
1204
49.7k
    hdr->nhrec = n;
1205
1206
49.7k
    return hrec->type==BCF_HL_GEN ? 0 : 1;
1207
49.7k
}
1208
1209
bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class)
1210
436
{
1211
436
    int i;
1212
436
    if ( type==BCF_HL_GEN )
1213
436
    {
1214
        // e.g. ##fileformat=VCFv4.2
1215
        //      ##source=GenomicsDBImport
1216
        //      ##bcftools_viewVersion=1.16-80-gdfdb0923+htslib-1.16-34-g215d364
1217
436
        if ( value )
1218
0
        {
1219
0
            kstring_t str = {0,0,0};
1220
0
            ksprintf(&str, "##%s=%s", key,value);
1221
0
            bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1222
0
            khint_t k = kh_get(hdict, aux->gen, str.s);
1223
0
            free(str.s);
1224
0
            if ( k == kh_end(aux->gen) ) return NULL;
1225
0
            return kh_val(aux->gen, k);
1226
0
        }
1227
822
        for (i=0; i<hdr->nhrec; i++)
1228
604
        {
1229
604
            if ( hdr->hrec[i]->type!=type ) continue;
1230
266
            if ( strcmp(hdr->hrec[i]->key,key) ) continue;
1231
218
            return hdr->hrec[i];
1232
266
        }
1233
218
        return NULL;
1234
436
    }
1235
0
    else if ( type==BCF_HL_STR )
1236
0
    {
1237
        // e.g. ##GATKCommandLine=<ID=GenomicsDBImport,CommandLine="GenomicsDBImport....">
1238
        //      ##ALT=<ID=NON_REF,Description="Represents any possible alternative allele not already represented at this location by REF and ALT">
1239
0
        if (!str_class) return NULL;
1240
0
        if ( !strcmp("ID",key) )
1241
0
        {
1242
0
            kstring_t str = {0,0,0};
1243
0
            ksprintf(&str, "##%s=<%s=%s>",str_class,key,value);
1244
0
            bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1245
0
            khint_t k = kh_get(hdict, aux->gen, str.s);
1246
0
            free(str.s);
1247
0
            if ( k == kh_end(aux->gen) ) return NULL;
1248
0
            return kh_val(aux->gen, k);
1249
0
        }
1250
0
        for (i=0; i<hdr->nhrec; i++)
1251
0
        {
1252
0
            if ( hdr->hrec[i]->type!=type ) continue;
1253
0
            if ( strcmp(hdr->hrec[i]->key,str_class) ) continue;
1254
0
            int j = bcf_hrec_find_key(hdr->hrec[i],key);
1255
0
            if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],value) ) return hdr->hrec[i];
1256
0
        }
1257
0
        return NULL;
1258
0
    }
1259
0
    vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
1260
0
    khint_t k = kh_get(vdict, d, value);
1261
0
    if ( k == kh_end(d) ) return NULL;
1262
0
    return kh_val(d, k).hrec[type==BCF_HL_CTG?0:type];
1263
0
}
1264
1265
// Check the VCF header is correctly formatted as per the specification.
1266
// Note the code that calls this doesn't bother to check return values and
1267
// we have so many broken VCFs in the wild that for now we just reprt a
1268
// warning and continue anyway.  So currently this is a void function.
1269
void bcf_hdr_check_sanity(bcf_hdr_t *hdr)
1270
1.18k
{
1271
1.18k
    int version = bcf_get_version(hdr, NULL);
1272
1273
1.18k
    struct tag {
1274
1.18k
        char name[10];
1275
1.18k
        char number_str[3];
1276
1.18k
        int number;
1277
1.18k
        int version;
1278
1.18k
        int type;
1279
1.18k
    };
1280
1281
1.18k
    char type_str[][8] = {"Flag", "Integer", "Float", "String"};
1282
1283
1.18k
    struct tag info_tags[] = {
1284
1.18k
        {"AD",        "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
1285
1.18k
        {"ADF",       "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
1286
1.18k
        {"ADR",       "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
1287
1.18k
        {"AC",        "A",  BCF_VL_A,     VCF_DEF, BCF_HT_INT},
1288
1.18k
        {"AF",        "A",  BCF_VL_A,     VCF_DEF, BCF_HT_REAL},
1289
1.18k
        {"CIGAR",     "A",  BCF_VL_A,     VCF_DEF, BCF_HT_STR},
1290
1.18k
        {"AA",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_STR},
1291
1.18k
        {"AN",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1292
1.18k
        {"BQ",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_REAL},
1293
1.18k
        {"DB",        "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
1294
1.18k
        {"DP",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1295
1.18k
        {"END",       "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1296
1.18k
        {"H2",        "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
1297
1.18k
        {"H3",        "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
1298
1.18k
        {"MQ",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_REAL},
1299
1.18k
        {"MQ0",       "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1300
1.18k
        {"NS",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1301
1.18k
        {"SB",        "4",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1302
1.18k
        {"SOMATIC",   "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
1303
1.18k
        {"VALIDATED", "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
1304
1.18k
        {"1000G",     "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
1305
1.18k
    };
1306
1.18k
    static int info_warned[sizeof(info_tags)/sizeof(*info_tags)] = {0};
1307
1308
1.18k
    struct tag fmt_tags[] = {
1309
1.18k
        {"AD",   "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
1310
1.18k
        {"ADF",  "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
1311
1.18k
        {"ADR",  "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
1312
1.18k
        {"EC",   "A",  BCF_VL_A,     VCF_DEF, BCF_HT_INT},
1313
1.18k
        {"GL",   "G",  BCF_VL_G,     VCF_DEF, BCF_HT_REAL},
1314
1.18k
        {"GP",   "G",  BCF_VL_G,     VCF_DEF, BCF_HT_REAL},
1315
1.18k
        {"PL",   "G",  BCF_VL_G,     VCF_DEF, BCF_HT_INT},
1316
1.18k
        {"PP",   "G",  BCF_VL_G,     VCF_DEF, BCF_HT_INT},
1317
1.18k
        {"DP",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1318
1.18k
        {"LEN",  "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1319
1.18k
        {"FT",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_STR},
1320
1.18k
        {"GQ",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1321
1.18k
        {"GT",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_STR},
1322
1.18k
        {"HQ",   "2",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1323
1.18k
        {"MQ",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1324
1.18k
        {"PQ",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1325
1.18k
        {"PS",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1326
1.18k
        {"PSL",  "P",  BCF_VL_P,     VCF44,   BCF_HT_STR},
1327
1.18k
        {"PSO",  "P",  BCF_VL_P,     VCF44,   BCF_HT_INT},
1328
1.18k
        {"PSQ",  "P",  BCF_VL_P,     VCF44,   BCF_HT_INT},
1329
1.18k
        {"LGL",  "LG", BCF_VL_LG,    VCF45,   BCF_HT_INT},
1330
1.18k
        {"LGP",  "LG", BCF_VL_LG,    VCF45,   BCF_HT_INT},
1331
1.18k
        {"LPL",  "LG", BCF_VL_LG,    VCF45,   BCF_HT_INT},
1332
1.18k
        {"LPP",  "LG", BCF_VL_LG,    VCF45,   BCF_HT_INT},
1333
1.18k
        {"LEC",  "LA", BCF_VL_LA,    VCF45,   BCF_HT_INT},
1334
1.18k
        {"LAD",  "LR", BCF_VL_LR,    VCF45,   BCF_HT_INT},
1335
1.18k
        {"LADF", "LR", BCF_VL_LR,    VCF45,   BCF_HT_INT},
1336
1.18k
        {"LADR", "LR", BCF_VL_LR,    VCF45,   BCF_HT_INT},
1337
1.18k
    };
1338
1.18k
    static int fmt_warned[sizeof(fmt_tags)/sizeof(*fmt_tags)] = {0};
1339
1340
    // Check INFO tag numbers.  We shouldn't really permit ".", but it's
1341
    // commonly misused so we let it slide unless it's a new tag and the
1342
    // file format claims to be new also.  We also cannot distinguish between
1343
    // Number=1 and Number=2, but we at least report the correct term if we
1344
    // get, say, Number=G in its place.
1345
    // Also check the types.
1346
1.18k
    int i;
1347
26.1k
    for (i = 0; i < sizeof(info_tags)/sizeof(*info_tags); i++) {
1348
24.9k
        if (info_warned[i])
1349
0
            continue;
1350
24.9k
        int id = bcf_hdr_id2int(hdr, BCF_DT_ID, info_tags[i].name);
1351
24.9k
        if (bcf_hdr_idinfo_exists(hdr, BCF_HL_INFO, id)) {
1352
0
            if (bcf_hdr_id2length(hdr, BCF_HL_INFO, id) != info_tags[i].number &&
1353
0
                bcf_hdr_id2length(hdr, BCF_HL_INFO, id) != BCF_VL_VAR) {
1354
0
                info_warned[i] = 1;
1355
0
            } else if (bcf_hdr_id2length(hdr, BCF_HL_INFO, id) == BCF_VL_FIXED &&
1356
0
                       bcf_hdr_id2number(hdr, BCF_HL_INFO, id) != atoi(info_tags[i].number_str)) {
1357
0
                info_warned[i] = 1;
1358
0
            }
1359
1360
0
            if (info_warned[i]) {
1361
0
                hts_log_warning("%s should be declared as Number=%s",
1362
0
                                info_tags[i].name, info_tags[i].number_str);
1363
0
            }
1364
1365
0
            if (bcf_hdr_id2type(hdr, BCF_HL_INFO, id) != info_tags[i].type) {
1366
0
                hts_log_warning("%s should be declared as Type=%s",
1367
0
                                info_tags[i].name, type_str[info_tags[i].type]);
1368
0
                info_warned[i] = 1;
1369
0
            }
1370
0
        }
1371
24.9k
    }
1372
1373
    // Check FORMAT tag numbers and types.
1374
34.4k
    for (i = 0; i < sizeof(fmt_tags)/sizeof(*fmt_tags); i++) {
1375
33.2k
        if (fmt_warned[i])
1376
0
            continue;
1377
33.2k
        int id = bcf_hdr_id2int(hdr, BCF_DT_ID, fmt_tags[i].name);
1378
33.2k
        if (bcf_hdr_idinfo_exists(hdr, BCF_HL_FMT, id)) {
1379
0
            if (bcf_hdr_id2length(hdr, BCF_HL_FMT, id) != fmt_tags[i].number) {
1380
                // Permit "Number=." if this tag predates the vcf version it is
1381
                // defined within.  This is a common tactic for callers to use
1382
                // new tags with older formats in order to avoid parsing failures
1383
                // with some software.
1384
                // We don't care for 4.3 and earlier as that's more of a wild-west
1385
                // and it's not abnormal to see incorrect usage of Number=. there.
1386
0
                if ((version < VCF44 &&
1387
0
                     bcf_hdr_id2length(hdr, BCF_HL_FMT, id) != BCF_VL_VAR) ||
1388
0
                    (version >= VCF44 && version >= fmt_tags[i].version)) {
1389
0
                    fmt_warned[i] = 1;
1390
0
                }
1391
0
            } else if (bcf_hdr_id2length(hdr, BCF_HL_FMT, id) == BCF_VL_FIXED &&
1392
0
                       bcf_hdr_id2number(hdr, BCF_HL_FMT, id) != atoi(fmt_tags[i].number_str)) {
1393
0
                fmt_warned[i] = 1;
1394
0
            }
1395
1396
0
            if (fmt_warned[i]) {
1397
0
                hts_log_warning("%s should be declared as Number=%s",
1398
0
                                fmt_tags[i].name, fmt_tags[i].number_str);
1399
0
            }
1400
1401
0
            if (bcf_hdr_id2type(hdr, BCF_HL_FMT, id) != fmt_tags[i].type) {
1402
0
                hts_log_warning("%s should be declared as Type=%s",
1403
0
                                fmt_tags[i].name, type_str[fmt_tags[i].type]);
1404
0
                fmt_warned[i] = 1;
1405
0
            }
1406
0
        }
1407
33.2k
    }
1408
1.18k
}
1409
1410
int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt)
1411
1.39k
{
1412
1.39k
    int len, done = 0;
1413
1.39k
    char *p = htxt;
1414
1415
    // Check sanity: "fileformat" string must come as first
1416
1.39k
    bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,p,&len);
1417
1.39k
    if ( !hrec || !hrec->key || strcasecmp(hrec->key,"fileformat") )
1418
112
        hts_log_warning("The first line should be ##fileformat; is the VCF/BCF header broken?");
1419
1.39k
    if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
1420
0
        bcf_hrec_destroy(hrec);
1421
0
        return -1;
1422
0
    }
1423
1424
    // The filter PASS must appear first in the dictionary
1425
1.39k
    hrec = bcf_hdr_parse_line(hdr,"##FILTER=<ID=PASS,Description=\"All filters passed\">",&len);
1426
1.39k
    if (!hrec || bcf_hdr_add_hrec(hdr, hrec) < 0) {
1427
0
        bcf_hrec_destroy(hrec);
1428
0
        return -1;
1429
0
    }
1430
1431
    // Parse the whole header
1432
13.5k
    do {
1433
54.9k
        while (NULL != (hrec = bcf_hdr_parse_line(hdr, p, &len))) {
1434
41.4k
            if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
1435
6
                bcf_hrec_destroy(hrec);
1436
6
                return -1;
1437
6
            }
1438
41.4k
            p += len;
1439
41.4k
        }
1440
13.5k
        assert(hrec == NULL);
1441
13.5k
        if (len < 0) {
1442
            // len < 0 indicates out-of-memory, or similar error
1443
4
            hts_log_error("Could not parse header line: %s", strerror(errno));
1444
4
            return -1;
1445
13.5k
        } else if (len > 0) {
1446
            // Bad header line.  bcf_hdr_parse_line() will have logged it.
1447
            // Skip and try again on the next line (p + len will be the start
1448
            // of the next one).
1449
11.8k
            p += len;
1450
11.8k
            continue;
1451
11.8k
        }
1452
1453
        // Next should be the sample line.  If not, it was a malformed
1454
        // header, in which case print a warning and skip (many VCF
1455
        // operations do not really care about a few malformed lines).
1456
        // In the future we may want to add a strict mode that errors in
1457
        // this case.
1458
1.72k
        if ( strncmp("#CHROM\t",p,7) && strncmp("#CHROM ",p,7) ) {
1459
530
            char *eol = strchr(p, '\n');
1460
530
            if (*p != '\0') {
1461
342
                char buffer[320];
1462
342
                hts_log_warning("Could not parse header line: %s",
1463
342
                                hts_strprint(buffer, sizeof(buffer),
1464
342
                                               '"', p,
1465
342
                                               eol ? (eol - p) : SIZE_MAX));
1466
342
            }
1467
530
            if (eol) {
1468
342
                p = eol + 1; // Try from the next line.
1469
342
            } else {
1470
188
                done = -1; // No more lines left, give up.
1471
188
            }
1472
1.19k
        } else {
1473
1.19k
            done = 1; // Sample line found
1474
1.19k
        }
1475
13.5k
    } while (!done);
1476
1477
1.38k
    if (done < 0) {
1478
        // No sample line is fatal.
1479
188
        hts_log_error("Could not parse the header, sample line not found");
1480
188
        return -1;
1481
188
    }
1482
1483
1.19k
    if (bcf_hdr_parse_sample_line(hdr,p) < 0)
1484
10
        return -1;
1485
1.18k
    if (bcf_hdr_sync(hdr) < 0)
1486
0
        return -1;
1487
1.18k
    bcf_hdr_check_sanity(hdr);
1488
1.18k
    return 0;
1489
1.18k
}
1490
1491
int bcf_hdr_append(bcf_hdr_t *hdr, const char *line)
1492
0
{
1493
0
    int len;
1494
0
    bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr, (char*) line, &len);
1495
0
    if ( !hrec ) return -1;
1496
0
    if (bcf_hdr_add_hrec(hdr, hrec) < 0)
1497
0
        return -1;
1498
0
    return 0;
1499
0
}
1500
1501
void bcf_hdr_remove(bcf_hdr_t *hdr, int type, const char *key)
1502
0
{
1503
0
    int i = 0;
1504
0
    bcf_hrec_t *hrec;
1505
0
    if ( !key )
1506
0
    {
1507
        // no key, remove all entries of this type
1508
0
        while ( i<hdr->nhrec )
1509
0
        {
1510
0
            if ( hdr->hrec[i]->type!=type ) { i++; continue; }
1511
0
            hrec = hdr->hrec[i];
1512
0
            bcf_hdr_unregister_hrec(hdr, hrec);
1513
0
            bcf_hdr_remove_from_hdict(hdr, hrec);
1514
0
            hdr->dirty = 1;
1515
0
            hdr->nhrec--;
1516
0
            if ( i < hdr->nhrec )
1517
0
                memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
1518
0
            bcf_hrec_destroy(hrec);
1519
0
        }
1520
0
        return;
1521
0
    }
1522
0
    while (1)
1523
0
    {
1524
0
        if ( type==BCF_HL_FLT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG )
1525
0
        {
1526
0
            hrec = bcf_hdr_get_hrec(hdr, type, "ID", key, NULL);
1527
0
            if ( !hrec ) return;
1528
1529
0
            for (i=0; i<hdr->nhrec; i++)
1530
0
                if ( hdr->hrec[i]==hrec ) break;
1531
0
            assert( i<hdr->nhrec );
1532
1533
0
            vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
1534
0
            khint_t k = kh_get(vdict, d, key);
1535
0
            kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL;
1536
0
        }
1537
0
        else
1538
0
        {
1539
0
            for (i=0; i<hdr->nhrec; i++)
1540
0
            {
1541
0
                if ( hdr->hrec[i]->type!=type ) continue;
1542
0
                if ( type==BCF_HL_GEN )
1543
0
                {
1544
0
                    if ( !strcmp(hdr->hrec[i]->key,key) ) break;
1545
0
                }
1546
0
                else
1547
0
                {
1548
                    // not all structured lines have ID, we could be more sophisticated as in bcf_hdr_get_hrec()
1549
0
                    int j = bcf_hrec_find_key(hdr->hrec[i], "ID");
1550
0
                    if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],key) ) break;
1551
0
                }
1552
0
            }
1553
0
            if ( i==hdr->nhrec ) return;
1554
0
            hrec = hdr->hrec[i];
1555
0
            bcf_hdr_remove_from_hdict(hdr, hrec);
1556
0
        }
1557
1558
0
        hdr->nhrec--;
1559
0
        if ( i < hdr->nhrec )
1560
0
            memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
1561
0
        bcf_hrec_destroy(hrec);
1562
0
        hdr->dirty = 1;
1563
0
    }
1564
0
}
1565
1566
int bcf_hdr_printf(bcf_hdr_t *hdr, const char *fmt, ...)
1567
0
{
1568
0
    char tmp[256], *line = tmp;
1569
0
    va_list ap;
1570
0
    va_start(ap, fmt);
1571
0
    int n = vsnprintf(line, sizeof(tmp), fmt, ap);
1572
0
    va_end(ap);
1573
1574
0
    if (n >= sizeof(tmp)) {
1575
0
        n++; // For trailing NUL
1576
0
        line = (char*)malloc(n);
1577
0
        if (!line)
1578
0
            return -1;
1579
1580
0
        va_start(ap, fmt);
1581
0
        vsnprintf(line, n, fmt, ap);
1582
0
        va_end(ap);
1583
0
    }
1584
1585
0
    int ret = bcf_hdr_append(hdr, line);
1586
1587
0
    if (line != tmp) free(line);
1588
0
    return ret;
1589
0
}
1590
1591
1592
/**********************
1593
 *** BCF header I/O ***
1594
 **********************/
1595
1596
const char *bcf_hdr_get_version(const bcf_hdr_t *hdr)
1597
436
{
1598
436
    bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
1599
436
    if ( !hrec )
1600
218
    {
1601
218
        hts_log_warning("No version string found, assuming VCFv4.2");
1602
218
        return "VCFv4.2";
1603
218
    }
1604
218
    return hrec->value;
1605
436
}
1606
1607
int bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version)
1608
0
{
1609
0
    bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
1610
0
    if ( !hrec )
1611
0
    {
1612
0
        int len;
1613
0
        kstring_t str = {0,0,0};
1614
0
        if ( ksprintf(&str,"##fileformat=%s", version) < 0 ) return -1;
1615
0
        hrec = bcf_hdr_parse_line(hdr, str.s, &len);
1616
0
        free(str.s);
1617
1618
0
        get_hdr_aux(hdr)->version = bcf_get_version(NULL, hrec->value);
1619
0
    }
1620
0
    else
1621
0
    {
1622
0
        bcf_hrec_t *tmp = bcf_hrec_dup(hrec);
1623
0
        if ( !tmp ) return -1;
1624
0
        free(tmp->value);
1625
0
        tmp->value = strdup(version);
1626
0
        if ( !tmp->value ) return -1;
1627
0
        bcf_hdr_update_hrec(hdr, hrec, tmp);
1628
0
        bcf_hrec_destroy(tmp);
1629
0
    }
1630
0
    hdr->dirty = 1;
1631
    //TODO rlen may change, deal with it
1632
0
    return 0; // FIXME: check for errs in this function (return < 0 if so)
1633
0
}
1634
1635
bcf_hdr_t *bcf_hdr_init(const char *mode)
1636
1.40k
{
1637
1.40k
    int i;
1638
1.40k
    bcf_hdr_t *h;
1639
1.40k
    h = (bcf_hdr_t*)calloc(1, sizeof(bcf_hdr_t));
1640
1.40k
    if (!h) return NULL;
1641
5.61k
    for (i = 0; i < 3; ++i) {
1642
4.21k
        if ((h->dict[i] = kh_init(vdict)) == NULL) goto fail;
1643
        // Supersize the hash to make collisions very unlikely
1644
4.21k
        static int dsize[3] = {16384,16384,2048}; // info, contig, format
1645
4.21k
        if (kh_resize(vdict, h->dict[i], dsize[i]) < 0) goto fail;
1646
4.21k
    }
1647
1648
1.40k
    bcf_hdr_aux_t *aux = (bcf_hdr_aux_t*)calloc(1,sizeof(bcf_hdr_aux_t));
1649
1.40k
    if ( !aux ) goto fail;
1650
1.40k
    if ( (aux->gen = kh_init(hdict))==NULL ) { free(aux); goto fail; }
1651
1.40k
    aux->key_len = NULL;
1652
1.40k
    aux->dict = *((vdict_t*)h->dict[0]);
1653
1.40k
    aux->version = 0;
1654
1.40k
    aux->ref_count = 1;
1655
1.40k
    free(h->dict[0]);
1656
1.40k
    h->dict[0] = aux;
1657
1658
1.40k
    if ( strchr(mode,'w') )
1659
0
    {
1660
0
        bcf_hdr_append(h, "##fileformat=VCFv4.2");
1661
        // The filter PASS must appear first in the dictionary
1662
0
        bcf_hdr_append(h, "##FILTER=<ID=PASS,Description=\"All filters passed\">");
1663
0
        aux->version = VCF_DEF;
1664
0
    }
1665
1.40k
    return h;
1666
1667
0
 fail:
1668
0
    for (i = 0; i < 3; ++i)
1669
0
        kh_destroy(vdict, h->dict[i]);
1670
0
    free(h);
1671
0
    return NULL;
1672
1.40k
}
1673
1674
void bcf_hdr_destroy(bcf_hdr_t *h)
1675
2.05k
{
1676
2.05k
    int i;
1677
2.05k
    khint_t k;
1678
2.05k
    if (!h) return;
1679
2.05k
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
1680
2.05k
    if (aux->ref_count > 1) // Refs still held, so delay destruction
1681
649
    {
1682
649
        aux->ref_count &= ~1;
1683
649
        return;
1684
649
    }
1685
5.61k
    for (i = 0; i < 3; ++i) {
1686
4.21k
        vdict_t *d = (vdict_t*)h->dict[i];
1687
4.21k
        if (d == 0) continue;
1688
48.8M
        for (k = kh_begin(d); k != kh_end(d); ++k)
1689
48.8M
            if (kh_exist(d, k)) free((char*)kh_key(d, k));
1690
4.21k
        if ( i==0 )
1691
1.40k
        {
1692
13.4k
            for (k=kh_begin(aux->gen); k<kh_end(aux->gen); k++)
1693
12.0k
                if ( kh_exist(aux->gen,k) ) free((char*)kh_key(aux->gen,k));
1694
1.40k
            kh_destroy(hdict, aux->gen);
1695
1.40k
            free(aux->key_len); // may exist for dict[0] only
1696
1.40k
        }
1697
4.21k
        kh_destroy(vdict, d);
1698
4.21k
        free(h->id[i]);
1699
4.21k
    }
1700
51.1k
    for (i=0; i<h->nhrec; i++)
1701
49.7k
        bcf_hrec_destroy(h->hrec[i]);
1702
1.40k
    if (h->nhrec) free(h->hrec);
1703
1.40k
    if (h->samples) free(h->samples);
1704
1.40k
    free(h->keep_samples);
1705
1.40k
    free(h->transl[0]); free(h->transl[1]);
1706
1.40k
    free(h->mem.s);
1707
1.40k
    free(h);
1708
1.40k
}
1709
1710
bcf_hdr_t *bcf_hdr_read(htsFile *hfp)
1711
1.40k
{
1712
1.40k
    if (hfp->format.format == vcf)
1713
1.29k
        return vcf_hdr_read(hfp);
1714
112
    if (hfp->format.format != bcf) {
1715
0
        hts_log_error("Input is not detected as bcf or vcf format");
1716
0
        return NULL;
1717
0
    }
1718
1719
112
    assert(hfp->is_bgzf);
1720
1721
112
    BGZF *fp = hfp->fp.bgzf;
1722
112
    uint8_t magic[5];
1723
112
    bcf_hdr_t *h;
1724
112
    h = bcf_hdr_init("r");
1725
112
    if (!h) {
1726
0
        hts_log_error("Failed to allocate bcf header");
1727
0
        return NULL;
1728
0
    }
1729
112
    if (bgzf_read(fp, magic, 5) != 5)
1730
0
    {
1731
0
        hts_log_error("Failed to read the header (reading BCF in text mode?)");
1732
0
        bcf_hdr_destroy(h);
1733
0
        return NULL;
1734
0
    }
1735
112
    if (strncmp((char*)magic, "BCF\2\2", 5) != 0)
1736
0
    {
1737
0
        if (!strncmp((char*)magic, "BCF", 3))
1738
0
            hts_log_error("Invalid BCF2 magic string: only BCFv2.2 is supported");
1739
0
        else
1740
0
            hts_log_error("Invalid BCF2 magic string");
1741
0
        bcf_hdr_destroy(h);
1742
0
        return NULL;
1743
0
    }
1744
112
    uint8_t buf[4];
1745
112
    size_t hlen;
1746
112
    char *htxt = NULL;
1747
112
    if (bgzf_read(fp, buf, 4) != 4) goto fail;
1748
112
    hlen = buf[0] | (buf[1] << 8) | (buf[2] << 16) | ((size_t) buf[3] << 24);
1749
112
    if (hlen >= SIZE_MAX) { errno = ENOMEM; goto fail; }
1750
112
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
1751
112
    if (hlen > FUZZ_ALLOC_LIMIT/2) { errno = ENOMEM; goto fail; }
1752
112
#endif
1753
112
    htxt = (char*)malloc(hlen + 1);
1754
112
    if (!htxt) goto fail;
1755
112
    if (bgzf_read(fp, htxt, hlen) != hlen) goto fail;
1756
112
    htxt[hlen] = '\0'; // Ensure htxt is terminated
1757
112
    if ( bcf_hdr_parse(h, htxt) < 0 ) goto fail;
1758
110
    free(htxt);
1759
1760
110
    bcf_hdr_incr_ref(h);
1761
110
    bgzf_set_private_data(fp, h, hdr_bgzf_private_data_cleanup);
1762
1763
110
    return h;
1764
2
 fail:
1765
2
    hts_log_error("Failed to read BCF header");
1766
2
    free(htxt);
1767
2
    bcf_hdr_destroy(h);
1768
2
    return NULL;
1769
112
}
1770
1771
int bcf_hdr_write(htsFile *hfp, bcf_hdr_t *h)
1772
1.18k
{
1773
1.18k
    if (!h) {
1774
0
        errno = EINVAL;
1775
0
        return -1;
1776
0
    }
1777
1.18k
    if ( h->dirty ) {
1778
0
        if (bcf_hdr_sync(h) < 0) return -1;
1779
0
    }
1780
1.18k
    hfp->format.category = variant_data;
1781
1.18k
    if (hfp->format.format == vcf || hfp->format.format == text_format) {
1782
594
        hfp->format.format = vcf;
1783
594
        return vcf_hdr_write(hfp, h);
1784
594
    }
1785
1786
594
    if (hfp->format.format == binary_format)
1787
594
        hfp->format.format = bcf;
1788
1789
594
    kstring_t htxt = {0,0,0};
1790
594
    if (bcf_hdr_format(h, 1, &htxt) < 0) {
1791
0
        free(htxt.s);
1792
0
        return -1;
1793
0
    }
1794
594
    kputc('\0', &htxt); // include the \0 byte
1795
1796
594
    BGZF *fp = hfp->fp.bgzf;
1797
594
    if ( bgzf_write(fp, "BCF\2\2", 5) !=5 ) return -1;
1798
594
    uint8_t hlen[4];
1799
594
    u32_to_le(htxt.l, hlen);
1800
594
    if ( bgzf_write(fp, hlen, 4) !=4 ) return -1;
1801
594
    if ( bgzf_write(fp, htxt.s, htxt.l) != htxt.l ) return -1;
1802
594
    if ( bgzf_flush(fp) < 0) return -1;
1803
1804
594
    bcf_hdr_incr_ref(h);
1805
594
    bgzf_set_private_data(fp, h, hdr_bgzf_private_data_cleanup);
1806
1807
594
    free(htxt.s);
1808
594
    return 0;
1809
594
}
1810
1811
/********************
1812
 *** BCF site I/O ***
1813
 ********************/
1814
1815
bcf1_t *bcf_init(void)
1816
1.18k
{
1817
1.18k
    bcf1_t *v;
1818
1.18k
    v = (bcf1_t*)calloc(1, sizeof(bcf1_t));
1819
1.18k
    return v;
1820
1.18k
}
1821
1822
void bcf_clear(bcf1_t *v)
1823
27.9k
{
1824
27.9k
    int i;
1825
27.9k
    for (i=0; i<v->d.m_info; i++)
1826
0
    {
1827
0
        if ( v->d.info[i].vptr_free )
1828
0
        {
1829
0
            free(v->d.info[i].vptr - v->d.info[i].vptr_off);
1830
0
            v->d.info[i].vptr_free = 0;
1831
0
        }
1832
0
    }
1833
27.9k
    for (i=0; i<v->d.m_fmt; i++)
1834
0
    {
1835
0
        if ( v->d.fmt[i].p_free )
1836
0
        {
1837
0
            free(v->d.fmt[i].p - v->d.fmt[i].p_off);
1838
0
            v->d.fmt[i].p_free = 0;
1839
0
        }
1840
0
    }
1841
27.9k
    v->rid = v->pos = v->rlen = v->unpacked = 0;
1842
27.9k
    bcf_float_set_missing(v->qual);
1843
27.9k
    v->n_info = v->n_allele = v->n_fmt = v->n_sample = 0;
1844
27.9k
    v->shared.l = v->indiv.l = 0;
1845
27.9k
    v->d.var_type = -1;
1846
27.9k
    v->d.shared_dirty = 0;
1847
27.9k
    v->d.indiv_dirty  = 0;
1848
27.9k
    v->d.n_flt = 0;
1849
27.9k
    v->errcode = 0;
1850
27.9k
    if (v->d.m_als) v->d.als[0] = 0;
1851
27.9k
    if (v->d.m_id) v->d.id[0] = 0;
1852
27.9k
}
1853
1854
void bcf_empty(bcf1_t *v)
1855
1.18k
{
1856
1.18k
    bcf_clear1(v);
1857
1.18k
    free(v->d.id);
1858
1.18k
    free(v->d.als);
1859
1.18k
    free(v->d.allele); free(v->d.flt); free(v->d.info); free(v->d.fmt);
1860
1.18k
    if (v->d.var ) free(v->d.var);
1861
1.18k
    free(v->shared.s); free(v->indiv.s);
1862
1.18k
    memset(&v->d,0,sizeof(v->d));
1863
1.18k
    memset(&v->shared,0,sizeof(v->shared));
1864
1.18k
    memset(&v->indiv,0,sizeof(v->indiv));
1865
1.18k
}
1866
1867
void bcf_destroy(bcf1_t *v)
1868
1.18k
{
1869
1.18k
    if (!v) return;
1870
1.18k
    bcf_empty1(v);
1871
1.18k
    free(v);
1872
1.18k
}
1873
1874
static inline int bcf_read1_core(BGZF *fp, bcf1_t *v)
1875
110
{
1876
110
    uint8_t x[32];
1877
110
    ssize_t ret;
1878
110
    uint32_t shared_len, indiv_len;
1879
110
    if ((ret = bgzf_read(fp, x, 32)) != 32) {
1880
0
        if (ret == 0) return -1;
1881
0
        return -2;
1882
0
    }
1883
110
    bcf_clear1(v);
1884
110
    shared_len = le_to_u32(x);
1885
110
    if (shared_len < 24) return -2;
1886
110
    shared_len -= 24; // to exclude six 32-bit integers
1887
110
    indiv_len = le_to_u32(x + 4);
1888
110
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
1889
    // ks_resize() normally allocates 1.5 * requested size to allow for growth
1890
110
    if ((uint64_t) shared_len + indiv_len > FUZZ_ALLOC_LIMIT / 3 * 2) return -2;
1891
110
#endif
1892
110
    if (ks_resize(&v->shared, shared_len ? shared_len : 1) != 0) return -2;
1893
110
    if (ks_resize(&v->indiv, indiv_len ? indiv_len : 1) != 0) return -2;
1894
110
    v->rid  = le_to_i32(x + 8);
1895
110
    v->pos  = le_to_u32(x + 12);
1896
110
    if ( v->pos==UINT32_MAX ) v->pos = -1;  // this is for telomere coordinate, e.g. MT:0
1897
110
    v->rlen = le_to_i32(x + 16);
1898
110
    v->qual = le_to_float(x + 20);
1899
110
    v->n_info = le_to_u16(x + 24);
1900
110
    v->n_allele = le_to_u16(x + 26);
1901
110
    v->n_sample = le_to_u32(x + 28) & 0xffffff;
1902
110
    v->n_fmt = x[31];
1903
110
    v->shared.l = shared_len;
1904
110
    v->indiv.l = indiv_len;
1905
    // silent fix of broken BCFs produced by earlier versions of bcf_subset, prior to and including bd6ed8b4
1906
110
    if ( (!v->indiv.l || !v->n_sample) && v->n_fmt ) v->n_fmt = 0;
1907
1908
110
    if (bgzf_read(fp, v->shared.s, v->shared.l) != v->shared.l) return -2;
1909
108
    if (bgzf_read(fp, v->indiv.s, v->indiv.l) != v->indiv.l) return -2;
1910
108
    return 0;
1911
108
}
1912
1913
0
#define bit_array_size(n) ((n)/8+1)
1914
0
#define bit_array_set(a,i)   ((a)[(i)/8] |=   1 << ((i)%8))
1915
0
#define bit_array_clear(a,i) ((a)[(i)/8] &= ~(1 << ((i)%8)))
1916
0
#define bit_array_test(a,i)  ((a)[(i)/8] &   (1 << ((i)%8)))
1917
1918
static int bcf_dec_typed_int1_safe(uint8_t *p, uint8_t *end, uint8_t **q,
1919
4.28k
                                   int32_t *val) {
1920
4.28k
    uint32_t t;
1921
4.28k
    if (end - p < 2) return -1;
1922
4.28k
    t = *p++ & 0xf;
1923
    /* Use if .. else if ... else instead of switch to force order.  Assumption
1924
       is that small integers are more frequent than big ones. */
1925
4.28k
    if (t == BCF_BT_INT8) {
1926
2.33k
        *val = *(int8_t *) p++;
1927
2.33k
    } else {
1928
1.94k
        if (end - p < (1<<bcf_type_shift[t])) return -1;
1929
1.94k
        if (t == BCF_BT_INT16) {
1930
1.44k
            *val = le_to_i16(p);
1931
1.44k
            p += 2;
1932
1.44k
        } else if (t == BCF_BT_INT32) {
1933
430
            *val = le_to_i32(p);
1934
430
            p += 4;
1935
#ifdef VCF_ALLOW_INT64
1936
        } else if (t == BCF_BT_INT64) {
1937
            // This case should never happen because there should be no
1938
            // 64-bit BCFs at all, definitely not coming from htslib
1939
            *val = le_to_i64(p);
1940
            p += 8;
1941
#endif
1942
430
        } else {
1943
72
            return -1;
1944
72
        }
1945
1.94k
    }
1946
4.20k
    *q = p;
1947
4.20k
    return 0;
1948
4.28k
}
1949
1950
static int bcf_dec_size_safe(uint8_t *p, uint8_t *end, uint8_t **q,
1951
12.2k
                             int *num, int *type) {
1952
12.2k
    int r;
1953
12.2k
    if (p >= end) return -1;
1954
12.2k
    *type = *p & 0xf;
1955
12.2k
    if (*p>>4 != 15) {
1956
11.8k
        *q = p + 1;
1957
11.8k
        *num = *p >> 4;
1958
11.8k
        return 0;
1959
11.8k
    }
1960
348
    r = bcf_dec_typed_int1_safe(p + 1, end, q, num);
1961
348
    if (r) return r;
1962
334
    return *num >= 0 ? 0 : -1;
1963
348
}
1964
1965
182
static const char *get_type_name(int type) {
1966
182
    const char *types[9] = {
1967
182
        "null", "int (8-bit)", "int (16 bit)", "int (32 bit)",
1968
182
        "unknown", "float", "unknown", "char", "unknown"
1969
182
    };
1970
182
    int t = (type >= 0 && type < 8) ? type : 8;
1971
182
    return types[t];
1972
182
}
1973
1974
/**
1975
 *  updatephasing - updates 1st phasing based on other phasing status
1976
 *  @param p - pointer to phase value array
1977
 *  @param end - end of array
1978
 *  @param q - pointer to consumed data
1979
 *  @param samples - no. of samples in array
1980
 *  @param ploidy - no. of phasing values per sample
1981
 *  @param type - value type (one of BCF_BT_...)
1982
 *  Returns 0 on success and 1 on failure
1983
 *  Update for haploids made only if it is not unknown (.)
1984
 */
1985
static int updatephasing(uint8_t *p, uint8_t *end, uint8_t **q, int samples, int ploidy, int type)
1986
0
{
1987
0
    int j, k;
1988
0
    unsigned int inc = 1 << bcf_type_shift[type];
1989
0
    ptrdiff_t bytes = samples * ploidy * inc;
1990
1991
0
    if (samples < 0 || ploidy < 0 || end - p < bytes)
1992
0
        return 1;
1993
1994
    /*
1995
     * This works because phasing is stored in the least-significant bit
1996
     * of the GT encoding, and the data is always stored little-endian.
1997
     * Thus it's possible to get the desired result by doing bit operations
1998
     * on the least-significant byte of each value and ignoring the
1999
     * higher bytes (for 16-bit and 32-bit values).
2000
     */
2001
2002
0
    switch (ploidy) {
2003
0
    case 1:
2004
        // Trivial case - haploid data is phased by default
2005
0
        for (j = 0; j < samples; ++j) {
2006
0
            if (*p) *p |= 1;    //only if not unknown (.)
2007
0
            p += inc;
2008
0
        }
2009
0
        break;
2010
0
    case 2:
2011
        // Mostly trivial case - first is phased if second is.
2012
0
        for (j = 0; j < samples; ++j) {
2013
0
            *p |= (p[inc] & 1);
2014
0
            p += 2 * inc;
2015
0
        }
2016
0
        break;
2017
0
    default:
2018
        // Generic case - first is phased if all other alleles are.
2019
0
        for (j = 0; j < samples; ++j) {
2020
0
            uint8_t allphased = 1;
2021
0
            for (k = 1; k < ploidy; ++k)
2022
0
                allphased &= (p[inc * k]);
2023
0
            *p |= allphased;
2024
0
            p += ploidy * inc;
2025
0
        }
2026
0
    }
2027
0
    *q = p;
2028
0
    return 0;
2029
0
}
2030
2031
static void bcf_record_check_err(const bcf_hdr_t *hdr, bcf1_t *rec,
2032
2.01k
                                 char *type, uint32_t *reports, int i) {
2033
2.01k
    if (*reports == 0 || hts_verbose >= HTS_LOG_DEBUG)
2034
34
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos
2035
2.01k
                        ": Invalid FORMAT %s %d",
2036
2.01k
                        bcf_seqname_safe(hdr,rec), rec->pos+1, type, i);
2037
2.01k
    (*reports)++;
2038
2.01k
}
2039
2040
108
static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) {
2041
108
    uint8_t *ptr, *end;
2042
108
    size_t bytes;
2043
108
    uint32_t err = 0;
2044
108
    int type = 0;
2045
108
    int num  = 0;
2046
108
    uint32_t i, reports;
2047
108
    const uint32_t is_integer = ((1 << BCF_BT_INT8)  |
2048
108
                                 (1 << BCF_BT_INT16) |
2049
#ifdef VCF_ALLOW_INT64
2050
                                 (1 << BCF_BT_INT64) |
2051
#endif
2052
108
                                 (1 << BCF_BT_INT32));
2053
108
    const uint32_t is_valid_type = (is_integer          |
2054
108
                                    (1 << BCF_BT_NULL)  |
2055
108
                                    (1 << BCF_BT_FLOAT) |
2056
108
                                    (1 << BCF_BT_CHAR));
2057
108
    int32_t max_id = hdr ? hdr->n[BCF_DT_ID] : 0;
2058
    /* set phasing for 1st allele as in v44 for versions upto v43, to have
2059
    consistent binary values irrespective of version; not run for v >= v44,
2060
    to retain explicit phasing in v44 and higher */
2061
108
    int idgt = hdr ?
2062
108
                    bcf_get_version(hdr, NULL) < VCF44 ?
2063
108
                        bcf_hdr_id2int(hdr, BCF_DT_ID, "GT") : -1 :
2064
108
                    -1;
2065
2066
    // Check for valid contig ID
2067
108
    if (rec->rid < 0
2068
88
        || (hdr && (rec->rid >= hdr->n[BCF_DT_CTG]
2069
108
                    || hdr->id[BCF_DT_CTG][rec->rid].key == NULL))) {
2070
108
        hts_log_warning("Bad BCF record at %"PRIhts_pos": Invalid %s id %d", rec->pos+1, "CONTIG", rec->rid);
2071
108
        err |= BCF_ERR_CTG_INVALID;
2072
108
    }
2073
2074
    // Check ID
2075
108
    ptr = (uint8_t *) rec->shared.s;
2076
108
    end = ptr + rec->shared.l;
2077
108
    if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
2078
106
    if (type != BCF_BT_CHAR) {
2079
106
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "ID", type, get_type_name(type));
2080
106
        err |= BCF_ERR_TAG_INVALID;
2081
106
    }
2082
106
    bytes = (size_t) num << bcf_type_shift[type];
2083
106
    if (end - ptr < bytes) goto bad_shared;
2084
106
    ptr += bytes;
2085
2086
    // Check REF and ALT
2087
106
    if (rec->n_allele < 1) {
2088
42
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": No REF allele",
2089
42
                        bcf_seqname_safe(hdr,rec), rec->pos+1);
2090
42
        err |= BCF_ERR_TAG_UNDEF;
2091
42
    }
2092
2093
106
    reports = 0;
2094
8.23k
    for (i = 0; i < rec->n_allele; i++) {
2095
8.13k
        if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
2096
8.12k
        if (type != BCF_BT_CHAR) {
2097
7.92k
            if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
2098
64
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "REF/ALT", type, get_type_name(type));
2099
7.92k
            err |= BCF_ERR_CHAR;
2100
7.92k
        }
2101
8.12k
        bytes = (size_t) num << bcf_type_shift[type];
2102
8.12k
        if (end - ptr < bytes) goto bad_shared;
2103
8.12k
        ptr += bytes;
2104
8.12k
    }
2105
2106
    // Check FILTER
2107
96
    reports = 0;
2108
96
    if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
2109
96
    if (num > 0) {
2110
48
        bytes = (size_t) num << bcf_type_shift[type];
2111
48
        if (((1 << type) & is_integer) == 0) {
2112
10
            hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "FILTER", type, get_type_name(type));
2113
10
            err |= BCF_ERR_TAG_INVALID;
2114
10
            if (end - ptr < bytes) goto bad_shared;
2115
10
            ptr += bytes;
2116
38
        } else {
2117
38
            if (end - ptr < bytes) goto bad_shared;
2118
2.56k
            for (i = 0; i < num; i++) {
2119
2.52k
                int32_t key = bcf_dec_int1(ptr, type, &ptr);
2120
2.52k
                if (key < 0
2121
1.63k
                    || (hdr && (key >= max_id
2122
2.32k
                                || hdr->id[BCF_DT_ID][key].key == NULL))) {
2123
2.32k
                    if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
2124
38
                        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s id %d", bcf_seqname_safe(hdr,rec), rec->pos+1, "FILTER", key);
2125
2.32k
                    err |= BCF_ERR_TAG_UNDEF;
2126
2.32k
                }
2127
2.52k
            }
2128
38
        }
2129
48
    }
2130
2131
    // Check INFO
2132
96
    reports = 0;
2133
96
    bcf_idpair_t *id_tmp = hdr ? hdr->id[BCF_DT_ID] : NULL;
2134
1.97k
    for (i = 0; i < rec->n_info; i++) {
2135
1.93k
        int32_t key = -1;
2136
1.93k
        if (bcf_dec_typed_int1_safe(ptr, end, &ptr, &key) != 0) goto bad_shared;
2137
1.89k
        if (key < 0 || (hdr && (key >= max_id
2138
1.60k
                                || id_tmp[key].key == NULL))) {
2139
1.60k
            if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
2140
44
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s id %d", bcf_seqname_safe(hdr,rec), rec->pos+1, "INFO", key);
2141
1.60k
            err |= BCF_ERR_TAG_UNDEF;
2142
1.60k
        }
2143
1.89k
        if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
2144
1.88k
        if (((1 << type) & is_valid_type) == 0
2145
1.80k
            || (type == BCF_BT_NULL && num > 0)) {
2146
74
            if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
2147
2
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "INFO", type, get_type_name(type));
2148
74
            err |= BCF_ERR_TAG_INVALID;
2149
74
        }
2150
1.88k
        bytes = (size_t) num << bcf_type_shift[type];
2151
1.88k
        if (end - ptr < bytes) goto bad_shared;
2152
1.87k
        ptr += bytes;
2153
1.87k
    }
2154
2155
    // Check FORMAT and individual information
2156
38
    ptr = (uint8_t *) rec->indiv.s;
2157
38
    end = ptr + rec->indiv.l;
2158
38
    reports = 0;
2159
2.00k
    for (i = 0; i < rec->n_fmt; i++) {
2160
2.00k
        int32_t key = -1;
2161
2.00k
        if (bcf_dec_typed_int1_safe(ptr, end, &ptr, &key) != 0) goto bad_indiv;
2162
1.98k
        if (key < 0
2163
1.87k
            || (hdr && (key >= max_id
2164
1.88k
                        || id_tmp[key].key == NULL))) {
2165
1.88k
            bcf_record_check_err(hdr, rec, "id", &reports, key);
2166
1.88k
            err |= BCF_ERR_TAG_UNDEF;
2167
1.88k
        }
2168
1.98k
        if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_indiv;
2169
1.97k
        if (((1 << type) & is_valid_type) == 0
2170
1.90k
            || (type == BCF_BT_NULL && num > 0)) {
2171
128
            bcf_record_check_err(hdr, rec, "type", &reports, type);
2172
128
            err |= BCF_ERR_TAG_INVALID;
2173
128
        }
2174
1.97k
        if (idgt >= 0 && idgt == key) {
2175
            // check first GT phasing bit and fix up if necessary
2176
0
            if (updatephasing(ptr, end, &ptr, rec->n_sample, num, type)) {
2177
0
                err |= BCF_ERR_TAG_INVALID;
2178
0
            }
2179
1.97k
        } else {
2180
1.97k
            bytes = ((size_t) num << bcf_type_shift[type]) * rec->n_sample;
2181
1.97k
            if (end - ptr < bytes) goto bad_indiv;
2182
1.96k
            ptr += bytes;
2183
1.96k
        }
2184
1.97k
    }
2185
2186
0
    if (!err && rec->rlen < 0) {
2187
        // Treat bad rlen as a warning instead of an error, and try to
2188
        // fix up by using the length of the stored REF allele.
2189
0
        static int warned = 0;
2190
0
        if (!warned) {
2191
0
            hts_log_warning("BCF record at %s:%"PRIhts_pos" has invalid RLEN (%"PRIhts_pos"). "
2192
0
                            "Only one invalid RLEN will be reported.",
2193
0
                            bcf_seqname_safe(hdr,rec), rec->pos+1, rec->rlen);
2194
0
            warned = 1;
2195
0
        }
2196
        //find rlen considering reflen, END, SVLEN, fmt LEN
2197
0
        hts_pos_t len = get_rlen(hdr, rec);
2198
0
        rec->rlen = len >= 0 ? len : 0;
2199
0
    }
2200
2201
0
    rec->errcode |= err;
2202
2203
0
    return err ? -2 : 0; // Return -2 so bcf_read() reports an error
2204
2205
70
 bad_shared:
2206
70
    hts_log_error("Bad BCF record at %s:%"PRIhts_pos" - shared section malformed or too short", bcf_seqname_safe(hdr,rec), rec->pos+1);
2207
70
    return -2;
2208
2209
38
 bad_indiv:
2210
38
    hts_log_error("Bad BCF record at %s:%"PRIhts_pos" - individuals section malformed or too short", bcf_seqname_safe(hdr,rec), rec->pos+1);
2211
38
    return -2;
2212
38
}
2213
2214
static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt);
2215
int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec)
2216
0
{
2217
0
    if ( !hdr->keep_samples ) return 0;
2218
0
    if ( !bcf_hdr_nsamples(hdr) )
2219
0
    {
2220
0
        rec->indiv.l = rec->n_sample = 0;
2221
0
        return 0;
2222
0
    }
2223
2224
0
    int i, j;
2225
0
    uint8_t *ptr = (uint8_t*)rec->indiv.s, *dst = NULL, *src;
2226
0
    bcf_dec_t *dec = &rec->d;
2227
0
    hts_expand(bcf_fmt_t, rec->n_fmt, dec->m_fmt, dec->fmt);
2228
0
    for (i=0; i<dec->m_fmt; ++i) dec->fmt[i].p_free = 0;
2229
2230
0
    for (i=0; i<rec->n_fmt; i++)
2231
0
    {
2232
0
        ptr = bcf_unpack_fmt_core1(ptr, rec->n_sample, &dec->fmt[i]);
2233
0
        src = dec->fmt[i].p - dec->fmt[i].size;
2234
0
        if ( dst )
2235
0
        {
2236
0
            memmove(dec->fmt[i-1].p + dec->fmt[i-1].p_len, dec->fmt[i].p - dec->fmt[i].p_off, dec->fmt[i].p_off);
2237
0
            dec->fmt[i].p = dec->fmt[i-1].p + dec->fmt[i-1].p_len + dec->fmt[i].p_off;
2238
0
        }
2239
0
        dst = dec->fmt[i].p;
2240
0
        for (j=0; j<hdr->nsamples_ori; j++)
2241
0
        {
2242
0
            src += dec->fmt[i].size;
2243
0
            if ( !bit_array_test(hdr->keep_samples,j) ) continue;
2244
0
            memmove(dst, src, dec->fmt[i].size);
2245
0
            dst += dec->fmt[i].size;
2246
0
        }
2247
0
        rec->indiv.l -= dec->fmt[i].p_len - (dst - dec->fmt[i].p);
2248
0
        dec->fmt[i].p_len = dst - dec->fmt[i].p;
2249
0
    }
2250
0
    rec->unpacked |= BCF_UN_FMT;
2251
2252
0
    rec->n_sample = bcf_hdr_nsamples(hdr);
2253
0
    return 0;
2254
0
}
2255
2256
int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
2257
26.9k
{
2258
26.9k
    if (fp->format.format == vcf) return vcf_read(fp, h, v);
2259
110
    if (!h)
2260
0
        h = (const bcf_hdr_t *) bgzf_get_private_data(fp->fp.bgzf);
2261
110
    int ret = bcf_read1_core(fp->fp.bgzf, v);
2262
110
    if (ret == 0) ret = bcf_record_check(h, v);
2263
110
    if ( ret!=0 || !h->keep_samples ) return ret;
2264
0
    return bcf_subset_format(h,v);
2265
110
}
2266
2267
int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, hts_pos_t *beg, hts_pos_t *end)
2268
0
{
2269
0
    bcf1_t *v = (bcf1_t *) vv;
2270
0
    const bcf_hdr_t *hdr = (const bcf_hdr_t *) bgzf_get_private_data(fp);
2271
0
    int ret = bcf_read1_core(fp, v);
2272
0
    if (ret == 0) ret = bcf_record_check(hdr, v);
2273
0
    if (ret  >= 0)
2274
0
        *tid = v->rid, *beg = v->pos, *end = v->pos + v->rlen;
2275
0
    return ret;
2276
0
}
2277
2278
static inline int bcf1_sync_id(bcf1_t *line, kstring_t *str)
2279
0
{
2280
    // single typed string
2281
0
    if ( line->d.id && strcmp(line->d.id, ".") ) {
2282
0
        return bcf_enc_vchar(str, strlen(line->d.id), line->d.id);
2283
0
    } else {
2284
0
        return bcf_enc_size(str, 0, BCF_BT_CHAR);
2285
0
    }
2286
0
}
2287
static inline int bcf1_sync_alleles(bcf1_t *line, kstring_t *str)
2288
0
{
2289
    // list of typed strings
2290
0
    int i;
2291
0
    for (i=0; i<line->n_allele; i++) {
2292
0
        if (bcf_enc_vchar(str, strlen(line->d.allele[i]), line->d.allele[i]) < 0)
2293
0
            return -1;
2294
0
    }
2295
0
    if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
2296
0
    return 0;
2297
0
}
2298
static inline int bcf1_sync_filter(bcf1_t *line, kstring_t *str)
2299
0
{
2300
    // typed vector of integers
2301
0
    if ( line->d.n_flt ) {
2302
0
        return bcf_enc_vint(str, line->d.n_flt, line->d.flt, -1);
2303
0
    } else {
2304
0
        return bcf_enc_vint(str, 0, 0, -1);
2305
0
    }
2306
0
}
2307
2308
static inline int bcf1_sync_info(bcf1_t *line, kstring_t *str)
2309
0
{
2310
    // pairs of typed vectors
2311
0
    int i, irm = -1, e = 0;
2312
0
    for (i=0; i<line->n_info; i++)
2313
0
    {
2314
0
        bcf_info_t *info = &line->d.info[i];
2315
0
        if ( !info->vptr )
2316
0
        {
2317
            // marked for removal
2318
0
            if ( irm < 0 ) irm = i;
2319
0
            continue;
2320
0
        }
2321
0
        e |= kputsn_(info->vptr - info->vptr_off, info->vptr_len + info->vptr_off, str) < 0;
2322
0
        if ( irm >=0 )
2323
0
        {
2324
0
            bcf_info_t tmp = line->d.info[irm]; line->d.info[irm] = line->d.info[i]; line->d.info[i] = tmp;
2325
0
            while ( irm<=i && line->d.info[irm].vptr ) irm++;
2326
0
        }
2327
0
    }
2328
0
    if ( irm>=0 ) line->n_info = irm;
2329
0
    return e == 0 ? 0 : -1;
2330
0
}
2331
2332
static int bcf1_sync(bcf1_t *line)
2333
0
{
2334
0
    char *shared_ori = line->shared.s;
2335
0
    size_t prev_len;
2336
2337
0
    kstring_t tmp = {0,0,0};
2338
0
    if ( !line->shared.l )
2339
0
    {
2340
        // New line created via API, BCF data blocks do not exist. Get it ready for BCF output
2341
0
        tmp = line->shared;
2342
0
        bcf1_sync_id(line, &tmp);
2343
0
        line->unpack_size[0] = tmp.l; prev_len = tmp.l;
2344
2345
0
        bcf1_sync_alleles(line, &tmp);
2346
0
        line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
2347
2348
0
        bcf1_sync_filter(line, &tmp);
2349
0
        line->unpack_size[2] = tmp.l - prev_len;
2350
2351
0
        bcf1_sync_info(line, &tmp);
2352
0
        line->shared = tmp;
2353
0
    }
2354
0
    else if ( line->d.shared_dirty )
2355
0
    {
2356
        // The line was edited, update the BCF data block.
2357
2358
0
        if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line,BCF_UN_STR);
2359
2360
        // ptr_ori points to the original unchanged BCF data.
2361
0
        uint8_t *ptr_ori = (uint8_t *) line->shared.s;
2362
2363
        // ID: single typed string
2364
0
        if ( line->d.shared_dirty & BCF1_DIRTY_ID )
2365
0
            bcf1_sync_id(line, &tmp);
2366
0
        else
2367
0
            kputsn_(ptr_ori, line->unpack_size[0], &tmp);
2368
0
        ptr_ori += line->unpack_size[0];
2369
0
        line->unpack_size[0] = tmp.l; prev_len = tmp.l;
2370
2371
        // REF+ALT: list of typed strings
2372
0
        if ( line->d.shared_dirty & BCF1_DIRTY_ALS )
2373
0
            bcf1_sync_alleles(line, &tmp);
2374
0
        else
2375
0
        {
2376
0
            kputsn_(ptr_ori, line->unpack_size[1], &tmp);
2377
0
            if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
2378
0
        }
2379
0
        ptr_ori += line->unpack_size[1];
2380
0
        line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
2381
2382
0
        if ( line->unpacked & BCF_UN_FLT )
2383
0
        {
2384
            // FILTER: typed vector of integers
2385
0
            if ( line->d.shared_dirty & BCF1_DIRTY_FLT )
2386
0
                bcf1_sync_filter(line, &tmp);
2387
0
            else if ( line->d.n_flt )
2388
0
                kputsn_(ptr_ori, line->unpack_size[2], &tmp);
2389
0
            else
2390
0
                bcf_enc_vint(&tmp, 0, 0, -1);
2391
0
            ptr_ori += line->unpack_size[2];
2392
0
            line->unpack_size[2] = tmp.l - prev_len;
2393
2394
0
            if ( line->unpacked & BCF_UN_INFO )
2395
0
            {
2396
                // INFO: pairs of typed vectors
2397
0
                if ( line->d.shared_dirty & BCF1_DIRTY_INF )
2398
0
                {
2399
0
                    bcf1_sync_info(line, &tmp);
2400
0
                    ptr_ori = (uint8_t*)line->shared.s + line->shared.l;
2401
0
                }
2402
0
            }
2403
0
        }
2404
2405
0
        int size = line->shared.l - (size_t)ptr_ori + (size_t)line->shared.s;
2406
0
        if ( size ) kputsn_(ptr_ori, size, &tmp);
2407
2408
0
        free(line->shared.s);
2409
0
        line->shared = tmp;
2410
0
    }
2411
0
    if ( line->shared.s != shared_ori && line->unpacked & BCF_UN_INFO )
2412
0
    {
2413
        // Reallocated line->shared.s block invalidated line->d.info[].vptr pointers
2414
0
        size_t off_new = line->unpack_size[0] + line->unpack_size[1] + line->unpack_size[2];
2415
0
        int i;
2416
0
        for (i=0; i<line->n_info; i++)
2417
0
        {
2418
0
            uint8_t *vptr_free = line->d.info[i].vptr_free ? line->d.info[i].vptr - line->d.info[i].vptr_off : NULL;
2419
0
            line->d.info[i].vptr = (uint8_t*) line->shared.s + off_new + line->d.info[i].vptr_off;
2420
0
            off_new += line->d.info[i].vptr_len + line->d.info[i].vptr_off;
2421
0
            if ( vptr_free )
2422
0
            {
2423
0
                free(vptr_free);
2424
0
                line->d.info[i].vptr_free = 0;
2425
0
            }
2426
0
        }
2427
0
    }
2428
2429
0
    if ( line->n_sample && line->n_fmt && (!line->indiv.l || line->d.indiv_dirty) )
2430
0
    {
2431
        // The genotype fields changed or are not present
2432
0
        tmp.l = tmp.m = 0; tmp.s = NULL;
2433
0
        int i, irm = -1;
2434
0
        for (i=0; i<line->n_fmt; i++)
2435
0
        {
2436
0
            bcf_fmt_t *fmt = &line->d.fmt[i];
2437
0
            if ( !fmt->p )
2438
0
            {
2439
                // marked for removal
2440
0
                if ( irm < 0 ) irm = i;
2441
0
                continue;
2442
0
            }
2443
0
            kputsn_(fmt->p - fmt->p_off, fmt->p_len + fmt->p_off, &tmp);
2444
0
            if ( irm >=0 )
2445
0
            {
2446
0
                bcf_fmt_t tfmt = line->d.fmt[irm]; line->d.fmt[irm] = line->d.fmt[i]; line->d.fmt[i] = tfmt;
2447
0
                while ( irm<=i && line->d.fmt[irm].p ) irm++;
2448
0
            }
2449
2450
0
        }
2451
0
        if ( irm>=0 ) line->n_fmt = irm;
2452
0
        free(line->indiv.s);
2453
0
        line->indiv = tmp;
2454
2455
        // Reallocated line->indiv.s block invalidated line->d.fmt[].p pointers
2456
0
        size_t off_new = 0;
2457
0
        for (i=0; i<line->n_fmt; i++)
2458
0
        {
2459
0
            uint8_t *p_free = line->d.fmt[i].p_free ? line->d.fmt[i].p - line->d.fmt[i].p_off : NULL;
2460
0
            line->d.fmt[i].p = (uint8_t*) line->indiv.s + off_new + line->d.fmt[i].p_off;
2461
0
            off_new += line->d.fmt[i].p_len + line->d.fmt[i].p_off;
2462
0
            if ( p_free )
2463
0
            {
2464
0
                free(p_free);
2465
0
                line->d.fmt[i].p_free = 0;
2466
0
            }
2467
0
        }
2468
0
    }
2469
0
    if ( !line->n_sample ) line->n_fmt = 0;
2470
0
    line->d.shared_dirty = line->d.indiv_dirty = 0;
2471
0
    return 0;
2472
0
}
2473
2474
bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src)
2475
0
{
2476
0
    bcf1_sync(src);
2477
2478
0
    bcf_clear(dst);
2479
0
    dst->rid  = src->rid;
2480
0
    dst->pos  = src->pos;
2481
0
    dst->rlen = src->rlen;
2482
0
    dst->qual = src->qual;
2483
0
    dst->n_info = src->n_info; dst->n_allele = src->n_allele;
2484
0
    dst->n_fmt = src->n_fmt; dst->n_sample = src->n_sample;
2485
2486
0
    if ( dst->shared.m < src->shared.l )
2487
0
    {
2488
0
        dst->shared.s = (char*) realloc(dst->shared.s, src->shared.l);
2489
0
        dst->shared.m = src->shared.l;
2490
0
    }
2491
0
    dst->shared.l = src->shared.l;
2492
0
    memcpy(dst->shared.s,src->shared.s,dst->shared.l);
2493
2494
0
    if ( dst->indiv.m < src->indiv.l )
2495
0
    {
2496
0
        dst->indiv.s = (char*) realloc(dst->indiv.s, src->indiv.l);
2497
0
        dst->indiv.m = src->indiv.l;
2498
0
    }
2499
0
    dst->indiv.l = src->indiv.l;
2500
0
    memcpy(dst->indiv.s,src->indiv.s,dst->indiv.l);
2501
2502
0
    return dst;
2503
0
}
2504
bcf1_t *bcf_dup(bcf1_t *src)
2505
0
{
2506
0
    bcf1_t *out = bcf_init1();
2507
0
    return bcf_copy(out, src);
2508
0
}
2509
2510
int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v)
2511
26.2k
{
2512
26.2k
    if ( h->dirty ) {
2513
0
        if (bcf_hdr_sync(h) < 0) return -1;
2514
0
    }
2515
26.2k
    if ( bcf_hdr_nsamples(h)!=v->n_sample )
2516
17
    {
2517
17
        hts_log_error("Broken VCF record, the number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)",
2518
17
            bcf_seqname_safe(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h));
2519
17
        return -1;
2520
17
    }
2521
2522
26.2k
    if ( hfp->format.format == vcf || hfp->format.format == text_format )
2523
25.7k
        return vcf_write(hfp,h,v);
2524
2525
508
    if ( v->errcode & ~BCF_ERR_LIMITS ) // todo: unsure about the other BCF_ERR_LIMITS branches in vcf_parse_format_alloc4()
2526
508
    {
2527
        // vcf_parse1() encountered a new contig or tag, undeclared in the
2528
        // header.  At this point, the header must have been printed,
2529
        // proceeding would lead to a broken BCF file. Errors must be checked
2530
        // and cleared by the caller before we can proceed.
2531
508
        char errdescription[1024] = "";
2532
508
        hts_log_error("Unchecked error (%d %s) at %s:%"PRIhts_pos, v->errcode, bcf_strerror(v->errcode, errdescription, sizeof(errdescription)), bcf_seqname_safe(h,v), v->pos+1);
2533
508
        return -1;
2534
508
    }
2535
0
    bcf1_sync(v);   // check if the BCF record was modified
2536
2537
0
    if ( v->unpacked & BCF_IS_64BIT )
2538
0
    {
2539
0
        hts_log_error("Data at %s:%"PRIhts_pos" contains 64-bit values not representable in BCF. Please use VCF instead", bcf_seqname_safe(h,v), v->pos+1);
2540
0
        return -1;
2541
0
    }
2542
2543
0
    BGZF *fp = hfp->fp.bgzf;
2544
0
    uint8_t x[32];
2545
0
    u32_to_le(v->shared.l + 24, x); // to include six 32-bit integers
2546
0
    u32_to_le(v->indiv.l, x + 4);
2547
0
    i32_to_le(v->rid, x + 8);
2548
0
    u32_to_le(v->pos, x + 12);
2549
0
    u32_to_le(v->rlen, x + 16);
2550
0
    float_to_le(v->qual, x + 20);
2551
0
    u16_to_le(v->n_info, x + 24);
2552
0
    u16_to_le(v->n_allele, x + 26);
2553
0
    u32_to_le((uint32_t)v->n_fmt<<24 | (v->n_sample & 0xffffff), x + 28);
2554
0
    if ( bgzf_write(fp, x, 32) != 32 ) return -1;
2555
0
    if ( bgzf_write(fp, v->shared.s, v->shared.l) != v->shared.l ) return -1;
2556
0
    if ( bgzf_write(fp, v->indiv.s, v->indiv.l) != v->indiv.l ) return -1;
2557
2558
0
    if (hfp->idx) {
2559
0
        if (bgzf_idx_push(fp, hfp->idx, v->rid, v->pos, v->pos + v->rlen,
2560
0
                          bgzf_tell(fp), 1) < 0)
2561
0
            return -1;
2562
0
    }
2563
2564
0
    return 0;
2565
0
}
2566
2567
/**********************
2568
 *** VCF header I/O ***
2569
 **********************/
2570
2571
0
static int add_missing_contig_hrec(bcf_hdr_t *h, const char *name) {
2572
0
    bcf_hrec_t *hrec = calloc(1, sizeof(bcf_hrec_t));
2573
0
    int save_errno;
2574
0
    if (!hrec) goto fail;
2575
2576
0
    hrec->key = strdup("contig");
2577
0
    if (!hrec->key) goto fail;
2578
2579
0
    if (bcf_hrec_add_key(hrec, "ID", strlen("ID")) < 0) goto fail;
2580
0
    if (bcf_hrec_set_val(hrec, hrec->nkeys-1, name, strlen(name), 0) < 0)
2581
0
        goto fail;
2582
0
    if (bcf_hdr_add_hrec(h, hrec) < 0)
2583
0
        goto fail;
2584
0
    return 0;
2585
2586
0
 fail:
2587
0
    save_errno = errno;
2588
0
    hts_log_error("%s", strerror(errno));
2589
0
    if (hrec) bcf_hrec_destroy(hrec);
2590
0
    errno = save_errno;
2591
0
    return -1;
2592
0
}
2593
2594
bcf_hdr_t *vcf_hdr_read(htsFile *fp)
2595
1.29k
{
2596
1.29k
    kstring_t txt, *s = &fp->line;
2597
1.29k
    int ret;
2598
1.29k
    bcf_hdr_t *h;
2599
1.29k
    tbx_t *idx = NULL;
2600
1.29k
    const char **names = NULL;
2601
1.29k
    h = bcf_hdr_init("r");
2602
1.29k
    if (!h) {
2603
0
        hts_log_error("Failed to allocate bcf header");
2604
0
        return NULL;
2605
0
    }
2606
1.29k
    txt.l = txt.m = 0; txt.s = 0;
2607
62.2k
    while ((ret = hts_getline(fp, KS_SEP_LINE, s)) >= 0) {
2608
62.0k
        int e = 0;
2609
62.0k
        if (s->l == 0) continue;
2610
58.6k
        if (s->s[0] != '#') {
2611
6
            hts_log_error("No sample line");
2612
6
            goto error;
2613
6
        }
2614
58.6k
        if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here
2615
0
            kstring_t tmp = { 0, 0, NULL };
2616
0
            hFILE *f = hopen(fp->fn_aux, "r");
2617
0
            if (f == NULL) {
2618
0
                hts_log_error("Couldn't open \"%s\"", fp->fn_aux);
2619
0
                goto error;
2620
0
            }
2621
0
            while (tmp.l = 0, kgetline(&tmp, (kgets_func *) hgets, f) >= 0) {
2622
0
                char *tab = strchr(tmp.s, '\t');
2623
0
                if (tab == NULL) continue;
2624
0
                e |= (kputs("##contig=<ID=", &txt) < 0);
2625
0
                e |= (kputsn(tmp.s, tab - tmp.s, &txt) < 0);
2626
0
                e |= (kputs(",length=", &txt) < 0);
2627
0
                e |= (kputl(atol(tab), &txt) < 0);
2628
0
                e |= (kputsn(">\n", 2, &txt) < 0);
2629
0
            }
2630
0
            free(tmp.s);
2631
0
            if (hclose(f) != 0) {
2632
0
                hts_log_error("Error on closing %s", fp->fn_aux);
2633
0
                goto error;
2634
0
            }
2635
0
            if (e) goto error;
2636
0
        }
2637
58.6k
        if (kputsn(s->s, s->l, &txt) < 0) goto error;
2638
58.6k
        if (kputc('\n', &txt) < 0) goto error;
2639
58.6k
        if (s->s[1] != '#') break;
2640
58.6k
    }
2641
1.28k
    if ( ret < -1 ) goto error;
2642
1.28k
    if ( !txt.s )
2643
0
    {
2644
0
        hts_log_error("Could not read the header");
2645
0
        goto error;
2646
0
    }
2647
1.28k
    if ( bcf_hdr_parse(h, txt.s) < 0 ) goto error;
2648
2649
    // check tabix index, are all contigs listed in the header? add the missing ones
2650
1.07k
    idx = tbx_index_load3(fp->fn, NULL, HTS_IDX_SILENT_FAIL);
2651
1.07k
    if ( idx )
2652
0
    {
2653
0
        int i, n, need_sync = 0;
2654
0
        names = tbx_seqnames(idx, &n);
2655
0
        if (!names) goto error;
2656
0
        for (i=0; i<n; i++)
2657
0
        {
2658
0
            bcf_hrec_t *hrec = bcf_hdr_get_hrec(h, BCF_HL_CTG, "ID", (char*) names[i], NULL);
2659
0
            if ( hrec ) continue;
2660
0
            if (add_missing_contig_hrec(h, names[i]) < 0) goto error;
2661
0
            need_sync = 1;
2662
0
        }
2663
0
        if ( need_sync ) {
2664
0
            if (bcf_hdr_sync(h) < 0) goto error;
2665
0
        }
2666
0
        free(names);
2667
0
        tbx_destroy(idx);
2668
0
    }
2669
1.07k
    free(txt.s);
2670
1.07k
    return h;
2671
2672
214
 error:
2673
214
    if (idx) tbx_destroy(idx);
2674
214
    free(names);
2675
214
    free(txt.s);
2676
214
    if (h) bcf_hdr_destroy(h);
2677
214
    return NULL;
2678
1.07k
}
2679
2680
int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname)
2681
0
{
2682
0
    int i = 0, n = 0, save_errno;
2683
0
    char **lines = hts_readlines(fname, &n);
2684
0
    if ( !lines ) return 1;
2685
0
    for (i=0; i<n-1; i++)
2686
0
    {
2687
0
        int k;
2688
0
        bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,lines[i],&k);
2689
0
        if (!hrec) goto fail;
2690
0
        if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
2691
0
            bcf_hrec_destroy(hrec);
2692
0
            goto fail;
2693
0
        }
2694
0
        free(lines[i]);
2695
0
        lines[i] = NULL;
2696
0
    }
2697
0
    if (bcf_hdr_parse_sample_line(hdr, lines[n-1]) < 0) goto fail;
2698
0
    if (bcf_hdr_sync(hdr) < 0) goto fail;
2699
0
    free(lines[n-1]);
2700
0
    free(lines);
2701
0
    return 0;
2702
2703
0
 fail:
2704
0
    save_errno = errno;
2705
0
    for (; i < n; i++)
2706
0
        free(lines[i]);
2707
0
    free(lines);
2708
0
    errno = save_errno;
2709
0
    return 1;
2710
0
}
2711
2712
static int _bcf_hrec_format(const bcf_hrec_t *hrec, int is_bcf, kstring_t *str)
2713
6.09k
{
2714
6.09k
    uint32_t e = 0;
2715
6.09k
    if ( !hrec->value )
2716
3.53k
    {
2717
3.53k
        int j, nout = 0;
2718
3.53k
        e |= ksprintf(str, "##%s=<", hrec->key) < 0;
2719
12.6k
        for (j=0; j<hrec->nkeys; j++)
2720
9.12k
        {
2721
            // do not output IDX if output is VCF
2722
9.12k
            if ( !is_bcf && !strcmp("IDX",hrec->keys[j]) ) continue;
2723
7.85k
            if ( nout ) e |= kputc(',',str) < 0;
2724
7.85k
            e |= ksprintf(str,"%s=%s", hrec->keys[j], hrec->vals[j]) < 0;
2725
7.85k
            nout++;
2726
7.85k
        }
2727
3.53k
        e |= ksprintf(str,">\n") < 0;
2728
3.53k
    }
2729
2.56k
    else
2730
2.56k
        e |= ksprintf(str,"##%s=%s\n", hrec->key,hrec->value) < 0;
2731
2732
6.09k
    return e == 0 ? 0 : -1;
2733
6.09k
}
2734
2735
int bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str)
2736
0
{
2737
0
    return _bcf_hrec_format(hrec,0,str);
2738
0
}
2739
2740
int bcf_hdr_format(const bcf_hdr_t *hdr, int is_bcf, kstring_t *str)
2741
1.18k
{
2742
1.18k
    int i, r = 0;
2743
7.28k
    for (i=0; i<hdr->nhrec; i++)
2744
6.09k
        r |= _bcf_hrec_format(hdr->hrec[i], is_bcf, str) < 0;
2745
2746
1.18k
    r |= ksprintf(str, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO") < 0;
2747
1.18k
    if ( bcf_hdr_nsamples(hdr) )
2748
344
    {
2749
344
        r |= ksprintf(str, "\tFORMAT") < 0;
2750
2.69k
        for (i=0; i<bcf_hdr_nsamples(hdr); i++)
2751
2.34k
            r |= ksprintf(str, "\t%s", hdr->samples[i]) < 0;
2752
344
    }
2753
1.18k
    r |= ksprintf(str, "\n") < 0;
2754
2755
1.18k
    return r ? -1 : 0;
2756
1.18k
}
2757
2758
char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len)
2759
0
{
2760
0
    kstring_t txt = {0,0,0};
2761
0
    if (bcf_hdr_format(hdr, is_bcf, &txt) < 0)
2762
0
        return NULL;
2763
0
    if ( len ) *len = txt.l;
2764
0
    return txt.s;
2765
0
}
2766
2767
const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *n)
2768
0
{
2769
0
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
2770
0
    int i, tid, m = kh_size(d);
2771
0
    const char **names = (const char**) calloc(m,sizeof(const char*));
2772
0
    if ( !names )
2773
0
    {
2774
0
        hts_log_error("Failed to allocate memory");
2775
0
        *n = 0;
2776
0
        return NULL;
2777
0
    }
2778
0
    khint_t k;
2779
0
    for (k=kh_begin(d); k<kh_end(d); k++)
2780
0
    {
2781
0
        if ( !kh_exist(d,k) ) continue;
2782
0
        if ( !kh_val(d, k).hrec[0] ) continue;  // removed via bcf_hdr_remove
2783
0
        tid = kh_val(d,k).id;
2784
0
        if ( tid >= m )
2785
0
        {
2786
            // This can happen after a contig has been removed from BCF header via bcf_hdr_remove()
2787
0
            if ( hts_resize(const char*, tid + 1, &m, &names, HTS_RESIZE_CLEAR)<0 )
2788
0
            {
2789
0
                hts_log_error("Failed to allocate memory");
2790
0
                *n = 0;
2791
0
                free(names);
2792
0
                return NULL;
2793
0
            }
2794
0
            m = tid + 1;
2795
0
        }
2796
0
        names[tid] = kh_key(d,k);
2797
0
    }
2798
    // ensure there are no gaps
2799
0
    for (i=0,tid=0; tid<m; i++,tid++)
2800
0
    {
2801
0
        while ( tid<m && !names[tid] ) tid++;
2802
0
        if ( tid==m ) break;
2803
0
        if ( i==tid ) continue;
2804
0
        names[i] = names[tid];
2805
0
        names[tid] = 0;
2806
0
    }
2807
0
    *n = i;
2808
0
    return names;
2809
0
}
2810
2811
int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h)
2812
594
{
2813
594
    kstring_t htxt = {0,0,0};
2814
594
    if (bcf_hdr_format(h, 0, &htxt) < 0) {
2815
0
        free(htxt.s);
2816
0
        return -1;
2817
0
    }
2818
594
    while (htxt.l && htxt.s[htxt.l-1] == '\0') --htxt.l; // kill trailing zeros
2819
594
    int ret;
2820
594
    if ( fp->format.compression!=no_compression ) {
2821
0
        ret = bgzf_write(fp->fp.bgzf, htxt.s, htxt.l);
2822
0
        if (bgzf_flush(fp->fp.bgzf) != 0) return -1;
2823
594
    } else {
2824
594
        ret = hwrite(fp->fp.hfile, htxt.s, htxt.l);
2825
594
    }
2826
594
    free(htxt.s);
2827
594
    return ret<0 ? -1 : 0;
2828
594
}
2829
2830
/***********************
2831
 *** Typed value I/O ***
2832
 ***********************/
2833
2834
int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
2835
134k
{
2836
134k
    int32_t max = INT32_MIN, min = INT32_MAX;
2837
134k
    int i;
2838
134k
    if (n <= 0) {
2839
1.93k
        return bcf_enc_size(s, 0, BCF_BT_NULL);
2840
133k
    } else if (n == 1) {
2841
24.4k
        return bcf_enc_int1(s, a[0]);
2842
108k
    } else {
2843
108k
        if (wsize <= 0) wsize = n;
2844
2845
        // Equivalent to:
2846
        // for (i = 0; i < n; ++i) {
2847
        //     if (a[i] == bcf_int32_missing || a[i] == bcf_int32_vector_end )
2848
        //         continue;
2849
        //     if (max < a[i]) max = a[i];
2850
        //     if (min > a[i]) min = a[i];
2851
        // }
2852
108k
        int max4[4] = {INT32_MIN, INT32_MIN, INT32_MIN, INT32_MIN};
2853
108k
        int min4[4] = {INT32_MAX, INT32_MAX, INT32_MAX, INT32_MAX};
2854
17.3M
        for (i = 0; i < (n&~3); i+=4) {
2855
            // bcf_int32_missing    == INT32_MIN and
2856
            // bcf_int32_vector_end == INT32_MIN+1.
2857
            // We skip these, but can mostly avoid explicit checking
2858
17.2M
            if (max4[0] < a[i+0]) max4[0] = a[i+0];
2859
17.2M
            if (max4[1] < a[i+1]) max4[1] = a[i+1];
2860
17.2M
            if (max4[2] < a[i+2]) max4[2] = a[i+2];
2861
17.2M
            if (max4[3] < a[i+3]) max4[3] = a[i+3];
2862
17.2M
            if (min4[0] > a[i+0] && a[i+0] > INT32_MIN+1) min4[0] = a[i+0];
2863
17.2M
            if (min4[1] > a[i+1] && a[i+1] > INT32_MIN+1) min4[1] = a[i+1];
2864
17.2M
            if (min4[2] > a[i+2] && a[i+2] > INT32_MIN+1) min4[2] = a[i+2];
2865
17.2M
            if (min4[3] > a[i+3] && a[i+3] > INT32_MIN+1) min4[3] = a[i+3];
2866
17.2M
        }
2867
108k
        min = min4[0];
2868
108k
        if (min > min4[1]) min = min4[1];
2869
108k
        if (min > min4[2]) min = min4[2];
2870
108k
        if (min > min4[3]) min = min4[3];
2871
108k
        max = max4[0];
2872
108k
        if (max < max4[1]) max = max4[1];
2873
108k
        if (max < max4[2]) max = max4[2];
2874
108k
        if (max < max4[3]) max = max4[3];
2875
249k
        for (; i < n; ++i) {
2876
140k
            if (max < a[i]) max = a[i];
2877
140k
            if (min > a[i] && a[i] > INT32_MIN+1) min = a[i];
2878
140k
        }
2879
2880
108k
        if (max <= BCF_MAX_BT_INT8 && min >= BCF_MIN_BT_INT8) {
2881
11.2k
            if (bcf_enc_size(s, wsize, BCF_BT_INT8) < 0 ||
2882
11.2k
                ks_resize(s, s->l + n) < 0)
2883
0
                return -1;
2884
11.2k
            uint8_t *p = (uint8_t *) s->s + s->l;
2885
2.67M
            for (i = 0; i < n; ++i, p++) {
2886
2.65M
                if ( a[i]==bcf_int32_vector_end )   *p = bcf_int8_vector_end;
2887
2.65M
                else if ( a[i]==bcf_int32_missing ) *p = bcf_int8_missing;
2888
70.2k
                else *p = a[i];
2889
2.65M
            }
2890
11.2k
            s->l += n;
2891
97.2k
        } else if (max <= BCF_MAX_BT_INT16 && min >= BCF_MIN_BT_INT16) {
2892
74.5k
            uint8_t *p;
2893
74.5k
            if (bcf_enc_size(s, wsize, BCF_BT_INT16) < 0 ||
2894
74.5k
                ks_resize(s, s->l + n * sizeof(int16_t)) < 0)
2895
0
                return -1;
2896
74.5k
            p = (uint8_t *) s->s + s->l;
2897
31.0M
            for (i = 0; i < n; ++i)
2898
30.9M
            {
2899
30.9M
                int16_t x;
2900
30.9M
                if ( a[i]==bcf_int32_vector_end ) x = bcf_int16_vector_end;
2901
30.9M
                else if ( a[i]==bcf_int32_missing ) x = bcf_int16_missing;
2902
617k
                else x = a[i];
2903
30.9M
                i16_to_le(x, p);
2904
30.9M
                p += sizeof(int16_t);
2905
30.9M
            }
2906
74.5k
            s->l += n * sizeof(int16_t);
2907
74.5k
        } else {
2908
22.7k
            uint8_t *p;
2909
22.7k
            if (bcf_enc_size(s, wsize, BCF_BT_INT32) < 0 ||
2910
22.7k
                ks_resize(s, s->l + n * sizeof(int32_t)) < 0)
2911
0
                return -1;
2912
22.7k
            p = (uint8_t *) s->s + s->l;
2913
35.3M
            for (i = 0; i < n; ++i) {
2914
35.3M
                i32_to_le(a[i], p);
2915
35.3M
                p += sizeof(int32_t);
2916
35.3M
            }
2917
22.7k
            s->l += n * sizeof(int32_t);
2918
22.7k
        }
2919
108k
    }
2920
2921
108k
    return 0;
2922
134k
}
2923
2924
#ifdef VCF_ALLOW_INT64
2925
static int bcf_enc_long1(kstring_t *s, int64_t x) {
2926
    uint32_t e = 0;
2927
    if (x <= BCF_MAX_BT_INT32 && x >= BCF_MIN_BT_INT32)
2928
        return bcf_enc_int1(s, x);
2929
    if (x == bcf_int64_vector_end) {
2930
        e |= bcf_enc_size(s, 1, BCF_BT_INT8);
2931
        e |= kputc(bcf_int8_vector_end, s) < 0;
2932
    } else if (x == bcf_int64_missing) {
2933
        e |= bcf_enc_size(s, 1, BCF_BT_INT8);
2934
        e |= kputc(bcf_int8_missing, s) < 0;
2935
    } else {
2936
        e |= bcf_enc_size(s, 1, BCF_BT_INT64);
2937
        e |= ks_expand(s, 8);
2938
        if (e == 0) { u64_to_le(x, (uint8_t *) s->s + s->l); s->l += 8; }
2939
    }
2940
    return e == 0 ? 0 : -1;
2941
}
2942
#endif
2943
2944
263k
static inline int serialize_float_array(kstring_t *s, size_t n, const float *a) {
2945
263k
    uint8_t *p;
2946
263k
    size_t i;
2947
263k
    size_t bytes = n * sizeof(float);
2948
2949
263k
    if (bytes / sizeof(float) != n) return -1;
2950
263k
    if (ks_resize(s, s->l + bytes) < 0) return -1;
2951
2952
263k
    p = (uint8_t *) s->s + s->l;
2953
75.1M
    for (i = 0; i < n; i++) {
2954
74.8M
        float_to_le(a[i], p);
2955
74.8M
        p += sizeof(float);
2956
74.8M
    }
2957
263k
    s->l += bytes;
2958
2959
263k
    return 0;
2960
263k
}
2961
2962
int bcf_enc_vfloat(kstring_t *s, int n, float *a)
2963
263k
{
2964
263k
    assert(n >= 0);
2965
263k
    bcf_enc_size(s, n, BCF_BT_FLOAT);
2966
263k
    serialize_float_array(s, n, a);
2967
263k
    return 0; // FIXME: check for errs in this function
2968
263k
}
2969
2970
int bcf_enc_vchar(kstring_t *s, int l, const char *a)
2971
1.77M
{
2972
1.77M
    bcf_enc_size(s, l, BCF_BT_CHAR);
2973
1.77M
    kputsn(a, l, s);
2974
1.77M
    return 0; // FIXME: check for errs in this function
2975
1.77M
}
2976
2977
// Special case of n==1 as it also occurs quite often in FORMAT data.
2978
// This version is also small enough to get inlined.
2979
2.88k
static inline int bcf_fmt_array1(kstring_t *s, int type, void *data) {
2980
2.88k
    uint32_t e = 0;
2981
2.88k
    uint8_t *p = (uint8_t *)data;
2982
2.88k
    int32_t v;
2983
2984
    // helps gcc more than clang here. In billions of cycles:
2985
    //          bcf_fmt_array1  bcf_fmt_array
2986
    // gcc7:    23.2            24.3
2987
    // gcc13:   21.6            23.0
2988
    // clang13: 27.1            27.8
2989
2.88k
    switch (type) {
2990
2.88k
    case BCF_BT_CHAR:
2991
2.88k
        e |= kputc_(*p == bcf_str_missing ? '.' : *p, s) < 0;
2992
2.88k
        break;
2993
2994
0
    case BCF_BT_INT8:
2995
0
        if (*(int8_t *)p != bcf_int8_vector_end) {
2996
0
            e |= ((*(int8_t *)p == bcf_int8_missing)
2997
0
                  ? kputc_('.', s)
2998
0
                  : kputw(*(int8_t *)p, s)) < 0;
2999
0
        }
3000
0
        break;
3001
0
    case BCF_BT_INT16:
3002
0
        v = le_to_i16(p);
3003
0
        if (v != bcf_int16_vector_end) {
3004
0
            e |= (v == bcf_int16_missing
3005
0
                  ? kputc_('.', s)
3006
0
                  : kputw(v, s)) < 0;
3007
0
        }
3008
0
        break;
3009
3010
0
    case BCF_BT_INT32:
3011
0
        v = le_to_i32(p);
3012
0
        if (v != bcf_int32_vector_end) {
3013
0
            e |= (v == bcf_int32_missing
3014
0
                  ? kputc_('.', s)
3015
0
                  : kputw(v, s)) < 0;
3016
0
        }
3017
0
        break;
3018
3019
0
    case BCF_BT_FLOAT:
3020
0
        v = le_to_u32(p);
3021
0
        if (v != bcf_float_vector_end) {
3022
0
            e |= (v == bcf_float_missing
3023
0
                  ? kputc_('.', s)
3024
0
                  : kputd(le_to_float(p), s)) < 0;
3025
0
        }
3026
0
        break;
3027
3028
0
    default:
3029
0
        hts_log_error("Unexpected type %d", type);
3030
0
        return -1;
3031
2.88k
    }
3032
3033
2.88k
    return e == 0 ? 0 : -1;
3034
2.88k
}
3035
3036
int bcf_fmt_array(kstring_t *s, int n, int type, void *data)
3037
1.49M
{
3038
1.49M
    int j = 0;
3039
1.49M
    uint32_t e = 0;
3040
1.49M
    if (n == 0) {
3041
920k
        return kputc_('.', s) >= 0 ? 0 : -1;
3042
920k
    }
3043
3044
574k
    if (type == BCF_BT_CHAR)
3045
181k
    {
3046
181k
        char *p = (char *)data;
3047
3048
        // Note bcf_str_missing is already accounted for in n==0 above.
3049
181k
        if (n >= 8) {
3050
49.4k
            char *p_end = memchr(p, 0, n);
3051
49.4k
            e |= kputsn(p, p_end ? p_end-p : n, s) < 0;
3052
131k
        } else {
3053
479k
            for (j = 0; j < n && *p; ++j, ++p)
3054
347k
               e |= kputc(*p, s) < 0;
3055
131k
        }
3056
181k
    }
3057
393k
    else
3058
393k
    {
3059
393k
        #define BRANCH(type_t, convert, is_missing, is_vector_end, kprint) { \
3060
393k
            uint8_t *p = (uint8_t *) data; \
3061
70.7M
            for (j=0; j<n; j++, p += sizeof(type_t))    \
3062
70.3M
            { \
3063
70.3M
                type_t v = convert(p); \
3064
70.3M
                if ( is_vector_end ) break; \
3065
70.3M
                if ( j ) e |= kputc_(',', s) < 0; \
3066
70.3M
                e |= (is_missing ? kputc('.', s) : kprint) < 0; \
3067
70.3M
            } \
3068
393k
        }
3069
393k
        switch (type) {
3070
112k
            case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8, v==bcf_int8_missing,  v==bcf_int8_vector_end,  kputw(v, s)); break;
3071
72.6k
            case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, v==bcf_int16_missing, v==bcf_int16_vector_end, kputw(v, s)); break;
3072
76.9k
            case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, v==bcf_int32_missing, v==bcf_int32_vector_end, kputw(v, s)); break;
3073
131k
            case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, v==bcf_float_missing, v==bcf_float_vector_end, kputd(le_to_float(p), s)); break;
3074
0
            default: hts_log_error("Unexpected type %d", type); exit(1); break;
3075
393k
        }
3076
393k
        #undef BRANCH
3077
393k
    }
3078
574k
    return e == 0 ? 0 : -1;
3079
574k
}
3080
3081
uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr)
3082
1.02M
{
3083
1.02M
    int x, type;
3084
1.02M
    x = bcf_dec_size(ptr, &ptr, &type);
3085
1.02M
    bcf_fmt_array(s, x, type, ptr);
3086
1.02M
    return ptr + (x << bcf_type_shift[type]);
3087
1.02M
}
3088
3089
/********************
3090
 *** VCF site I/O ***
3091
 ********************/
3092
3093
typedef struct {
3094
    int key;            // Key for h->id[BCF_DT_ID][key] vdict
3095
    int max_m;          // number of elements in field array (ie commas)
3096
    int size;           // field size (max_l or max_g*4 if is_gt)
3097
    int offset;         // offset of buf into h->mem
3098
    uint32_t is_gt:1,   // is genotype
3099
             max_g:31;  // maximum number of genotypes
3100
    uint32_t max_l;     // length of field
3101
    uint32_t y;         // h->id[0][fmt[j].key].val->info[BCF_HL_FMT]
3102
    uint8_t *buf;       // Pointer into h->mem
3103
} fmt_aux_t;
3104
3105
// fmt_aux_t field notes:
3106
// max_* are biggest sizes of the various FORMAT fields across all samples.
3107
// We use these after pivoting the data to ensure easy random access
3108
// of a specific sample.
3109
//
3110
// max_m is only used for type BCF_HT_REAL or BCF_HT_INT
3111
// max_g is only used for is_gt == 1 (will be BCF_HT_STR)
3112
// max_l is only used for is_gt == 0 (will be BCF_HT_STR)
3113
//
3114
// These are computed in vcf_parse_format_max3 and used in
3115
// vcf_parse_format_alloc4 to get the size.
3116
//
3117
// size is computed from max_g, max_l, max_m and is_gt.  Once computed
3118
// the max values are never accessed again.
3119
//
3120
// In theory all 4 vars could be coalesced into a single variable, but this
3121
// significantly harms speed (even if done via a union).  It's about 25-30%
3122
// slower.
3123
3124
static inline int align_mem(kstring_t *s)
3125
45.6k
{
3126
45.6k
    int e = 0;
3127
45.6k
    if (s->l&7) {
3128
5.24k
        uint64_t zero = 0;
3129
5.24k
        e = kputsn((char*)&zero, 8 - (s->l&7), s) < 0;
3130
5.24k
    }
3131
45.6k
    return e == 0 ? 0 : -1;
3132
45.6k
}
3133
3134
46.3k
#define MAX_N_FMT 255   /* Limited by size of bcf1_t n_fmt field */
3135
3136
// detect FORMAT "."
3137
static int vcf_parse_format_empty1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3138
8.56k
                                   const char *p, const char *q) {
3139
8.56k
    const char *end = s->s + s->l;
3140
8.56k
    if ( q>=end )
3141
6
    {
3142
6
        hts_log_error("FORMAT column with no sample columns starting at %s:%"PRIhts_pos"", bcf_seqname_safe(h,v), v->pos+1);
3143
6
        v->errcode |= BCF_ERR_NCOLS;
3144
6
        return -1;
3145
6
    }
3146
3147
8.56k
    v->n_fmt = 0;
3148
8.56k
    if ( p[0]=='.' && p[1]==0 ) // FORMAT field is empty "."
3149
142
    {
3150
142
        v->n_sample = bcf_hdr_nsamples(h);
3151
142
        return 1;
3152
142
    }
3153
3154
8.41k
    return 0;
3155
8.56k
}
3156
3157
// get format information from the dictionary
3158
static int vcf_parse_format_dict2(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3159
8.41k
                                  const char *p, const char *q, fmt_aux_t *fmt) {
3160
8.41k
    const vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
3161
8.41k
    char *t;
3162
8.41k
    int j;
3163
8.41k
    ks_tokaux_t aux1;
3164
3165
54.7k
    for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) {
3166
46.3k
        if (j >= MAX_N_FMT) {
3167
1
            v->errcode |= BCF_ERR_LIMITS;
3168
1
            hts_log_error("FORMAT column at %s:%"PRIhts_pos" lists more identifiers than htslib can handle",
3169
1
                bcf_seqname_safe(h,v), v->pos+1);
3170
1
            return -1;
3171
1
        }
3172
3173
46.2k
        *(char*)aux1.p = 0;
3174
46.2k
        khint_t k = kh_get(vdict, d, t);
3175
46.2k
        if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_FMT] == 15) {
3176
1.99k
            if ( t[0]=='.' && t[1]==0 )
3177
0
            {
3178
0
                hts_log_error("Invalid FORMAT tag name '.' at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3179
0
                v->errcode |= BCF_ERR_TAG_INVALID;
3180
0
                return -1;
3181
0
            }
3182
1.99k
            hts_log_warning("FORMAT '%s' at %s:%"PRIhts_pos" is not defined in the header, assuming Type=String", t, bcf_seqname_safe(h,v), v->pos+1);
3183
1.99k
            kstring_t tmp = {0,0,0};
3184
1.99k
            int l;
3185
1.99k
            ksprintf(&tmp, "##FORMAT=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", t);
3186
1.99k
            bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
3187
1.99k
            free(tmp.s);
3188
1.99k
            int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
3189
1.99k
            if (res < 0) bcf_hrec_destroy(hrec);
3190
1.99k
            if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
3191
3192
1.99k
            k = kh_get(vdict, d, t);
3193
1.99k
            v->errcode |= BCF_ERR_TAG_UNDEF;
3194
1.99k
            if (res || k == kh_end(d)) {
3195
9
                hts_log_error("Could not add dummy header for FORMAT '%s' at %s:%"PRIhts_pos, t, bcf_seqname_safe(h,v), v->pos+1);
3196
9
                v->errcode |= BCF_ERR_TAG_INVALID;
3197
9
                return -1;
3198
9
            }
3199
1.99k
        }
3200
46.2k
        fmt[j].max_l = fmt[j].max_m = fmt[j].max_g = 0;
3201
46.2k
        fmt[j].key = kh_val(d, k).id;
3202
46.2k
        fmt[j].is_gt = (t[0] == 'G' && t[1] == 'T' && !t[2]);
3203
46.2k
        fmt[j].y = h->id[0][fmt[j].key].val->info[BCF_HL_FMT];
3204
46.2k
        v->n_fmt++;
3205
46.2k
    }
3206
8.40k
    return 0;
3207
8.41k
}
3208
3209
// compute max
3210
static int vcf_parse_format_max3(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3211
8.40k
                                 char *p, char *q, fmt_aux_t *fmt) {
3212
8.40k
    int n_sample_ori = -1;
3213
8.40k
    char *r = q + 1;  // r: position in the format string
3214
8.40k
    int l = 0, m = 1, g = 1, j;
3215
8.40k
    v->n_sample = 0;  // m: max vector size, l: max field len, g: max number of alleles
3216
8.40k
    const char *end = s->s + s->l;
3217
3218
16.0k
    while ( r<end )
3219
15.9k
    {
3220
        // can we skip some samples?
3221
15.9k
        if ( h->keep_samples )
3222
0
        {
3223
0
            n_sample_ori++;
3224
0
            if ( !bit_array_test(h->keep_samples,n_sample_ori) )
3225
0
            {
3226
0
                while ( *r!='\t' && r<end ) r++;
3227
0
                if ( *r=='\t' ) { *r = 0; r++; }
3228
0
                continue;
3229
0
            }
3230
0
        }
3231
3232
        // collect fmt stats: max vector size, length, number of alleles
3233
15.9k
        j = 0;  // j-th format field
3234
15.9k
        fmt_aux_t *f = fmt;
3235
15.9k
        static char meta[256] = {
3236
            // \0 \t , / : |
3237
15.9k
            1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3238
15.9k
            0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1, 0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,
3239
15.9k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3240
15.9k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
3241
15.9k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3242
15.9k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3243
15.9k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3244
15.9k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3245
15.9k
        };
3246
3247
15.9k
        char *r_start = r;
3248
3.06M
        for (;;) {
3249
            // Quickly skip ahead to an appropriate meta-character
3250
3.43M
            while (!meta[(unsigned char)*r]) r++;
3251
3252
3.06M
            switch (*r) {
3253
3.03M
            case ',':
3254
3.03M
                m++;
3255
3.03M
                break;
3256
3257
235
            case '|':
3258
4.05k
            case '/':
3259
4.05k
                if (f->is_gt) g++;
3260
4.05k
                break;
3261
3262
7.49k
            case '\t':
3263
7.49k
                *r = 0; // fall through
3264
3265
7.49k
            default: // valid due to while loop above.
3266
15.9k
            case '\0':
3267
24.8k
            case ':':
3268
24.8k
                l = r - r_start; r_start = r;
3269
24.8k
                if (f->max_m < m) f->max_m = m;
3270
24.8k
                if (f->max_l < l) f->max_l = l;
3271
24.8k
                if (f->is_gt && f->max_g < g) f->max_g = g;
3272
24.8k
                l = 0, m = g = 1;
3273
24.8k
                if ( *r==':' ) {
3274
8.88k
                    j++; f++;
3275
8.88k
                    if ( j>=v->n_fmt ) {
3276
15
                        hts_log_error("Incorrect number of FORMAT fields at %s:%"PRIhts_pos"",
3277
15
                                      h->id[BCF_DT_CTG][v->rid].key, v->pos+1);
3278
15
                        v->errcode |= BCF_ERR_NCOLS;
3279
15
                        return -1;
3280
15
                    }
3281
15.9k
                } else goto end_for;
3282
8.87k
                break;
3283
3.06M
            }
3284
3.04M
            if ( r>=end ) break;
3285
3.04M
            r++;
3286
3.04M
        }
3287
15.9k
    end_for:
3288
15.9k
        v->n_sample++;
3289
15.9k
        if ( v->n_sample == bcf_hdr_nsamples(h) ) break;
3290
7.59k
        r++;
3291
7.59k
    }
3292
3293
8.39k
    return 0;
3294
8.40k
}
3295
3296
// allocate memory for arrays
3297
static int vcf_parse_format_alloc4(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3298
                                   const char *p, const char *q,
3299
8.39k
                                   fmt_aux_t *fmt) {
3300
8.39k
    kstring_t *mem = (kstring_t*)&h->mem;
3301
3302
8.39k
    int j;
3303
54.0k
    for (j = 0; j < v->n_fmt; ++j) {
3304
45.6k
        fmt_aux_t *f = &fmt[j];
3305
45.6k
        if ( !f->max_m ) f->max_m = 1;  // omitted trailing format field
3306
3307
45.6k
        if ((f->y>>4&0xf) == BCF_HT_STR) {
3308
45.6k
            f->size = f->is_gt? f->max_g << 2 : f->max_l;
3309
45.6k
        } else if ((f->y>>4&0xf) == BCF_HT_REAL || (f->y>>4&0xf) == BCF_HT_INT) {
3310
0
            f->size = f->max_m << 2;
3311
0
        } else {
3312
0
            hts_log_error("The format type %d at %s:%"PRIhts_pos" is currently not supported", f->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1);
3313
0
            v->errcode |= BCF_ERR_TAG_INVALID;
3314
0
            return -1;
3315
0
        }
3316
3317
45.6k
        if (align_mem(mem) < 0) {
3318
0
            hts_log_error("Memory allocation failure at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3319
0
            v->errcode |= BCF_ERR_LIMITS;
3320
0
            return -1;
3321
0
        }
3322
3323
        // Limit the total memory to ~2Gb per VCF row.  This should mean
3324
        // malformed VCF data is less likely to take excessive memory and/or
3325
        // time.
3326
45.6k
        if ((uint64_t) mem->l + v->n_sample * (uint64_t)f->size > INT_MAX) {
3327
0
            static int warned = 0;
3328
0
            if ( !warned ) hts_log_warning("Excessive memory required by FORMAT fields at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3329
0
            warned = 1;
3330
0
            v->errcode |= BCF_ERR_LIMITS;
3331
0
            f->size = -1;
3332
0
            f->offset = 0;
3333
0
            continue;
3334
0
        }
3335
3336
45.6k
        f->offset = mem->l;
3337
45.6k
        if (ks_resize(mem, mem->l + v->n_sample * (size_t)f->size) < 0) {
3338
0
            hts_log_error("Memory allocation failure at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3339
0
            v->errcode |= BCF_ERR_LIMITS;
3340
0
            return -1;
3341
0
        }
3342
45.6k
        mem->l += v->n_sample * f->size;
3343
45.6k
    }
3344
3345
8.39k
    {
3346
8.39k
        int j;
3347
54.0k
        for (j = 0; j < v->n_fmt; ++j)
3348
45.6k
            fmt[j].buf = (uint8_t*)mem->s + fmt[j].offset;
3349
8.39k
    }
3350
3351
    // check for duplicate tags
3352
8.39k
    int i;
3353
45.6k
    for (i=1; i<v->n_fmt; i++)
3354
37.2k
    {
3355
37.2k
        fmt_aux_t *ifmt = &fmt[i];
3356
37.2k
        if ( ifmt->size==-1 ) continue; // already marked for removal
3357
174k
        for (j=0; j<i; j++)
3358
164k
        {
3359
164k
            fmt_aux_t *jfmt = &fmt[j];
3360
164k
            if ( jfmt->size==-1 ) continue; // already marked for removal
3361
82.4k
            if ( ifmt->key!=jfmt->key ) continue;
3362
28.0k
            static int warned = 0;
3363
28.0k
            if ( !warned ) hts_log_warning("Duplicate FORMAT tag %s at %s:%"PRIhts_pos, bcf_hdr_int2id(h,BCF_DT_ID,ifmt->key), bcf_seqname_safe(h,v), v->pos+1);
3364
28.0k
            warned = 1;
3365
28.0k
            v->errcode |= BCF_ERR_TAG_INVALID;
3366
28.0k
            ifmt->size = -1;
3367
28.0k
            ifmt->offset = 0;
3368
28.0k
            break;
3369
82.4k
        }
3370
37.2k
    }
3371
8.39k
    return 0;
3372
8.39k
}
3373
3374
// Fill the sample fields
3375
static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3376
8.39k
                                  const char *p, const char *q, fmt_aux_t *fmt) {
3377
8.39k
    static int extreme_val_warned = 0;
3378
8.39k
    int n_sample_ori = -1;
3379
    // At beginning of the loop t points to the first char of a format
3380
8.39k
    const char *t = q + 1;
3381
8.39k
    int m = 0;   // m: sample id
3382
8.39k
    const int nsamples = bcf_hdr_nsamples(h);
3383
8.39k
    const char *end = s->s + s->l;
3384
3385
8.39k
    int ver = bcf_get_version(h, NULL);
3386
3387
24.2k
    while ( t<end )
3388
23.5k
    {
3389
        // can we skip some samples?
3390
23.5k
        if ( h->keep_samples )
3391
0
        {
3392
0
            n_sample_ori++;
3393
0
            if ( !bit_array_test(h->keep_samples,n_sample_ori) )
3394
0
            {
3395
0
                while ( *t && t<end ) t++;
3396
0
                t++;
3397
0
                continue;
3398
0
            }
3399
0
        }
3400
23.5k
        if ( m == nsamples ) break;
3401
3402
15.9k
        int j = 0; // j-th format field, m-th sample
3403
24.7k
        while ( t < end )
3404
24.2k
        {
3405
24.2k
            fmt_aux_t *z = &fmt[j++];
3406
24.2k
            const int htype = z->y>>4&0xf;
3407
24.2k
            if (!z->buf) {
3408
2
                hts_log_error("Memory allocation failure for FORMAT field type %d at %s:%"PRIhts_pos,
3409
2
                              z->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1);
3410
2
                v->errcode |= BCF_ERR_LIMITS;
3411
2
                return -1;
3412
2
            }
3413
3414
24.2k
            if ( z->size==-1 )
3415
2.84k
            {
3416
                // this field is to be ignored, it's either too big or a duplicate
3417
35.3k
                while ( *t != ':' && *t ) t++;
3418
2.84k
            }
3419
21.4k
            else if (htype == BCF_HT_STR) {
3420
21.4k
                int l;
3421
21.4k
                if (z->is_gt) {
3422
                    // Genotypes.
3423
                    //([/|])?<val>)([|/]<val>)+... where <val> is [0-9]+ or ".".
3424
3.68k
                    int32_t is_phased = 0;
3425
3.68k
                    uint32_t *x = (uint32_t*)(z->buf + z->size * (size_t)m);
3426
3.68k
                    uint32_t unreadable = 0;
3427
3.68k
                    uint32_t max = 0;
3428
3.68k
                    int overflow = 0, ploidy = 0, anyunphased = 0, \
3429
3.68k
                        phasingprfx = 0, unknown1 = 0;
3430
3431
                    /* with prefixed phasing, it is explicitly given for 1st one
3432
                    with non-prefixed, set based on ploidy and phasing of other
3433
                    alleles. */
3434
3.68k
                    if (ver >= VCF44 && (*t == '|' || *t == '/')) {
3435
                        // cache prefix and phasing status
3436
245
                        is_phased = *t++ == '|';
3437
245
                        phasingprfx = 1;
3438
245
                    }
3439
3440
6.69k
                    for (l = 0;; ++t) {
3441
6.69k
                        ploidy++;
3442
6.69k
                        if (*t == '.') {
3443
417
                            ++t, x[l++] = is_phased;
3444
417
                            if (l==1) {   //for 1st allele only
3445
374
                                unknown1 = 1;
3446
374
                            }
3447
6.27k
                        } else {
3448
6.27k
                            const char *tt = t;
3449
6.27k
                            uint32_t val;
3450
                            // Or "v->n_allele < 10", but it doesn't
3451
                            // seem to be any faster and this feels safer.
3452
6.27k
                            if (*t >= '0' && *t <= '9' &&
3453
6.19k
                                !(t[1] >= '0' && t[1] <= '9')) {
3454
2.57k
                                val = *t++ - '0';
3455
3.69k
                            } else {
3456
3.69k
                                val = hts_str2uint(t, (char **)&t,
3457
3.69k
                                                   sizeof(val) * CHAR_MAX - 2,
3458
3.69k
                                                   &overflow);
3459
3.69k
                                unreadable |= tt == t;
3460
3.69k
                            }
3461
6.27k
                            if (max < val) max = val;
3462
6.27k
                            x[l++] = (val + 1) << 1 | is_phased;
3463
6.27k
                        }
3464
6.69k
                        anyunphased |= (ploidy != 1) && !is_phased;
3465
6.69k
                        is_phased = (*t == '|');
3466
6.69k
                        if (*t != '|' && *t != '/') break;
3467
6.69k
                    }
3468
3.68k
                    if (!phasingprfx) { //get GT in v44 way when no prefixed phasing
3469
                        /* no explicit phasing for 1st allele, set based on
3470
                         other alleles and ploidy */
3471
3.43k
                        if (ploidy == 1) {  //implicitly phased
3472
1.20k
                            if (!unknown1) {
3473
871
                                x[0] |= 1;
3474
871
                            }
3475
2.23k
                        } else {            //set by other unphased alleles
3476
2.23k
                            x[0] |= (anyunphased)? 0 : 1;
3477
2.23k
                        }
3478
3.43k
                    }
3479
                    // Possibly check max against v->n_allele instead?
3480
3.68k
                    if (overflow || max > (INT32_MAX >> 1) - 1) {
3481
14
                        hts_log_error("Couldn't read GT data: value too large at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3482
14
                        return -1;
3483
14
                    }
3484
3.66k
                    if (unreadable) {
3485
5
                        hts_log_error("Couldn't read GT data: value not a number or '.' at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3486
5
                        return -1;
3487
5
                    }
3488
3.66k
                    if ( !l ) x[l++] = 0;   // An empty field, insert missing value
3489
4.22k
                    for (; l < z->size>>2; ++l)
3490
560
                        x[l] = bcf_int32_vector_end;
3491
3492
17.7k
                } else {
3493
                    // Otherwise arbitrary strings
3494
17.7k
                    char *x = (char*)z->buf + z->size * (size_t)m;
3495
3.31M
                    for (l = 0; *t != ':' && *t; ++t)
3496
3.29M
                        x[l++] = *t;
3497
17.7k
                    if (z->size > l)
3498
8.03k
                        memset(&x[l], 0, (z->size-l) * sizeof(*x));
3499
17.7k
                }
3500
3501
21.4k
            } else if (htype == BCF_HT_INT) {
3502
                // One or more integers in an array
3503
0
                int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
3504
0
                int l;
3505
0
                for (l = 0;; ++t) {
3506
0
                    if (*t == '.') {
3507
0
                        x[l++] = bcf_int32_missing, ++t; // ++t to skip "."
3508
0
                    } else {
3509
0
                        int overflow = 0;
3510
0
                        char *te;
3511
0
                        long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
3512
0
                        if ( te==t || overflow || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 )
3513
0
                        {
3514
0
                            if ( !extreme_val_warned )
3515
0
                            {
3516
0
                                hts_log_warning("Extreme FORMAT/%s value encountered and set to missing at %s:%"PRIhts_pos,
3517
0
                                                h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname_safe(h,v), v->pos+1);
3518
0
                                extreme_val_warned = 1;
3519
0
                            }
3520
0
                            tmp_val = bcf_int32_missing;
3521
0
                        }
3522
0
                        x[l++] = tmp_val;
3523
0
                        t = te;
3524
0
                    }
3525
0
                    if (*t != ',') break;
3526
0
                }
3527
0
                if ( !l )
3528
0
                    x[l++] = bcf_int32_missing;
3529
0
                for (; l < z->size>>2; ++l)
3530
0
                    x[l] = bcf_int32_vector_end;
3531
3532
0
            } else if (htype == BCF_HT_REAL) {
3533
                // One of more floating point values in an array
3534
0
                float *x = (float*)(z->buf + z->size * (size_t)m);
3535
0
                int l;
3536
0
                for (l = 0;; ++t) {
3537
0
                    if (*t == '.' && !isdigit_c(t[1])) {
3538
0
                        bcf_float_set_missing(x[l++]), ++t; // ++t to skip "."
3539
0
                    } else {
3540
0
                        int overflow = 0;
3541
0
                        char *te;
3542
0
                        float tmp_val = hts_str2dbl(t, &te, &overflow);
3543
0
                        if ( (te==t || overflow) && !extreme_val_warned )
3544
0
                        {
3545
0
                            hts_log_warning("Extreme FORMAT/%s value encountered at %s:%"PRIhts_pos, h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname(h,v), v->pos+1);
3546
0
                            extreme_val_warned = 1;
3547
0
                        }
3548
0
                        x[l++] = tmp_val;
3549
0
                        t = te;
3550
0
                    }
3551
0
                    if (*t != ',') break;
3552
0
                }
3553
0
                if ( !l )
3554
                    // An empty field, insert missing value
3555
0
                    bcf_float_set_missing(x[l++]);
3556
0
                for (; l < z->size>>2; ++l)
3557
0
                    bcf_float_set_vector_end(x[l]);
3558
0
            } else {
3559
0
                hts_log_error("Unknown FORMAT field type %d at %s:%"PRIhts_pos, htype, bcf_seqname_safe(h,v), v->pos+1);
3560
0
                v->errcode |= BCF_ERR_TAG_INVALID;
3561
0
                return -1;
3562
0
            }
3563
3564
24.2k
            if (*t == '\0') {
3565
15.4k
                break;
3566
15.4k
            }
3567
8.81k
            else if (*t == ':') {
3568
8.80k
                t++;
3569
8.80k
            }
3570
5
            else {
3571
5
                char buffer[8];
3572
5
                hts_log_error("Invalid character %s in '%s' FORMAT field at %s:%"PRIhts_pos"",
3573
5
                    hts_strprint(buffer, sizeof buffer, '\'', t, 1),
3574
5
                    h->id[BCF_DT_ID][z->key].key, bcf_seqname_safe(h,v), v->pos+1);
3575
5
                v->errcode |= BCF_ERR_CHAR;
3576
5
                return -1;
3577
5
            }
3578
24.2k
        }
3579
3580
        // fill end-of-vector values
3581
173k
        for (; j < v->n_fmt; ++j) {
3582
157k
            fmt_aux_t *z = &fmt[j];
3583
157k
            const int htype = z->y>>4&0xf;
3584
157k
            int l;
3585
3586
157k
            if (z->size == -1) // this field is to be ignored
3587
131k
                continue;
3588
3589
25.7k
            if (htype == BCF_HT_STR) {
3590
25.7k
                if (z->is_gt) {
3591
5.41k
                    int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
3592
5.41k
                    if (z->size) x[0] = bcf_int32_missing;
3593
9.94k
                    for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
3594
20.3k
                } else {
3595
20.3k
                    char *x = (char*)z->buf + z->size * (size_t)m;
3596
20.3k
                    if ( z->size ) {
3597
3.20k
                        x[0] = '.';
3598
3.20k
                        memset(&x[1], 0, (z->size-1) * sizeof(*x));
3599
3.20k
                    }
3600
20.3k
                }
3601
25.7k
            } else if (htype == BCF_HT_INT) {
3602
0
                int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
3603
0
                x[0] = bcf_int32_missing;
3604
0
                for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
3605
0
            } else if (htype == BCF_HT_REAL) {
3606
0
                float *x = (float*)(z->buf + z->size * (size_t)m);
3607
0
                bcf_float_set_missing(x[0]);
3608
0
                for (l = 1; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]);
3609
0
            }
3610
25.7k
        }
3611
3612
15.8k
        m++; t++;
3613
15.8k
    }
3614
3615
8.36k
    return 0;
3616
8.39k
}
3617
3618
// write individual genotype information
3619
static int vcf_parse_format_gt6(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3620
8.36k
                                const char *p, const char *q, fmt_aux_t *fmt) {
3621
8.36k
    kstring_t *str = &v->indiv;
3622
8.36k
    int i, need_downsize = 0;
3623
8.36k
    if (v->n_sample > 0) {
3624
53.9k
        for (i = 0; i < v->n_fmt; ++i) {
3625
45.5k
            fmt_aux_t *z = &fmt[i];
3626
45.5k
            if ( z->size==-1 ) {
3627
27.9k
                need_downsize = 1;
3628
27.9k
                continue;
3629
27.9k
            }
3630
17.5k
            bcf_enc_int1(str, z->key);
3631
17.5k
            if ((z->y>>4&0xf) == BCF_HT_STR && !z->is_gt) {
3632
13.7k
                bcf_enc_size(str, z->size, BCF_BT_CHAR);
3633
13.7k
                kputsn((char*)z->buf, z->size * (size_t)v->n_sample, str);
3634
13.7k
            } else if ((z->y>>4&0xf) == BCF_HT_INT || z->is_gt) {
3635
3.84k
                bcf_enc_vint(str, (z->size>>2) * v->n_sample, (int32_t*)z->buf, z->size>>2);
3636
3.84k
            } else {
3637
0
                bcf_enc_size(str, z->size>>2, BCF_BT_FLOAT);
3638
0
                if (serialize_float_array(str, (z->size>>2) * (size_t)v->n_sample,
3639
0
                                          (float *) z->buf) != 0) {
3640
0
                    v->errcode |= BCF_ERR_LIMITS;
3641
0
                    hts_log_error("Out of memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3642
0
                    return -1;
3643
0
                }
3644
0
            }
3645
17.5k
        }
3646
3647
8.36k
    }
3648
8.36k
    if ( need_downsize ) {
3649
3.69k
        i = 0;
3650
41.9k
        while ( i < v->n_fmt ) {
3651
38.2k
            if ( fmt[i].size==-1 )
3652
27.9k
            {
3653
27.9k
                v->n_fmt--;
3654
27.9k
                if ( i < v->n_fmt ) memmove(&fmt[i],&fmt[i+1],sizeof(*fmt)*(v->n_fmt-i));
3655
27.9k
            }
3656
10.2k
            else
3657
10.2k
                i++;
3658
38.2k
        }
3659
3.69k
    }
3660
8.36k
    return 0;
3661
8.36k
}
3662
3663
// validity checking
3664
8.36k
static int vcf_parse_format_check7(const bcf_hdr_t *h, bcf1_t *v) {
3665
8.36k
    if ( v->n_sample!=bcf_hdr_nsamples(h) )
3666
22
    {
3667
22
        hts_log_error("Number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)",
3668
22
            bcf_seqname_safe(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h));
3669
22
        v->errcode |= BCF_ERR_NCOLS;
3670
22
        return -1;
3671
22
    }
3672
8.34k
    if ( v->indiv.l > 0xffffffff )
3673
0
    {
3674
0
        hts_log_error("The FORMAT at %s:%"PRIhts_pos" is too long", bcf_seqname_safe(h,v), v->pos+1);
3675
0
        v->errcode |= BCF_ERR_LIMITS;
3676
3677
        // Error recovery: return -1 if this is a critical error or 0 if we want to ignore the FORMAT and proceed
3678
0
        v->n_fmt = 0;
3679
0
        return -1;
3680
0
    }
3681
3682
8.34k
    return 0;
3683
8.34k
}
3684
3685
// p,q is the start and the end of the FORMAT field
3686
static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3687
                            char *p, char *q)
3688
23.5k
{
3689
23.5k
    if ( !bcf_hdr_nsamples(h) ) return 0;
3690
8.56k
    kstring_t *mem = (kstring_t*)&h->mem;
3691
8.56k
    mem->l = 0;
3692
3693
8.56k
    fmt_aux_t fmt[MAX_N_FMT];
3694
3695
    // detect FORMAT "."
3696
8.56k
    int ret; // +ve = ok, -ve = err
3697
8.56k
    if ((ret = vcf_parse_format_empty1(s, h, v, p, q)))
3698
148
        return ret ? 0 : -1;
3699
3700
    // get format information from the dictionary
3701
8.41k
    if (vcf_parse_format_dict2(s, h, v, p, q, fmt) < 0)
3702
10
        return -1;
3703
3704
    // FORMAT data is per-sample A:B:C A:B:C A:B:C ... but in memory it is
3705
    // stored as per-type arrays AAA... BBB... CCC...  This is basically
3706
    // a data rotation or pivot.
3707
3708
    // The size of elements in the array grow to their maximum needed,
3709
    // permitting fast random access.  This means however we have to first
3710
    // scan the whole FORMAT line to find the maximum of each type, and
3711
    // then scan it again to find the store the data.
3712
    // We break this down into compute-max, allocate, fill-out-buffers
3713
3714
    // TODO: ?
3715
    // The alternative would be to pivot on the first pass, with fixed
3716
    // size entries for numerics and concatenated strings otherwise, also
3717
    // tracking maximum sizes.  Then on a second pass we reallocate and
3718
    // copy the data again to a uniformly sized array.  Two passes through
3719
    // memory, but without doubling string parsing.
3720
3721
    // compute max
3722
8.40k
    if (vcf_parse_format_max3(s, h, v, p, q, fmt) < 0)
3723
15
        return -1;
3724
3725
    // allocate memory for arrays
3726
8.39k
    if (vcf_parse_format_alloc4(s, h, v, p, q, fmt) < 0)
3727
0
        return -1;
3728
3729
    // fill the sample fields; at beginning of the loop
3730
8.39k
    if (vcf_parse_format_fill5(s, h, v, p, q, fmt) < 0)
3731
26
        return -1;
3732
3733
    // write individual genotype information
3734
8.36k
    if (vcf_parse_format_gt6(s, h, v, p, q, fmt) < 0)
3735
0
        return -1;
3736
3737
    // validity checking
3738
8.36k
    if (vcf_parse_format_check7(h, v) < 0)
3739
22
        return -1;
3740
3741
8.34k
    return 0;
3742
8.36k
}
3743
3744
1.48k
static khint_t fix_chromosome(const bcf_hdr_t *h, vdict_t *d, const char *p) {
3745
    // Simple error recovery for chromosomes not defined in the header. It will not help when VCF header has
3746
    // been already printed, but will enable tools like vcfcheck to proceed.
3747
3748
1.48k
    kstring_t tmp = {0,0,0};
3749
1.48k
    khint_t k;
3750
1.48k
    int l;
3751
1.48k
    if (ksprintf(&tmp, "##contig=<ID=%s>", p) < 0)
3752
0
        return kh_end(d);
3753
1.48k
    bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
3754
1.48k
    free(tmp.s);
3755
1.48k
    int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
3756
1.48k
    if (res < 0) bcf_hrec_destroy(hrec);
3757
1.48k
    if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
3758
1.48k
    k = kh_get(vdict, d, p);
3759
3760
1.48k
    return k;
3761
1.48k
}
3762
3763
24.6k
static int vcf_parse_filter(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) {
3764
24.6k
    int i, n_flt = 1, max_n_flt = 0;
3765
24.6k
    char *r, *t;
3766
24.6k
    int32_t *a_flt = NULL;
3767
24.6k
    ks_tokaux_t aux1;
3768
24.6k
    khint_t k;
3769
24.6k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
3770
    // count the number of filters
3771
24.6k
    if (*(q-1) == ';') *(q-1) = 0;
3772
177M
    for (r = p; *r; ++r)
3773
177M
        if (*r == ';') ++n_flt;
3774
24.6k
    if (n_flt > max_n_flt) {
3775
24.6k
        a_flt = malloc(n_flt * sizeof(*a_flt));
3776
24.6k
        if (!a_flt) {
3777
0
            hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3778
0
            v->errcode |= BCF_ERR_LIMITS; // No appropriate code?
3779
0
            return -1;
3780
0
        }
3781
24.6k
        max_n_flt = n_flt;
3782
24.6k
    }
3783
    // add filters
3784
928k
    for (t = kstrtok(p, ";", &aux1), i = 0; t; t = kstrtok(0, 0, &aux1)) {
3785
903k
        *(char*)aux1.p = 0;
3786
903k
        k = kh_get(vdict, d, t);
3787
903k
        if (k == kh_end(d))
3788
21.1k
        {
3789
            // Simple error recovery for FILTERs not defined in the header. It will not help when VCF header has
3790
            // been already printed, but will enable tools like vcfcheck to proceed.
3791
21.1k
            hts_log_warning("FILTER '%s' is not defined in the header", t);
3792
21.1k
            kstring_t tmp = {0,0,0};
3793
21.1k
            int l;
3794
21.1k
            ksprintf(&tmp, "##FILTER=<ID=%s,Description=\"Dummy\">", t);
3795
21.1k
            bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
3796
21.1k
            free(tmp.s);
3797
21.1k
            int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
3798
21.1k
            if (res < 0) bcf_hrec_destroy(hrec);
3799
21.1k
            if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
3800
21.1k
            k = kh_get(vdict, d, t);
3801
21.1k
            v->errcode |= BCF_ERR_TAG_UNDEF;
3802
21.1k
            if (res || k == kh_end(d)) {
3803
16
                hts_log_error("Could not add dummy header for FILTER '%s' at %s:%"PRIhts_pos, t, bcf_seqname_safe(h,v), v->pos+1);
3804
16
                v->errcode |= BCF_ERR_TAG_INVALID;
3805
16
                free(a_flt);
3806
16
                return -1;
3807
16
            }
3808
21.1k
        }
3809
903k
        a_flt[i++] = kh_val(d, k).id;
3810
903k
    }
3811
3812
24.6k
    bcf_enc_vint(str, n_flt, a_flt, -1);
3813
24.6k
    free(a_flt);
3814
3815
24.6k
    return 0;
3816
24.6k
}
3817
3818
26.2k
static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) {
3819
26.2k
    static int extreme_int_warned = 0, negative_rlen_warned = 0;
3820
26.2k
    int max_n_val = 0, overflow = 0;
3821
26.2k
    char *r, *key;
3822
26.2k
    khint_t k;
3823
26.2k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
3824
26.2k
    int32_t *a_val = NULL;
3825
3826
26.2k
    v->n_info = 0;
3827
26.2k
    if (*(q-1) == ';') *(q-1) = 0;
3828
1.83M
    for (r = key = p;; ++r) {
3829
1.83M
        int c;
3830
1.83M
        char *val, *end;
3831
110M
        while (*r > '=' || (*r != ';' && *r != '=' && *r != 0)) r++;
3832
1.83M
        if (v->n_info == UINT16_MAX) {
3833
0
            hts_log_error("Too many INFO entries at %s:%"PRIhts_pos,
3834
0
                          bcf_seqname_safe(h,v), v->pos+1);
3835
0
            v->errcode |= BCF_ERR_LIMITS;
3836
0
            goto fail;
3837
0
        }
3838
1.83M
        val = end = NULL;
3839
1.83M
        c = *r; *r = 0;
3840
1.83M
        if (c == '=') {
3841
961k
            val = r + 1;
3842
3843
240M
            for (end = val; *end != ';' && *end != 0; ++end);
3844
961k
            c = *end; *end = 0;
3845
961k
        } else end = r;
3846
1.83M
        if ( !*key ) { if (c==0) break; r = end; key = r + 1; continue; }  // faulty VCF, ";;" in the INFO
3847
1.81M
        k = kh_get(vdict, d, key);
3848
1.81M
        if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_INFO] == 15)
3849
13.2k
        {
3850
13.2k
            hts_log_warning("INFO '%s' is not defined in the header, assuming Type=String", key);
3851
13.2k
            kstring_t tmp = {0,0,0};
3852
13.2k
            int l;
3853
13.2k
            ksprintf(&tmp, "##INFO=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", key);
3854
13.2k
            bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
3855
13.2k
            free(tmp.s);
3856
13.2k
            int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
3857
13.2k
            if (res < 0) bcf_hrec_destroy(hrec);
3858
13.2k
            if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
3859
13.2k
            k = kh_get(vdict, d, key);
3860
13.2k
            v->errcode |= BCF_ERR_TAG_UNDEF;
3861
13.2k
            if (res || k == kh_end(d)) {
3862
27
                hts_log_error("Could not add dummy header for INFO '%s' at %s:%"PRIhts_pos, key, bcf_seqname_safe(h,v), v->pos+1);
3863
27
                v->errcode |= BCF_ERR_TAG_INVALID;
3864
27
                goto fail;
3865
27
            }
3866
13.2k
        }
3867
1.81M
        uint32_t y = kh_val(d, k).info[BCF_HL_INFO];
3868
1.81M
        ++v->n_info;
3869
1.81M
        bcf_enc_int1(str, kh_val(d, k).id);
3870
1.81M
        if (val == 0) {
3871
854k
            bcf_enc_size(str, 0, BCF_BT_NULL);
3872
961k
        } else if ((y>>4&0xf) == BCF_HT_FLAG || (y>>4&0xf) == BCF_HT_STR) { // if Flag has a value, treat it as a string
3873
138k
            bcf_enc_vchar(str, end - val, val);
3874
823k
        } else { // int/float value/array
3875
823k
            int i, n_val;
3876
823k
            char *t, *te;
3877
197M
            for (t = val, n_val = 1; *t; ++t) // count the number of values
3878
196M
                if (*t == ',') ++n_val;
3879
            // Check both int and float size in one step for simplicity
3880
823k
            if (n_val > max_n_val) {
3881
2.50k
                int32_t *a_tmp = (int32_t *)realloc(a_val, n_val * sizeof(*a_val));
3882
2.50k
                if (!a_tmp) {
3883
0
                    hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3884
0
                    v->errcode |= BCF_ERR_LIMITS; // No appropriate code?
3885
0
                    goto fail;
3886
0
                }
3887
2.50k
                a_val = a_tmp;
3888
2.50k
                max_n_val = n_val;
3889
2.50k
            }
3890
823k
            if ((y>>4&0xf) == BCF_HT_INT) {
3891
559k
                i = 0, t = val;
3892
559k
                int64_t val1;
3893
559k
                int is_int64 = 0;
3894
#ifdef VCF_ALLOW_INT64
3895
                if ( n_val==1 )
3896
                {
3897
                    overflow = 0;
3898
                    long long int tmp_val = hts_str2int(val, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
3899
                    if ( te==val ) tmp_val = bcf_int32_missing;
3900
                    else if ( overflow || tmp_val<BCF_MIN_BT_INT64 || tmp_val>BCF_MAX_BT_INT64 )
3901
                    {
3902
                        if ( !extreme_int_warned )
3903
                        {
3904
                            hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1);
3905
                            extreme_int_warned = 1;
3906
                        }
3907
                        tmp_val = bcf_int32_missing;
3908
                    }
3909
                    else
3910
                        is_int64 = 1;
3911
                    val1 = tmp_val;
3912
                    t = te;
3913
                    i = 1;  // this is just to avoid adding another nested block...
3914
                }
3915
#endif
3916
69.5M
                for (; i < n_val; ++i, ++t)
3917
69.0M
                {
3918
69.0M
                    overflow = 0;
3919
69.0M
                    long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
3920
69.0M
                    if ( te==t ) tmp_val = bcf_int32_missing;
3921
999k
                    else if ( overflow || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 )
3922
121k
                    {
3923
121k
                        if ( !extreme_int_warned )
3924
1
                        {
3925
1
                            hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1);
3926
1
                            extreme_int_warned = 1;
3927
1
                        }
3928
121k
                        tmp_val = bcf_int32_missing;
3929
121k
                    }
3930
69.0M
                    a_val[i] = tmp_val;
3931
91.9M
                    for (t = te; *t && *t != ','; t++);
3932
69.0M
                }
3933
559k
                if (n_val == 1) {
3934
#ifdef VCF_ALLOW_INT64
3935
                    if ( is_int64 )
3936
                    {
3937
                        v->unpacked |= BCF_IS_64BIT;
3938
                        bcf_enc_long1(str, val1);
3939
                    }
3940
                    else
3941
                        bcf_enc_int1(str, (int32_t)val1);
3942
#else
3943
454k
                    val1 = a_val[0];
3944
454k
                    bcf_enc_int1(str, (int32_t)val1);
3945
454k
#endif
3946
454k
                } else {
3947
104k
                    bcf_enc_vint(str, n_val, a_val, -1);
3948
104k
                }
3949
559k
                if (n_val==1 && (val1!=bcf_int32_missing || is_int64)
3950
374k
                    && memcmp(key, "END", 4) == 0)
3951
0
                {
3952
0
                    if ( val1 <= v->pos )
3953
0
                    {
3954
0
                        if ( !negative_rlen_warned )
3955
0
                        {
3956
0
                            hts_log_warning("INFO/END=%"PRIhts_pos" is smaller than POS at %s:%"PRIhts_pos,val1,bcf_seqname_safe(h,v),v->pos+1);
3957
0
                            negative_rlen_warned = 1;
3958
0
                        }
3959
0
                    }
3960
0
                }
3961
559k
            } else if ((y>>4&0xf) == BCF_HT_REAL) {
3962
263k
                float *val_f = (float *)a_val;
3963
75.1M
                for (i = 0, t = val; i < n_val; ++i, ++t)
3964
74.8M
                {
3965
74.8M
                    overflow = 0;
3966
74.8M
                    val_f[i] = hts_str2dbl(t, &te, &overflow);
3967
74.8M
                    if ( te==t || overflow ) // conversion failed
3968
74.0M
                        bcf_float_set_missing(val_f[i]);
3969
97.1M
                    for (t = te; *t && *t != ','; t++);
3970
74.8M
                }
3971
263k
                bcf_enc_vfloat(str, n_val, val_f);
3972
263k
            }
3973
823k
        }
3974
1.81M
        if (c == 0) break;
3975
1.80M
        r = end;
3976
1.80M
        key = r + 1;
3977
1.80M
    }
3978
3979
26.2k
    free(a_val);
3980
26.2k
    return 0;
3981
3982
27
 fail:
3983
27
    free(a_val);
3984
27
    return -1;
3985
26.2k
}
3986
3987
int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
3988
26.6k
{
3989
26.6k
    int ret = -2, overflow = 0;
3990
26.6k
    char *p, *q, *r, *t;
3991
26.6k
    kstring_t *str;
3992
26.6k
    khint_t k;
3993
26.6k
    ks_tokaux_t aux;
3994
3995
//#define NOT_DOT(p) strcmp((p), ".")
3996
//#define NOT_DOT(p) (!(*p == '.' && !p[1]))
3997
//#define NOT_DOT(p) ((*p) != '.' || (p)[1])
3998
//#define NOT_DOT(p) (q-p != 1 || memcmp(p, ".\0", 2))
3999
132k
#define NOT_DOT(p) (memcmp(p, ".\0", 2))
4000
4001
26.6k
    if (!s || !h || !v || !(s->s))
4002
0
        return ret;
4003
4004
    // Assumed in lots of places, but we may as well spot this early
4005
26.6k
    assert(sizeof(float) == sizeof(int32_t));
4006
4007
    // Ensure string we parse has space to permit some over-flow when during
4008
    // parsing.  Eg to do memcmp(key, "END", 4) in vcf_parse_info over
4009
    // the more straight forward looking strcmp, giving a speed advantage.
4010
26.6k
    if (ks_resize(s, s->l+4) < 0)
4011
0
        return -2;
4012
4013
    // Force our memory to be initialised so we avoid the technicality of
4014
    // undefined behaviour in using a 4-byte memcmp.  (The reality is this
4015
    // almost certainly is never detected by the compiler so has no impact,
4016
    // but equally so this code has minimal (often beneficial) impact on
4017
    // performance too.)
4018
26.6k
    s->s[s->l+0] = 0;
4019
26.6k
    s->s[s->l+1] = 0;
4020
26.6k
    s->s[s->l+2] = 0;
4021
26.6k
    s->s[s->l+3] = 0;
4022
4023
26.6k
    bcf_clear1(v);
4024
26.6k
    str = &v->shared;
4025
26.6k
    memset(&aux, 0, sizeof(ks_tokaux_t));
4026
4027
    // CHROM
4028
26.6k
    if (!(p = kstrtok(s->s, "\t", &aux)))
4029
0
        goto err;
4030
26.6k
    *(q = (char*)aux.p) = 0;
4031
4032
26.6k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
4033
26.6k
    k = kh_get(vdict, d, p);
4034
26.6k
    if (k == kh_end(d)) {
4035
1.48k
        hts_log_warning("Contig '%s' is not defined in the header. (Quick workaround: index the file with tabix.)", p);
4036
1.48k
        v->errcode = BCF_ERR_CTG_UNDEF;
4037
1.48k
        if ((k = fix_chromosome(h, d, p)) == kh_end(d)) {
4038
24
            hts_log_error("Could not add dummy header for contig '%s'", p);
4039
24
            v->errcode |= BCF_ERR_CTG_INVALID;
4040
24
            goto err;
4041
24
        }
4042
1.48k
    }
4043
26.5k
    v->rid = kh_val(d, k).id;
4044
4045
    // POS
4046
26.5k
    if (!(p = kstrtok(0, 0, &aux)))
4047
118
        goto err;
4048
26.4k
    *(q = (char*)aux.p) = 0;
4049
4050
26.4k
    overflow = 0;
4051
26.4k
    char *tmp = p;
4052
26.4k
    v->pos = hts_str2uint(p, &p, 62, &overflow);
4053
26.4k
    if (overflow) {
4054
1
        hts_log_error("Position value '%s' is too large", tmp);
4055
1
        goto err;
4056
26.4k
    } else if ( *p ) {
4057
19
        hts_log_error("Could not parse the position '%s'", tmp);
4058
19
        goto err;
4059
26.4k
    } else {
4060
26.4k
        v->pos -= 1;
4061
26.4k
    }
4062
26.4k
    if (v->pos >= INT32_MAX)
4063
978
        v->unpacked |= BCF_IS_64BIT;
4064
4065
    // ID
4066
26.4k
    if (!(p = kstrtok(0, 0, &aux)))
4067
6
        goto err;
4068
26.4k
    *(q = (char*)aux.p) = 0;
4069
4070
26.4k
    if (NOT_DOT(p)) bcf_enc_vchar(str, q - p, p);
4071
3
    else bcf_enc_size(str, 0, BCF_BT_CHAR);
4072
4073
    // REF
4074
26.4k
    if (!(p = kstrtok(0, 0, &aux)))
4075
11
        goto err;
4076
26.4k
    *(q = (char*)aux.p) = 0;
4077
4078
26.4k
    bcf_enc_vchar(str, q - p, p);
4079
26.4k
    v->n_allele = 1, v->rlen = q - p;
4080
4081
    // ALT
4082
26.4k
    if (!(p = kstrtok(0, 0, &aux)))
4083
4
        goto err;
4084
26.4k
    *(q = (char*)aux.p) = 0;
4085
4086
26.4k
    if (NOT_DOT(p)) {
4087
49.5M
        for (r = t = p;; ++r) {
4088
49.5M
            if (*r == ',' || *r == 0) {
4089
1.58M
                if (v->n_allele == UINT16_MAX) {
4090
3
                    hts_log_error("Too many ALT alleles at %s:%"PRIhts_pos,
4091
3
                                  bcf_seqname_safe(h,v), v->pos+1);
4092
3
                    v->errcode |= BCF_ERR_LIMITS;
4093
3
                    goto err;
4094
3
                }
4095
1.58M
                bcf_enc_vchar(str, r - t, t);
4096
1.58M
                t = r + 1;
4097
1.58M
                ++v->n_allele;
4098
1.58M
            }
4099
49.5M
            if (r == q) break;
4100
49.5M
        }
4101
26.0k
    }
4102
4103
    // QUAL
4104
26.4k
    if (!(p = kstrtok(0, 0, &aux)))
4105
16
        goto err;
4106
26.4k
    *(q = (char*)aux.p) = 0;
4107
4108
26.4k
    if (NOT_DOT(p)) v->qual = atof(p);
4109
318
    else bcf_float_set_missing(v->qual);
4110
26.4k
    if ( v->max_unpack && !(v->max_unpack>>1) ) goto end; // BCF_UN_STR
4111
4112
    // FILTER
4113
26.4k
    if (!(p = kstrtok(0, 0, &aux)))
4114
16
        goto err;
4115
26.4k
    *(q = (char*)aux.p) = 0;
4116
4117
26.4k
    if (NOT_DOT(p)) {
4118
24.6k
        if (vcf_parse_filter(str, h, v, p, q)) {
4119
16
            goto err;
4120
16
        }
4121
24.6k
    } else bcf_enc_vint(str, 0, 0, -1);
4122
26.3k
    if ( v->max_unpack && !(v->max_unpack>>2) ) goto end; // BCF_UN_FLT
4123
4124
    // INFO
4125
26.3k
    if (!(p = kstrtok(0, 0, &aux)))
4126
21
        goto err;
4127
26.3k
    *(q = (char*)aux.p) = 0;
4128
4129
26.3k
    if (NOT_DOT(p)) {
4130
26.2k
        if (vcf_parse_info(str, h, v, p, q)) {
4131
27
            goto err;
4132
27
        }
4133
26.2k
    }
4134
26.3k
    if ( v->max_unpack && !(v->max_unpack>>3) ) goto end;
4135
4136
    // FORMAT; optional
4137
26.3k
    p = kstrtok(0, 0, &aux);
4138
26.3k
    if (p) {
4139
23.5k
        *(q = (char*)aux.p) = 0;
4140
4141
23.5k
        if (vcf_parse_format(s, h, v, p, q)) {
4142
73
            goto err;
4143
73
        }
4144
23.5k
    }
4145
4146
26.2k
 end:
4147
26.2k
    v->rlen = get_rlen(h, v);    //set rlen based on version
4148
26.2k
    ret = 0;
4149
4150
26.6k
 err:
4151
26.6k
    return ret;
4152
26.2k
}
4153
4154
int vcf_open_mode(char *mode, const char *fn, const char *format)
4155
0
{
4156
0
    if (format == NULL) {
4157
        // Try to pick a format based on the filename extension
4158
0
        char extension[HTS_MAX_EXT_LEN];
4159
0
        if (find_file_extension(fn, extension) < 0) return -1;
4160
0
        return vcf_open_mode(mode, fn, extension);
4161
0
    }
4162
0
    else if (strcasecmp(format, "bcf") == 0) strcpy(mode, "b");
4163
0
    else if (strcasecmp(format, "vcf") == 0) strcpy(mode, "");
4164
0
    else if (strcasecmp(format, "vcf.gz") == 0 || strcasecmp(format, "vcf.bgz") == 0) strcpy(mode, "z");
4165
0
    else return -1;
4166
4167
0
    return 0;
4168
0
}
4169
4170
int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
4171
26.8k
{
4172
26.8k
    int ret;
4173
26.8k
    ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
4174
26.8k
    if (ret < 0) return ret;
4175
26.6k
    return vcf_parse1(&fp->line, h, v);
4176
26.8k
}
4177
4178
static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt)
4179
0
{
4180
0
    uint8_t *ptr_start = ptr;
4181
0
    fmt->id = bcf_dec_typed_int1(ptr, &ptr);
4182
0
    fmt->n = bcf_dec_size(ptr, &ptr, &fmt->type);
4183
0
    fmt->size = fmt->n << bcf_type_shift[fmt->type];
4184
0
    fmt->p = ptr;
4185
0
    fmt->p_off  = ptr - ptr_start;
4186
0
    fmt->p_free = 0;
4187
0
    ptr += n_sample * fmt->size;
4188
0
    fmt->p_len = ptr - fmt->p;
4189
0
    return ptr;
4190
0
}
4191
4192
static inline uint8_t *bcf_unpack_info_core1(uint8_t *ptr, bcf_info_t *info)
4193
1.05k
{
4194
1.05k
    uint8_t *ptr_start = ptr;
4195
1.05k
    int64_t len = 0;
4196
1.05k
    info->key = bcf_dec_typed_int1(ptr, &ptr);
4197
1.05k
    len = info->len = bcf_dec_size(ptr, &ptr, &info->type);
4198
1.05k
    info->vptr = ptr;
4199
1.05k
    info->vptr_off  = ptr - ptr_start;
4200
1.05k
    info->vptr_free = 0;
4201
1.05k
    info->v1.i = 0;
4202
1.05k
    if (info->len == 1) {
4203
63
        switch(info->type) {
4204
0
        case BCF_BT_INT8:
4205
63
        case BCF_BT_CHAR:
4206
63
            info->v1.i = *(int8_t*)ptr;
4207
63
            break;
4208
0
        case BCF_BT_INT16:
4209
0
            info->v1.i = le_to_i16(ptr);
4210
0
            len <<= 1;
4211
0
            break;
4212
0
        case BCF_BT_INT32:
4213
0
            info->v1.i = le_to_i32(ptr);
4214
0
            len <<= 2;
4215
0
            break;
4216
0
        case BCF_BT_FLOAT:
4217
0
            info->v1.f = le_to_float(ptr);
4218
0
            len <<= 2;
4219
0
            break;
4220
0
        case BCF_BT_INT64:
4221
0
            info->v1.i = le_to_i64(ptr);
4222
0
            len <<= 3;
4223
0
            break;
4224
63
        }
4225
988
    } else {
4226
988
        len <<= bcf_type_shift[info->type];
4227
988
    }
4228
1.05k
    ptr += len;
4229
4230
1.05k
    info->vptr_len = ptr - info->vptr;
4231
1.05k
    return ptr;
4232
1.05k
}
4233
4234
int bcf_unpack(bcf1_t *b, int which)
4235
25.7k
{
4236
25.7k
    if ( !b->shared.l ) return 0; // Building a new BCF record from scratch
4237
25.7k
    uint8_t *ptr = (uint8_t*)b->shared.s, *ptr_ori;
4238
25.7k
    int i;
4239
25.7k
    bcf_dec_t *d = &b->d;
4240
25.7k
    if (which & BCF_UN_FLT) which |= BCF_UN_STR;
4241
25.7k
    if (which & BCF_UN_INFO) which |= BCF_UN_SHR;
4242
25.7k
    if ((which&BCF_UN_STR) && !(b->unpacked&BCF_UN_STR))
4243
25.7k
    {
4244
25.7k
        kstring_t tmp;
4245
4246
        // ID
4247
25.7k
        tmp.l = 0; tmp.s = d->id; tmp.m = d->m_id;
4248
25.7k
        ptr_ori = ptr;
4249
25.7k
        ptr = bcf_fmt_sized_array(&tmp, ptr);
4250
25.7k
        b->unpack_size[0] = ptr - ptr_ori;
4251
25.7k
        kputc_('\0', &tmp);
4252
25.7k
        d->id = tmp.s; d->m_id = tmp.m;
4253
4254
        // REF and ALT are in a single block (d->als) and d->alleles are pointers into this block
4255
25.7k
        hts_expand(char*, b->n_allele, d->m_allele, d->allele); // NM: hts_expand() is a macro
4256
25.7k
        tmp.l = 0; tmp.s = d->als; tmp.m = d->m_als;
4257
25.7k
        ptr_ori = ptr;
4258
1.02M
        for (i = 0; i < b->n_allele; ++i) {
4259
            // Use offset within tmp.s as realloc may change pointer
4260
996k
            d->allele[i] = (char *)(intptr_t)tmp.l;
4261
996k
            ptr = bcf_fmt_sized_array(&tmp, ptr);
4262
996k
            kputc_('\0', &tmp);
4263
996k
        }
4264
25.7k
        b->unpack_size[1] = ptr - ptr_ori;
4265
25.7k
        d->als = tmp.s; d->m_als = tmp.m;
4266
4267
        // Convert our offsets within tmp.s back to pointers again
4268
1.02M
        for (i = 0; i < b->n_allele; ++i)
4269
996k
            d->allele[i] = d->als + (ptrdiff_t)d->allele[i];
4270
25.7k
        b->unpacked |= BCF_UN_STR;
4271
25.7k
    }
4272
25.7k
    if ((which&BCF_UN_FLT) && !(b->unpacked&BCF_UN_FLT)) { // FILTER
4273
25.7k
        ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1];
4274
25.7k
        ptr_ori = ptr;
4275
25.7k
        if (*ptr>>4) {
4276
24.0k
            int type;
4277
24.0k
            d->n_flt = bcf_dec_size(ptr, &ptr, &type);
4278
24.0k
            hts_expand(int, d->n_flt, d->m_flt, d->flt);
4279
316k
            for (i = 0; i < d->n_flt; ++i)
4280
292k
                d->flt[i] = bcf_dec_int1(ptr, type, &ptr);
4281
24.0k
        } else ++ptr, d->n_flt = 0;
4282
25.7k
        b->unpack_size[2] = ptr - ptr_ori;
4283
25.7k
        b->unpacked |= BCF_UN_FLT;
4284
25.7k
    }
4285
25.7k
    if ((which&BCF_UN_INFO) && !(b->unpacked&BCF_UN_INFO)) { // INFO
4286
0
        ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1] + b->unpack_size[2];
4287
0
        hts_expand(bcf_info_t, b->n_info, d->m_info, d->info);
4288
0
        for (i = 0; i < d->m_info; ++i) d->info[i].vptr_free = 0;
4289
0
        for (i = 0; i < b->n_info; ++i)
4290
0
            ptr = bcf_unpack_info_core1(ptr, &d->info[i]);
4291
0
        b->unpacked |= BCF_UN_INFO;
4292
0
    }
4293
25.7k
    if ((which&BCF_UN_FMT) && b->n_sample && !(b->unpacked&BCF_UN_FMT)) { // FORMAT
4294
0
        ptr = (uint8_t*)b->indiv.s;
4295
0
        hts_expand(bcf_fmt_t, b->n_fmt, d->m_fmt, d->fmt);
4296
0
        for (i = 0; i < d->m_fmt; ++i) d->fmt[i].p_free = 0;
4297
0
        for (i = 0; i < b->n_fmt; ++i)
4298
0
            ptr = bcf_unpack_fmt_core1(ptr, b->n_sample, &d->fmt[i]);
4299
0
        b->unpacked |= BCF_UN_FMT;
4300
0
    }
4301
25.7k
    return 0;
4302
25.7k
}
4303
4304
int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
4305
25.7k
{
4306
25.7k
    int i;
4307
25.7k
    int32_t max_dt_id = h->n[BCF_DT_ID];
4308
25.7k
    const char *chrom = bcf_seqname(h, v);
4309
25.7k
    if (!chrom) {
4310
0
        hts_log_error("Invalid BCF, CONTIG id=%d not present in the header",
4311
0
                      v->rid);
4312
0
        errno = EINVAL;
4313
0
        return -1;
4314
0
    }
4315
4316
25.7k
    bcf_unpack((bcf1_t*)v, BCF_UN_ALL & ~(BCF_UN_INFO|BCF_UN_FMT));
4317
4318
    // Cache of key lengths so we don't keep repeatedly using them.
4319
    // This assumes we're not modifying the header between successive calls
4320
    // to vcf_format, but that would lead to many other forms of breakage
4321
    // so it feels like a valid assumption to make.
4322
    //
4323
    // We cannot just do this in bcf_hdr_sync as some code (eg bcftools
4324
    // annotate) manipulates the headers directly without calling sync to
4325
    // refresh the data structures.  So we must do just-in-time length
4326
    // calculation during writes instead.
4327
25.7k
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
4328
25.7k
    if (!aux->key_len) {
4329
1.88k
        if (!(aux->key_len = calloc(h->n[BCF_DT_ID]+1, sizeof(*aux->key_len))))
4330
0
            return -1;
4331
1.88k
    }
4332
25.7k
    size_t *key_len = aux->key_len;
4333
4334
25.7k
    kputs(chrom, s); // CHROM
4335
25.7k
    kputc_('\t', s); kputll(v->pos + 1, s); // POS
4336
25.7k
    kputc_('\t', s); kputs(v->d.id ? v->d.id : ".", s); // ID
4337
25.7k
    kputc_('\t', s); // REF
4338
25.7k
    if (v->n_allele > 0) kputs(v->d.allele[0], s);
4339
0
    else kputc_('.', s);
4340
25.7k
    kputc_('\t', s); // ALT
4341
25.7k
    if (v->n_allele > 1) {
4342
996k
        for (i = 1; i < v->n_allele; ++i) {
4343
971k
            if (i > 1) kputc_(',', s);
4344
971k
            kputs(v->d.allele[i], s);
4345
971k
        }
4346
25.3k
    } else kputc_('.', s);
4347
25.7k
    kputc_('\t', s); // QUAL
4348
25.7k
    if ( bcf_float_is_missing(v->qual) ) kputc_('.', s); // QUAL
4349
25.4k
    else kputd(v->qual, s);
4350
25.7k
    kputc_('\t', s); // FILTER
4351
25.7k
    if (v->d.n_flt) {
4352
316k
        for (i = 0; i < v->d.n_flt; ++i) {
4353
292k
            int32_t idx = v->d.flt[i];
4354
292k
            if (idx < 0 || idx >= max_dt_id
4355
292k
                || h->id[BCF_DT_ID][idx].key == NULL) {
4356
0
                hts_log_error("Invalid BCF, the FILTER tag id=%d at %s:%"PRIhts_pos" not present in the header",
4357
0
                              idx, bcf_seqname_safe(h, v), v->pos + 1);
4358
0
                errno = EINVAL;
4359
0
                return -1;
4360
0
            }
4361
292k
            if (i) kputc_(';', s);
4362
292k
            if (!key_len[idx])
4363
40.9k
                key_len[idx] = strlen(h->id[BCF_DT_ID][idx].key);
4364
292k
            kputsn(h->id[BCF_DT_ID][idx].key, key_len[idx], s);
4365
292k
        }
4366
24.0k
    } else kputc_('.', s);
4367
4368
25.7k
    kputc_('\t', s); // INFO
4369
25.7k
    if (v->n_info) {
4370
13.5k
        uint8_t *ptr = v->shared.s
4371
13.5k
            ? (uint8_t *)v->shared.s + v->unpack_size[0] +
4372
13.5k
               v->unpack_size[1] + v->unpack_size[2]
4373
13.5k
            : NULL;
4374
13.5k
        int first = 1;
4375
13.5k
        bcf_info_t *info = v->d.info;
4376
4377
        // Note if we duplicate this code into custom packed and unpacked
4378
        // implementations then we gain a bit more speed, particularly with
4379
        // clang 13 (up to 5%).  Not sure why this is, but code duplication
4380
        // isn't pleasant and it's still faster adding packed support than
4381
        // not so it's a win, just not as good as it should be.
4382
13.5k
        const int info_packed = !(v->unpacked & BCF_UN_INFO) && v->shared.l;
4383
855k
        for (i = 0; i < v->n_info; ++i) {
4384
841k
            bcf_info_t in, *z;
4385
841k
            if (info_packed) {
4386
                // Use a local bcf_info_t when data is packed
4387
841k
                z = &in;
4388
841k
                z->key  = bcf_dec_typed_int1(ptr, &ptr);
4389
841k
                z->len  = bcf_dec_size(ptr, &ptr, &z->type);
4390
841k
                z->vptr = ptr;
4391
841k
                ptr += z->len << bcf_type_shift[z->type];
4392
841k
            } else {
4393
                // Else previously unpacked INFO struct
4394
0
                z = &info[i];
4395
4396
                // Also potentially since deleted
4397
0
                if ( !z->vptr ) continue;
4398
0
            }
4399
4400
841k
            bcf_idpair_t *id = z->key >= 0 && z->key < max_dt_id
4401
841k
                ? &h->id[BCF_DT_ID][z->key]
4402
841k
                : NULL;
4403
4404
841k
            if (!id || !id->key) {
4405
0
                hts_log_error("Invalid BCF, the INFO tag id=%d is %s at %s:%"PRIhts_pos,
4406
0
                              z->key,
4407
0
                              z->key < 0 ? "negative"
4408
0
                              : (z->key >= max_dt_id ? "too large" : "not present in the header"),
4409
0
                              bcf_seqname_safe(h, v), v->pos+1);
4410
0
                errno = EINVAL;
4411
0
                return -1;
4412
0
            }
4413
4414
            // KEY
4415
841k
            if (!key_len[z->key])
4416
9.87k
                key_len[z->key] = strlen(id->key);
4417
841k
            size_t id_len = key_len[z->key];
4418
841k
            if (ks_resize(s, s->l + 3 + id_len) < 0)
4419
0
                return -1;
4420
841k
            char *sptr = s->s + s->l;
4421
841k
            if ( !first ) {
4422
828k
                *sptr++ = ';';
4423
828k
                s->l++;
4424
828k
            }
4425
841k
            first = 0;
4426
841k
            memcpy(sptr, id->key, id_len);
4427
841k
            s->l += id_len;
4428
4429
            // VALUE
4430
841k
            if (z->len <= 0) continue;
4431
445k
            sptr[id_len] = '=';
4432
445k
            s->l++;
4433
4434
445k
            if (z->len != 1 || info_packed) {
4435
445k
                bcf_fmt_array(s, z->len, z->type, z->vptr);
4436
445k
            } else {
4437
                // Single length vectors are unpacked into their
4438
                // own info.v1 union and handled separately.
4439
0
                if (z->type == BCF_BT_FLOAT) {
4440
0
                    if ( bcf_float_is_missing(z->v1.f) )
4441
0
                        kputc_('.', s);
4442
0
                    else
4443
0
                        kputd(z->v1.f, s);
4444
0
                } else if (z->type == BCF_BT_CHAR) {
4445
0
                    kputc_(z->v1.i, s);
4446
0
                } else if (z->type < BCF_BT_INT64) {
4447
0
                    int64_t missing[] = {
4448
0
                        0, // BCF_BT_NULL
4449
0
                        bcf_int8_missing,
4450
0
                        bcf_int16_missing,
4451
0
                        bcf_int32_missing,
4452
0
                    };
4453
0
                    if (z->v1.i == missing[z->type])
4454
0
                        kputc_('.', s);
4455
0
                    else
4456
0
                        kputw(z->v1.i, s);
4457
0
                } else if (z->type == BCF_BT_INT64) {
4458
0
                    if (z->v1.i == bcf_int64_missing)
4459
0
                        kputc_('.', s);
4460
0
                    else
4461
0
                        kputll(z->v1.i, s);
4462
0
                } else {
4463
0
                    hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, z->type, bcf_seqname_safe(h, v), v->pos+1);
4464
0
                    errno = EINVAL;
4465
0
                    return -1;
4466
0
                }
4467
0
            }
4468
445k
        }
4469
13.5k
        if ( first ) kputc_('.', s);
4470
13.5k
    } else kputc_('.', s);
4471
4472
    // FORMAT and individual information
4473
25.7k
    if (v->n_sample) {
4474
8.33k
        int i,j;
4475
8.33k
        if ( v->n_fmt) {
4476
8.19k
            uint8_t *ptr = (uint8_t *)v->indiv.s;
4477
8.19k
            int gt_i = -1;
4478
8.19k
            bcf_fmt_t *fmt = v->d.fmt;
4479
8.19k
            int first = 1, ret = 0;
4480
8.19k
            int fmt_packed = !(v->unpacked & BCF_UN_FMT);
4481
4482
8.19k
            if (fmt_packed) {
4483
                // Local fmt as we have an array of num FORMAT keys,
4484
                // each of which points to N.Sample values.
4485
4486
                // No real gain to be had in handling unpacked data here,
4487
                // but it doesn't cost us much in complexity either and
4488
                // it gives us flexibility.
4489
8.19k
                fmt = malloc(v->n_fmt * sizeof(*fmt));
4490
8.19k
                if (!fmt)
4491
0
                    return -1;
4492
8.19k
            }
4493
4494
            // KEYS
4495
25.0k
            for (i = 0; i < (int)v->n_fmt; ++i) {
4496
16.8k
                bcf_fmt_t *z;
4497
16.8k
                z = &fmt[i];
4498
16.8k
                if (fmt_packed) {
4499
16.8k
                    z->id   = bcf_dec_typed_int1(ptr, &ptr);
4500
16.8k
                    z->n    = bcf_dec_size(ptr, &ptr, &z->type);
4501
16.8k
                    z->p    = ptr;
4502
16.8k
                    z->size = z->n << bcf_type_shift[z->type];
4503
16.8k
                    ptr += v->n_sample * z->size;
4504
16.8k
                }
4505
16.8k
                if ( !z->p ) continue;
4506
16.8k
                kputc_(!first ? ':' : '\t', s); first = 0;
4507
4508
16.8k
                bcf_idpair_t *id = z->id >= 0 && z->id < max_dt_id
4509
16.8k
                    ? &h->id[BCF_DT_ID][z->id]
4510
16.8k
                    : NULL;
4511
4512
16.8k
                if (!id || !id->key) {
4513
0
                    hts_log_error("Invalid BCF, the FORMAT tag id=%d at %s:%"PRIhts_pos" not present in the header", z->id, bcf_seqname_safe(h, v), v->pos+1);
4514
0
                    errno = EINVAL;
4515
0
                    if (fmt_packed)
4516
0
                        free(fmt);
4517
0
                    return -1;
4518
0
                }
4519
4520
16.8k
                if (!key_len[z->id])
4521
8.59k
                    key_len[z->id] = strlen(id->key);
4522
16.8k
                size_t id_len = key_len[z->id];
4523
16.8k
                kputsn(id->key, id_len, s);
4524
16.8k
                if (id_len == 2 && id->key[0] == 'G' && id->key[1] == 'T')
4525
3.76k
                    gt_i = i;
4526
16.8k
            }
4527
8.19k
            if ( first ) kputsn("\t.", 2, s);
4528
4529
            // VALUES per sample
4530
22.9k
            for (j = 0; j < v->n_sample; ++j) {
4531
14.7k
                kputc_('\t', s);
4532
14.7k
                first = 1;
4533
14.7k
                bcf_fmt_t *f = fmt;
4534
38.1k
                for (i = 0; i < (int)v->n_fmt; i++, f++) {
4535
32.0k
                    if ( !f->p ) continue;
4536
32.0k
                    if (!first) kputc_(':', s);
4537
32.0k
                    first = 0;
4538
32.0k
                    if (gt_i == i) {
4539
8.64k
                        if ((ret = bcf_format_gt_v2(h, f,j,s)) < 0) {
4540
0
                            hts_log_error("Failed to format GT value for sample %d, returned %d", i, ret);
4541
0
                            errno = EINVAL;
4542
0
                            if (fmt_packed)
4543
0
                                free(fmt);
4544
0
                            return -1;
4545
0
                        }
4546
8.64k
                        break;
4547
8.64k
                    }
4548
23.4k
                    else if (f->n == 1)
4549
2.01k
                        bcf_fmt_array1(s, f->type, f->p + j * (size_t)f->size);
4550
21.4k
                    else
4551
21.4k
                        bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size);
4552
32.0k
                }
4553
4554
                // Simpler loop post GT and at least 1 iteration
4555
21.1k
                for (i++, f++; i < (int)v->n_fmt; i++, f++) {
4556
6.37k
                    if ( !f->p ) continue;
4557
6.37k
                    kputc_(':', s);
4558
6.37k
                    if (f->n == 1)
4559
872
                        bcf_fmt_array1(s, f->type, f->p + j * (size_t)f->size);
4560
5.50k
                    else
4561
5.50k
                        bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size);
4562
6.37k
                }
4563
14.7k
                if ( first ) kputc_('.', s);
4564
14.7k
            }
4565
8.19k
            if (fmt_packed)
4566
8.19k
                free(fmt);
4567
8.19k
        }
4568
137
        else
4569
430
            for (j=0; j<=v->n_sample; j++)
4570
293
                kputsn("\t.", 2, s);
4571
8.33k
    }
4572
25.7k
    kputc('\n', s);
4573
25.7k
    return 0;
4574
25.7k
}
4575
4576
int vcf_write_line(htsFile *fp, kstring_t *line)
4577
0
{
4578
0
    int ret;
4579
0
    if ( line->s[line->l-1]!='\n' ) kputc('\n',line);
4580
0
    if ( fp->format.compression!=no_compression )
4581
0
        ret = bgzf_write(fp->fp.bgzf, line->s, line->l);
4582
0
    else
4583
0
        ret = hwrite(fp->fp.hfile, line->s, line->l);
4584
0
    return ret==line->l ? 0 : -1;
4585
0
}
4586
4587
int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
4588
25.7k
{
4589
25.7k
    ssize_t ret;
4590
25.7k
    fp->line.l = 0;
4591
25.7k
    if (vcf_format1(h, v, &fp->line) != 0)
4592
0
        return -1;
4593
25.7k
    if ( fp->format.compression!=no_compression ) {
4594
0
        if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
4595
0
            return -1;
4596
0
        if (fp->idx && !fp->fp.bgzf->mt)
4597
0
            hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf));
4598
0
        ret = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
4599
25.7k
    } else {
4600
25.7k
        ret = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
4601
25.7k
    }
4602
4603
25.7k
    if (fp->idx && fp->format.compression == bgzf) {
4604
0
        int tid;
4605
0
        if ((tid = hts_idx_tbi_name(fp->idx, v->rid, bcf_seqname_safe(h, v))) < 0)
4606
0
            return -1;
4607
4608
0
        if (bgzf_idx_push(fp->fp.bgzf, fp->idx,
4609
0
                          tid, v->pos, v->pos + v->rlen,
4610
0
                          bgzf_tell(fp->fp.bgzf), 1) < 0)
4611
0
            return -1;
4612
0
    }
4613
4614
25.7k
    return ret==fp->line.l ? 0 : -1;
4615
25.7k
}
4616
4617
/************************
4618
 * Data access routines *
4619
 ************************/
4620
4621
int bcf_hdr_id2int(const bcf_hdr_t *h, int which, const char *id)
4622
84.7k
{
4623
84.7k
    khint_t k;
4624
84.7k
    vdict_t *d = (vdict_t*)h->dict[which];
4625
84.7k
    k = kh_get(vdict, d, id);
4626
84.7k
    return k == kh_end(d)? -1 : kh_val(d, k).id;
4627
84.7k
}
4628
4629
4630
/********************
4631
 *** BCF indexing ***
4632
 ********************/
4633
4634
// Calculate number of index levels given min_shift and the header contig
4635
// list.  Also returns number of contigs in *nids_out.
4636
static int idx_calc_n_lvls_ids(const bcf_hdr_t *h, int *min_shift_in_out,
4637
                               int starting_n_lvls, int *nids_out)
4638
0
{
4639
0
    int n_lvls = starting_n_lvls, i, nids = 0;
4640
0
    int64_t max_len = 0;
4641
4642
0
    for (i = 0; i < h->n[BCF_DT_CTG]; ++i)
4643
0
    {
4644
0
        if ( !h->id[BCF_DT_CTG][i].val ) continue;
4645
0
        if ( max_len < h->id[BCF_DT_CTG][i].val->info[0] )
4646
0
            max_len = h->id[BCF_DT_CTG][i].val->info[0];
4647
0
        nids++;
4648
0
    }
4649
0
    if ( !max_len ) max_len = (1LL<<31) - 1;  // In case contig line is broken.
4650
4651
0
    hts_adjust_csi_settings(max_len, min_shift_in_out, &n_lvls);
4652
4653
0
    if (nids_out) *nids_out = nids;
4654
0
    return n_lvls;
4655
0
}
4656
4657
hts_idx_t *bcf_index(htsFile *fp, int min_shift)
4658
0
{
4659
0
    int n_lvls;
4660
0
    bcf1_t *b = NULL;
4661
0
    hts_idx_t *idx = NULL;
4662
0
    bcf_hdr_t *h;
4663
0
    int r;
4664
0
    h = bcf_hdr_read(fp);
4665
0
    if ( !h ) return NULL;
4666
0
    int nids = 0;
4667
0
    n_lvls = idx_calc_n_lvls_ids(h, &min_shift, 0, &nids);
4668
0
    idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
4669
0
    if (!idx) goto fail;
4670
0
    b = bcf_init1();
4671
0
    if (!b) goto fail;
4672
0
    while ((r = bcf_read1(fp,h, b)) >= 0) {
4673
0
        int ret;
4674
0
        ret = hts_idx_push(idx, b->rid, b->pos, b->pos + b->rlen, bgzf_tell(fp->fp.bgzf), 1);
4675
0
        if (ret < 0) goto fail;
4676
0
    }
4677
0
    if (r < -1) goto fail;
4678
0
    hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf));
4679
0
    bcf_destroy1(b);
4680
0
    bcf_hdr_destroy(h);
4681
0
    return idx;
4682
4683
0
 fail:
4684
0
    hts_idx_destroy(idx);
4685
0
    bcf_destroy1(b);
4686
0
    bcf_hdr_destroy(h);
4687
0
    return NULL;
4688
0
}
4689
4690
hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx)
4691
0
{
4692
0
    return fnidx? hts_idx_load2(fn, fnidx) : bcf_index_load(fn);
4693
0
}
4694
4695
hts_idx_t *bcf_index_load3(const char *fn, const char *fnidx, int flags)
4696
0
{
4697
0
    return hts_idx_load3(fn, fnidx, HTS_FMT_CSI, flags);
4698
0
}
4699
4700
int bcf_index_build3(const char *fn, const char *fnidx, int min_shift, int n_threads)
4701
0
{
4702
0
    htsFile *fp;
4703
0
    hts_idx_t *idx;
4704
0
    tbx_t *tbx;
4705
0
    int ret;
4706
0
    if ((fp = hts_open(fn, "rb")) == 0) return -2;
4707
0
    if (n_threads)
4708
0
        hts_set_threads(fp, n_threads);
4709
0
    if ( fp->format.compression!=bgzf ) { hts_close(fp); return -3; }
4710
0
    switch (fp->format.format) {
4711
0
        case bcf:
4712
0
            if (!min_shift) {
4713
0
                hts_log_error("TBI indices for BCF files are not supported");
4714
0
                ret = -1;
4715
0
            } else {
4716
0
                idx = bcf_index(fp, min_shift);
4717
0
                if (idx) {
4718
0
                    ret = hts_idx_save_as(idx, fn, fnidx, HTS_FMT_CSI);
4719
0
                    if (ret < 0) ret = -4;
4720
0
                    hts_idx_destroy(idx);
4721
0
                }
4722
0
                else ret = -1;
4723
0
            }
4724
0
            break;
4725
4726
0
        case vcf:
4727
0
            tbx = tbx_index(hts_get_bgzfp(fp), min_shift, &tbx_conf_vcf);
4728
0
            if (tbx) {
4729
0
                ret = hts_idx_save_as(tbx->idx, fn, fnidx, min_shift > 0 ? HTS_FMT_CSI : HTS_FMT_TBI);
4730
0
                if (ret < 0) ret = -4;
4731
0
                tbx_destroy(tbx);
4732
0
            }
4733
0
            else ret = -1;
4734
0
            break;
4735
4736
0
        default:
4737
0
            ret = -3;
4738
0
            break;
4739
0
    }
4740
0
    hts_close(fp);
4741
0
    return ret;
4742
0
}
4743
4744
int bcf_index_build2(const char *fn, const char *fnidx, int min_shift)
4745
0
{
4746
0
    return bcf_index_build3(fn, fnidx, min_shift, 0);
4747
0
}
4748
4749
int bcf_index_build(const char *fn, int min_shift)
4750
0
{
4751
0
    return bcf_index_build3(fn, NULL, min_shift, 0);
4752
0
}
4753
4754
// Initialise fp->idx for the current format type.
4755
// This must be called after the header has been written but no other data.
4756
0
static int vcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) {
4757
0
    int n_lvls, fmt;
4758
4759
0
    if (min_shift == 0) {
4760
0
        min_shift = 14;
4761
0
        n_lvls = 5;
4762
0
        fmt = HTS_FMT_TBI;
4763
0
    } else {
4764
        // Set initial n_lvls to match tbx_index()
4765
0
        int starting_n_lvls = (TBX_MAX_SHIFT - min_shift + 2) / 3;
4766
        // Increase if necessary
4767
0
        n_lvls = idx_calc_n_lvls_ids(h, &min_shift, starting_n_lvls, NULL);
4768
0
        fmt = HTS_FMT_CSI;
4769
0
    }
4770
4771
0
    fp->idx = hts_idx_init(0, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
4772
0
    if (!fp->idx) return -1;
4773
4774
    // Tabix meta data, added even in CSI for VCF
4775
0
    uint8_t conf[4*7];
4776
0
    u32_to_le(TBX_VCF, conf+0);  // fmt
4777
0
    u32_to_le(1,       conf+4);  // name col
4778
0
    u32_to_le(2,       conf+8);  // beg col
4779
0
    u32_to_le(0,       conf+12); // end col
4780
0
    u32_to_le('#',     conf+16); // comment
4781
0
    u32_to_le(0,       conf+20); // n.skip
4782
0
    u32_to_le(0,       conf+24); // ref name len
4783
0
    if (hts_idx_set_meta(fp->idx, sizeof(conf)*sizeof(*conf), (uint8_t *)conf, 1) < 0) {
4784
0
        hts_idx_destroy(fp->idx);
4785
0
        fp->idx = NULL;
4786
0
        return -1;
4787
0
    }
4788
0
    fp->fnidx = fnidx;
4789
4790
0
    return 0;
4791
0
}
4792
4793
// Initialise fp->idx for the current format type.
4794
// This must be called after the header has been written but no other data.
4795
0
int bcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) {
4796
0
    int n_lvls, nids = 0;
4797
4798
0
    if (fp->format.compression != bgzf) {
4799
0
        hts_log_error("Indexing is only supported on BGZF-compressed files");
4800
0
        return -3; // Matches no-compression return for bcf_index_build3()
4801
0
    }
4802
4803
0
    if (fp->format.format == vcf)
4804
0
        return vcf_idx_init(fp, h, min_shift, fnidx);
4805
4806
0
    if (!min_shift)
4807
0
        min_shift = 14;
4808
4809
0
    n_lvls = idx_calc_n_lvls_ids(h, &min_shift, 0, &nids);
4810
4811
0
    fp->idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
4812
0
    if (!fp->idx) return -1;
4813
0
    fp->fnidx = fnidx;
4814
4815
0
    return 0;
4816
0
}
4817
4818
// Finishes an index. Call after the last record has been written.
4819
// Returns 0 on success, <0 on failure.
4820
//
4821
// NB: same format as SAM/BAM as it uses bgzf.
4822
0
int bcf_idx_save(htsFile *fp) {
4823
0
    return sam_idx_save(fp);
4824
0
}
4825
4826
/*****************
4827
 *** Utilities ***
4828
 *****************/
4829
4830
int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src)
4831
0
{
4832
0
    int i, ndst_ori = dst->nhrec, need_sync = 0, ret = 0, res;
4833
0
    for (i=0; i<src->nhrec; i++)
4834
0
    {
4835
0
        if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value )
4836
0
        {
4837
0
            int j;
4838
0
            for (j=0; j<ndst_ori; j++)
4839
0
            {
4840
0
                if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
4841
4842
                // Checking only the key part of generic lines, otherwise
4843
                // the VCFs are too verbose. Should we perhaps add a flag
4844
                // to bcf_hdr_combine() and make this optional?
4845
0
                if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break;
4846
0
            }
4847
0
            if ( j>=ndst_ori ) {
4848
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4849
0
                if (res < 0) return -1;
4850
0
                need_sync += res;
4851
0
            }
4852
0
        }
4853
0
        else if ( src->hrec[i]->type==BCF_HL_STR )
4854
0
        {
4855
            // NB: we are ignoring fields without ID
4856
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4857
0
            if ( j>=0 )
4858
0
            {
4859
0
                bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key);
4860
0
                if ( !rec ) {
4861
0
                    res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4862
0
                    if (res < 0) return -1;
4863
0
                    need_sync += res;
4864
0
                }
4865
0
            }
4866
0
        }
4867
0
        else
4868
0
        {
4869
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4870
0
            assert( j>=0 ); // this should always be true for valid VCFs
4871
4872
0
            bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL);
4873
0
            if ( !rec ) {
4874
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4875
0
                if (res < 0) return -1;
4876
0
                need_sync += res;
4877
0
            } else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT )
4878
0
            {
4879
                // Check that both records are of the same type. The bcf_hdr_id2length
4880
                // macro cannot be used here because dst header is not synced yet.
4881
0
                vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID];
4882
0
                vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID];
4883
0
                khint_t k_src  = kh_get(vdict, d_src, src->hrec[i]->vals[0]);
4884
0
                khint_t k_dst  = kh_get(vdict, d_dst, src->hrec[i]->vals[0]);
4885
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) )
4886
0
                {
4887
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths",
4888
0
                        src->hrec[i]->vals[0]);
4889
0
                    ret |= 1;
4890
0
                }
4891
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) )
4892
0
                {
4893
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different types",
4894
0
                        src->hrec[i]->vals[0]);
4895
0
                    ret |= 1;
4896
0
                }
4897
0
            }
4898
0
        }
4899
0
    }
4900
0
    if ( need_sync ) {
4901
0
        if (bcf_hdr_sync(dst) < 0) return -1;
4902
0
    }
4903
0
    return ret;
4904
0
}
4905
4906
bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src)
4907
0
{
4908
0
    if ( !dst )
4909
0
    {
4910
        // this will effectively strip existing IDX attributes from src to become dst
4911
0
        dst = bcf_hdr_init("r");
4912
0
        kstring_t htxt = {0,0,0};
4913
0
        if (bcf_hdr_format(src, 0, &htxt) < 0) {
4914
0
            free(htxt.s);
4915
0
            return NULL;
4916
0
        }
4917
0
        if ( bcf_hdr_parse(dst, htxt.s) < 0 ) {
4918
0
            bcf_hdr_destroy(dst);
4919
0
            dst = NULL;
4920
0
        }
4921
0
        free(htxt.s);
4922
0
        return dst;
4923
0
    }
4924
4925
0
    int i, ndst_ori = dst->nhrec, need_sync = 0, res;
4926
0
    for (i=0; i<src->nhrec; i++)
4927
0
    {
4928
0
        if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value )
4929
0
        {
4930
0
            int j;
4931
0
            for (j=0; j<ndst_ori; j++)
4932
0
            {
4933
0
                if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
4934
4935
                // Checking only the key part of generic lines, otherwise
4936
                // the VCFs are too verbose. Should we perhaps add a flag
4937
                // to bcf_hdr_combine() and make this optional?
4938
0
                if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break;
4939
0
            }
4940
0
            if ( j>=ndst_ori ) {
4941
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4942
0
                if (res < 0) return NULL;
4943
0
                need_sync += res;
4944
0
            }
4945
0
            else if ( !strcmp(src->hrec[i]->key,"fileformat") )
4946
0
            {
4947
0
                int ver_src = bcf_get_version(src,src->hrec[i]->value);
4948
0
                int ver_dst = bcf_get_version(dst,dst->hrec[j]->value);
4949
0
                if ( ver_src > ver_dst )
4950
0
                {
4951
0
                    if (bcf_hdr_set_version(dst,src->hrec[i]->value) < 0)
4952
0
                        return NULL;
4953
0
                    need_sync = 1;
4954
0
                }
4955
0
            }
4956
0
        }
4957
0
        else if ( src->hrec[i]->type==BCF_HL_STR )
4958
0
        {
4959
            // NB: we are ignoring fields without ID
4960
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4961
0
            if ( j>=0 )
4962
0
            {
4963
0
                bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key);
4964
0
                if ( !rec ) {
4965
0
                    res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4966
0
                    if (res < 0) return NULL;
4967
0
                    need_sync += res;
4968
0
                }
4969
0
            }
4970
0
        }
4971
0
        else
4972
0
        {
4973
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4974
0
            assert( j>=0 ); // this should always be true for valid VCFs
4975
4976
0
            bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL);
4977
0
            if ( !rec ) {
4978
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4979
0
                if (res < 0) return NULL;
4980
0
                need_sync += res;
4981
0
            } else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT )
4982
0
            {
4983
                // Check that both records are of the same type. The bcf_hdr_id2length
4984
                // macro cannot be used here because dst header is not synced yet.
4985
0
                vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID];
4986
0
                vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID];
4987
0
                khint_t k_src  = kh_get(vdict, d_src, src->hrec[i]->vals[0]);
4988
0
                khint_t k_dst  = kh_get(vdict, d_dst, src->hrec[i]->vals[0]);
4989
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) )
4990
0
                {
4991
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths",
4992
0
                        src->hrec[i]->vals[0]);
4993
0
                }
4994
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) )
4995
0
                {
4996
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different types",
4997
0
                        src->hrec[i]->vals[0]);
4998
0
                }
4999
0
            }
5000
0
        }
5001
0
    }
5002
0
    if ( need_sync ) {
5003
0
        if (bcf_hdr_sync(dst) < 0) return NULL;
5004
0
    }
5005
0
    return dst;
5006
0
}
5007
5008
int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *line)
5009
0
{
5010
0
    int i;
5011
0
    if ( line->errcode )
5012
0
    {
5013
0
        char errordescription[1024] = "";
5014
0
        hts_log_error("Unchecked error (%d %s) at %s:%"PRIhts_pos", exiting", line->errcode, bcf_strerror(line->errcode, errordescription, sizeof(errordescription)),  bcf_seqname_safe(src_hdr,line), line->pos+1);
5015
0
        exit(1);
5016
0
    }
5017
0
    if ( src_hdr->ntransl==-1 ) return 0;    // no need to translate, all tags have the same id
5018
0
    if ( !src_hdr->ntransl )  // called for the first time, see what needs translating
5019
0
    {
5020
0
        int dict;
5021
0
        for (dict=0; dict<2; dict++)    // BCF_DT_ID and BCF_DT_CTG
5022
0
        {
5023
0
            src_hdr->transl[dict] = (int*) malloc(src_hdr->n[dict]*sizeof(int));
5024
0
            for (i=0; i<src_hdr->n[dict]; i++)
5025
0
            {
5026
0
                if ( !src_hdr->id[dict][i].key ) // gap left after removed BCF header lines
5027
0
                {
5028
0
                    src_hdr->transl[dict][i] = -1;
5029
0
                    continue;
5030
0
                }
5031
0
                src_hdr->transl[dict][i] = bcf_hdr_id2int(dst_hdr,dict,src_hdr->id[dict][i].key);
5032
0
                if ( src_hdr->transl[dict][i]!=-1 && i!=src_hdr->transl[dict][i] ) src_hdr->ntransl++;
5033
0
            }
5034
0
        }
5035
0
        if ( !src_hdr->ntransl )
5036
0
        {
5037
0
            free(src_hdr->transl[0]); src_hdr->transl[0] = NULL;
5038
0
            free(src_hdr->transl[1]); src_hdr->transl[1] = NULL;
5039
0
            src_hdr->ntransl = -1;
5040
0
        }
5041
0
        if ( src_hdr->ntransl==-1 ) return 0;
5042
0
    }
5043
0
    bcf_unpack(line,BCF_UN_ALL);
5044
5045
    // CHROM
5046
0
    if ( src_hdr->transl[BCF_DT_CTG][line->rid] >=0 ) line->rid = src_hdr->transl[BCF_DT_CTG][line->rid];
5047
5048
    // FILTER
5049
0
    for (i=0; i<line->d.n_flt; i++)
5050
0
    {
5051
0
        int src_id = line->d.flt[i];
5052
0
        if ( src_hdr->transl[BCF_DT_ID][src_id] >=0 )
5053
0
            line->d.flt[i] = src_hdr->transl[BCF_DT_ID][src_id];
5054
0
        line->d.shared_dirty |= BCF1_DIRTY_FLT;
5055
0
    }
5056
5057
    // INFO
5058
0
    for (i=0; i<line->n_info; i++)
5059
0
    {
5060
0
        int src_id = line->d.info[i].key;
5061
0
        int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
5062
0
        if ( dst_id<0 ) continue;
5063
0
        line->d.info[i].key = dst_id;
5064
0
        if ( !line->d.info[i].vptr ) continue;  // skip deleted
5065
0
        int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
5066
0
        int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
5067
0
        if ( src_size==dst_size )   // can overwrite
5068
0
        {
5069
0
            uint8_t *vptr = line->d.info[i].vptr - line->d.info[i].vptr_off;
5070
0
            if ( dst_size==BCF_BT_INT8 ) { vptr[1] = (uint8_t)dst_id; }
5071
0
            else if ( dst_size==BCF_BT_INT16 ) { *(uint16_t*)vptr = (uint16_t)dst_id; }
5072
0
            else { *(uint32_t*)vptr = (uint32_t)dst_id; }
5073
0
        }
5074
0
        else    // must realloc
5075
0
        {
5076
0
            bcf_info_t *info = &line->d.info[i];
5077
0
            kstring_t str = {0,0,0};
5078
0
            bcf_enc_int1(&str, dst_id);
5079
0
            bcf_enc_size(&str, info->len,info->type);
5080
0
            uint32_t vptr_off = str.l;
5081
0
            kputsn((char*)info->vptr, info->vptr_len, &str);
5082
0
            if( info->vptr_free ) free(info->vptr - info->vptr_off);
5083
0
            info->vptr_off = vptr_off;
5084
0
            info->vptr = (uint8_t*)str.s + info->vptr_off;
5085
0
            info->vptr_free = 1;
5086
0
            line->d.shared_dirty |= BCF1_DIRTY_INF;
5087
0
        }
5088
0
    }
5089
5090
    // FORMAT
5091
0
    for (i=0; i<line->n_fmt; i++)
5092
0
    {
5093
0
        int src_id = line->d.fmt[i].id;
5094
0
        int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
5095
0
        if ( dst_id<0 ) continue;
5096
0
        line->d.fmt[i].id = dst_id;
5097
0
        if( !line->d.fmt[i].p ) continue;  // skip deleted
5098
0
        int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
5099
0
        int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
5100
0
        if ( src_size==dst_size )   // can overwrite
5101
0
        {
5102
0
            uint8_t *p = line->d.fmt[i].p - line->d.fmt[i].p_off;    // pointer to the vector size (4bits) and BT type (4bits)
5103
0
            if ( dst_size==BCF_BT_INT8 ) { p[1] = dst_id; }
5104
0
            else if ( dst_size==BCF_BT_INT16 ) { i16_to_le(dst_id, p + 1); }
5105
0
            else { i32_to_le(dst_id, p + 1); }
5106
0
        }
5107
0
        else    // must realloc
5108
0
        {
5109
0
            bcf_fmt_t *fmt = &line->d.fmt[i];
5110
0
            kstring_t str = {0,0,0};
5111
0
            bcf_enc_int1(&str, dst_id);
5112
0
            bcf_enc_size(&str, fmt->n, fmt->type);
5113
0
            uint32_t p_off = str.l;
5114
0
            kputsn((char*)fmt->p, fmt->p_len, &str);
5115
0
            if( fmt->p_free ) free(fmt->p - fmt->p_off);
5116
0
            fmt->p_off = p_off;
5117
0
            fmt->p = (uint8_t*)str.s + fmt->p_off;
5118
0
            fmt->p_free = 1;
5119
0
            line->d.indiv_dirty = 1;
5120
0
        }
5121
0
    }
5122
0
    return 0;
5123
0
}
5124
5125
bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr)
5126
0
{
5127
0
    bcf_hdr_t *hout = bcf_hdr_init("r");
5128
0
    if (!hout) {
5129
0
        hts_log_error("Failed to allocate bcf header");
5130
0
        return NULL;
5131
0
    }
5132
0
    kstring_t htxt = {0,0,0};
5133
0
    if (bcf_hdr_format(hdr, 1, &htxt) < 0) {
5134
0
        free(htxt.s);
5135
0
        return NULL;
5136
0
    }
5137
0
    if ( bcf_hdr_parse(hout, htxt.s) < 0 ) {
5138
0
        bcf_hdr_destroy(hout);
5139
0
        hout = NULL;
5140
0
    }
5141
0
    free(htxt.s);
5142
0
    return hout;
5143
0
}
5144
5145
bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap)
5146
0
{
5147
0
    void *names_hash = khash_str2int_init();
5148
0
    kstring_t htxt = {0,0,0};
5149
0
    kstring_t str = {0,0,0};
5150
0
    bcf_hdr_t *h = bcf_hdr_init("w");
5151
0
    int r = 0;
5152
0
    if (!h || !names_hash) {
5153
0
        hts_log_error("Failed to allocate bcf header");
5154
0
        goto err;
5155
0
    }
5156
0
    if (bcf_hdr_format(h0, 1, &htxt) < 0) {
5157
0
        hts_log_error("Failed to get header text");
5158
0
        goto err;
5159
0
    }
5160
0
    bcf_hdr_set_version(h,bcf_hdr_get_version(h0));
5161
0
    int j;
5162
0
    for (j=0; j<n; j++) imap[j] = -1;
5163
0
    if ( bcf_hdr_nsamples(h0) > 0) {
5164
0
        char *p = find_chrom_header_line(htxt.s);
5165
0
        int i = 0, end = n? 8 : 7;
5166
0
        while ((p = strchr(p, '\t')) != 0 && i < end) ++i, ++p;
5167
0
        if (i != end) {
5168
0
            hts_log_error("Wrong number of columns in header #CHROM line");
5169
0
            goto err;
5170
0
        }
5171
0
        r |= kputsn(htxt.s, p - htxt.s, &str) < 0;
5172
0
        for (i = 0; i < n; ++i) {
5173
0
            if ( khash_str2int_has_key(names_hash,samples[i]) )
5174
0
            {
5175
0
                hts_log_error("Duplicate sample name \"%s\"", samples[i]);
5176
0
                goto err;
5177
0
            }
5178
0
            imap[i] = bcf_hdr_id2int(h0, BCF_DT_SAMPLE, samples[i]);
5179
0
            if (imap[i] < 0) continue;
5180
0
            r |= kputc('\t', &str) < 0;
5181
0
            r |= kputs(samples[i], &str) < 0;
5182
0
            r |= khash_str2int_inc(names_hash,samples[i]) < 0;
5183
0
        }
5184
0
    } else r |= kputsn(htxt.s, htxt.l, &str) < 0;
5185
0
    while (str.l && (!str.s[str.l-1] || str.s[str.l-1]=='\n') ) str.l--; // kill trailing zeros and newlines
5186
0
    r |= kputc('\n',&str) < 0;
5187
0
    if (r) {
5188
0
        hts_log_error("%s", strerror(errno));
5189
0
        goto err;
5190
0
    }
5191
0
    if ( bcf_hdr_parse(h, str.s) < 0 ) {
5192
0
        bcf_hdr_destroy(h);
5193
0
        h = NULL;
5194
0
    }
5195
0
    free(str.s);
5196
0
    free(htxt.s);
5197
0
    khash_str2int_destroy(names_hash);
5198
0
    return h;
5199
5200
0
 err:
5201
0
    ks_free(&str);
5202
0
    ks_free(&htxt);
5203
0
    khash_str2int_destroy(names_hash);
5204
0
    bcf_hdr_destroy(h);
5205
0
    return NULL;
5206
0
}
5207
5208
int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file)
5209
0
{
5210
0
    if ( samples && !strcmp("-",samples) ) return 0;            // keep all samples
5211
5212
0
    int i, narr = bit_array_size(bcf_hdr_nsamples(hdr));
5213
0
    hdr->keep_samples = (uint8_t*) calloc(narr,1);
5214
0
    if (!hdr->keep_samples) return -1;
5215
5216
0
    hdr->nsamples_ori = bcf_hdr_nsamples(hdr);
5217
0
    if ( !samples )
5218
0
    {
5219
        // exclude all samples
5220
0
        khint_t k;
5221
0
        vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE], *new_dict;
5222
0
        new_dict = kh_init(vdict);
5223
0
        if (!new_dict) return -1;
5224
5225
0
        bcf_hdr_nsamples(hdr) = 0;
5226
5227
0
        for (k = kh_begin(d); k != kh_end(d); ++k)
5228
0
            if (kh_exist(d, k)) free((char*)kh_key(d, k));
5229
0
        kh_destroy(vdict, d);
5230
0
        hdr->dict[BCF_DT_SAMPLE] = new_dict;
5231
0
        if (bcf_hdr_sync(hdr) < 0) return -1;
5232
5233
0
        return 0;
5234
0
    }
5235
5236
0
    if ( samples[0]=='^' )
5237
0
        for (i=0; i<bcf_hdr_nsamples(hdr); i++) bit_array_set(hdr->keep_samples,i);
5238
5239
0
    int idx, n, ret = 0;
5240
0
    char **smpls = hts_readlist(samples[0]=='^'?samples+1:samples, is_file, &n);
5241
0
    if ( !smpls ) return -1;
5242
0
    for (i=0; i<n; i++)
5243
0
    {
5244
0
        idx = bcf_hdr_id2int(hdr,BCF_DT_SAMPLE,smpls[i]);
5245
0
        if ( idx<0 )
5246
0
        {
5247
0
            if ( !ret ) ret = i+1;
5248
0
            continue;
5249
0
        }
5250
0
        assert( idx<bcf_hdr_nsamples(hdr) );
5251
0
        if (  samples[0]=='^' )
5252
0
            bit_array_clear(hdr->keep_samples, idx);
5253
0
        else
5254
0
            bit_array_set(hdr->keep_samples, idx);
5255
0
    }
5256
0
    for (i=0; i<n; i++) free(smpls[i]);
5257
0
    free(smpls);
5258
5259
0
    bcf_hdr_nsamples(hdr) = 0;
5260
0
    for (i=0; i<hdr->nsamples_ori; i++)
5261
0
        if ( bit_array_test(hdr->keep_samples,i) ) bcf_hdr_nsamples(hdr)++;
5262
5263
0
    if ( !bcf_hdr_nsamples(hdr) ) { free(hdr->keep_samples); hdr->keep_samples=NULL; }
5264
0
    else
5265
0
    {
5266
        // Make new list and dictionary with desired samples
5267
0
        char **samples = (char**) malloc(sizeof(char*)*bcf_hdr_nsamples(hdr));
5268
0
        vdict_t *new_dict, *d;
5269
0
        int k, res;
5270
0
        if (!samples) return -1;
5271
5272
0
        new_dict = kh_init(vdict);
5273
0
        if (!new_dict) {
5274
0
            free(samples);
5275
0
            return -1;
5276
0
        }
5277
0
        idx = 0;
5278
0
        for (i=0; i<hdr->nsamples_ori; i++) {
5279
0
            if ( bit_array_test(hdr->keep_samples,i) ) {
5280
0
                samples[idx] = hdr->samples[i];
5281
0
                k = kh_put(vdict, new_dict, hdr->samples[i], &res);
5282
0
                if (res < 0) {
5283
0
                    free(samples);
5284
0
                    kh_destroy(vdict, new_dict);
5285
0
                    return -1;
5286
0
                }
5287
0
                kh_val(new_dict, k) = bcf_idinfo_def;
5288
0
                kh_val(new_dict, k).id = idx;
5289
0
                idx++;
5290
0
            }
5291
0
        }
5292
5293
        // Delete desired samples from old dictionary, so we don't free them
5294
0
        d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE];
5295
0
        for (i=0; i < idx; i++) {
5296
0
            int k = kh_get(vdict, d, samples[i]);
5297
0
            if (k < kh_end(d)) kh_del(vdict, d, k);
5298
0
        }
5299
5300
        // Free everything else
5301
0
        for (k = kh_begin(d); k != kh_end(d); ++k)
5302
0
            if (kh_exist(d, k)) free((char*)kh_key(d, k));
5303
0
        kh_destroy(vdict, d);
5304
0
        hdr->dict[BCF_DT_SAMPLE] = new_dict;
5305
5306
0
        free(hdr->samples);
5307
0
        hdr->samples = samples;
5308
5309
0
        if (bcf_hdr_sync(hdr) < 0)
5310
0
            return -1;
5311
0
    }
5312
5313
0
    return ret;
5314
0
}
5315
5316
int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap)
5317
0
{
5318
0
    kstring_t ind;
5319
0
    ind.s = 0; ind.l = ind.m = 0;
5320
0
    if (n) {
5321
0
        bcf_fmt_t fmt[MAX_N_FMT];
5322
0
        int i, j;
5323
0
        uint8_t *ptr = (uint8_t*)v->indiv.s;
5324
0
        for (i = 0; i < v->n_fmt; ++i)
5325
0
            ptr = bcf_unpack_fmt_core1(ptr, v->n_sample, &fmt[i]);
5326
0
        for (i = 0; i < (int)v->n_fmt; ++i) {
5327
0
            bcf_fmt_t *f = &fmt[i];
5328
0
            bcf_enc_int1(&ind, f->id);
5329
0
            bcf_enc_size(&ind, f->n, f->type);
5330
0
            for (j = 0; j < n; ++j)
5331
0
                if (imap[j] >= 0) kputsn((char*)(f->p + imap[j] * f->size), f->size, &ind);
5332
0
        }
5333
0
        for (i = j = 0; j < n; ++j) if (imap[j] >= 0) ++i;
5334
0
        v->n_sample = i;
5335
0
    } else v->n_sample = 0;
5336
0
    if ( !v->n_sample ) v->n_fmt = 0;
5337
0
    free(v->indiv.s);
5338
0
    v->indiv = ind;
5339
0
    v->unpacked &= ~BCF_UN_FMT;    // only BCF is ready for output, VCF will need to unpack again
5340
0
    return 0;
5341
0
}
5342
5343
int bcf_is_snp(bcf1_t *v)
5344
0
{
5345
0
    int i;
5346
0
    bcf_unpack(v, BCF_UN_STR);
5347
0
    for (i = 0; i < v->n_allele; ++i)
5348
0
    {
5349
0
        if ( v->d.allele[i][1]==0 && v->d.allele[i][0]!='*' ) continue;
5350
5351
        // mpileup's <X> allele, see also below. This is not completely satisfactory,
5352
        // a general library is here narrowly tailored to fit samtools.
5353
0
        if ( v->d.allele[i][0]=='<' && v->d.allele[i][1]=='X' && v->d.allele[i][2]=='>' ) continue;
5354
0
        if ( v->d.allele[i][0]=='<' && v->d.allele[i][1]=='*' && v->d.allele[i][2]=='>' ) continue;
5355
5356
0
        break;
5357
0
    }
5358
0
    return i == v->n_allele;
5359
0
}
5360
5361
static void bcf_set_variant_type(const char *ref, const char *alt, bcf_variant_t *var)
5362
0
{
5363
0
    if ( *alt == '*' && !alt[1] ) { var->n = 0; var->type = VCF_OVERLAP; return; }  // overlapping variant
5364
5365
    // The most frequent case
5366
0
    if ( !ref[1] && !alt[1] )
5367
0
    {
5368
0
        if ( *alt == '.' || *ref==*alt ) { var->n = 0; var->type = VCF_REF; return; }
5369
0
        if ( *alt == 'X' ) { var->n = 0; var->type = VCF_REF; return; }  // mpileup's X allele shouldn't be treated as variant
5370
0
        var->n = 1; var->type = VCF_SNP; return;
5371
0
    }
5372
0
    if ( alt[0]=='<' )
5373
0
    {
5374
0
        if ( alt[1]=='X' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; }  // mpileup's X allele shouldn't be treated as variant
5375
0
        if ( alt[1]=='*' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; }
5376
0
        if ( !strcmp("NON_REF>",alt+1) ) { var->n = 0; var->type = VCF_REF; return; }
5377
0
        var->type = VCF_OTHER;
5378
0
        return;
5379
0
    }
5380
5381
    // Catch "joined before" breakend case
5382
0
    if ( alt[0]==']' || alt[0] == '[' )
5383
0
    {
5384
0
        var->type = VCF_BND; return;
5385
0
    }
5386
5387
    // Iterate through alt characters that match the reference
5388
0
    const char *r = ref, *a = alt;
5389
0
    while (*r && *a && toupper_c(*r)==toupper_c(*a) ) { r++; a++; }     // unfortunately, matching REF,ALT case is not guaranteed
5390
5391
0
    if ( *a && !*r )
5392
0
    {
5393
0
        while ( *a ) a++;
5394
0
        if ( *(a-1)==']' || *(a-1)=='[' ) { var->type = VCF_BND; return; } // "joined after" breakend
5395
0
        var->n = (a-alt)-(r-ref); var->type = VCF_INDEL | VCF_INS; return;
5396
0
    }
5397
0
    else if ( *r && !*a )
5398
0
    {
5399
0
        while ( *r ) r++;
5400
0
        var->n = (a-alt)-(r-ref); var->type = VCF_INDEL | VCF_DEL; return;
5401
0
    }
5402
0
    else if ( !*r && !*a )
5403
0
    {
5404
0
        var->n = 0; var->type = VCF_REF; return;
5405
0
    }
5406
5407
0
    const char *re = r, *ae = a;
5408
0
    while ( re[1] ) re++;
5409
0
    while ( ae[1] ) ae++;
5410
0
    if ( ae[0]==']' || ae[0]=='[' ) { var->type = VCF_BND; return; }    // "joined after" breakend
5411
0
    while ( re>r && ae>a && toupper_c(*re)==toupper_c(*ae) ) { re--; ae--; }
5412
0
    if ( ae==a )
5413
0
    {
5414
0
        if ( re==r ) { var->n = 1; var->type = VCF_SNP; return; }
5415
0
        var->n = -(re-r);
5416
0
        if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL | VCF_DEL; return; }
5417
0
        var->type = VCF_OTHER; return;
5418
0
    }
5419
0
    else if ( re==r )
5420
0
    {
5421
0
        var->n = ae-a;
5422
0
        if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL | VCF_INS; return; }
5423
0
        var->type = VCF_OTHER; return;
5424
0
    }
5425
5426
0
    var->type = ( re-r == ae-a ) ? VCF_MNP : VCF_OTHER;
5427
0
    var->n = ( re-r > ae-a ) ? -(re-r+1) : ae-a+1;
5428
5429
    // should do also complex events, SVs, etc...
5430
0
}
5431
5432
static int bcf_set_variant_types(bcf1_t *b)
5433
0
{
5434
0
    if ( !(b->unpacked & BCF_UN_STR) ) bcf_unpack(b, BCF_UN_STR);
5435
0
    bcf_dec_t *d = &b->d;
5436
0
    if ( d->n_var < b->n_allele )
5437
0
    {
5438
0
        bcf_variant_t *new_var = realloc(d->var, sizeof(bcf_variant_t)*b->n_allele);
5439
0
        if (!new_var)
5440
0
            return -1;
5441
0
        d->var = new_var;
5442
0
        d->n_var = b->n_allele;
5443
0
    }
5444
0
    int i;
5445
0
    b->d.var_type = 0;
5446
0
    d->var[0].type = VCF_REF;
5447
0
    d->var[0].n    = 0;
5448
0
    for (i=1; i<b->n_allele; i++)
5449
0
    {
5450
0
        bcf_set_variant_type(d->allele[0],d->allele[i], &d->var[i]);
5451
0
        b->d.var_type |= d->var[i].type;
5452
        //fprintf(stderr,"[set_variant_type] %d   %s %s -> %d %d .. %d\n", b->pos+1,d->allele[0],d->allele[i],d->var[i].type,d->var[i].n, b->d.var_type);
5453
0
    }
5454
0
    return 0;
5455
0
}
5456
5457
// bcf_get_variant_type/bcf_get_variant_types should only return the following,
5458
// to be compatible with callers that are not expecting newer values
5459
// like VCF_INS, VCF_DEL.  The full set is available from the newer
5460
// vcf_has_variant_type* interfaces.
5461
0
#define ORIG_VAR_TYPES (VCF_SNP|VCF_MNP|VCF_INDEL|VCF_OTHER|VCF_BND|VCF_OVERLAP)
5462
int bcf_get_variant_types(bcf1_t *rec)
5463
0
{
5464
0
    if ( rec->d.var_type==-1 ) {
5465
0
        if (bcf_set_variant_types(rec) != 0) {
5466
0
            hts_log_error("Couldn't get variant types: %s", strerror(errno));
5467
0
            exit(1); // Due to legacy API having no way to report failures
5468
0
        }
5469
0
    }
5470
0
    return rec->d.var_type & ORIG_VAR_TYPES;
5471
0
}
5472
5473
int bcf_get_variant_type(bcf1_t *rec, int ith_allele)
5474
0
{
5475
0
    if ( rec->d.var_type==-1 ) {
5476
0
        if (bcf_set_variant_types(rec) != 0) {
5477
0
            hts_log_error("Couldn't get variant types: %s", strerror(errno));
5478
0
            exit(1); // Due to legacy API having no way to report failures
5479
0
        }
5480
0
    }
5481
0
    if (ith_allele < 0 || ith_allele >= rec->n_allele) {
5482
0
        hts_log_error("Requested allele outside valid range");
5483
0
        exit(1);
5484
0
    }
5485
0
    return rec->d.var[ith_allele].type & ORIG_VAR_TYPES;
5486
0
}
5487
#undef ORIG_VAR_TYPES
5488
5489
int bcf_has_variant_type(bcf1_t *rec, int ith_allele, uint32_t bitmask)
5490
0
{
5491
0
    if ( rec->d.var_type==-1 ) {
5492
0
        if (bcf_set_variant_types(rec) != 0) return -1;
5493
0
    }
5494
0
    if (ith_allele < 0 || ith_allele >= rec->n_allele) return -1;
5495
0
    if (bitmask == VCF_REF) {  // VCF_REF is 0, so handled as a special case
5496
0
        return rec->d.var[ith_allele].type == VCF_REF;
5497
0
    }
5498
0
    return bitmask & rec->d.var[ith_allele].type;
5499
0
}
5500
5501
int bcf_variant_length(bcf1_t *rec, int ith_allele)
5502
0
{
5503
0
    if ( rec->d.var_type==-1 ) {
5504
0
        if (bcf_set_variant_types(rec) != 0) return bcf_int32_missing;
5505
0
    }
5506
0
    if (ith_allele < 0 || ith_allele >= rec->n_allele) return bcf_int32_missing;
5507
0
    return rec->d.var[ith_allele].n;
5508
0
}
5509
5510
int bcf_has_variant_types(bcf1_t *rec, uint32_t bitmask,
5511
                          enum bcf_variant_match mode)
5512
0
{
5513
0
    if ( rec->d.var_type==-1 ) {
5514
0
        if (bcf_set_variant_types(rec) != 0) return -1;
5515
0
    }
5516
0
    uint32_t type = rec->d.var_type;
5517
0
    if ( mode==bcf_match_overlap ) return bitmask & type;
5518
5519
    // VCF_INDEL is always set with VCF_INS and VCF_DEL by bcf_set_variant_type[s], but the bitmask may
5520
    // ask for say `VCF_INS` or `VCF_INDEL` only
5521
0
    if ( bitmask&(VCF_INS|VCF_DEL) && !(bitmask&VCF_INDEL) ) type &= ~VCF_INDEL;
5522
0
    else if ( bitmask&VCF_INDEL && !(bitmask&(VCF_INS|VCF_DEL)) ) type &= ~(VCF_INS|VCF_DEL);
5523
5524
0
    if ( mode==bcf_match_subset )
5525
0
    {
5526
0
        if ( ~bitmask & type ) return 0;
5527
0
        else return bitmask & type;
5528
0
    }
5529
    // mode == bcf_match_exact
5530
0
    if ( bitmask==VCF_REF ) return type==bitmask ? 1 : 0;
5531
0
    return type==bitmask ? type : 0;
5532
0
}
5533
5534
int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
5535
0
{
5536
0
    static int negative_rlen_warned = 0;
5537
0
    int is_end_tag, is_svlen_tag = 0;
5538
5539
    // Is the field already present?
5540
0
    int i, inf_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
5541
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,inf_id) ) return -1;    // No such INFO field in the header
5542
0
    if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
5543
5544
0
    is_end_tag = strcmp(key, "END") == 0;
5545
0
    is_svlen_tag = strcmp(key, "SVLEN") == 0;
5546
5547
0
    for (i=0; i<line->n_info; i++)
5548
0
        if ( inf_id==line->d.info[i].key ) break;
5549
0
    bcf_info_t *inf = i==line->n_info ? NULL : &line->d.info[i];
5550
5551
0
    if ( !n || (type==BCF_HT_STR && !values) )
5552
0
    {
5553
0
        if ( inf )
5554
0
        {
5555
            // Mark the tag for removal, free existing memory if necessary
5556
0
            if ( inf->vptr_free )
5557
0
            {
5558
0
                free(inf->vptr - inf->vptr_off);
5559
0
                inf->vptr_free = 0;
5560
0
            }
5561
0
            line->d.shared_dirty |= BCF1_DIRTY_INF;
5562
0
            inf->vptr = NULL;
5563
0
            inf->vptr_off = inf->vptr_len = 0;
5564
0
        }
5565
0
        if ( n==0 && (is_end_tag || is_svlen_tag) ) {
5566
0
            line->rlen = get_rlen(hdr, line);
5567
0
        }
5568
0
        return 0;
5569
0
    }
5570
5571
0
    if (is_end_tag)
5572
0
    {
5573
0
        if (n != 1)
5574
0
        {
5575
0
            hts_log_error("END info tag should only have one value at %s:%"PRIhts_pos, bcf_seqname_safe(hdr,line), line->pos+1);
5576
0
            line->errcode |= BCF_ERR_TAG_INVALID;
5577
0
            return -1;
5578
0
        }
5579
0
        if (type != BCF_HT_INT && type != BCF_HT_LONG)
5580
0
        {
5581
0
            hts_log_error("Wrong type (%d) for END info tag at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
5582
0
            line->errcode |= BCF_ERR_TAG_INVALID;
5583
0
            return -1;
5584
0
        }
5585
0
    }
5586
5587
    // Encode the values and determine the size required to accommodate the values
5588
0
    kstring_t str = {0,0,0};
5589
0
    bcf_enc_int1(&str, inf_id);
5590
0
    if ( type==BCF_HT_INT )
5591
0
        bcf_enc_vint(&str, n, (int32_t*)values, -1);
5592
0
    else if ( type==BCF_HT_REAL )
5593
0
        bcf_enc_vfloat(&str, n, (float*)values);
5594
0
    else if ( type==BCF_HT_FLAG || type==BCF_HT_STR )
5595
0
    {
5596
0
        if ( values==NULL )
5597
0
            bcf_enc_size(&str, 0, BCF_BT_NULL);
5598
0
        else
5599
0
            bcf_enc_vchar(&str, strlen((char*)values), (char*)values);
5600
0
    }
5601
#ifdef VCF_ALLOW_INT64
5602
    else if ( type==BCF_HT_LONG )
5603
    {
5604
        if (n != 1) {
5605
            hts_log_error("Only storing a single BCF_HT_LONG value is supported at %s:%"PRIhts_pos, bcf_seqname_safe(hdr,line), line->pos+1);
5606
            abort();
5607
        }
5608
        bcf_enc_long1(&str, *(int64_t *) values);
5609
    }
5610
#endif
5611
0
    else
5612
0
    {
5613
0
        hts_log_error("The type %d not implemented yet at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
5614
0
        abort();
5615
0
    }
5616
5617
    // Is the INFO tag already present
5618
0
    if ( inf )
5619
0
    {
5620
        // Is it big enough to accommodate new block?
5621
0
        if ( inf->vptr && str.l <= inf->vptr_len + inf->vptr_off )
5622
0
        {
5623
0
            if ( str.l != inf->vptr_len + inf->vptr_off ) line->d.shared_dirty |= BCF1_DIRTY_INF;
5624
0
            uint8_t *ptr = inf->vptr - inf->vptr_off;
5625
0
            memcpy(ptr, str.s, str.l);
5626
0
            free(str.s);
5627
0
            int vptr_free = inf->vptr_free;
5628
0
            bcf_unpack_info_core1(ptr, inf);
5629
0
            inf->vptr_free = vptr_free;
5630
0
        }
5631
0
        else
5632
0
        {
5633
0
            if ( inf->vptr_free )
5634
0
                free(inf->vptr - inf->vptr_off);
5635
0
            bcf_unpack_info_core1((uint8_t*)str.s, inf);
5636
0
            inf->vptr_free = 1;
5637
0
            line->d.shared_dirty |= BCF1_DIRTY_INF;
5638
0
        }
5639
0
    }
5640
0
    else
5641
0
    {
5642
        // The tag is not present, create new one
5643
0
        line->n_info++;
5644
0
        hts_expand0(bcf_info_t, line->n_info, line->d.m_info , line->d.info);
5645
0
        inf = &line->d.info[line->n_info-1];
5646
0
        bcf_unpack_info_core1((uint8_t*)str.s, inf);
5647
0
        inf->vptr_free = 1;
5648
0
        line->d.shared_dirty |= BCF1_DIRTY_INF;
5649
0
    }
5650
0
    line->unpacked |= BCF_UN_INFO;
5651
5652
0
   if ( n==1 && is_end_tag) {
5653
0
        hts_pos_t end = type == BCF_HT_INT ? *(int32_t *) values : *(int64_t *) values;
5654
0
        if ( (type == BCF_HT_INT && end!=bcf_int32_missing) || (type == BCF_HT_LONG && end!=bcf_int64_missing) )
5655
0
        {
5656
0
            if ( end <= line->pos )
5657
0
            {
5658
0
                if ( !negative_rlen_warned )
5659
0
                {
5660
0
                    hts_log_warning("INFO/END=%"PRIhts_pos" is smaller than POS at %s:%"PRIhts_pos,end,bcf_seqname_safe(hdr,line),line->pos+1);
5661
0
                    negative_rlen_warned = 1;
5662
0
                }
5663
0
            }
5664
0
        }
5665
0
    }
5666
0
    if (is_svlen_tag || is_end_tag) {
5667
0
        line->rlen = get_rlen(hdr, line);
5668
0
    }
5669
0
    return 0;
5670
0
}
5671
5672
int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n)
5673
0
{
5674
0
    if ( !n )
5675
0
        return bcf_update_format(hdr,line,key,NULL,0,BCF_HT_STR);
5676
5677
0
    int i, max_len = 0;
5678
0
    for (i=0; i<n; i++)
5679
0
    {
5680
0
        int len = strlen(values[i]);
5681
0
        if ( len > max_len ) max_len = len;
5682
0
    }
5683
0
    char *out = (char*) malloc(max_len*n);
5684
0
    if ( !out ) return -2;
5685
0
    for (i=0; i<n; i++)
5686
0
    {
5687
0
        char *dst = out+i*max_len;
5688
0
        const char *src = values[i];
5689
0
        int j = 0;
5690
0
        while ( src[j] ) { dst[j] = src[j]; j++; }
5691
0
        for (; j<max_len; j++) dst[j] = 0;
5692
0
    }
5693
0
    int ret = bcf_update_format(hdr,line,key,out,max_len*n,BCF_HT_STR);
5694
0
    free(out);
5695
0
    return ret;
5696
0
}
5697
5698
int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
5699
0
{
5700
    // Is the field already present?
5701
0
    int i, fmt_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
5702
0
    int is_len = 0;
5703
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,fmt_id) )
5704
0
    {
5705
0
        if ( !n ) return 0;
5706
0
        return -1;  // the key not present in the header
5707
0
    }
5708
5709
0
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
5710
5711
0
    for (i=0; i<line->n_fmt; i++)
5712
0
        if ( line->d.fmt[i].id==fmt_id ) break;
5713
0
    bcf_fmt_t *fmt = i==line->n_fmt ? NULL : &line->d.fmt[i];
5714
5715
0
    is_len = strcmp(key, "LEN") == 0;
5716
0
    if ( !n )
5717
0
    {
5718
0
        if ( fmt )
5719
0
        {
5720
            // Mark the tag for removal, free existing memory if necessary
5721
0
            if ( fmt->p_free )
5722
0
            {
5723
0
                free(fmt->p - fmt->p_off);
5724
0
                fmt->p_free = 0;
5725
0
            }
5726
0
            line->d.indiv_dirty = 1;
5727
0
            fmt->p = NULL;
5728
0
        }
5729
0
        if (is_len) {
5730
0
            line->rlen = get_rlen(hdr, line);
5731
0
        }
5732
0
        return 0;
5733
0
    }
5734
5735
0
    line->n_sample = bcf_hdr_nsamples(hdr);
5736
0
    int nps = n / line->n_sample;  // number of values per sample
5737
0
    assert( nps && nps*line->n_sample==n );     // must be divisible by n_sample
5738
5739
    // Encode the values and determine the size required to accommodate the values
5740
0
    kstring_t str = {0,0,0};
5741
0
    bcf_enc_int1(&str, fmt_id);
5742
0
    assert(values != NULL);
5743
0
    if ( type==BCF_HT_INT )
5744
0
        bcf_enc_vint(&str, n, (int32_t*)values, nps);
5745
0
    else if ( type==BCF_HT_REAL )
5746
0
    {
5747
0
        bcf_enc_size(&str, nps, BCF_BT_FLOAT);
5748
0
        serialize_float_array(&str, nps*line->n_sample, (float *) values);
5749
0
    }
5750
0
    else if ( type==BCF_HT_STR )
5751
0
    {
5752
0
        bcf_enc_size(&str, nps, BCF_BT_CHAR);
5753
0
        kputsn((char*)values, nps*line->n_sample, &str);
5754
0
    }
5755
0
    else
5756
0
    {
5757
0
        hts_log_error("The type %d not implemented yet at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
5758
0
        abort();
5759
0
    }
5760
5761
0
    if ( !fmt )
5762
0
    {
5763
        // Not present, new format field
5764
0
        line->n_fmt++;
5765
0
        hts_expand0(bcf_fmt_t, line->n_fmt, line->d.m_fmt, line->d.fmt);
5766
5767
        // Special case: VCF specification requires that GT is always first
5768
0
        if ( line->n_fmt > 1 && key[0]=='G' && key[1]=='T' && !key[2] )
5769
0
        {
5770
0
            for (i=line->n_fmt-1; i>0; i--)
5771
0
                line->d.fmt[i] = line->d.fmt[i-1];
5772
0
            fmt = &line->d.fmt[0];
5773
0
        }
5774
0
        else
5775
0
            fmt = &line->d.fmt[line->n_fmt-1];
5776
0
        bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
5777
0
        line->d.indiv_dirty = 1;
5778
0
        fmt->p_free = 1;
5779
0
    }
5780
0
    else
5781
0
    {
5782
        // The tag is already present, check if it is big enough to accommodate the new block
5783
0
        if ( fmt->p && str.l <= fmt->p_len + fmt->p_off )
5784
0
        {
5785
            // good, the block is big enough
5786
0
            if ( str.l != fmt->p_len + fmt->p_off ) line->d.indiv_dirty = 1;
5787
0
            uint8_t *ptr = fmt->p - fmt->p_off;
5788
0
            memcpy(ptr, str.s, str.l);
5789
0
            free(str.s);
5790
0
            int p_free = fmt->p_free;
5791
0
            bcf_unpack_fmt_core1(ptr, line->n_sample, fmt);
5792
0
            fmt->p_free = p_free;
5793
0
        }
5794
0
        else
5795
0
        {
5796
0
            if ( fmt->p_free )
5797
0
                free(fmt->p - fmt->p_off);
5798
0
            bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
5799
0
            fmt->p_free = 1;
5800
0
            line->d.indiv_dirty = 1;
5801
0
        }
5802
0
    }
5803
0
    line->unpacked |= BCF_UN_FMT;
5804
5805
0
    if (is_len) {
5806
0
        line->rlen = get_rlen(hdr, line);
5807
0
    }
5808
0
    return 0;
5809
0
}
5810
5811
5812
int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n)
5813
0
{
5814
0
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
5815
0
    line->d.shared_dirty |= BCF1_DIRTY_FLT;
5816
0
    line->d.n_flt = n;
5817
0
    if ( !n ) return 0;
5818
0
    hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);
5819
0
    int i;
5820
0
    for (i=0; i<n; i++)
5821
0
        line->d.flt[i] = flt_ids[i];
5822
0
    return 0;
5823
0
}
5824
5825
int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id)
5826
0
{
5827
0
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
5828
0
    int i;
5829
0
    for (i=0; i<line->d.n_flt; i++)
5830
0
        if ( flt_id==line->d.flt[i] ) break;
5831
0
    if ( i<line->d.n_flt ) return 0;    // this filter is already set
5832
0
    line->d.shared_dirty |= BCF1_DIRTY_FLT;
5833
0
    if ( flt_id==0 )    // set to PASS
5834
0
        line->d.n_flt = 1;
5835
0
    else if ( line->d.n_flt==1 && line->d.flt[0]==0 )
5836
0
        line->d.n_flt = 1;
5837
0
    else
5838
0
        line->d.n_flt++;
5839
0
    hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);
5840
0
    line->d.flt[line->d.n_flt-1] = flt_id;
5841
0
    return 1;
5842
0
}
5843
int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int pass)
5844
0
{
5845
0
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
5846
0
    int i;
5847
0
    for (i=0; i<line->d.n_flt; i++)
5848
0
        if ( flt_id==line->d.flt[i] ) break;
5849
0
    if ( i==line->d.n_flt ) return 0;   // the filter is not present
5850
0
    line->d.shared_dirty |= BCF1_DIRTY_FLT;
5851
0
    if ( i!=line->d.n_flt-1 ) memmove(line->d.flt+i,line->d.flt+i+1,(line->d.n_flt-i-1)*sizeof(*line->d.flt));
5852
0
    line->d.n_flt--;
5853
0
    if ( !line->d.n_flt && pass ) bcf_add_filter(hdr,line,0);
5854
0
    return 0;
5855
0
}
5856
5857
int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter)
5858
0
{
5859
0
    if ( filter[0]=='.' && !filter[1] ) filter = "PASS";
5860
0
    int id = bcf_hdr_id2int(hdr, BCF_DT_ID, filter);
5861
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FLT,id) ) return -1;  // not defined in the header
5862
5863
0
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
5864
0
    if ( id==0 && !line->d.n_flt) return 1; // PASS
5865
5866
0
    int i;
5867
0
    for (i=0; i<line->d.n_flt; i++)
5868
0
        if ( line->d.flt[i]==id ) return 1;
5869
0
    return 0;
5870
0
}
5871
5872
static inline int _bcf1_sync_alleles(const bcf_hdr_t *hdr, bcf1_t *line, int nals)
5873
0
{
5874
0
    line->d.shared_dirty |= BCF1_DIRTY_ALS;
5875
0
    line->d.var_type = -1;
5876
5877
0
    line->n_allele = nals;
5878
0
    hts_expand(char*, line->n_allele, line->d.m_allele, line->d.allele);
5879
5880
0
    char *als = line->d.als;
5881
0
    int n = 0;
5882
0
    while (n<nals)
5883
0
    {
5884
0
        line->d.allele[n] = als;
5885
0
        while ( *als ) als++;
5886
0
        als++;
5887
0
        n++;
5888
0
    }
5889
    // Update REF length. Note that END is 1-based while line->pos 0-based
5890
0
    line->rlen = get_rlen(hdr, line);
5891
5892
0
    return 0;
5893
0
}
5894
int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals)
5895
0
{
5896
0
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
5897
0
    char *free_old = NULL;
5898
0
    char buffer[256];
5899
0
    size_t used = 0;
5900
5901
    // The pointers in alleles may point into the existing line->d.als memory,
5902
    // so care needs to be taken not to clobber them while updating.  Usually
5903
    // they will be short so we can copy through an intermediate buffer.
5904
    // If they're longer, or won't fit in the existing allocation we
5905
    // can allocate a new buffer to write into.  Note that in either case
5906
    // pointers to line->d.als memory in alleles may not be valid when we've
5907
    // finished.
5908
0
    int i;
5909
0
    size_t avail = line->d.m_als < sizeof(buffer) ? line->d.m_als : sizeof(buffer);
5910
0
    for (i=0; i<nals; i++) {
5911
0
        size_t sz = strlen(alleles[i]) + 1;
5912
0
        if (avail - used < sz)
5913
0
            break;
5914
0
        memcpy(buffer + used, alleles[i], sz);
5915
0
        used += sz;
5916
0
    }
5917
5918
    // Did we miss anything?
5919
0
    if (i < nals) {
5920
0
        int j;
5921
0
        size_t needed = used;
5922
0
        char *new_als;
5923
0
        for (j = i; j < nals; j++)
5924
0
            needed += strlen(alleles[j]) + 1;
5925
0
        if (needed < line->d.m_als) // Don't shrink the buffer
5926
0
            needed = line->d.m_als;
5927
0
        if (needed > INT_MAX) {
5928
0
            hts_log_error("REF + alleles too long to fit in a BCF record");
5929
0
            return -1;
5930
0
        }
5931
0
        new_als = malloc(needed);
5932
0
        if (!new_als)
5933
0
            return -1;
5934
0
        free_old = line->d.als;
5935
0
        line->d.als = new_als;
5936
0
        line->d.m_als = needed;
5937
0
    }
5938
5939
    // Copy from the temp buffer to the destination
5940
0
    if (used) {
5941
0
        assert(used <= line->d.m_als);
5942
0
        memcpy(line->d.als, buffer, used);
5943
0
    }
5944
5945
    // Add in any remaining entries - if this happens we will always be
5946
    // writing to a newly-allocated buffer.
5947
0
    for (; i < nals; i++) {
5948
0
        size_t sz = strlen(alleles[i]) + 1;
5949
0
        memcpy(line->d.als + used, alleles[i], sz);
5950
0
        used += sz;
5951
0
    }
5952
5953
0
    if (free_old)
5954
0
        free(free_old);
5955
0
    return _bcf1_sync_alleles(hdr,line,nals);
5956
0
}
5957
5958
int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string)
5959
0
{
5960
0
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
5961
0
    kstring_t tmp;
5962
0
    tmp.l = 0; tmp.s = line->d.als; tmp.m = line->d.m_als;
5963
0
    kputs(alleles_string, &tmp);
5964
0
    line->d.als = tmp.s; line->d.m_als = tmp.m;
5965
5966
0
    int nals = 1;
5967
0
    char *t = line->d.als;
5968
0
    while (*t)
5969
0
    {
5970
0
        if ( *t==',' ) { *t = 0; nals++; }
5971
0
        t++;
5972
0
    }
5973
0
    return _bcf1_sync_alleles(hdr, line, nals);
5974
0
}
5975
5976
int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
5977
0
{
5978
0
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
5979
0
    kstring_t tmp;
5980
0
    tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id;
5981
0
    if ( id )
5982
0
        kputs(id, &tmp);
5983
0
    else
5984
0
        kputs(".", &tmp);
5985
0
    line->d.id = tmp.s; line->d.m_id = tmp.m;
5986
0
    line->d.shared_dirty |= BCF1_DIRTY_ID;
5987
0
    return 0;
5988
0
}
5989
5990
int bcf_add_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
5991
0
{
5992
0
    if ( !id ) return 0;
5993
0
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
5994
5995
0
    kstring_t tmp;
5996
0
    tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id;
5997
5998
0
    int len = strlen(id);
5999
0
    char *dst = line->d.id;
6000
0
    while ( *dst && (dst=strstr(dst,id)) )
6001
0
    {
6002
0
        if ( dst[len]!=0 && dst[len]!=';' ) dst++;              // a prefix, not a match
6003
0
        else if ( dst==line->d.id || dst[-1]==';' ) return 0;   // already present
6004
0
        dst++;  // a suffix, not a match
6005
0
    }
6006
0
    if ( line->d.id && (line->d.id[0]!='.' || line->d.id[1]) )
6007
0
    {
6008
0
        tmp.l = strlen(line->d.id);
6009
0
        kputc(';',&tmp);
6010
0
    }
6011
0
    kputs(id,&tmp);
6012
6013
0
    line->d.id = tmp.s; line->d.m_id = tmp.m;
6014
0
    line->d.shared_dirty |= BCF1_DIRTY_ID;
6015
0
    return 0;
6016
6017
0
}
6018
6019
bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
6020
0
{
6021
0
    int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
6022
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) ) return NULL;   // no such FMT field in the header
6023
0
    return bcf_get_fmt_id(line, id);
6024
0
}
6025
6026
bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
6027
0
{
6028
0
    int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
6029
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,id) ) return NULL;   // no such INFO field in the header
6030
0
    return bcf_get_info_id(line, id);
6031
0
}
6032
6033
bcf_fmt_t *bcf_get_fmt_id(bcf1_t *line, const int id)
6034
0
{
6035
0
    int i;
6036
0
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
6037
0
    for (i=0; i<line->n_fmt; i++)
6038
0
    {
6039
0
        if ( line->d.fmt[i].id==id ) return &line->d.fmt[i];
6040
0
    }
6041
0
    return NULL;
6042
0
}
6043
6044
bcf_info_t *bcf_get_info_id(bcf1_t *line, const int id)
6045
0
{
6046
0
    int i;
6047
0
    if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
6048
0
    for (i=0; i<line->n_info; i++)
6049
0
    {
6050
0
        if ( line->d.info[i].key==id ) return &line->d.info[i];
6051
0
    }
6052
0
    return NULL;
6053
0
}
6054
6055
6056
int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
6057
0
{
6058
0
    int i, ret = -4, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
6059
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,tag_id) ) return -1;    // no such INFO field in the header
6060
0
    if ( bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id)!=(type & 0xff) ) return -2;     // expected different type
6061
6062
0
    if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
6063
6064
0
    for (i=0; i<line->n_info; i++)
6065
0
        if ( line->d.info[i].key==tag_id ) break;
6066
0
    if ( i==line->n_info ) return ( type==BCF_HT_FLAG ) ? 0 : -3;       // the tag is not present in this record
6067
0
    if ( type==BCF_HT_FLAG ) return 1;
6068
6069
0
    bcf_info_t *info = &line->d.info[i];
6070
0
    if ( !info->vptr ) return -3;           // the tag was marked for removal
6071
0
    if ( type==BCF_HT_STR )
6072
0
    {
6073
0
        if ( *ndst < info->len+1 )
6074
0
        {
6075
0
            *ndst = info->len + 1;
6076
0
            *dst  = realloc(*dst, *ndst);
6077
0
        }
6078
0
        memcpy(*dst,info->vptr,info->len);
6079
0
        ((uint8_t*)*dst)[info->len] = 0;
6080
0
        return info->len;
6081
0
    }
6082
6083
    // Make sure the buffer is big enough
6084
0
    int size1;
6085
0
    switch (type) {
6086
0
        case BCF_HT_INT:  size1 = sizeof(int32_t); break;
6087
0
        case BCF_HT_LONG: size1 = sizeof(int64_t); break;
6088
0
        case BCF_HT_REAL: size1 = sizeof(float); break;
6089
0
        default:
6090
0
            hts_log_error("Unexpected output type %d at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
6091
0
            return -2;
6092
0
    }
6093
0
    if ( *ndst < info->len )
6094
0
    {
6095
0
        *ndst = info->len;
6096
0
        *dst  = realloc(*dst, *ndst * size1);
6097
0
    }
6098
6099
0
    #define BRANCH(type_t, convert, is_missing, is_vector_end, set_missing, set_regular, out_type_t) do { \
6100
0
        out_type_t *tmp = (out_type_t *) *dst; \
6101
0
        int j; \
6102
0
        for (j=0; j<info->len; j++) \
6103
0
        { \
6104
0
            type_t p = convert(info->vptr + j * sizeof(type_t)); \
6105
0
            if ( is_vector_end ) break; \
6106
0
            if ( is_missing ) set_missing; \
6107
0
            else set_regular; \
6108
0
            tmp++; \
6109
0
        } \
6110
0
        ret = j; \
6111
0
    } while (0)
6112
0
    switch (info->type) {
6113
0
        case BCF_BT_INT8:
6114
0
            if (type == BCF_HT_LONG) {
6115
0
                BRANCH(int8_t,  le_to_i8,  p==bcf_int8_missing,  p==bcf_int8_vector_end,  *tmp=bcf_int64_missing, *tmp=p, int64_t);
6116
0
            } else {
6117
0
                BRANCH(int8_t,  le_to_i8,  p==bcf_int8_missing,  p==bcf_int8_vector_end,  *tmp=bcf_int32_missing, *tmp=p, int32_t);
6118
0
            }
6119
0
            break;
6120
0
        case BCF_BT_INT16:
6121
0
            if (type == BCF_HT_LONG) {
6122
0
                BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t);
6123
0
            } else {
6124
0
                BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t);
6125
0
            }
6126
0
            break;
6127
0
        case BCF_BT_INT32:
6128
0
            if (type == BCF_HT_LONG) {
6129
0
                BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t); break;
6130
0
            } else {
6131
0
                BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); break;
6132
0
            }
6133
0
        case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set(tmp, p), float); break;
6134
0
        default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, info->type, bcf_seqname_safe(hdr,line), line->pos+1); return -2;
6135
0
    }
6136
0
    #undef BRANCH
6137
0
    return ret;  // set by BRANCH
6138
0
}
6139
6140
int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst)
6141
0
{
6142
0
    int i,tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
6143
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1;    // no such FORMAT field in the header
6144
0
    if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2;     // expected different type
6145
6146
0
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
6147
6148
0
    for (i=0; i<line->n_fmt; i++)
6149
0
        if ( line->d.fmt[i].id==tag_id ) break;
6150
0
    if ( i==line->n_fmt ) return -3;                               // the tag is not present in this record
6151
0
    bcf_fmt_t *fmt = &line->d.fmt[i];
6152
0
    if ( !fmt->p ) return -3;                                      // the tag was marked for removal
6153
6154
0
    int nsmpl = bcf_hdr_nsamples(hdr);
6155
0
    if ( !*dst )
6156
0
    {
6157
0
        *dst = (char**) malloc(sizeof(char*)*nsmpl);
6158
0
        if ( !*dst ) return -4;     // could not alloc
6159
0
        (*dst)[0] = NULL;
6160
0
    }
6161
0
    int n = (fmt->n+1)*nsmpl;
6162
0
    if ( *ndst < n )
6163
0
    {
6164
0
        (*dst)[0] = realloc((*dst)[0], n);
6165
0
        if ( !(*dst)[0] ) return -4;    // could not alloc
6166
0
        *ndst = n;
6167
0
    }
6168
0
    for (i=0; i<nsmpl; i++)
6169
0
    {
6170
0
        uint8_t *src = fmt->p + i*fmt->n;
6171
0
        uint8_t *tmp = (uint8_t*)(*dst)[0] + i*(fmt->n+1);
6172
0
        memcpy(tmp,src,fmt->n);
6173
0
        tmp[fmt->n] = 0;
6174
0
        (*dst)[i] = (char*) tmp;
6175
0
    }
6176
0
    return n;
6177
0
}
6178
6179
int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
6180
0
{
6181
0
    int i,j, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
6182
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1;    // no such FORMAT field in the header
6183
0
    if ( tag[0]=='G' && tag[1]=='T' && tag[2]==0 )
6184
0
    {
6185
        // Ugly: GT field is considered to be a string by the VCF header but BCF represents it as INT.
6186
0
        if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2;
6187
0
    }
6188
0
    else if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=type ) return -2;     // expected different type
6189
6190
0
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
6191
6192
0
    for (i=0; i<line->n_fmt; i++)
6193
0
        if ( line->d.fmt[i].id==tag_id ) break;
6194
0
    if ( i==line->n_fmt ) return -3;                               // the tag is not present in this record
6195
0
    bcf_fmt_t *fmt = &line->d.fmt[i];
6196
0
    if ( !fmt->p ) return -3;                                      // the tag was marked for removal
6197
6198
0
    if ( type==BCF_HT_STR )
6199
0
    {
6200
0
        int n = fmt->n*bcf_hdr_nsamples(hdr);
6201
0
        if ( *ndst < n )
6202
0
        {
6203
0
            *dst  = realloc(*dst, n);
6204
0
            if ( !*dst ) return -4;     // could not alloc
6205
0
            *ndst = n;
6206
0
        }
6207
0
        memcpy(*dst,fmt->p,n);
6208
0
        return n;
6209
0
    }
6210
6211
    // Make sure the buffer is big enough
6212
0
    int nsmpl = bcf_hdr_nsamples(hdr);
6213
0
    int size1 = type==BCF_HT_INT ? sizeof(int32_t) : sizeof(float);
6214
0
    if ( *ndst < fmt->n*nsmpl )
6215
0
    {
6216
0
        *ndst = fmt->n*nsmpl;
6217
0
        *dst  = realloc(*dst, *ndst*size1);
6218
0
        if ( !*dst ) return -4;     // could not alloc
6219
0
    }
6220
6221
0
    #define BRANCH(type_t, convert, is_missing, is_vector_end, set_missing, set_vector_end, set_regular, out_type_t) { \
6222
0
        out_type_t *tmp = (out_type_t *) *dst; \
6223
0
        uint8_t *fmt_p = fmt->p; \
6224
0
        for (i=0; i<nsmpl; i++) \
6225
0
        { \
6226
0
            for (j=0; j<fmt->n; j++) \
6227
0
            { \
6228
0
                type_t p = convert(fmt_p + j * sizeof(type_t)); \
6229
0
                if ( is_missing ) set_missing; \
6230
0
                else if ( is_vector_end ) { set_vector_end; break; } \
6231
0
                else set_regular; \
6232
0
                tmp++; \
6233
0
            } \
6234
0
            for (; j<fmt->n; j++) { set_vector_end; tmp++; } \
6235
0
            fmt_p += fmt->size; \
6236
0
        } \
6237
0
    }
6238
0
    switch (fmt->type) {
6239
0
        case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8, p==bcf_int8_missing,  p==bcf_int8_vector_end,  *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
6240
0
        case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
6241
0
        case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
6242
0
        case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set_vector_end(*tmp), bcf_float_set(tmp, p), float); break;
6243
0
        default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, fmt->type, bcf_seqname_safe(hdr,line), line->pos+1); exit(1);
6244
0
    }
6245
0
    #undef BRANCH
6246
6247
0
    return nsmpl*fmt->n;
6248
0
}
6249
6250
//error description structure definition
6251
typedef struct err_desc {
6252
    int  errorcode;
6253
    const char *description;
6254
}err_desc;
6255
6256
// error descriptions
6257
static const err_desc errdesc_bcf[] = {
6258
    { BCF_ERR_CTG_UNDEF, "Contig not defined in header"},
6259
    { BCF_ERR_TAG_UNDEF, "Tag not defined in header" },
6260
    { BCF_ERR_NCOLS, "Incorrect number of columns" },
6261
    { BCF_ERR_LIMITS, "Limits reached" },
6262
    { BCF_ERR_CHAR, "Invalid character" },
6263
    { BCF_ERR_CTG_INVALID, "Invalid contig" },
6264
    { BCF_ERR_TAG_INVALID, "Invalid tag" },
6265
};
6266
6267
/// append given description to buffer based on available size and add ... when not enough space
6268
    /** @param buffer       buffer to which description to be appended
6269
        @param offset       offset at which to be appended
6270
        @param maxbuffer    maximum size of the buffer
6271
        @param description  the description to be appended
6272
on failure returns -1 - when buffer is not big enough; returns -1 on invalid params and on too small buffer which are improbable due to validation at caller site
6273
on success returns 0
6274
    */
6275
1.07k
static int add_desc_to_buffer(char *buffer, size_t *offset, size_t maxbuffer, const char *description) {
6276
6277
1.07k
    if (!description || !buffer || !offset || (maxbuffer < 4))
6278
0
        return -1;
6279
6280
1.07k
    size_t rembuffer = maxbuffer - *offset;
6281
1.07k
    if (rembuffer > (strlen(description) + (rembuffer == maxbuffer ? 0 : 1))) {    //add description with optionally required ','
6282
1.07k
        *offset += snprintf(buffer + *offset, rembuffer, "%s%s", (rembuffer == maxbuffer)? "": ",", description);
6283
1.07k
    } else {    //not enough space for description, put ...
6284
0
        size_t tmppos = (rembuffer <= 4) ? maxbuffer - 4 : *offset;
6285
0
        snprintf(buffer + tmppos, 4, "...");    //ignore offset update
6286
0
        return -1;
6287
0
    }
6288
1.07k
    return 0;
6289
1.07k
}
6290
6291
//get description for given error code. return NULL on error
6292
508
const char *bcf_strerror(int errorcode, char *buffer, size_t maxbuffer) {
6293
508
    size_t usedup = 0;
6294
508
    int ret = 0;
6295
508
    int idx;
6296
6297
508
    if (!buffer || maxbuffer < 4)
6298
0
        return NULL;           //invalid / insufficient buffer
6299
6300
508
    if (!errorcode) {
6301
0
        buffer[0] = '\0';      //no error, set null
6302
0
        return buffer;
6303
0
    }
6304
6305
4.06k
    for (idx = 0; idx < sizeof(errdesc_bcf) / sizeof(err_desc); ++idx) {
6306
3.55k
        if (errorcode & errdesc_bcf[idx].errorcode) {    //error is set, add description
6307
1.07k
            ret = add_desc_to_buffer(buffer, &usedup, maxbuffer, errdesc_bcf[idx].description);
6308
1.07k
            if (ret < 0)
6309
0
                break;         //not enough space, ... added, no need to continue
6310
6311
1.07k
            errorcode &= ~errdesc_bcf[idx].errorcode;    //reset the error
6312
1.07k
        }
6313
3.55k
    }
6314
6315
508
    if (errorcode && (ret >= 0))  {     //undescribed error is present in error code and had enough buffer, try to add unkonwn error as well§
6316
0
        add_desc_to_buffer(buffer, &usedup, maxbuffer, "Unknown error");
6317
0
    }
6318
508
    return buffer;
6319
508
}
6320
6321
/**
6322
 *  bcf_format_gt_v2 - formats GT information on a string
6323
 *  @param hdr - bcf header, to get version
6324
 *  @param fmt - pointer to bcf format data
6325
 *  @param isample - position of interested sample in data
6326
 *  @param str - pointer to output string
6327
 *  Returns 0 on success and -1 on failure
6328
 *  This method is preferred over bcf_format_gt as this supports vcf4.4 and
6329
 *  prefixed phasing. Explicit / prefixed phasing for 1st allele is used only
6330
 *  when it is a must to correctly express phasing.
6331
 * correctly express phasing.
6332
 */
6333
int bcf_format_gt_v2(const bcf_hdr_t *hdr, bcf_fmt_t *fmt, int isample, kstring_t *str)
6334
8.64k
{
6335
8.64k
    uint32_t e = 0;
6336
8.64k
    int ploidy = 1, anyunphased = 0;
6337
8.64k
    int32_t val0 = 0;
6338
8.64k
    size_t pos = str ? str->l : 0;
6339
6340
8.64k
    #define BRANCH(type_t, convert, missing, vector_end) { \
6341
8.13k
        uint8_t *ptr = fmt->p + isample*fmt->size; \
6342
8.13k
        int i; \
6343
18.6k
        for (i=0; i<fmt->n; i++, ptr += sizeof(type_t)) \
6344
14.2k
        { \
6345
14.2k
            type_t val = convert(ptr); \
6346
14.2k
            if ( val == vector_end ) break; \
6347
14.2k
            if (!i) { val0 = val; } \
6348
10.5k
            if (i) { \
6349
2.38k
                e |= kputc("/|"[val & 1], str) < 0; \
6350
2.38k
                anyunphased |= !(val & 1); \
6351
2.38k
            } \
6352
10.5k
            if (!(val >> 1)) e |= kputc('.', str) < 0; \
6353
10.5k
            else e |= kputw((val >> 1) - 1, str) < 0; \
6354
10.5k
        } \
6355
8.13k
        if (i == 0) e |= kputc('.', str) < 0; \
6356
8.13k
        ploidy = i; \
6357
8.13k
    }
6358
8.64k
    switch (fmt->type) {
6359
4.07k
        case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8,  bcf_int8_missing,
6360
4.07k
            bcf_int8_vector_end); break;
6361
1.37k
        case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, bcf_int16_missing,
6362
1.37k
            bcf_int16_vector_end); break;
6363
2.68k
        case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, bcf_int32_missing,
6364
2.68k
            bcf_int32_vector_end); break;
6365
517
        case BCF_BT_NULL:  e |= kputc('.', str) < 0; break;
6366
0
        default: hts_log_error("Unexpected type %d", fmt->type); return -2;
6367
8.64k
    }
6368
8.64k
    #undef BRANCH
6369
6370
8.64k
    if (hdr && get_hdr_aux(hdr)->version >= VCF44) {
6371
        //output which supports prefixed phasing
6372
6373
        /* update 1st allele's phasing if required and append rest to it.
6374
        use prefixed phasing only when it is a must. i.e. without which the
6375
        inferred value will be incorrect */
6376
6.24k
        if (val0 & 1) {
6377
            /* 1st one is phased, if ploidy is > 1 and an unphased allele exists
6378
             need to specify explicitly */
6379
805
            e |= (ploidy > 1 && anyunphased) ?
6380
2
                    (kinsert_char('|', pos, str) < 0) :
6381
805
                        (ploidy <= 1 && !((val0 >> 1)) ? //|. needs explicit o/p
6382
0
                            (kinsert_char('|', pos, str) < 0) :
6383
803
                            0);
6384
5.44k
        } else {
6385
            /* 1st allele is unphased, if ploidy is = 1 or allele is '.' or
6386
             ploidy > 1 and no other unphased allele exist, need to specify
6387
             explicitly */
6388
5.44k
            e |= ((ploidy <= 1 && val0 != 0) || (ploidy > 1 && !anyunphased)) ?
6389
3.27k
                    (kinsert_char('/', pos, str) < 0) :
6390
5.44k
                    0;
6391
5.44k
        }
6392
6.24k
    }
6393
8.64k
    return e == 0 ? 0 : -1;
6394
8.64k
}
6395
6396
/**
6397
 *  get_rlen - calculates and returns rlen value
6398
 *  @param h - bcf header
6399
 *  @param v - bcf data
6400
 *  Returns rlen calculated on success and -1 on failure.
6401
 *  rlen calculation is dependent on vcf version and a few other field data.
6402
 *  When bcf decoded data is available, refers it. When not available, retrieves
6403
 *  required field data by seeking on the data stream.
6404
 *  Ideally pos & version be set appropriately before any info/format field
6405
 *  update to have proper rlen calculation.
6406
 *  As version is not kept properly updated in practice, it is ignored in calcs.
6407
 */
6408
static int64_t get_rlen(const bcf_hdr_t *h, bcf1_t *v)
6409
26.2k
{
6410
26.2k
    uint8_t *f = (uint8_t*)v->shared.s, *t = NULL,
6411
26.2k
        *e = (uint8_t*)v->shared.s + v->shared.l;
6412
26.2k
    int size, type, id, lenid, endid, svlenid, i, bad, gvcf = 0, use_svlen = 0;
6413
26.2k
    bcf_info_t *endinfo = NULL, *svleninfo = NULL, end_lcl, svlen_lcl;
6414
26.2k
    bcf_fmt_t *lenfmt = NULL, len_lcl;
6415
6416
    //holds SVLEN allele status for the max no of alleles
6417
26.2k
    uint8_t svlenals[8192];
6418
    //pos from info END, fmt LEN, info SVLEN
6419
26.2k
    hts_pos_t end = 0, end_fmtlen = 0, end_svlen = 0, hpos;
6420
26.2k
    int64_t len_ref = 0, len = 0, tmp;
6421
26.2k
    endid = bcf_hdr_id2int(h, BCF_DT_ID, "END");
6422
6423
    //initialise bytes which are to be used
6424
26.2k
    memset(svlenals, 0, 1 + v->n_allele / 8);
6425
6426
    //use decoded data where ever available and where not, get from stream
6427
26.2k
    if (v->unpacked & BCF_UN_STR || v->d.shared_dirty & BCF1_DIRTY_ALS) {
6428
0
        for (i = 1; i < v->n_allele; ++i) {
6429
            // check only symbolic alt alleles
6430
0
            if (v->d.allele[i][0] != '<')
6431
0
                continue;
6432
0
            if (svlen_on_ref_for_vcf_alt(v->d.allele[i], -1)) {
6433
                // del, dup or cnv allele, note to check corresponding svlen val
6434
0
                svlenals[i >> 3] |= 1 << (i & 7);
6435
0
                use_svlen = 1;
6436
0
            } else if (!strcmp(v->d.allele[i], "<*>") ||
6437
0
                         !strcmp(v->d.allele[i], "<NON_REF>")) {
6438
0
                gvcf = 1;   //gvcf present, have to check for LEN field
6439
0
            }
6440
0
        }
6441
0
        f += v->unpack_size[0] + v->unpack_size[1];
6442
0
        len_ref = v->n_allele ? strlen(v->d.allele[0]) : 0;
6443
26.2k
    } else if (f < e) {
6444
        //skip ID
6445
26.2k
        size = bcf_dec_size(f, &f, &type);
6446
26.2k
        f += size << bcf_type_shift[type];
6447
        // REF, ALT
6448
1.43M
        for (i = 0; i < v->n_allele; ++i) {
6449
            //check all alleles, w/o NUL
6450
1.40M
            size = bcf_dec_size(f, &f, &type);
6451
1.40M
            if (!i) {   //REF length
6452
26.2k
                len_ref = size;
6453
1.38M
            } else if (size > 0 && *f == '<') {
6454
9.11k
                if (svlen_on_ref_for_vcf_alt((char *) f, size)) {
6455
                    // del, dup or cnv allele, note to check corresponding svlen val
6456
0
                    svlenals[i >> 3] |= 1 << (i & 7);
6457
0
                    use_svlen = 1;
6458
9.11k
                } else if ((size == 3 && !strncmp((char*)f, "<*>", size)) ||
6459
6.02k
                    (size == 9 && !strncmp((char*)f, "<NON_REF>", size))) {
6460
3.56k
                    gvcf = 1;   //gvcf present, have to check for LEN field
6461
3.56k
                }
6462
9.11k
            }
6463
1.40M
            f += size << bcf_type_shift[type];
6464
1.40M
        }
6465
26.2k
    }
6466
    // FILTER
6467
26.2k
    if (v->unpacked & BCF_UN_FLT) {
6468
0
        f += v->unpack_size[2];
6469
26.2k
    } else if (f < e) {
6470
26.2k
        size = bcf_dec_size(f, &f, &type);
6471
26.2k
        f += size << bcf_type_shift[type];
6472
26.2k
    }
6473
6474
    // Only do SVLEN lookup if there are suitable symbolic alleles
6475
26.2k
    svlenid = use_svlen ? bcf_hdr_id2int(h, BCF_DT_ID, "SVLEN") : -1;
6476
6477
    // INFO
6478
26.2k
    if (svlenid >= 0 || endid >= 0 ) {  //only if end/svlen present
6479
11.1k
        if (v->unpacked & BCF_UN_INFO || v->d.shared_dirty & BCF1_DIRTY_INF) {
6480
0
            endinfo = bcf_get_info(h, v, "END");
6481
0
            svleninfo = bcf_get_info(h, v, "SVLEN");
6482
11.1k
        } else if (f < e) {
6483
12.9k
            for (i = 0; i < v->n_info; ++i) {
6484
8.84k
                id = bcf_dec_typed_int1(f, &t);
6485
8.84k
                if (id == endid) {  //END
6486
1.05k
                    t = bcf_unpack_info_core1(f, &end_lcl);
6487
1.05k
                    endinfo = &end_lcl;
6488
1.05k
                    if (svleninfo || svlenid < 0) {
6489
1.05k
                        break;  //already got svlen or no need to search further
6490
1.05k
                    }
6491
7.79k
                } else if (id == svlenid) { //SVLEN
6492
0
                    t = bcf_unpack_info_core1(f, &svlen_lcl);
6493
0
                    svleninfo = &svlen_lcl;
6494
0
                    if (endinfo || endid < 0 ) {
6495
0
                        break;  //already got end or no need to search further
6496
0
                    }
6497
7.79k
                } else {
6498
7.79k
                    f = t;
6499
7.79k
                    size = bcf_dec_size(f, &t, &type);
6500
7.79k
                    t += size << bcf_type_shift[type];
6501
7.79k
                }
6502
7.79k
                f = t;
6503
7.79k
            }
6504
5.20k
        }
6505
11.1k
    }
6506
6507
    // Only do LEN lookup if a <*> allele was found
6508
26.2k
    lenid = gvcf ? bcf_hdr_id2int(h, BCF_DT_ID, "LEN") : -1;
6509
6510
    // FORMAT
6511
26.2k
    if (lenid >= 0) {
6512
        //with LEN and has gvcf allele
6513
0
        f = (uint8_t*)v->indiv.s; t = NULL; e = (uint8_t*)v->indiv.s + v->indiv.l;
6514
0
        if (v->unpacked & BCF_UN_FMT || v->d.indiv_dirty) {
6515
0
            lenfmt = bcf_get_fmt(h, v, "LEN");
6516
0
        } else if (f < e) {
6517
0
            for (i = 0; i < v->n_fmt; ++i) {
6518
0
                id = bcf_dec_typed_int1(f, &t);
6519
0
                if (id == lenid) {
6520
0
                        t = bcf_unpack_fmt_core1(f, v->n_sample, &len_lcl);
6521
0
                    lenfmt = &len_lcl;
6522
0
                    break;  //that's all needed
6523
0
                } else {
6524
0
                    f = t;
6525
0
                    size = bcf_dec_size(f, &t, &type);
6526
0
                    t += size * v->n_sample << bcf_type_shift[type];
6527
0
                }
6528
0
                f = t;
6529
0
            }
6530
0
        }
6531
0
    }
6532
    //got required data, find end and rlen
6533
26.2k
    if (endinfo && endinfo->vptr) { //end position given by info END
6534
        //end info exists, not being deleted
6535
1.05k
        end = endinfo->v1.i;
6536
1.05k
        switch(endinfo->type) {
6537
0
            case BCF_BT_INT8:  end = end == bcf_int8_missing ? 0 : end;  break;
6538
0
            case BCF_BT_INT16: end = end == bcf_int16_missing ? 0 : end; break;
6539
0
            case BCF_BT_INT32: end = end == bcf_int32_missing ? 0 : end; break;
6540
0
            case BCF_BT_INT64: end = end == bcf_int64_missing ? 0 : end; break;
6541
1.05k
            default: end = 0; break; //invalid
6542
1.05k
        }
6543
1.05k
    }
6544
6545
26.2k
    if (svleninfo && svleninfo->vptr) {
6546
        //svlen info exists, not being deleted
6547
0
        bad = 0;
6548
        //get largest svlen corresponding to a <DEL> symbolic allele
6549
0
        for (i = 0; i < svleninfo->len && i + 1 < v->n_allele; ++i) {
6550
0
            if (!(svlenals[i >> 3] & (1 << ((i + 1) & 7))))
6551
0
                continue;
6552
6553
0
            switch(svleninfo->type) {
6554
0
                case BCF_BT_INT8:
6555
0
                    tmp = le_to_i8(&svleninfo->vptr[i]);
6556
0
                    tmp = tmp == bcf_int8_missing ? 0 : tmp;
6557
0
                break;
6558
0
                case BCF_BT_INT16:
6559
0
                    tmp = le_to_i16(&svleninfo->vptr[i * 2]);
6560
0
                    tmp = tmp == bcf_int16_missing ? 0 : tmp;
6561
0
                break;
6562
0
                case BCF_BT_INT32:
6563
0
                    tmp = le_to_i32(&svleninfo->vptr[i * 4]);
6564
0
                    tmp = tmp == bcf_int32_missing ? 0 : tmp;
6565
0
                break;
6566
0
                case BCF_BT_INT64:
6567
0
                    tmp = le_to_i64(&svleninfo->vptr[i * 8]);
6568
0
                    tmp = tmp == bcf_int64_missing ? 0 : tmp;
6569
0
                break;
6570
0
                default: //invalid
6571
0
                    tmp = 0;
6572
0
                    bad = 1;
6573
0
                break;
6574
0
            }
6575
0
            if (bad) {  //stop svlen check
6576
0
                len = 0;
6577
0
                break;
6578
0
            }
6579
6580
0
            tmp = tmp < 0 ? llabs(tmp) : tmp;
6581
0
            if (len < tmp) len = tmp;
6582
0
        }
6583
0
    }
6584
26.2k
    if ((!svleninfo || !len) && end) { //no svlen, infer from end
6585
0
        len = end > v->pos ? end - v->pos - 1 : 0;
6586
0
    }
6587
26.2k
    end_svlen = v->pos + len + 1;   //end position found from SVLEN
6588
6589
26.2k
    len = 0;
6590
26.2k
    if (lenfmt && lenfmt->p) {
6591
        //fmt len exists, not being deleted, has gvcf and version >= 4.5
6592
0
        int j = 0;
6593
0
        int64_t offset = 0;
6594
0
        bad = 0;
6595
0
        for (i = 0; i < v->n_sample; ++i) {
6596
0
            for (j = 0; j < lenfmt->n; ++j) {
6597
0
                switch(lenfmt->type) {
6598
0
                case BCF_BT_INT8:
6599
0
                    tmp = le_to_i8(lenfmt->p + offset + j);
6600
0
                    tmp = tmp == bcf_int8_missing ? 0 : tmp;
6601
0
                break;
6602
0
                case BCF_BT_INT16:
6603
0
                    tmp = le_to_i16(lenfmt->p + offset + j * 2);
6604
0
                    tmp = tmp == bcf_int16_missing ? 0 : tmp;
6605
0
                break;
6606
0
                case BCF_BT_INT32:
6607
0
                    tmp = le_to_i32(lenfmt->p + offset + j * 4);
6608
0
                    tmp = tmp == bcf_int32_missing ? 0 : tmp;
6609
0
                break;
6610
0
                case BCF_BT_INT64:
6611
0
                    tmp = le_to_i64(lenfmt->p + offset + j * 8);
6612
0
                    tmp = tmp == bcf_int64_missing ? 0 : tmp;
6613
0
                break;
6614
0
                default: //invalid
6615
0
                    bad = 1;
6616
0
                break;
6617
0
                }
6618
0
                if (bad) {  //stop LEN check
6619
0
                    len = 0;
6620
0
                    break;
6621
0
                }
6622
                //assumes only gvcf have valid LEN
6623
0
                if (len < tmp) len = tmp;
6624
0
            }
6625
0
            offset += j << bcf_type_shift[lenfmt->type];
6626
0
        }
6627
0
    }
6628
26.2k
    if ((!lenfmt || !len) && end) { //no fmt len, infer from end
6629
0
        len = end > v->pos ? end - v->pos : 0;
6630
0
    }
6631
26.2k
    end_fmtlen = v->pos + len;  //end position found from LEN
6632
6633
    //get largest pos, based on END, SVLEN, fmt LEN and length using it
6634
26.2k
    hpos = end < end_svlen ?
6635
6.91k
            end_svlen < end_fmtlen ? end_fmtlen : end_svlen :
6636
26.2k
            end < end_fmtlen ? end_fmtlen : end;
6637
26.2k
    len = hpos - v->pos;
6638
6639
    //NOTE: 'end' calculation be in sync with tbx.c:tbx_parse1
6640
6641
    /* rlen to be calculated based on version, END, SVLEN, fmt LEN, ref len.
6642
    Relevance of these fields vary across different vcf versions.
6643
    Many times, these info/fmt fields are used without version updates;
6644
    hence these fields are used for calculation disregarding vcf version */
6645
26.2k
    return len < len_ref ? len_ref : len;
6646
26.2k
}