Coverage Report

Created: 2026-02-14 06:28

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/vcf.c
Line
Count
Source
1
/*  vcf.c -- VCF/BCF API functions.
2
3
    Copyright (C) 2012, 2013 Broad Institute.
4
    Copyright (C) 2012-2025 Genome Research Ltd.
5
    Portions copyright (C) 2014 Intel Corporation.
6
7
    Author: Heng Li <lh3@sanger.ac.uk>
8
9
Permission is hereby granted, free of charge, to any person obtaining a copy
10
of this software and associated documentation files (the "Software"), to deal
11
in the Software without restriction, including without limitation the rights
12
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13
copies of the Software, and to permit persons to whom the Software is
14
furnished to do so, subject to the following conditions:
15
16
The above copyright notice and this permission notice shall be included in
17
all copies or substantial portions of the Software.
18
19
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25
DEALINGS IN THE SOFTWARE.  */
26
27
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
28
#include <config.h>
29
30
#include <stdio.h>
31
#include <assert.h>
32
#include <string.h>
33
#include <strings.h>
34
#include <stdlib.h>
35
#include <limits.h>
36
#include <stdint.h>
37
#include <inttypes.h>
38
#include <errno.h>
39
40
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
41
#include "fuzz_settings.h"
42
#endif
43
44
#include "htslib/vcf.h"
45
#include "htslib/bgzf.h"
46
#include "htslib/tbx.h"
47
#include "htslib/hfile.h"
48
#include "hts_internal.h"
49
#include "htslib/hts_endian.h"
50
#include "htslib/khash_str2int.h"
51
#include "htslib/kstring.h"
52
#include "htslib/sam.h"
53
#include "htslib/khash.h"
54
#include "bgzf_internal.h"
55
56
#if 0
57
// This helps on Intel a bit, often 6-7% faster VCF parsing.
58
// Conversely sometimes harms AMD Zen4 as ~9% slower.
59
// Possibly related to IPC differences.  However for now it's just a
60
// curiousity we ignore and stick with the simpler code.
61
//
62
// Left here as a hint for future explorers.
63
static inline int xstreq(const char *a, const char *b) {
64
    while (*a && *a == *b)
65
        a++, b++;
66
    return *a == *b;
67
}
68
69
#define KHASH_MAP_INIT_XSTR(name, khval_t) \
70
  KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, xstreq)
71
72
KHASH_MAP_INIT_XSTR(vdict, bcf_idinfo_t)
73
#else
74
KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t)
75
#endif
76
77
typedef khash_t(vdict) vdict_t;
78
79
KHASH_MAP_INIT_STR(hdict, bcf_hrec_t*)
80
typedef khash_t(hdict) hdict_t;
81
82
83
#include "htslib/kseq.h"
84
HTSLIB_EXPORT
85
uint32_t bcf_float_missing    = 0x7F800001;
86
87
HTSLIB_EXPORT
88
uint32_t bcf_float_vector_end = 0x7F800002;
89
90
HTSLIB_EXPORT
91
uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
92
93
static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, NULL, NULL}, .id = -1 };
94
95
/*
96
    Partial support for 64-bit POS and Number=1 INFO tags.
97
    Notes:
98
     - the support for 64-bit values is motivated by POS and INFO/END for large genomes
99
     - the use of 64-bit values does not conform to the specification
100
     - cannot output 64-bit BCF and if it does, it is not compatible with anything
101
     - experimental, use at your risk
102
*/
103
#ifdef VCF_ALLOW_INT64
104
    #define BCF_MAX_BT_INT64 (0x7fffffffffffffff)       /* INT64_MAX, for internal use only */
105
    #define BCF_MIN_BT_INT64 -9223372036854775800LL     /* INT64_MIN + 8, for internal use only */
106
#endif
107
108
790
#define BCF_IS_64BIT (1<<30)
109
110
111
// Opaque structure with auxilary data which allows to extend bcf_hdr_t without breaking ABI.
112
// Note that this preserving API and ABI requires that the first element is vdict_t struct
113
// rather than a pointer, as user programs may (and in some cases do) access the dictionary
114
// directly as (vdict_t*)hdr->dict.
115
typedef struct
116
{
117
    vdict_t dict;   // bcf_hdr_t.dict[0] vdict_t dictionary which keeps bcf_idinfo_t for BCF_HL_FLT,BCF_HL_INFO,BCF_HL_FMT
118
    hdict_t *gen;   // hdict_t dictionary which keeps bcf_hrec_t* pointers for generic and structured fields
119
    size_t *key_len;// length of h->id[BCF_DT_ID] strings
120
    int version;    //cached version
121
    uint32_t ref_count; // reference count, low bit indicates bcf_hdr_destroy() has been called
122
}
123
bcf_hdr_aux_t;
124
125
static inline bcf_hdr_aux_t *get_hdr_aux(const bcf_hdr_t *hdr)
126
254k
{
127
254k
    return (bcf_hdr_aux_t *)hdr->dict[0];
128
254k
}
129
130
//version macros
131
117k
#define VCF_DEF 4002000
132
28.1k
#define VCF44   4004000
133
23.8k
#define VCF45   4005000
134
135
#define VCF_MAJOR_VER(x) ( (x) / 10000 / 100 )
136
#define VCF_MINOR_VER(x) ( ((x) % 1000000) / 1000 )
137
138
/**
139
 *  bcf_get_version - get the version as int
140
 *  @param hdr   - bcf header, to get version
141
 *  @param verstr- version string, which is already available
142
 *  Returns version on success and default version on failure
143
 *  version = major * 100 * 10000 + minor * 1000
144
 */
145
static int bcf_get_version(const bcf_hdr_t *hdr, const char *verstr)
146
16.3k
{
147
16.3k
    const char *version = NULL, vcf[] = "VCFv";
148
16.3k
    char *major = NULL, *minor = NULL;
149
16.3k
    int ver = -1;
150
16.3k
    long tmp = 0;
151
16.3k
    bcf_hdr_aux_t *aux = NULL;
152
153
16.3k
    if (!hdr && !verstr) {  //invalid input
154
0
        goto fail;
155
0
    }
156
157
16.3k
    if (hdr) {
158
12.4k
        if ((aux = get_hdr_aux(hdr)) && aux->version != 0) {    //use cached version
159
11.6k
            return aux->version;
160
11.6k
        }
161
        //get from header
162
798
        version = bcf_hdr_get_version(hdr);
163
3.88k
    } else {
164
        //get from version string
165
3.88k
        version = verstr;
166
3.88k
    }
167
4.68k
    if (!(major = strstr(version, vcf))) {  //bad format
168
3.31k
        goto fail;
169
3.31k
    }
170
1.36k
    major += sizeof(vcf) - 1;
171
1.36k
    if (!(minor = strchr(major, '.'))) {    //bad format
172
184
        goto fail;
173
184
    }
174
1.18k
    tmp = strtol(major, NULL, 10);
175
1.18k
    if ((!tmp && errno == EINVAL) ||
176
1.11k
        ((tmp == LONG_MIN || tmp == LONG_MAX) && errno == ERANGE)) {    //failed
177
67
        goto fail;
178
67
    }
179
1.11k
    ver = tmp * 100 * 10000;
180
1.11k
    tmp = strtol(++minor, NULL, 10);
181
1.11k
    if ((!tmp && errno == EINVAL) ||
182
1.05k
        ((tmp == LONG_MIN || tmp == LONG_MAX) && errno == ERANGE)) {    //failed
183
255
        goto fail;
184
255
    }
185
859
    ver += tmp * 1000;
186
859
    return ver;
187
188
3.82k
fail:
189
3.82k
    hts_log_warning("Couldn't get VCF version, considering as %d.%d",
190
3.82k
        VCF_MAJOR_VER(VCF_DEF), VCF_MINOR_VER(VCF_DEF));
191
3.82k
    return VCF_DEF;
192
1.11k
}
193
194
// Header reference counting
195
196
static void bcf_hdr_incr_ref(bcf_hdr_t *h)
197
1.79k
{
198
1.79k
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
199
1.79k
    aux->ref_count += 2;
200
1.79k
}
201
202
static void bcf_hdr_decr_ref(bcf_hdr_t *h)
203
1.79k
{
204
1.79k
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
205
1.79k
    if (aux->ref_count >= 2)
206
1.79k
        aux->ref_count -= 2;
207
208
1.79k
    if (aux->ref_count == 0)
209
1.64k
        bcf_hdr_destroy(h);
210
1.79k
}
211
212
static void hdr_bgzf_private_data_cleanup(void *data)
213
1.79k
{
214
1.79k
    bcf_hdr_t *h = (bcf_hdr_t *) data;
215
1.79k
    bcf_hdr_decr_ref(h);
216
1.79k
}
217
218
static char *find_chrom_header_line(char *s)
219
0
{
220
0
    char *nl;
221
0
    if (strncmp(s, "#CHROM\t", 7) == 0) return s;
222
0
    else if ((nl = strstr(s, "\n#CHROM\t")) != NULL) return nl+1;
223
0
    else return NULL;
224
0
}
225
226
static int64_t get_rlen(const bcf_hdr_t *h, bcf1_t *v);
227
228
/*************************
229
 *** VCF header parser ***
230
 *************************/
231
232
static int bcf_hdr_add_sample_len(bcf_hdr_t *h, const char *s, size_t len)
233
5.21k
{
234
5.21k
    const char *ss = s;
235
5.30k
    while ( *ss && isspace_c(*ss) && ss - s < len) ss++;
236
5.21k
    if ( !*ss || ss - s == len)
237
2
    {
238
2
        hts_log_error("Empty sample name: trailing spaces/tabs in the header line?");
239
2
        return -1;
240
2
    }
241
242
5.20k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_SAMPLE];
243
5.20k
    int ret;
244
5.20k
    char *sdup = malloc(len + 1);
245
5.20k
    if (!sdup) return -1;
246
5.20k
    memcpy(sdup, s, len);
247
5.20k
    sdup[len] = 0;
248
249
    // Ensure space is available in h->samples
250
5.20k
    size_t n = kh_size(d);
251
5.20k
    char **new_samples = realloc(h->samples, sizeof(char*) * (n + 1));
252
5.20k
    if (!new_samples) {
253
0
        free(sdup);
254
0
        return -1;
255
0
    }
256
5.20k
    h->samples = new_samples;
257
258
5.20k
    int k = kh_put(vdict, d, sdup, &ret);
259
5.20k
    if (ret < 0) {
260
0
        free(sdup);
261
0
        return -1;
262
0
    }
263
5.20k
    if (ret) { // absent
264
5.20k
        kh_val(d, k) = bcf_idinfo_def;
265
5.20k
        kh_val(d, k).id = n;
266
5.20k
    } else {
267
2
        hts_log_error("Duplicated sample name '%s'", sdup);
268
2
        free(sdup);
269
2
        return -1;
270
2
    }
271
5.20k
    h->samples[n] = sdup;
272
5.20k
    h->dirty = 1;
273
5.20k
    return 0;
274
5.20k
}
275
276
int bcf_hdr_add_sample(bcf_hdr_t *h, const char *s)
277
0
{
278
0
    if (!s) {
279
        // Allowed for backwards-compatibility, calling with s == NULL
280
        // used to trigger bcf_hdr_sync(h);
281
0
        return 0;
282
0
    }
283
0
    return bcf_hdr_add_sample_len(h, s, strlen(s));
284
0
}
285
286
int HTS_RESULT_USED bcf_hdr_parse_sample_line(bcf_hdr_t *hdr, const char *str)
287
3.04k
{
288
3.04k
    const char *mandatory = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO";
289
3.04k
    if ( strncmp(str,mandatory,strlen(mandatory)) )
290
44
    {
291
44
        hts_log_error("Could not parse the \"#CHROM..\" line, either the fields are incorrect or spaces are present instead of tabs:\n\t%s",str);
292
44
        return -1;
293
44
    }
294
295
3.00k
    const char *beg = str + strlen(mandatory), *end;
296
3.00k
    if ( !*beg || *beg=='\n' ) return 0;
297
948
    if ( strncmp(beg,"\tFORMAT\t",8) )
298
14
    {
299
14
        hts_log_error("Could not parse the \"#CHROM..\" line, either FORMAT is missing or spaces are present instead of tabs:\n\t%s",str);
300
14
        return -1;
301
14
    }
302
934
    beg += 8;
303
304
934
    int ret = 0;
305
5.22k
    while ( *beg )
306
5.21k
    {
307
5.21k
        end = beg;
308
36.5M
        while ( *end && *end!='\t' && *end!='\n' ) end++;
309
5.21k
        if ( bcf_hdr_add_sample_len(hdr, beg, end-beg) < 0 ) ret = -1;
310
5.21k
        if ( !*end || *end=='\n' || ret<0 ) break;
311
4.28k
        beg = end + 1;
312
4.28k
    }
313
934
    return ret;
314
948
}
315
316
int bcf_hdr_sync(bcf_hdr_t *h)
317
68.3k
{
318
68.3k
    int i;
319
273k
    for (i = 0; i < 3; i++)
320
204k
    {
321
204k
        vdict_t *d = (vdict_t*)h->dict[i];
322
204k
        khint_t k;
323
204k
        if ( h->n[i] < kh_size(d) )
324
928
        {
325
928
            bcf_idpair_t *new_idpair;
326
            // this should be true only for i=2, BCF_DT_SAMPLE
327
928
            new_idpair = (bcf_idpair_t*) realloc(h->id[i], kh_size(d)*sizeof(bcf_idpair_t));
328
928
            if (!new_idpair) return -1;
329
928
            h->n[i] = kh_size(d);
330
928
            h->id[i] = new_idpair;
331
928
        }
332
2.37G
        for (k=kh_begin(d); k<kh_end(d); k++)
333
2.37G
        {
334
2.37G
            if (!kh_exist(d,k)) continue;
335
21.8M
            h->id[i][kh_val(d,k).id].key = kh_key(d,k);
336
21.8M
            h->id[i][kh_val(d,k).id].val = &kh_val(d,k);
337
21.8M
        }
338
204k
    }
339
340
    // Invalidate key length cache
341
68.3k
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
342
68.3k
    if (aux && aux->key_len) {
343
2.18k
        free(aux->key_len);
344
2.18k
        aux->key_len = NULL;
345
2.18k
    }
346
347
68.3k
    h->dirty = 0;
348
68.3k
    return 0;
349
68.3k
}
350
351
void bcf_hrec_destroy(bcf_hrec_t *hrec)
352
138k
{
353
138k
    if (!hrec) return;
354
133k
    free(hrec->key);
355
133k
    if ( hrec->value ) free(hrec->value);
356
133k
    int i;
357
442k
    for (i=0; i<hrec->nkeys; i++)
358
309k
    {
359
309k
        free(hrec->keys[i]);
360
309k
        free(hrec->vals[i]);
361
309k
    }
362
133k
    free(hrec->keys);
363
133k
    free(hrec->vals);
364
133k
    free(hrec);
365
133k
}
366
367
// Copies all fields except IDX.
368
bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec)
369
0
{
370
0
    int save_errno;
371
0
    bcf_hrec_t *out = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
372
0
    if (!out) return NULL;
373
374
0
    out->type = hrec->type;
375
0
    if ( hrec->key ) {
376
0
        out->key = strdup(hrec->key);
377
0
        if (!out->key) goto fail;
378
0
    }
379
0
    if ( hrec->value ) {
380
0
        out->value = strdup(hrec->value);
381
0
        if (!out->value) goto fail;
382
0
    }
383
0
    out->nkeys = hrec->nkeys;
384
0
    out->keys = (char**) malloc(sizeof(char*)*hrec->nkeys);
385
0
    if (!out->keys) goto fail;
386
0
    out->vals = (char**) malloc(sizeof(char*)*hrec->nkeys);
387
0
    if (!out->vals) goto fail;
388
0
    int i, j = 0;
389
0
    for (i=0; i<hrec->nkeys; i++)
390
0
    {
391
0
        if ( hrec->keys[i] && !strcmp("IDX",hrec->keys[i]) ) continue;
392
0
        if ( hrec->keys[i] ) {
393
0
            out->keys[j] = strdup(hrec->keys[i]);
394
0
            if (!out->keys[j]) goto fail;
395
0
        }
396
0
        if ( hrec->vals[i] ) {
397
0
            out->vals[j] = strdup(hrec->vals[i]);
398
0
            if (!out->vals[j]) goto fail;
399
0
        }
400
0
        j++;
401
0
    }
402
0
    if ( i!=j ) out->nkeys -= i-j;   // IDX was omitted
403
0
    return out;
404
405
0
 fail:
406
0
    save_errno = errno;
407
0
    hts_log_error("%s", strerror(errno));
408
0
    bcf_hrec_destroy(out);
409
0
    errno = save_errno;
410
0
    return NULL;
411
0
}
412
413
void bcf_hrec_debug(FILE *fp, bcf_hrec_t *hrec)
414
0
{
415
0
    fprintf(fp, "key=[%s] value=[%s]", hrec->key, hrec->value?hrec->value:"");
416
0
    int i;
417
0
    for (i=0; i<hrec->nkeys; i++)
418
0
        fprintf(fp, "\t[%s]=[%s]", hrec->keys[i],hrec->vals[i]);
419
0
    fprintf(fp, "\n");
420
0
}
421
422
void bcf_header_debug(bcf_hdr_t *hdr)
423
0
{
424
0
    int i, j;
425
0
    for (i=0; i<hdr->nhrec; i++)
426
0
    {
427
0
        if ( !hdr->hrec[i]->value )
428
0
        {
429
0
            fprintf(stderr, "##%s=<", hdr->hrec[i]->key);
430
0
            fprintf(stderr,"%s=%s", hdr->hrec[i]->keys[0], hdr->hrec[i]->vals[0]);
431
0
            for (j=1; j<hdr->hrec[i]->nkeys; j++)
432
0
                fprintf(stderr,",%s=%s", hdr->hrec[i]->keys[j], hdr->hrec[i]->vals[j]);
433
0
            fprintf(stderr,">\n");
434
0
        }
435
0
        else
436
0
            fprintf(stderr,"##%s=%s\n", hdr->hrec[i]->key,hdr->hrec[i]->value);
437
0
    }
438
0
}
439
440
int bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, size_t len)
441
237k
{
442
237k
    char **tmp;
443
237k
    size_t n = hrec->nkeys + 1;
444
237k
    assert(len > 0 && len < SIZE_MAX);
445
237k
    tmp = realloc(hrec->keys, sizeof(char*)*n);
446
237k
    if (!tmp) return -1;
447
237k
    hrec->keys = tmp;
448
237k
    tmp = realloc(hrec->vals, sizeof(char*)*n);
449
237k
    if (!tmp) return -1;
450
237k
    hrec->vals = tmp;
451
452
237k
    hrec->keys[hrec->nkeys] = (char*) malloc((len+1)*sizeof(char));
453
237k
    if (!hrec->keys[hrec->nkeys]) return -1;
454
237k
    memcpy(hrec->keys[hrec->nkeys],str,len);
455
237k
    hrec->keys[hrec->nkeys][len] = 0;
456
237k
    hrec->vals[hrec->nkeys] = NULL;
457
237k
    hrec->nkeys = n;
458
237k
    return 0;
459
237k
}
460
461
int bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, size_t len, int is_quoted)
462
237k
{
463
237k
    if ( hrec->vals[i] ) {
464
0
        free(hrec->vals[i]);
465
0
        hrec->vals[i] = NULL;
466
0
    }
467
237k
    if ( !str ) return 0;
468
237k
    if ( is_quoted )
469
67.6k
    {
470
67.6k
        if (len >= SIZE_MAX - 3) {
471
0
            errno = ENOMEM;
472
0
            return -1;
473
0
        }
474
67.6k
        hrec->vals[i] = (char*) malloc((len+3)*sizeof(char));
475
67.6k
        if (!hrec->vals[i]) return -1;
476
67.6k
        hrec->vals[i][0] = '"';
477
67.6k
        memcpy(&hrec->vals[i][1],str,len);
478
67.6k
        hrec->vals[i][len+1] = '"';
479
67.6k
        hrec->vals[i][len+2] = 0;
480
67.6k
    }
481
170k
    else
482
170k
    {
483
170k
        if (len == SIZE_MAX) {
484
0
            errno = ENOMEM;
485
0
            return -1;
486
0
        }
487
170k
        hrec->vals[i] = (char*) malloc((len+1)*sizeof(char));
488
170k
        if (!hrec->vals[i]) return -1;
489
170k
        memcpy(hrec->vals[i],str,len);
490
170k
        hrec->vals[i][len] = 0;
491
170k
    }
492
237k
    return 0;
493
237k
}
494
495
int hrec_add_idx(bcf_hrec_t *hrec, int idx)
496
71.4k
{
497
71.4k
    int n = hrec->nkeys + 1;
498
71.4k
    char **tmp = (char**) realloc(hrec->keys, sizeof(char*)*n);
499
71.4k
    if (!tmp) return -1;
500
71.4k
    hrec->keys = tmp;
501
502
71.4k
    tmp = (char**) realloc(hrec->vals, sizeof(char*)*n);
503
71.4k
    if (!tmp) return -1;
504
71.4k
    hrec->vals = tmp;
505
506
71.4k
    hrec->keys[hrec->nkeys] = strdup("IDX");
507
71.4k
    if (!hrec->keys[hrec->nkeys]) return -1;
508
509
71.4k
    kstring_t str = {0,0,0};
510
71.4k
    if (kputw(idx, &str) < 0) {
511
0
        free(hrec->keys[hrec->nkeys]);
512
0
        return -1;
513
0
    }
514
71.4k
    hrec->vals[hrec->nkeys] = str.s;
515
71.4k
    hrec->nkeys = n;
516
71.4k
    return 0;
517
71.4k
}
518
519
int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key)
520
90.5k
{
521
90.5k
    int i;
522
144k
    for (i=0; i<hrec->nkeys; i++)
523
107k
        if ( !strcasecmp(key,hrec->keys[i]) ) return i;
524
37.1k
    return -1;
525
90.5k
}
526
527
static void bcf_hrec_set_type(bcf_hrec_t *hrec)
528
251k
{
529
251k
    if ( !strcmp(hrec->key, "contig") ) hrec->type = BCF_HL_CTG;
530
228k
    else if ( !strcmp(hrec->key, "INFO") ) hrec->type = BCF_HL_INFO;
531
151k
    else if ( !strcmp(hrec->key, "FILTER") ) hrec->type = BCF_HL_FLT;
532
76.3k
    else if ( !strcmp(hrec->key, "FORMAT") ) hrec->type = BCF_HL_FMT;
533
59.6k
    else if ( hrec->nkeys>0 ) hrec->type = BCF_HL_STR;
534
47.7k
    else hrec->type = BCF_HL_GEN;
535
251k
}
536
537
538
/**
539
    The arrays were generated with
540
541
    valid_ctg:
542
        perl -le '@v = (split(//,q[!#$%&*+./:;=?@^_|~-]),"a"..."z","A"..."Z","0"..."9"); @a = (0) x 256; foreach $c (@v) { $a[ord($c)] = 1; } print join(", ",@a)' | fold -w 48
543
544
    valid_tag:
545
        perl -le '@v = (split(//,q[_.]),"a"..."z","A"..."Z","0"..."9"); @a = (0) x 256; foreach $c (@v) { $a[ord($c)] = 1; } print join(", ",@a)' | fold -w 48
546
*/
547
static const uint8_t valid_ctg[256] =
548
{
549
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
550
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
551
    0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1,
552
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
553
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
554
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
555
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
556
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,
557
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
558
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
559
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
560
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
561
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
562
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
563
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
564
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
565
};
566
static const uint8_t valid_tag[256] =
567
{
568
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
569
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
570
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
571
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
572
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
573
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
574
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
575
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
576
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
577
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
578
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
579
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
580
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
581
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
582
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
583
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
584
};
585
586
/**
587
    bcf_hrec_check() - check the validity of structured header lines
588
589
    Returns 0 on success or negative value on error.
590
591
    Currently the return status is not checked by the caller
592
    and only a warning is printed on stderr. This should be improved
593
    to propagate the error all the way up to the caller and let it
594
    decide what to do: throw an error or proceed anyway.
595
 */
596
static int bcf_hrec_check(bcf_hrec_t *hrec)
597
125k
{
598
125k
    int i;
599
125k
    bcf_hrec_set_type(hrec);
600
601
125k
    if ( hrec->type==BCF_HL_CTG )
602
11.5k
    {
603
11.5k
        i = bcf_hrec_find_key(hrec,"ID");
604
11.5k
        if ( i<0 ) goto err_missing_id;
605
5.39k
        char *val = hrec->vals[i];
606
5.39k
        if ( val[0]=='*' || val[0]=='=' || !valid_ctg[(uint8_t)val[0]] ) goto err_invalid_ctg;
607
67.7k
        while ( *(++val) )
608
67.6k
            if ( !valid_ctg[(uint8_t)*val] ) goto err_invalid_ctg;
609
172
        return 0;
610
763
    }
611
114k
    if ( hrec->type==BCF_HL_INFO )
612
38.1k
    {
613
38.1k
        i = bcf_hrec_find_key(hrec,"ID");
614
38.1k
        if ( i<0 ) goto err_missing_id;
615
28.4k
        char *val = hrec->vals[i];
616
28.4k
        if ( !strcmp(val,"1000G") ) return 0;
617
28.3k
        if ( val[0]=='.' || (val[0]>='0' && val[0]<='9') || !valid_tag[(uint8_t)val[0]] ) goto err_invalid_tag;
618
11.2k
        while ( *(++val) )
619
9.84k
            if ( !valid_tag[(uint8_t)*val] ) goto err_invalid_tag;
620
1.40k
        return 0;
621
3.29k
    }
622
75.8k
    if ( hrec->type==BCF_HL_FMT )
623
8.34k
    {
624
8.34k
        i = bcf_hrec_find_key(hrec,"ID");
625
8.34k
        if ( i<0 ) goto err_missing_id;
626
7.14k
        char *val = hrec->vals[i];
627
7.14k
        if ( val[0]=='.' || (val[0]>='0' && val[0]<='9') || !valid_tag[(uint8_t)val[0]] ) goto err_invalid_tag;
628
8.91k
        while ( *(++val) )
629
6.94k
            if ( !valid_tag[(uint8_t)*val] ) goto err_invalid_tag;
630
1.97k
        return 0;
631
3.75k
    }
632
67.4k
    return 0;
633
634
17.1k
  err_missing_id:
635
17.1k
    hts_log_warning("Missing ID attribute in one or more header lines");
636
17.1k
    return -1;
637
638
5.22k
  err_invalid_ctg:
639
5.22k
    hts_log_warning("Invalid contig name: \"%s\"", hrec->vals[i]);
640
5.22k
    return -1;
641
642
32.1k
  err_invalid_tag:
643
32.1k
    hts_log_warning("Invalid tag name: \"%s\"", hrec->vals[i]);
644
32.1k
    return -1;
645
75.8k
}
646
647
static inline int is_escaped(const char *min, const char *str)
648
67.6k
{
649
67.6k
    int n = 0;
650
67.7k
    while ( --str>=min && *str=='\\' ) n++;
651
67.6k
    return n%2;
652
67.6k
}
653
654
bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len)
655
143k
{
656
143k
    bcf_hrec_t *hrec = NULL;
657
143k
    const char *p = line;
658
143k
    if (p[0] != '#' || p[1] != '#') { *len = 0; return NULL; }
659
138k
    p += 2;
660
661
138k
    const char *q = p;
662
1.31M
    while ( *q && *q!='=' && *q != '\n' ) q++;
663
138k
    ptrdiff_t n = q-p;
664
138k
    if ( *q!='=' || !n ) // wrong format
665
5.50k
        goto malformed_line;
666
667
133k
    hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
668
133k
    if (!hrec) { *len = -1; return NULL; }
669
133k
    hrec->key = (char*) malloc(sizeof(char)*(n+1));
670
133k
    if (!hrec->key) goto fail;
671
133k
    memcpy(hrec->key,p,n);
672
133k
    hrec->key[n] = 0;
673
133k
    hrec->type = -1;
674
675
133k
    p = ++q;
676
133k
    if ( *p!='<' ) // generic field, e.g. ##samtoolsVersion=0.1.18-r579
677
27.8k
    {
678
9.22M
        while ( *q && *q!='\n' ) q++;
679
27.8k
        hrec->value = (char*) malloc((q-p+1)*sizeof(char));
680
27.8k
        if (!hrec->value) goto fail;
681
27.8k
        memcpy(hrec->value, p, q-p);
682
27.8k
        hrec->value[q-p] = 0;
683
27.8k
        *len = q - line + (*q ? 1 : 0); // Skip \n but not \0
684
27.8k
        return hrec;
685
27.8k
    }
686
687
    // structured line, e.g.
688
    // ##INFO=<ID=PV1,Number=1,Type=Float,Description="P-value for baseQ bias">
689
    // ##PEDIGREE=<Name_0=G0-ID,Name_1=G1-ID,Name_3=GN-ID>
690
105k
    int nopen = 1;
691
342k
    while ( *q && *q!='\n' && nopen>0 )
692
245k
    {
693
245k
        p = ++q;
694
247k
        while ( *q && *q==' ' ) { p++; q++; }
695
        // ^[A-Za-z_][0-9A-Za-z_.]*$
696
245k
        if (p==q && *q && (isalpha_c(*q) || *q=='_'))
697
242k
        {
698
242k
            q++;
699
1.36M
            while ( *q && (isalnum_c(*q) || *q=='_' || *q=='.') ) q++;
700
242k
        }
701
245k
        n = q-p;
702
245k
        int m = 0;
703
245k
        while ( *q && *q==' ' ) { q++; m++; }
704
245k
        if ( *q!='=' || !n )
705
7.39k
            goto malformed_line;
706
707
237k
        if (bcf_hrec_add_key(hrec, p, q-p-m) < 0) goto fail;
708
237k
        p = ++q;
709
239k
        while ( *q && *q==' ' ) { p++; q++; }
710
711
237k
        int quoted = 0;
712
237k
        char ending = '\0';
713
237k
        switch (*p) {
714
67.6k
        case '"':
715
67.6k
            quoted = 1;
716
67.6k
            ending = '"';
717
67.6k
            p++;
718
67.6k
            break;
719
175
        case '[':
720
175
            quoted = 1;
721
175
            ending = ']';
722
175
            break;
723
237k
        }
724
237k
        if ( quoted ) q++;
725
235M
        while ( *q && *q != '\n' )
726
235M
        {
727
235M
            if ( quoted ) { if ( *q==ending && !is_escaped(p,q) ) break; }
728
234M
            else
729
234M
            {
730
234M
                if ( *q=='<' ) nopen++;
731
234M
                if ( *q=='>' ) nopen--;
732
234M
                if ( !nopen ) break;
733
234M
                if ( *q==',' && nopen==1 ) break;
734
234M
            }
735
235M
            q++;
736
235M
        }
737
237k
        const char *r = q;
738
237k
        if (quoted && ending == ']') {
739
175
            if (*q == ending) {
740
156
                r++;
741
156
                q++;
742
156
                quoted = 0;
743
156
            } else {
744
19
                char buffer[320];
745
19
                hts_log_error("Missing ']' in header line %s",
746
19
                              hts_strprint(buffer, sizeof(buffer), '"',
747
19
                                           line, q-line));
748
19
                goto fail;
749
19
            }
750
175
        }
751
238k
        while ( r > p && r[-1] == ' ' ) r--;
752
237k
        if (bcf_hrec_set_val(hrec, hrec->nkeys-1, p, r-p, quoted) < 0)
753
0
            goto fail;
754
237k
        if ( quoted && *q==ending ) q++;
755
237k
        if ( *q=='>' )
756
72.0k
        {
757
72.0k
            if (nopen) nopen--;     // this can happen with nested angle brackets <>
758
72.0k
            q++;
759
72.0k
        }
760
237k
    }
761
97.8k
    if ( nopen )
762
25.7k
        hts_log_warning("Incomplete header line, trying to proceed anyway:\n\t[%s]\n\t[%d]",line,q[0]);
763
764
    // Skip to end of line
765
97.8k
    int nonspace = 0;
766
97.8k
    p = q;
767
968k
    while ( *q && *q!='\n' ) { nonspace |= !isspace_c(*q); q++; }
768
97.8k
    if (nonspace) {
769
1.04k
        char buffer[320];
770
1.04k
        hts_log_warning("Dropped trailing junk from header line '%s'",
771
1.04k
                        hts_strprint(buffer, sizeof(buffer),
772
1.04k
                                     '"', line, q - line));
773
1.04k
    }
774
775
97.8k
    *len = q - line + (*q ? 1 : 0);
776
97.8k
    return hrec;
777
778
19
 fail:
779
19
    *len = -1;
780
19
    bcf_hrec_destroy(hrec);
781
19
    return NULL;
782
783
12.9k
 malformed_line:
784
12.9k
    {
785
12.9k
        char buffer[320];
786
3.46M
        while ( *q && *q!='\n' ) q++;  // Ensure *len includes full line
787
12.9k
        hts_log_error("Could not parse the header line: %s",
788
12.9k
                      hts_strprint(buffer, sizeof(buffer),
789
12.9k
                                   '"', line, q - line));
790
12.9k
        *len = q - line + (*q ? 1 : 0);
791
12.9k
        bcf_hrec_destroy(hrec);
792
12.9k
        return NULL;
793
105k
    }
794
105k
}
795
796
static int bcf_hdr_set_idx(bcf_hdr_t *hdr, int dict_type, const char *tag, bcf_idinfo_t *idinfo)
797
70.6k
{
798
70.6k
    size_t new_n;
799
800
    // If available, preserve existing IDX
801
70.6k
    if ( idinfo->id==-1 )
802
70.3k
        idinfo->id = hdr->n[dict_type];
803
349
    else if ( idinfo->id < hdr->n[dict_type] && hdr->id[dict_type][idinfo->id].key )
804
1
    {
805
1
        hts_log_error("Conflicting IDX=%d lines in the header dictionary, the new tag is %s",
806
1
            idinfo->id, tag);
807
1
        errno = EINVAL;
808
1
        return -1;
809
1
    }
810
811
70.6k
    new_n = idinfo->id >= hdr->n[dict_type] ? idinfo->id+1 : hdr->n[dict_type];
812
70.6k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
813
    // hts_resize() can attempt to allocate up to 2 * requested items
814
70.6k
    if (new_n > FUZZ_ALLOC_LIMIT/(2 * sizeof(bcf_idpair_t)))
815
18
        return -1;
816
70.6k
#endif
817
70.6k
    if (hts_resize(bcf_idpair_t, new_n, &hdr->m[dict_type],
818
70.6k
                   &hdr->id[dict_type], HTS_RESIZE_CLEAR)) {
819
0
        return -1;
820
0
    }
821
70.6k
    hdr->n[dict_type] = new_n;
822
823
    // NB: the next kh_put call can invalidate the idinfo pointer, therefore
824
    // we leave it unassigned here. It must be set explicitly in bcf_hdr_sync.
825
70.6k
    hdr->id[dict_type][idinfo->id].key = tag;
826
827
70.6k
    return 0;
828
70.6k
}
829
830
// returns: 1 when hdr needs to be synced, -1 on error, 0 otherwise
831
static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
832
125k
{
833
    // contig
834
125k
    int i, ret, replacing = 0;
835
125k
    khint_t k;
836
125k
    char *str = NULL;
837
838
125k
    bcf_hrec_set_type(hrec);
839
840
125k
    if ( hrec->type==BCF_HL_CTG )
841
11.5k
    {
842
11.5k
        hts_pos_t len = 0;
843
844
        // Get the contig ID ($str) and length ($j)
845
11.5k
        i = bcf_hrec_find_key(hrec,"length");
846
11.5k
        if ( i<0 ) len = 0;
847
2.64k
        else {
848
2.64k
            char *end = hrec->vals[i];
849
2.64k
            len = strtoll(hrec->vals[i], &end, 10);
850
2.64k
            if (end == hrec->vals[i] || len < 0) return 0;
851
2.64k
        }
852
853
10.6k
        i = bcf_hrec_find_key(hrec,"ID");
854
10.6k
        if ( i<0 ) return 0;
855
5.39k
        str = strdup(hrec->vals[i]);
856
5.39k
        if (!str) return -1;
857
858
        // Register in the dictionary
859
5.39k
        vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_CTG];
860
5.39k
        khint_t k = kh_get(vdict, d, str);
861
5.39k
        if ( k != kh_end(d) ) { // already present
862
1.21k
            free(str); str=NULL;
863
1.21k
            if (kh_val(d, k).hrec[0] != NULL) // and not removed
864
1.21k
                return 0;
865
0
            replacing = 1;
866
4.17k
        } else {
867
4.17k
            k = kh_put(vdict, d, str, &ret);
868
4.17k
            if (ret < 0) { free(str); return -1; }
869
4.17k
        }
870
871
4.17k
        int idx = bcf_hrec_find_key(hrec,"IDX");
872
4.17k
        if ( idx!=-1 )
873
801
        {
874
801
            char *tmp = hrec->vals[idx];
875
801
            idx = strtol(hrec->vals[idx], &tmp, 10);
876
801
            if ( *tmp || idx < 0 || idx >= INT_MAX - 1)
877
768
            {
878
768
                if (!replacing) {
879
768
                    kh_del(vdict, d, k);
880
768
                    free(str);
881
768
                }
882
768
                hts_log_warning("Error parsing the IDX tag, skipping");
883
768
                return 0;
884
768
            }
885
801
        }
886
887
3.40k
        kh_val(d, k) = bcf_idinfo_def;
888
3.40k
        kh_val(d, k).id = idx;
889
3.40k
        kh_val(d, k).info[0] = len;
890
3.40k
        kh_val(d, k).hrec[0] = hrec;
891
3.40k
        if (bcf_hdr_set_idx(hdr, BCF_DT_CTG, kh_key(d,k), &kh_val(d,k)) < 0) {
892
19
            if (!replacing) {
893
19
                kh_del(vdict, d, k);
894
19
                free(str);
895
19
            }
896
19
            return -1;
897
19
        }
898
3.38k
        if ( idx==-1 ) {
899
3.37k
            if (hrec_add_idx(hrec, kh_val(d,k).id) < 0) {
900
0
               return -1;
901
0
            }
902
3.37k
        }
903
904
3.38k
        return 1;
905
3.38k
    }
906
907
114k
    if ( hrec->type==BCF_HL_STR ) return 1;
908
108k
    if ( hrec->type!=BCF_HL_INFO && hrec->type!=BCF_HL_FLT && hrec->type!=BCF_HL_FMT ) return 0;
909
910
    // INFO/FILTER/FORMAT
911
84.1k
    char *id = NULL;
912
84.1k
    uint32_t type = UINT32_MAX, var = UINT32_MAX;
913
84.1k
    int num = -1, idx = -1;
914
295k
    for (i=0; i<hrec->nkeys; i++)
915
212k
    {
916
212k
        if ( !strcmp(hrec->keys[i], "ID") ) id = hrec->vals[i];
917
139k
        else if ( !strcmp(hrec->keys[i], "IDX") )
918
1.74k
        {
919
1.74k
            char *tmp = hrec->vals[i];
920
1.74k
            idx = strtol(hrec->vals[i], &tmp, 10);
921
1.74k
            if ( *tmp || idx < 0 || idx >= INT_MAX - 1)
922
750
            {
923
750
                hts_log_warning("Error parsing the IDX tag, skipping");
924
750
                return 0;
925
750
            }
926
1.74k
        }
927
137k
        else if ( !strcmp(hrec->keys[i], "Type") )
928
36.1k
        {
929
36.1k
            if ( !strcmp(hrec->vals[i], "Integer") ) type = BCF_HT_INT;
930
34.4k
            else if ( !strcmp(hrec->vals[i], "Float") ) type = BCF_HT_REAL;
931
33.5k
            else if ( !strcmp(hrec->vals[i], "String") ) type = BCF_HT_STR;
932
5.41k
            else if ( !strcmp(hrec->vals[i], "Character") ) type = BCF_HT_STR;
933
5.30k
            else if ( !strcmp(hrec->vals[i], "Flag") ) type = BCF_HT_FLAG;
934
4.06k
            else
935
4.06k
            {
936
4.06k
                hts_log_warning("The type \"%s\" is not supported, assuming \"String\"", hrec->vals[i]);
937
4.06k
                type = BCF_HT_STR;
938
4.06k
            }
939
36.1k
        }
940
101k
        else if ( !strcmp(hrec->keys[i], "Number") )
941
30.9k
        {
942
30.9k
            int is_fmt = hrec->type == BCF_HL_FMT;
943
30.9k
            if ( !strcmp(hrec->vals[i],"A") ) var = BCF_VL_A;
944
30.1k
            else if ( !strcmp(hrec->vals[i],"R") ) var = BCF_VL_R;
945
30.0k
            else if ( !strcmp(hrec->vals[i],"G") ) var = BCF_VL_G;
946
29.4k
            else if ( !strcmp(hrec->vals[i],".") ) var = BCF_VL_VAR;
947
29.4k
            else if ( is_fmt && !strcmp(hrec->vals[i],"P") )  var = BCF_VL_P;
948
29.4k
            else if ( is_fmt && !strcmp(hrec->vals[i],"LA") ) var = BCF_VL_LA;
949
29.4k
            else if ( is_fmt && !strcmp(hrec->vals[i],"LR") ) var = BCF_VL_LR;
950
29.4k
            else if ( is_fmt && !strcmp(hrec->vals[i],"LG") ) var = BCF_VL_LG;
951
29.4k
            else if ( is_fmt && !strcmp(hrec->vals[i],"M") )  var = BCF_VL_M;
952
29.4k
            else
953
29.4k
            {
954
29.4k
                if (sscanf(hrec->vals[i],"%d",&num) == 1)
955
28.9k
                    var = BCF_VL_FIXED;
956
29.4k
            }
957
30.9k
            if (var != BCF_VL_FIXED) num = 0xfffff;
958
30.9k
        }
959
212k
    }
960
83.4k
    if (hrec->type == BCF_HL_INFO || hrec->type == BCF_HL_FMT) {
961
45.7k
        if (type == -1) {
962
10.5k
            hts_log_warning("%s %s field has no Type defined. Assuming String",
963
10.5k
                *hrec->key == 'I' ? "An" : "A", hrec->key);
964
10.5k
            type = BCF_HT_STR;
965
10.5k
        }
966
45.7k
        if (var == UINT32_MAX) {
967
15.2k
            hts_log_warning("%s %s field has no Number defined. Assuming '.'",
968
15.2k
                *hrec->key == 'I' ? "An" : "A", hrec->key);
969
15.2k
            var = BCF_VL_VAR;
970
15.2k
        }
971
45.7k
        if ( type==BCF_HT_FLAG && (var!=BCF_VL_FIXED || num!=0) )
972
944
        {
973
944
            hts_log_warning("The definition of Flag \"%s/%s\" is invalid, forcing Number=0", hrec->key,id);
974
944
            var = BCF_VL_FIXED;
975
944
            num = 0;
976
944
        }
977
45.7k
    }
978
83.4k
    uint32_t info = ((((uint32_t)num) & 0xfffff)<<12 |
979
83.4k
                     (var & 0xf) << 8 |
980
83.4k
                     (type & 0xf) << 4 |
981
83.4k
                     (((uint32_t) hrec->type) & 0xf));
982
983
83.4k
    if ( !id ) return 0;
984
72.7k
    str = strdup(id);
985
72.7k
    if (!str) return -1;
986
987
72.7k
    vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_ID];
988
72.7k
    k = kh_get(vdict, d, str);
989
72.7k
    if ( k != kh_end(d) )
990
5.48k
    {
991
        // already present
992
5.48k
        free(str);
993
5.48k
        if ( kh_val(d, k).hrec[info&0xf] ) return 0;
994
1.16k
        kh_val(d, k).info[info&0xf] = info;
995
1.16k
        kh_val(d, k).hrec[info&0xf] = hrec;
996
1.16k
        if ( idx==-1 ) {
997
1.16k
            if (hrec_add_idx(hrec, kh_val(d, k).id) < 0) {
998
0
                return -1;
999
0
            }
1000
1.16k
        }
1001
1.16k
        return 1;
1002
1.16k
    }
1003
67.2k
    k = kh_put(vdict, d, str, &ret);
1004
67.2k
    if (ret < 0) {
1005
0
        free(str);
1006
0
        return -1;
1007
0
    }
1008
67.2k
    kh_val(d, k) = bcf_idinfo_def;
1009
67.2k
    kh_val(d, k).info[info&0xf] = info;
1010
67.2k
    kh_val(d, k).hrec[info&0xf] = hrec;
1011
67.2k
    kh_val(d, k).id = idx;
1012
67.2k
    if (bcf_hdr_set_idx(hdr, BCF_DT_ID, kh_key(d,k), &kh_val(d,k)) < 0) {
1013
0
        kh_del(vdict, d, k);
1014
0
        free(str);
1015
0
        return -1;
1016
0
    }
1017
67.2k
    if ( idx==-1 ) {
1018
66.9k
        if (hrec_add_idx(hrec, kh_val(d,k).id) < 0) {
1019
0
            return -1;
1020
0
        }
1021
66.9k
    }
1022
1023
67.2k
    return 1;
1024
67.2k
}
1025
1026
static void bcf_hdr_unregister_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
1027
0
{
1028
0
    if (hrec->type == BCF_HL_FLT ||
1029
0
        hrec->type == BCF_HL_INFO ||
1030
0
        hrec->type == BCF_HL_FMT ||
1031
0
        hrec->type == BCF_HL_CTG) {
1032
0
        int id = bcf_hrec_find_key(hrec, "ID");
1033
0
        if (id < 0 || !hrec->vals[id])
1034
0
            return;
1035
0
        vdict_t *dict = (hrec->type == BCF_HL_CTG
1036
0
                         ? (vdict_t*)hdr->dict[BCF_DT_CTG]
1037
0
                         : (vdict_t*)hdr->dict[BCF_DT_ID]);
1038
0
        khint_t k = kh_get(vdict, dict, hrec->vals[id]);
1039
0
        if (k != kh_end(dict))
1040
0
            kh_val(dict, k).hrec[hrec->type==BCF_HL_CTG ? 0 : hrec->type] = NULL;
1041
0
    }
1042
0
}
1043
1044
static void bcf_hdr_remove_from_hdict(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
1045
0
{
1046
0
    kstring_t str = KS_INITIALIZE;
1047
0
    bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1048
0
    khint_t k;
1049
0
    int id;
1050
1051
0
    switch (hrec->type) {
1052
0
    case BCF_HL_GEN:
1053
0
        if (ksprintf(&str, "##%s=%s", hrec->key,hrec->value) < 0)
1054
0
            str.l = 0;
1055
0
        break;
1056
0
    case BCF_HL_STR:
1057
0
        id = bcf_hrec_find_key(hrec, "ID");
1058
0
        if (id < 0)
1059
0
            return;
1060
0
        if (!hrec->vals[id] ||
1061
0
            ksprintf(&str, "##%s=<ID=%s>", hrec->key, hrec->vals[id]) < 0)
1062
0
            str.l = 0;
1063
0
        break;
1064
0
    default:
1065
0
        return;
1066
0
    }
1067
0
    if (str.l) {
1068
0
        k = kh_get(hdict, aux->gen, str.s);
1069
0
    } else {
1070
        // Couldn't get a string for some reason, so try the hard way...
1071
0
        for (k = kh_begin(aux->gen); k < kh_end(aux->gen); k++) {
1072
0
            if (kh_exist(aux->gen, k) && kh_val(aux->gen, k) == hrec)
1073
0
                break;
1074
0
        }
1075
0
    }
1076
0
    if (k != kh_end(aux->gen) && kh_val(aux->gen, k) == hrec) {
1077
0
        kh_val(aux->gen, k) = NULL;
1078
0
        free((char *) kh_key(aux->gen, k));
1079
0
        kh_key(aux->gen, k) = NULL;
1080
0
        kh_del(hdict, aux->gen, k);
1081
0
    }
1082
0
    free(str.s);
1083
0
}
1084
1085
int bcf_hdr_update_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec, const bcf_hrec_t *tmp)
1086
0
{
1087
0
    assert( hrec->type==BCF_HL_GEN );
1088
0
    int ret;
1089
0
    khint_t k;
1090
0
    bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1091
0
    for (k=kh_begin(aux->gen); k<kh_end(aux->gen); k++)
1092
0
    {
1093
0
        if ( !kh_exist(aux->gen,k) ) continue;
1094
0
        if ( hrec!=(bcf_hrec_t*)kh_val(aux->gen,k) ) continue;
1095
0
        break;
1096
0
    }
1097
0
    assert( k<kh_end(aux->gen) );   // something went wrong, should never happen
1098
0
    free((char*)kh_key(aux->gen,k));
1099
0
    kh_del(hdict,aux->gen,k);
1100
0
    kstring_t str = {0,0,0};
1101
0
    if ( ksprintf(&str, "##%s=%s", tmp->key,tmp->value) < 0 )
1102
0
    {
1103
0
        free(str.s);
1104
0
        return -1;
1105
0
    }
1106
0
    k = kh_put(hdict, aux->gen, str.s, &ret);
1107
0
    if ( ret<0 )
1108
0
    {
1109
0
        free(str.s);
1110
0
        return -1;
1111
0
    }
1112
0
    free(hrec->value);
1113
0
    hrec->value = strdup(tmp->value);
1114
0
    if ( !hrec->value ) return -1;
1115
0
    kh_val(aux->gen,k) = hrec;
1116
1117
0
    if (!strcmp(hrec->key,"fileformat")) {
1118
        //update version
1119
0
        get_hdr_aux(hdr)->version = bcf_get_version(NULL, hrec->value);
1120
0
    }
1121
0
    return 0;
1122
0
}
1123
1124
int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
1125
125k
{
1126
125k
    kstring_t str = {0,0,0};
1127
125k
    bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1128
1129
125k
    int res;
1130
125k
    if ( !hrec ) return 0;
1131
1132
125k
    bcf_hrec_check(hrec);   // todo: check return status and propagate errors up
1133
1134
125k
    res = bcf_hdr_register_hrec(hdr,hrec);
1135
125k
    if (res < 0) return -1;
1136
125k
    if ( !res )
1137
47.8k
    {
1138
        // If one of the hashed field, then it is already present
1139
47.8k
        if ( hrec->type != BCF_HL_GEN )
1140
23.9k
        {
1141
23.9k
            bcf_hrec_destroy(hrec);
1142
23.9k
            return 0;
1143
23.9k
        }
1144
        // Is one of the generic fields and already present?
1145
23.8k
        if ( ksprintf(&str, "##%s=%s", hrec->key,hrec->value) < 0 )
1146
0
        {
1147
0
            free(str.s);
1148
0
            return -1;
1149
0
        }
1150
23.8k
        khint_t k = kh_get(hdict, aux->gen, str.s);
1151
23.8k
        if ( k != kh_end(aux->gen) )
1152
14.4k
        {
1153
            // duplicate record
1154
14.4k
            bcf_hrec_destroy(hrec);
1155
14.4k
            free(str.s);
1156
14.4k
            return 0;
1157
14.4k
        }
1158
9.45k
        if (!strcmp(hrec->key, "fileformat")) {
1159
3.88k
            aux->version = bcf_get_version(NULL, hrec->value);
1160
3.88k
        }
1161
9.45k
    }
1162
1163
87.2k
    int i;
1164
87.2k
    if ( hrec->type==BCF_HL_STR && (i=bcf_hrec_find_key(hrec,"ID"))>=0 )
1165
3.59k
    {
1166
3.59k
        if ( ksprintf(&str, "##%s=<ID=%s>", hrec->key,hrec->vals[i]) < 0 )
1167
0
        {
1168
0
            free(str.s);
1169
0
            return -1;
1170
0
        }
1171
3.59k
        khint_t k = kh_get(hdict, aux->gen, str.s);
1172
3.59k
        if ( k != kh_end(aux->gen) )
1173
2.56k
        {
1174
            // duplicate record
1175
2.56k
            bcf_hrec_destroy(hrec);
1176
2.56k
            free(str.s);
1177
2.56k
            return 0;
1178
2.56k
        }
1179
3.59k
    }
1180
1181
    // New record, needs to be added
1182
84.6k
    int n = hdr->nhrec + 1;
1183
84.6k
    bcf_hrec_t **new_hrec = realloc(hdr->hrec, n*sizeof(bcf_hrec_t*));
1184
84.6k
    if (!new_hrec) {
1185
0
        free(str.s);
1186
0
        bcf_hdr_unregister_hrec(hdr, hrec);
1187
0
        return -1;
1188
0
    }
1189
84.6k
    hdr->hrec = new_hrec;
1190
1191
84.6k
    if ( str.s )
1192
10.4k
    {
1193
10.4k
        khint_t k = kh_put(hdict, aux->gen, str.s, &res);
1194
10.4k
        if ( res<0 )
1195
0
        {
1196
0
            free(str.s);
1197
0
            return -1;
1198
0
        }
1199
10.4k
        kh_val(aux->gen,k) = hrec;
1200
10.4k
    }
1201
1202
84.6k
    hdr->hrec[hdr->nhrec] = hrec;
1203
84.6k
    hdr->dirty = 1;
1204
84.6k
    hdr->nhrec = n;
1205
1206
84.6k
    return hrec->type==BCF_HL_GEN ? 0 : 1;
1207
84.6k
}
1208
1209
bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class)
1210
798
{
1211
798
    int i;
1212
798
    if ( type==BCF_HL_GEN )
1213
798
    {
1214
        // e.g. ##fileformat=VCFv4.2
1215
        //      ##source=GenomicsDBImport
1216
        //      ##bcftools_viewVersion=1.16-80-gdfdb0923+htslib-1.16-34-g215d364
1217
798
        if ( value )
1218
0
        {
1219
0
            kstring_t str = {0,0,0};
1220
0
            ksprintf(&str, "##%s=%s", key,value);
1221
0
            bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1222
0
            khint_t k = kh_get(hdict, aux->gen, str.s);
1223
0
            free(str.s);
1224
0
            if ( k == kh_end(aux->gen) ) return NULL;
1225
0
            return kh_val(aux->gen, k);
1226
0
        }
1227
1.82k
        for (i=0; i<hdr->nhrec; i++)
1228
1.23k
        {
1229
1.23k
            if ( hdr->hrec[i]->type!=type ) continue;
1230
344
            if ( strcmp(hdr->hrec[i]->key,key) ) continue;
1231
204
            return hdr->hrec[i];
1232
344
        }
1233
594
        return NULL;
1234
798
    }
1235
0
    else if ( type==BCF_HL_STR )
1236
0
    {
1237
        // e.g. ##GATKCommandLine=<ID=GenomicsDBImport,CommandLine="GenomicsDBImport....">
1238
        //      ##ALT=<ID=NON_REF,Description="Represents any possible alternative allele not already represented at this location by REF and ALT">
1239
0
        if (!str_class) return NULL;
1240
0
        if ( !strcmp("ID",key) )
1241
0
        {
1242
0
            kstring_t str = {0,0,0};
1243
0
            ksprintf(&str, "##%s=<%s=%s>",str_class,key,value);
1244
0
            bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1245
0
            khint_t k = kh_get(hdict, aux->gen, str.s);
1246
0
            free(str.s);
1247
0
            if ( k == kh_end(aux->gen) ) return NULL;
1248
0
            return kh_val(aux->gen, k);
1249
0
        }
1250
0
        for (i=0; i<hdr->nhrec; i++)
1251
0
        {
1252
0
            if ( hdr->hrec[i]->type!=type ) continue;
1253
0
            if ( strcmp(hdr->hrec[i]->key,str_class) ) continue;
1254
0
            int j = bcf_hrec_find_key(hdr->hrec[i],key);
1255
0
            if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],value) ) return hdr->hrec[i];
1256
0
        }
1257
0
        return NULL;
1258
0
    }
1259
0
    vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
1260
0
    khint_t k = kh_get(vdict, d, value);
1261
0
    if ( k == kh_end(d) ) return NULL;
1262
0
    return kh_val(d, k).hrec[type==BCF_HL_CTG?0:type];
1263
0
}
1264
1265
// Check the VCF header is correctly formatted as per the specification.
1266
// Note the code that calls this doesn't bother to check return values and
1267
// we have so many broken VCFs in the wild that for now we just reprt a
1268
// warning and continue anyway.  So currently this is a void function.
1269
void bcf_hdr_check_sanity(bcf_hdr_t *hdr)
1270
2.98k
{
1271
2.98k
    int version = bcf_get_version(hdr, NULL);
1272
1273
2.98k
    struct tag {
1274
2.98k
        char name[10];
1275
2.98k
        char number_str[3];
1276
2.98k
        int number;
1277
2.98k
        int version;
1278
2.98k
        int type;
1279
2.98k
    };
1280
1281
2.98k
    char type_str[][8] = {"Flag", "Integer", "Float", "String"};
1282
1283
2.98k
    struct tag info_tags[] = {
1284
2.98k
        {"AD",        "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
1285
2.98k
        {"ADF",       "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
1286
2.98k
        {"ADR",       "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
1287
2.98k
        {"AC",        "A",  BCF_VL_A,     VCF_DEF, BCF_HT_INT},
1288
2.98k
        {"AF",        "A",  BCF_VL_A,     VCF_DEF, BCF_HT_REAL},
1289
2.98k
        {"CIGAR",     "A",  BCF_VL_A,     VCF_DEF, BCF_HT_STR},
1290
2.98k
        {"AA",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_STR},
1291
2.98k
        {"AN",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1292
2.98k
        {"BQ",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_REAL},
1293
2.98k
        {"DB",        "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
1294
2.98k
        {"DP",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1295
2.98k
        {"END",       "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1296
2.98k
        {"H2",        "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
1297
2.98k
        {"H3",        "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
1298
2.98k
        {"MQ",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_REAL},
1299
2.98k
        {"MQ0",       "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1300
2.98k
        {"NS",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1301
2.98k
        {"SB",        "4",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1302
2.98k
        {"SOMATIC",   "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
1303
2.98k
        {"VALIDATED", "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
1304
2.98k
        {"1000G",     "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
1305
2.98k
    };
1306
2.98k
    static int info_warned[sizeof(info_tags)/sizeof(*info_tags)] = {0};
1307
1308
2.98k
    struct tag fmt_tags[] = {
1309
2.98k
        {"AD",   "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
1310
2.98k
        {"ADF",  "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
1311
2.98k
        {"ADR",  "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
1312
2.98k
        {"EC",   "A",  BCF_VL_A,     VCF_DEF, BCF_HT_INT},
1313
2.98k
        {"GL",   "G",  BCF_VL_G,     VCF_DEF, BCF_HT_REAL},
1314
2.98k
        {"GP",   "G",  BCF_VL_G,     VCF_DEF, BCF_HT_REAL},
1315
2.98k
        {"PL",   "G",  BCF_VL_G,     VCF_DEF, BCF_HT_INT},
1316
2.98k
        {"PP",   "G",  BCF_VL_G,     VCF_DEF, BCF_HT_INT},
1317
2.98k
        {"DP",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1318
2.98k
        {"LEN",  "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1319
2.98k
        {"FT",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_STR},
1320
2.98k
        {"GQ",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1321
2.98k
        {"GT",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_STR},
1322
2.98k
        {"HQ",   "2",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1323
2.98k
        {"MQ",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1324
2.98k
        {"PQ",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1325
2.98k
        {"PS",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1326
2.98k
        {"PSL",  "P",  BCF_VL_P,     VCF44,   BCF_HT_STR},
1327
2.98k
        {"PSO",  "P",  BCF_VL_P,     VCF44,   BCF_HT_INT},
1328
2.98k
        {"PSQ",  "P",  BCF_VL_P,     VCF44,   BCF_HT_INT},
1329
2.98k
        {"LGL",  "LG", BCF_VL_LG,    VCF45,   BCF_HT_INT},
1330
2.98k
        {"LGP",  "LG", BCF_VL_LG,    VCF45,   BCF_HT_INT},
1331
2.98k
        {"LPL",  "LG", BCF_VL_LG,    VCF45,   BCF_HT_INT},
1332
2.98k
        {"LPP",  "LG", BCF_VL_LG,    VCF45,   BCF_HT_INT},
1333
2.98k
        {"LEC",  "LA", BCF_VL_LA,    VCF45,   BCF_HT_INT},
1334
2.98k
        {"LAD",  "LR", BCF_VL_LR,    VCF45,   BCF_HT_INT},
1335
2.98k
        {"LADF", "LR", BCF_VL_LR,    VCF45,   BCF_HT_INT},
1336
2.98k
        {"LADR", "LR", BCF_VL_LR,    VCF45,   BCF_HT_INT},
1337
2.98k
    };
1338
2.98k
    static int fmt_warned[sizeof(fmt_tags)/sizeof(*fmt_tags)] = {0};
1339
1340
    // Check INFO tag numbers.  We shouldn't really permit ".", but it's
1341
    // commonly misused so we let it slide unless it's a new tag and the
1342
    // file format claims to be new also.  We also cannot distinguish between
1343
    // Number=1 and Number=2, but we at least report the correct term if we
1344
    // get, say, Number=G in its place.
1345
    // Also check the types.
1346
2.98k
    int i;
1347
65.6k
    for (i = 0; i < sizeof(info_tags)/sizeof(*info_tags); i++) {
1348
62.7k
        if (info_warned[i])
1349
1.86k
            continue;
1350
60.8k
        int id = bcf_hdr_id2int(hdr, BCF_DT_ID, info_tags[i].name);
1351
60.8k
        if (bcf_hdr_idinfo_exists(hdr, BCF_HL_INFO, id)) {
1352
1
            if (bcf_hdr_id2length(hdr, BCF_HL_INFO, id) != info_tags[i].number &&
1353
1
                bcf_hdr_id2length(hdr, BCF_HL_INFO, id) != BCF_VL_VAR) {
1354
0
                info_warned[i] = 1;
1355
1
            } else if (bcf_hdr_id2length(hdr, BCF_HL_INFO, id) == BCF_VL_FIXED &&
1356
0
                       bcf_hdr_id2number(hdr, BCF_HL_INFO, id) != atoi(info_tags[i].number_str)) {
1357
0
                info_warned[i] = 1;
1358
0
            }
1359
1360
1
            if (info_warned[i]) {
1361
0
                hts_log_warning("%s should be declared as Number=%s",
1362
0
                                info_tags[i].name, info_tags[i].number_str);
1363
0
            }
1364
1365
1
            if (bcf_hdr_id2type(hdr, BCF_HL_INFO, id) != info_tags[i].type) {
1366
1
                hts_log_warning("%s should be declared as Type=%s",
1367
1
                                info_tags[i].name, type_str[info_tags[i].type]);
1368
1
                info_warned[i] = 1;
1369
1
            }
1370
1
        }
1371
60.8k
    }
1372
1373
    // Check FORMAT tag numbers and types.
1374
86.5k
    for (i = 0; i < sizeof(fmt_tags)/sizeof(*fmt_tags); i++) {
1375
83.6k
        if (fmt_warned[i])
1376
0
            continue;
1377
83.6k
        int id = bcf_hdr_id2int(hdr, BCF_DT_ID, fmt_tags[i].name);
1378
83.6k
        if (bcf_hdr_idinfo_exists(hdr, BCF_HL_FMT, id)) {
1379
0
            if (bcf_hdr_id2length(hdr, BCF_HL_FMT, id) != fmt_tags[i].number) {
1380
                // Permit "Number=." if this tag predates the vcf version it is
1381
                // defined within.  This is a common tactic for callers to use
1382
                // new tags with older formats in order to avoid parsing failures
1383
                // with some software.
1384
                // We don't care for 4.3 and earlier as that's more of a wild-west
1385
                // and it's not abnormal to see incorrect usage of Number=. there.
1386
0
                if ((version < VCF44 &&
1387
0
                     bcf_hdr_id2length(hdr, BCF_HL_FMT, id) != BCF_VL_VAR) ||
1388
0
                    (version >= VCF44 && version >= fmt_tags[i].version)) {
1389
0
                    fmt_warned[i] = 1;
1390
0
                }
1391
0
            } else if (bcf_hdr_id2length(hdr, BCF_HL_FMT, id) == BCF_VL_FIXED &&
1392
0
                       bcf_hdr_id2number(hdr, BCF_HL_FMT, id) != atoi(fmt_tags[i].number_str)) {
1393
0
                fmt_warned[i] = 1;
1394
0
            }
1395
1396
0
            if (fmt_warned[i]) {
1397
0
                hts_log_warning("%s should be declared as Number=%s",
1398
0
                                fmt_tags[i].name, fmt_tags[i].number_str);
1399
0
            }
1400
1401
0
            if (bcf_hdr_id2type(hdr, BCF_HL_FMT, id) != fmt_tags[i].type) {
1402
0
                hts_log_warning("%s should be declared as Type=%s",
1403
0
                                fmt_tags[i].name, type_str[fmt_tags[i].type]);
1404
0
                fmt_warned[i] = 1;
1405
0
            }
1406
0
        }
1407
83.6k
    }
1408
2.98k
}
1409
1410
int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt)
1411
3.62k
{
1412
3.62k
    int len, done = 0;
1413
3.62k
    char *p = htxt;
1414
1415
    // Check sanity: "fileformat" string must come as first
1416
3.62k
    bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,p,&len);
1417
3.62k
    if ( !hrec || !hrec->key || strcasecmp(hrec->key,"fileformat") )
1418
326
        hts_log_warning("The first line should be ##fileformat; is the VCF/BCF header broken?");
1419
3.62k
    if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
1420
0
        bcf_hrec_destroy(hrec);
1421
0
        return -1;
1422
0
    }
1423
1424
    // The filter PASS must appear first in the dictionary
1425
3.62k
    hrec = bcf_hdr_parse_line(hdr,"##FILTER=<ID=PASS,Description=\"All filters passed\">",&len);
1426
3.62k
    if (!hrec || bcf_hdr_add_hrec(hdr, hrec) < 0) {
1427
0
        bcf_hrec_destroy(hrec);
1428
0
        return -1;
1429
0
    }
1430
1431
    // Parse the whole header
1432
17.6k
    do {
1433
70.8k
        while (NULL != (hrec = bcf_hdr_parse_line(hdr, p, &len))) {
1434
53.2k
            if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
1435
18
                bcf_hrec_destroy(hrec);
1436
18
                return -1;
1437
18
            }
1438
53.2k
            p += len;
1439
53.2k
        }
1440
17.6k
        assert(hrec == NULL);
1441
17.6k
        if (len < 0) {
1442
            // len < 0 indicates out-of-memory, or similar error
1443
8
            hts_log_error("Could not parse header line: %s", strerror(errno));
1444
8
            return -1;
1445
17.6k
        } else if (len > 0) {
1446
            // Bad header line.  bcf_hdr_parse_line() will have logged it.
1447
            // Skip and try again on the next line (p + len will be the start
1448
            // of the next one).
1449
12.8k
            p += len;
1450
12.8k
            continue;
1451
12.8k
        }
1452
1453
        // Next should be the sample line.  If not, it was a malformed
1454
        // header, in which case print a warning and skip (many VCF
1455
        // operations do not really care about a few malformed lines).
1456
        // In the future we may want to add a strict mode that errors in
1457
        // this case.
1458
4.79k
        if ( strncmp("#CHROM\t",p,7) && strncmp("#CHROM ",p,7) ) {
1459
1.74k
            char *eol = strchr(p, '\n');
1460
1.74k
            if (*p != '\0') {
1461
1.21k
                char buffer[320];
1462
1.21k
                hts_log_warning("Could not parse header line: %s",
1463
1.21k
                                hts_strprint(buffer, sizeof(buffer),
1464
1.21k
                                               '"', p,
1465
1.21k
                                               eol ? (eol - p) : SIZE_MAX));
1466
1.21k
            }
1467
1.74k
            if (eol) {
1468
1.20k
                p = eol + 1; // Try from the next line.
1469
1.20k
            } else {
1470
546
                done = -1; // No more lines left, give up.
1471
546
            }
1472
3.04k
        } else {
1473
3.04k
            done = 1; // Sample line found
1474
3.04k
        }
1475
17.6k
    } while (!done);
1476
1477
3.59k
    if (done < 0) {
1478
        // No sample line is fatal.
1479
546
        hts_log_error("Could not parse the header, sample line not found");
1480
546
        return -1;
1481
546
    }
1482
1483
3.04k
    if (bcf_hdr_parse_sample_line(hdr,p) < 0)
1484
62
        return -1;
1485
2.98k
    if (bcf_hdr_sync(hdr) < 0)
1486
0
        return -1;
1487
2.98k
    bcf_hdr_check_sanity(hdr);
1488
2.98k
    return 0;
1489
2.98k
}
1490
1491
int bcf_hdr_append(bcf_hdr_t *hdr, const char *line)
1492
0
{
1493
0
    int len;
1494
0
    bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr, (char*) line, &len);
1495
0
    if ( !hrec ) return -1;
1496
0
    if (bcf_hdr_add_hrec(hdr, hrec) < 0)
1497
0
        return -1;
1498
0
    return 0;
1499
0
}
1500
1501
void bcf_hdr_remove(bcf_hdr_t *hdr, int type, const char *key)
1502
0
{
1503
0
    int i = 0;
1504
0
    bcf_hrec_t *hrec;
1505
0
    if ( !key )
1506
0
    {
1507
        // no key, remove all entries of this type
1508
0
        while ( i<hdr->nhrec )
1509
0
        {
1510
0
            if ( hdr->hrec[i]->type!=type ) { i++; continue; }
1511
0
            hrec = hdr->hrec[i];
1512
0
            bcf_hdr_unregister_hrec(hdr, hrec);
1513
0
            bcf_hdr_remove_from_hdict(hdr, hrec);
1514
0
            hdr->dirty = 1;
1515
0
            hdr->nhrec--;
1516
0
            if ( i < hdr->nhrec )
1517
0
                memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
1518
0
            bcf_hrec_destroy(hrec);
1519
0
        }
1520
0
        return;
1521
0
    }
1522
0
    while (1)
1523
0
    {
1524
0
        if ( type==BCF_HL_FLT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG )
1525
0
        {
1526
0
            hrec = bcf_hdr_get_hrec(hdr, type, "ID", key, NULL);
1527
0
            if ( !hrec ) return;
1528
1529
0
            for (i=0; i<hdr->nhrec; i++)
1530
0
                if ( hdr->hrec[i]==hrec ) break;
1531
0
            assert( i<hdr->nhrec );
1532
1533
0
            vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
1534
0
            khint_t k = kh_get(vdict, d, key);
1535
0
            kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL;
1536
0
        }
1537
0
        else
1538
0
        {
1539
0
            for (i=0; i<hdr->nhrec; i++)
1540
0
            {
1541
0
                if ( hdr->hrec[i]->type!=type ) continue;
1542
0
                if ( type==BCF_HL_GEN )
1543
0
                {
1544
0
                    if ( !strcmp(hdr->hrec[i]->key,key) ) break;
1545
0
                }
1546
0
                else
1547
0
                {
1548
                    // not all structured lines have ID, we could be more sophisticated as in bcf_hdr_get_hrec()
1549
0
                    int j = bcf_hrec_find_key(hdr->hrec[i], "ID");
1550
0
                    if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],key) ) break;
1551
0
                }
1552
0
            }
1553
0
            if ( i==hdr->nhrec ) return;
1554
0
            hrec = hdr->hrec[i];
1555
0
            bcf_hdr_remove_from_hdict(hdr, hrec);
1556
0
        }
1557
1558
0
        hdr->nhrec--;
1559
0
        if ( i < hdr->nhrec )
1560
0
            memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
1561
0
        bcf_hrec_destroy(hrec);
1562
0
        hdr->dirty = 1;
1563
0
    }
1564
0
}
1565
1566
int bcf_hdr_printf(bcf_hdr_t *hdr, const char *fmt, ...)
1567
0
{
1568
0
    char tmp[256], *line = tmp;
1569
0
    va_list ap;
1570
0
    va_start(ap, fmt);
1571
0
    int n = vsnprintf(line, sizeof(tmp), fmt, ap);
1572
0
    va_end(ap);
1573
1574
0
    if (n >= sizeof(tmp)) {
1575
0
        n++; // For trailing NUL
1576
0
        line = (char*)malloc(n);
1577
0
        if (!line)
1578
0
            return -1;
1579
1580
0
        va_start(ap, fmt);
1581
0
        vsnprintf(line, n, fmt, ap);
1582
0
        va_end(ap);
1583
0
    }
1584
1585
0
    int ret = bcf_hdr_append(hdr, line);
1586
1587
0
    if (line != tmp) free(line);
1588
0
    return ret;
1589
0
}
1590
1591
1592
/**********************
1593
 *** BCF header I/O ***
1594
 **********************/
1595
1596
const char *bcf_hdr_get_version(const bcf_hdr_t *hdr)
1597
798
{
1598
798
    bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
1599
798
    if ( !hrec )
1600
594
    {
1601
594
        hts_log_warning("No version string found, assuming VCFv4.2");
1602
594
        return "VCFv4.2";
1603
594
    }
1604
204
    return hrec->value;
1605
798
}
1606
1607
int bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version)
1608
0
{
1609
0
    bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
1610
0
    if ( !hrec )
1611
0
    {
1612
0
        int len;
1613
0
        kstring_t str = {0,0,0};
1614
0
        if ( ksprintf(&str,"##fileformat=%s", version) < 0 ) return -1;
1615
0
        hrec = bcf_hdr_parse_line(hdr, str.s, &len);
1616
0
        free(str.s);
1617
1618
0
        get_hdr_aux(hdr)->version = bcf_get_version(NULL, hrec->value);
1619
0
    }
1620
0
    else
1621
0
    {
1622
0
        bcf_hrec_t *tmp = bcf_hrec_dup(hrec);
1623
0
        if ( !tmp ) return -1;
1624
0
        free(tmp->value);
1625
0
        tmp->value = strdup(version);
1626
0
        if ( !tmp->value ) return -1;
1627
0
        bcf_hdr_update_hrec(hdr, hrec, tmp);
1628
0
        bcf_hrec_destroy(tmp);
1629
0
    }
1630
0
    hdr->dirty = 1;
1631
    //TODO rlen may change, deal with it
1632
0
    return 0; // FIXME: check for errs in this function (return < 0 if so)
1633
0
}
1634
1635
bcf_hdr_t *bcf_hdr_init(const char *mode)
1636
3.63k
{
1637
3.63k
    int i;
1638
3.63k
    bcf_hdr_t *h;
1639
3.63k
    h = (bcf_hdr_t*)calloc(1, sizeof(bcf_hdr_t));
1640
3.63k
    if (!h) return NULL;
1641
14.5k
    for (i = 0; i < 3; ++i) {
1642
10.9k
        if ((h->dict[i] = kh_init(vdict)) == NULL) goto fail;
1643
        // Supersize the hash to make collisions very unlikely
1644
10.9k
        static int dsize[3] = {16384,16384,2048}; // info, contig, format
1645
10.9k
        if (kh_resize(vdict, h->dict[i], dsize[i]) < 0) goto fail;
1646
10.9k
    }
1647
1648
3.63k
    bcf_hdr_aux_t *aux = (bcf_hdr_aux_t*)calloc(1,sizeof(bcf_hdr_aux_t));
1649
3.63k
    if ( !aux ) goto fail;
1650
3.63k
    if ( (aux->gen = kh_init(hdict))==NULL ) { free(aux); goto fail; }
1651
3.63k
    aux->key_len = NULL;
1652
3.63k
    aux->dict = *((vdict_t*)h->dict[0]);
1653
3.63k
    aux->version = 0;
1654
3.63k
    aux->ref_count = 1;
1655
3.63k
    free(h->dict[0]);
1656
3.63k
    h->dict[0] = aux;
1657
1658
3.63k
    if ( strchr(mode,'w') )
1659
0
    {
1660
0
        bcf_hdr_append(h, "##fileformat=VCFv4.2");
1661
        // The filter PASS must appear first in the dictionary
1662
0
        bcf_hdr_append(h, "##FILTER=<ID=PASS,Description=\"All filters passed\">");
1663
0
        aux->version = VCF_DEF;
1664
0
    }
1665
3.63k
    return h;
1666
1667
0
 fail:
1668
0
    for (i = 0; i < 3; ++i)
1669
0
        kh_destroy(vdict, h->dict[i]);
1670
0
    free(h);
1671
0
    return NULL;
1672
3.63k
}
1673
1674
void bcf_hdr_destroy(bcf_hdr_t *h)
1675
5.27k
{
1676
5.27k
    int i;
1677
5.27k
    khint_t k;
1678
5.27k
    if (!h) return;
1679
5.27k
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
1680
5.27k
    if (aux->ref_count > 1) // Refs still held, so delay destruction
1681
1.64k
    {
1682
1.64k
        aux->ref_count &= ~1;
1683
1.64k
        return;
1684
1.64k
    }
1685
14.5k
    for (i = 0; i < 3; ++i) {
1686
10.9k
        vdict_t *d = (vdict_t*)h->dict[i];
1687
10.9k
        if (d == 0) continue;
1688
126M
        for (k = kh_begin(d); k != kh_end(d); ++k)
1689
126M
            if (kh_exist(d, k)) free((char*)kh_key(d, k));
1690
10.9k
        if ( i==0 )
1691
3.63k
        {
1692
26.8k
            for (k=kh_begin(aux->gen); k<kh_end(aux->gen); k++)
1693
23.1k
                if ( kh_exist(aux->gen,k) ) free((char*)kh_key(aux->gen,k));
1694
3.63k
            kh_destroy(hdict, aux->gen);
1695
3.63k
            free(aux->key_len); // may exist for dict[0] only
1696
3.63k
        }
1697
10.9k
        kh_destroy(vdict, d);
1698
10.9k
        free(h->id[i]);
1699
10.9k
    }
1700
88.3k
    for (i=0; i<h->nhrec; i++)
1701
84.6k
        bcf_hrec_destroy(h->hrec[i]);
1702
3.63k
    if (h->nhrec) free(h->hrec);
1703
3.63k
    if (h->samples) free(h->samples);
1704
3.63k
    free(h->keep_samples);
1705
3.63k
    free(h->transl[0]); free(h->transl[1]);
1706
3.63k
    free(h->mem.s);
1707
3.63k
    free(h);
1708
3.63k
}
1709
1710
bcf_hdr_t *bcf_hdr_read(htsFile *hfp)
1711
3.63k
{
1712
3.63k
    if (hfp->format.format == vcf)
1713
3.30k
        return vcf_hdr_read(hfp);
1714
326
    if (hfp->format.format != bcf) {
1715
0
        hts_log_error("Input is not detected as bcf or vcf format");
1716
0
        return NULL;
1717
0
    }
1718
1719
326
    assert(hfp->is_bgzf);
1720
1721
326
    BGZF *fp = hfp->fp.bgzf;
1722
326
    uint8_t magic[5];
1723
326
    bcf_hdr_t *h;
1724
326
    h = bcf_hdr_init("r");
1725
326
    if (!h) {
1726
0
        hts_log_error("Failed to allocate bcf header");
1727
0
        return NULL;
1728
0
    }
1729
326
    if (bgzf_read(fp, magic, 5) != 5)
1730
0
    {
1731
0
        hts_log_error("Failed to read the header (reading BCF in text mode?)");
1732
0
        bcf_hdr_destroy(h);
1733
0
        return NULL;
1734
0
    }
1735
326
    if (strncmp((char*)magic, "BCF\2\2", 5) != 0)
1736
0
    {
1737
0
        if (!strncmp((char*)magic, "BCF", 3))
1738
0
            hts_log_error("Invalid BCF2 magic string: only BCFv2.2 is supported");
1739
0
        else
1740
0
            hts_log_error("Invalid BCF2 magic string");
1741
0
        bcf_hdr_destroy(h);
1742
0
        return NULL;
1743
0
    }
1744
326
    uint8_t buf[4];
1745
326
    size_t hlen;
1746
326
    char *htxt = NULL;
1747
326
    if (bgzf_read(fp, buf, 4) != 4) goto fail;
1748
326
    hlen = buf[0] | (buf[1] << 8) | (buf[2] << 16) | ((size_t) buf[3] << 24);
1749
326
    if (hlen >= SIZE_MAX) { errno = ENOMEM; goto fail; }
1750
326
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
1751
326
    if (hlen > FUZZ_ALLOC_LIMIT/2) { errno = ENOMEM; goto fail; }
1752
326
#endif
1753
326
    htxt = (char*)malloc(hlen + 1);
1754
326
    if (!htxt) goto fail;
1755
326
    if (bgzf_read(fp, htxt, hlen) != hlen) goto fail;
1756
326
    htxt[hlen] = '\0'; // Ensure htxt is terminated
1757
326
    if ( bcf_hdr_parse(h, htxt) < 0 ) goto fail;
1758
302
    free(htxt);
1759
1760
302
    bcf_hdr_incr_ref(h);
1761
302
    bgzf_set_private_data(fp, h, hdr_bgzf_private_data_cleanup);
1762
1763
302
    return h;
1764
24
 fail:
1765
24
    hts_log_error("Failed to read BCF header");
1766
24
    free(htxt);
1767
24
    bcf_hdr_destroy(h);
1768
24
    return NULL;
1769
326
}
1770
1771
int bcf_hdr_write(htsFile *hfp, bcf_hdr_t *h)
1772
2.98k
{
1773
2.98k
    if (!h) {
1774
0
        errno = EINVAL;
1775
0
        return -1;
1776
0
    }
1777
2.98k
    if ( h->dirty ) {
1778
0
        if (bcf_hdr_sync(h) < 0) return -1;
1779
0
    }
1780
2.98k
    hfp->format.category = variant_data;
1781
2.98k
    if (hfp->format.format == vcf || hfp->format.format == text_format) {
1782
1.49k
        hfp->format.format = vcf;
1783
1.49k
        return vcf_hdr_write(hfp, h);
1784
1.49k
    }
1785
1786
1.49k
    if (hfp->format.format == binary_format)
1787
1.49k
        hfp->format.format = bcf;
1788
1789
1.49k
    kstring_t htxt = {0,0,0};
1790
1.49k
    if (bcf_hdr_format(h, 1, &htxt) < 0) {
1791
0
        free(htxt.s);
1792
0
        return -1;
1793
0
    }
1794
1.49k
    kputc('\0', &htxt); // include the \0 byte
1795
1796
1.49k
    BGZF *fp = hfp->fp.bgzf;
1797
1.49k
    if ( bgzf_write(fp, "BCF\2\2", 5) !=5 ) return -1;
1798
1.49k
    uint8_t hlen[4];
1799
1.49k
    u32_to_le(htxt.l, hlen);
1800
1.49k
    if ( bgzf_write(fp, hlen, 4) !=4 ) return -1;
1801
1.49k
    if ( bgzf_write(fp, htxt.s, htxt.l) != htxt.l ) return -1;
1802
1.49k
    if ( bgzf_flush(fp) < 0) return -1;
1803
1804
1.49k
    bcf_hdr_incr_ref(h);
1805
1.49k
    bgzf_set_private_data(fp, h, hdr_bgzf_private_data_cleanup);
1806
1807
1.49k
    free(htxt.s);
1808
1.49k
    return 0;
1809
1.49k
}
1810
1811
/********************
1812
 *** BCF site I/O ***
1813
 ********************/
1814
1815
bcf1_t *bcf_init(void)
1816
2.98k
{
1817
2.98k
    bcf1_t *v;
1818
2.98k
    v = (bcf1_t*)calloc(1, sizeof(bcf1_t));
1819
2.98k
    return v;
1820
2.98k
}
1821
1822
void bcf_clear(bcf1_t *v)
1823
34.0k
{
1824
34.0k
    int i;
1825
34.0k
    for (i=0; i<v->d.m_info; i++)
1826
0
    {
1827
0
        if ( v->d.info[i].vptr_free )
1828
0
        {
1829
0
            free(v->d.info[i].vptr - v->d.info[i].vptr_off);
1830
0
            v->d.info[i].vptr_free = 0;
1831
0
        }
1832
0
    }
1833
34.0k
    for (i=0; i<v->d.m_fmt; i++)
1834
0
    {
1835
0
        if ( v->d.fmt[i].p_free )
1836
0
        {
1837
0
            free(v->d.fmt[i].p - v->d.fmt[i].p_off);
1838
0
            v->d.fmt[i].p_free = 0;
1839
0
        }
1840
0
    }
1841
34.0k
    v->rid = v->pos = v->rlen = v->unpacked = 0;
1842
34.0k
    bcf_float_set_missing(v->qual);
1843
34.0k
    v->n_info = v->n_allele = v->n_fmt = v->n_sample = 0;
1844
34.0k
    v->shared.l = v->indiv.l = 0;
1845
34.0k
    v->d.var_type = -1;
1846
34.0k
    v->d.shared_dirty = 0;
1847
34.0k
    v->d.indiv_dirty  = 0;
1848
34.0k
    v->d.n_flt = 0;
1849
34.0k
    v->errcode = 0;
1850
34.0k
    if (v->d.m_als) v->d.als[0] = 0;
1851
34.0k
    if (v->d.m_id) v->d.id[0] = 0;
1852
34.0k
}
1853
1854
void bcf_empty(bcf1_t *v)
1855
2.98k
{
1856
2.98k
    bcf_clear1(v);
1857
2.98k
    free(v->d.id);
1858
2.98k
    free(v->d.als);
1859
2.98k
    free(v->d.allele); free(v->d.flt); free(v->d.info); free(v->d.fmt);
1860
2.98k
    if (v->d.var ) free(v->d.var);
1861
2.98k
    free(v->shared.s); free(v->indiv.s);
1862
2.98k
    memset(&v->d,0,sizeof(v->d));
1863
2.98k
    memset(&v->shared,0,sizeof(v->shared));
1864
2.98k
    memset(&v->indiv,0,sizeof(v->indiv));
1865
2.98k
}
1866
1867
void bcf_destroy(bcf1_t *v)
1868
2.98k
{
1869
2.98k
    if (!v) return;
1870
2.98k
    bcf_empty1(v);
1871
2.98k
    free(v);
1872
2.98k
}
1873
1874
static inline int bcf_read1_core(BGZF *fp, bcf1_t *v)
1875
354
{
1876
354
    uint8_t x[32];
1877
354
    ssize_t ret;
1878
354
    uint32_t shared_len, indiv_len;
1879
354
    if ((ret = bgzf_read(fp, x, 32)) != 32) {
1880
16
        if (ret == 0) return -1;
1881
10
        return -2;
1882
16
    }
1883
338
    bcf_clear1(v);
1884
338
    shared_len = le_to_u32(x);
1885
338
    if (shared_len < 24) return -2;
1886
336
    shared_len -= 24; // to exclude six 32-bit integers
1887
336
    indiv_len = le_to_u32(x + 4);
1888
336
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
1889
    // ks_resize() normally allocates 1.5 * requested size to allow for growth
1890
336
    if ((uint64_t) shared_len + indiv_len > FUZZ_ALLOC_LIMIT / 3 * 2) return -2;
1891
334
#endif
1892
334
    if (ks_resize(&v->shared, shared_len ? shared_len : 1) != 0) return -2;
1893
334
    if (ks_resize(&v->indiv, indiv_len ? indiv_len : 1) != 0) return -2;
1894
334
    v->rid  = le_to_i32(x + 8);
1895
334
    v->pos  = le_to_u32(x + 12);
1896
334
    if ( v->pos==UINT32_MAX ) v->pos = -1;  // this is for telomere coordinate, e.g. MT:0
1897
334
    v->rlen = le_to_i32(x + 16);
1898
334
    v->qual = le_to_float(x + 20);
1899
334
    v->n_info = le_to_u16(x + 24);
1900
334
    v->n_allele = le_to_u16(x + 26);
1901
334
    v->n_sample = le_to_u32(x + 28) & 0xffffff;
1902
334
    v->n_fmt = x[31];
1903
334
    v->shared.l = shared_len;
1904
334
    v->indiv.l = indiv_len;
1905
    // silent fix of broken BCFs produced by earlier versions of bcf_subset, prior to and including bd6ed8b4
1906
334
    if ( (!v->indiv.l || !v->n_sample) && v->n_fmt ) v->n_fmt = 0;
1907
1908
334
    if (bgzf_read(fp, v->shared.s, v->shared.l) != v->shared.l) return -2;
1909
292
    if (bgzf_read(fp, v->indiv.s, v->indiv.l) != v->indiv.l) return -2;
1910
292
    return 0;
1911
292
}
1912
1913
0
#define bit_array_size(n) ((n)/8+1)
1914
0
#define bit_array_set(a,i)   ((a)[(i)/8] |=   1 << ((i)%8))
1915
0
#define bit_array_clear(a,i) ((a)[(i)/8] &= ~(1 << ((i)%8)))
1916
0
#define bit_array_test(a,i)  ((a)[(i)/8] &   (1 << ((i)%8)))
1917
1918
static int bcf_dec_typed_int1_safe(uint8_t *p, uint8_t *end, uint8_t **q,
1919
4.41k
                                   int32_t *val) {
1920
4.41k
    uint32_t t;
1921
4.41k
    if (end - p < 2) return -1;
1922
4.39k
    t = *p++ & 0xf;
1923
    /* Use if .. else if ... else instead of switch to force order.  Assumption
1924
       is that small integers are more frequent than big ones. */
1925
4.39k
    if (t == BCF_BT_INT8) {
1926
2.27k
        *val = *(int8_t *) p++;
1927
2.27k
    } else {
1928
2.12k
        if (end - p < (1<<bcf_type_shift[t])) return -1;
1929
2.11k
        if (t == BCF_BT_INT16) {
1930
1.45k
            *val = le_to_i16(p);
1931
1.45k
            p += 2;
1932
1.45k
        } else if (t == BCF_BT_INT32) {
1933
552
            *val = le_to_i32(p);
1934
552
            p += 4;
1935
#ifdef VCF_ALLOW_INT64
1936
        } else if (t == BCF_BT_INT64) {
1937
            // This case should never happen because there should be no
1938
            // 64-bit BCFs at all, definitely not coming from htslib
1939
            *val = le_to_i64(p);
1940
            p += 8;
1941
#endif
1942
552
        } else {
1943
108
            return -1;
1944
108
        }
1945
2.11k
    }
1946
4.28k
    *q = p;
1947
4.28k
    return 0;
1948
4.39k
}
1949
1950
static int bcf_dec_size_safe(uint8_t *p, uint8_t *end, uint8_t **q,
1951
15.3k
                             int *num, int *type) {
1952
15.3k
    int r;
1953
15.3k
    if (p >= end) return -1;
1954
15.2k
    *type = *p & 0xf;
1955
15.2k
    if (*p>>4 != 15) {
1956
14.9k
        *q = p + 1;
1957
14.9k
        *num = *p >> 4;
1958
14.9k
        return 0;
1959
14.9k
    }
1960
388
    r = bcf_dec_typed_int1_safe(p + 1, end, q, num);
1961
388
    if (r) return r;
1962
354
    return *num >= 0 ? 0 : -1;
1963
388
}
1964
1965
380
static const char *get_type_name(int type) {
1966
380
    const char *types[9] = {
1967
380
        "null", "int (8-bit)", "int (16 bit)", "int (32 bit)",
1968
380
        "unknown", "float", "unknown", "char", "unknown"
1969
380
    };
1970
380
    int t = (type >= 0 && type < 8) ? type : 8;
1971
380
    return types[t];
1972
380
}
1973
1974
/**
1975
 *  updatephasing - updates 1st phasing based on other phasing status
1976
 *  @param p - pointer to phase value array
1977
 *  @param end - end of array
1978
 *  @param q - pointer to consumed data
1979
 *  @param samples - no. of samples in array
1980
 *  @param ploidy - no. of phasing values per sample
1981
 *  @param type - value type (one of BCF_BT_...)
1982
 *  Returns 0 on success and 1 on failure
1983
 *  Update for haploids made only if it is not unknown (.)
1984
 */
1985
static int updatephasing(uint8_t *p, uint8_t *end, uint8_t **q, int samples, int ploidy, int type)
1986
0
{
1987
0
    int j, k;
1988
0
    unsigned int inc = 1 << bcf_type_shift[type];
1989
0
    ptrdiff_t bytes = samples * ploidy * inc;
1990
1991
0
    if (samples < 0 || ploidy < 0 || end - p < bytes)
1992
0
        return 1;
1993
1994
    /*
1995
     * This works because phasing is stored in the least-significant bit
1996
     * of the GT encoding, and the data is always stored little-endian.
1997
     * Thus it's possible to get the desired result by doing bit operations
1998
     * on the least-significant byte of each value and ignoring the
1999
     * higher bytes (for 16-bit and 32-bit values).
2000
     */
2001
2002
0
    switch (ploidy) {
2003
0
    case 1:
2004
        // Trivial case - haploid data is phased by default
2005
0
        for (j = 0; j < samples; ++j) {
2006
0
            if (*p) *p |= 1;    //only if not unknown (.)
2007
0
            p += inc;
2008
0
        }
2009
0
        break;
2010
0
    case 2:
2011
        // Mostly trivial case - first is phased if second is.
2012
0
        for (j = 0; j < samples; ++j) {
2013
0
            *p |= (p[inc] & 1);
2014
0
            p += 2 * inc;
2015
0
        }
2016
0
        break;
2017
0
    default:
2018
        // Generic case - first is phased if all other alleles are.
2019
0
        for (j = 0; j < samples; ++j) {
2020
0
            uint8_t allphased = 1;
2021
0
            for (k = 1; k < ploidy; ++k)
2022
0
                allphased &= (p[inc * k]);
2023
0
            *p |= allphased;
2024
0
            p += ploidy * inc;
2025
0
        }
2026
0
    }
2027
0
    *q = p;
2028
0
    return 0;
2029
0
}
2030
2031
static void bcf_record_check_err(const bcf_hdr_t *hdr, bcf1_t *rec,
2032
2.11k
                                 char *type, uint32_t *reports, int i) {
2033
2.11k
    if (*reports == 0 || hts_verbose >= HTS_LOG_DEBUG)
2034
76
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos
2035
2.11k
                        ": Invalid FORMAT %s %d",
2036
2.11k
                        bcf_seqname_safe(hdr,rec), rec->pos+1, type, i);
2037
2.11k
    (*reports)++;
2038
2.11k
}
2039
2040
292
static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) {
2041
292
    uint8_t *ptr, *end;
2042
292
    size_t bytes;
2043
292
    uint32_t err = 0;
2044
292
    int type = 0;
2045
292
    int num  = 0;
2046
292
    uint32_t i, reports;
2047
292
    const uint32_t is_integer = ((1 << BCF_BT_INT8)  |
2048
292
                                 (1 << BCF_BT_INT16) |
2049
#ifdef VCF_ALLOW_INT64
2050
                                 (1 << BCF_BT_INT64) |
2051
#endif
2052
292
                                 (1 << BCF_BT_INT32));
2053
292
    const uint32_t is_valid_type = (is_integer          |
2054
292
                                    (1 << BCF_BT_NULL)  |
2055
292
                                    (1 << BCF_BT_FLOAT) |
2056
292
                                    (1 << BCF_BT_CHAR));
2057
292
    int32_t max_id = hdr ? hdr->n[BCF_DT_ID] : 0;
2058
    /* set phasing for 1st allele as in v44 for versions upto v43, to have
2059
    consistent binary values irrespective of version; not run for v >= v44,
2060
    to retain explicit phasing in v44 and higher */
2061
292
    int idgt = hdr ?
2062
292
                    bcf_get_version(hdr, NULL) < VCF44 ?
2063
292
                        bcf_hdr_id2int(hdr, BCF_DT_ID, "GT") : -1 :
2064
292
                    -1;
2065
2066
    // Check for valid contig ID
2067
292
    if (rec->rid < 0
2068
230
        || (hdr && (rec->rid >= hdr->n[BCF_DT_CTG]
2069
218
                    || hdr->id[BCF_DT_CTG][rec->rid].key == NULL))) {
2070
218
        hts_log_warning("Bad BCF record at %"PRIhts_pos": Invalid %s id %d", rec->pos+1, "CONTIG", rec->rid);
2071
218
        err |= BCF_ERR_CTG_INVALID;
2072
218
    }
2073
2074
    // Check ID
2075
292
    ptr = (uint8_t *) rec->shared.s;
2076
292
    end = ptr + rec->shared.l;
2077
292
    if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
2078
290
    if (type != BCF_BT_CHAR) {
2079
228
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "ID", type, get_type_name(type));
2080
228
        err |= BCF_ERR_TAG_INVALID;
2081
228
    }
2082
290
    bytes = (size_t) num << bcf_type_shift[type];
2083
290
    if (end - ptr < bytes) goto bad_shared;
2084
286
    ptr += bytes;
2085
2086
    // Check REF and ALT
2087
286
    if (rec->n_allele < 1) {
2088
116
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": No REF allele",
2089
116
                        bcf_seqname_safe(hdr,rec), rec->pos+1);
2090
116
        err |= BCF_ERR_TAG_UNDEF;
2091
116
    }
2092
2093
286
    reports = 0;
2094
11.1k
    for (i = 0; i < rec->n_allele; i++) {
2095
10.8k
        if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
2096
10.8k
        if (type != BCF_BT_CHAR) {
2097
10.3k
            if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
2098
110
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "REF/ALT", type, get_type_name(type));
2099
10.3k
            err |= BCF_ERR_CHAR;
2100
10.3k
        }
2101
10.8k
        bytes = (size_t) num << bcf_type_shift[type];
2102
10.8k
        if (end - ptr < bytes) goto bad_shared;
2103
10.8k
        ptr += bytes;
2104
10.8k
    }
2105
2106
    // Check FILTER
2107
256
    reports = 0;
2108
256
    if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
2109
248
    if (num > 0) {
2110
120
        bytes = (size_t) num << bcf_type_shift[type];
2111
120
        if (((1 << type) & is_integer) == 0) {
2112
32
            hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "FILTER", type, get_type_name(type));
2113
32
            err |= BCF_ERR_TAG_INVALID;
2114
32
            if (end - ptr < bytes) goto bad_shared;
2115
30
            ptr += bytes;
2116
88
        } else {
2117
88
            if (end - ptr < bytes) goto bad_shared;
2118
3.65k
            for (i = 0; i < num; i++) {
2119
3.56k
                int32_t key = bcf_dec_int1(ptr, type, &ptr);
2120
3.56k
                if (key < 0
2121
2.67k
                    || (hdr && (key >= max_id
2122
2.56k
                                || hdr->id[BCF_DT_ID][key].key == NULL))) {
2123
2.56k
                    if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
2124
72
                        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s id %d", bcf_seqname_safe(hdr,rec), rec->pos+1, "FILTER", key);
2125
2.56k
                    err |= BCF_ERR_TAG_UNDEF;
2126
2.56k
                }
2127
3.56k
            }
2128
86
        }
2129
120
    }
2130
2131
    // Check INFO
2132
244
    reports = 0;
2133
244
    bcf_idpair_t *id_tmp = hdr ? hdr->id[BCF_DT_ID] : NULL;
2134
2.03k
    for (i = 0; i < rec->n_info; i++) {
2135
1.89k
        int32_t key = -1;
2136
1.89k
        if (bcf_dec_typed_int1_safe(ptr, end, &ptr, &key) != 0) goto bad_shared;
2137
1.81k
        if (key < 0 || (hdr && (key >= max_id
2138
1.37k
                                || id_tmp[key].key == NULL))) {
2139
1.37k
            if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
2140
64
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s id %d", bcf_seqname_safe(hdr,rec), rec->pos+1, "INFO", key);
2141
1.37k
            err |= BCF_ERR_TAG_UNDEF;
2142
1.37k
        }
2143
1.81k
        if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
2144
1.79k
        if (((1 << type) & is_valid_type) == 0
2145
1.73k
            || (type == BCF_BT_NULL && num > 0)) {
2146
80
            if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
2147
10
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "INFO", type, get_type_name(type));
2148
80
            err |= BCF_ERR_TAG_INVALID;
2149
80
        }
2150
1.79k
        bytes = (size_t) num << bcf_type_shift[type];
2151
1.79k
        if (end - ptr < bytes) goto bad_shared;
2152
1.79k
        ptr += bytes;
2153
1.79k
    }
2154
2155
    // Check FORMAT and individual information
2156
144
    ptr = (uint8_t *) rec->indiv.s;
2157
144
    end = ptr + rec->indiv.l;
2158
144
    reports = 0;
2159
2.19k
    for (i = 0; i < rec->n_fmt; i++) {
2160
2.13k
        int32_t key = -1;
2161
2.13k
        if (bcf_dec_typed_int1_safe(ptr, end, &ptr, &key) != 0) goto bad_indiv;
2162
2.10k
        if (key < 0
2163
1.99k
            || (hdr && (key >= max_id
2164
1.97k
                        || id_tmp[key].key == NULL))) {
2165
1.97k
            bcf_record_check_err(hdr, rec, "id", &reports, key);
2166
1.97k
            err |= BCF_ERR_TAG_UNDEF;
2167
1.97k
        }
2168
2.10k
        if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_indiv;
2169
2.08k
        if (((1 << type) & is_valid_type) == 0
2170
1.99k
            || (type == BCF_BT_NULL && num > 0)) {
2171
140
            bcf_record_check_err(hdr, rec, "type", &reports, type);
2172
140
            err |= BCF_ERR_TAG_INVALID;
2173
140
        }
2174
2.08k
        if (idgt >= 0 && idgt == key) {
2175
            // check first GT phasing bit and fix up if necessary
2176
0
            if (updatephasing(ptr, end, &ptr, rec->n_sample, num, type)) {
2177
0
                err |= BCF_ERR_TAG_INVALID;
2178
0
            }
2179
2.08k
        } else {
2180
2.08k
            bytes = ((size_t) num << bcf_type_shift[type]) * rec->n_sample;
2181
2.08k
            if (end - ptr < bytes) goto bad_indiv;
2182
2.05k
            ptr += bytes;
2183
2.05k
        }
2184
2.08k
    }
2185
2186
62
    if (!err && rec->rlen < 0) {
2187
        // Treat bad rlen as a warning instead of an error, and try to
2188
        // fix up by using the length of the stored REF allele.
2189
10
        static int warned = 0;
2190
10
        if (!warned) {
2191
1
            hts_log_warning("BCF record at %s:%"PRIhts_pos" has invalid RLEN (%"PRIhts_pos"). "
2192
1
                            "Only one invalid RLEN will be reported.",
2193
1
                            bcf_seqname_safe(hdr,rec), rec->pos+1, rec->rlen);
2194
1
            warned = 1;
2195
1
        }
2196
        //find rlen considering reflen, END, SVLEN, fmt LEN
2197
10
        hts_pos_t len = get_rlen(hdr, rec);
2198
10
        rec->rlen = len >= 0 ? len : 0;
2199
10
    }
2200
2201
62
    rec->errcode |= err;
2202
2203
62
    return err ? -2 : 0; // Return -2 so bcf_read() reports an error
2204
2205
148
 bad_shared:
2206
148
    hts_log_error("Bad BCF record at %s:%"PRIhts_pos" - shared section malformed or too short", bcf_seqname_safe(hdr,rec), rec->pos+1);
2207
148
    return -2;
2208
2209
82
 bad_indiv:
2210
82
    hts_log_error("Bad BCF record at %s:%"PRIhts_pos" - individuals section malformed or too short", bcf_seqname_safe(hdr,rec), rec->pos+1);
2211
82
    return -2;
2212
144
}
2213
2214
static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt);
2215
int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec)
2216
0
{
2217
0
    if ( !hdr->keep_samples ) return 0;
2218
0
    if ( !bcf_hdr_nsamples(hdr) )
2219
0
    {
2220
0
        rec->indiv.l = rec->n_sample = 0;
2221
0
        return 0;
2222
0
    }
2223
2224
0
    int i, j;
2225
0
    uint8_t *ptr = (uint8_t*)rec->indiv.s, *dst = NULL, *src;
2226
0
    bcf_dec_t *dec = &rec->d;
2227
0
    hts_expand(bcf_fmt_t, rec->n_fmt, dec->m_fmt, dec->fmt);
2228
0
    for (i=0; i<dec->m_fmt; ++i) dec->fmt[i].p_free = 0;
2229
2230
0
    for (i=0; i<rec->n_fmt; i++)
2231
0
    {
2232
0
        ptr = bcf_unpack_fmt_core1(ptr, rec->n_sample, &dec->fmt[i]);
2233
0
        src = dec->fmt[i].p - dec->fmt[i].size;
2234
0
        if ( dst )
2235
0
        {
2236
0
            memmove(dec->fmt[i-1].p + dec->fmt[i-1].p_len, dec->fmt[i].p - dec->fmt[i].p_off, dec->fmt[i].p_off);
2237
0
            dec->fmt[i].p = dec->fmt[i-1].p + dec->fmt[i-1].p_len + dec->fmt[i].p_off;
2238
0
        }
2239
0
        dst = dec->fmt[i].p;
2240
0
        for (j=0; j<hdr->nsamples_ori; j++)
2241
0
        {
2242
0
            src += dec->fmt[i].size;
2243
0
            if ( !bit_array_test(hdr->keep_samples,j) ) continue;
2244
0
            memmove(dst, src, dec->fmt[i].size);
2245
0
            dst += dec->fmt[i].size;
2246
0
        }
2247
0
        rec->indiv.l -= dec->fmt[i].p_len - (dst - dec->fmt[i].p);
2248
0
        dec->fmt[i].p_len = dst - dec->fmt[i].p;
2249
0
    }
2250
0
    rec->unpacked |= BCF_UN_FMT;
2251
2252
0
    rec->n_sample = bcf_hdr_nsamples(hdr);
2253
0
    return 0;
2254
0
}
2255
2256
int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
2257
31.7k
{
2258
31.7k
    if (fp->format.format == vcf) return vcf_read(fp, h, v);
2259
354
    if (!h)
2260
0
        h = (const bcf_hdr_t *) bgzf_get_private_data(fp->fp.bgzf);
2261
354
    int ret = bcf_read1_core(fp->fp.bgzf, v);
2262
354
    if (ret == 0) ret = bcf_record_check(h, v);
2263
354
    if ( ret!=0 || !h->keep_samples ) return ret;
2264
0
    return bcf_subset_format(h,v);
2265
354
}
2266
2267
int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, hts_pos_t *beg, hts_pos_t *end)
2268
0
{
2269
0
    bcf1_t *v = (bcf1_t *) vv;
2270
0
    const bcf_hdr_t *hdr = (const bcf_hdr_t *) bgzf_get_private_data(fp);
2271
0
    int ret = bcf_read1_core(fp, v);
2272
0
    if (ret == 0) ret = bcf_record_check(hdr, v);
2273
0
    if (ret  >= 0)
2274
0
        *tid = v->rid, *beg = v->pos, *end = v->pos + v->rlen;
2275
0
    return ret;
2276
0
}
2277
2278
static inline int bcf1_sync_id(bcf1_t *line, kstring_t *str)
2279
0
{
2280
    // single typed string
2281
0
    if ( line->d.id && strcmp(line->d.id, ".") ) {
2282
0
        return bcf_enc_vchar(str, strlen(line->d.id), line->d.id);
2283
0
    } else {
2284
0
        return bcf_enc_size(str, 0, BCF_BT_CHAR);
2285
0
    }
2286
0
}
2287
static inline int bcf1_sync_alleles(bcf1_t *line, kstring_t *str)
2288
0
{
2289
    // list of typed strings
2290
0
    int i;
2291
0
    for (i=0; i<line->n_allele; i++) {
2292
0
        if (bcf_enc_vchar(str, strlen(line->d.allele[i]), line->d.allele[i]) < 0)
2293
0
            return -1;
2294
0
    }
2295
0
    if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
2296
0
    return 0;
2297
0
}
2298
static inline int bcf1_sync_filter(bcf1_t *line, kstring_t *str)
2299
0
{
2300
    // typed vector of integers
2301
0
    if ( line->d.n_flt ) {
2302
0
        return bcf_enc_vint(str, line->d.n_flt, line->d.flt, -1);
2303
0
    } else {
2304
0
        return bcf_enc_vint(str, 0, 0, -1);
2305
0
    }
2306
0
}
2307
2308
static inline int bcf1_sync_info(bcf1_t *line, kstring_t *str)
2309
0
{
2310
    // pairs of typed vectors
2311
0
    int i, irm = -1, e = 0;
2312
0
    for (i=0; i<line->n_info; i++)
2313
0
    {
2314
0
        bcf_info_t *info = &line->d.info[i];
2315
0
        if ( !info->vptr )
2316
0
        {
2317
            // marked for removal
2318
0
            if ( irm < 0 ) irm = i;
2319
0
            continue;
2320
0
        }
2321
0
        e |= kputsn_(info->vptr - info->vptr_off, info->vptr_len + info->vptr_off, str) < 0;
2322
0
        if ( irm >=0 )
2323
0
        {
2324
0
            bcf_info_t tmp = line->d.info[irm]; line->d.info[irm] = line->d.info[i]; line->d.info[i] = tmp;
2325
0
            while ( irm<=i && line->d.info[irm].vptr ) irm++;
2326
0
        }
2327
0
    }
2328
0
    if ( irm>=0 ) line->n_info = irm;
2329
0
    return e == 0 ? 0 : -1;
2330
0
}
2331
2332
static int bcf1_sync(bcf1_t *line)
2333
26
{
2334
26
    char *shared_ori = line->shared.s;
2335
26
    size_t prev_len;
2336
2337
26
    kstring_t tmp = {0,0,0};
2338
26
    if ( !line->shared.l )
2339
0
    {
2340
        // New line created via API, BCF data blocks do not exist. Get it ready for BCF output
2341
0
        tmp = line->shared;
2342
0
        bcf1_sync_id(line, &tmp);
2343
0
        line->unpack_size[0] = tmp.l; prev_len = tmp.l;
2344
2345
0
        bcf1_sync_alleles(line, &tmp);
2346
0
        line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
2347
2348
0
        bcf1_sync_filter(line, &tmp);
2349
0
        line->unpack_size[2] = tmp.l - prev_len;
2350
2351
0
        bcf1_sync_info(line, &tmp);
2352
0
        line->shared = tmp;
2353
0
    }
2354
26
    else if ( line->d.shared_dirty )
2355
0
    {
2356
        // The line was edited, update the BCF data block.
2357
2358
0
        if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line,BCF_UN_STR);
2359
2360
        // ptr_ori points to the original unchanged BCF data.
2361
0
        uint8_t *ptr_ori = (uint8_t *) line->shared.s;
2362
2363
        // ID: single typed string
2364
0
        if ( line->d.shared_dirty & BCF1_DIRTY_ID )
2365
0
            bcf1_sync_id(line, &tmp);
2366
0
        else
2367
0
            kputsn_(ptr_ori, line->unpack_size[0], &tmp);
2368
0
        ptr_ori += line->unpack_size[0];
2369
0
        line->unpack_size[0] = tmp.l; prev_len = tmp.l;
2370
2371
        // REF+ALT: list of typed strings
2372
0
        if ( line->d.shared_dirty & BCF1_DIRTY_ALS )
2373
0
            bcf1_sync_alleles(line, &tmp);
2374
0
        else
2375
0
        {
2376
0
            kputsn_(ptr_ori, line->unpack_size[1], &tmp);
2377
0
            if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
2378
0
        }
2379
0
        ptr_ori += line->unpack_size[1];
2380
0
        line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
2381
2382
0
        if ( line->unpacked & BCF_UN_FLT )
2383
0
        {
2384
            // FILTER: typed vector of integers
2385
0
            if ( line->d.shared_dirty & BCF1_DIRTY_FLT )
2386
0
                bcf1_sync_filter(line, &tmp);
2387
0
            else if ( line->d.n_flt )
2388
0
                kputsn_(ptr_ori, line->unpack_size[2], &tmp);
2389
0
            else
2390
0
                bcf_enc_vint(&tmp, 0, 0, -1);
2391
0
            ptr_ori += line->unpack_size[2];
2392
0
            line->unpack_size[2] = tmp.l - prev_len;
2393
2394
0
            if ( line->unpacked & BCF_UN_INFO )
2395
0
            {
2396
                // INFO: pairs of typed vectors
2397
0
                if ( line->d.shared_dirty & BCF1_DIRTY_INF )
2398
0
                {
2399
0
                    bcf1_sync_info(line, &tmp);
2400
0
                    ptr_ori = (uint8_t*)line->shared.s + line->shared.l;
2401
0
                }
2402
0
            }
2403
0
        }
2404
2405
0
        int size = line->shared.l - (size_t)ptr_ori + (size_t)line->shared.s;
2406
0
        if ( size ) kputsn_(ptr_ori, size, &tmp);
2407
2408
0
        free(line->shared.s);
2409
0
        line->shared = tmp;
2410
0
    }
2411
26
    if ( line->shared.s != shared_ori && line->unpacked & BCF_UN_INFO )
2412
0
    {
2413
        // Reallocated line->shared.s block invalidated line->d.info[].vptr pointers
2414
0
        size_t off_new = line->unpack_size[0] + line->unpack_size[1] + line->unpack_size[2];
2415
0
        int i;
2416
0
        for (i=0; i<line->n_info; i++)
2417
0
        {
2418
0
            uint8_t *vptr_free = line->d.info[i].vptr_free ? line->d.info[i].vptr - line->d.info[i].vptr_off : NULL;
2419
0
            line->d.info[i].vptr = (uint8_t*) line->shared.s + off_new + line->d.info[i].vptr_off;
2420
0
            off_new += line->d.info[i].vptr_len + line->d.info[i].vptr_off;
2421
0
            if ( vptr_free )
2422
0
            {
2423
0
                free(vptr_free);
2424
0
                line->d.info[i].vptr_free = 0;
2425
0
            }
2426
0
        }
2427
0
    }
2428
2429
26
    if ( line->n_sample && line->n_fmt && (!line->indiv.l || line->d.indiv_dirty) )
2430
0
    {
2431
        // The genotype fields changed or are not present
2432
0
        tmp.l = tmp.m = 0; tmp.s = NULL;
2433
0
        int i, irm = -1;
2434
0
        for (i=0; i<line->n_fmt; i++)
2435
0
        {
2436
0
            bcf_fmt_t *fmt = &line->d.fmt[i];
2437
0
            if ( !fmt->p )
2438
0
            {
2439
                // marked for removal
2440
0
                if ( irm < 0 ) irm = i;
2441
0
                continue;
2442
0
            }
2443
0
            kputsn_(fmt->p - fmt->p_off, fmt->p_len + fmt->p_off, &tmp);
2444
0
            if ( irm >=0 )
2445
0
            {
2446
0
                bcf_fmt_t tfmt = line->d.fmt[irm]; line->d.fmt[irm] = line->d.fmt[i]; line->d.fmt[i] = tfmt;
2447
0
                while ( irm<=i && line->d.fmt[irm].p ) irm++;
2448
0
            }
2449
2450
0
        }
2451
0
        if ( irm>=0 ) line->n_fmt = irm;
2452
0
        free(line->indiv.s);
2453
0
        line->indiv = tmp;
2454
2455
        // Reallocated line->indiv.s block invalidated line->d.fmt[].p pointers
2456
0
        size_t off_new = 0;
2457
0
        for (i=0; i<line->n_fmt; i++)
2458
0
        {
2459
0
            uint8_t *p_free = line->d.fmt[i].p_free ? line->d.fmt[i].p - line->d.fmt[i].p_off : NULL;
2460
0
            line->d.fmt[i].p = (uint8_t*) line->indiv.s + off_new + line->d.fmt[i].p_off;
2461
0
            off_new += line->d.fmt[i].p_len + line->d.fmt[i].p_off;
2462
0
            if ( p_free )
2463
0
            {
2464
0
                free(p_free);
2465
0
                line->d.fmt[i].p_free = 0;
2466
0
            }
2467
0
        }
2468
0
    }
2469
26
    if ( !line->n_sample ) line->n_fmt = 0;
2470
26
    line->d.shared_dirty = line->d.indiv_dirty = 0;
2471
26
    return 0;
2472
26
}
2473
2474
bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src)
2475
0
{
2476
0
    bcf1_sync(src);
2477
2478
0
    bcf_clear(dst);
2479
0
    dst->rid  = src->rid;
2480
0
    dst->pos  = src->pos;
2481
0
    dst->rlen = src->rlen;
2482
0
    dst->qual = src->qual;
2483
0
    dst->n_info = src->n_info; dst->n_allele = src->n_allele;
2484
0
    dst->n_fmt = src->n_fmt; dst->n_sample = src->n_sample;
2485
2486
0
    if ( dst->shared.m < src->shared.l )
2487
0
    {
2488
0
        dst->shared.s = (char*) realloc(dst->shared.s, src->shared.l);
2489
0
        dst->shared.m = src->shared.l;
2490
0
    }
2491
0
    dst->shared.l = src->shared.l;
2492
0
    memcpy(dst->shared.s,src->shared.s,dst->shared.l);
2493
2494
0
    if ( dst->indiv.m < src->indiv.l )
2495
0
    {
2496
0
        dst->indiv.s = (char*) realloc(dst->indiv.s, src->indiv.l);
2497
0
        dst->indiv.m = src->indiv.l;
2498
0
    }
2499
0
    dst->indiv.l = src->indiv.l;
2500
0
    memcpy(dst->indiv.s,src->indiv.s,dst->indiv.l);
2501
2502
0
    return dst;
2503
0
}
2504
bcf1_t *bcf_dup(bcf1_t *src)
2505
0
{
2506
0
    bcf1_t *out = bcf_init1();
2507
0
    return bcf_copy(out, src);
2508
0
}
2509
2510
int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v)
2511
30.0k
{
2512
30.0k
    if ( h->dirty ) {
2513
0
        if (bcf_hdr_sync(h) < 0) return -1;
2514
0
    }
2515
30.0k
    if ( bcf_hdr_nsamples(h)!=v->n_sample )
2516
50
    {
2517
50
        hts_log_error("Broken VCF record, the number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)",
2518
50
            bcf_seqname_safe(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h));
2519
50
        return -1;
2520
50
    }
2521
2522
29.9k
    if ( hfp->format.format == vcf || hfp->format.format == text_format )
2523
28.7k
        return vcf_write(hfp,h,v);
2524
2525
1.25k
    if ( v->errcode & ~BCF_ERR_LIMITS ) // todo: unsure about the other BCF_ERR_LIMITS branches in vcf_parse_format_alloc4()
2526
1.22k
    {
2527
        // vcf_parse1() encountered a new contig or tag, undeclared in the
2528
        // header.  At this point, the header must have been printed,
2529
        // proceeding would lead to a broken BCF file. Errors must be checked
2530
        // and cleared by the caller before we can proceed.
2531
1.22k
        char errdescription[1024] = "";
2532
1.22k
        hts_log_error("Unchecked error (%d %s) at %s:%"PRIhts_pos, v->errcode, bcf_strerror(v->errcode, errdescription, sizeof(errdescription)), bcf_seqname_safe(h,v), v->pos+1);
2533
1.22k
        return -1;
2534
1.22k
    }
2535
26
    bcf1_sync(v);   // check if the BCF record was modified
2536
2537
26
    if ( v->unpacked & BCF_IS_64BIT )
2538
0
    {
2539
0
        hts_log_error("Data at %s:%"PRIhts_pos" contains 64-bit values not representable in BCF. Please use VCF instead", bcf_seqname_safe(h,v), v->pos+1);
2540
0
        return -1;
2541
0
    }
2542
2543
26
    BGZF *fp = hfp->fp.bgzf;
2544
26
    uint8_t x[32];
2545
26
    u32_to_le(v->shared.l + 24, x); // to include six 32-bit integers
2546
26
    u32_to_le(v->indiv.l, x + 4);
2547
26
    i32_to_le(v->rid, x + 8);
2548
26
    u32_to_le(v->pos, x + 12);
2549
26
    u32_to_le(v->rlen, x + 16);
2550
26
    float_to_le(v->qual, x + 20);
2551
26
    u16_to_le(v->n_info, x + 24);
2552
26
    u16_to_le(v->n_allele, x + 26);
2553
26
    u32_to_le((uint32_t)v->n_fmt<<24 | (v->n_sample & 0xffffff), x + 28);
2554
26
    if ( bgzf_write(fp, x, 32) != 32 ) return -1;
2555
26
    if ( bgzf_write(fp, v->shared.s, v->shared.l) != v->shared.l ) return -1;
2556
26
    if ( bgzf_write(fp, v->indiv.s, v->indiv.l) != v->indiv.l ) return -1;
2557
2558
26
    if (hfp->idx) {
2559
0
        if (bgzf_idx_push(fp, hfp->idx, v->rid, v->pos, v->pos + v->rlen,
2560
0
                          bgzf_tell(fp), 1) < 0)
2561
0
            return -1;
2562
0
    }
2563
2564
26
    return 0;
2565
26
}
2566
2567
/**********************
2568
 *** VCF header I/O ***
2569
 **********************/
2570
2571
0
static int add_missing_contig_hrec(bcf_hdr_t *h, const char *name) {
2572
0
    bcf_hrec_t *hrec = calloc(1, sizeof(bcf_hrec_t));
2573
0
    int save_errno;
2574
0
    if (!hrec) goto fail;
2575
2576
0
    hrec->key = strdup("contig");
2577
0
    if (!hrec->key) goto fail;
2578
2579
0
    if (bcf_hrec_add_key(hrec, "ID", strlen("ID")) < 0) goto fail;
2580
0
    if (bcf_hrec_set_val(hrec, hrec->nkeys-1, name, strlen(name), 0) < 0)
2581
0
        goto fail;
2582
0
    if (bcf_hdr_add_hrec(h, hrec) < 0)
2583
0
        goto fail;
2584
0
    return 0;
2585
2586
0
 fail:
2587
0
    save_errno = errno;
2588
0
    hts_log_error("%s", strerror(errno));
2589
0
    if (hrec) bcf_hrec_destroy(hrec);
2590
0
    errno = save_errno;
2591
0
    return -1;
2592
0
}
2593
2594
bcf_hdr_t *vcf_hdr_read(htsFile *fp)
2595
3.30k
{
2596
3.30k
    kstring_t txt, *s = &fp->line;
2597
3.30k
    int ret;
2598
3.30k
    bcf_hdr_t *h;
2599
3.30k
    tbx_t *idx = NULL;
2600
3.30k
    const char **names = NULL;
2601
3.30k
    h = bcf_hdr_init("r");
2602
3.30k
    if (!h) {
2603
0
        hts_log_error("Failed to allocate bcf header");
2604
0
        return NULL;
2605
0
    }
2606
3.30k
    txt.l = txt.m = 0; txt.s = 0;
2607
77.2k
    while ((ret = hts_getline(fp, KS_SEP_LINE, s)) >= 0) {
2608
76.7k
        int e = 0;
2609
76.7k
        if (s->l == 0) continue;
2610
72.9k
        if (s->s[0] != '#') {
2611
10
            hts_log_error("No sample line");
2612
10
            goto error;
2613
10
        }
2614
72.9k
        if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here
2615
0
            kstring_t tmp = { 0, 0, NULL };
2616
0
            hFILE *f = hopen(fp->fn_aux, "r");
2617
0
            if (f == NULL) {
2618
0
                hts_log_error("Couldn't open \"%s\"", fp->fn_aux);
2619
0
                goto error;
2620
0
            }
2621
0
            while (tmp.l = 0, kgetline(&tmp, (kgets_func *) hgets, f) >= 0) {
2622
0
                char *tab = strchr(tmp.s, '\t');
2623
0
                if (tab == NULL) continue;
2624
0
                e |= (kputs("##contig=<ID=", &txt) < 0);
2625
0
                e |= (kputsn(tmp.s, tab - tmp.s, &txt) < 0);
2626
0
                e |= (kputs(",length=", &txt) < 0);
2627
0
                e |= (kputl(atol(tab), &txt) < 0);
2628
0
                e |= (kputsn(">\n", 2, &txt) < 0);
2629
0
            }
2630
0
            free(tmp.s);
2631
0
            if (hclose(f) != 0) {
2632
0
                hts_log_error("Error on closing %s", fp->fn_aux);
2633
0
                goto error;
2634
0
            }
2635
0
            if (e) goto error;
2636
0
        }
2637
72.9k
        if (kputsn(s->s, s->l, &txt) < 0) goto error;
2638
72.9k
        if (kputc('\n', &txt) < 0) goto error;
2639
72.9k
        if (s->s[1] != '#') break;
2640
72.9k
    }
2641
3.29k
    if ( ret < -1 ) goto error;
2642
3.29k
    if ( !txt.s )
2643
0
    {
2644
0
        hts_log_error("Could not read the header");
2645
0
        goto error;
2646
0
    }
2647
3.29k
    if ( bcf_hdr_parse(h, txt.s) < 0 ) goto error;
2648
2649
    // check tabix index, are all contigs listed in the header? add the missing ones
2650
2.68k
    idx = tbx_index_load3(fp->fn, NULL, HTS_IDX_SILENT_FAIL);
2651
2.68k
    if ( idx )
2652
0
    {
2653
0
        int i, n, need_sync = 0;
2654
0
        names = tbx_seqnames(idx, &n);
2655
0
        if (!names) goto error;
2656
0
        for (i=0; i<n; i++)
2657
0
        {
2658
0
            bcf_hrec_t *hrec = bcf_hdr_get_hrec(h, BCF_HL_CTG, "ID", (char*) names[i], NULL);
2659
0
            if ( hrec ) continue;
2660
0
            if (add_missing_contig_hrec(h, names[i]) < 0) goto error;
2661
0
            need_sync = 1;
2662
0
        }
2663
0
        if ( need_sync ) {
2664
0
            if (bcf_hdr_sync(h) < 0) goto error;
2665
0
        }
2666
0
        free(names);
2667
0
        tbx_destroy(idx);
2668
0
    }
2669
2.68k
    free(txt.s);
2670
2.68k
    return h;
2671
2672
624
 error:
2673
624
    if (idx) tbx_destroy(idx);
2674
624
    free(names);
2675
624
    free(txt.s);
2676
624
    if (h) bcf_hdr_destroy(h);
2677
624
    return NULL;
2678
2.68k
}
2679
2680
int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname)
2681
0
{
2682
0
    int i = 0, n = 0, save_errno;
2683
0
    char **lines = hts_readlines(fname, &n);
2684
0
    if ( !lines ) return 1;
2685
0
    for (i=0; i<n-1; i++)
2686
0
    {
2687
0
        int k;
2688
0
        bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,lines[i],&k);
2689
0
        if (!hrec) goto fail;
2690
0
        if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
2691
0
            bcf_hrec_destroy(hrec);
2692
0
            goto fail;
2693
0
        }
2694
0
        free(lines[i]);
2695
0
        lines[i] = NULL;
2696
0
    }
2697
0
    if (bcf_hdr_parse_sample_line(hdr, lines[n-1]) < 0) goto fail;
2698
0
    if (bcf_hdr_sync(hdr) < 0) goto fail;
2699
0
    free(lines[n-1]);
2700
0
    free(lines);
2701
0
    return 0;
2702
2703
0
 fail:
2704
0
    save_errno = errno;
2705
0
    for (; i < n; i++)
2706
0
        free(lines[i]);
2707
0
    free(lines);
2708
0
    errno = save_errno;
2709
0
    return 1;
2710
0
}
2711
2712
static int _bcf_hrec_format(const bcf_hrec_t *hrec, int is_bcf, kstring_t *str)
2713
10.7k
{
2714
10.7k
    uint32_t e = 0;
2715
10.7k
    if ( !hrec->value )
2716
5.78k
    {
2717
5.78k
        int j, nout = 0;
2718
5.78k
        e |= ksprintf(str, "##%s=<", hrec->key) < 0;
2719
23.3k
        for (j=0; j<hrec->nkeys; j++)
2720
17.6k
        {
2721
            // do not output IDX if output is VCF
2722
17.6k
            if ( !is_bcf && !strcmp("IDX",hrec->keys[j]) ) continue;
2723
14.9k
            if ( nout ) e |= kputc(',',str) < 0;
2724
14.9k
            e |= ksprintf(str,"%s=%s", hrec->keys[j], hrec->vals[j]) < 0;
2725
14.9k
            nout++;
2726
14.9k
        }
2727
5.78k
        e |= ksprintf(str,">\n") < 0;
2728
5.78k
    }
2729
5.01k
    else
2730
5.01k
        e |= ksprintf(str,"##%s=%s\n", hrec->key,hrec->value) < 0;
2731
2732
10.7k
    return e == 0 ? 0 : -1;
2733
10.7k
}
2734
2735
int bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str)
2736
0
{
2737
0
    return _bcf_hrec_format(hrec,0,str);
2738
0
}
2739
2740
int bcf_hdr_format(const bcf_hdr_t *hdr, int is_bcf, kstring_t *str)
2741
2.98k
{
2742
2.98k
    int i, r = 0;
2743
13.7k
    for (i=0; i<hdr->nhrec; i++)
2744
10.7k
        r |= _bcf_hrec_format(hdr->hrec[i], is_bcf, str) < 0;
2745
2746
2.98k
    r |= ksprintf(str, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO") < 0;
2747
2.98k
    if ( bcf_hdr_nsamples(hdr) )
2748
928
    {
2749
928
        r |= ksprintf(str, "\tFORMAT") < 0;
2750
6.06k
        for (i=0; i<bcf_hdr_nsamples(hdr); i++)
2751
5.14k
            r |= ksprintf(str, "\t%s", hdr->samples[i]) < 0;
2752
928
    }
2753
2.98k
    r |= ksprintf(str, "\n") < 0;
2754
2755
2.98k
    return r ? -1 : 0;
2756
2.98k
}
2757
2758
char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len)
2759
0
{
2760
0
    kstring_t txt = {0,0,0};
2761
0
    if (bcf_hdr_format(hdr, is_bcf, &txt) < 0)
2762
0
        return NULL;
2763
0
    if ( len ) *len = txt.l;
2764
0
    return txt.s;
2765
0
}
2766
2767
const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *n)
2768
0
{
2769
0
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
2770
0
    int i, tid, m = kh_size(d);
2771
0
    const char **names = (const char**) calloc(m,sizeof(const char*));
2772
0
    if ( !names )
2773
0
    {
2774
0
        hts_log_error("Failed to allocate memory");
2775
0
        *n = 0;
2776
0
        return NULL;
2777
0
    }
2778
0
    khint_t k;
2779
0
    for (k=kh_begin(d); k<kh_end(d); k++)
2780
0
    {
2781
0
        if ( !kh_exist(d,k) ) continue;
2782
0
        if ( !kh_val(d, k).hrec[0] ) continue;  // removed via bcf_hdr_remove
2783
0
        tid = kh_val(d,k).id;
2784
0
        if ( tid >= m )
2785
0
        {
2786
            // This can happen after a contig has been removed from BCF header via bcf_hdr_remove()
2787
0
            if ( hts_resize(const char*, tid + 1, &m, &names, HTS_RESIZE_CLEAR)<0 )
2788
0
            {
2789
0
                hts_log_error("Failed to allocate memory");
2790
0
                *n = 0;
2791
0
                free(names);
2792
0
                return NULL;
2793
0
            }
2794
0
            m = tid + 1;
2795
0
        }
2796
0
        names[tid] = kh_key(d,k);
2797
0
    }
2798
    // ensure there are no gaps
2799
0
    for (i=0,tid=0; tid<m; i++,tid++)
2800
0
    {
2801
0
        while ( tid<m && !names[tid] ) tid++;
2802
0
        if ( tid==m ) break;
2803
0
        if ( i==tid ) continue;
2804
0
        names[i] = names[tid];
2805
0
        names[tid] = 0;
2806
0
    }
2807
0
    *n = i;
2808
0
    return names;
2809
0
}
2810
2811
int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h)
2812
1.49k
{
2813
1.49k
    kstring_t htxt = {0,0,0};
2814
1.49k
    if (bcf_hdr_format(h, 0, &htxt) < 0) {
2815
0
        free(htxt.s);
2816
0
        return -1;
2817
0
    }
2818
1.49k
    while (htxt.l && htxt.s[htxt.l-1] == '\0') --htxt.l; // kill trailing zeros
2819
1.49k
    int ret;
2820
1.49k
    if ( fp->format.compression!=no_compression ) {
2821
0
        ret = bgzf_write(fp->fp.bgzf, htxt.s, htxt.l);
2822
0
        if (bgzf_flush(fp->fp.bgzf) != 0) return -1;
2823
1.49k
    } else {
2824
1.49k
        ret = hwrite(fp->fp.hfile, htxt.s, htxt.l);
2825
1.49k
    }
2826
1.49k
    free(htxt.s);
2827
1.49k
    return ret<0 ? -1 : 0;
2828
1.49k
}
2829
2830
/***********************
2831
 *** Typed value I/O ***
2832
 ***********************/
2833
2834
int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
2835
196k
{
2836
196k
    int32_t max = INT32_MIN, min = INT32_MAX;
2837
196k
    int i;
2838
196k
    if (n <= 0) {
2839
3.36k
        return bcf_enc_size(s, 0, BCF_BT_NULL);
2840
192k
    } else if (n == 1) {
2841
26.8k
        return bcf_enc_int1(s, a[0]);
2842
165k
    } else {
2843
165k
        if (wsize <= 0) wsize = n;
2844
2845
        // Equivalent to:
2846
        // for (i = 0; i < n; ++i) {
2847
        //     if (a[i] == bcf_int32_missing || a[i] == bcf_int32_vector_end )
2848
        //         continue;
2849
        //     if (max < a[i]) max = a[i];
2850
        //     if (min > a[i]) min = a[i];
2851
        // }
2852
165k
        int max4[4] = {INT32_MIN, INT32_MIN, INT32_MIN, INT32_MIN};
2853
165k
        int min4[4] = {INT32_MAX, INT32_MAX, INT32_MAX, INT32_MAX};
2854
27.8M
        for (i = 0; i < (n&~3); i+=4) {
2855
            // bcf_int32_missing    == INT32_MIN and
2856
            // bcf_int32_vector_end == INT32_MIN+1.
2857
            // We skip these, but can mostly avoid explicit checking
2858
27.6M
            if (max4[0] < a[i+0]) max4[0] = a[i+0];
2859
27.6M
            if (max4[1] < a[i+1]) max4[1] = a[i+1];
2860
27.6M
            if (max4[2] < a[i+2]) max4[2] = a[i+2];
2861
27.6M
            if (max4[3] < a[i+3]) max4[3] = a[i+3];
2862
27.6M
            if (min4[0] > a[i+0] && a[i+0] > INT32_MIN+1) min4[0] = a[i+0];
2863
27.6M
            if (min4[1] > a[i+1] && a[i+1] > INT32_MIN+1) min4[1] = a[i+1];
2864
27.6M
            if (min4[2] > a[i+2] && a[i+2] > INT32_MIN+1) min4[2] = a[i+2];
2865
27.6M
            if (min4[3] > a[i+3] && a[i+3] > INT32_MIN+1) min4[3] = a[i+3];
2866
27.6M
        }
2867
165k
        min = min4[0];
2868
165k
        if (min > min4[1]) min = min4[1];
2869
165k
        if (min > min4[2]) min = min4[2];
2870
165k
        if (min > min4[3]) min = min4[3];
2871
165k
        max = max4[0];
2872
165k
        if (max < max4[1]) max = max4[1];
2873
165k
        if (max < max4[2]) max = max4[2];
2874
165k
        if (max < max4[3]) max = max4[3];
2875
397k
        for (; i < n; ++i) {
2876
231k
            if (max < a[i]) max = a[i];
2877
231k
            if (min > a[i] && a[i] > INT32_MIN+1) min = a[i];
2878
231k
        }
2879
2880
165k
        if (max <= BCF_MAX_BT_INT8 && min >= BCF_MIN_BT_INT8) {
2881
19.8k
            if (bcf_enc_size(s, wsize, BCF_BT_INT8) < 0 ||
2882
19.8k
                ks_resize(s, s->l + n) < 0)
2883
0
                return -1;
2884
19.8k
            uint8_t *p = (uint8_t *) s->s + s->l;
2885
3.99M
            for (i = 0; i < n; ++i, p++) {
2886
3.97M
                if ( a[i]==bcf_int32_vector_end )   *p = bcf_int8_vector_end;
2887
3.97M
                else if ( a[i]==bcf_int32_missing ) *p = bcf_int8_missing;
2888
78.5k
                else *p = a[i];
2889
3.97M
            }
2890
19.8k
            s->l += n;
2891
146k
        } else if (max <= BCF_MAX_BT_INT16 && min >= BCF_MIN_BT_INT16) {
2892
105k
            uint8_t *p;
2893
105k
            if (bcf_enc_size(s, wsize, BCF_BT_INT16) < 0 ||
2894
105k
                ks_resize(s, s->l + n * sizeof(int16_t)) < 0)
2895
0
                return -1;
2896
105k
            p = (uint8_t *) s->s + s->l;
2897
40.4M
            for (i = 0; i < n; ++i)
2898
40.3M
            {
2899
40.3M
                int16_t x;
2900
40.3M
                if ( a[i]==bcf_int32_vector_end ) x = bcf_int16_vector_end;
2901
40.2M
                else if ( a[i]==bcf_int32_missing ) x = bcf_int16_missing;
2902
645k
                else x = a[i];
2903
40.3M
                i16_to_le(x, p);
2904
40.3M
                p += sizeof(int16_t);
2905
40.3M
            }
2906
105k
            s->l += n * sizeof(int16_t);
2907
105k
        } else {
2908
40.7k
            uint8_t *p;
2909
40.7k
            if (bcf_enc_size(s, wsize, BCF_BT_INT32) < 0 ||
2910
40.7k
                ks_resize(s, s->l + n * sizeof(int32_t)) < 0)
2911
0
                return -1;
2912
40.7k
            p = (uint8_t *) s->s + s->l;
2913
66.6M
            for (i = 0; i < n; ++i) {
2914
66.5M
                i32_to_le(a[i], p);
2915
66.5M
                p += sizeof(int32_t);
2916
66.5M
            }
2917
40.7k
            s->l += n * sizeof(int32_t);
2918
40.7k
        }
2919
165k
    }
2920
2921
165k
    return 0;
2922
196k
}
2923
2924
#ifdef VCF_ALLOW_INT64
2925
static int bcf_enc_long1(kstring_t *s, int64_t x) {
2926
    uint32_t e = 0;
2927
    if (x <= BCF_MAX_BT_INT32 && x >= BCF_MIN_BT_INT32)
2928
        return bcf_enc_int1(s, x);
2929
    if (x == bcf_int64_vector_end) {
2930
        e |= bcf_enc_size(s, 1, BCF_BT_INT8);
2931
        e |= kputc(bcf_int8_vector_end, s) < 0;
2932
    } else if (x == bcf_int64_missing) {
2933
        e |= bcf_enc_size(s, 1, BCF_BT_INT8);
2934
        e |= kputc(bcf_int8_missing, s) < 0;
2935
    } else {
2936
        e |= bcf_enc_size(s, 1, BCF_BT_INT64);
2937
        e |= ks_expand(s, 8);
2938
        if (e == 0) { u64_to_le(x, (uint8_t *) s->s + s->l); s->l += 8; }
2939
    }
2940
    return e == 0 ? 0 : -1;
2941
}
2942
#endif
2943
2944
488k
static inline int serialize_float_array(kstring_t *s, size_t n, const float *a) {
2945
488k
    uint8_t *p;
2946
488k
    size_t i;
2947
488k
    size_t bytes = n * sizeof(float);
2948
2949
488k
    if (bytes / sizeof(float) != n) return -1;
2950
488k
    if (ks_resize(s, s->l + bytes) < 0) return -1;
2951
2952
488k
    p = (uint8_t *) s->s + s->l;
2953
110M
    for (i = 0; i < n; i++) {
2954
109M
        float_to_le(a[i], p);
2955
109M
        p += sizeof(float);
2956
109M
    }
2957
488k
    s->l += bytes;
2958
2959
488k
    return 0;
2960
488k
}
2961
2962
int bcf_enc_vfloat(kstring_t *s, int n, float *a)
2963
488k
{
2964
488k
    assert(n >= 0);
2965
488k
    bcf_enc_size(s, n, BCF_BT_FLOAT);
2966
488k
    serialize_float_array(s, n, a);
2967
488k
    return 0; // FIXME: check for errs in this function
2968
488k
}
2969
2970
int bcf_enc_vchar(kstring_t *s, int l, const char *a)
2971
2.37M
{
2972
2.37M
    bcf_enc_size(s, l, BCF_BT_CHAR);
2973
2.37M
    kputsn(a, l, s);
2974
2.37M
    return 0; // FIXME: check for errs in this function
2975
2.37M
}
2976
2977
// Special case of n==1 as it also occurs quite often in FORMAT data.
2978
// This version is also small enough to get inlined.
2979
5.08k
static inline int bcf_fmt_array1(kstring_t *s, int type, void *data) {
2980
5.08k
    uint32_t e = 0;
2981
5.08k
    uint8_t *p = (uint8_t *)data;
2982
5.08k
    int32_t v;
2983
2984
    // helps gcc more than clang here. In billions of cycles:
2985
    //          bcf_fmt_array1  bcf_fmt_array
2986
    // gcc7:    23.2            24.3
2987
    // gcc13:   21.6            23.0
2988
    // clang13: 27.1            27.8
2989
5.08k
    switch (type) {
2990
5.08k
    case BCF_BT_CHAR:
2991
5.08k
        e |= kputc_(*p == bcf_str_missing ? '.' : *p, s) < 0;
2992
5.08k
        break;
2993
2994
0
    case BCF_BT_INT8:
2995
0
        if (*(int8_t *)p != bcf_int8_vector_end) {
2996
0
            e |= ((*(int8_t *)p == bcf_int8_missing)
2997
0
                  ? kputc_('.', s)
2998
0
                  : kputw(*(int8_t *)p, s)) < 0;
2999
0
        }
3000
0
        break;
3001
0
    case BCF_BT_INT16:
3002
0
        v = le_to_i16(p);
3003
0
        if (v != bcf_int16_vector_end) {
3004
0
            e |= (v == bcf_int16_missing
3005
0
                  ? kputc_('.', s)
3006
0
                  : kputw(v, s)) < 0;
3007
0
        }
3008
0
        break;
3009
3010
0
    case BCF_BT_INT32:
3011
0
        v = le_to_i32(p);
3012
0
        if (v != bcf_int32_vector_end) {
3013
0
            e |= (v == bcf_int32_missing
3014
0
                  ? kputc_('.', s)
3015
0
                  : kputw(v, s)) < 0;
3016
0
        }
3017
0
        break;
3018
3019
0
    case BCF_BT_FLOAT:
3020
0
        v = le_to_u32(p);
3021
0
        if (v != bcf_float_vector_end) {
3022
0
            e |= (v == bcf_float_missing
3023
0
                  ? kputc_('.', s)
3024
0
                  : kputd(le_to_float(p), s)) < 0;
3025
0
        }
3026
0
        break;
3027
3028
0
    default:
3029
0
        hts_log_error("Unexpected type %d", type);
3030
0
        return -1;
3031
5.08k
    }
3032
3033
5.08k
    return e == 0 ? 0 : -1;
3034
5.08k
}
3035
3036
int bcf_fmt_array(kstring_t *s, int n, int type, void *data)
3037
2.15M
{
3038
2.15M
    int j = 0;
3039
2.15M
    uint32_t e = 0;
3040
2.15M
    if (n == 0) {
3041
1.31M
        return kputc_('.', s) >= 0 ? 0 : -1;
3042
1.31M
    }
3043
3044
842k
    if (type == BCF_BT_CHAR)
3045
218k
    {
3046
218k
        char *p = (char *)data;
3047
3048
        // Note bcf_str_missing is already accounted for in n==0 above.
3049
218k
        if (n >= 8) {
3050
58.4k
            char *p_end = memchr(p, 0, n);
3051
58.4k
            e |= kputsn(p, p_end ? p_end-p : n, s) < 0;
3052
160k
        } else {
3053
596k
            for (j = 0; j < n && *p; ++j, ++p)
3054
436k
               e |= kputc(*p, s) < 0;
3055
160k
        }
3056
218k
    }
3057
623k
    else
3058
623k
    {
3059
623k
        #define BRANCH(type_t, convert, is_missing, is_vector_end, kprint) { \
3060
623k
            uint8_t *p = (uint8_t *) data; \
3061
109M
            for (j=0; j<n; j++, p += sizeof(type_t))    \
3062
108M
            { \
3063
108M
                type_t v = convert(p); \
3064
108M
                if ( is_vector_end ) break; \
3065
108M
                if ( j ) e |= kputc_(',', s) < 0; \
3066
108M
                e |= (is_missing ? kputc('.', s) : kprint) < 0; \
3067
108M
            } \
3068
623k
        }
3069
623k
        switch (type) {
3070
163k
            case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8, v==bcf_int8_missing,  v==bcf_int8_vector_end,  kputw(v, s)); break;
3071
104k
            case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, v==bcf_int16_missing, v==bcf_int16_vector_end, kputw(v, s)); break;
3072
111k
            case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, v==bcf_int32_missing, v==bcf_int32_vector_end, kputw(v, s)); break;
3073
244k
            case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, v==bcf_float_missing, v==bcf_float_vector_end, kputd(le_to_float(p), s)); break;
3074
0
            default: hts_log_error("Unexpected type %d", type); exit(1); break;
3075
623k
        }
3076
623k
        #undef BRANCH
3077
623k
    }
3078
842k
    return e == 0 ? 0 : -1;
3079
842k
}
3080
3081
uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr)
3082
1.44M
{
3083
1.44M
    int x, type;
3084
1.44M
    x = bcf_dec_size(ptr, &ptr, &type);
3085
1.44M
    bcf_fmt_array(s, x, type, ptr);
3086
1.44M
    return ptr + (x << bcf_type_shift[type]);
3087
1.44M
}
3088
3089
/********************
3090
 *** VCF site I/O ***
3091
 ********************/
3092
3093
typedef struct {
3094
    int key;            // Key for h->id[BCF_DT_ID][key] vdict
3095
    int max_m;          // number of elements in field array (ie commas)
3096
    int size;           // field size (max_l or max_g*4 if is_gt)
3097
    int offset;         // offset of buf into h->mem
3098
    uint32_t is_gt:1,   // is genotype
3099
             max_g:31;  // maximum number of genotypes
3100
    uint32_t max_l;     // length of field
3101
    uint32_t y;         // h->id[0][fmt[j].key].val->info[BCF_HL_FMT]
3102
    uint8_t *buf;       // Pointer into h->mem
3103
} fmt_aux_t;
3104
3105
// fmt_aux_t field notes:
3106
// max_* are biggest sizes of the various FORMAT fields across all samples.
3107
// We use these after pivoting the data to ensure easy random access
3108
// of a specific sample.
3109
//
3110
// max_m is only used for type BCF_HT_REAL or BCF_HT_INT
3111
// max_g is only used for is_gt == 1 (will be BCF_HT_STR)
3112
// max_l is only used for is_gt == 0 (will be BCF_HT_STR)
3113
//
3114
// These are computed in vcf_parse_format_max3 and used in
3115
// vcf_parse_format_alloc4 to get the size.
3116
//
3117
// size is computed from max_g, max_l, max_m and is_gt.  Once computed
3118
// the max values are never accessed again.
3119
//
3120
// In theory all 4 vars could be coalesced into a single variable, but this
3121
// significantly harms speed (even if done via a union).  It's about 25-30%
3122
// slower.
3123
3124
static inline int align_mem(kstring_t *s)
3125
57.1k
{
3126
57.1k
    int e = 0;
3127
57.1k
    if (s->l&7) {
3128
8.07k
        uint64_t zero = 0;
3129
8.07k
        e = kputsn((char*)&zero, 8 - (s->l&7), s) < 0;
3130
8.07k
    }
3131
57.1k
    return e == 0 ? 0 : -1;
3132
57.1k
}
3133
3134
58.2k
#define MAX_N_FMT 255   /* Limited by size of bcf1_t n_fmt field */
3135
3136
// detect FORMAT "."
3137
static int vcf_parse_format_empty1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3138
9.40k
                                   const char *p, const char *q) {
3139
9.40k
    const char *end = s->s + s->l;
3140
9.40k
    if ( q>=end )
3141
34
    {
3142
34
        hts_log_error("FORMAT column with no sample columns starting at %s:%"PRIhts_pos"", bcf_seqname_safe(h,v), v->pos+1);
3143
34
        v->errcode |= BCF_ERR_NCOLS;
3144
34
        return -1;
3145
34
    }
3146
3147
9.36k
    v->n_fmt = 0;
3148
9.36k
    if ( p[0]=='.' && p[1]==0 ) // FORMAT field is empty "."
3149
194
    {
3150
194
        v->n_sample = bcf_hdr_nsamples(h);
3151
194
        return 1;
3152
194
    }
3153
3154
9.17k
    return 0;
3155
9.36k
}
3156
3157
// get format information from the dictionary
3158
static int vcf_parse_format_dict2(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3159
9.17k
                                  const char *p, const char *q, fmt_aux_t *fmt) {
3160
9.17k
    const vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
3161
9.17k
    char *t;
3162
9.17k
    int j;
3163
9.17k
    ks_tokaux_t aux1;
3164
3165
67.4k
    for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) {
3166
58.2k
        if (j >= MAX_N_FMT) {
3167
3
            v->errcode |= BCF_ERR_LIMITS;
3168
3
            hts_log_error("FORMAT column at %s:%"PRIhts_pos" lists more identifiers than htslib can handle",
3169
3
                bcf_seqname_safe(h,v), v->pos+1);
3170
3
            return -1;
3171
3
        }
3172
3173
58.2k
        *(char*)aux1.p = 0;
3174
58.2k
        khint_t k = kh_get(vdict, d, t);
3175
58.2k
        if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_FMT] == 15) {
3176
4.35k
            if ( t[0]=='.' && t[1]==0 )
3177
1
            {
3178
1
                hts_log_error("Invalid FORMAT tag name '.' at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3179
1
                v->errcode |= BCF_ERR_TAG_INVALID;
3180
1
                return -1;
3181
1
            }
3182
4.35k
            hts_log_warning("FORMAT '%s' at %s:%"PRIhts_pos" is not defined in the header, assuming Type=String", t, bcf_seqname_safe(h,v), v->pos+1);
3183
4.35k
            kstring_t tmp = {0,0,0};
3184
4.35k
            int l;
3185
4.35k
            ksprintf(&tmp, "##FORMAT=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", t);
3186
4.35k
            bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
3187
4.35k
            free(tmp.s);
3188
4.35k
            int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
3189
4.35k
            if (res < 0) bcf_hrec_destroy(hrec);
3190
4.35k
            if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
3191
3192
4.35k
            k = kh_get(vdict, d, t);
3193
4.35k
            v->errcode |= BCF_ERR_TAG_UNDEF;
3194
4.35k
            if (res || k == kh_end(d)) {
3195
8
                hts_log_error("Could not add dummy header for FORMAT '%s' at %s:%"PRIhts_pos, t, bcf_seqname_safe(h,v), v->pos+1);
3196
8
                v->errcode |= BCF_ERR_TAG_INVALID;
3197
8
                return -1;
3198
8
            }
3199
4.35k
        }
3200
58.2k
        fmt[j].max_l = fmt[j].max_m = fmt[j].max_g = 0;
3201
58.2k
        fmt[j].key = kh_val(d, k).id;
3202
58.2k
        fmt[j].is_gt = (t[0] == 'G' && t[1] == 'T' && !t[2]);
3203
58.2k
        fmt[j].y = h->id[0][fmt[j].key].val->info[BCF_HL_FMT];
3204
58.2k
        v->n_fmt++;
3205
58.2k
    }
3206
9.16k
    return 0;
3207
9.17k
}
3208
3209
// compute max
3210
static int vcf_parse_format_max3(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3211
9.16k
                                 char *p, char *q, fmt_aux_t *fmt) {
3212
9.16k
    int n_sample_ori = -1;
3213
9.16k
    char *r = q + 1;  // r: position in the format string
3214
9.16k
    int l = 0, m = 1, g = 1, j;
3215
9.16k
    v->n_sample = 0;  // m: max vector size, l: max field len, g: max number of alleles
3216
9.16k
    const char *end = s->s + s->l;
3217
3218
20.2k
    while ( r<end )
3219
20.1k
    {
3220
        // can we skip some samples?
3221
20.1k
        if ( h->keep_samples )
3222
0
        {
3223
0
            n_sample_ori++;
3224
0
            if ( !bit_array_test(h->keep_samples,n_sample_ori) )
3225
0
            {
3226
0
                while ( *r!='\t' && r<end ) r++;
3227
0
                if ( *r=='\t' ) { *r = 0; r++; }
3228
0
                continue;
3229
0
            }
3230
0
        }
3231
3232
        // collect fmt stats: max vector size, length, number of alleles
3233
20.1k
        j = 0;  // j-th format field
3234
20.1k
        fmt_aux_t *f = fmt;
3235
20.1k
        static char meta[256] = {
3236
            // \0 \t , / : |
3237
20.1k
            1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3238
20.1k
            0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1, 0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,
3239
20.1k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3240
20.1k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
3241
20.1k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3242
20.1k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3243
20.1k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3244
20.1k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3245
20.1k
        };
3246
3247
20.1k
        char *r_start = r;
3248
3.63M
        for (;;) {
3249
            // Quickly skip ahead to an appropriate meta-character
3250
4.05M
            while (!meta[(unsigned char)*r]) r++;
3251
3252
3.63M
            switch (*r) {
3253
3.59M
            case ',':
3254
3.59M
                m++;
3255
3.59M
                break;
3256
3257
644
            case '|':
3258
11.8k
            case '/':
3259
11.8k
                if (f->is_gt) g++;
3260
11.8k
                break;
3261
3262
8.38k
            case '\t':
3263
8.38k
                *r = 0; // fall through
3264
3265
8.38k
            default: // valid due to while loop above.
3266
20.1k
            case '\0':
3267
32.2k
            case ':':
3268
32.2k
                l = r - r_start; r_start = r;
3269
32.2k
                if (f->max_m < m) f->max_m = m;
3270
32.2k
                if (f->max_l < l) f->max_l = l;
3271
32.2k
                if (f->is_gt && f->max_g < g) f->max_g = g;
3272
32.2k
                l = 0, m = g = 1;
3273
32.2k
                if ( *r==':' ) {
3274
12.0k
                    j++; f++;
3275
12.0k
                    if ( j>=v->n_fmt ) {
3276
18
                        hts_log_error("Incorrect number of FORMAT fields at %s:%"PRIhts_pos"",
3277
18
                                      h->id[BCF_DT_CTG][v->rid].key, v->pos+1);
3278
18
                        v->errcode |= BCF_ERR_NCOLS;
3279
18
                        return -1;
3280
18
                    }
3281
20.1k
                } else goto end_for;
3282
12.0k
                break;
3283
3.63M
            }
3284
3.61M
            if ( r>=end ) break;
3285
3.61M
            r++;
3286
3.61M
        }
3287
20.1k
    end_for:
3288
20.1k
        v->n_sample++;
3289
20.1k
        if ( v->n_sample == bcf_hdr_nsamples(h) ) break;
3290
11.0k
        r++;
3291
11.0k
    }
3292
3293
9.14k
    return 0;
3294
9.16k
}
3295
3296
// allocate memory for arrays
3297
static int vcf_parse_format_alloc4(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3298
                                   const char *p, const char *q,
3299
9.14k
                                   fmt_aux_t *fmt) {
3300
9.14k
    kstring_t *mem = (kstring_t*)&h->mem;
3301
3302
9.14k
    int j;
3303
66.3k
    for (j = 0; j < v->n_fmt; ++j) {
3304
57.1k
        fmt_aux_t *f = &fmt[j];
3305
57.1k
        if ( !f->max_m ) f->max_m = 1;  // omitted trailing format field
3306
3307
57.1k
        if ((f->y>>4&0xf) == BCF_HT_STR) {
3308
57.1k
            f->size = f->is_gt? f->max_g << 2 : f->max_l;
3309
57.1k
        } else if ((f->y>>4&0xf) == BCF_HT_REAL || (f->y>>4&0xf) == BCF_HT_INT) {
3310
0
            f->size = f->max_m << 2;
3311
0
        } else {
3312
0
            hts_log_error("The format type %d at %s:%"PRIhts_pos" is currently not supported", f->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1);
3313
0
            v->errcode |= BCF_ERR_TAG_INVALID;
3314
0
            return -1;
3315
0
        }
3316
3317
57.1k
        if (align_mem(mem) < 0) {
3318
0
            hts_log_error("Memory allocation failure at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3319
0
            v->errcode |= BCF_ERR_LIMITS;
3320
0
            return -1;
3321
0
        }
3322
3323
        // Limit the total memory to ~2Gb per VCF row.  This should mean
3324
        // malformed VCF data is less likely to take excessive memory and/or
3325
        // time.
3326
57.1k
        if ((uint64_t) mem->l + v->n_sample * (uint64_t)f->size > INT_MAX) {
3327
0
            static int warned = 0;
3328
0
            if ( !warned ) hts_log_warning("Excessive memory required by FORMAT fields at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3329
0
            warned = 1;
3330
0
            v->errcode |= BCF_ERR_LIMITS;
3331
0
            f->size = -1;
3332
0
            f->offset = 0;
3333
0
            continue;
3334
0
        }
3335
3336
57.1k
        f->offset = mem->l;
3337
57.1k
        if (ks_resize(mem, mem->l + v->n_sample * (size_t)f->size) < 0) {
3338
0
            hts_log_error("Memory allocation failure at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3339
0
            v->errcode |= BCF_ERR_LIMITS;
3340
0
            return -1;
3341
0
        }
3342
57.1k
        mem->l += v->n_sample * f->size;
3343
57.1k
    }
3344
3345
9.14k
    {
3346
9.14k
        int j;
3347
66.3k
        for (j = 0; j < v->n_fmt; ++j)
3348
57.1k
            fmt[j].buf = (uint8_t*)mem->s + fmt[j].offset;
3349
9.14k
    }
3350
3351
    // check for duplicate tags
3352
9.14k
    int i;
3353
57.1k
    for (i=1; i<v->n_fmt; i++)
3354
48.0k
    {
3355
48.0k
        fmt_aux_t *ifmt = &fmt[i];
3356
48.0k
        if ( ifmt->size==-1 ) continue; // already marked for removal
3357
248k
        for (j=0; j<i; j++)
3358
236k
        {
3359
236k
            fmt_aux_t *jfmt = &fmt[j];
3360
236k
            if ( jfmt->size==-1 ) continue; // already marked for removal
3361
111k
            if ( ifmt->key!=jfmt->key ) continue;
3362
36.1k
            static int warned = 0;
3363
36.1k
            if ( !warned ) hts_log_warning("Duplicate FORMAT tag %s at %s:%"PRIhts_pos, bcf_hdr_int2id(h,BCF_DT_ID,ifmt->key), bcf_seqname_safe(h,v), v->pos+1);
3364
36.1k
            warned = 1;
3365
36.1k
            v->errcode |= BCF_ERR_TAG_INVALID;
3366
36.1k
            ifmt->size = -1;
3367
36.1k
            ifmt->offset = 0;
3368
36.1k
            break;
3369
111k
        }
3370
48.0k
    }
3371
9.14k
    return 0;
3372
9.14k
}
3373
3374
// Fill the sample fields
3375
static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3376
9.14k
                                  const char *p, const char *q, fmt_aux_t *fmt) {
3377
9.14k
    static int extreme_val_warned = 0;
3378
9.14k
    int n_sample_ori = -1;
3379
    // At beginning of the loop t points to the first char of a format
3380
9.14k
    const char *t = q + 1;
3381
9.14k
    int m = 0;   // m: sample id
3382
9.14k
    const int nsamples = bcf_hdr_nsamples(h);
3383
9.14k
    const char *end = s->s + s->l;
3384
3385
9.14k
    int ver = bcf_get_version(h, NULL);
3386
3387
29.1k
    while ( t<end )
3388
28.0k
    {
3389
        // can we skip some samples?
3390
28.0k
        if ( h->keep_samples )
3391
0
        {
3392
0
            n_sample_ori++;
3393
0
            if ( !bit_array_test(h->keep_samples,n_sample_ori) )
3394
0
            {
3395
0
                while ( *t && t<end ) t++;
3396
0
                t++;
3397
0
                continue;
3398
0
            }
3399
0
        }
3400
28.0k
        if ( m == nsamples ) break;
3401
3402
20.0k
        int j = 0; // j-th format field, m-th sample
3403
32.0k
        while ( t < end )
3404
31.5k
        {
3405
31.5k
            fmt_aux_t *z = &fmt[j++];
3406
31.5k
            const int htype = z->y>>4&0xf;
3407
31.5k
            if (!z->buf) {
3408
10
                hts_log_error("Memory allocation failure for FORMAT field type %d at %s:%"PRIhts_pos,
3409
10
                              z->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1);
3410
10
                v->errcode |= BCF_ERR_LIMITS;
3411
10
                return -1;
3412
10
            }
3413
3414
31.5k
            if ( z->size==-1 )
3415
4.67k
            {
3416
                // this field is to be ignored, it's either too big or a duplicate
3417
39.3k
                while ( *t != ':' && *t ) t++;
3418
4.67k
            }
3419
26.8k
            else if (htype == BCF_HT_STR) {
3420
26.8k
                int l;
3421
26.8k
                if (z->is_gt) {
3422
                    // Genotypes.
3423
                    //([/|])?<val>)([|/]<val>)+... where <val> is [0-9]+ or ".".
3424
4.17k
                    int32_t is_phased = 0;
3425
4.17k
                    uint32_t *x = (uint32_t*)(z->buf + z->size * (size_t)m);
3426
4.17k
                    uint32_t unreadable = 0;
3427
4.17k
                    uint32_t max = 0;
3428
4.17k
                    int overflow = 0, ploidy = 0, anyunphased = 0, \
3429
4.17k
                        phasingprfx = 0, unknown1 = 0;
3430
3431
                    /* with prefixed phasing, it is explicitly given for 1st one
3432
                    with non-prefixed, set based on ploidy and phasing of other
3433
                    alleles. */
3434
4.17k
                    if (ver >= VCF44 && (*t == '|' || *t == '/')) {
3435
                        // cache prefix and phasing status
3436
338
                        is_phased = *t++ == '|';
3437
338
                        phasingprfx = 1;
3438
338
                    }
3439
3440
13.8k
                    for (l = 0;; ++t) {
3441
13.8k
                        ploidy++;
3442
13.8k
                        if (*t == '.') {
3443
3.63k
                            ++t, x[l++] = is_phased;
3444
3.63k
                            if (l==1) {   //for 1st allele only
3445
597
                                unknown1 = 1;
3446
597
                            }
3447
10.1k
                        } else {
3448
10.1k
                            const char *tt = t;
3449
10.1k
                            uint32_t val;
3450
                            // Or "v->n_allele < 10", but it doesn't
3451
                            // seem to be any faster and this feels safer.
3452
10.1k
                            if (*t >= '0' && *t <= '9' &&
3453
9.81k
                                !(t[1] >= '0' && t[1] <= '9')) {
3454
5.72k
                                val = *t++ - '0';
3455
5.72k
                            } else {
3456
4.45k
                                val = hts_str2uint(t, (char **)&t,
3457
4.45k
                                                   sizeof(val) * CHAR_MAX - 2,
3458
4.45k
                                                   &overflow);
3459
4.45k
                                unreadable |= tt == t;
3460
4.45k
                            }
3461
10.1k
                            if (max < val) max = val;
3462
10.1k
                            x[l++] = (val + 1) << 1 | is_phased;
3463
10.1k
                        }
3464
13.8k
                        anyunphased |= (ploidy != 1) && !is_phased;
3465
13.8k
                        is_phased = (*t == '|');
3466
13.8k
                        if (*t != '|' && *t != '/') break;
3467
13.8k
                    }
3468
4.17k
                    if (!phasingprfx) { //get GT in v44 way when no prefixed phasing
3469
                        /* no explicit phasing for 1st allele, set based on
3470
                         other alleles and ploidy */
3471
3.83k
                        if (ploidy == 1) {  //implicitly phased
3472
1.32k
                            if (!unknown1) {
3473
956
                                x[0] |= 1;
3474
956
                            }
3475
2.51k
                        } else {            //set by other unphased alleles
3476
2.51k
                            x[0] |= (anyunphased)? 0 : 1;
3477
2.51k
                        }
3478
3.83k
                    }
3479
                    // Possibly check max against v->n_allele instead?
3480
4.17k
                    if (overflow || max > (INT32_MAX >> 1) - 1) {
3481
41
                        hts_log_error("Couldn't read GT data: value too large at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3482
41
                        return -1;
3483
41
                    }
3484
4.13k
                    if (unreadable) {
3485
17
                        hts_log_error("Couldn't read GT data: value not a number or '.' at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3486
17
                        return -1;
3487
17
                    }
3488
4.11k
                    if ( !l ) x[l++] = 0;   // An empty field, insert missing value
3489
4.92k
                    for (; l < z->size>>2; ++l)
3490
810
                        x[l] = bcf_int32_vector_end;
3491
3492
22.6k
                } else {
3493
                    // Otherwise arbitrary strings
3494
22.6k
                    char *x = (char*)z->buf + z->size * (size_t)m;
3495
3.92M
                    for (l = 0; *t != ':' && *t; ++t)
3496
3.90M
                        x[l++] = *t;
3497
22.6k
                    if (z->size > l)
3498
12.0k
                        memset(&x[l], 0, (z->size-l) * sizeof(*x));
3499
22.6k
                }
3500
3501
26.8k
            } else if (htype == BCF_HT_INT) {
3502
                // One or more integers in an array
3503
0
                int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
3504
0
                int l;
3505
0
                for (l = 0;; ++t) {
3506
0
                    if (*t == '.') {
3507
0
                        x[l++] = bcf_int32_missing, ++t; // ++t to skip "."
3508
0
                    } else {
3509
0
                        int overflow = 0;
3510
0
                        char *te;
3511
0
                        long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
3512
0
                        if ( te==t || overflow || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 )
3513
0
                        {
3514
0
                            if ( !extreme_val_warned )
3515
0
                            {
3516
0
                                hts_log_warning("Extreme FORMAT/%s value encountered and set to missing at %s:%"PRIhts_pos,
3517
0
                                                h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname_safe(h,v), v->pos+1);
3518
0
                                extreme_val_warned = 1;
3519
0
                            }
3520
0
                            tmp_val = bcf_int32_missing;
3521
0
                        }
3522
0
                        x[l++] = tmp_val;
3523
0
                        t = te;
3524
0
                    }
3525
0
                    if (*t != ',') break;
3526
0
                }
3527
0
                if ( !l )
3528
0
                    x[l++] = bcf_int32_missing;
3529
0
                for (; l < z->size>>2; ++l)
3530
0
                    x[l] = bcf_int32_vector_end;
3531
3532
0
            } else if (htype == BCF_HT_REAL) {
3533
                // One of more floating point values in an array
3534
0
                float *x = (float*)(z->buf + z->size * (size_t)m);
3535
0
                int l;
3536
0
                for (l = 0;; ++t) {
3537
0
                    if (*t == '.' && !isdigit_c(t[1])) {
3538
0
                        bcf_float_set_missing(x[l++]), ++t; // ++t to skip "."
3539
0
                    } else {
3540
0
                        int overflow = 0;
3541
0
                        char *te;
3542
0
                        float tmp_val = hts_str2dbl(t, &te, &overflow);
3543
0
                        if ( (te==t || overflow) && !extreme_val_warned )
3544
0
                        {
3545
0
                            hts_log_warning("Extreme FORMAT/%s value encountered at %s:%"PRIhts_pos, h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname(h,v), v->pos+1);
3546
0
                            extreme_val_warned = 1;
3547
0
                        }
3548
0
                        x[l++] = tmp_val;
3549
0
                        t = te;
3550
0
                    }
3551
0
                    if (*t != ',') break;
3552
0
                }
3553
0
                if ( !l )
3554
                    // An empty field, insert missing value
3555
0
                    bcf_float_set_missing(x[l++]);
3556
0
                for (; l < z->size>>2; ++l)
3557
0
                    bcf_float_set_vector_end(x[l]);
3558
0
            } else {
3559
0
                hts_log_error("Unknown FORMAT field type %d at %s:%"PRIhts_pos, htype, bcf_seqname_safe(h,v), v->pos+1);
3560
0
                v->errcode |= BCF_ERR_TAG_INVALID;
3561
0
                return -1;
3562
0
            }
3563
3564
31.4k
            if (*t == '\0') {
3565
19.5k
                break;
3566
19.5k
            }
3567
11.9k
            else if (*t == ':') {
3568
11.9k
                t++;
3569
11.9k
            }
3570
8
            else {
3571
8
                char buffer[8];
3572
8
                hts_log_error("Invalid character %s in '%s' FORMAT field at %s:%"PRIhts_pos"",
3573
8
                    hts_strprint(buffer, sizeof buffer, '\'', t, 1),
3574
8
                    h->id[BCF_DT_ID][z->key].key, bcf_seqname_safe(h,v), v->pos+1);
3575
8
                v->errcode |= BCF_ERR_CHAR;
3576
8
                return -1;
3577
8
            }
3578
31.4k
        }
3579
3580
        // fill end-of-vector values
3581
285k
        for (; j < v->n_fmt; ++j) {
3582
265k
            fmt_aux_t *z = &fmt[j];
3583
265k
            const int htype = z->y>>4&0xf;
3584
265k
            int l;
3585
3586
265k
            if (z->size == -1) // this field is to be ignored
3587
221k
                continue;
3588
3589
43.7k
            if (htype == BCF_HT_STR) {
3590
43.7k
                if (z->is_gt) {
3591
7.61k
                    int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
3592
7.61k
                    if (z->size) x[0] = bcf_int32_missing;
3593
21.0k
                    for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
3594
36.1k
                } else {
3595
36.1k
                    char *x = (char*)z->buf + z->size * (size_t)m;
3596
36.1k
                    if ( z->size ) {
3597
7.29k
                        x[0] = '.';
3598
7.29k
                        memset(&x[1], 0, (z->size-1) * sizeof(*x));
3599
7.29k
                    }
3600
36.1k
                }
3601
43.7k
            } else if (htype == BCF_HT_INT) {
3602
0
                int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
3603
0
                x[0] = bcf_int32_missing;
3604
0
                for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
3605
0
            } else if (htype == BCF_HT_REAL) {
3606
0
                float *x = (float*)(z->buf + z->size * (size_t)m);
3607
0
                bcf_float_set_missing(x[0]);
3608
0
                for (l = 1; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]);
3609
0
            }
3610
43.7k
        }
3611
3612
19.9k
        m++; t++;
3613
19.9k
    }
3614
3615
9.06k
    return 0;
3616
9.14k
}
3617
3618
// write individual genotype information
3619
static int vcf_parse_format_gt6(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3620
9.06k
                                const char *p, const char *q, fmt_aux_t *fmt) {
3621
9.06k
    kstring_t *str = &v->indiv;
3622
9.06k
    int i, need_downsize = 0;
3623
9.06k
    if (v->n_sample > 0) {
3624
63.6k
        for (i = 0; i < v->n_fmt; ++i) {
3625
54.6k
            fmt_aux_t *z = &fmt[i];
3626
54.6k
            if ( z->size==-1 ) {
3627
33.9k
                need_downsize = 1;
3628
33.9k
                continue;
3629
33.9k
            }
3630
20.7k
            bcf_enc_int1(str, z->key);
3631
20.7k
            if ((z->y>>4&0xf) == BCF_HT_STR && !z->is_gt) {
3632
16.3k
                bcf_enc_size(str, z->size, BCF_BT_CHAR);
3633
16.3k
                kputsn((char*)z->buf, z->size * (size_t)v->n_sample, str);
3634
16.3k
            } else if ((z->y>>4&0xf) == BCF_HT_INT || z->is_gt) {
3635
4.42k
                bcf_enc_vint(str, (z->size>>2) * v->n_sample, (int32_t*)z->buf, z->size>>2);
3636
4.42k
            } else {
3637
0
                bcf_enc_size(str, z->size>>2, BCF_BT_FLOAT);
3638
0
                if (serialize_float_array(str, (z->size>>2) * (size_t)v->n_sample,
3639
0
                                          (float *) z->buf) != 0) {
3640
0
                    v->errcode |= BCF_ERR_LIMITS;
3641
0
                    hts_log_error("Out of memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3642
0
                    return -1;
3643
0
                }
3644
0
            }
3645
20.7k
        }
3646
3647
9.05k
    }
3648
9.06k
    if ( need_downsize ) {
3649
4.29k
        i = 0;
3650
50.6k
        while ( i < v->n_fmt ) {
3651
46.3k
            if ( fmt[i].size==-1 )
3652
33.9k
            {
3653
33.9k
                v->n_fmt--;
3654
33.9k
                if ( i < v->n_fmt ) memmove(&fmt[i],&fmt[i+1],sizeof(*fmt)*(v->n_fmt-i));
3655
33.9k
            }
3656
12.4k
            else
3657
12.4k
                i++;
3658
46.3k
        }
3659
4.29k
    }
3660
9.06k
    return 0;
3661
9.06k
}
3662
3663
// validity checking
3664
9.06k
static int vcf_parse_format_check7(const bcf_hdr_t *h, bcf1_t *v) {
3665
9.06k
    if ( v->n_sample!=bcf_hdr_nsamples(h) )
3666
66
    {
3667
66
        hts_log_error("Number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)",
3668
66
            bcf_seqname_safe(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h));
3669
66
        v->errcode |= BCF_ERR_NCOLS;
3670
66
        return -1;
3671
66
    }
3672
9.00k
    if ( v->indiv.l > 0xffffffff )
3673
0
    {
3674
0
        hts_log_error("The FORMAT at %s:%"PRIhts_pos" is too long", bcf_seqname_safe(h,v), v->pos+1);
3675
0
        v->errcode |= BCF_ERR_LIMITS;
3676
3677
        // Error recovery: return -1 if this is a critical error or 0 if we want to ignore the FORMAT and proceed
3678
0
        v->n_fmt = 0;
3679
0
        return -1;
3680
0
    }
3681
3682
9.00k
    return 0;
3683
9.00k
}
3684
3685
// p,q is the start and the end of the FORMAT field
3686
static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3687
                            char *p, char *q)
3688
26.9k
{
3689
26.9k
    if ( !bcf_hdr_nsamples(h) ) return 0;
3690
9.40k
    kstring_t *mem = (kstring_t*)&h->mem;
3691
9.40k
    mem->l = 0;
3692
3693
9.40k
    fmt_aux_t fmt[MAX_N_FMT];
3694
3695
    // detect FORMAT "."
3696
9.40k
    int ret; // +ve = ok, -ve = err
3697
9.40k
    if ((ret = vcf_parse_format_empty1(s, h, v, p, q)))
3698
228
        return ret ? 0 : -1;
3699
3700
    // get format information from the dictionary
3701
9.17k
    if (vcf_parse_format_dict2(s, h, v, p, q, fmt) < 0)
3702
12
        return -1;
3703
3704
    // FORMAT data is per-sample A:B:C A:B:C A:B:C ... but in memory it is
3705
    // stored as per-type arrays AAA... BBB... CCC...  This is basically
3706
    // a data rotation or pivot.
3707
3708
    // The size of elements in the array grow to their maximum needed,
3709
    // permitting fast random access.  This means however we have to first
3710
    // scan the whole FORMAT line to find the maximum of each type, and
3711
    // then scan it again to find the store the data.
3712
    // We break this down into compute-max, allocate, fill-out-buffers
3713
3714
    // TODO: ?
3715
    // The alternative would be to pivot on the first pass, with fixed
3716
    // size entries for numerics and concatenated strings otherwise, also
3717
    // tracking maximum sizes.  Then on a second pass we reallocate and
3718
    // copy the data again to a uniformly sized array.  Two passes through
3719
    // memory, but without doubling string parsing.
3720
3721
    // compute max
3722
9.16k
    if (vcf_parse_format_max3(s, h, v, p, q, fmt) < 0)
3723
18
        return -1;
3724
3725
    // allocate memory for arrays
3726
9.14k
    if (vcf_parse_format_alloc4(s, h, v, p, q, fmt) < 0)
3727
0
        return -1;
3728
3729
    // fill the sample fields; at beginning of the loop
3730
9.14k
    if (vcf_parse_format_fill5(s, h, v, p, q, fmt) < 0)
3731
76
        return -1;
3732
3733
    // write individual genotype information
3734
9.06k
    if (vcf_parse_format_gt6(s, h, v, p, q, fmt) < 0)
3735
0
        return -1;
3736
3737
    // validity checking
3738
9.06k
    if (vcf_parse_format_check7(h, v) < 0)
3739
66
        return -1;
3740
3741
9.00k
    return 0;
3742
9.06k
}
3743
3744
3.20k
static khint_t fix_chromosome(const bcf_hdr_t *h, vdict_t *d, const char *p) {
3745
    // Simple error recovery for chromosomes not defined in the header. It will not help when VCF header has
3746
    // been already printed, but will enable tools like vcfcheck to proceed.
3747
3748
3.20k
    kstring_t tmp = {0,0,0};
3749
3.20k
    khint_t k;
3750
3.20k
    int l;
3751
3.20k
    if (ksprintf(&tmp, "##contig=<ID=%s>", p) < 0)
3752
0
        return kh_end(d);
3753
3.20k
    bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
3754
3.20k
    free(tmp.s);
3755
3.20k
    int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
3756
3.20k
    if (res < 0) bcf_hrec_destroy(hrec);
3757
3.20k
    if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
3758
3.20k
    k = kh_get(vdict, d, p);
3759
3760
3.20k
    return k;
3761
3.20k
}
3762
3763
27.2k
static int vcf_parse_filter(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) {
3764
27.2k
    int i, n_flt = 1, max_n_flt = 0;
3765
27.2k
    char *r, *t;
3766
27.2k
    int32_t *a_flt = NULL;
3767
27.2k
    ks_tokaux_t aux1;
3768
27.2k
    khint_t k;
3769
27.2k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
3770
    // count the number of filters
3771
27.2k
    if (*(q-1) == ';') *(q-1) = 0;
3772
318M
    for (r = p; *r; ++r)
3773
318M
        if (*r == ';') ++n_flt;
3774
27.2k
    if (n_flt > max_n_flt) {
3775
27.2k
        a_flt = malloc(n_flt * sizeof(*a_flt));
3776
27.2k
        if (!a_flt) {
3777
0
            hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3778
0
            v->errcode |= BCF_ERR_LIMITS; // No appropriate code?
3779
0
            return -1;
3780
0
        }
3781
27.2k
        max_n_flt = n_flt;
3782
27.2k
    }
3783
    // add filters
3784
1.55M
    for (t = kstrtok(p, ";", &aux1), i = 0; t; t = kstrtok(0, 0, &aux1)) {
3785
1.53M
        *(char*)aux1.p = 0;
3786
1.53M
        k = kh_get(vdict, d, t);
3787
1.53M
        if (k == kh_end(d))
3788
34.0k
        {
3789
            // Simple error recovery for FILTERs not defined in the header. It will not help when VCF header has
3790
            // been already printed, but will enable tools like vcfcheck to proceed.
3791
34.0k
            hts_log_warning("FILTER '%s' is not defined in the header", t);
3792
34.0k
            kstring_t tmp = {0,0,0};
3793
34.0k
            int l;
3794
34.0k
            ksprintf(&tmp, "##FILTER=<ID=%s,Description=\"Dummy\">", t);
3795
34.0k
            bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
3796
34.0k
            free(tmp.s);
3797
34.0k
            int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
3798
34.0k
            if (res < 0) bcf_hrec_destroy(hrec);
3799
34.0k
            if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
3800
34.0k
            k = kh_get(vdict, d, t);
3801
34.0k
            v->errcode |= BCF_ERR_TAG_UNDEF;
3802
34.0k
            if (res || k == kh_end(d)) {
3803
35
                hts_log_error("Could not add dummy header for FILTER '%s' at %s:%"PRIhts_pos, t, bcf_seqname_safe(h,v), v->pos+1);
3804
35
                v->errcode |= BCF_ERR_TAG_INVALID;
3805
35
                free(a_flt);
3806
35
                return -1;
3807
35
            }
3808
34.0k
        }
3809
1.53M
        a_flt[i++] = kh_val(d, k).id;
3810
1.53M
    }
3811
3812
27.2k
    bcf_enc_vint(str, n_flt, a_flt, -1);
3813
27.2k
    free(a_flt);
3814
3815
27.2k
    return 0;
3816
27.2k
}
3817
3818
29.8k
static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) {
3819
29.8k
    static int extreme_int_warned = 0, negative_rlen_warned = 0;
3820
29.8k
    int max_n_val = 0, overflow = 0;
3821
29.8k
    char *r, *key;
3822
29.8k
    khint_t k;
3823
29.8k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
3824
29.8k
    int32_t *a_val = NULL;
3825
3826
29.8k
    v->n_info = 0;
3827
29.8k
    if (*(q-1) == ';') *(q-1) = 0;
3828
2.91M
    for (r = key = p;; ++r) {
3829
2.91M
        int c;
3830
2.91M
        char *val, *end;
3831
205M
        while (*r > '=' || (*r != ';' && *r != '=' && *r != 0)) r++;
3832
2.91M
        if (v->n_info == UINT16_MAX) {
3833
1
            hts_log_error("Too many INFO entries at %s:%"PRIhts_pos,
3834
1
                          bcf_seqname_safe(h,v), v->pos+1);
3835
1
            v->errcode |= BCF_ERR_LIMITS;
3836
1
            goto fail;
3837
1
        }
3838
2.91M
        val = end = NULL;
3839
2.91M
        c = *r; *r = 0;
3840
2.91M
        if (c == '=') {
3841
1.39M
            val = r + 1;
3842
3843
348M
            for (end = val; *end != ';' && *end != 0; ++end);
3844
1.39M
            c = *end; *end = 0;
3845
1.52M
        } else end = r;
3846
2.91M
        if ( !*key ) { if (c==0) break; r = end; key = r + 1; continue; }  // faulty VCF, ";;" in the INFO
3847
2.88M
        k = kh_get(vdict, d, key);
3848
2.88M
        if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_INFO] == 15)
3849
23.9k
        {
3850
23.9k
            hts_log_warning("INFO '%s' is not defined in the header, assuming Type=String", key);
3851
23.9k
            kstring_t tmp = {0,0,0};
3852
23.9k
            int l;
3853
23.9k
            ksprintf(&tmp, "##INFO=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", key);
3854
23.9k
            bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
3855
23.9k
            free(tmp.s);
3856
23.9k
            int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
3857
23.9k
            if (res < 0) bcf_hrec_destroy(hrec);
3858
23.9k
            if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
3859
23.9k
            k = kh_get(vdict, d, key);
3860
23.9k
            v->errcode |= BCF_ERR_TAG_UNDEF;
3861
23.9k
            if (res || k == kh_end(d)) {
3862
52
                hts_log_error("Could not add dummy header for INFO '%s' at %s:%"PRIhts_pos, key, bcf_seqname_safe(h,v), v->pos+1);
3863
52
                v->errcode |= BCF_ERR_TAG_INVALID;
3864
52
                goto fail;
3865
52
            }
3866
23.9k
        }
3867
2.88M
        uint32_t y = kh_val(d, k).info[BCF_HL_INFO];
3868
2.88M
        ++v->n_info;
3869
2.88M
        bcf_enc_int1(str, kh_val(d, k).id);
3870
2.88M
        if (val == 0) {
3871
1.48M
            bcf_enc_size(str, 0, BCF_BT_NULL);
3872
1.48M
        } else if ((y>>4&0xf) == BCF_HT_FLAG || (y>>4&0xf) == BCF_HT_STR) { // if Flag has a value, treat it as a string
3873
113k
            bcf_enc_vchar(str, end - val, val);
3874
1.27M
        } else { // int/float value/array
3875
1.27M
            int i, n_val;
3876
1.27M
            char *t, *te;
3877
308M
            for (t = val, n_val = 1; *t; ++t) // count the number of values
3878
307M
                if (*t == ',') ++n_val;
3879
            // Check both int and float size in one step for simplicity
3880
1.27M
            if (n_val > max_n_val) {
3881
2.90k
                int32_t *a_tmp = (int32_t *)realloc(a_val, n_val * sizeof(*a_val));
3882
2.90k
                if (!a_tmp) {
3883
0
                    hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3884
0
                    v->errcode |= BCF_ERR_LIMITS; // No appropriate code?
3885
0
                    goto fail;
3886
0
                }
3887
2.90k
                a_val = a_tmp;
3888
2.90k
                max_n_val = n_val;
3889
2.90k
            }
3890
1.27M
            if ((y>>4&0xf) == BCF_HT_INT) {
3891
791k
                i = 0, t = val;
3892
791k
                int64_t val1;
3893
791k
                int is_int64 = 0;
3894
#ifdef VCF_ALLOW_INT64
3895
                if ( n_val==1 )
3896
                {
3897
                    overflow = 0;
3898
                    long long int tmp_val = hts_str2int(val, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
3899
                    if ( te==val ) tmp_val = bcf_int32_missing;
3900
                    else if ( overflow || tmp_val<BCF_MIN_BT_INT64 || tmp_val>BCF_MAX_BT_INT64 )
3901
                    {
3902
                        if ( !extreme_int_warned )
3903
                        {
3904
                            hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1);
3905
                            extreme_int_warned = 1;
3906
                        }
3907
                        tmp_val = bcf_int32_missing;
3908
                    }
3909
                    else
3910
                        is_int64 = 1;
3911
                    val1 = tmp_val;
3912
                    t = te;
3913
                    i = 1;  // this is just to avoid adding another nested block...
3914
                }
3915
#endif
3916
111M
                for (; i < n_val; ++i, ++t)
3917
111M
                {
3918
111M
                    overflow = 0;
3919
111M
                    long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
3920
111M
                    if ( te==t ) tmp_val = bcf_int32_missing;
3921
1.43M
                    else if ( overflow || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 )
3922
173k
                    {
3923
173k
                        if ( !extreme_int_warned )
3924
1
                        {
3925
1
                            hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1);
3926
1
                            extreme_int_warned = 1;
3927
1
                        }
3928
173k
                        tmp_val = bcf_int32_missing;
3929
173k
                    }
3930
111M
                    a_val[i] = tmp_val;
3931
153M
                    for (t = te; *t && *t != ','; t++);
3932
111M
                }
3933
791k
                if (n_val == 1) {
3934
#ifdef VCF_ALLOW_INT64
3935
                    if ( is_int64 )
3936
                    {
3937
                        v->unpacked |= BCF_IS_64BIT;
3938
                        bcf_enc_long1(str, val1);
3939
                    }
3940
                    else
3941
                        bcf_enc_int1(str, (int32_t)val1);
3942
#else
3943
629k
                    val1 = a_val[0];
3944
629k
                    bcf_enc_int1(str, (int32_t)val1);
3945
629k
#endif
3946
629k
                } else {
3947
161k
                    bcf_enc_vint(str, n_val, a_val, -1);
3948
161k
                }
3949
791k
                if (n_val==1 && (val1!=bcf_int32_missing || is_int64)
3950
520k
                    && memcmp(key, "END", 4) == 0)
3951
0
                {
3952
0
                    if ( val1 <= v->pos )
3953
0
                    {
3954
0
                        if ( !negative_rlen_warned )
3955
0
                        {
3956
0
                            hts_log_warning("INFO/END=%"PRIhts_pos" is smaller than POS at %s:%"PRIhts_pos,val1,bcf_seqname_safe(h,v),v->pos+1);
3957
0
                            negative_rlen_warned = 1;
3958
0
                        }
3959
0
                    }
3960
0
                }
3961
791k
            } else if ((y>>4&0xf) == BCF_HT_REAL) {
3962
488k
                float *val_f = (float *)a_val;
3963
110M
                for (i = 0, t = val; i < n_val; ++i, ++t)
3964
109M
                {
3965
109M
                    overflow = 0;
3966
109M
                    val_f[i] = hts_str2dbl(t, &te, &overflow);
3967
109M
                    if ( te==t || overflow ) // conversion failed
3968
108M
                        bcf_float_set_missing(val_f[i]);
3969
142M
                    for (t = te; *t && *t != ','; t++);
3970
109M
                }
3971
488k
                bcf_enc_vfloat(str, n_val, val_f);
3972
488k
            }
3973
1.27M
        }
3974
2.88M
        if (c == 0) break;
3975
2.86M
        r = end;
3976
2.86M
        key = r + 1;
3977
2.86M
    }
3978
3979
29.8k
    free(a_val);
3980
29.8k
    return 0;
3981
3982
53
 fail:
3983
53
    free(a_val);
3984
53
    return -1;
3985
29.8k
}
3986
3987
int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
3988
30.6k
{
3989
30.6k
    int ret = -2, overflow = 0;
3990
30.6k
    char *p, *q, *r, *t;
3991
30.6k
    kstring_t *str;
3992
30.6k
    khint_t k;
3993
30.6k
    ks_tokaux_t aux;
3994
3995
//#define NOT_DOT(p) strcmp((p), ".")
3996
//#define NOT_DOT(p) (!(*p == '.' && !p[1]))
3997
//#define NOT_DOT(p) ((*p) != '.' || (p)[1])
3998
//#define NOT_DOT(p) (q-p != 1 || memcmp(p, ".\0", 2))
3999
151k
#define NOT_DOT(p) (memcmp(p, ".\0", 2))
4000
4001
30.6k
    if (!s || !h || !v || !(s->s))
4002
0
        return ret;
4003
4004
    // Assumed in lots of places, but we may as well spot this early
4005
30.6k
    assert(sizeof(float) == sizeof(int32_t));
4006
4007
    // Ensure string we parse has space to permit some over-flow when during
4008
    // parsing.  Eg to do memcmp(key, "END", 4) in vcf_parse_info over
4009
    // the more straight forward looking strcmp, giving a speed advantage.
4010
30.6k
    if (ks_resize(s, s->l+4) < 0)
4011
0
        return -2;
4012
4013
    // Force our memory to be initialised so we avoid the technicality of
4014
    // undefined behaviour in using a 4-byte memcmp.  (The reality is this
4015
    // almost certainly is never detected by the compiler so has no impact,
4016
    // but equally so this code has minimal (often beneficial) impact on
4017
    // performance too.)
4018
30.6k
    s->s[s->l+0] = 0;
4019
30.6k
    s->s[s->l+1] = 0;
4020
30.6k
    s->s[s->l+2] = 0;
4021
30.6k
    s->s[s->l+3] = 0;
4022
4023
30.6k
    bcf_clear1(v);
4024
30.6k
    str = &v->shared;
4025
30.6k
    memset(&aux, 0, sizeof(ks_tokaux_t));
4026
4027
    // CHROM
4028
30.6k
    if (!(p = kstrtok(s->s, "\t", &aux)))
4029
0
        goto err;
4030
30.6k
    *(q = (char*)aux.p) = 0;
4031
4032
30.6k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
4033
30.6k
    k = kh_get(vdict, d, p);
4034
30.6k
    if (k == kh_end(d)) {
4035
3.20k
        hts_log_warning("Contig '%s' is not defined in the header. (Quick workaround: index the file with tabix.)", p);
4036
3.20k
        v->errcode = BCF_ERR_CTG_UNDEF;
4037
3.20k
        if ((k = fix_chromosome(h, d, p)) == kh_end(d)) {
4038
47
            hts_log_error("Could not add dummy header for contig '%s'", p);
4039
47
            v->errcode |= BCF_ERR_CTG_INVALID;
4040
47
            goto err;
4041
47
        }
4042
3.20k
    }
4043
30.6k
    v->rid = kh_val(d, k).id;
4044
4045
    // POS
4046
30.6k
    if (!(p = kstrtok(0, 0, &aux)))
4047
191
        goto err;
4048
30.4k
    *(q = (char*)aux.p) = 0;
4049
4050
30.4k
    overflow = 0;
4051
30.4k
    char *tmp = p;
4052
30.4k
    v->pos = hts_str2uint(p, &p, 62, &overflow);
4053
30.4k
    if (overflow) {
4054
9
        hts_log_error("Position value '%s' is too large", tmp);
4055
9
        goto err;
4056
30.4k
    } else if ( *p ) {
4057
52
        hts_log_error("Could not parse the position '%s'", tmp);
4058
52
        goto err;
4059
30.3k
    } else {
4060
30.3k
        v->pos -= 1;
4061
30.3k
    }
4062
30.3k
    if (v->pos >= INT32_MAX)
4063
764
        v->unpacked |= BCF_IS_64BIT;
4064
4065
    // ID
4066
30.3k
    if (!(p = kstrtok(0, 0, &aux)))
4067
17
        goto err;
4068
30.3k
    *(q = (char*)aux.p) = 0;
4069
4070
30.3k
    if (NOT_DOT(p)) bcf_enc_vchar(str, q - p, p);
4071
72
    else bcf_enc_size(str, 0, BCF_BT_CHAR);
4072
4073
    // REF
4074
30.3k
    if (!(p = kstrtok(0, 0, &aux)))
4075
21
        goto err;
4076
30.3k
    *(q = (char*)aux.p) = 0;
4077
4078
30.3k
    bcf_enc_vchar(str, q - p, p);
4079
30.3k
    v->n_allele = 1, v->rlen = q - p;
4080
4081
    // ALT
4082
30.3k
    if (!(p = kstrtok(0, 0, &aux)))
4083
15
        goto err;
4084
30.3k
    *(q = (char*)aux.p) = 0;
4085
4086
30.3k
    if (NOT_DOT(p)) {
4087
72.8M
        for (r = t = p;; ++r) {
4088
72.8M
            if (*r == ',' || *r == 0) {
4089
2.19M
                if (v->n_allele == UINT16_MAX) {
4090
3
                    hts_log_error("Too many ALT alleles at %s:%"PRIhts_pos,
4091
3
                                  bcf_seqname_safe(h,v), v->pos+1);
4092
3
                    v->errcode |= BCF_ERR_LIMITS;
4093
3
                    goto err;
4094
3
                }
4095
2.19M
                bcf_enc_vchar(str, r - t, t);
4096
2.19M
                t = r + 1;
4097
2.19M
                ++v->n_allele;
4098
2.19M
            }
4099
72.8M
            if (r == q) break;
4100
72.8M
        }
4101
29.8k
    }
4102
4103
    // QUAL
4104
30.3k
    if (!(p = kstrtok(0, 0, &aux)))
4105
38
        goto err;
4106
30.2k
    *(q = (char*)aux.p) = 0;
4107
4108
30.2k
    if (NOT_DOT(p)) v->qual = atof(p);
4109
528
    else bcf_float_set_missing(v->qual);
4110
30.2k
    if ( v->max_unpack && !(v->max_unpack>>1) ) goto end; // BCF_UN_STR
4111
4112
    // FILTER
4113
30.2k
    if (!(p = kstrtok(0, 0, &aux)))
4114
35
        goto err;
4115
30.2k
    *(q = (char*)aux.p) = 0;
4116
4117
30.2k
    if (NOT_DOT(p)) {
4118
27.2k
        if (vcf_parse_filter(str, h, v, p, q)) {
4119
35
            goto err;
4120
35
        }
4121
27.2k
    } else bcf_enc_vint(str, 0, 0, -1);
4122
30.2k
    if ( v->max_unpack && !(v->max_unpack>>2) ) goto end; // BCF_UN_FLT
4123
4124
    // INFO
4125
30.2k
    if (!(p = kstrtok(0, 0, &aux)))
4126
40
        goto err;
4127
30.1k
    *(q = (char*)aux.p) = 0;
4128
4129
30.1k
    if (NOT_DOT(p)) {
4130
29.8k
        if (vcf_parse_info(str, h, v, p, q)) {
4131
53
            goto err;
4132
53
        }
4133
29.8k
    }
4134
30.1k
    if ( v->max_unpack && !(v->max_unpack>>3) ) goto end;
4135
4136
    // FORMAT; optional
4137
30.1k
    p = kstrtok(0, 0, &aux);
4138
30.1k
    if (p) {
4139
26.9k
        *(q = (char*)aux.p) = 0;
4140
4141
26.9k
        if (vcf_parse_format(s, h, v, p, q)) {
4142
172
            goto err;
4143
172
        }
4144
26.9k
    }
4145
4146
29.9k
 end:
4147
29.9k
    v->rlen = get_rlen(h, v);    //set rlen based on version
4148
29.9k
    ret = 0;
4149
4150
30.6k
 err:
4151
30.6k
    return ret;
4152
29.9k
}
4153
4154
int vcf_open_mode(char *mode, const char *fn, const char *format)
4155
0
{
4156
0
    if (format == NULL) {
4157
        // Try to pick a format based on the filename extension
4158
0
        char extension[HTS_MAX_EXT_LEN];
4159
0
        if (find_file_extension(fn, extension) < 0) return -1;
4160
0
        return vcf_open_mode(mode, fn, extension);
4161
0
    }
4162
0
    else if (strcasecmp(format, "bcf") == 0) strcpy(mode, "b");
4163
0
    else if (strcasecmp(format, "vcf") == 0) strcpy(mode, "");
4164
0
    else if (strcasecmp(format, "vcf.gz") == 0 || strcasecmp(format, "vcf.bgz") == 0) strcpy(mode, "z");
4165
0
    else return -1;
4166
4167
0
    return 0;
4168
0
}
4169
4170
int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
4171
31.3k
{
4172
31.3k
    int ret;
4173
31.3k
    ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
4174
31.3k
    if (ret < 0) return ret;
4175
30.6k
    return vcf_parse1(&fp->line, h, v);
4176
31.3k
}
4177
4178
static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt)
4179
0
{
4180
0
    uint8_t *ptr_start = ptr;
4181
0
    fmt->id = bcf_dec_typed_int1(ptr, &ptr);
4182
0
    fmt->n = bcf_dec_size(ptr, &ptr, &fmt->type);
4183
0
    fmt->size = fmt->n << bcf_type_shift[fmt->type];
4184
0
    fmt->p = ptr;
4185
0
    fmt->p_off  = ptr - ptr_start;
4186
0
    fmt->p_free = 0;
4187
0
    ptr += n_sample * fmt->size;
4188
0
    fmt->p_len = ptr - fmt->p;
4189
0
    return ptr;
4190
0
}
4191
4192
static inline uint8_t *bcf_unpack_info_core1(uint8_t *ptr, bcf_info_t *info)
4193
969
{
4194
969
    uint8_t *ptr_start = ptr;
4195
969
    int64_t len = 0;
4196
969
    info->key = bcf_dec_typed_int1(ptr, &ptr);
4197
969
    len = info->len = bcf_dec_size(ptr, &ptr, &info->type);
4198
969
    info->vptr = ptr;
4199
969
    info->vptr_off  = ptr - ptr_start;
4200
969
    info->vptr_free = 0;
4201
969
    info->v1.i = 0;
4202
969
    if (info->len == 1) {
4203
60
        switch(info->type) {
4204
0
        case BCF_BT_INT8:
4205
60
        case BCF_BT_CHAR:
4206
60
            info->v1.i = *(int8_t*)ptr;
4207
60
            break;
4208
0
        case BCF_BT_INT16:
4209
0
            info->v1.i = le_to_i16(ptr);
4210
0
            len <<= 1;
4211
0
            break;
4212
0
        case BCF_BT_INT32:
4213
0
            info->v1.i = le_to_i32(ptr);
4214
0
            len <<= 2;
4215
0
            break;
4216
0
        case BCF_BT_FLOAT:
4217
0
            info->v1.f = le_to_float(ptr);
4218
0
            len <<= 2;
4219
0
            break;
4220
0
        case BCF_BT_INT64:
4221
0
            info->v1.i = le_to_i64(ptr);
4222
0
            len <<= 3;
4223
0
            break;
4224
60
        }
4225
909
    } else {
4226
909
        len <<= bcf_type_shift[info->type];
4227
909
    }
4228
969
    ptr += len;
4229
4230
969
    info->vptr_len = ptr - info->vptr;
4231
969
    return ptr;
4232
969
}
4233
4234
int bcf_unpack(bcf1_t *b, int which)
4235
28.7k
{
4236
28.7k
    if ( !b->shared.l ) return 0; // Building a new BCF record from scratch
4237
28.7k
    uint8_t *ptr = (uint8_t*)b->shared.s, *ptr_ori;
4238
28.7k
    int i;
4239
28.7k
    bcf_dec_t *d = &b->d;
4240
28.7k
    if (which & BCF_UN_FLT) which |= BCF_UN_STR;
4241
28.7k
    if (which & BCF_UN_INFO) which |= BCF_UN_SHR;
4242
28.7k
    if ((which&BCF_UN_STR) && !(b->unpacked&BCF_UN_STR))
4243
28.7k
    {
4244
28.7k
        kstring_t tmp;
4245
4246
        // ID
4247
28.7k
        tmp.l = 0; tmp.s = d->id; tmp.m = d->m_id;
4248
28.7k
        ptr_ori = ptr;
4249
28.7k
        ptr = bcf_fmt_sized_array(&tmp, ptr);
4250
28.7k
        b->unpack_size[0] = ptr - ptr_ori;
4251
28.7k
        kputc_('\0', &tmp);
4252
28.7k
        d->id = tmp.s; d->m_id = tmp.m;
4253
4254
        // REF and ALT are in a single block (d->als) and d->alleles are pointers into this block
4255
28.7k
        hts_expand(char*, b->n_allele, d->m_allele, d->allele); // NM: hts_expand() is a macro
4256
28.7k
        tmp.l = 0; tmp.s = d->als; tmp.m = d->m_als;
4257
28.7k
        ptr_ori = ptr;
4258
1.44M
        for (i = 0; i < b->n_allele; ++i) {
4259
            // Use offset within tmp.s as realloc may change pointer
4260
1.41M
            d->allele[i] = (char *)(intptr_t)tmp.l;
4261
1.41M
            ptr = bcf_fmt_sized_array(&tmp, ptr);
4262
1.41M
            kputc_('\0', &tmp);
4263
1.41M
        }
4264
28.7k
        b->unpack_size[1] = ptr - ptr_ori;
4265
28.7k
        d->als = tmp.s; d->m_als = tmp.m;
4266
4267
        // Convert our offsets within tmp.s back to pointers again
4268
1.44M
        for (i = 0; i < b->n_allele; ++i)
4269
1.41M
            d->allele[i] = d->als + (ptrdiff_t)d->allele[i];
4270
28.7k
        b->unpacked |= BCF_UN_STR;
4271
28.7k
    }
4272
28.7k
    if ((which&BCF_UN_FLT) && !(b->unpacked&BCF_UN_FLT)) { // FILTER
4273
28.7k
        ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1];
4274
28.7k
        ptr_ori = ptr;
4275
28.7k
        if (*ptr>>4) {
4276
25.7k
            int type;
4277
25.7k
            d->n_flt = bcf_dec_size(ptr, &ptr, &type);
4278
25.7k
            hts_expand(int, d->n_flt, d->m_flt, d->flt);
4279
253k
            for (i = 0; i < d->n_flt; ++i)
4280
228k
                d->flt[i] = bcf_dec_int1(ptr, type, &ptr);
4281
25.7k
        } else ++ptr, d->n_flt = 0;
4282
28.7k
        b->unpack_size[2] = ptr - ptr_ori;
4283
28.7k
        b->unpacked |= BCF_UN_FLT;
4284
28.7k
    }
4285
28.7k
    if ((which&BCF_UN_INFO) && !(b->unpacked&BCF_UN_INFO)) { // INFO
4286
0
        ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1] + b->unpack_size[2];
4287
0
        hts_expand(bcf_info_t, b->n_info, d->m_info, d->info);
4288
0
        for (i = 0; i < d->m_info; ++i) d->info[i].vptr_free = 0;
4289
0
        for (i = 0; i < b->n_info; ++i)
4290
0
            ptr = bcf_unpack_info_core1(ptr, &d->info[i]);
4291
0
        b->unpacked |= BCF_UN_INFO;
4292
0
    }
4293
28.7k
    if ((which&BCF_UN_FMT) && b->n_sample && !(b->unpacked&BCF_UN_FMT)) { // FORMAT
4294
0
        ptr = (uint8_t*)b->indiv.s;
4295
0
        hts_expand(bcf_fmt_t, b->n_fmt, d->m_fmt, d->fmt);
4296
0
        for (i = 0; i < d->m_fmt; ++i) d->fmt[i].p_free = 0;
4297
0
        for (i = 0; i < b->n_fmt; ++i)
4298
0
            ptr = bcf_unpack_fmt_core1(ptr, b->n_sample, &d->fmt[i]);
4299
0
        b->unpacked |= BCF_UN_FMT;
4300
0
    }
4301
28.7k
    return 0;
4302
28.7k
}
4303
4304
int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
4305
28.7k
{
4306
28.7k
    int i;
4307
28.7k
    int32_t max_dt_id = h->n[BCF_DT_ID];
4308
28.7k
    const char *chrom = bcf_seqname(h, v);
4309
28.7k
    if (!chrom) {
4310
0
        hts_log_error("Invalid BCF, CONTIG id=%d not present in the header",
4311
0
                      v->rid);
4312
0
        errno = EINVAL;
4313
0
        return -1;
4314
0
    }
4315
4316
28.7k
    bcf_unpack((bcf1_t*)v, BCF_UN_ALL & ~(BCF_UN_INFO|BCF_UN_FMT));
4317
4318
    // Cache of key lengths so we don't keep repeatedly using them.
4319
    // This assumes we're not modifying the header between successive calls
4320
    // to vcf_format, but that would lead to many other forms of breakage
4321
    // so it feels like a valid assumption to make.
4322
    //
4323
    // We cannot just do this in bcf_hdr_sync as some code (eg bcftools
4324
    // annotate) manipulates the headers directly without calling sync to
4325
    // refresh the data structures.  So we must do just-in-time length
4326
    // calculation during writes instead.
4327
28.7k
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
4328
28.7k
    if (!aux->key_len) {
4329
3.19k
        if (!(aux->key_len = calloc(h->n[BCF_DT_ID]+1, sizeof(*aux->key_len))))
4330
0
            return -1;
4331
3.19k
    }
4332
28.7k
    size_t *key_len = aux->key_len;
4333
4334
28.7k
    kputs(chrom, s); // CHROM
4335
28.7k
    kputc_('\t', s); kputll(v->pos + 1, s); // POS
4336
28.7k
    kputc_('\t', s); kputs(v->d.id ? v->d.id : ".", s); // ID
4337
28.7k
    kputc_('\t', s); // REF
4338
28.7k
    if (v->n_allele > 0) kputs(v->d.allele[0], s);
4339
0
    else kputc_('.', s);
4340
28.7k
    kputc_('\t', s); // ALT
4341
28.7k
    if (v->n_allele > 1) {
4342
1.41M
        for (i = 1; i < v->n_allele; ++i) {
4343
1.38M
            if (i > 1) kputc_(',', s);
4344
1.38M
            kputs(v->d.allele[i], s);
4345
1.38M
        }
4346
28.2k
    } else kputc_('.', s);
4347
28.7k
    kputc_('\t', s); // QUAL
4348
28.7k
    if ( bcf_float_is_missing(v->qual) ) kputc_('.', s); // QUAL
4349
28.2k
    else kputd(v->qual, s);
4350
28.7k
    kputc_('\t', s); // FILTER
4351
28.7k
    if (v->d.n_flt) {
4352
253k
        for (i = 0; i < v->d.n_flt; ++i) {
4353
228k
            int32_t idx = v->d.flt[i];
4354
228k
            if (idx < 0 || idx >= max_dt_id
4355
228k
                || h->id[BCF_DT_ID][idx].key == NULL) {
4356
0
                hts_log_error("Invalid BCF, the FILTER tag id=%d at %s:%"PRIhts_pos" not present in the header",
4357
0
                              idx, bcf_seqname_safe(h, v), v->pos + 1);
4358
0
                errno = EINVAL;
4359
0
                return -1;
4360
0
            }
4361
228k
            if (i) kputc_(';', s);
4362
228k
            if (!key_len[idx])
4363
37.9k
                key_len[idx] = strlen(h->id[BCF_DT_ID][idx].key);
4364
228k
            kputsn(h->id[BCF_DT_ID][idx].key, key_len[idx], s);
4365
228k
        }
4366
25.7k
    } else kputc_('.', s);
4367
4368
28.7k
    kputc_('\t', s); // INFO
4369
28.7k
    if (v->n_info) {
4370
13.9k
        uint8_t *ptr = v->shared.s
4371
13.9k
            ? (uint8_t *)v->shared.s + v->unpack_size[0] +
4372
13.9k
               v->unpack_size[1] + v->unpack_size[2]
4373
13.9k
            : NULL;
4374
13.9k
        int first = 1;
4375
13.9k
        bcf_info_t *info = v->d.info;
4376
4377
        // Note if we duplicate this code into custom packed and unpacked
4378
        // implementations then we gain a bit more speed, particularly with
4379
        // clang 13 (up to 5%).  Not sure why this is, but code duplication
4380
        // isn't pleasant and it's still faster adding packed support than
4381
        // not so it's a win, just not as good as it should be.
4382
13.9k
        const int info_packed = !(v->unpacked & BCF_UN_INFO) && v->shared.l;
4383
1.48M
        for (i = 0; i < v->n_info; ++i) {
4384
1.47M
            bcf_info_t in, *z;
4385
1.47M
            if (info_packed) {
4386
                // Use a local bcf_info_t when data is packed
4387
1.47M
                z = &in;
4388
1.47M
                z->key  = bcf_dec_typed_int1(ptr, &ptr);
4389
1.47M
                z->len  = bcf_dec_size(ptr, &ptr, &z->type);
4390
1.47M
                z->vptr = ptr;
4391
1.47M
                ptr += z->len << bcf_type_shift[z->type];
4392
1.47M
            } else {
4393
                // Else previously unpacked INFO struct
4394
0
                z = &info[i];
4395
4396
                // Also potentially since deleted
4397
0
                if ( !z->vptr ) continue;
4398
0
            }
4399
4400
1.47M
            bcf_idpair_t *id = z->key >= 0 && z->key < max_dt_id
4401
1.47M
                ? &h->id[BCF_DT_ID][z->key]
4402
1.47M
                : NULL;
4403
4404
1.47M
            if (!id || !id->key) {
4405
0
                hts_log_error("Invalid BCF, the INFO tag id=%d is %s at %s:%"PRIhts_pos,
4406
0
                              z->key,
4407
0
                              z->key < 0 ? "negative"
4408
0
                              : (z->key >= max_dt_id ? "too large" : "not present in the header"),
4409
0
                              bcf_seqname_safe(h, v), v->pos+1);
4410
0
                errno = EINVAL;
4411
0
                return -1;
4412
0
            }
4413
4414
            // KEY
4415
1.47M
            if (!key_len[z->key])
4416
17.1k
                key_len[z->key] = strlen(id->key);
4417
1.47M
            size_t id_len = key_len[z->key];
4418
1.47M
            if (ks_resize(s, s->l + 3 + id_len) < 0)
4419
0
                return -1;
4420
1.47M
            char *sptr = s->s + s->l;
4421
1.47M
            if ( !first ) {
4422
1.46M
                *sptr++ = ';';
4423
1.46M
                s->l++;
4424
1.46M
            }
4425
1.47M
            first = 0;
4426
1.47M
            memcpy(sptr, id->key, id_len);
4427
1.47M
            s->l += id_len;
4428
4429
            // VALUE
4430
1.47M
            if (z->len <= 0) continue;
4431
681k
            sptr[id_len] = '=';
4432
681k
            s->l++;
4433
4434
681k
            if (z->len != 1 || info_packed) {
4435
681k
                bcf_fmt_array(s, z->len, z->type, z->vptr);
4436
681k
            } else {
4437
                // Single length vectors are unpacked into their
4438
                // own info.v1 union and handled separately.
4439
0
                if (z->type == BCF_BT_FLOAT) {
4440
0
                    if ( bcf_float_is_missing(z->v1.f) )
4441
0
                        kputc_('.', s);
4442
0
                    else
4443
0
                        kputd(z->v1.f, s);
4444
0
                } else if (z->type == BCF_BT_CHAR) {
4445
0
                    kputc_(z->v1.i, s);
4446
0
                } else if (z->type < BCF_BT_INT64) {
4447
0
                    int64_t missing[] = {
4448
0
                        0, // BCF_BT_NULL
4449
0
                        bcf_int8_missing,
4450
0
                        bcf_int16_missing,
4451
0
                        bcf_int32_missing,
4452
0
                    };
4453
0
                    if (z->v1.i == missing[z->type])
4454
0
                        kputc_('.', s);
4455
0
                    else
4456
0
                        kputw(z->v1.i, s);
4457
0
                } else if (z->type == BCF_BT_INT64) {
4458
0
                    if (z->v1.i == bcf_int64_missing)
4459
0
                        kputc_('.', s);
4460
0
                    else
4461
0
                        kputll(z->v1.i, s);
4462
0
                } else {
4463
0
                    hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, z->type, bcf_seqname_safe(h, v), v->pos+1);
4464
0
                    errno = EINVAL;
4465
0
                    return -1;
4466
0
                }
4467
0
            }
4468
681k
        }
4469
13.9k
        if ( first ) kputc_('.', s);
4470
14.7k
    } else kputc_('.', s);
4471
4472
    // FORMAT and individual information
4473
28.7k
    if (v->n_sample) {
4474
8.81k
        int i,j;
4475
8.81k
        if ( v->n_fmt) {
4476
8.63k
            uint8_t *ptr = (uint8_t *)v->indiv.s;
4477
8.63k
            int gt_i = -1;
4478
8.63k
            bcf_fmt_t *fmt = v->d.fmt;
4479
8.63k
            int first = 1, ret = 0;
4480
8.63k
            int fmt_packed = !(v->unpacked & BCF_UN_FMT);
4481
4482
8.63k
            if (fmt_packed) {
4483
                // Local fmt as we have an array of num FORMAT keys,
4484
                // each of which points to N.Sample values.
4485
4486
                // No real gain to be had in handling unpacked data here,
4487
                // but it doesn't cost us much in complexity either and
4488
                // it gives us flexibility.
4489
8.63k
                fmt = malloc(v->n_fmt * sizeof(*fmt));
4490
8.63k
                if (!fmt)
4491
0
                    return -1;
4492
8.63k
            }
4493
4494
            // KEYS
4495
27.5k
            for (i = 0; i < (int)v->n_fmt; ++i) {
4496
18.8k
                bcf_fmt_t *z;
4497
18.8k
                z = &fmt[i];
4498
18.8k
                if (fmt_packed) {
4499
18.8k
                    z->id   = bcf_dec_typed_int1(ptr, &ptr);
4500
18.8k
                    z->n    = bcf_dec_size(ptr, &ptr, &z->type);
4501
18.8k
                    z->p    = ptr;
4502
18.8k
                    z->size = z->n << bcf_type_shift[z->type];
4503
18.8k
                    ptr += v->n_sample * z->size;
4504
18.8k
                }
4505
18.8k
                if ( !z->p ) continue;
4506
18.8k
                kputc_(!first ? ':' : '\t', s); first = 0;
4507
4508
18.8k
                bcf_idpair_t *id = z->id >= 0 && z->id < max_dt_id
4509
18.8k
                    ? &h->id[BCF_DT_ID][z->id]
4510
18.8k
                    : NULL;
4511
4512
18.8k
                if (!id || !id->key) {
4513
0
                    hts_log_error("Invalid BCF, the FORMAT tag id=%d at %s:%"PRIhts_pos" not present in the header", z->id, bcf_seqname_safe(h, v), v->pos+1);
4514
0
                    errno = EINVAL;
4515
0
                    if (fmt_packed)
4516
0
                        free(fmt);
4517
0
                    return -1;
4518
0
                }
4519
4520
18.8k
                if (!key_len[z->id])
4521
9.92k
                    key_len[z->id] = strlen(id->key);
4522
18.8k
                size_t id_len = key_len[z->id];
4523
18.8k
                kputsn(id->key, id_len, s);
4524
18.8k
                if (id_len == 2 && id->key[0] == 'G' && id->key[1] == 'T')
4525
4.20k
                    gt_i = i;
4526
18.8k
            }
4527
8.63k
            if ( first ) kputsn("\t.", 2, s);
4528
4529
            // VALUES per sample
4530
26.1k
            for (j = 0; j < v->n_sample; ++j) {
4531
17.5k
                kputc_('\t', s);
4532
17.5k
                first = 1;
4533
17.5k
                bcf_fmt_t *f = fmt;
4534
49.6k
                for (i = 0; i < (int)v->n_fmt; i++, f++) {
4535
42.6k
                    if ( !f->p ) continue;
4536
42.6k
                    if (!first) kputc_(':', s);
4537
42.6k
                    first = 0;
4538
42.6k
                    if (gt_i == i) {
4539
10.5k
                        if ((ret = bcf_format_gt_v2(h, f,j,s)) < 0) {
4540
0
                            hts_log_error("Failed to format GT value for sample %d, returned %d", i, ret);
4541
0
                            errno = EINVAL;
4542
0
                            if (fmt_packed)
4543
0
                                free(fmt);
4544
0
                            return -1;
4545
0
                        }
4546
10.5k
                        break;
4547
10.5k
                    }
4548
32.0k
                    else if (f->n == 1)
4549
3.95k
                        bcf_fmt_array1(s, f->type, f->p + j * (size_t)f->size);
4550
28.1k
                    else
4551
28.1k
                        bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size);
4552
42.6k
                }
4553
4554
                // Simpler loop post GT and at least 1 iteration
4555
28.1k
                for (i++, f++; i < (int)v->n_fmt; i++, f++) {
4556
10.6k
                    if ( !f->p ) continue;
4557
10.6k
                    kputc_(':', s);
4558
10.6k
                    if (f->n == 1)
4559
1.12k
                        bcf_fmt_array1(s, f->type, f->p + j * (size_t)f->size);
4560
9.54k
                    else
4561
9.54k
                        bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size);
4562
10.6k
                }
4563
17.5k
                if ( first ) kputc_('.', s);
4564
17.5k
            }
4565
8.63k
            if (fmt_packed)
4566
8.63k
                free(fmt);
4567
8.63k
        }
4568
175
        else
4569
690
            for (j=0; j<=v->n_sample; j++)
4570
515
                kputsn("\t.", 2, s);
4571
8.81k
    }
4572
28.7k
    kputc('\n', s);
4573
28.7k
    return 0;
4574
28.7k
}
4575
4576
int vcf_write_line(htsFile *fp, kstring_t *line)
4577
0
{
4578
0
    int ret;
4579
0
    if ( line->s[line->l-1]!='\n' ) kputc('\n',line);
4580
0
    if ( fp->format.compression!=no_compression )
4581
0
        ret = bgzf_write(fp->fp.bgzf, line->s, line->l);
4582
0
    else
4583
0
        ret = hwrite(fp->fp.hfile, line->s, line->l);
4584
0
    return ret==line->l ? 0 : -1;
4585
0
}
4586
4587
int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
4588
28.7k
{
4589
28.7k
    ssize_t ret;
4590
28.7k
    fp->line.l = 0;
4591
28.7k
    if (vcf_format1(h, v, &fp->line) != 0)
4592
0
        return -1;
4593
28.7k
    if ( fp->format.compression!=no_compression ) {
4594
0
        if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
4595
0
            return -1;
4596
0
        if (fp->idx && !fp->fp.bgzf->mt)
4597
0
            hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf));
4598
0
        ret = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
4599
28.7k
    } else {
4600
28.7k
        ret = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
4601
28.7k
    }
4602
4603
28.7k
    if (fp->idx && fp->format.compression == bgzf) {
4604
0
        int tid;
4605
0
        if ((tid = hts_idx_tbi_name(fp->idx, v->rid, bcf_seqname_safe(h, v))) < 0)
4606
0
            return -1;
4607
4608
0
        if (bgzf_idx_push(fp->fp.bgzf, fp->idx,
4609
0
                          tid, v->pos, v->pos + v->rlen,
4610
0
                          bgzf_tell(fp->fp.bgzf), 1) < 0)
4611
0
            return -1;
4612
0
    }
4613
4614
28.7k
    return ret==fp->line.l ? 0 : -1;
4615
28.7k
}
4616
4617
/************************
4618
 * Data access routines *
4619
 ************************/
4620
4621
int bcf_hdr_id2int(const bcf_hdr_t *h, int which, const char *id)
4622
174k
{
4623
174k
    khint_t k;
4624
174k
    vdict_t *d = (vdict_t*)h->dict[which];
4625
174k
    k = kh_get(vdict, d, id);
4626
174k
    return k == kh_end(d)? -1 : kh_val(d, k).id;
4627
174k
}
4628
4629
4630
/********************
4631
 *** BCF indexing ***
4632
 ********************/
4633
4634
// Calculate number of index levels given min_shift and the header contig
4635
// list.  Also returns number of contigs in *nids_out.
4636
static int idx_calc_n_lvls_ids(const bcf_hdr_t *h, int *min_shift_in_out,
4637
                               int starting_n_lvls, int *nids_out)
4638
0
{
4639
0
    int n_lvls = starting_n_lvls, i, nids = 0;
4640
0
    int64_t max_len = 0;
4641
4642
0
    for (i = 0; i < h->n[BCF_DT_CTG]; ++i)
4643
0
    {
4644
0
        if ( !h->id[BCF_DT_CTG][i].val ) continue;
4645
0
        if ( max_len < h->id[BCF_DT_CTG][i].val->info[0] )
4646
0
            max_len = h->id[BCF_DT_CTG][i].val->info[0];
4647
0
        nids++;
4648
0
    }
4649
0
    if ( !max_len ) max_len = (1LL<<31) - 1;  // In case contig line is broken.
4650
4651
0
    hts_adjust_csi_settings(max_len, min_shift_in_out, &n_lvls);
4652
4653
0
    if (nids_out) *nids_out = nids;
4654
0
    return n_lvls;
4655
0
}
4656
4657
hts_idx_t *bcf_index(htsFile *fp, int min_shift)
4658
0
{
4659
0
    int n_lvls;
4660
0
    bcf1_t *b = NULL;
4661
0
    hts_idx_t *idx = NULL;
4662
0
    bcf_hdr_t *h;
4663
0
    int r;
4664
0
    h = bcf_hdr_read(fp);
4665
0
    if ( !h ) return NULL;
4666
0
    int nids = 0;
4667
0
    n_lvls = idx_calc_n_lvls_ids(h, &min_shift, 0, &nids);
4668
0
    idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
4669
0
    if (!idx) goto fail;
4670
0
    b = bcf_init1();
4671
0
    if (!b) goto fail;
4672
0
    while ((r = bcf_read1(fp,h, b)) >= 0) {
4673
0
        int ret;
4674
0
        ret = hts_idx_push(idx, b->rid, b->pos, b->pos + b->rlen, bgzf_tell(fp->fp.bgzf), 1);
4675
0
        if (ret < 0) goto fail;
4676
0
    }
4677
0
    if (r < -1) goto fail;
4678
0
    hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf));
4679
0
    bcf_destroy1(b);
4680
0
    bcf_hdr_destroy(h);
4681
0
    return idx;
4682
4683
0
 fail:
4684
0
    hts_idx_destroy(idx);
4685
0
    bcf_destroy1(b);
4686
0
    bcf_hdr_destroy(h);
4687
0
    return NULL;
4688
0
}
4689
4690
hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx)
4691
0
{
4692
0
    return fnidx? hts_idx_load2(fn, fnidx) : bcf_index_load(fn);
4693
0
}
4694
4695
hts_idx_t *bcf_index_load3(const char *fn, const char *fnidx, int flags)
4696
0
{
4697
0
    return hts_idx_load3(fn, fnidx, HTS_FMT_CSI, flags);
4698
0
}
4699
4700
int bcf_index_build3(const char *fn, const char *fnidx, int min_shift, int n_threads)
4701
0
{
4702
0
    htsFile *fp;
4703
0
    hts_idx_t *idx;
4704
0
    tbx_t *tbx;
4705
0
    int ret;
4706
0
    if ((fp = hts_open(fn, "rb")) == 0) return -2;
4707
0
    if (n_threads)
4708
0
        hts_set_threads(fp, n_threads);
4709
0
    if ( fp->format.compression!=bgzf ) { hts_close(fp); return -3; }
4710
0
    switch (fp->format.format) {
4711
0
        case bcf:
4712
0
            if (!min_shift) {
4713
0
                hts_log_error("TBI indices for BCF files are not supported");
4714
0
                ret = -1;
4715
0
            } else {
4716
0
                idx = bcf_index(fp, min_shift);
4717
0
                if (idx) {
4718
0
                    ret = hts_idx_save_as(idx, fn, fnidx, HTS_FMT_CSI);
4719
0
                    if (ret < 0) ret = -4;
4720
0
                    hts_idx_destroy(idx);
4721
0
                }
4722
0
                else ret = -1;
4723
0
            }
4724
0
            break;
4725
4726
0
        case vcf:
4727
0
            tbx = tbx_index(hts_get_bgzfp(fp), min_shift, &tbx_conf_vcf);
4728
0
            if (tbx) {
4729
0
                ret = hts_idx_save_as(tbx->idx, fn, fnidx, min_shift > 0 ? HTS_FMT_CSI : HTS_FMT_TBI);
4730
0
                if (ret < 0) ret = -4;
4731
0
                tbx_destroy(tbx);
4732
0
            }
4733
0
            else ret = -1;
4734
0
            break;
4735
4736
0
        default:
4737
0
            ret = -3;
4738
0
            break;
4739
0
    }
4740
0
    hts_close(fp);
4741
0
    return ret;
4742
0
}
4743
4744
int bcf_index_build2(const char *fn, const char *fnidx, int min_shift)
4745
0
{
4746
0
    return bcf_index_build3(fn, fnidx, min_shift, 0);
4747
0
}
4748
4749
int bcf_index_build(const char *fn, int min_shift)
4750
0
{
4751
0
    return bcf_index_build3(fn, NULL, min_shift, 0);
4752
0
}
4753
4754
// Initialise fp->idx for the current format type.
4755
// This must be called after the header has been written but no other data.
4756
0
static int vcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) {
4757
0
    int n_lvls, fmt;
4758
4759
0
    if (min_shift == 0) {
4760
0
        min_shift = 14;
4761
0
        n_lvls = 5;
4762
0
        fmt = HTS_FMT_TBI;
4763
0
    } else {
4764
        // Set initial n_lvls to match tbx_index()
4765
0
        int starting_n_lvls = (TBX_MAX_SHIFT - min_shift + 2) / 3;
4766
        // Increase if necessary
4767
0
        n_lvls = idx_calc_n_lvls_ids(h, &min_shift, starting_n_lvls, NULL);
4768
0
        fmt = HTS_FMT_CSI;
4769
0
    }
4770
4771
0
    fp->idx = hts_idx_init(0, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
4772
0
    if (!fp->idx) return -1;
4773
4774
    // Tabix meta data, added even in CSI for VCF
4775
0
    uint8_t conf[4*7];
4776
0
    u32_to_le(TBX_VCF, conf+0);  // fmt
4777
0
    u32_to_le(1,       conf+4);  // name col
4778
0
    u32_to_le(2,       conf+8);  // beg col
4779
0
    u32_to_le(0,       conf+12); // end col
4780
0
    u32_to_le('#',     conf+16); // comment
4781
0
    u32_to_le(0,       conf+20); // n.skip
4782
0
    u32_to_le(0,       conf+24); // ref name len
4783
0
    if (hts_idx_set_meta(fp->idx, sizeof(conf)*sizeof(*conf), (uint8_t *)conf, 1) < 0) {
4784
0
        hts_idx_destroy(fp->idx);
4785
0
        fp->idx = NULL;
4786
0
        return -1;
4787
0
    }
4788
0
    fp->fnidx = fnidx;
4789
4790
0
    return 0;
4791
0
}
4792
4793
// Initialise fp->idx for the current format type.
4794
// This must be called after the header has been written but no other data.
4795
0
int bcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) {
4796
0
    int n_lvls, nids = 0;
4797
4798
0
    if (fp->format.compression != bgzf) {
4799
0
        hts_log_error("Indexing is only supported on BGZF-compressed files");
4800
0
        return -3; // Matches no-compression return for bcf_index_build3()
4801
0
    }
4802
4803
0
    if (fp->format.format == vcf)
4804
0
        return vcf_idx_init(fp, h, min_shift, fnidx);
4805
4806
0
    if (!min_shift)
4807
0
        min_shift = 14;
4808
4809
0
    n_lvls = idx_calc_n_lvls_ids(h, &min_shift, 0, &nids);
4810
4811
0
    fp->idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
4812
0
    if (!fp->idx) return -1;
4813
0
    fp->fnidx = fnidx;
4814
4815
0
    return 0;
4816
0
}
4817
4818
// Finishes an index. Call after the last record has been written.
4819
// Returns 0 on success, <0 on failure.
4820
//
4821
// NB: same format as SAM/BAM as it uses bgzf.
4822
0
int bcf_idx_save(htsFile *fp) {
4823
0
    return sam_idx_save(fp);
4824
0
}
4825
4826
/*****************
4827
 *** Utilities ***
4828
 *****************/
4829
4830
int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src)
4831
0
{
4832
0
    int i, ndst_ori = dst->nhrec, need_sync = 0, ret = 0, res;
4833
0
    for (i=0; i<src->nhrec; i++)
4834
0
    {
4835
0
        if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value )
4836
0
        {
4837
0
            int j;
4838
0
            for (j=0; j<ndst_ori; j++)
4839
0
            {
4840
0
                if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
4841
4842
                // Checking only the key part of generic lines, otherwise
4843
                // the VCFs are too verbose. Should we perhaps add a flag
4844
                // to bcf_hdr_combine() and make this optional?
4845
0
                if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break;
4846
0
            }
4847
0
            if ( j>=ndst_ori ) {
4848
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4849
0
                if (res < 0) return -1;
4850
0
                need_sync += res;
4851
0
            }
4852
0
        }
4853
0
        else if ( src->hrec[i]->type==BCF_HL_STR )
4854
0
        {
4855
            // NB: we are ignoring fields without ID
4856
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4857
0
            if ( j>=0 )
4858
0
            {
4859
0
                bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key);
4860
0
                if ( !rec ) {
4861
0
                    res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4862
0
                    if (res < 0) return -1;
4863
0
                    need_sync += res;
4864
0
                }
4865
0
            }
4866
0
        }
4867
0
        else
4868
0
        {
4869
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4870
0
            assert( j>=0 ); // this should always be true for valid VCFs
4871
4872
0
            bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL);
4873
0
            if ( !rec ) {
4874
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4875
0
                if (res < 0) return -1;
4876
0
                need_sync += res;
4877
0
            } else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT )
4878
0
            {
4879
                // Check that both records are of the same type. The bcf_hdr_id2length
4880
                // macro cannot be used here because dst header is not synced yet.
4881
0
                vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID];
4882
0
                vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID];
4883
0
                khint_t k_src  = kh_get(vdict, d_src, src->hrec[i]->vals[0]);
4884
0
                khint_t k_dst  = kh_get(vdict, d_dst, src->hrec[i]->vals[0]);
4885
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) )
4886
0
                {
4887
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths",
4888
0
                        src->hrec[i]->vals[0]);
4889
0
                    ret |= 1;
4890
0
                }
4891
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) )
4892
0
                {
4893
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different types",
4894
0
                        src->hrec[i]->vals[0]);
4895
0
                    ret |= 1;
4896
0
                }
4897
0
            }
4898
0
        }
4899
0
    }
4900
0
    if ( need_sync ) {
4901
0
        if (bcf_hdr_sync(dst) < 0) return -1;
4902
0
    }
4903
0
    return ret;
4904
0
}
4905
4906
bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src)
4907
0
{
4908
0
    if ( !dst )
4909
0
    {
4910
        // this will effectively strip existing IDX attributes from src to become dst
4911
0
        dst = bcf_hdr_init("r");
4912
0
        kstring_t htxt = {0,0,0};
4913
0
        if (bcf_hdr_format(src, 0, &htxt) < 0) {
4914
0
            free(htxt.s);
4915
0
            return NULL;
4916
0
        }
4917
0
        if ( bcf_hdr_parse(dst, htxt.s) < 0 ) {
4918
0
            bcf_hdr_destroy(dst);
4919
0
            dst = NULL;
4920
0
        }
4921
0
        free(htxt.s);
4922
0
        return dst;
4923
0
    }
4924
4925
0
    int i, ndst_ori = dst->nhrec, need_sync = 0, res;
4926
0
    for (i=0; i<src->nhrec; i++)
4927
0
    {
4928
0
        if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value )
4929
0
        {
4930
0
            int j;
4931
0
            for (j=0; j<ndst_ori; j++)
4932
0
            {
4933
0
                if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
4934
4935
                // Checking only the key part of generic lines, otherwise
4936
                // the VCFs are too verbose. Should we perhaps add a flag
4937
                // to bcf_hdr_combine() and make this optional?
4938
0
                if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break;
4939
0
            }
4940
0
            if ( j>=ndst_ori ) {
4941
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4942
0
                if (res < 0) return NULL;
4943
0
                need_sync += res;
4944
0
            }
4945
0
            else if ( !strcmp(src->hrec[i]->key,"fileformat") )
4946
0
            {
4947
0
                int ver_src = bcf_get_version(src,src->hrec[i]->value);
4948
0
                int ver_dst = bcf_get_version(dst,dst->hrec[j]->value);
4949
0
                if ( ver_src > ver_dst )
4950
0
                {
4951
0
                    if (bcf_hdr_set_version(dst,src->hrec[i]->value) < 0)
4952
0
                        return NULL;
4953
0
                    need_sync = 1;
4954
0
                }
4955
0
            }
4956
0
        }
4957
0
        else if ( src->hrec[i]->type==BCF_HL_STR )
4958
0
        {
4959
            // NB: we are ignoring fields without ID
4960
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4961
0
            if ( j>=0 )
4962
0
            {
4963
0
                bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key);
4964
0
                if ( !rec ) {
4965
0
                    res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4966
0
                    if (res < 0) return NULL;
4967
0
                    need_sync += res;
4968
0
                }
4969
0
            }
4970
0
        }
4971
0
        else
4972
0
        {
4973
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4974
0
            assert( j>=0 ); // this should always be true for valid VCFs
4975
4976
0
            bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL);
4977
0
            if ( !rec ) {
4978
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4979
0
                if (res < 0) return NULL;
4980
0
                need_sync += res;
4981
0
            } else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT )
4982
0
            {
4983
                // Check that both records are of the same type. The bcf_hdr_id2length
4984
                // macro cannot be used here because dst header is not synced yet.
4985
0
                vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID];
4986
0
                vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID];
4987
0
                khint_t k_src  = kh_get(vdict, d_src, src->hrec[i]->vals[0]);
4988
0
                khint_t k_dst  = kh_get(vdict, d_dst, src->hrec[i]->vals[0]);
4989
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) )
4990
0
                {
4991
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths",
4992
0
                        src->hrec[i]->vals[0]);
4993
0
                }
4994
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) )
4995
0
                {
4996
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different types",
4997
0
                        src->hrec[i]->vals[0]);
4998
0
                }
4999
0
            }
5000
0
        }
5001
0
    }
5002
0
    if ( need_sync ) {
5003
0
        if (bcf_hdr_sync(dst) < 0) return NULL;
5004
0
    }
5005
0
    return dst;
5006
0
}
5007
5008
int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *line)
5009
0
{
5010
0
    int i;
5011
0
    if ( line->errcode )
5012
0
    {
5013
0
        char errordescription[1024] = "";
5014
0
        hts_log_error("Unchecked error (%d %s) at %s:%"PRIhts_pos", exiting", line->errcode, bcf_strerror(line->errcode, errordescription, sizeof(errordescription)),  bcf_seqname_safe(src_hdr,line), line->pos+1);
5015
0
        exit(1);
5016
0
    }
5017
0
    if ( src_hdr->ntransl==-1 ) return 0;    // no need to translate, all tags have the same id
5018
0
    if ( !src_hdr->ntransl )  // called for the first time, see what needs translating
5019
0
    {
5020
0
        int dict;
5021
0
        for (dict=0; dict<2; dict++)    // BCF_DT_ID and BCF_DT_CTG
5022
0
        {
5023
0
            src_hdr->transl[dict] = (int*) malloc(src_hdr->n[dict]*sizeof(int));
5024
0
            for (i=0; i<src_hdr->n[dict]; i++)
5025
0
            {
5026
0
                if ( !src_hdr->id[dict][i].key ) // gap left after removed BCF header lines
5027
0
                {
5028
0
                    src_hdr->transl[dict][i] = -1;
5029
0
                    continue;
5030
0
                }
5031
0
                src_hdr->transl[dict][i] = bcf_hdr_id2int(dst_hdr,dict,src_hdr->id[dict][i].key);
5032
0
                if ( src_hdr->transl[dict][i]!=-1 && i!=src_hdr->transl[dict][i] ) src_hdr->ntransl++;
5033
0
            }
5034
0
        }
5035
0
        if ( !src_hdr->ntransl )
5036
0
        {
5037
0
            free(src_hdr->transl[0]); src_hdr->transl[0] = NULL;
5038
0
            free(src_hdr->transl[1]); src_hdr->transl[1] = NULL;
5039
0
            src_hdr->ntransl = -1;
5040
0
        }
5041
0
        if ( src_hdr->ntransl==-1 ) return 0;
5042
0
    }
5043
0
    bcf_unpack(line,BCF_UN_ALL);
5044
5045
    // CHROM
5046
0
    if ( src_hdr->transl[BCF_DT_CTG][line->rid] >=0 ) line->rid = src_hdr->transl[BCF_DT_CTG][line->rid];
5047
5048
    // FILTER
5049
0
    for (i=0; i<line->d.n_flt; i++)
5050
0
    {
5051
0
        int src_id = line->d.flt[i];
5052
0
        if ( src_hdr->transl[BCF_DT_ID][src_id] >=0 )
5053
0
            line->d.flt[i] = src_hdr->transl[BCF_DT_ID][src_id];
5054
0
        line->d.shared_dirty |= BCF1_DIRTY_FLT;
5055
0
    }
5056
5057
    // INFO
5058
0
    for (i=0; i<line->n_info; i++)
5059
0
    {
5060
0
        int src_id = line->d.info[i].key;
5061
0
        int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
5062
0
        if ( dst_id<0 ) continue;
5063
0
        line->d.info[i].key = dst_id;
5064
0
        if ( !line->d.info[i].vptr ) continue;  // skip deleted
5065
0
        int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
5066
0
        int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
5067
0
        if ( src_size==dst_size )   // can overwrite
5068
0
        {
5069
0
            uint8_t *vptr = line->d.info[i].vptr - line->d.info[i].vptr_off;
5070
0
            if ( dst_size==BCF_BT_INT8 ) { vptr[1] = (uint8_t)dst_id; }
5071
0
            else if ( dst_size==BCF_BT_INT16 ) { *(uint16_t*)vptr = (uint16_t)dst_id; }
5072
0
            else { *(uint32_t*)vptr = (uint32_t)dst_id; }
5073
0
        }
5074
0
        else    // must realloc
5075
0
        {
5076
0
            bcf_info_t *info = &line->d.info[i];
5077
0
            kstring_t str = {0,0,0};
5078
0
            bcf_enc_int1(&str, dst_id);
5079
0
            bcf_enc_size(&str, info->len,info->type);
5080
0
            uint32_t vptr_off = str.l;
5081
0
            kputsn((char*)info->vptr, info->vptr_len, &str);
5082
0
            if( info->vptr_free ) free(info->vptr - info->vptr_off);
5083
0
            info->vptr_off = vptr_off;
5084
0
            info->vptr = (uint8_t*)str.s + info->vptr_off;
5085
0
            info->vptr_free = 1;
5086
0
            line->d.shared_dirty |= BCF1_DIRTY_INF;
5087
0
        }
5088
0
    }
5089
5090
    // FORMAT
5091
0
    for (i=0; i<line->n_fmt; i++)
5092
0
    {
5093
0
        int src_id = line->d.fmt[i].id;
5094
0
        int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
5095
0
        if ( dst_id<0 ) continue;
5096
0
        line->d.fmt[i].id = dst_id;
5097
0
        if( !line->d.fmt[i].p ) continue;  // skip deleted
5098
0
        int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
5099
0
        int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
5100
0
        if ( src_size==dst_size )   // can overwrite
5101
0
        {
5102
0
            uint8_t *p = line->d.fmt[i].p - line->d.fmt[i].p_off;    // pointer to the vector size (4bits) and BT type (4bits)
5103
0
            if ( dst_size==BCF_BT_INT8 ) { p[1] = dst_id; }
5104
0
            else if ( dst_size==BCF_BT_INT16 ) { i16_to_le(dst_id, p + 1); }
5105
0
            else { i32_to_le(dst_id, p + 1); }
5106
0
        }
5107
0
        else    // must realloc
5108
0
        {
5109
0
            bcf_fmt_t *fmt = &line->d.fmt[i];
5110
0
            kstring_t str = {0,0,0};
5111
0
            bcf_enc_int1(&str, dst_id);
5112
0
            bcf_enc_size(&str, fmt->n, fmt->type);
5113
0
            uint32_t p_off = str.l;
5114
0
            kputsn((char*)fmt->p, fmt->p_len, &str);
5115
0
            if( fmt->p_free ) free(fmt->p - fmt->p_off);
5116
0
            fmt->p_off = p_off;
5117
0
            fmt->p = (uint8_t*)str.s + fmt->p_off;
5118
0
            fmt->p_free = 1;
5119
0
            line->d.indiv_dirty = 1;
5120
0
        }
5121
0
    }
5122
0
    return 0;
5123
0
}
5124
5125
bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr)
5126
0
{
5127
0
    bcf_hdr_t *hout = bcf_hdr_init("r");
5128
0
    if (!hout) {
5129
0
        hts_log_error("Failed to allocate bcf header");
5130
0
        return NULL;
5131
0
    }
5132
0
    kstring_t htxt = {0,0,0};
5133
0
    if (bcf_hdr_format(hdr, 1, &htxt) < 0) {
5134
0
        free(htxt.s);
5135
0
        return NULL;
5136
0
    }
5137
0
    if ( bcf_hdr_parse(hout, htxt.s) < 0 ) {
5138
0
        bcf_hdr_destroy(hout);
5139
0
        hout = NULL;
5140
0
    }
5141
0
    free(htxt.s);
5142
0
    return hout;
5143
0
}
5144
5145
bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap)
5146
0
{
5147
0
    void *names_hash = khash_str2int_init();
5148
0
    kstring_t htxt = {0,0,0};
5149
0
    kstring_t str = {0,0,0};
5150
0
    bcf_hdr_t *h = bcf_hdr_init("w");
5151
0
    int r = 0;
5152
0
    if (!h || !names_hash) {
5153
0
        hts_log_error("Failed to allocate bcf header");
5154
0
        goto err;
5155
0
    }
5156
0
    if (bcf_hdr_format(h0, 1, &htxt) < 0) {
5157
0
        hts_log_error("Failed to get header text");
5158
0
        goto err;
5159
0
    }
5160
0
    bcf_hdr_set_version(h,bcf_hdr_get_version(h0));
5161
0
    int j;
5162
0
    for (j=0; j<n; j++) imap[j] = -1;
5163
0
    if ( bcf_hdr_nsamples(h0) > 0) {
5164
0
        char *p = find_chrom_header_line(htxt.s);
5165
0
        int i = 0, end = n? 8 : 7;
5166
0
        while ((p = strchr(p, '\t')) != 0 && i < end) ++i, ++p;
5167
0
        if (i != end) {
5168
0
            hts_log_error("Wrong number of columns in header #CHROM line");
5169
0
            goto err;
5170
0
        }
5171
0
        r |= kputsn(htxt.s, p - htxt.s, &str) < 0;
5172
0
        for (i = 0; i < n; ++i) {
5173
0
            if ( khash_str2int_has_key(names_hash,samples[i]) )
5174
0
            {
5175
0
                hts_log_error("Duplicate sample name \"%s\"", samples[i]);
5176
0
                goto err;
5177
0
            }
5178
0
            imap[i] = bcf_hdr_id2int(h0, BCF_DT_SAMPLE, samples[i]);
5179
0
            if (imap[i] < 0) continue;
5180
0
            r |= kputc('\t', &str) < 0;
5181
0
            r |= kputs(samples[i], &str) < 0;
5182
0
            r |= khash_str2int_inc(names_hash,samples[i]) < 0;
5183
0
        }
5184
0
    } else r |= kputsn(htxt.s, htxt.l, &str) < 0;
5185
0
    while (str.l && (!str.s[str.l-1] || str.s[str.l-1]=='\n') ) str.l--; // kill trailing zeros and newlines
5186
0
    r |= kputc('\n',&str) < 0;
5187
0
    if (r) {
5188
0
        hts_log_error("%s", strerror(errno));
5189
0
        goto err;
5190
0
    }
5191
0
    if ( bcf_hdr_parse(h, str.s) < 0 ) {
5192
0
        bcf_hdr_destroy(h);
5193
0
        h = NULL;
5194
0
    }
5195
0
    free(str.s);
5196
0
    free(htxt.s);
5197
0
    khash_str2int_destroy(names_hash);
5198
0
    return h;
5199
5200
0
 err:
5201
0
    ks_free(&str);
5202
0
    ks_free(&htxt);
5203
0
    khash_str2int_destroy(names_hash);
5204
0
    bcf_hdr_destroy(h);
5205
0
    return NULL;
5206
0
}
5207
5208
int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file)
5209
0
{
5210
0
    if ( samples && !strcmp("-",samples) ) return 0;            // keep all samples
5211
5212
0
    int i, narr = bit_array_size(bcf_hdr_nsamples(hdr));
5213
0
    hdr->keep_samples = (uint8_t*) calloc(narr,1);
5214
0
    if (!hdr->keep_samples) return -1;
5215
5216
0
    hdr->nsamples_ori = bcf_hdr_nsamples(hdr);
5217
0
    if ( !samples )
5218
0
    {
5219
        // exclude all samples
5220
0
        khint_t k;
5221
0
        vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE], *new_dict;
5222
0
        new_dict = kh_init(vdict);
5223
0
        if (!new_dict) return -1;
5224
5225
0
        bcf_hdr_nsamples(hdr) = 0;
5226
5227
0
        for (k = kh_begin(d); k != kh_end(d); ++k)
5228
0
            if (kh_exist(d, k)) free((char*)kh_key(d, k));
5229
0
        kh_destroy(vdict, d);
5230
0
        hdr->dict[BCF_DT_SAMPLE] = new_dict;
5231
0
        if (bcf_hdr_sync(hdr) < 0) return -1;
5232
5233
0
        return 0;
5234
0
    }
5235
5236
0
    if ( samples[0]=='^' )
5237
0
        for (i=0; i<bcf_hdr_nsamples(hdr); i++) bit_array_set(hdr->keep_samples,i);
5238
5239
0
    int idx, n, ret = 0;
5240
0
    char **smpls = hts_readlist(samples[0]=='^'?samples+1:samples, is_file, &n);
5241
0
    if ( !smpls ) return -1;
5242
0
    for (i=0; i<n; i++)
5243
0
    {
5244
0
        idx = bcf_hdr_id2int(hdr,BCF_DT_SAMPLE,smpls[i]);
5245
0
        if ( idx<0 )
5246
0
        {
5247
0
            if ( !ret ) ret = i+1;
5248
0
            continue;
5249
0
        }
5250
0
        assert( idx<bcf_hdr_nsamples(hdr) );
5251
0
        if (  samples[0]=='^' )
5252
0
            bit_array_clear(hdr->keep_samples, idx);
5253
0
        else
5254
0
            bit_array_set(hdr->keep_samples, idx);
5255
0
    }
5256
0
    for (i=0; i<n; i++) free(smpls[i]);
5257
0
    free(smpls);
5258
5259
0
    bcf_hdr_nsamples(hdr) = 0;
5260
0
    for (i=0; i<hdr->nsamples_ori; i++)
5261
0
        if ( bit_array_test(hdr->keep_samples,i) ) bcf_hdr_nsamples(hdr)++;
5262
5263
0
    if ( !bcf_hdr_nsamples(hdr) ) { free(hdr->keep_samples); hdr->keep_samples=NULL; }
5264
0
    else
5265
0
    {
5266
        // Make new list and dictionary with desired samples
5267
0
        char **samples = (char**) malloc(sizeof(char*)*bcf_hdr_nsamples(hdr));
5268
0
        vdict_t *new_dict, *d;
5269
0
        int k, res;
5270
0
        if (!samples) return -1;
5271
5272
0
        new_dict = kh_init(vdict);
5273
0
        if (!new_dict) {
5274
0
            free(samples);
5275
0
            return -1;
5276
0
        }
5277
0
        idx = 0;
5278
0
        for (i=0; i<hdr->nsamples_ori; i++) {
5279
0
            if ( bit_array_test(hdr->keep_samples,i) ) {
5280
0
                samples[idx] = hdr->samples[i];
5281
0
                k = kh_put(vdict, new_dict, hdr->samples[i], &res);
5282
0
                if (res < 0) {
5283
0
                    free(samples);
5284
0
                    kh_destroy(vdict, new_dict);
5285
0
                    return -1;
5286
0
                }
5287
0
                kh_val(new_dict, k) = bcf_idinfo_def;
5288
0
                kh_val(new_dict, k).id = idx;
5289
0
                idx++;
5290
0
            }
5291
0
        }
5292
5293
        // Delete desired samples from old dictionary, so we don't free them
5294
0
        d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE];
5295
0
        for (i=0; i < idx; i++) {
5296
0
            int k = kh_get(vdict, d, samples[i]);
5297
0
            if (k < kh_end(d)) kh_del(vdict, d, k);
5298
0
        }
5299
5300
        // Free everything else
5301
0
        for (k = kh_begin(d); k != kh_end(d); ++k)
5302
0
            if (kh_exist(d, k)) free((char*)kh_key(d, k));
5303
0
        kh_destroy(vdict, d);
5304
0
        hdr->dict[BCF_DT_SAMPLE] = new_dict;
5305
5306
0
        free(hdr->samples);
5307
0
        hdr->samples = samples;
5308
5309
0
        if (bcf_hdr_sync(hdr) < 0)
5310
0
            return -1;
5311
0
    }
5312
5313
0
    return ret;
5314
0
}
5315
5316
int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap)
5317
0
{
5318
0
    kstring_t ind;
5319
0
    ind.s = 0; ind.l = ind.m = 0;
5320
0
    if (n) {
5321
0
        bcf_fmt_t fmt[MAX_N_FMT];
5322
0
        int i, j;
5323
0
        uint8_t *ptr = (uint8_t*)v->indiv.s;
5324
0
        for (i = 0; i < v->n_fmt; ++i)
5325
0
            ptr = bcf_unpack_fmt_core1(ptr, v->n_sample, &fmt[i]);
5326
0
        for (i = 0; i < (int)v->n_fmt; ++i) {
5327
0
            bcf_fmt_t *f = &fmt[i];
5328
0
            bcf_enc_int1(&ind, f->id);
5329
0
            bcf_enc_size(&ind, f->n, f->type);
5330
0
            for (j = 0; j < n; ++j)
5331
0
                if (imap[j] >= 0) kputsn((char*)(f->p + imap[j] * f->size), f->size, &ind);
5332
0
        }
5333
0
        for (i = j = 0; j < n; ++j) if (imap[j] >= 0) ++i;
5334
0
        v->n_sample = i;
5335
0
    } else v->n_sample = 0;
5336
0
    if ( !v->n_sample ) v->n_fmt = 0;
5337
0
    free(v->indiv.s);
5338
0
    v->indiv = ind;
5339
0
    v->unpacked &= ~BCF_UN_FMT;    // only BCF is ready for output, VCF will need to unpack again
5340
0
    return 0;
5341
0
}
5342
5343
int bcf_is_snp(bcf1_t *v)
5344
0
{
5345
0
    int i;
5346
0
    bcf_unpack(v, BCF_UN_STR);
5347
0
    for (i = 0; i < v->n_allele; ++i)
5348
0
    {
5349
0
        if ( v->d.allele[i][1]==0 && v->d.allele[i][0]!='*' ) continue;
5350
5351
        // mpileup's <X> allele, see also below. This is not completely satisfactory,
5352
        // a general library is here narrowly tailored to fit samtools.
5353
0
        if ( v->d.allele[i][0]=='<' && v->d.allele[i][1]=='X' && v->d.allele[i][2]=='>' ) continue;
5354
0
        if ( v->d.allele[i][0]=='<' && v->d.allele[i][1]=='*' && v->d.allele[i][2]=='>' ) continue;
5355
5356
0
        break;
5357
0
    }
5358
0
    return i == v->n_allele;
5359
0
}
5360
5361
static void bcf_set_variant_type(const char *ref, const char *alt, bcf_variant_t *var)
5362
0
{
5363
0
    if ( *alt == '*' && !alt[1] ) { var->n = 0; var->type = VCF_OVERLAP; return; }  // overlapping variant
5364
5365
    // The most frequent case
5366
0
    if ( !ref[1] && !alt[1] )
5367
0
    {
5368
0
        if ( *alt == '.' || *ref==*alt ) { var->n = 0; var->type = VCF_REF; return; }
5369
0
        if ( *alt == 'X' ) { var->n = 0; var->type = VCF_REF; return; }  // mpileup's X allele shouldn't be treated as variant
5370
0
        var->n = 1; var->type = VCF_SNP; return;
5371
0
    }
5372
0
    if ( alt[0]=='<' )
5373
0
    {
5374
0
        if ( alt[1]=='X' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; }  // mpileup's X allele shouldn't be treated as variant
5375
0
        if ( alt[1]=='*' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; }
5376
0
        if ( !strcmp("NON_REF>",alt+1) ) { var->n = 0; var->type = VCF_REF; return; }
5377
0
        var->type = VCF_OTHER;
5378
0
        return;
5379
0
    }
5380
5381
    // Catch "joined before" breakend case
5382
0
    if ( alt[0]==']' || alt[0] == '[' )
5383
0
    {
5384
0
        var->type = VCF_BND; return;
5385
0
    }
5386
5387
    // Iterate through alt characters that match the reference
5388
0
    const char *r = ref, *a = alt;
5389
0
    while (*r && *a && toupper_c(*r)==toupper_c(*a) ) { r++; a++; }     // unfortunately, matching REF,ALT case is not guaranteed
5390
5391
0
    if ( *a && !*r )
5392
0
    {
5393
0
        while ( *a ) a++;
5394
0
        if ( *(a-1)==']' || *(a-1)=='[' ) { var->type = VCF_BND; return; } // "joined after" breakend
5395
0
        var->n = (a-alt)-(r-ref); var->type = VCF_INDEL | VCF_INS; return;
5396
0
    }
5397
0
    else if ( *r && !*a )
5398
0
    {
5399
0
        while ( *r ) r++;
5400
0
        var->n = (a-alt)-(r-ref); var->type = VCF_INDEL | VCF_DEL; return;
5401
0
    }
5402
0
    else if ( !*r && !*a )
5403
0
    {
5404
0
        var->n = 0; var->type = VCF_REF; return;
5405
0
    }
5406
5407
0
    const char *re = r, *ae = a;
5408
0
    while ( re[1] ) re++;
5409
0
    while ( ae[1] ) ae++;
5410
0
    if ( ae[0]==']' || ae[0]=='[' ) { var->type = VCF_BND; return; }    // "joined after" breakend
5411
0
    while ( re>r && ae>a && toupper_c(*re)==toupper_c(*ae) ) { re--; ae--; }
5412
0
    if ( ae==a )
5413
0
    {
5414
0
        if ( re==r ) { var->n = 1; var->type = VCF_SNP; return; }
5415
0
        var->n = -(re-r);
5416
0
        if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL | VCF_DEL; return; }
5417
0
        var->type = VCF_OTHER; return;
5418
0
    }
5419
0
    else if ( re==r )
5420
0
    {
5421
0
        var->n = ae-a;
5422
0
        if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL | VCF_INS; return; }
5423
0
        var->type = VCF_OTHER; return;
5424
0
    }
5425
5426
0
    var->type = ( re-r == ae-a ) ? VCF_MNP : VCF_OTHER;
5427
0
    var->n = ( re-r > ae-a ) ? -(re-r+1) : ae-a+1;
5428
5429
    // should do also complex events, SVs, etc...
5430
0
}
5431
5432
static int bcf_set_variant_types(bcf1_t *b)
5433
0
{
5434
0
    if ( !(b->unpacked & BCF_UN_STR) ) bcf_unpack(b, BCF_UN_STR);
5435
0
    bcf_dec_t *d = &b->d;
5436
0
    if ( d->n_var < b->n_allele )
5437
0
    {
5438
0
        bcf_variant_t *new_var = realloc(d->var, sizeof(bcf_variant_t)*b->n_allele);
5439
0
        if (!new_var)
5440
0
            return -1;
5441
0
        d->var = new_var;
5442
0
        d->n_var = b->n_allele;
5443
0
    }
5444
0
    int i;
5445
0
    b->d.var_type = 0;
5446
0
    d->var[0].type = VCF_REF;
5447
0
    d->var[0].n    = 0;
5448
0
    for (i=1; i<b->n_allele; i++)
5449
0
    {
5450
0
        bcf_set_variant_type(d->allele[0],d->allele[i], &d->var[i]);
5451
0
        b->d.var_type |= d->var[i].type;
5452
        //fprintf(stderr,"[set_variant_type] %d   %s %s -> %d %d .. %d\n", b->pos+1,d->allele[0],d->allele[i],d->var[i].type,d->var[i].n, b->d.var_type);
5453
0
    }
5454
0
    return 0;
5455
0
}
5456
5457
// bcf_get_variant_type/bcf_get_variant_types should only return the following,
5458
// to be compatible with callers that are not expecting newer values
5459
// like VCF_INS, VCF_DEL.  The full set is available from the newer
5460
// vcf_has_variant_type* interfaces.
5461
0
#define ORIG_VAR_TYPES (VCF_SNP|VCF_MNP|VCF_INDEL|VCF_OTHER|VCF_BND|VCF_OVERLAP)
5462
int bcf_get_variant_types(bcf1_t *rec)
5463
0
{
5464
0
    if ( rec->d.var_type==-1 ) {
5465
0
        if (bcf_set_variant_types(rec) != 0) {
5466
0
            hts_log_error("Couldn't get variant types: %s", strerror(errno));
5467
0
            exit(1); // Due to legacy API having no way to report failures
5468
0
        }
5469
0
    }
5470
0
    return rec->d.var_type & ORIG_VAR_TYPES;
5471
0
}
5472
5473
int bcf_get_variant_type(bcf1_t *rec, int ith_allele)
5474
0
{
5475
0
    if ( rec->d.var_type==-1 ) {
5476
0
        if (bcf_set_variant_types(rec) != 0) {
5477
0
            hts_log_error("Couldn't get variant types: %s", strerror(errno));
5478
0
            exit(1); // Due to legacy API having no way to report failures
5479
0
        }
5480
0
    }
5481
0
    if (ith_allele < 0 || ith_allele >= rec->n_allele) {
5482
0
        hts_log_error("Requested allele outside valid range");
5483
0
        exit(1);
5484
0
    }
5485
0
    return rec->d.var[ith_allele].type & ORIG_VAR_TYPES;
5486
0
}
5487
#undef ORIG_VAR_TYPES
5488
5489
int bcf_has_variant_type(bcf1_t *rec, int ith_allele, uint32_t bitmask)
5490
0
{
5491
0
    if ( rec->d.var_type==-1 ) {
5492
0
        if (bcf_set_variant_types(rec) != 0) return -1;
5493
0
    }
5494
0
    if (ith_allele < 0 || ith_allele >= rec->n_allele) return -1;
5495
0
    if (bitmask == VCF_REF) {  // VCF_REF is 0, so handled as a special case
5496
0
        return rec->d.var[ith_allele].type == VCF_REF;
5497
0
    }
5498
0
    return bitmask & rec->d.var[ith_allele].type;
5499
0
}
5500
5501
int bcf_variant_length(bcf1_t *rec, int ith_allele)
5502
0
{
5503
0
    if ( rec->d.var_type==-1 ) {
5504
0
        if (bcf_set_variant_types(rec) != 0) return bcf_int32_missing;
5505
0
    }
5506
0
    if (ith_allele < 0 || ith_allele >= rec->n_allele) return bcf_int32_missing;
5507
0
    return rec->d.var[ith_allele].n;
5508
0
}
5509
5510
int bcf_has_variant_types(bcf1_t *rec, uint32_t bitmask,
5511
                          enum bcf_variant_match mode)
5512
0
{
5513
0
    if ( rec->d.var_type==-1 ) {
5514
0
        if (bcf_set_variant_types(rec) != 0) return -1;
5515
0
    }
5516
0
    uint32_t type = rec->d.var_type;
5517
0
    if ( mode==bcf_match_overlap ) return bitmask & type;
5518
5519
    // VCF_INDEL is always set with VCF_INS and VCF_DEL by bcf_set_variant_type[s], but the bitmask may
5520
    // ask for say `VCF_INS` or `VCF_INDEL` only
5521
0
    if ( bitmask&(VCF_INS|VCF_DEL) && !(bitmask&VCF_INDEL) ) type &= ~VCF_INDEL;
5522
0
    else if ( bitmask&VCF_INDEL && !(bitmask&(VCF_INS|VCF_DEL)) ) type &= ~(VCF_INS|VCF_DEL);
5523
5524
0
    if ( mode==bcf_match_subset )
5525
0
    {
5526
0
        if ( ~bitmask & type ) return 0;
5527
0
        else return bitmask & type;
5528
0
    }
5529
    // mode == bcf_match_exact
5530
0
    if ( bitmask==VCF_REF ) return type==bitmask ? 1 : 0;
5531
0
    return type==bitmask ? type : 0;
5532
0
}
5533
5534
int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
5535
0
{
5536
0
    static int negative_rlen_warned = 0;
5537
0
    int is_end_tag, is_svlen_tag = 0;
5538
5539
    // Is the field already present?
5540
0
    int i, inf_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
5541
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,inf_id) ) return -1;    // No such INFO field in the header
5542
0
    if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
5543
5544
0
    is_end_tag = strcmp(key, "END") == 0;
5545
0
    is_svlen_tag = strcmp(key, "SVLEN") == 0;
5546
5547
0
    for (i=0; i<line->n_info; i++)
5548
0
        if ( inf_id==line->d.info[i].key ) break;
5549
0
    bcf_info_t *inf = i==line->n_info ? NULL : &line->d.info[i];
5550
5551
0
    if ( !n || (type==BCF_HT_STR && !values) )
5552
0
    {
5553
0
        if ( inf )
5554
0
        {
5555
            // Mark the tag for removal, free existing memory if necessary
5556
0
            if ( inf->vptr_free )
5557
0
            {
5558
0
                free(inf->vptr - inf->vptr_off);
5559
0
                inf->vptr_free = 0;
5560
0
            }
5561
0
            line->d.shared_dirty |= BCF1_DIRTY_INF;
5562
0
            inf->vptr = NULL;
5563
0
            inf->vptr_off = inf->vptr_len = 0;
5564
0
        }
5565
0
        if ( n==0 && (is_end_tag || is_svlen_tag) ) {
5566
0
            line->rlen = get_rlen(hdr, line);
5567
0
        }
5568
0
        return 0;
5569
0
    }
5570
5571
0
    if (is_end_tag)
5572
0
    {
5573
0
        if (n != 1)
5574
0
        {
5575
0
            hts_log_error("END info tag should only have one value at %s:%"PRIhts_pos, bcf_seqname_safe(hdr,line), line->pos+1);
5576
0
            line->errcode |= BCF_ERR_TAG_INVALID;
5577
0
            return -1;
5578
0
        }
5579
0
        if (type != BCF_HT_INT && type != BCF_HT_LONG)
5580
0
        {
5581
0
            hts_log_error("Wrong type (%d) for END info tag at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
5582
0
            line->errcode |= BCF_ERR_TAG_INVALID;
5583
0
            return -1;
5584
0
        }
5585
0
    }
5586
5587
    // Encode the values and determine the size required to accommodate the values
5588
0
    kstring_t str = {0,0,0};
5589
0
    bcf_enc_int1(&str, inf_id);
5590
0
    if ( type==BCF_HT_INT )
5591
0
        bcf_enc_vint(&str, n, (int32_t*)values, -1);
5592
0
    else if ( type==BCF_HT_REAL )
5593
0
        bcf_enc_vfloat(&str, n, (float*)values);
5594
0
    else if ( type==BCF_HT_FLAG || type==BCF_HT_STR )
5595
0
    {
5596
0
        if ( values==NULL )
5597
0
            bcf_enc_size(&str, 0, BCF_BT_NULL);
5598
0
        else
5599
0
            bcf_enc_vchar(&str, strlen((char*)values), (char*)values);
5600
0
    }
5601
#ifdef VCF_ALLOW_INT64
5602
    else if ( type==BCF_HT_LONG )
5603
    {
5604
        if (n != 1) {
5605
            hts_log_error("Only storing a single BCF_HT_LONG value is supported at %s:%"PRIhts_pos, bcf_seqname_safe(hdr,line), line->pos+1);
5606
            abort();
5607
        }
5608
        bcf_enc_long1(&str, *(int64_t *) values);
5609
    }
5610
#endif
5611
0
    else
5612
0
    {
5613
0
        hts_log_error("The type %d not implemented yet at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
5614
0
        abort();
5615
0
    }
5616
5617
    // Is the INFO tag already present
5618
0
    if ( inf )
5619
0
    {
5620
        // Is it big enough to accommodate new block?
5621
0
        if ( inf->vptr && str.l <= inf->vptr_len + inf->vptr_off )
5622
0
        {
5623
0
            if ( str.l != inf->vptr_len + inf->vptr_off ) line->d.shared_dirty |= BCF1_DIRTY_INF;
5624
0
            uint8_t *ptr = inf->vptr - inf->vptr_off;
5625
0
            memcpy(ptr, str.s, str.l);
5626
0
            free(str.s);
5627
0
            int vptr_free = inf->vptr_free;
5628
0
            bcf_unpack_info_core1(ptr, inf);
5629
0
            inf->vptr_free = vptr_free;
5630
0
        }
5631
0
        else
5632
0
        {
5633
0
            if ( inf->vptr_free )
5634
0
                free(inf->vptr - inf->vptr_off);
5635
0
            bcf_unpack_info_core1((uint8_t*)str.s, inf);
5636
0
            inf->vptr_free = 1;
5637
0
            line->d.shared_dirty |= BCF1_DIRTY_INF;
5638
0
        }
5639
0
    }
5640
0
    else
5641
0
    {
5642
        // The tag is not present, create new one
5643
0
        line->n_info++;
5644
0
        hts_expand0(bcf_info_t, line->n_info, line->d.m_info , line->d.info);
5645
0
        inf = &line->d.info[line->n_info-1];
5646
0
        bcf_unpack_info_core1((uint8_t*)str.s, inf);
5647
0
        inf->vptr_free = 1;
5648
0
        line->d.shared_dirty |= BCF1_DIRTY_INF;
5649
0
    }
5650
0
    line->unpacked |= BCF_UN_INFO;
5651
5652
0
   if ( n==1 && is_end_tag) {
5653
0
        hts_pos_t end = type == BCF_HT_INT ? *(int32_t *) values : *(int64_t *) values;
5654
0
        if ( (type == BCF_HT_INT && end!=bcf_int32_missing) || (type == BCF_HT_LONG && end!=bcf_int64_missing) )
5655
0
        {
5656
0
            if ( end <= line->pos )
5657
0
            {
5658
0
                if ( !negative_rlen_warned )
5659
0
                {
5660
0
                    hts_log_warning("INFO/END=%"PRIhts_pos" is smaller than POS at %s:%"PRIhts_pos,end,bcf_seqname_safe(hdr,line),line->pos+1);
5661
0
                    negative_rlen_warned = 1;
5662
0
                }
5663
0
            }
5664
0
        }
5665
0
    }
5666
0
    if (is_svlen_tag || is_end_tag) {
5667
0
        line->rlen = get_rlen(hdr, line);
5668
0
    }
5669
0
    return 0;
5670
0
}
5671
5672
int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n)
5673
0
{
5674
0
    if ( !n )
5675
0
        return bcf_update_format(hdr,line,key,NULL,0,BCF_HT_STR);
5676
5677
0
    int i, max_len = 0;
5678
0
    for (i=0; i<n; i++)
5679
0
    {
5680
0
        int len = strlen(values[i]);
5681
0
        if ( len > max_len ) max_len = len;
5682
0
    }
5683
0
    char *out = (char*) malloc(max_len*n);
5684
0
    if ( !out ) return -2;
5685
0
    for (i=0; i<n; i++)
5686
0
    {
5687
0
        char *dst = out+i*max_len;
5688
0
        const char *src = values[i];
5689
0
        int j = 0;
5690
0
        while ( src[j] ) { dst[j] = src[j]; j++; }
5691
0
        for (; j<max_len; j++) dst[j] = 0;
5692
0
    }
5693
0
    int ret = bcf_update_format(hdr,line,key,out,max_len*n,BCF_HT_STR);
5694
0
    free(out);
5695
0
    return ret;
5696
0
}
5697
5698
int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
5699
0
{
5700
    // Is the field already present?
5701
0
    int i, fmt_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
5702
0
    int is_len = 0;
5703
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,fmt_id) )
5704
0
    {
5705
0
        if ( !n ) return 0;
5706
0
        return -1;  // the key not present in the header
5707
0
    }
5708
5709
0
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
5710
5711
0
    for (i=0; i<line->n_fmt; i++)
5712
0
        if ( line->d.fmt[i].id==fmt_id ) break;
5713
0
    bcf_fmt_t *fmt = i==line->n_fmt ? NULL : &line->d.fmt[i];
5714
5715
0
    is_len = strcmp(key, "LEN") == 0;
5716
0
    if ( !n )
5717
0
    {
5718
0
        if ( fmt )
5719
0
        {
5720
            // Mark the tag for removal, free existing memory if necessary
5721
0
            if ( fmt->p_free )
5722
0
            {
5723
0
                free(fmt->p - fmt->p_off);
5724
0
                fmt->p_free = 0;
5725
0
            }
5726
0
            line->d.indiv_dirty = 1;
5727
0
            fmt->p = NULL;
5728
0
        }
5729
0
        if (is_len) {
5730
0
            line->rlen = get_rlen(hdr, line);
5731
0
        }
5732
0
        return 0;
5733
0
    }
5734
5735
0
    line->n_sample = bcf_hdr_nsamples(hdr);
5736
0
    int nps = n / line->n_sample;  // number of values per sample
5737
0
    assert( nps && nps*line->n_sample==n );     // must be divisible by n_sample
5738
5739
    // Encode the values and determine the size required to accommodate the values
5740
0
    kstring_t str = {0,0,0};
5741
0
    bcf_enc_int1(&str, fmt_id);
5742
0
    assert(values != NULL);
5743
0
    if ( type==BCF_HT_INT )
5744
0
        bcf_enc_vint(&str, n, (int32_t*)values, nps);
5745
0
    else if ( type==BCF_HT_REAL )
5746
0
    {
5747
0
        bcf_enc_size(&str, nps, BCF_BT_FLOAT);
5748
0
        serialize_float_array(&str, nps*line->n_sample, (float *) values);
5749
0
    }
5750
0
    else if ( type==BCF_HT_STR )
5751
0
    {
5752
0
        bcf_enc_size(&str, nps, BCF_BT_CHAR);
5753
0
        kputsn((char*)values, nps*line->n_sample, &str);
5754
0
    }
5755
0
    else
5756
0
    {
5757
0
        hts_log_error("The type %d not implemented yet at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
5758
0
        abort();
5759
0
    }
5760
5761
0
    if ( !fmt )
5762
0
    {
5763
        // Not present, new format field
5764
0
        line->n_fmt++;
5765
0
        hts_expand0(bcf_fmt_t, line->n_fmt, line->d.m_fmt, line->d.fmt);
5766
5767
        // Special case: VCF specification requires that GT is always first
5768
0
        if ( line->n_fmt > 1 && key[0]=='G' && key[1]=='T' && !key[2] )
5769
0
        {
5770
0
            for (i=line->n_fmt-1; i>0; i--)
5771
0
                line->d.fmt[i] = line->d.fmt[i-1];
5772
0
            fmt = &line->d.fmt[0];
5773
0
        }
5774
0
        else
5775
0
            fmt = &line->d.fmt[line->n_fmt-1];
5776
0
        bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
5777
0
        line->d.indiv_dirty = 1;
5778
0
        fmt->p_free = 1;
5779
0
    }
5780
0
    else
5781
0
    {
5782
        // The tag is already present, check if it is big enough to accommodate the new block
5783
0
        if ( fmt->p && str.l <= fmt->p_len + fmt->p_off )
5784
0
        {
5785
            // good, the block is big enough
5786
0
            if ( str.l != fmt->p_len + fmt->p_off ) line->d.indiv_dirty = 1;
5787
0
            uint8_t *ptr = fmt->p - fmt->p_off;
5788
0
            memcpy(ptr, str.s, str.l);
5789
0
            free(str.s);
5790
0
            int p_free = fmt->p_free;
5791
0
            bcf_unpack_fmt_core1(ptr, line->n_sample, fmt);
5792
0
            fmt->p_free = p_free;
5793
0
        }
5794
0
        else
5795
0
        {
5796
0
            if ( fmt->p_free )
5797
0
                free(fmt->p - fmt->p_off);
5798
0
            bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
5799
0
            fmt->p_free = 1;
5800
0
            line->d.indiv_dirty = 1;
5801
0
        }
5802
0
    }
5803
0
    line->unpacked |= BCF_UN_FMT;
5804
5805
0
    if (is_len) {
5806
0
        line->rlen = get_rlen(hdr, line);
5807
0
    }
5808
0
    return 0;
5809
0
}
5810
5811
5812
int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n)
5813
0
{
5814
0
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
5815
0
    line->d.shared_dirty |= BCF1_DIRTY_FLT;
5816
0
    line->d.n_flt = n;
5817
0
    if ( !n ) return 0;
5818
0
    hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);
5819
0
    int i;
5820
0
    for (i=0; i<n; i++)
5821
0
        line->d.flt[i] = flt_ids[i];
5822
0
    return 0;
5823
0
}
5824
5825
int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id)
5826
0
{
5827
0
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
5828
0
    int i;
5829
0
    for (i=0; i<line->d.n_flt; i++)
5830
0
        if ( flt_id==line->d.flt[i] ) break;
5831
0
    if ( i<line->d.n_flt ) return 0;    // this filter is already set
5832
0
    line->d.shared_dirty |= BCF1_DIRTY_FLT;
5833
0
    if ( flt_id==0 )    // set to PASS
5834
0
        line->d.n_flt = 1;
5835
0
    else if ( line->d.n_flt==1 && line->d.flt[0]==0 )
5836
0
        line->d.n_flt = 1;
5837
0
    else
5838
0
        line->d.n_flt++;
5839
0
    hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);
5840
0
    line->d.flt[line->d.n_flt-1] = flt_id;
5841
0
    return 1;
5842
0
}
5843
int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int pass)
5844
0
{
5845
0
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
5846
0
    int i;
5847
0
    for (i=0; i<line->d.n_flt; i++)
5848
0
        if ( flt_id==line->d.flt[i] ) break;
5849
0
    if ( i==line->d.n_flt ) return 0;   // the filter is not present
5850
0
    line->d.shared_dirty |= BCF1_DIRTY_FLT;
5851
0
    if ( i!=line->d.n_flt-1 ) memmove(line->d.flt+i,line->d.flt+i+1,(line->d.n_flt-i-1)*sizeof(*line->d.flt));
5852
0
    line->d.n_flt--;
5853
0
    if ( !line->d.n_flt && pass ) bcf_add_filter(hdr,line,0);
5854
0
    return 0;
5855
0
}
5856
5857
int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter)
5858
0
{
5859
0
    if ( filter[0]=='.' && !filter[1] ) filter = "PASS";
5860
0
    int id = bcf_hdr_id2int(hdr, BCF_DT_ID, filter);
5861
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FLT,id) ) return -1;  // not defined in the header
5862
5863
0
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
5864
0
    if ( id==0 && !line->d.n_flt) return 1; // PASS
5865
5866
0
    int i;
5867
0
    for (i=0; i<line->d.n_flt; i++)
5868
0
        if ( line->d.flt[i]==id ) return 1;
5869
0
    return 0;
5870
0
}
5871
5872
static inline int _bcf1_sync_alleles(const bcf_hdr_t *hdr, bcf1_t *line, int nals)
5873
0
{
5874
0
    line->d.shared_dirty |= BCF1_DIRTY_ALS;
5875
0
    line->d.var_type = -1;
5876
5877
0
    line->n_allele = nals;
5878
0
    hts_expand(char*, line->n_allele, line->d.m_allele, line->d.allele);
5879
5880
0
    char *als = line->d.als;
5881
0
    int n = 0;
5882
0
    while (n<nals)
5883
0
    {
5884
0
        line->d.allele[n] = als;
5885
0
        while ( *als ) als++;
5886
0
        als++;
5887
0
        n++;
5888
0
    }
5889
    // Update REF length. Note that END is 1-based while line->pos 0-based
5890
0
    line->rlen = get_rlen(hdr, line);
5891
5892
0
    return 0;
5893
0
}
5894
int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals)
5895
0
{
5896
0
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
5897
0
    char *free_old = NULL;
5898
0
    char buffer[256];
5899
0
    size_t used = 0;
5900
5901
    // The pointers in alleles may point into the existing line->d.als memory,
5902
    // so care needs to be taken not to clobber them while updating.  Usually
5903
    // they will be short so we can copy through an intermediate buffer.
5904
    // If they're longer, or won't fit in the existing allocation we
5905
    // can allocate a new buffer to write into.  Note that in either case
5906
    // pointers to line->d.als memory in alleles may not be valid when we've
5907
    // finished.
5908
0
    int i;
5909
0
    size_t avail = line->d.m_als < sizeof(buffer) ? line->d.m_als : sizeof(buffer);
5910
0
    for (i=0; i<nals; i++) {
5911
0
        size_t sz = strlen(alleles[i]) + 1;
5912
0
        if (avail - used < sz)
5913
0
            break;
5914
0
        memcpy(buffer + used, alleles[i], sz);
5915
0
        used += sz;
5916
0
    }
5917
5918
    // Did we miss anything?
5919
0
    if (i < nals) {
5920
0
        int j;
5921
0
        size_t needed = used;
5922
0
        char *new_als;
5923
0
        for (j = i; j < nals; j++)
5924
0
            needed += strlen(alleles[j]) + 1;
5925
0
        if (needed < line->d.m_als) // Don't shrink the buffer
5926
0
            needed = line->d.m_als;
5927
0
        if (needed > INT_MAX) {
5928
0
            hts_log_error("REF + alleles too long to fit in a BCF record");
5929
0
            return -1;
5930
0
        }
5931
0
        new_als = malloc(needed);
5932
0
        if (!new_als)
5933
0
            return -1;
5934
0
        free_old = line->d.als;
5935
0
        line->d.als = new_als;
5936
0
        line->d.m_als = needed;
5937
0
    }
5938
5939
    // Copy from the temp buffer to the destination
5940
0
    if (used) {
5941
0
        assert(used <= line->d.m_als);
5942
0
        memcpy(line->d.als, buffer, used);
5943
0
    }
5944
5945
    // Add in any remaining entries - if this happens we will always be
5946
    // writing to a newly-allocated buffer.
5947
0
    for (; i < nals; i++) {
5948
0
        size_t sz = strlen(alleles[i]) + 1;
5949
0
        memcpy(line->d.als + used, alleles[i], sz);
5950
0
        used += sz;
5951
0
    }
5952
5953
0
    if (free_old)
5954
0
        free(free_old);
5955
0
    return _bcf1_sync_alleles(hdr,line,nals);
5956
0
}
5957
5958
int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string)
5959
0
{
5960
0
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
5961
0
    kstring_t tmp;
5962
0
    tmp.l = 0; tmp.s = line->d.als; tmp.m = line->d.m_als;
5963
0
    kputs(alleles_string, &tmp);
5964
0
    line->d.als = tmp.s; line->d.m_als = tmp.m;
5965
5966
0
    int nals = 1;
5967
0
    char *t = line->d.als;
5968
0
    while (*t)
5969
0
    {
5970
0
        if ( *t==',' ) { *t = 0; nals++; }
5971
0
        t++;
5972
0
    }
5973
0
    return _bcf1_sync_alleles(hdr, line, nals);
5974
0
}
5975
5976
int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
5977
0
{
5978
0
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
5979
0
    kstring_t tmp;
5980
0
    tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id;
5981
0
    if ( id )
5982
0
        kputs(id, &tmp);
5983
0
    else
5984
0
        kputs(".", &tmp);
5985
0
    line->d.id = tmp.s; line->d.m_id = tmp.m;
5986
0
    line->d.shared_dirty |= BCF1_DIRTY_ID;
5987
0
    return 0;
5988
0
}
5989
5990
int bcf_add_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
5991
0
{
5992
0
    if ( !id ) return 0;
5993
0
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
5994
5995
0
    kstring_t tmp;
5996
0
    tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id;
5997
5998
0
    int len = strlen(id);
5999
0
    char *dst = line->d.id;
6000
0
    while ( *dst && (dst=strstr(dst,id)) )
6001
0
    {
6002
0
        if ( dst[len]!=0 && dst[len]!=';' ) dst++;              // a prefix, not a match
6003
0
        else if ( dst==line->d.id || dst[-1]==';' ) return 0;   // already present
6004
0
        dst++;  // a suffix, not a match
6005
0
    }
6006
0
    if ( line->d.id && (line->d.id[0]!='.' || line->d.id[1]) )
6007
0
    {
6008
0
        tmp.l = strlen(line->d.id);
6009
0
        kputc(';',&tmp);
6010
0
    }
6011
0
    kputs(id,&tmp);
6012
6013
0
    line->d.id = tmp.s; line->d.m_id = tmp.m;
6014
0
    line->d.shared_dirty |= BCF1_DIRTY_ID;
6015
0
    return 0;
6016
6017
0
}
6018
6019
bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
6020
0
{
6021
0
    int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
6022
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) ) return NULL;   // no such FMT field in the header
6023
0
    return bcf_get_fmt_id(line, id);
6024
0
}
6025
6026
bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
6027
0
{
6028
0
    int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
6029
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,id) ) return NULL;   // no such INFO field in the header
6030
0
    return bcf_get_info_id(line, id);
6031
0
}
6032
6033
bcf_fmt_t *bcf_get_fmt_id(bcf1_t *line, const int id)
6034
0
{
6035
0
    int i;
6036
0
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
6037
0
    for (i=0; i<line->n_fmt; i++)
6038
0
    {
6039
0
        if ( line->d.fmt[i].id==id ) return &line->d.fmt[i];
6040
0
    }
6041
0
    return NULL;
6042
0
}
6043
6044
bcf_info_t *bcf_get_info_id(bcf1_t *line, const int id)
6045
0
{
6046
0
    int i;
6047
0
    if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
6048
0
    for (i=0; i<line->n_info; i++)
6049
0
    {
6050
0
        if ( line->d.info[i].key==id ) return &line->d.info[i];
6051
0
    }
6052
0
    return NULL;
6053
0
}
6054
6055
6056
int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
6057
0
{
6058
0
    int i, ret = -4, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
6059
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,tag_id) ) return -1;    // no such INFO field in the header
6060
0
    if ( bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id)!=(type & 0xff) ) return -2;     // expected different type
6061
6062
0
    if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
6063
6064
0
    for (i=0; i<line->n_info; i++)
6065
0
        if ( line->d.info[i].key==tag_id ) break;
6066
0
    if ( i==line->n_info ) return ( type==BCF_HT_FLAG ) ? 0 : -3;       // the tag is not present in this record
6067
0
    if ( type==BCF_HT_FLAG ) return 1;
6068
6069
0
    bcf_info_t *info = &line->d.info[i];
6070
0
    if ( !info->vptr ) return -3;           // the tag was marked for removal
6071
0
    if ( type==BCF_HT_STR )
6072
0
    {
6073
0
        if ( *ndst < info->len+1 )
6074
0
        {
6075
0
            *ndst = info->len + 1;
6076
0
            *dst  = realloc(*dst, *ndst);
6077
0
        }
6078
0
        memcpy(*dst,info->vptr,info->len);
6079
0
        ((uint8_t*)*dst)[info->len] = 0;
6080
0
        return info->len;
6081
0
    }
6082
6083
    // Make sure the buffer is big enough
6084
0
    int size1;
6085
0
    switch (type) {
6086
0
        case BCF_HT_INT:  size1 = sizeof(int32_t); break;
6087
0
        case BCF_HT_LONG: size1 = sizeof(int64_t); break;
6088
0
        case BCF_HT_REAL: size1 = sizeof(float); break;
6089
0
        default:
6090
0
            hts_log_error("Unexpected output type %d at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
6091
0
            return -2;
6092
0
    }
6093
0
    if ( *ndst < info->len )
6094
0
    {
6095
0
        *ndst = info->len;
6096
0
        *dst  = realloc(*dst, *ndst * size1);
6097
0
    }
6098
6099
0
    #define BRANCH(type_t, convert, is_missing, is_vector_end, set_missing, set_regular, out_type_t) do { \
6100
0
        out_type_t *tmp = (out_type_t *) *dst; \
6101
0
        int j; \
6102
0
        for (j=0; j<info->len; j++) \
6103
0
        { \
6104
0
            type_t p = convert(info->vptr + j * sizeof(type_t)); \
6105
0
            if ( is_vector_end ) break; \
6106
0
            if ( is_missing ) set_missing; \
6107
0
            else set_regular; \
6108
0
            tmp++; \
6109
0
        } \
6110
0
        ret = j; \
6111
0
    } while (0)
6112
0
    switch (info->type) {
6113
0
        case BCF_BT_INT8:
6114
0
            if (type == BCF_HT_LONG) {
6115
0
                BRANCH(int8_t,  le_to_i8,  p==bcf_int8_missing,  p==bcf_int8_vector_end,  *tmp=bcf_int64_missing, *tmp=p, int64_t);
6116
0
            } else {
6117
0
                BRANCH(int8_t,  le_to_i8,  p==bcf_int8_missing,  p==bcf_int8_vector_end,  *tmp=bcf_int32_missing, *tmp=p, int32_t);
6118
0
            }
6119
0
            break;
6120
0
        case BCF_BT_INT16:
6121
0
            if (type == BCF_HT_LONG) {
6122
0
                BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t);
6123
0
            } else {
6124
0
                BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t);
6125
0
            }
6126
0
            break;
6127
0
        case BCF_BT_INT32:
6128
0
            if (type == BCF_HT_LONG) {
6129
0
                BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t); break;
6130
0
            } else {
6131
0
                BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); break;
6132
0
            }
6133
0
        case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set(tmp, p), float); break;
6134
0
        default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, info->type, bcf_seqname_safe(hdr,line), line->pos+1); return -2;
6135
0
    }
6136
0
    #undef BRANCH
6137
0
    return ret;  // set by BRANCH
6138
0
}
6139
6140
int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst)
6141
0
{
6142
0
    int i,tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
6143
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1;    // no such FORMAT field in the header
6144
0
    if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2;     // expected different type
6145
6146
0
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
6147
6148
0
    for (i=0; i<line->n_fmt; i++)
6149
0
        if ( line->d.fmt[i].id==tag_id ) break;
6150
0
    if ( i==line->n_fmt ) return -3;                               // the tag is not present in this record
6151
0
    bcf_fmt_t *fmt = &line->d.fmt[i];
6152
0
    if ( !fmt->p ) return -3;                                      // the tag was marked for removal
6153
6154
0
    int nsmpl = bcf_hdr_nsamples(hdr);
6155
0
    if ( !*dst )
6156
0
    {
6157
0
        *dst = (char**) malloc(sizeof(char*)*nsmpl);
6158
0
        if ( !*dst ) return -4;     // could not alloc
6159
0
        (*dst)[0] = NULL;
6160
0
    }
6161
0
    int n = (fmt->n+1)*nsmpl;
6162
0
    if ( *ndst < n )
6163
0
    {
6164
0
        (*dst)[0] = realloc((*dst)[0], n);
6165
0
        if ( !(*dst)[0] ) return -4;    // could not alloc
6166
0
        *ndst = n;
6167
0
    }
6168
0
    for (i=0; i<nsmpl; i++)
6169
0
    {
6170
0
        uint8_t *src = fmt->p + i*fmt->n;
6171
0
        uint8_t *tmp = (uint8_t*)(*dst)[0] + i*(fmt->n+1);
6172
0
        memcpy(tmp,src,fmt->n);
6173
0
        tmp[fmt->n] = 0;
6174
0
        (*dst)[i] = (char*) tmp;
6175
0
    }
6176
0
    return n;
6177
0
}
6178
6179
int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
6180
0
{
6181
0
    int i,j, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
6182
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1;    // no such FORMAT field in the header
6183
0
    if ( tag[0]=='G' && tag[1]=='T' && tag[2]==0 )
6184
0
    {
6185
        // Ugly: GT field is considered to be a string by the VCF header but BCF represents it as INT.
6186
0
        if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2;
6187
0
    }
6188
0
    else if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=type ) return -2;     // expected different type
6189
6190
0
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
6191
6192
0
    for (i=0; i<line->n_fmt; i++)
6193
0
        if ( line->d.fmt[i].id==tag_id ) break;
6194
0
    if ( i==line->n_fmt ) return -3;                               // the tag is not present in this record
6195
0
    bcf_fmt_t *fmt = &line->d.fmt[i];
6196
0
    if ( !fmt->p ) return -3;                                      // the tag was marked for removal
6197
6198
0
    if ( type==BCF_HT_STR )
6199
0
    {
6200
0
        int n = fmt->n*bcf_hdr_nsamples(hdr);
6201
0
        if ( *ndst < n )
6202
0
        {
6203
0
            *dst  = realloc(*dst, n);
6204
0
            if ( !*dst ) return -4;     // could not alloc
6205
0
            *ndst = n;
6206
0
        }
6207
0
        memcpy(*dst,fmt->p,n);
6208
0
        return n;
6209
0
    }
6210
6211
    // Make sure the buffer is big enough
6212
0
    int nsmpl = bcf_hdr_nsamples(hdr);
6213
0
    int size1 = type==BCF_HT_INT ? sizeof(int32_t) : sizeof(float);
6214
0
    if ( *ndst < fmt->n*nsmpl )
6215
0
    {
6216
0
        *ndst = fmt->n*nsmpl;
6217
0
        *dst  = realloc(*dst, *ndst*size1);
6218
0
        if ( !*dst ) return -4;     // could not alloc
6219
0
    }
6220
6221
0
    #define BRANCH(type_t, convert, is_missing, is_vector_end, set_missing, set_vector_end, set_regular, out_type_t) { \
6222
0
        out_type_t *tmp = (out_type_t *) *dst; \
6223
0
        uint8_t *fmt_p = fmt->p; \
6224
0
        for (i=0; i<nsmpl; i++) \
6225
0
        { \
6226
0
            for (j=0; j<fmt->n; j++) \
6227
0
            { \
6228
0
                type_t p = convert(fmt_p + j * sizeof(type_t)); \
6229
0
                if ( is_missing ) set_missing; \
6230
0
                else if ( is_vector_end ) { set_vector_end; break; } \
6231
0
                else set_regular; \
6232
0
                tmp++; \
6233
0
            } \
6234
0
            for (; j<fmt->n; j++) { set_vector_end; tmp++; } \
6235
0
            fmt_p += fmt->size; \
6236
0
        } \
6237
0
    }
6238
0
    switch (fmt->type) {
6239
0
        case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8, p==bcf_int8_missing,  p==bcf_int8_vector_end,  *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
6240
0
        case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
6241
0
        case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
6242
0
        case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set_vector_end(*tmp), bcf_float_set(tmp, p), float); break;
6243
0
        default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, fmt->type, bcf_seqname_safe(hdr,line), line->pos+1); exit(1);
6244
0
    }
6245
0
    #undef BRANCH
6246
6247
0
    return nsmpl*fmt->n;
6248
0
}
6249
6250
//error description structure definition
6251
typedef struct err_desc {
6252
    int  errorcode;
6253
    const char *description;
6254
}err_desc;
6255
6256
// error descriptions
6257
static const err_desc errdesc_bcf[] = {
6258
    { BCF_ERR_CTG_UNDEF, "Contig not defined in header"},
6259
    { BCF_ERR_TAG_UNDEF, "Tag not defined in header" },
6260
    { BCF_ERR_NCOLS, "Incorrect number of columns" },
6261
    { BCF_ERR_LIMITS, "Limits reached" },
6262
    { BCF_ERR_CHAR, "Invalid character" },
6263
    { BCF_ERR_CTG_INVALID, "Invalid contig" },
6264
    { BCF_ERR_TAG_INVALID, "Invalid tag" },
6265
};
6266
6267
/// append given description to buffer based on available size and add ... when not enough space
6268
    /** @param buffer       buffer to which description to be appended
6269
        @param offset       offset at which to be appended
6270
        @param maxbuffer    maximum size of the buffer
6271
        @param description  the description to be appended
6272
on failure returns -1 - when buffer is not big enough; returns -1 on invalid params and on too small buffer which are improbable due to validation at caller site
6273
on success returns 0
6274
    */
6275
2.55k
static int add_desc_to_buffer(char *buffer, size_t *offset, size_t maxbuffer, const char *description) {
6276
6277
2.55k
    if (!description || !buffer || !offset || (maxbuffer < 4))
6278
0
        return -1;
6279
6280
2.55k
    size_t rembuffer = maxbuffer - *offset;
6281
2.55k
    if (rembuffer > (strlen(description) + (rembuffer == maxbuffer ? 0 : 1))) {    //add description with optionally required ','
6282
2.55k
        *offset += snprintf(buffer + *offset, rembuffer, "%s%s", (rembuffer == maxbuffer)? "": ",", description);
6283
2.55k
    } else {    //not enough space for description, put ...
6284
0
        size_t tmppos = (rembuffer <= 4) ? maxbuffer - 4 : *offset;
6285
0
        snprintf(buffer + tmppos, 4, "...");    //ignore offset update
6286
0
        return -1;
6287
0
    }
6288
2.55k
    return 0;
6289
2.55k
}
6290
6291
//get description for given error code. return NULL on error
6292
1.22k
const char *bcf_strerror(int errorcode, char *buffer, size_t maxbuffer) {
6293
1.22k
    size_t usedup = 0;
6294
1.22k
    int ret = 0;
6295
1.22k
    int idx;
6296
6297
1.22k
    if (!buffer || maxbuffer < 4)
6298
0
        return NULL;           //invalid / insufficient buffer
6299
6300
1.22k
    if (!errorcode) {
6301
0
        buffer[0] = '\0';      //no error, set null
6302
0
        return buffer;
6303
0
    }
6304
6305
9.82k
    for (idx = 0; idx < sizeof(errdesc_bcf) / sizeof(err_desc); ++idx) {
6306
8.59k
        if (errorcode & errdesc_bcf[idx].errorcode) {    //error is set, add description
6307
2.55k
            ret = add_desc_to_buffer(buffer, &usedup, maxbuffer, errdesc_bcf[idx].description);
6308
2.55k
            if (ret < 0)
6309
0
                break;         //not enough space, ... added, no need to continue
6310
6311
2.55k
            errorcode &= ~errdesc_bcf[idx].errorcode;    //reset the error
6312
2.55k
        }
6313
8.59k
    }
6314
6315
1.22k
    if (errorcode && (ret >= 0))  {     //undescribed error is present in error code and had enough buffer, try to add unkonwn error as well§
6316
0
        add_desc_to_buffer(buffer, &usedup, maxbuffer, "Unknown error");
6317
0
    }
6318
1.22k
    return buffer;
6319
1.22k
}
6320
6321
/**
6322
 *  bcf_format_gt_v2 - formats GT information on a string
6323
 *  @param hdr - bcf header, to get version
6324
 *  @param fmt - pointer to bcf format data
6325
 *  @param isample - position of interested sample in data
6326
 *  @param str - pointer to output string
6327
 *  Returns 0 on success and -1 on failure
6328
 *  This method is preferred over bcf_format_gt as this supports vcf4.4 and
6329
 *  prefixed phasing. Explicit / prefixed phasing for 1st allele is used only
6330
 *  when it is a must to correctly express phasing.
6331
 * correctly express phasing.
6332
 */
6333
int bcf_format_gt_v2(const bcf_hdr_t *hdr, bcf_fmt_t *fmt, int isample, kstring_t *str)
6334
10.5k
{
6335
10.5k
    uint32_t e = 0;
6336
10.5k
    int ploidy = 1, anyunphased = 0;
6337
10.5k
    int32_t val0 = 0;
6338
10.5k
    size_t pos = str ? str->l : 0;
6339
6340
10.5k
    #define BRANCH(type_t, convert, missing, vector_end) { \
6341
9.38k
        uint8_t *ptr = fmt->p + isample*fmt->size; \
6342
9.38k
        int i; \
6343
24.5k
        for (i=0; i<fmt->n; i++, ptr += sizeof(type_t)) \
6344
19.7k
        { \
6345
19.7k
            type_t val = convert(ptr); \
6346
19.7k
            if ( val == vector_end ) break; \
6347
19.7k
            if (!i) { val0 = val; } \
6348
15.1k
            if (i) { \
6349
5.80k
                e |= kputc("/|"[val & 1], str) < 0; \
6350
5.80k
                anyunphased |= !(val & 1); \
6351
5.80k
            } \
6352
15.1k
            if (!(val >> 1)) e |= kputc('.', str) < 0; \
6353
15.1k
            else e |= kputw((val >> 1) - 1, str) < 0; \
6354
15.1k
        } \
6355
9.38k
        if (i == 0) e |= kputc('.', str) < 0; \
6356
9.38k
        ploidy = i; \
6357
9.38k
    }
6358
10.5k
    switch (fmt->type) {
6359
4.87k
        case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8,  bcf_int8_missing,
6360
4.87k
            bcf_int8_vector_end); break;
6361
1.55k
        case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, bcf_int16_missing,
6362
1.55k
            bcf_int16_vector_end); break;
6363
2.95k
        case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, bcf_int32_missing,
6364
2.95k
            bcf_int32_vector_end); break;
6365
1.14k
        case BCF_BT_NULL:  e |= kputc('.', str) < 0; break;
6366
0
        default: hts_log_error("Unexpected type %d", fmt->type); return -2;
6367
10.5k
    }
6368
10.5k
    #undef BRANCH
6369
6370
10.5k
    if (hdr && get_hdr_aux(hdr)->version >= VCF44) {
6371
        //output which supports prefixed phasing
6372
6373
        /* update 1st allele's phasing if required and append rest to it.
6374
        use prefixed phasing only when it is a must. i.e. without which the
6375
        inferred value will be incorrect */
6376
6.45k
        if (val0 & 1) {
6377
            /* 1st one is phased, if ploidy is > 1 and an unphased allele exists
6378
             need to specify explicitly */
6379
878
            e |= (ploidy > 1 && anyunphased) ?
6380
57
                    (kinsert_char('|', pos, str) < 0) :
6381
878
                        (ploidy <= 1 && !((val0 >> 1)) ? //|. needs explicit o/p
6382
0
                            (kinsert_char('|', pos, str) < 0) :
6383
821
                            0);
6384
5.57k
        } else {
6385
            /* 1st allele is unphased, if ploidy is = 1 or allele is '.' or
6386
             ploidy > 1 and no other unphased allele exist, need to specify
6387
             explicitly */
6388
5.57k
            e |= ((ploidy <= 1 && val0 != 0) || (ploidy > 1 && !anyunphased)) ?
6389
3.35k
                    (kinsert_char('/', pos, str) < 0) :
6390
5.57k
                    0;
6391
5.57k
        }
6392
6.45k
    }
6393
10.5k
    return e == 0 ? 0 : -1;
6394
10.5k
}
6395
6396
/**
6397
 *  get_rlen - calculates and returns rlen value
6398
 *  @param h - bcf header
6399
 *  @param v - bcf data
6400
 *  Returns rlen calculated on success and -1 on failure.
6401
 *  rlen calculation is dependent on vcf version and a few other field data.
6402
 *  When bcf decoded data is available, refers it. When not available, retrieves
6403
 *  required field data by seeking on the data stream.
6404
 *  Ideally pos & version be set appropriately before any info/format field
6405
 *  update to have proper rlen calculation.
6406
 *  As version is not kept properly updated in practice, it is ignored in calcs.
6407
 */
6408
static int64_t get_rlen(const bcf_hdr_t *h, bcf1_t *v)
6409
29.9k
{
6410
29.9k
    uint8_t *f = (uint8_t*)v->shared.s, *t = NULL,
6411
29.9k
        *e = (uint8_t*)v->shared.s + v->shared.l;
6412
29.9k
    int size, type, id, lenid, endid, svlenid, i, bad, gvcf = 0, use_svlen = 0;
6413
29.9k
    bcf_info_t *endinfo = NULL, *svleninfo = NULL, end_lcl, svlen_lcl;
6414
29.9k
    bcf_fmt_t *lenfmt = NULL, len_lcl;
6415
6416
    //holds SVLEN allele status for the max no of alleles
6417
29.9k
    uint8_t svlenals[8192];
6418
    //pos from info END, fmt LEN, info SVLEN
6419
29.9k
    hts_pos_t end = 0, end_fmtlen = 0, end_svlen = 0, hpos;
6420
29.9k
    int64_t len_ref = 0, len = 0, tmp;
6421
29.9k
    endid = bcf_hdr_id2int(h, BCF_DT_ID, "END");
6422
6423
    //initialise bytes which are to be used
6424
29.9k
    memset(svlenals, 0, 1 + v->n_allele / 8);
6425
6426
    //use decoded data where ever available and where not, get from stream
6427
29.9k
    if (v->unpacked & BCF_UN_STR || v->d.shared_dirty & BCF1_DIRTY_ALS) {
6428
0
        for (i = 1; i < v->n_allele; ++i) {
6429
            // check only symbolic alt alleles
6430
0
            if (v->d.allele[i][0] != '<')
6431
0
                continue;
6432
0
            if (svlen_on_ref_for_vcf_alt(v->d.allele[i], -1)) {
6433
                // del, dup or cnv allele, note to check corresponding svlen val
6434
0
                svlenals[i >> 3] |= 1 << (i & 7);
6435
0
                use_svlen = 1;
6436
0
            } else if (!strcmp(v->d.allele[i], "<*>") ||
6437
0
                         !strcmp(v->d.allele[i], "<NON_REF>")) {
6438
0
                gvcf = 1;   //gvcf present, have to check for LEN field
6439
0
            }
6440
0
        }
6441
0
        f += v->unpack_size[0] + v->unpack_size[1];
6442
0
        len_ref = v->n_allele ? strlen(v->d.allele[0]) : 0;
6443
29.9k
    } else if (f < e) {
6444
        //skip ID
6445
29.9k
        size = bcf_dec_size(f, &f, &type);
6446
29.9k
        f += size << bcf_type_shift[type];
6447
        // REF, ALT
6448
2.01M
        for (i = 0; i < v->n_allele; ++i) {
6449
            //check all alleles, w/o NUL
6450
1.98M
            size = bcf_dec_size(f, &f, &type);
6451
1.98M
            if (!i) {   //REF length
6452
29.9k
                len_ref = size;
6453
1.95M
            } else if (size > 0 && *f == '<') {
6454
6.25k
                if (svlen_on_ref_for_vcf_alt((char *) f, size)) {
6455
                    // del, dup or cnv allele, note to check corresponding svlen val
6456
6
                    svlenals[i >> 3] |= 1 << (i & 7);
6457
6
                    use_svlen = 1;
6458
6.25k
                } else if ((size == 3 && !strncmp((char*)f, "<*>", size)) ||
6459
4.28k
                    (size == 9 && !strncmp((char*)f, "<NON_REF>", size))) {
6460
2.13k
                    gvcf = 1;   //gvcf present, have to check for LEN field
6461
2.13k
                }
6462
6.25k
            }
6463
1.98M
            f += size << bcf_type_shift[type];
6464
1.98M
        }
6465
29.9k
    }
6466
    // FILTER
6467
29.9k
    if (v->unpacked & BCF_UN_FLT) {
6468
0
        f += v->unpack_size[2];
6469
29.9k
    } else if (f < e) {
6470
29.9k
        size = bcf_dec_size(f, &f, &type);
6471
29.9k
        f += size << bcf_type_shift[type];
6472
29.9k
    }
6473
6474
    // Only do SVLEN lookup if there are suitable symbolic alleles
6475
29.9k
    svlenid = use_svlen ? bcf_hdr_id2int(h, BCF_DT_ID, "SVLEN") : -1;
6476
6477
    // INFO
6478
29.9k
    if (svlenid >= 0 || endid >= 0 ) {  //only if end/svlen present
6479
13.0k
        if (v->unpacked & BCF_UN_INFO || v->d.shared_dirty & BCF1_DIRTY_INF) {
6480
0
            endinfo = bcf_get_info(h, v, "END");
6481
0
            svleninfo = bcf_get_info(h, v, "SVLEN");
6482
13.0k
        } else if (f < e) {
6483
13.6k
            for (i = 0; i < v->n_info; ++i) {
6484
8.91k
                id = bcf_dec_typed_int1(f, &t);
6485
8.91k
                if (id == endid) {  //END
6486
969
                    t = bcf_unpack_info_core1(f, &end_lcl);
6487
969
                    endinfo = &end_lcl;
6488
969
                    if (svleninfo || svlenid < 0) {
6489
969
                        break;  //already got svlen or no need to search further
6490
969
                    }
6491
7.94k
                } else if (id == svlenid) { //SVLEN
6492
0
                    t = bcf_unpack_info_core1(f, &svlen_lcl);
6493
0
                    svleninfo = &svlen_lcl;
6494
0
                    if (endinfo || endid < 0 ) {
6495
0
                        break;  //already got end or no need to search further
6496
0
                    }
6497
7.94k
                } else {
6498
7.94k
                    f = t;
6499
7.94k
                    size = bcf_dec_size(f, &t, &type);
6500
7.94k
                    t += size << bcf_type_shift[type];
6501
7.94k
                }
6502
7.94k
                f = t;
6503
7.94k
            }
6504
5.75k
        }
6505
13.0k
    }
6506
6507
    // Only do LEN lookup if a <*> allele was found
6508
29.9k
    lenid = gvcf ? bcf_hdr_id2int(h, BCF_DT_ID, "LEN") : -1;
6509
6510
    // FORMAT
6511
29.9k
    if (lenid >= 0) {
6512
        //with LEN and has gvcf allele
6513
0
        f = (uint8_t*)v->indiv.s; t = NULL; e = (uint8_t*)v->indiv.s + v->indiv.l;
6514
0
        if (v->unpacked & BCF_UN_FMT || v->d.indiv_dirty) {
6515
0
            lenfmt = bcf_get_fmt(h, v, "LEN");
6516
0
        } else if (f < e) {
6517
0
            for (i = 0; i < v->n_fmt; ++i) {
6518
0
                id = bcf_dec_typed_int1(f, &t);
6519
0
                if (id == lenid) {
6520
0
                        t = bcf_unpack_fmt_core1(f, v->n_sample, &len_lcl);
6521
0
                    lenfmt = &len_lcl;
6522
0
                    break;  //that's all needed
6523
0
                } else {
6524
0
                    f = t;
6525
0
                    size = bcf_dec_size(f, &t, &type);
6526
0
                    t += size * v->n_sample << bcf_type_shift[type];
6527
0
                }
6528
0
                f = t;
6529
0
            }
6530
0
        }
6531
0
    }
6532
    //got required data, find end and rlen
6533
29.9k
    if (endinfo && endinfo->vptr) { //end position given by info END
6534
        //end info exists, not being deleted
6535
969
        end = endinfo->v1.i;
6536
969
        switch(endinfo->type) {
6537
0
            case BCF_BT_INT8:  end = end == bcf_int8_missing ? 0 : end;  break;
6538
0
            case BCF_BT_INT16: end = end == bcf_int16_missing ? 0 : end; break;
6539
0
            case BCF_BT_INT32: end = end == bcf_int32_missing ? 0 : end; break;
6540
0
            case BCF_BT_INT64: end = end == bcf_int64_missing ? 0 : end; break;
6541
969
            default: end = 0; break; //invalid
6542
969
        }
6543
969
    }
6544
6545
29.9k
    if (svleninfo && svleninfo->vptr) {
6546
        //svlen info exists, not being deleted
6547
0
        bad = 0;
6548
        //get largest svlen corresponding to a <DEL> symbolic allele
6549
0
        for (i = 0; i < svleninfo->len && i + 1 < v->n_allele; ++i) {
6550
0
            if (!(svlenals[i >> 3] & (1 << ((i + 1) & 7))))
6551
0
                continue;
6552
6553
0
            switch(svleninfo->type) {
6554
0
                case BCF_BT_INT8:
6555
0
                    tmp = le_to_i8(&svleninfo->vptr[i]);
6556
0
                    tmp = tmp == bcf_int8_missing ? 0 : tmp;
6557
0
                break;
6558
0
                case BCF_BT_INT16:
6559
0
                    tmp = le_to_i16(&svleninfo->vptr[i * 2]);
6560
0
                    tmp = tmp == bcf_int16_missing ? 0 : tmp;
6561
0
                break;
6562
0
                case BCF_BT_INT32:
6563
0
                    tmp = le_to_i32(&svleninfo->vptr[i * 4]);
6564
0
                    tmp = tmp == bcf_int32_missing ? 0 : tmp;
6565
0
                break;
6566
0
                case BCF_BT_INT64:
6567
0
                    tmp = le_to_i64(&svleninfo->vptr[i * 8]);
6568
0
                    tmp = tmp == bcf_int64_missing ? 0 : tmp;
6569
0
                break;
6570
0
                default: //invalid
6571
0
                    tmp = 0;
6572
0
                    bad = 1;
6573
0
                break;
6574
0
            }
6575
0
            if (bad) {  //stop svlen check
6576
0
                len = 0;
6577
0
                break;
6578
0
            }
6579
6580
0
            tmp = tmp < 0 ? llabs(tmp) : tmp;
6581
0
            if (len < tmp) len = tmp;
6582
0
        }
6583
0
    }
6584
29.9k
    if ((!svleninfo || !len) && end) { //no svlen, infer from end
6585
0
        len = end > v->pos ? end - v->pos - 1 : 0;
6586
0
    }
6587
29.9k
    end_svlen = v->pos + len + 1;   //end position found from SVLEN
6588
6589
29.9k
    len = 0;
6590
29.9k
    if (lenfmt && lenfmt->p) {
6591
        //fmt len exists, not being deleted, has gvcf and version >= 4.5
6592
0
        int j = 0;
6593
0
        int64_t offset = 0;
6594
0
        bad = 0;
6595
0
        for (i = 0; i < v->n_sample; ++i) {
6596
0
            for (j = 0; j < lenfmt->n; ++j) {
6597
0
                switch(lenfmt->type) {
6598
0
                case BCF_BT_INT8:
6599
0
                    tmp = le_to_i8(lenfmt->p + offset + j);
6600
0
                    tmp = tmp == bcf_int8_missing ? 0 : tmp;
6601
0
                break;
6602
0
                case BCF_BT_INT16:
6603
0
                    tmp = le_to_i16(lenfmt->p + offset + j * 2);
6604
0
                    tmp = tmp == bcf_int16_missing ? 0 : tmp;
6605
0
                break;
6606
0
                case BCF_BT_INT32:
6607
0
                    tmp = le_to_i32(lenfmt->p + offset + j * 4);
6608
0
                    tmp = tmp == bcf_int32_missing ? 0 : tmp;
6609
0
                break;
6610
0
                case BCF_BT_INT64:
6611
0
                    tmp = le_to_i64(lenfmt->p + offset + j * 8);
6612
0
                    tmp = tmp == bcf_int64_missing ? 0 : tmp;
6613
0
                break;
6614
0
                default: //invalid
6615
0
                    bad = 1;
6616
0
                break;
6617
0
                }
6618
0
                if (bad) {  //stop LEN check
6619
0
                    len = 0;
6620
0
                    break;
6621
0
                }
6622
                //assumes only gvcf have valid LEN
6623
0
                if (len < tmp) len = tmp;
6624
0
            }
6625
0
            offset += j << bcf_type_shift[lenfmt->type];
6626
0
        }
6627
0
    }
6628
29.9k
    if ((!lenfmt || !len) && end) { //no fmt len, infer from end
6629
0
        len = end > v->pos ? end - v->pos : 0;
6630
0
    }
6631
29.9k
    end_fmtlen = v->pos + len;  //end position found from LEN
6632
6633
    //get largest pos, based on END, SVLEN, fmt LEN and length using it
6634
29.9k
    hpos = end < end_svlen ?
6635
8.53k
            end_svlen < end_fmtlen ? end_fmtlen : end_svlen :
6636
29.9k
            end < end_fmtlen ? end_fmtlen : end;
6637
29.9k
    len = hpos - v->pos;
6638
6639
    //NOTE: 'end' calculation be in sync with tbx.c:tbx_parse1
6640
6641
    /* rlen to be calculated based on version, END, SVLEN, fmt LEN, ref len.
6642
    Relevance of these fields vary across different vcf versions.
6643
    Many times, these info/fmt fields are used without version updates;
6644
    hence these fields are used for calculation disregarding vcf version */
6645
29.9k
    return len < len_ref ? len_ref : len;
6646
29.9k
}