Coverage Report

Created: 2025-11-15 06:33

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/vcf.c
Line
Count
Source
1
/*  vcf.c -- VCF/BCF API functions.
2
3
    Copyright (C) 2012, 2013 Broad Institute.
4
    Copyright (C) 2012-2025 Genome Research Ltd.
5
    Portions copyright (C) 2014 Intel Corporation.
6
7
    Author: Heng Li <lh3@sanger.ac.uk>
8
9
Permission is hereby granted, free of charge, to any person obtaining a copy
10
of this software and associated documentation files (the "Software"), to deal
11
in the Software without restriction, including without limitation the rights
12
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13
copies of the Software, and to permit persons to whom the Software is
14
furnished to do so, subject to the following conditions:
15
16
The above copyright notice and this permission notice shall be included in
17
all copies or substantial portions of the Software.
18
19
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25
DEALINGS IN THE SOFTWARE.  */
26
27
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
28
#include <config.h>
29
30
#include <stdio.h>
31
#include <assert.h>
32
#include <string.h>
33
#include <strings.h>
34
#include <stdlib.h>
35
#include <limits.h>
36
#include <stdint.h>
37
#include <inttypes.h>
38
#include <errno.h>
39
40
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
41
#include "fuzz_settings.h"
42
#endif
43
44
#include "htslib/vcf.h"
45
#include "htslib/bgzf.h"
46
#include "htslib/tbx.h"
47
#include "htslib/hfile.h"
48
#include "hts_internal.h"
49
#include "htslib/hts_endian.h"
50
#include "htslib/khash_str2int.h"
51
#include "htslib/kstring.h"
52
#include "htslib/sam.h"
53
#include "htslib/khash.h"
54
#include "bgzf_internal.h"
55
56
#if 0
57
// This helps on Intel a bit, often 6-7% faster VCF parsing.
58
// Conversely sometimes harms AMD Zen4 as ~9% slower.
59
// Possibly related to IPC differences.  However for now it's just a
60
// curiousity we ignore and stick with the simpler code.
61
//
62
// Left here as a hint for future explorers.
63
static inline int xstreq(const char *a, const char *b) {
64
    while (*a && *a == *b)
65
        a++, b++;
66
    return *a == *b;
67
}
68
69
#define KHASH_MAP_INIT_XSTR(name, khval_t) \
70
  KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, xstreq)
71
72
KHASH_MAP_INIT_XSTR(vdict, bcf_idinfo_t)
73
#else
74
KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t)
75
#endif
76
77
typedef khash_t(vdict) vdict_t;
78
79
KHASH_MAP_INIT_STR(hdict, bcf_hrec_t*)
80
typedef khash_t(hdict) hdict_t;
81
82
83
#include "htslib/kseq.h"
84
HTSLIB_EXPORT
85
uint32_t bcf_float_missing    = 0x7F800001;
86
87
HTSLIB_EXPORT
88
uint32_t bcf_float_vector_end = 0x7F800002;
89
90
HTSLIB_EXPORT
91
uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
92
93
static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, NULL, NULL}, .id = -1 };
94
95
/*
96
    Partial support for 64-bit POS and Number=1 INFO tags.
97
    Notes:
98
     - the support for 64-bit values is motivated by POS and INFO/END for large genomes
99
     - the use of 64-bit values does not conform to the specification
100
     - cannot output 64-bit BCF and if it does, it is not compatible with anything
101
     - experimental, use at your risk
102
*/
103
#ifdef VCF_ALLOW_INT64
104
    #define BCF_MAX_BT_INT64 (0x7fffffffffffffff)       /* INT64_MAX, for internal use only */
105
    #define BCF_MIN_BT_INT64 -9223372036854775800LL     /* INT64_MIN + 8, for internal use only */
106
#endif
107
108
440
#define BCF_IS_64BIT (1<<30)
109
110
111
// Opaque structure with auxilary data which allows to extend bcf_hdr_t without breaking ABI.
112
// Note that this preserving API and ABI requires that the first element is vdict_t struct
113
// rather than a pointer, as user programs may (and in some cases do) access the dictionary
114
// directly as (vdict_t*)hdr->dict.
115
typedef struct
116
{
117
    vdict_t dict;   // bcf_hdr_t.dict[0] vdict_t dictionary which keeps bcf_idinfo_t for BCF_HL_FLT,BCF_HL_INFO,BCF_HL_FMT
118
    hdict_t *gen;   // hdict_t dictionary which keeps bcf_hrec_t* pointers for generic and structured fields
119
    size_t *key_len;// length of h->id[BCF_DT_ID] strings
120
    int version;    //cached version
121
    uint32_t ref_count; // reference count, low bit indicates bcf_hdr_destroy() has been called
122
}
123
bcf_hdr_aux_t;
124
125
static inline bcf_hdr_aux_t *get_hdr_aux(const bcf_hdr_t *hdr)
126
297k
{
127
297k
    return (bcf_hdr_aux_t *)hdr->dict[0];
128
297k
}
129
130
//version macros
131
88.4k
#define VCF_DEF 4002000
132
31.4k
#define VCF44   4004000
133
17.9k
#define VCF45   4005000
134
135
#define VCF_MAJOR_VER(x) ( (x) / 10000 / 100 )
136
#define VCF_MINOR_VER(x) ( ((x) % 1000000) / 1000 )
137
138
/**
139
 *  bcf_get_version - get the version as int
140
 *  @param hdr   - bcf header, to get version
141
 *  @param verstr- version string, which is already available
142
 *  Returns version on success and default version on failure
143
 *  version = major * 100 * 10000 + minor * 1000
144
 */
145
static int bcf_get_version(const bcf_hdr_t *hdr, const char *verstr)
146
23.9k
{
147
23.9k
    const char *version = NULL, vcf[] = "VCFv";
148
23.9k
    char *major = NULL, *minor = NULL;
149
23.9k
    int ver = -1;
150
23.9k
    long tmp = 0;
151
23.9k
    bcf_hdr_aux_t *aux = NULL;
152
153
23.9k
    if (!hdr && !verstr) {  //invalid input
154
0
        goto fail;
155
0
    }
156
157
23.9k
    if (hdr) {
158
20.5k
        if ((aux = get_hdr_aux(hdr)) && aux->version != 0) {    //use cached version
159
20.2k
            return aux->version;
160
20.2k
        }
161
        //get from header
162
337
        version = bcf_hdr_get_version(hdr);
163
3.34k
    } else {
164
        //get from version string
165
3.34k
        version = verstr;
166
3.34k
    }
167
3.67k
    if (!(major = strstr(version, vcf))) {  //bad format
168
2.50k
        goto fail;
169
2.50k
    }
170
1.17k
    major += sizeof(vcf) - 1;
171
1.17k
    if (!(minor = strchr(major, '.'))) {    //bad format
172
284
        goto fail;
173
284
    }
174
891
    tmp = strtol(major, NULL, 10);
175
891
    if ((!tmp && errno == EINVAL) ||
176
726
        ((tmp == LONG_MIN || tmp == LONG_MAX) && errno == ERANGE)) {    //failed
177
167
        goto fail;
178
167
    }
179
724
    ver = tmp * 100 * 10000;
180
724
    tmp = strtol(++minor, NULL, 10);
181
724
    if ((!tmp && errno == EINVAL) ||
182
691
        ((tmp == LONG_MIN || tmp == LONG_MAX) && errno == ERANGE)) {    //failed
183
112
        goto fail;
184
112
    }
185
612
    ver += tmp * 1000;
186
612
    return ver;
187
188
3.06k
fail:
189
3.06k
    hts_log_warning("Couldn't get VCF version, considering as %d.%d",
190
3.06k
        VCF_MAJOR_VER(VCF_DEF), VCF_MINOR_VER(VCF_DEF));
191
3.06k
    return VCF_DEF;
192
724
}
193
194
// Header reference counting
195
196
static void bcf_hdr_incr_ref(bcf_hdr_t *h)
197
1.29k
{
198
1.29k
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
199
1.29k
    aux->ref_count += 2;
200
1.29k
}
201
202
static void bcf_hdr_decr_ref(bcf_hdr_t *h)
203
1.29k
{
204
1.29k
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
205
1.29k
    if (aux->ref_count >= 2)
206
1.29k
        aux->ref_count -= 2;
207
208
1.29k
    if (aux->ref_count == 0)
209
1.20k
        bcf_hdr_destroy(h);
210
1.29k
}
211
212
static void hdr_bgzf_private_data_cleanup(void *data)
213
1.29k
{
214
1.29k
    bcf_hdr_t *h = (bcf_hdr_t *) data;
215
1.29k
    bcf_hdr_decr_ref(h);
216
1.29k
}
217
218
static char *find_chrom_header_line(char *s)
219
0
{
220
0
    char *nl;
221
0
    if (strncmp(s, "#CHROM\t", 7) == 0) return s;
222
0
    else if ((nl = strstr(s, "\n#CHROM\t")) != NULL) return nl+1;
223
0
    else return NULL;
224
0
}
225
226
static int64_t get_rlen(const bcf_hdr_t *h, bcf1_t *v);
227
228
/*************************
229
 *** VCF header parser ***
230
 *************************/
231
232
static int bcf_hdr_add_sample_len(bcf_hdr_t *h, const char *s, size_t len)
233
4.64k
{
234
4.64k
    const char *ss = s;
235
4.79k
    while ( *ss && isspace_c(*ss) && ss - s < len) ss++;
236
4.64k
    if ( !*ss || ss - s == len)
237
2
    {
238
2
        hts_log_error("Empty sample name: trailing spaces/tabs in the header line?");
239
2
        return -1;
240
2
    }
241
242
4.64k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_SAMPLE];
243
4.64k
    int ret;
244
4.64k
    char *sdup = malloc(len + 1);
245
4.64k
    if (!sdup) return -1;
246
4.64k
    memcpy(sdup, s, len);
247
4.64k
    sdup[len] = 0;
248
249
    // Ensure space is available in h->samples
250
4.64k
    size_t n = kh_size(d);
251
4.64k
    char **new_samples = realloc(h->samples, sizeof(char*) * (n + 1));
252
4.64k
    if (!new_samples) {
253
0
        free(sdup);
254
0
        return -1;
255
0
    }
256
4.64k
    h->samples = new_samples;
257
258
4.64k
    int k = kh_put(vdict, d, sdup, &ret);
259
4.64k
    if (ret < 0) {
260
0
        free(sdup);
261
0
        return -1;
262
0
    }
263
4.64k
    if (ret) { // absent
264
4.64k
        kh_val(d, k) = bcf_idinfo_def;
265
4.64k
        kh_val(d, k).id = n;
266
4.64k
    } else {
267
0
        hts_log_error("Duplicated sample name '%s'", sdup);
268
0
        free(sdup);
269
0
        return -1;
270
0
    }
271
4.64k
    h->samples[n] = sdup;
272
4.64k
    h->dirty = 1;
273
4.64k
    return 0;
274
4.64k
}
275
276
int bcf_hdr_add_sample(bcf_hdr_t *h, const char *s)
277
0
{
278
0
    if (!s) {
279
        // Allowed for backwards-compatibility, calling with s == NULL
280
        // used to trigger bcf_hdr_sync(h);
281
0
        return 0;
282
0
    }
283
0
    return bcf_hdr_add_sample_len(h, s, strlen(s));
284
0
}
285
286
int HTS_RESULT_USED bcf_hdr_parse_sample_line(bcf_hdr_t *hdr, const char *str)
287
2.28k
{
288
2.28k
    const char *mandatory = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO";
289
2.28k
    if ( strncmp(str,mandatory,strlen(mandatory)) )
290
26
    {
291
26
        hts_log_error("Could not parse the \"#CHROM..\" line, either the fields are incorrect or spaces are present instead of tabs:\n\t%s",str);
292
26
        return -1;
293
26
    }
294
295
2.25k
    const char *beg = str + strlen(mandatory), *end;
296
2.25k
    if ( !*beg || *beg=='\n' ) return 0;
297
602
    if ( strncmp(beg,"\tFORMAT\t",8) )
298
4
    {
299
4
        hts_log_error("Could not parse the \"#CHROM..\" line, either FORMAT is missing or spaces are present instead of tabs:\n\t%s",str);
300
4
        return -1;
301
4
    }
302
598
    beg += 8;
303
304
598
    int ret = 0;
305
4.65k
    while ( *beg )
306
4.64k
    {
307
4.64k
        end = beg;
308
464M
        while ( *end && *end!='\t' && *end!='\n' ) end++;
309
4.64k
        if ( bcf_hdr_add_sample_len(hdr, beg, end-beg) < 0 ) ret = -1;
310
4.64k
        if ( !*end || *end=='\n' || ret<0 ) break;
311
4.05k
        beg = end + 1;
312
4.05k
    }
313
598
    return ret;
314
602
}
315
316
int bcf_hdr_sync(bcf_hdr_t *h)
317
64.5k
{
318
64.5k
    int i;
319
258k
    for (i = 0; i < 3; i++)
320
193k
    {
321
193k
        vdict_t *d = (vdict_t*)h->dict[i];
322
193k
        khint_t k;
323
193k
        if ( h->n[i] < kh_size(d) )
324
594
        {
325
594
            bcf_idpair_t *new_idpair;
326
            // this should be true only for i=2, BCF_DT_SAMPLE
327
594
            new_idpair = (bcf_idpair_t*) realloc(h->id[i], kh_size(d)*sizeof(bcf_idpair_t));
328
594
            if (!new_idpair) return -1;
329
594
            h->n[i] = kh_size(d);
330
594
            h->id[i] = new_idpair;
331
594
        }
332
2.24G
        for (k=kh_begin(d); k<kh_end(d); k++)
333
2.24G
        {
334
2.24G
            if (!kh_exist(d,k)) continue;
335
13.8M
            h->id[i][kh_val(d,k).id].key = kh_key(d,k);
336
13.8M
            h->id[i][kh_val(d,k).id].val = &kh_val(d,k);
337
13.8M
        }
338
193k
    }
339
340
    // Invalidate key length cache
341
64.5k
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
342
64.5k
    if (aux && aux->key_len) {
343
2.87k
        free(aux->key_len);
344
2.87k
        aux->key_len = NULL;
345
2.87k
    }
346
347
64.5k
    h->dirty = 0;
348
64.5k
    return 0;
349
64.5k
}
350
351
void bcf_hrec_destroy(bcf_hrec_t *hrec)
352
173k
{
353
173k
    if (!hrec) return;
354
164k
    free(hrec->key);
355
164k
    if ( hrec->value ) free(hrec->value);
356
164k
    int i;
357
493k
    for (i=0; i<hrec->nkeys; i++)
358
328k
    {
359
328k
        free(hrec->keys[i]);
360
328k
        free(hrec->vals[i]);
361
328k
    }
362
164k
    free(hrec->keys);
363
164k
    free(hrec->vals);
364
164k
    free(hrec);
365
164k
}
366
367
// Copies all fields except IDX.
368
bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec)
369
0
{
370
0
    int save_errno;
371
0
    bcf_hrec_t *out = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
372
0
    if (!out) return NULL;
373
374
0
    out->type = hrec->type;
375
0
    if ( hrec->key ) {
376
0
        out->key = strdup(hrec->key);
377
0
        if (!out->key) goto fail;
378
0
    }
379
0
    if ( hrec->value ) {
380
0
        out->value = strdup(hrec->value);
381
0
        if (!out->value) goto fail;
382
0
    }
383
0
    out->nkeys = hrec->nkeys;
384
0
    out->keys = (char**) malloc(sizeof(char*)*hrec->nkeys);
385
0
    if (!out->keys) goto fail;
386
0
    out->vals = (char**) malloc(sizeof(char*)*hrec->nkeys);
387
0
    if (!out->vals) goto fail;
388
0
    int i, j = 0;
389
0
    for (i=0; i<hrec->nkeys; i++)
390
0
    {
391
0
        if ( hrec->keys[i] && !strcmp("IDX",hrec->keys[i]) ) continue;
392
0
        if ( hrec->keys[i] ) {
393
0
            out->keys[j] = strdup(hrec->keys[i]);
394
0
            if (!out->keys[j]) goto fail;
395
0
        }
396
0
        if ( hrec->vals[i] ) {
397
0
            out->vals[j] = strdup(hrec->vals[i]);
398
0
            if (!out->vals[j]) goto fail;
399
0
        }
400
0
        j++;
401
0
    }
402
0
    if ( i!=j ) out->nkeys -= i-j;   // IDX was omitted
403
0
    return out;
404
405
0
 fail:
406
0
    save_errno = errno;
407
0
    hts_log_error("%s", strerror(errno));
408
0
    bcf_hrec_destroy(out);
409
0
    errno = save_errno;
410
0
    return NULL;
411
0
}
412
413
void bcf_hrec_debug(FILE *fp, bcf_hrec_t *hrec)
414
0
{
415
0
    fprintf(fp, "key=[%s] value=[%s]", hrec->key, hrec->value?hrec->value:"");
416
0
    int i;
417
0
    for (i=0; i<hrec->nkeys; i++)
418
0
        fprintf(fp, "\t[%s]=[%s]", hrec->keys[i],hrec->vals[i]);
419
0
    fprintf(fp, "\n");
420
0
}
421
422
void bcf_header_debug(bcf_hdr_t *hdr)
423
0
{
424
0
    int i, j;
425
0
    for (i=0; i<hdr->nhrec; i++)
426
0
    {
427
0
        if ( !hdr->hrec[i]->value )
428
0
        {
429
0
            fprintf(stderr, "##%s=<", hdr->hrec[i]->key);
430
0
            fprintf(stderr,"%s=%s", hdr->hrec[i]->keys[0], hdr->hrec[i]->vals[0]);
431
0
            for (j=1; j<hdr->hrec[i]->nkeys; j++)
432
0
                fprintf(stderr,",%s=%s", hdr->hrec[i]->keys[j], hdr->hrec[i]->vals[j]);
433
0
            fprintf(stderr,">\n");
434
0
        }
435
0
        else
436
0
            fprintf(stderr,"##%s=%s\n", hdr->hrec[i]->key,hdr->hrec[i]->value);
437
0
    }
438
0
}
439
440
int bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, size_t len)
441
261k
{
442
261k
    char **tmp;
443
261k
    size_t n = hrec->nkeys + 1;
444
261k
    assert(len > 0 && len < SIZE_MAX);
445
261k
    tmp = realloc(hrec->keys, sizeof(char*)*n);
446
261k
    if (!tmp) return -1;
447
261k
    hrec->keys = tmp;
448
261k
    tmp = realloc(hrec->vals, sizeof(char*)*n);
449
261k
    if (!tmp) return -1;
450
261k
    hrec->vals = tmp;
451
452
261k
    hrec->keys[hrec->nkeys] = (char*) malloc((len+1)*sizeof(char));
453
261k
    if (!hrec->keys[hrec->nkeys]) return -1;
454
261k
    memcpy(hrec->keys[hrec->nkeys],str,len);
455
261k
    hrec->keys[hrec->nkeys][len] = 0;
456
261k
    hrec->vals[hrec->nkeys] = NULL;
457
261k
    hrec->nkeys = n;
458
261k
    return 0;
459
261k
}
460
461
int bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, size_t len, int is_quoted)
462
261k
{
463
261k
    if ( hrec->vals[i] ) {
464
0
        free(hrec->vals[i]);
465
0
        hrec->vals[i] = NULL;
466
0
    }
467
261k
    if ( !str ) return 0;
468
261k
    if ( is_quoted )
469
66.7k
    {
470
66.7k
        if (len >= SIZE_MAX - 3) {
471
0
            errno = ENOMEM;
472
0
            return -1;
473
0
        }
474
66.7k
        hrec->vals[i] = (char*) malloc((len+3)*sizeof(char));
475
66.7k
        if (!hrec->vals[i]) return -1;
476
66.7k
        hrec->vals[i][0] = '"';
477
66.7k
        memcpy(&hrec->vals[i][1],str,len);
478
66.7k
        hrec->vals[i][len+1] = '"';
479
66.7k
        hrec->vals[i][len+2] = 0;
480
66.7k
    }
481
194k
    else
482
194k
    {
483
194k
        if (len == SIZE_MAX) {
484
0
            errno = ENOMEM;
485
0
            return -1;
486
0
        }
487
194k
        hrec->vals[i] = (char*) malloc((len+1)*sizeof(char));
488
194k
        if (!hrec->vals[i]) return -1;
489
194k
        memcpy(hrec->vals[i],str,len);
490
194k
        hrec->vals[i][len] = 0;
491
194k
    }
492
261k
    return 0;
493
261k
}
494
495
int hrec_add_idx(bcf_hrec_t *hrec, int idx)
496
67.0k
{
497
67.0k
    int n = hrec->nkeys + 1;
498
67.0k
    char **tmp = (char**) realloc(hrec->keys, sizeof(char*)*n);
499
67.0k
    if (!tmp) return -1;
500
67.0k
    hrec->keys = tmp;
501
502
67.0k
    tmp = (char**) realloc(hrec->vals, sizeof(char*)*n);
503
67.0k
    if (!tmp) return -1;
504
67.0k
    hrec->vals = tmp;
505
506
67.0k
    hrec->keys[hrec->nkeys] = strdup("IDX");
507
67.0k
    if (!hrec->keys[hrec->nkeys]) return -1;
508
509
67.0k
    kstring_t str = {0,0,0};
510
67.0k
    if (kputw(idx, &str) < 0) {
511
0
        free(hrec->keys[hrec->nkeys]);
512
0
        return -1;
513
0
    }
514
67.0k
    hrec->vals[hrec->nkeys] = str.s;
515
67.0k
    hrec->nkeys = n;
516
67.0k
    return 0;
517
67.0k
}
518
519
int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key)
520
117k
{
521
117k
    int i;
522
190k
    for (i=0; i<hrec->nkeys; i++)
523
139k
        if ( !strcasecmp(key,hrec->keys[i]) ) return i;
524
51.5k
    return -1;
525
117k
}
526
527
static void bcf_hrec_set_type(bcf_hrec_t *hrec)
528
302k
{
529
302k
    if ( !strcmp(hrec->key, "contig") ) hrec->type = BCF_HL_CTG;
530
277k
    else if ( !strcmp(hrec->key, "INFO") ) hrec->type = BCF_HL_INFO;
531
176k
    else if ( !strcmp(hrec->key, "FILTER") ) hrec->type = BCF_HL_FLT;
532
114k
    else if ( !strcmp(hrec->key, "FORMAT") ) hrec->type = BCF_HL_FMT;
533
84.4k
    else if ( hrec->nkeys>0 ) hrec->type = BCF_HL_STR;
534
63.0k
    else hrec->type = BCF_HL_GEN;
535
302k
}
536
537
538
/**
539
    The arrays were generated with
540
541
    valid_ctg:
542
        perl -le '@v = (split(//,q[!#$%&*+./:;=?@^_|~-]),"a"..."z","A"..."Z","0"..."9"); @a = (0) x 256; foreach $c (@v) { $a[ord($c)] = 1; } print join(", ",@a)' | fold -w 48
543
544
    valid_tag:
545
        perl -le '@v = (split(//,q[_.]),"a"..."z","A"..."Z","0"..."9"); @a = (0) x 256; foreach $c (@v) { $a[ord($c)] = 1; } print join(", ",@a)' | fold -w 48
546
*/
547
static const uint8_t valid_ctg[256] =
548
{
549
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
550
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
551
    0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1,
552
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
553
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
554
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
555
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
556
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,
557
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
558
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
559
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
560
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
561
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
562
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
563
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
564
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
565
};
566
static const uint8_t valid_tag[256] =
567
{
568
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
569
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
570
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
571
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
572
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
573
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
574
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
575
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
576
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
577
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
578
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
579
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
580
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
581
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
582
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
583
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
584
};
585
586
/**
587
    bcf_hrec_check() - check the validity of structured header lines
588
589
    Returns 0 on success or negative value on error.
590
591
    Currently the return status is not checked by the caller
592
    and only a warning is printed on stderr. This should be improved
593
    to propagate the error all the way up to the caller and let it
594
    decide what to do: throw an error or proceed anyway.
595
 */
596
static int bcf_hrec_check(bcf_hrec_t *hrec)
597
151k
{
598
151k
    int i;
599
151k
    bcf_hrec_set_type(hrec);
600
601
151k
    if ( hrec->type==BCF_HL_CTG )
602
12.2k
    {
603
12.2k
        i = bcf_hrec_find_key(hrec,"ID");
604
12.2k
        if ( i<0 ) goto err_missing_id;
605
6.15k
        char *val = hrec->vals[i];
606
6.15k
        if ( val[0]=='*' || val[0]=='=' || !valid_ctg[(uint8_t)val[0]] ) goto err_invalid_ctg;
607
62.4k
        while ( *(++val) )
608
61.7k
            if ( !valid_ctg[(uint8_t)*val] ) goto err_invalid_ctg;
609
702
        return 0;
610
1.18k
    }
611
138k
    if ( hrec->type==BCF_HL_INFO )
612
50.4k
    {
613
50.4k
        i = bcf_hrec_find_key(hrec,"ID");
614
50.4k
        if ( i<0 ) goto err_missing_id;
615
32.9k
        char *val = hrec->vals[i];
616
32.9k
        if ( !strcmp(val,"1000G") ) return 0;
617
32.9k
        if ( val[0]=='.' || (val[0]>='0' && val[0]<='9') || !valid_tag[(uint8_t)val[0]] ) goto err_invalid_tag;
618
14.8k
        while ( *(++val) )
619
12.0k
            if ( !valid_tag[(uint8_t)*val] ) goto err_invalid_tag;
620
2.84k
        return 0;
621
3.88k
    }
622
88.4k
    if ( hrec->type==BCF_HL_FMT )
623
14.9k
    {
624
14.9k
        i = bcf_hrec_find_key(hrec,"ID");
625
14.9k
        if ( i<0 ) goto err_missing_id;
626
10.1k
        char *val = hrec->vals[i];
627
10.1k
        if ( val[0]=='.' || (val[0]>='0' && val[0]<='9') || !valid_tag[(uint8_t)val[0]] ) goto err_invalid_tag;
628
17.4k
        while ( *(++val) )
629
15.0k
            if ( !valid_tag[(uint8_t)*val] ) goto err_invalid_tag;
630
2.39k
        return 0;
631
7.47k
    }
632
73.5k
    return 0;
633
634
28.3k
  err_missing_id:
635
28.3k
    hts_log_warning("Missing ID attribute in one or more header lines");
636
28.3k
    return -1;
637
638
5.45k
  err_invalid_ctg:
639
5.45k
    hts_log_warning("Invalid contig name: \"%s\"", hrec->vals[i]);
640
5.45k
    return -1;
641
642
37.7k
  err_invalid_tag:
643
37.7k
    hts_log_warning("Invalid tag name: \"%s\"", hrec->vals[i]);
644
37.7k
    return -1;
645
88.4k
}
646
647
static inline int is_escaped(const char *min, const char *str)
648
65.9k
{
649
65.9k
    int n = 0;
650
65.9k
    while ( --str>=min && *str=='\\' ) n++;
651
65.9k
    return n%2;
652
65.9k
}
653
654
bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len)
655
176k
{
656
176k
    bcf_hrec_t *hrec = NULL;
657
176k
    const char *p = line;
658
176k
    if (p[0] != '#' || p[1] != '#') { *len = 0; return NULL; }
659
173k
    p += 2;
660
661
173k
    const char *q = p;
662
1.27M
    while ( *q && *q!='=' && *q != '\n' ) q++;
663
173k
    ptrdiff_t n = q-p;
664
173k
    if ( *q!='=' || !n ) // wrong format
665
8.99k
        goto malformed_line;
666
667
164k
    hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
668
164k
    if (!hrec) { *len = -1; return NULL; }
669
164k
    hrec->key = (char*) malloc(sizeof(char)*(n+1));
670
164k
    if (!hrec->key) goto fail;
671
164k
    memcpy(hrec->key,p,n);
672
164k
    hrec->key[n] = 0;
673
164k
    hrec->type = -1;
674
675
164k
    p = ++q;
676
164k
    if ( *p!='<' ) // generic field, e.g. ##samtoolsVersion=0.1.18-r579
677
38.9k
    {
678
6.57M
        while ( *q && *q!='\n' ) q++;
679
38.9k
        hrec->value = (char*) malloc((q-p+1)*sizeof(char));
680
38.9k
        if (!hrec->value) goto fail;
681
38.9k
        memcpy(hrec->value, p, q-p);
682
38.9k
        hrec->value[q-p] = 0;
683
38.9k
        *len = q - line + (*q ? 1 : 0); // Skip \n but not \0
684
38.9k
        return hrec;
685
38.9k
    }
686
687
    // structured line, e.g.
688
    // ##INFO=<ID=PV1,Number=1,Type=Float,Description="P-value for baseQ bias">
689
    // ##PEDIGREE=<Name_0=G0-ID,Name_1=G1-ID,Name_3=GN-ID>
690
125k
    int nopen = 1;
691
387k
    while ( *q && *q!='\n' && nopen>0 )
692
274k
    {
693
274k
        p = ++q;
694
275k
        while ( *q && *q==' ' ) { p++; q++; }
695
        // ^[A-Za-z_][0-9A-Za-z_.]*$
696
274k
        if (p==q && *q && (isalpha_c(*q) || *q=='_'))
697
271k
        {
698
271k
            q++;
699
1.41M
            while ( *q && (isalnum_c(*q) || *q=='_' || *q=='.') ) q++;
700
271k
        }
701
274k
        n = q-p;
702
274k
        int m = 0;
703
275k
        while ( *q && *q==' ' ) { q++; m++; }
704
274k
        if ( *q!='=' || !n )
705
13.4k
            goto malformed_line;
706
707
261k
        if (bcf_hrec_add_key(hrec, p, q-p-m) < 0) goto fail;
708
261k
        p = ++q;
709
264k
        while ( *q && *q==' ' ) { p++; q++; }
710
711
261k
        int quoted = 0;
712
261k
        char ending = '\0';
713
261k
        switch (*p) {
714
66.7k
        case '"':
715
66.7k
            quoted = 1;
716
66.7k
            ending = '"';
717
66.7k
            p++;
718
66.7k
            break;
719
14
        case '[':
720
14
            quoted = 1;
721
14
            ending = ']';
722
14
            break;
723
261k
        }
724
261k
        if ( quoted ) q++;
725
224M
        while ( *q && *q != '\n' )
726
224M
        {
727
224M
            if ( quoted ) { if ( *q==ending && !is_escaped(p,q) ) break; }
728
223M
            else
729
223M
            {
730
223M
                if ( *q=='<' ) nopen++;
731
223M
                if ( *q=='>' ) nopen--;
732
223M
                if ( !nopen ) break;
733
223M
                if ( *q==',' && nopen==1 ) break;
734
223M
            }
735
223M
            q++;
736
223M
        }
737
261k
        const char *r = q;
738
261k
        if (quoted && ending == ']') {
739
14
            if (*q == ending) {
740
1
                r++;
741
1
                q++;
742
1
                quoted = 0;
743
13
            } else {
744
13
                char buffer[320];
745
13
                hts_log_error("Missing ']' in header line %s",
746
13
                              hts_strprint(buffer, sizeof(buffer), '"',
747
13
                                           line, q-line));
748
13
                goto fail;
749
13
            }
750
14
        }
751
261k
        while ( r > p && r[-1] == ' ' ) r--;
752
261k
        if (bcf_hrec_set_val(hrec, hrec->nkeys-1, p, r-p, quoted) < 0)
753
0
            goto fail;
754
261k
        if ( quoted && *q==ending ) q++;
755
261k
        if ( *q=='>' )
756
69.6k
        {
757
69.6k
            if (nopen) nopen--;     // this can happen with nested angle brackets <>
758
69.6k
            q++;
759
69.6k
        }
760
261k
    }
761
112k
    if ( nopen )
762
42.5k
        hts_log_warning("Incomplete header line, trying to proceed anyway:\n\t[%s]\n\t[%d]",line,q[0]);
763
764
    // Skip to end of line
765
112k
    int nonspace = 0;
766
112k
    p = q;
767
998k
    while ( *q && *q!='\n' ) { nonspace |= !isspace_c(*q); q++; }
768
112k
    if (nonspace) {
769
1.09k
        char buffer[320];
770
1.09k
        hts_log_warning("Dropped trailing junk from header line '%s'",
771
1.09k
                        hts_strprint(buffer, sizeof(buffer),
772
1.09k
                                     '"', line, q - line));
773
1.09k
    }
774
775
112k
    *len = q - line + (*q ? 1 : 0);
776
112k
    return hrec;
777
778
13
 fail:
779
13
    *len = -1;
780
13
    bcf_hrec_destroy(hrec);
781
13
    return NULL;
782
783
22.4k
 malformed_line:
784
22.4k
    {
785
22.4k
        char buffer[320];
786
15.9M
        while ( *q && *q!='\n' ) q++;  // Ensure *len includes full line
787
22.4k
        hts_log_error("Could not parse the header line: %s",
788
22.4k
                      hts_strprint(buffer, sizeof(buffer),
789
22.4k
                                   '"', line, q - line));
790
22.4k
        *len = q - line + (*q ? 1 : 0);
791
22.4k
        bcf_hrec_destroy(hrec);
792
22.4k
        return NULL;
793
125k
    }
794
125k
}
795
796
static int bcf_hdr_set_idx(bcf_hdr_t *hdr, int dict_type, const char *tag, bcf_idinfo_t *idinfo)
797
66.0k
{
798
66.0k
    size_t new_n;
799
800
    // If available, preserve existing IDX
801
66.0k
    if ( idinfo->id==-1 )
802
65.6k
        idinfo->id = hdr->n[dict_type];
803
336
    else if ( idinfo->id < hdr->n[dict_type] && hdr->id[dict_type][idinfo->id].key )
804
4
    {
805
4
        hts_log_error("Conflicting IDX=%d lines in the header dictionary, the new tag is %s",
806
4
            idinfo->id, tag);
807
4
        errno = EINVAL;
808
4
        return -1;
809
4
    }
810
811
66.0k
    new_n = idinfo->id >= hdr->n[dict_type] ? idinfo->id+1 : hdr->n[dict_type];
812
66.0k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
813
    // hts_resize() can attempt to allocate up to 2 * requested items
814
66.0k
    if (new_n > FUZZ_ALLOC_LIMIT/(2 * sizeof(bcf_idpair_t)))
815
4
        return -1;
816
65.9k
#endif
817
65.9k
    if (hts_resize(bcf_idpair_t, new_n, &hdr->m[dict_type],
818
65.9k
                   &hdr->id[dict_type], HTS_RESIZE_CLEAR)) {
819
0
        return -1;
820
0
    }
821
65.9k
    hdr->n[dict_type] = new_n;
822
823
    // NB: the next kh_put call can invalidate the idinfo pointer, therefore
824
    // we leave it unassigned here. It must be set explicitly in bcf_hdr_sync.
825
65.9k
    hdr->id[dict_type][idinfo->id].key = tag;
826
827
65.9k
    return 0;
828
65.9k
}
829
830
// returns: 1 when hdr needs to be synced, -1 on error, 0 otherwise
831
static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
832
151k
{
833
    // contig
834
151k
    int i, ret, replacing = 0;
835
151k
    khint_t k;
836
151k
    char *str = NULL;
837
838
151k
    bcf_hrec_set_type(hrec);
839
840
151k
    if ( hrec->type==BCF_HL_CTG )
841
12.2k
    {
842
12.2k
        hts_pos_t len = 0;
843
844
        // Get the contig ID ($str) and length ($j)
845
12.2k
        i = bcf_hrec_find_key(hrec,"length");
846
12.2k
        if ( i<0 ) len = 0;
847
2.56k
        else {
848
2.56k
            char *end = hrec->vals[i];
849
2.56k
            len = strtoll(hrec->vals[i], &end, 10);
850
2.56k
            if (end == hrec->vals[i] || len < 0) return 0;
851
2.56k
        }
852
853
11.3k
        i = bcf_hrec_find_key(hrec,"ID");
854
11.3k
        if ( i<0 ) return 0;
855
6.15k
        str = strdup(hrec->vals[i]);
856
6.15k
        if (!str) return -1;
857
858
        // Register in the dictionary
859
6.15k
        vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_CTG];
860
6.15k
        khint_t k = kh_get(vdict, d, str);
861
6.15k
        if ( k != kh_end(d) ) { // already present
862
866
            free(str); str=NULL;
863
866
            if (kh_val(d, k).hrec[0] != NULL) // and not removed
864
866
                return 0;
865
0
            replacing = 1;
866
5.29k
        } else {
867
5.29k
            k = kh_put(vdict, d, str, &ret);
868
5.29k
            if (ret < 0) { free(str); return -1; }
869
5.29k
        }
870
871
5.29k
        int idx = bcf_hrec_find_key(hrec,"IDX");
872
5.29k
        if ( idx!=-1 )
873
2.52k
        {
874
2.52k
            char *tmp = hrec->vals[idx];
875
2.52k
            idx = strtol(hrec->vals[idx], &tmp, 10);
876
2.52k
            if ( *tmp || idx < 0 || idx >= INT_MAX - 1)
877
2.50k
            {
878
2.50k
                if (!replacing) {
879
2.50k
                    kh_del(vdict, d, k);
880
2.50k
                    free(str);
881
2.50k
                }
882
2.50k
                hts_log_warning("Error parsing the IDX tag, skipping");
883
2.50k
                return 0;
884
2.50k
            }
885
2.52k
        }
886
887
2.78k
        kh_val(d, k) = bcf_idinfo_def;
888
2.78k
        kh_val(d, k).id = idx;
889
2.78k
        kh_val(d, k).info[0] = len;
890
2.78k
        kh_val(d, k).hrec[0] = hrec;
891
2.78k
        if (bcf_hdr_set_idx(hdr, BCF_DT_CTG, kh_key(d,k), &kh_val(d,k)) < 0) {
892
4
            if (!replacing) {
893
4
                kh_del(vdict, d, k);
894
4
                free(str);
895
4
            }
896
4
            return -1;
897
4
        }
898
2.78k
        if ( idx==-1 ) {
899
2.77k
            if (hrec_add_idx(hrec, kh_val(d,k).id) < 0) {
900
0
               return -1;
901
0
            }
902
2.77k
        }
903
904
2.78k
        return 1;
905
2.78k
    }
906
907
138k
    if ( hrec->type==BCF_HL_STR ) return 1;
908
128k
    if ( hrec->type!=BCF_HL_INFO && hrec->type!=BCF_HL_FLT && hrec->type!=BCF_HL_FMT ) return 0;
909
910
    // INFO/FILTER/FORMAT
911
96.6k
    char *id = NULL;
912
96.6k
    uint32_t type = UINT32_MAX, var = UINT32_MAX;
913
96.6k
    int num = -1, idx = -1;
914
322k
    for (i=0; i<hrec->nkeys; i++)
915
226k
    {
916
226k
        if ( !strcmp(hrec->keys[i], "ID") ) id = hrec->vals[i];
917
152k
        else if ( !strcmp(hrec->keys[i], "IDX") )
918
2.71k
        {
919
2.71k
            char *tmp = hrec->vals[i];
920
2.71k
            idx = strtol(hrec->vals[i], &tmp, 10);
921
2.71k
            if ( *tmp || idx < 0 || idx >= INT_MAX - 1)
922
666
            {
923
666
                hts_log_warning("Error parsing the IDX tag, skipping");
924
666
                return 0;
925
666
            }
926
2.71k
        }
927
149k
        else if ( !strcmp(hrec->keys[i], "Type") )
928
42.7k
        {
929
42.7k
            if ( !strcmp(hrec->vals[i], "Integer") ) type = BCF_HT_INT;
930
41.2k
            else if ( !strcmp(hrec->vals[i], "Float") ) type = BCF_HT_REAL;
931
40.2k
            else if ( !strcmp(hrec->vals[i], "String") ) type = BCF_HT_STR;
932
9.37k
            else if ( !strcmp(hrec->vals[i], "Character") ) type = BCF_HT_STR;
933
8.90k
            else if ( !strcmp(hrec->vals[i], "Flag") ) type = BCF_HT_FLAG;
934
7.31k
            else
935
7.31k
            {
936
7.31k
                hts_log_warning("The type \"%s\" is not supported, assuming \"String\"", hrec->vals[i]);
937
7.31k
                type = BCF_HT_STR;
938
7.31k
            }
939
42.7k
        }
940
107k
        else if ( !strcmp(hrec->keys[i], "Number") )
941
35.6k
        {
942
35.6k
            int is_fmt = hrec->type == BCF_HL_FMT;
943
35.6k
            if ( !strcmp(hrec->vals[i],"A") ) var = BCF_VL_A;
944
34.0k
            else if ( !strcmp(hrec->vals[i],"R") ) var = BCF_VL_R;
945
33.9k
            else if ( !strcmp(hrec->vals[i],"G") ) var = BCF_VL_G;
946
32.7k
            else if ( !strcmp(hrec->vals[i],".") ) var = BCF_VL_VAR;
947
32.7k
            else if ( is_fmt && !strcmp(hrec->vals[i],"P") )  var = BCF_VL_P;
948
32.7k
            else if ( is_fmt && !strcmp(hrec->vals[i],"LA") ) var = BCF_VL_LA;
949
32.7k
            else if ( is_fmt && !strcmp(hrec->vals[i],"LR") ) var = BCF_VL_LR;
950
32.7k
            else if ( is_fmt && !strcmp(hrec->vals[i],"LG") ) var = BCF_VL_LG;
951
32.7k
            else if ( is_fmt && !strcmp(hrec->vals[i],"M") )  var = BCF_VL_M;
952
32.7k
            else
953
32.7k
            {
954
32.7k
                if (sscanf(hrec->vals[i],"%d",&num) == 1)
955
32.2k
                    var = BCF_VL_FIXED;
956
32.7k
            }
957
35.6k
            if (var != BCF_VL_FIXED) num = 0xfffff;
958
35.6k
        }
959
226k
    }
960
95.9k
    if (hrec->type == BCF_HL_INFO || hrec->type == BCF_HL_FMT) {
961
64.7k
        if (type == -1) {
962
23.2k
            hts_log_warning("%s %s field has no Type defined. Assuming String",
963
23.2k
                *hrec->key == 'I' ? "An" : "A", hrec->key);
964
23.2k
            type = BCF_HT_STR;
965
23.2k
        }
966
64.7k
        if (var == UINT32_MAX) {
967
29.5k
            hts_log_warning("%s %s field has no Number defined. Assuming '.'",
968
29.5k
                *hrec->key == 'I' ? "An" : "A", hrec->key);
969
29.5k
            var = BCF_VL_VAR;
970
29.5k
        }
971
64.7k
        if ( type==BCF_HT_FLAG && (var!=BCF_VL_FIXED || num!=0) )
972
1.15k
        {
973
1.15k
            hts_log_warning("The definition of Flag \"%s/%s\" is invalid, forcing Number=0", hrec->key,id);
974
1.15k
            var = BCF_VL_FIXED;
975
1.15k
            num = 0;
976
1.15k
        }
977
64.7k
    }
978
95.9k
    uint32_t info = ((((uint32_t)num) & 0xfffff)<<12 |
979
95.9k
                     (var & 0xf) << 8 |
980
95.9k
                     (type & 0xf) << 4 |
981
95.9k
                     (((uint32_t) hrec->type) & 0xf));
982
983
95.9k
    if ( !id ) return 0;
984
74.0k
    str = strdup(id);
985
74.0k
    if (!str) return -1;
986
987
74.0k
    vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_ID];
988
74.0k
    k = kh_get(vdict, d, str);
989
74.0k
    if ( k != kh_end(d) )
990
10.7k
    {
991
        // already present
992
10.7k
        free(str);
993
10.7k
        if ( kh_val(d, k).hrec[info&0xf] ) return 0;
994
1.41k
        kh_val(d, k).info[info&0xf] = info;
995
1.41k
        kh_val(d, k).hrec[info&0xf] = hrec;
996
1.41k
        if ( idx==-1 ) {
997
1.41k
            if (hrec_add_idx(hrec, kh_val(d, k).id) < 0) {
998
0
                return -1;
999
0
            }
1000
1.41k
        }
1001
1.41k
        return 1;
1002
1.41k
    }
1003
63.2k
    k = kh_put(vdict, d, str, &ret);
1004
63.2k
    if (ret < 0) {
1005
0
        free(str);
1006
0
        return -1;
1007
0
    }
1008
63.2k
    kh_val(d, k) = bcf_idinfo_def;
1009
63.2k
    kh_val(d, k).info[info&0xf] = info;
1010
63.2k
    kh_val(d, k).hrec[info&0xf] = hrec;
1011
63.2k
    kh_val(d, k).id = idx;
1012
63.2k
    if (bcf_hdr_set_idx(hdr, BCF_DT_ID, kh_key(d,k), &kh_val(d,k)) < 0) {
1013
4
        kh_del(vdict, d, k);
1014
4
        free(str);
1015
4
        return -1;
1016
4
    }
1017
63.2k
    if ( idx==-1 ) {
1018
62.9k
        if (hrec_add_idx(hrec, kh_val(d,k).id) < 0) {
1019
0
            return -1;
1020
0
        }
1021
62.9k
    }
1022
1023
63.2k
    return 1;
1024
63.2k
}
1025
1026
static void bcf_hdr_unregister_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
1027
0
{
1028
0
    if (hrec->type == BCF_HL_FLT ||
1029
0
        hrec->type == BCF_HL_INFO ||
1030
0
        hrec->type == BCF_HL_FMT ||
1031
0
        hrec->type == BCF_HL_CTG) {
1032
0
        int id = bcf_hrec_find_key(hrec, "ID");
1033
0
        if (id < 0 || !hrec->vals[id])
1034
0
            return;
1035
0
        vdict_t *dict = (hrec->type == BCF_HL_CTG
1036
0
                         ? (vdict_t*)hdr->dict[BCF_DT_CTG]
1037
0
                         : (vdict_t*)hdr->dict[BCF_DT_ID]);
1038
0
        khint_t k = kh_get(vdict, dict, hrec->vals[id]);
1039
0
        if (k != kh_end(dict))
1040
0
            kh_val(dict, k).hrec[hrec->type==BCF_HL_CTG ? 0 : hrec->type] = NULL;
1041
0
    }
1042
0
}
1043
1044
static void bcf_hdr_remove_from_hdict(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
1045
0
{
1046
0
    kstring_t str = KS_INITIALIZE;
1047
0
    bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1048
0
    khint_t k;
1049
0
    int id;
1050
1051
0
    switch (hrec->type) {
1052
0
    case BCF_HL_GEN:
1053
0
        if (ksprintf(&str, "##%s=%s", hrec->key,hrec->value) < 0)
1054
0
            str.l = 0;
1055
0
        break;
1056
0
    case BCF_HL_STR:
1057
0
        id = bcf_hrec_find_key(hrec, "ID");
1058
0
        if (id < 0)
1059
0
            return;
1060
0
        if (!hrec->vals[id] ||
1061
0
            ksprintf(&str, "##%s=<ID=%s>", hrec->key, hrec->vals[id]) < 0)
1062
0
            str.l = 0;
1063
0
        break;
1064
0
    default:
1065
0
        return;
1066
0
    }
1067
0
    if (str.l) {
1068
0
        k = kh_get(hdict, aux->gen, str.s);
1069
0
    } else {
1070
        // Couldn't get a string for some reason, so try the hard way...
1071
0
        for (k = kh_begin(aux->gen); k < kh_end(aux->gen); k++) {
1072
0
            if (kh_exist(aux->gen, k) && kh_val(aux->gen, k) == hrec)
1073
0
                break;
1074
0
        }
1075
0
    }
1076
0
    if (k != kh_end(aux->gen) && kh_val(aux->gen, k) == hrec) {
1077
0
        kh_val(aux->gen, k) = NULL;
1078
0
        free((char *) kh_key(aux->gen, k));
1079
0
        kh_key(aux->gen, k) = NULL;
1080
0
        kh_del(hdict, aux->gen, k);
1081
0
    }
1082
0
    free(str.s);
1083
0
}
1084
1085
int bcf_hdr_update_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec, const bcf_hrec_t *tmp)
1086
0
{
1087
0
    assert( hrec->type==BCF_HL_GEN );
1088
0
    int ret;
1089
0
    khint_t k;
1090
0
    bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1091
0
    for (k=kh_begin(aux->gen); k<kh_end(aux->gen); k++)
1092
0
    {
1093
0
        if ( !kh_exist(aux->gen,k) ) continue;
1094
0
        if ( hrec!=(bcf_hrec_t*)kh_val(aux->gen,k) ) continue;
1095
0
        break;
1096
0
    }
1097
0
    assert( k<kh_end(aux->gen) );   // something went wrong, should never happen
1098
0
    free((char*)kh_key(aux->gen,k));
1099
0
    kh_del(hdict,aux->gen,k);
1100
0
    kstring_t str = {0,0,0};
1101
0
    if ( ksprintf(&str, "##%s=%s", tmp->key,tmp->value) < 0 )
1102
0
    {
1103
0
        free(str.s);
1104
0
        return -1;
1105
0
    }
1106
0
    k = kh_put(hdict, aux->gen, str.s, &ret);
1107
0
    if ( ret<0 )
1108
0
    {
1109
0
        free(str.s);
1110
0
        return -1;
1111
0
    }
1112
0
    free(hrec->value);
1113
0
    hrec->value = strdup(tmp->value);
1114
0
    if ( !hrec->value ) return -1;
1115
0
    kh_val(aux->gen,k) = hrec;
1116
1117
0
    if (!strcmp(hrec->key,"fileformat")) {
1118
        //update version
1119
0
        get_hdr_aux(hdr)->version = bcf_get_version(NULL, hrec->value);
1120
0
    }
1121
0
    return 0;
1122
0
}
1123
1124
int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
1125
151k
{
1126
151k
    kstring_t str = {0,0,0};
1127
151k
    bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1128
1129
151k
    int res;
1130
151k
    if ( !hrec ) return 0;
1131
1132
151k
    bcf_hrec_check(hrec);   // todo: check return status and propagate errors up
1133
1134
151k
    res = bcf_hdr_register_hrec(hdr,hrec);
1135
151k
    if (res < 0) return -1;
1136
151k
    if ( !res )
1137
73.0k
    {
1138
        // If one of the hashed field, then it is already present
1139
73.0k
        if ( hrec->type != BCF_HL_GEN )
1140
41.5k
        {
1141
41.5k
            bcf_hrec_destroy(hrec);
1142
41.5k
            return 0;
1143
41.5k
        }
1144
        // Is one of the generic fields and already present?
1145
31.5k
        if ( ksprintf(&str, "##%s=%s", hrec->key,hrec->value) < 0 )
1146
0
        {
1147
0
            free(str.s);
1148
0
            return -1;
1149
0
        }
1150
31.5k
        khint_t k = kh_get(hdict, aux->gen, str.s);
1151
31.5k
        if ( k != kh_end(aux->gen) )
1152
20.7k
        {
1153
            // duplicate record
1154
20.7k
            bcf_hrec_destroy(hrec);
1155
20.7k
            free(str.s);
1156
20.7k
            return 0;
1157
20.7k
        }
1158
10.7k
        if (!strcmp(hrec->key, "fileformat")) {
1159
3.34k
            aux->version = bcf_get_version(NULL, hrec->value);
1160
3.34k
        }
1161
10.7k
    }
1162
1163
88.8k
    int i;
1164
88.8k
    if ( hrec->type==BCF_HL_STR && (i=bcf_hrec_find_key(hrec,"ID"))>=0 )
1165
5.14k
    {
1166
5.14k
        if ( ksprintf(&str, "##%s=<ID=%s>", hrec->key,hrec->vals[i]) < 0 )
1167
0
        {
1168
0
            free(str.s);
1169
0
            return -1;
1170
0
        }
1171
5.14k
        khint_t k = kh_get(hdict, aux->gen, str.s);
1172
5.14k
        if ( k != kh_end(aux->gen) )
1173
4.18k
        {
1174
            // duplicate record
1175
4.18k
            bcf_hrec_destroy(hrec);
1176
4.18k
            free(str.s);
1177
4.18k
            return 0;
1178
4.18k
        }
1179
5.14k
    }
1180
1181
    // New record, needs to be added
1182
84.6k
    int n = hdr->nhrec + 1;
1183
84.6k
    bcf_hrec_t **new_hrec = realloc(hdr->hrec, n*sizeof(bcf_hrec_t*));
1184
84.6k
    if (!new_hrec) {
1185
0
        free(str.s);
1186
0
        bcf_hdr_unregister_hrec(hdr, hrec);
1187
0
        return -1;
1188
0
    }
1189
84.6k
    hdr->hrec = new_hrec;
1190
1191
84.6k
    if ( str.s )
1192
11.6k
    {
1193
11.6k
        khint_t k = kh_put(hdict, aux->gen, str.s, &res);
1194
11.6k
        if ( res<0 )
1195
0
        {
1196
0
            free(str.s);
1197
0
            return -1;
1198
0
        }
1199
11.6k
        kh_val(aux->gen,k) = hrec;
1200
11.6k
    }
1201
1202
84.6k
    hdr->hrec[hdr->nhrec] = hrec;
1203
84.6k
    hdr->dirty = 1;
1204
84.6k
    hdr->nhrec = n;
1205
1206
84.6k
    return hrec->type==BCF_HL_GEN ? 0 : 1;
1207
84.6k
}
1208
1209
bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class)
1210
337
{
1211
337
    int i;
1212
337
    if ( type==BCF_HL_GEN )
1213
337
    {
1214
        // e.g. ##fileformat=VCFv4.2
1215
        //      ##source=GenomicsDBImport
1216
        //      ##bcftools_viewVersion=1.16-80-gdfdb0923+htslib-1.16-34-g215d364
1217
337
        if ( value )
1218
0
        {
1219
0
            kstring_t str = {0,0,0};
1220
0
            ksprintf(&str, "##%s=%s", key,value);
1221
0
            bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1222
0
            khint_t k = kh_get(hdict, aux->gen, str.s);
1223
0
            free(str.s);
1224
0
            if ( k == kh_end(aux->gen) ) return NULL;
1225
0
            return kh_val(aux->gen, k);
1226
0
        }
1227
901
        for (i=0; i<hdr->nhrec; i++)
1228
591
        {
1229
591
            if ( hdr->hrec[i]->type!=type ) continue;
1230
115
            if ( strcmp(hdr->hrec[i]->key,key) ) continue;
1231
27
            return hdr->hrec[i];
1232
115
        }
1233
310
        return NULL;
1234
337
    }
1235
0
    else if ( type==BCF_HL_STR )
1236
0
    {
1237
        // e.g. ##GATKCommandLine=<ID=GenomicsDBImport,CommandLine="GenomicsDBImport....">
1238
        //      ##ALT=<ID=NON_REF,Description="Represents any possible alternative allele not already represented at this location by REF and ALT">
1239
0
        if (!str_class) return NULL;
1240
0
        if ( !strcmp("ID",key) )
1241
0
        {
1242
0
            kstring_t str = {0,0,0};
1243
0
            ksprintf(&str, "##%s=<%s=%s>",str_class,key,value);
1244
0
            bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1245
0
            khint_t k = kh_get(hdict, aux->gen, str.s);
1246
0
            free(str.s);
1247
0
            if ( k == kh_end(aux->gen) ) return NULL;
1248
0
            return kh_val(aux->gen, k);
1249
0
        }
1250
0
        for (i=0; i<hdr->nhrec; i++)
1251
0
        {
1252
0
            if ( hdr->hrec[i]->type!=type ) continue;
1253
0
            if ( strcmp(hdr->hrec[i]->key,str_class) ) continue;
1254
0
            int j = bcf_hrec_find_key(hdr->hrec[i],key);
1255
0
            if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],value) ) return hdr->hrec[i];
1256
0
        }
1257
0
        return NULL;
1258
0
    }
1259
0
    vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
1260
0
    khint_t k = kh_get(vdict, d, value);
1261
0
    if ( k == kh_end(d) ) return NULL;
1262
0
    return kh_val(d, k).hrec[type==BCF_HL_CTG?0:type];
1263
0
}
1264
1265
// Check the VCF header is correctly formatted as per the specification.
1266
// Note the code that calls this doesn't bother to check return values and
1267
// we have so many broken VCFs in the wild that for now we just reprt a
1268
// warning and continue anyway.  So currently this is a void function.
1269
void bcf_hdr_check_sanity(bcf_hdr_t *hdr)
1270
2.24k
{
1271
2.24k
    int version = bcf_get_version(hdr, NULL);
1272
1273
2.24k
    struct tag {
1274
2.24k
        char name[10];
1275
2.24k
        char number_str[3];
1276
2.24k
        int number;
1277
2.24k
        int version;
1278
2.24k
        int type;
1279
2.24k
    };
1280
1281
2.24k
    char type_str[][8] = {"Flag", "Integer", "Float", "String"};
1282
1283
2.24k
    struct tag info_tags[] = {
1284
2.24k
        {"AD",        "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
1285
2.24k
        {"ADF",       "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
1286
2.24k
        {"ADR",       "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
1287
2.24k
        {"AC",        "A",  BCF_VL_A,     VCF_DEF, BCF_HT_INT},
1288
2.24k
        {"AF",        "A",  BCF_VL_A,     VCF_DEF, BCF_HT_REAL},
1289
2.24k
        {"CIGAR",     "A",  BCF_VL_A,     VCF_DEF, BCF_HT_STR},
1290
2.24k
        {"AA",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_STR},
1291
2.24k
        {"AN",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1292
2.24k
        {"BQ",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_REAL},
1293
2.24k
        {"DB",        "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
1294
2.24k
        {"DP",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1295
2.24k
        {"END",       "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1296
2.24k
        {"H2",        "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
1297
2.24k
        {"H3",        "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
1298
2.24k
        {"MQ",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_REAL},
1299
2.24k
        {"MQ0",       "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1300
2.24k
        {"NS",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1301
2.24k
        {"SB",        "4",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1302
2.24k
        {"SOMATIC",   "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
1303
2.24k
        {"VALIDATED", "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
1304
2.24k
        {"1000G",     "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
1305
2.24k
    };
1306
2.24k
    static int info_warned[sizeof(info_tags)/sizeof(*info_tags)] = {0};
1307
1308
2.24k
    struct tag fmt_tags[] = {
1309
2.24k
        {"AD",   "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
1310
2.24k
        {"ADF",  "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
1311
2.24k
        {"ADR",  "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
1312
2.24k
        {"EC",   "A",  BCF_VL_A,     VCF_DEF, BCF_HT_INT},
1313
2.24k
        {"GL",   "G",  BCF_VL_G,     VCF_DEF, BCF_HT_REAL},
1314
2.24k
        {"GP",   "G",  BCF_VL_G,     VCF_DEF, BCF_HT_REAL},
1315
2.24k
        {"PL",   "G",  BCF_VL_G,     VCF_DEF, BCF_HT_INT},
1316
2.24k
        {"PP",   "G",  BCF_VL_G,     VCF_DEF, BCF_HT_INT},
1317
2.24k
        {"DP",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1318
2.24k
        {"LEN",  "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1319
2.24k
        {"FT",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_STR},
1320
2.24k
        {"GQ",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1321
2.24k
        {"GT",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_STR},
1322
2.24k
        {"HQ",   "2",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1323
2.24k
        {"MQ",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1324
2.24k
        {"PQ",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1325
2.24k
        {"PS",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1326
2.24k
        {"PSL",  "P",  BCF_VL_P,     VCF44,   BCF_HT_STR},
1327
2.24k
        {"PSO",  "P",  BCF_VL_P,     VCF44,   BCF_HT_INT},
1328
2.24k
        {"PSQ",  "P",  BCF_VL_P,     VCF44,   BCF_HT_INT},
1329
2.24k
        {"LGL",  "LG", BCF_VL_LG,    VCF45,   BCF_HT_INT},
1330
2.24k
        {"LGP",  "LG", BCF_VL_LG,    VCF45,   BCF_HT_INT},
1331
2.24k
        {"LPL",  "LG", BCF_VL_LG,    VCF45,   BCF_HT_INT},
1332
2.24k
        {"LPP",  "LG", BCF_VL_LG,    VCF45,   BCF_HT_INT},
1333
2.24k
        {"LEC",  "LA", BCF_VL_LA,    VCF45,   BCF_HT_INT},
1334
2.24k
        {"LAD",  "LR", BCF_VL_LR,    VCF45,   BCF_HT_INT},
1335
2.24k
        {"LADF", "LR", BCF_VL_LR,    VCF45,   BCF_HT_INT},
1336
2.24k
        {"LADR", "LR", BCF_VL_LR,    VCF45,   BCF_HT_INT},
1337
2.24k
    };
1338
2.24k
    static int fmt_warned[sizeof(fmt_tags)/sizeof(*fmt_tags)] = {0};
1339
1340
    // Check INFO tag numbers.  We shouldn't really permit ".", but it's
1341
    // commonly misused so we let it slide unless it's a new tag and the
1342
    // file format claims to be new also.  We also cannot distinguish between
1343
    // Number=1 and Number=2, but we at least report the correct term if we
1344
    // get, say, Number=G in its place.
1345
    // Also check the types.
1346
2.24k
    int i;
1347
49.4k
    for (i = 0; i < sizeof(info_tags)/sizeof(*info_tags); i++) {
1348
47.2k
        if (info_warned[i])
1349
0
            continue;
1350
47.2k
        int id = bcf_hdr_id2int(hdr, BCF_DT_ID, info_tags[i].name);
1351
47.2k
        if (bcf_hdr_idinfo_exists(hdr, BCF_HL_INFO, id)) {
1352
0
            if (bcf_hdr_id2length(hdr, BCF_HL_INFO, id) != info_tags[i].number &&
1353
0
                bcf_hdr_id2length(hdr, BCF_HL_INFO, id) != BCF_VL_VAR) {
1354
0
                info_warned[i] = 1;
1355
0
            } else if (bcf_hdr_id2length(hdr, BCF_HL_INFO, id) == BCF_VL_FIXED &&
1356
0
                       bcf_hdr_id2number(hdr, BCF_HL_INFO, id) != atoi(info_tags[i].number_str)) {
1357
0
                info_warned[i] = 1;
1358
0
            }
1359
1360
0
            if (info_warned[i]) {
1361
0
                hts_log_warning("%s should be declared as Number=%s",
1362
0
                                info_tags[i].name, info_tags[i].number_str);
1363
0
            }
1364
1365
0
            if (bcf_hdr_id2type(hdr, BCF_HL_INFO, id) != info_tags[i].type) {
1366
0
                hts_log_warning("%s should be declared as Type=%s",
1367
0
                                info_tags[i].name, type_str[info_tags[i].type]);
1368
0
                info_warned[i] = 1;
1369
0
            }
1370
0
        }
1371
47.2k
    }
1372
1373
    // Check FORMAT tag numbers and types.
1374
65.1k
    for (i = 0; i < sizeof(fmt_tags)/sizeof(*fmt_tags); i++) {
1375
62.9k
        if (fmt_warned[i])
1376
0
            continue;
1377
62.9k
        int id = bcf_hdr_id2int(hdr, BCF_DT_ID, fmt_tags[i].name);
1378
62.9k
        if (bcf_hdr_idinfo_exists(hdr, BCF_HL_FMT, id)) {
1379
0
            if (bcf_hdr_id2length(hdr, BCF_HL_FMT, id) != fmt_tags[i].number) {
1380
                // Permit "Number=." if this tag predates the vcf version it is
1381
                // defined within.  This is a common tactic for callers to use
1382
                // new tags with older formats in order to avoid parsing failures
1383
                // with some software.
1384
                // We don't care for 4.3 and earlier as that's more of a wild-west
1385
                // and it's not abnormal to see incorrect usage of Number=. there.
1386
0
                if ((version < VCF44 &&
1387
0
                     bcf_hdr_id2length(hdr, BCF_HL_FMT, id) != BCF_VL_VAR) ||
1388
0
                    (version >= VCF44 && version >= fmt_tags[i].version)) {
1389
0
                    fmt_warned[i] = 1;
1390
0
                }
1391
0
            } else if (bcf_hdr_id2length(hdr, BCF_HL_FMT, id) == BCF_VL_FIXED &&
1392
0
                       bcf_hdr_id2number(hdr, BCF_HL_FMT, id) != atoi(fmt_tags[i].number_str)) {
1393
0
                fmt_warned[i] = 1;
1394
0
            }
1395
1396
0
            if (fmt_warned[i]) {
1397
0
                hts_log_warning("%s should be declared as Number=%s",
1398
0
                                fmt_tags[i].name, fmt_tags[i].number_str);
1399
0
            }
1400
1401
0
            if (bcf_hdr_id2type(hdr, BCF_HL_FMT, id) != fmt_tags[i].type) {
1402
0
                hts_log_warning("%s should be declared as Type=%s",
1403
0
                                fmt_tags[i].name, type_str[fmt_tags[i].type]);
1404
0
                fmt_warned[i] = 1;
1405
0
            }
1406
0
        }
1407
62.9k
    }
1408
2.24k
}
1409
1410
int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt)
1411
2.60k
{
1412
2.60k
    int len, done = 0;
1413
2.60k
    char *p = htxt;
1414
1415
    // Check sanity: "fileformat" string must come as first
1416
2.60k
    bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,p,&len);
1417
2.60k
    if ( !hrec || !hrec->key || strcasecmp(hrec->key,"fileformat") )
1418
166
        hts_log_warning("The first line should be ##fileformat; is the VCF/BCF header broken?");
1419
2.60k
    if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
1420
0
        bcf_hrec_destroy(hrec);
1421
0
        return -1;
1422
0
    }
1423
1424
    // The filter PASS must appear first in the dictionary
1425
2.60k
    hrec = bcf_hdr_parse_line(hdr,"##FILTER=<ID=PASS,Description=\"All filters passed\">",&len);
1426
2.60k
    if (!hrec || bcf_hdr_add_hrec(hdr, hrec) < 0) {
1427
0
        bcf_hrec_destroy(hrec);
1428
0
        return -1;
1429
0
    }
1430
1431
    // Parse the whole header
1432
25.5k
    do {
1433
109k
        while (NULL != (hrec = bcf_hdr_parse_line(hdr, p, &len))) {
1434
83.7k
            if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
1435
8
                bcf_hrec_destroy(hrec);
1436
8
                return -1;
1437
8
            }
1438
83.7k
            p += len;
1439
83.7k
        }
1440
25.5k
        assert(hrec == NULL);
1441
25.5k
        if (len < 0) {
1442
            // len < 0 indicates out-of-memory, or similar error
1443
0
            hts_log_error("Could not parse header line: %s", strerror(errno));
1444
0
            return -1;
1445
25.5k
        } else if (len > 0) {
1446
            // Bad header line.  bcf_hdr_parse_line() will have logged it.
1447
            // Skip and try again on the next line (p + len will be the start
1448
            // of the next one).
1449
22.4k
            p += len;
1450
22.4k
            continue;
1451
22.4k
        }
1452
1453
        // Next should be the sample line.  If not, it was a malformed
1454
        // header, in which case print a warning and skip (many VCF
1455
        // operations do not really care about a few malformed lines).
1456
        // In the future we may want to add a strict mode that errors in
1457
        // this case.
1458
3.14k
        if ( strncmp("#CHROM\t",p,7) && strncmp("#CHROM ",p,7) ) {
1459
860
            char *eol = strchr(p, '\n');
1460
860
            if (*p != '\0') {
1461
556
                char buffer[320];
1462
556
                hts_log_warning("Could not parse header line: %s",
1463
556
                                hts_strprint(buffer, sizeof(buffer),
1464
556
                                               '"', p,
1465
556
                                               eol ? (eol - p) : SIZE_MAX));
1466
556
            }
1467
860
            if (eol) {
1468
546
                p = eol + 1; // Try from the next line.
1469
546
            } else {
1470
314
                done = -1; // No more lines left, give up.
1471
314
            }
1472
2.28k
        } else {
1473
2.28k
            done = 1; // Sample line found
1474
2.28k
        }
1475
25.5k
    } while (!done);
1476
1477
2.59k
    if (done < 0) {
1478
        // No sample line is fatal.
1479
314
        hts_log_error("Could not parse the header, sample line not found");
1480
314
        return -1;
1481
314
    }
1482
1483
2.28k
    if (bcf_hdr_parse_sample_line(hdr,p) < 0)
1484
32
        return -1;
1485
2.24k
    if (bcf_hdr_sync(hdr) < 0)
1486
0
        return -1;
1487
2.24k
    bcf_hdr_check_sanity(hdr);
1488
2.24k
    return 0;
1489
2.24k
}
1490
1491
int bcf_hdr_append(bcf_hdr_t *hdr, const char *line)
1492
0
{
1493
0
    int len;
1494
0
    bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr, (char*) line, &len);
1495
0
    if ( !hrec ) return -1;
1496
0
    if (bcf_hdr_add_hrec(hdr, hrec) < 0)
1497
0
        return -1;
1498
0
    return 0;
1499
0
}
1500
1501
void bcf_hdr_remove(bcf_hdr_t *hdr, int type, const char *key)
1502
0
{
1503
0
    int i = 0;
1504
0
    bcf_hrec_t *hrec;
1505
0
    if ( !key )
1506
0
    {
1507
        // no key, remove all entries of this type
1508
0
        while ( i<hdr->nhrec )
1509
0
        {
1510
0
            if ( hdr->hrec[i]->type!=type ) { i++; continue; }
1511
0
            hrec = hdr->hrec[i];
1512
0
            bcf_hdr_unregister_hrec(hdr, hrec);
1513
0
            bcf_hdr_remove_from_hdict(hdr, hrec);
1514
0
            hdr->dirty = 1;
1515
0
            hdr->nhrec--;
1516
0
            if ( i < hdr->nhrec )
1517
0
                memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
1518
0
            bcf_hrec_destroy(hrec);
1519
0
        }
1520
0
        return;
1521
0
    }
1522
0
    while (1)
1523
0
    {
1524
0
        if ( type==BCF_HL_FLT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG )
1525
0
        {
1526
0
            hrec = bcf_hdr_get_hrec(hdr, type, "ID", key, NULL);
1527
0
            if ( !hrec ) return;
1528
1529
0
            for (i=0; i<hdr->nhrec; i++)
1530
0
                if ( hdr->hrec[i]==hrec ) break;
1531
0
            assert( i<hdr->nhrec );
1532
1533
0
            vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
1534
0
            khint_t k = kh_get(vdict, d, key);
1535
0
            kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL;
1536
0
        }
1537
0
        else
1538
0
        {
1539
0
            for (i=0; i<hdr->nhrec; i++)
1540
0
            {
1541
0
                if ( hdr->hrec[i]->type!=type ) continue;
1542
0
                if ( type==BCF_HL_GEN )
1543
0
                {
1544
0
                    if ( !strcmp(hdr->hrec[i]->key,key) ) break;
1545
0
                }
1546
0
                else
1547
0
                {
1548
                    // not all structured lines have ID, we could be more sophisticated as in bcf_hdr_get_hrec()
1549
0
                    int j = bcf_hrec_find_key(hdr->hrec[i], "ID");
1550
0
                    if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],key) ) break;
1551
0
                }
1552
0
            }
1553
0
            if ( i==hdr->nhrec ) return;
1554
0
            hrec = hdr->hrec[i];
1555
0
            bcf_hdr_remove_from_hdict(hdr, hrec);
1556
0
        }
1557
1558
0
        hdr->nhrec--;
1559
0
        if ( i < hdr->nhrec )
1560
0
            memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
1561
0
        bcf_hrec_destroy(hrec);
1562
0
        hdr->dirty = 1;
1563
0
    }
1564
0
}
1565
1566
int bcf_hdr_printf(bcf_hdr_t *hdr, const char *fmt, ...)
1567
0
{
1568
0
    char tmp[256], *line = tmp;
1569
0
    va_list ap;
1570
0
    va_start(ap, fmt);
1571
0
    int n = vsnprintf(line, sizeof(tmp), fmt, ap);
1572
0
    va_end(ap);
1573
1574
0
    if (n >= sizeof(tmp)) {
1575
0
        n++; // For trailing NUL
1576
0
        line = (char*)malloc(n);
1577
0
        if (!line)
1578
0
            return -1;
1579
1580
0
        va_start(ap, fmt);
1581
0
        vsnprintf(line, n, fmt, ap);
1582
0
        va_end(ap);
1583
0
    }
1584
1585
0
    int ret = bcf_hdr_append(hdr, line);
1586
1587
0
    if (line != tmp) free(line);
1588
0
    return ret;
1589
0
}
1590
1591
1592
/**********************
1593
 *** BCF header I/O ***
1594
 **********************/
1595
1596
const char *bcf_hdr_get_version(const bcf_hdr_t *hdr)
1597
337
{
1598
337
    bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
1599
337
    if ( !hrec )
1600
310
    {
1601
310
        hts_log_warning("No version string found, assuming VCFv4.2");
1602
310
        return "VCFv4.2";
1603
310
    }
1604
27
    return hrec->value;
1605
337
}
1606
1607
int bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version)
1608
0
{
1609
0
    bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
1610
0
    if ( !hrec )
1611
0
    {
1612
0
        int len;
1613
0
        kstring_t str = {0,0,0};
1614
0
        if ( ksprintf(&str,"##fileformat=%s", version) < 0 ) return -1;
1615
0
        hrec = bcf_hdr_parse_line(hdr, str.s, &len);
1616
0
        free(str.s);
1617
1618
0
        get_hdr_aux(hdr)->version = bcf_get_version(NULL, hrec->value);
1619
0
    }
1620
0
    else
1621
0
    {
1622
0
        bcf_hrec_t *tmp = bcf_hrec_dup(hrec);
1623
0
        if ( !tmp ) return -1;
1624
0
        free(tmp->value);
1625
0
        tmp->value = strdup(version);
1626
0
        if ( !tmp->value ) return -1;
1627
0
        bcf_hdr_update_hrec(hdr, hrec, tmp);
1628
0
        bcf_hrec_destroy(tmp);
1629
0
    }
1630
0
    hdr->dirty = 1;
1631
    //TODO rlen may change, deal with it
1632
0
    return 0; // FIXME: check for errs in this function (return < 0 if so)
1633
0
}
1634
1635
bcf_hdr_t *bcf_hdr_init(const char *mode)
1636
2.62k
{
1637
2.62k
    int i;
1638
2.62k
    bcf_hdr_t *h;
1639
2.62k
    h = (bcf_hdr_t*)calloc(1, sizeof(bcf_hdr_t));
1640
2.62k
    if (!h) return NULL;
1641
10.5k
    for (i = 0; i < 3; ++i) {
1642
7.88k
        if ((h->dict[i] = kh_init(vdict)) == NULL) goto fail;
1643
        // Supersize the hash to make collisions very unlikely
1644
7.88k
        static int dsize[3] = {16384,16384,2048}; // info, contig, format
1645
7.88k
        if (kh_resize(vdict, h->dict[i], dsize[i]) < 0) goto fail;
1646
7.88k
    }
1647
1648
2.62k
    bcf_hdr_aux_t *aux = (bcf_hdr_aux_t*)calloc(1,sizeof(bcf_hdr_aux_t));
1649
2.62k
    if ( !aux ) goto fail;
1650
2.62k
    if ( (aux->gen = kh_init(hdict))==NULL ) { free(aux); goto fail; }
1651
2.62k
    aux->key_len = NULL;
1652
2.62k
    aux->dict = *((vdict_t*)h->dict[0]);
1653
2.62k
    aux->version = 0;
1654
2.62k
    aux->ref_count = 1;
1655
2.62k
    free(h->dict[0]);
1656
2.62k
    h->dict[0] = aux;
1657
1658
2.62k
    if ( strchr(mode,'w') )
1659
0
    {
1660
0
        bcf_hdr_append(h, "##fileformat=VCFv4.2");
1661
        // The filter PASS must appear first in the dictionary
1662
0
        bcf_hdr_append(h, "##FILTER=<ID=PASS,Description=\"All filters passed\">");
1663
0
        aux->version = VCF_DEF;
1664
0
    }
1665
2.62k
    return h;
1666
1667
0
 fail:
1668
0
    for (i = 0; i < 3; ++i)
1669
0
        kh_destroy(vdict, h->dict[i]);
1670
0
    free(h);
1671
0
    return NULL;
1672
2.62k
}
1673
1674
void bcf_hdr_destroy(bcf_hdr_t *h)
1675
3.83k
{
1676
3.83k
    int i;
1677
3.83k
    khint_t k;
1678
3.83k
    if (!h) return;
1679
3.83k
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
1680
3.83k
    if (aux->ref_count > 1) // Refs still held, so delay destruction
1681
1.20k
    {
1682
1.20k
        aux->ref_count &= ~1;
1683
1.20k
        return;
1684
1.20k
    }
1685
10.5k
    for (i = 0; i < 3; ++i) {
1686
7.88k
        vdict_t *d = (vdict_t*)h->dict[i];
1687
7.88k
        if (d == 0) continue;
1688
91.5M
        for (k = kh_begin(d); k != kh_end(d); ++k)
1689
91.4M
            if (kh_exist(d, k)) free((char*)kh_key(d, k));
1690
7.88k
        if ( i==0 )
1691
2.62k
        {
1692
26.2k
            for (k=kh_begin(aux->gen); k<kh_end(aux->gen); k++)
1693
23.6k
                if ( kh_exist(aux->gen,k) ) free((char*)kh_key(aux->gen,k));
1694
2.62k
            kh_destroy(hdict, aux->gen);
1695
2.62k
            free(aux->key_len); // may exist for dict[0] only
1696
2.62k
        }
1697
7.88k
        kh_destroy(vdict, d);
1698
7.88k
        free(h->id[i]);
1699
7.88k
    }
1700
87.2k
    for (i=0; i<h->nhrec; i++)
1701
84.6k
        bcf_hrec_destroy(h->hrec[i]);
1702
2.62k
    if (h->nhrec) free(h->hrec);
1703
2.62k
    if (h->samples) free(h->samples);
1704
2.62k
    free(h->keep_samples);
1705
2.62k
    free(h->transl[0]); free(h->transl[1]);
1706
2.62k
    free(h->mem.s);
1707
2.62k
    free(h);
1708
2.62k
}
1709
1710
bcf_hdr_t *bcf_hdr_read(htsFile *hfp)
1711
2.62k
{
1712
2.62k
    if (hfp->format.format == vcf)
1713
2.46k
        return vcf_hdr_read(hfp);
1714
166
    if (hfp->format.format != bcf) {
1715
0
        hts_log_error("Input is not detected as bcf or vcf format");
1716
0
        return NULL;
1717
0
    }
1718
1719
166
    assert(hfp->is_bgzf);
1720
1721
166
    BGZF *fp = hfp->fp.bgzf;
1722
166
    uint8_t magic[5];
1723
166
    bcf_hdr_t *h;
1724
166
    h = bcf_hdr_init("r");
1725
166
    if (!h) {
1726
0
        hts_log_error("Failed to allocate bcf header");
1727
0
        return NULL;
1728
0
    }
1729
166
    if (bgzf_read(fp, magic, 5) != 5)
1730
0
    {
1731
0
        hts_log_error("Failed to read the header (reading BCF in text mode?)");
1732
0
        bcf_hdr_destroy(h);
1733
0
        return NULL;
1734
0
    }
1735
166
    if (strncmp((char*)magic, "BCF\2\2", 5) != 0)
1736
0
    {
1737
0
        if (!strncmp((char*)magic, "BCF", 3))
1738
0
            hts_log_error("Invalid BCF2 magic string: only BCFv2.2 is supported");
1739
0
        else
1740
0
            hts_log_error("Invalid BCF2 magic string");
1741
0
        bcf_hdr_destroy(h);
1742
0
        return NULL;
1743
0
    }
1744
166
    uint8_t buf[4];
1745
166
    size_t hlen;
1746
166
    char *htxt = NULL;
1747
166
    if (bgzf_read(fp, buf, 4) != 4) goto fail;
1748
166
    hlen = buf[0] | (buf[1] << 8) | (buf[2] << 16) | ((size_t) buf[3] << 24);
1749
166
    if (hlen >= SIZE_MAX) { errno = ENOMEM; goto fail; }
1750
166
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
1751
166
    if (hlen > FUZZ_ALLOC_LIMIT/2) { errno = ENOMEM; goto fail; }
1752
166
#endif
1753
166
    htxt = (char*)malloc(hlen + 1);
1754
166
    if (!htxt) goto fail;
1755
166
    if (bgzf_read(fp, htxt, hlen) != hlen) goto fail;
1756
166
    htxt[hlen] = '\0'; // Ensure htxt is terminated
1757
166
    if ( bcf_hdr_parse(h, htxt) < 0 ) goto fail;
1758
166
    free(htxt);
1759
1760
166
    bcf_hdr_incr_ref(h);
1761
166
    bgzf_set_private_data(fp, h, hdr_bgzf_private_data_cleanup);
1762
1763
166
    return h;
1764
0
 fail:
1765
0
    hts_log_error("Failed to read BCF header");
1766
0
    free(htxt);
1767
0
    bcf_hdr_destroy(h);
1768
0
    return NULL;
1769
166
}
1770
1771
int bcf_hdr_write(htsFile *hfp, bcf_hdr_t *h)
1772
2.24k
{
1773
2.24k
    if (!h) {
1774
0
        errno = EINVAL;
1775
0
        return -1;
1776
0
    }
1777
2.24k
    if ( h->dirty ) {
1778
0
        if (bcf_hdr_sync(h) < 0) return -1;
1779
0
    }
1780
2.24k
    hfp->format.category = variant_data;
1781
2.24k
    if (hfp->format.format == vcf || hfp->format.format == text_format) {
1782
1.12k
        hfp->format.format = vcf;
1783
1.12k
        return vcf_hdr_write(hfp, h);
1784
1.12k
    }
1785
1786
1.12k
    if (hfp->format.format == binary_format)
1787
1.12k
        hfp->format.format = bcf;
1788
1789
1.12k
    kstring_t htxt = {0,0,0};
1790
1.12k
    if (bcf_hdr_format(h, 1, &htxt) < 0) {
1791
0
        free(htxt.s);
1792
0
        return -1;
1793
0
    }
1794
1.12k
    kputc('\0', &htxt); // include the \0 byte
1795
1796
1.12k
    BGZF *fp = hfp->fp.bgzf;
1797
1.12k
    if ( bgzf_write(fp, "BCF\2\2", 5) !=5 ) return -1;
1798
1.12k
    uint8_t hlen[4];
1799
1.12k
    u32_to_le(htxt.l, hlen);
1800
1.12k
    if ( bgzf_write(fp, hlen, 4) !=4 ) return -1;
1801
1.12k
    if ( bgzf_write(fp, htxt.s, htxt.l) != htxt.l ) return -1;
1802
1.12k
    if ( bgzf_flush(fp) < 0) return -1;
1803
1804
1.12k
    bcf_hdr_incr_ref(h);
1805
1.12k
    bgzf_set_private_data(fp, h, hdr_bgzf_private_data_cleanup);
1806
1807
1.12k
    free(htxt.s);
1808
1.12k
    return 0;
1809
1.12k
}
1810
1811
/********************
1812
 *** BCF site I/O ***
1813
 ********************/
1814
1815
bcf1_t *bcf_init(void)
1816
2.24k
{
1817
2.24k
    bcf1_t *v;
1818
2.24k
    v = (bcf1_t*)calloc(1, sizeof(bcf1_t));
1819
2.24k
    return v;
1820
2.24k
}
1821
1822
void bcf_clear(bcf1_t *v)
1823
45.1k
{
1824
45.1k
    int i;
1825
45.1k
    for (i=0; i<v->d.m_info; i++)
1826
0
    {
1827
0
        if ( v->d.info[i].vptr_free )
1828
0
        {
1829
0
            free(v->d.info[i].vptr - v->d.info[i].vptr_off);
1830
0
            v->d.info[i].vptr_free = 0;
1831
0
        }
1832
0
    }
1833
45.1k
    for (i=0; i<v->d.m_fmt; i++)
1834
0
    {
1835
0
        if ( v->d.fmt[i].p_free )
1836
0
        {
1837
0
            free(v->d.fmt[i].p - v->d.fmt[i].p_off);
1838
0
            v->d.fmt[i].p_free = 0;
1839
0
        }
1840
0
    }
1841
45.1k
    v->rid = v->pos = v->rlen = v->unpacked = 0;
1842
45.1k
    bcf_float_set_missing(v->qual);
1843
45.1k
    v->n_info = v->n_allele = v->n_fmt = v->n_sample = 0;
1844
45.1k
    v->shared.l = v->indiv.l = 0;
1845
45.1k
    v->d.var_type = -1;
1846
45.1k
    v->d.shared_dirty = 0;
1847
45.1k
    v->d.indiv_dirty  = 0;
1848
45.1k
    v->d.n_flt = 0;
1849
45.1k
    v->errcode = 0;
1850
45.1k
    if (v->d.m_als) v->d.als[0] = 0;
1851
45.1k
    if (v->d.m_id) v->d.id[0] = 0;
1852
45.1k
}
1853
1854
void bcf_empty(bcf1_t *v)
1855
2.24k
{
1856
2.24k
    bcf_clear1(v);
1857
2.24k
    free(v->d.id);
1858
2.24k
    free(v->d.als);
1859
2.24k
    free(v->d.allele); free(v->d.flt); free(v->d.info); free(v->d.fmt);
1860
2.24k
    if (v->d.var ) free(v->d.var);
1861
2.24k
    free(v->shared.s); free(v->indiv.s);
1862
2.24k
    memset(&v->d,0,sizeof(v->d));
1863
2.24k
    memset(&v->shared,0,sizeof(v->shared));
1864
2.24k
    memset(&v->indiv,0,sizeof(v->indiv));
1865
2.24k
}
1866
1867
void bcf_destroy(bcf1_t *v)
1868
2.24k
{
1869
2.24k
    if (!v) return;
1870
2.24k
    bcf_empty1(v);
1871
2.24k
    free(v);
1872
2.24k
}
1873
1874
static inline int bcf_read1_core(BGZF *fp, bcf1_t *v)
1875
166
{
1876
166
    uint8_t x[32];
1877
166
    ssize_t ret;
1878
166
    uint32_t shared_len, indiv_len;
1879
166
    if ((ret = bgzf_read(fp, x, 32)) != 32) {
1880
0
        if (ret == 0) return -1;
1881
0
        return -2;
1882
0
    }
1883
166
    bcf_clear1(v);
1884
166
    shared_len = le_to_u32(x);
1885
166
    if (shared_len < 24) return -2;
1886
164
    shared_len -= 24; // to exclude six 32-bit integers
1887
164
    indiv_len = le_to_u32(x + 4);
1888
164
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
1889
    // ks_resize() normally allocates 1.5 * requested size to allow for growth
1890
164
    if ((uint64_t) shared_len + indiv_len > FUZZ_ALLOC_LIMIT / 3 * 2) return -2;
1891
160
#endif
1892
160
    if (ks_resize(&v->shared, shared_len ? shared_len : 1) != 0) return -2;
1893
160
    if (ks_resize(&v->indiv, indiv_len ? indiv_len : 1) != 0) return -2;
1894
160
    v->rid  = le_to_i32(x + 8);
1895
160
    v->pos  = le_to_u32(x + 12);
1896
160
    if ( v->pos==UINT32_MAX ) v->pos = -1;  // this is for telomere coordinate, e.g. MT:0
1897
160
    v->rlen = le_to_i32(x + 16);
1898
160
    v->qual = le_to_float(x + 20);
1899
160
    v->n_info = le_to_u16(x + 24);
1900
160
    v->n_allele = le_to_u16(x + 26);
1901
160
    v->n_sample = le_to_u32(x + 28) & 0xffffff;
1902
160
    v->n_fmt = x[31];
1903
160
    v->shared.l = shared_len;
1904
160
    v->indiv.l = indiv_len;
1905
    // silent fix of broken BCFs produced by earlier versions of bcf_subset, prior to and including bd6ed8b4
1906
160
    if ( (!v->indiv.l || !v->n_sample) && v->n_fmt ) v->n_fmt = 0;
1907
1908
160
    if (bgzf_read(fp, v->shared.s, v->shared.l) != v->shared.l) return -2;
1909
148
    if (bgzf_read(fp, v->indiv.s, v->indiv.l) != v->indiv.l) return -2;
1910
144
    return 0;
1911
148
}
1912
1913
0
#define bit_array_size(n) ((n)/8+1)
1914
0
#define bit_array_set(a,i)   ((a)[(i)/8] |=   1 << ((i)%8))
1915
0
#define bit_array_clear(a,i) ((a)[(i)/8] &= ~(1 << ((i)%8)))
1916
0
#define bit_array_test(a,i)  ((a)[(i)/8] &   (1 << ((i)%8)))
1917
1918
static int bcf_dec_typed_int1_safe(uint8_t *p, uint8_t *end, uint8_t **q,
1919
3.68k
                                   int32_t *val) {
1920
3.68k
    uint32_t t;
1921
3.68k
    if (end - p < 2) return -1;
1922
3.67k
    t = *p++ & 0xf;
1923
    /* Use if .. else if ... else instead of switch to force order.  Assumption
1924
       is that small integers are more frequent than big ones. */
1925
3.67k
    if (t == BCF_BT_INT8) {
1926
1.92k
        *val = *(int8_t *) p++;
1927
1.92k
    } else {
1928
1.75k
        if (end - p < (1<<bcf_type_shift[t])) return -1;
1929
1.74k
        if (t == BCF_BT_INT16) {
1930
848
            *val = le_to_i16(p);
1931
848
            p += 2;
1932
898
        } else if (t == BCF_BT_INT32) {
1933
806
            *val = le_to_i32(p);
1934
806
            p += 4;
1935
#ifdef VCF_ALLOW_INT64
1936
        } else if (t == BCF_BT_INT64) {
1937
            // This case should never happen because there should be no
1938
            // 64-bit BCFs at all, definitely not coming from htslib
1939
            *val = le_to_i64(p);
1940
            p += 8;
1941
#endif
1942
806
        } else {
1943
92
            return -1;
1944
92
        }
1945
1.74k
    }
1946
3.57k
    *q = p;
1947
3.57k
    return 0;
1948
3.67k
}
1949
1950
static int bcf_dec_size_safe(uint8_t *p, uint8_t *end, uint8_t **q,
1951
13.3k
                             int *num, int *type) {
1952
13.3k
    int r;
1953
13.3k
    if (p >= end) return -1;
1954
13.3k
    *type = *p & 0xf;
1955
13.3k
    if (*p>>4 != 15) {
1956
12.9k
        *q = p + 1;
1957
12.9k
        *num = *p >> 4;
1958
12.9k
        return 0;
1959
12.9k
    }
1960
410
    r = bcf_dec_typed_int1_safe(p + 1, end, q, num);
1961
410
    if (r) return r;
1962
394
    return *num >= 0 ? 0 : -1;
1963
410
}
1964
1965
248
static const char *get_type_name(int type) {
1966
248
    const char *types[9] = {
1967
248
        "null", "int (8-bit)", "int (16 bit)", "int (32 bit)",
1968
248
        "unknown", "float", "unknown", "char", "unknown"
1969
248
    };
1970
248
    int t = (type >= 0 && type < 8) ? type : 8;
1971
248
    return types[t];
1972
248
}
1973
1974
/**
1975
 *  updatephasing - updates 1st phasing based on other phasing status
1976
 *  @param p - pointer to phase value array
1977
 *  @param end - end of array
1978
 *  @param q - pointer to consumed data
1979
 *  @param samples - no. of samples in array
1980
 *  @param ploidy - no. of phasing values per sample
1981
 *  @param type - value type (one of BCF_BT_...)
1982
 *  Returns 0 on success and 1 on failure
1983
 *  Update for haploids made only if it is not unknown (.)
1984
 */
1985
static int updatephasing(uint8_t *p, uint8_t *end, uint8_t **q, int samples, int ploidy, int type)
1986
0
{
1987
0
    int j, k;
1988
0
    unsigned int inc = 1 << bcf_type_shift[type];
1989
0
    ptrdiff_t bytes = samples * ploidy * inc;
1990
1991
0
    if (samples < 0 || ploidy < 0 || end - p < bytes)
1992
0
        return 1;
1993
1994
    /*
1995
     * This works because phasing is stored in the least-significant bit
1996
     * of the GT encoding, and the data is always stored little-endian.
1997
     * Thus it's possible to get the desired result by doing bit operations
1998
     * on the least-significant byte of each value and ignoring the
1999
     * higher bytes (for 16-bit and 32-bit values).
2000
     */
2001
2002
0
    switch (ploidy) {
2003
0
    case 1:
2004
        // Trivial case - haploid data is phased by default
2005
0
        for (j = 0; j < samples; ++j) {
2006
0
            if (*p) *p |= 1;    //only if not unknown (.)
2007
0
            p += inc;
2008
0
        }
2009
0
        break;
2010
0
    case 2:
2011
        // Mostly trivial case - first is phased if second is.
2012
0
        for (j = 0; j < samples; ++j) {
2013
0
            *p |= (p[inc] & 1);
2014
0
            p += 2 * inc;
2015
0
        }
2016
0
        break;
2017
0
    default:
2018
        // Generic case - first is phased if all other alleles are.
2019
0
        for (j = 0; j < samples; ++j) {
2020
0
            uint8_t allphased = 1;
2021
0
            for (k = 1; k < ploidy; ++k)
2022
0
                allphased &= (p[inc * k]);
2023
0
            *p |= allphased;
2024
0
            p += ploidy * inc;
2025
0
        }
2026
0
    }
2027
0
    *q = p;
2028
0
    return 0;
2029
0
}
2030
2031
static void bcf_record_check_err(const bcf_hdr_t *hdr, bcf1_t *rec,
2032
1.86k
                                 char *type, uint32_t *reports, int i) {
2033
1.86k
    if (*reports == 0 || hts_verbose >= HTS_LOG_DEBUG)
2034
52
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos
2035
1.86k
                        ": Invalid FORMAT %s %d",
2036
1.86k
                        bcf_seqname_safe(hdr,rec), rec->pos+1, type, i);
2037
1.86k
    (*reports)++;
2038
1.86k
}
2039
2040
144
static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) {
2041
144
    uint8_t *ptr, *end;
2042
144
    size_t bytes;
2043
144
    uint32_t err = 0;
2044
144
    int type = 0;
2045
144
    int num  = 0;
2046
144
    uint32_t i, reports;
2047
144
    const uint32_t is_integer = ((1 << BCF_BT_INT8)  |
2048
144
                                 (1 << BCF_BT_INT16) |
2049
#ifdef VCF_ALLOW_INT64
2050
                                 (1 << BCF_BT_INT64) |
2051
#endif
2052
144
                                 (1 << BCF_BT_INT32));
2053
144
    const uint32_t is_valid_type = (is_integer          |
2054
144
                                    (1 << BCF_BT_NULL)  |
2055
144
                                    (1 << BCF_BT_FLOAT) |
2056
144
                                    (1 << BCF_BT_CHAR));
2057
144
    int32_t max_id = hdr ? hdr->n[BCF_DT_ID] : 0;
2058
    /* set phasing for 1st allele as in v44 for versions upto v43, to have
2059
    consistent binary values irrespective of version; not run for v >= v44,
2060
    to retain explicit phasing in v44 and higher */
2061
144
    int idgt = hdr ?
2062
144
                    bcf_get_version(hdr, NULL) < VCF44 ?
2063
144
                        bcf_hdr_id2int(hdr, BCF_DT_ID, "GT") : -1 :
2064
144
                    -1;
2065
2066
    // Check for valid contig ID
2067
144
    if (rec->rid < 0
2068
122
        || (hdr && (rec->rid >= hdr->n[BCF_DT_CTG]
2069
144
                    || hdr->id[BCF_DT_CTG][rec->rid].key == NULL))) {
2070
144
        hts_log_warning("Bad BCF record at %"PRIhts_pos": Invalid %s id %d", rec->pos+1, "CONTIG", rec->rid);
2071
144
        err |= BCF_ERR_CTG_INVALID;
2072
144
    }
2073
2074
    // Check ID
2075
144
    ptr = (uint8_t *) rec->shared.s;
2076
144
    end = ptr + rec->shared.l;
2077
144
    if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
2078
142
    if (type != BCF_BT_CHAR) {
2079
142
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "ID", type, get_type_name(type));
2080
142
        err |= BCF_ERR_TAG_INVALID;
2081
142
    }
2082
142
    bytes = (size_t) num << bcf_type_shift[type];
2083
142
    if (end - ptr < bytes) goto bad_shared;
2084
142
    ptr += bytes;
2085
2086
    // Check REF and ALT
2087
142
    if (rec->n_allele < 1) {
2088
64
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": No REF allele",
2089
64
                        bcf_seqname_safe(hdr,rec), rec->pos+1);
2090
64
        err |= BCF_ERR_TAG_UNDEF;
2091
64
    }
2092
2093
142
    reports = 0;
2094
10.0k
    for (i = 0; i < rec->n_allele; i++) {
2095
9.88k
        if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
2096
9.87k
        if (type != BCF_BT_CHAR) {
2097
9.61k
            if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
2098
78
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "REF/ALT", type, get_type_name(type));
2099
9.61k
            err |= BCF_ERR_CHAR;
2100
9.61k
        }
2101
9.87k
        bytes = (size_t) num << bcf_type_shift[type];
2102
9.87k
        if (end - ptr < bytes) goto bad_shared;
2103
9.87k
        ptr += bytes;
2104
9.87k
    }
2105
2106
    // Check FILTER
2107
130
    reports = 0;
2108
130
    if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
2109
130
    if (num > 0) {
2110
74
        bytes = (size_t) num << bcf_type_shift[type];
2111
74
        if (((1 << type) & is_integer) == 0) {
2112
24
            hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "FILTER", type, get_type_name(type));
2113
24
            err |= BCF_ERR_TAG_INVALID;
2114
24
            if (end - ptr < bytes) goto bad_shared;
2115
24
            ptr += bytes;
2116
50
        } else {
2117
50
            if (end - ptr < bytes) goto bad_shared;
2118
5.01k
            for (i = 0; i < num; i++) {
2119
4.96k
                int32_t key = bcf_dec_int1(ptr, type, &ptr);
2120
4.96k
                if (key < 0
2121
4.03k
                    || (hdr && (key >= max_id
2122
4.67k
                                || hdr->id[BCF_DT_ID][key].key == NULL))) {
2123
4.67k
                    if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
2124
50
                        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s id %d", bcf_seqname_safe(hdr,rec), rec->pos+1, "FILTER", key);
2125
4.67k
                    err |= BCF_ERR_TAG_UNDEF;
2126
4.67k
                }
2127
4.96k
            }
2128
50
        }
2129
74
    }
2130
2131
    // Check INFO
2132
130
    reports = 0;
2133
130
    bcf_idpair_t *id_tmp = hdr ? hdr->id[BCF_DT_ID] : NULL;
2134
1.42k
    for (i = 0; i < rec->n_info; i++) {
2135
1.36k
        int32_t key = -1;
2136
1.36k
        if (bcf_dec_typed_int1_safe(ptr, end, &ptr, &key) != 0) goto bad_shared;
2137
1.30k
        if (key < 0 || (hdr && (key >= max_id
2138
890
                                || id_tmp[key].key == NULL))) {
2139
890
            if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
2140
56
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s id %d", bcf_seqname_safe(hdr,rec), rec->pos+1, "INFO", key);
2141
890
            err |= BCF_ERR_TAG_UNDEF;
2142
890
        }
2143
1.30k
        if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
2144
1.30k
        if (((1 << type) & is_valid_type) == 0
2145
1.24k
            || (type == BCF_BT_NULL && num > 0)) {
2146
68
            if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
2147
4
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "INFO", type, get_type_name(type));
2148
68
            err |= BCF_ERR_TAG_INVALID;
2149
68
        }
2150
1.30k
        bytes = (size_t) num << bcf_type_shift[type];
2151
1.30k
        if (end - ptr < bytes) goto bad_shared;
2152
1.29k
        ptr += bytes;
2153
1.29k
    }
2154
2155
    // Check FORMAT and individual information
2156
56
    ptr = (uint8_t *) rec->indiv.s;
2157
56
    end = ptr + rec->indiv.l;
2158
56
    reports = 0;
2159
1.90k
    for (i = 0; i < rec->n_fmt; i++) {
2160
1.90k
        int32_t key = -1;
2161
1.90k
        if (bcf_dec_typed_int1_safe(ptr, end, &ptr, &key) != 0) goto bad_indiv;
2162
1.87k
        if (key < 0
2163
1.73k
            || (hdr && (key >= max_id
2164
1.74k
                        || id_tmp[key].key == NULL))) {
2165
1.74k
            bcf_record_check_err(hdr, rec, "id", &reports, key);
2166
1.74k
            err |= BCF_ERR_TAG_UNDEF;
2167
1.74k
        }
2168
1.87k
        if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_indiv;
2169
1.87k
        if (((1 << type) & is_valid_type) == 0
2170
1.78k
            || (type == BCF_BT_NULL && num > 0)) {
2171
114
            bcf_record_check_err(hdr, rec, "type", &reports, type);
2172
114
            err |= BCF_ERR_TAG_INVALID;
2173
114
        }
2174
1.87k
        if (idgt >= 0 && idgt == key) {
2175
            // check first GT phasing bit and fix up if necessary
2176
0
            if (updatephasing(ptr, end, &ptr, rec->n_sample, num, type)) {
2177
0
                err |= BCF_ERR_TAG_INVALID;
2178
0
            }
2179
1.87k
        } else {
2180
1.87k
            bytes = ((size_t) num << bcf_type_shift[type]) * rec->n_sample;
2181
1.87k
            if (end - ptr < bytes) goto bad_indiv;
2182
1.84k
            ptr += bytes;
2183
1.84k
        }
2184
1.87k
    }
2185
2186
0
    if (!err && rec->rlen < 0) {
2187
        // Treat bad rlen as a warning instead of an error, and try to
2188
        // fix up by using the length of the stored REF allele.
2189
0
        static int warned = 0;
2190
0
        if (!warned) {
2191
0
            hts_log_warning("BCF record at %s:%"PRIhts_pos" has invalid RLEN (%"PRIhts_pos"). "
2192
0
                            "Only one invalid RLEN will be reported.",
2193
0
                            bcf_seqname_safe(hdr,rec), rec->pos+1, rec->rlen);
2194
0
            warned = 1;
2195
0
        }
2196
        //find rlen considering reflen, END, SVLEN, fmt LEN
2197
0
        hts_pos_t len = get_rlen(hdr, rec);
2198
0
        rec->rlen = len >= 0 ? len : 0;
2199
0
    }
2200
2201
0
    rec->errcode |= err;
2202
2203
0
    return err ? -2 : 0; // Return -2 so bcf_read() reports an error
2204
2205
88
 bad_shared:
2206
88
    hts_log_error("Bad BCF record at %s:%"PRIhts_pos" - shared section malformed or too short", bcf_seqname_safe(hdr,rec), rec->pos+1);
2207
88
    return -2;
2208
2209
56
 bad_indiv:
2210
56
    hts_log_error("Bad BCF record at %s:%"PRIhts_pos" - individuals section malformed or too short", bcf_seqname_safe(hdr,rec), rec->pos+1);
2211
56
    return -2;
2212
56
}
2213
2214
static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt);
2215
int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec)
2216
0
{
2217
0
    if ( !hdr->keep_samples ) return 0;
2218
0
    if ( !bcf_hdr_nsamples(hdr) )
2219
0
    {
2220
0
        rec->indiv.l = rec->n_sample = 0;
2221
0
        return 0;
2222
0
    }
2223
2224
0
    int i, j;
2225
0
    uint8_t *ptr = (uint8_t*)rec->indiv.s, *dst = NULL, *src;
2226
0
    bcf_dec_t *dec = &rec->d;
2227
0
    hts_expand(bcf_fmt_t, rec->n_fmt, dec->m_fmt, dec->fmt);
2228
0
    for (i=0; i<dec->m_fmt; ++i) dec->fmt[i].p_free = 0;
2229
2230
0
    for (i=0; i<rec->n_fmt; i++)
2231
0
    {
2232
0
        ptr = bcf_unpack_fmt_core1(ptr, rec->n_sample, &dec->fmt[i]);
2233
0
        src = dec->fmt[i].p - dec->fmt[i].size;
2234
0
        if ( dst )
2235
0
        {
2236
0
            memmove(dec->fmt[i-1].p + dec->fmt[i-1].p_len, dec->fmt[i].p - dec->fmt[i].p_off, dec->fmt[i].p_off);
2237
0
            dec->fmt[i].p = dec->fmt[i-1].p + dec->fmt[i-1].p_len + dec->fmt[i].p_off;
2238
0
        }
2239
0
        dst = dec->fmt[i].p;
2240
0
        for (j=0; j<hdr->nsamples_ori; j++)
2241
0
        {
2242
0
            src += dec->fmt[i].size;
2243
0
            if ( !bit_array_test(hdr->keep_samples,j) ) continue;
2244
0
            memmove(dst, src, dec->fmt[i].size);
2245
0
            dst += dec->fmt[i].size;
2246
0
        }
2247
0
        rec->indiv.l -= dec->fmt[i].p_len - (dst - dec->fmt[i].p);
2248
0
        dec->fmt[i].p_len = dst - dec->fmt[i].p;
2249
0
    }
2250
0
    rec->unpacked |= BCF_UN_FMT;
2251
2252
0
    rec->n_sample = bcf_hdr_nsamples(hdr);
2253
0
    return 0;
2254
0
}
2255
2256
int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
2257
43.2k
{
2258
43.2k
    if (fp->format.format == vcf) return vcf_read(fp, h, v);
2259
166
    if (!h)
2260
0
        h = (const bcf_hdr_t *) bgzf_get_private_data(fp->fp.bgzf);
2261
166
    int ret = bcf_read1_core(fp->fp.bgzf, v);
2262
166
    if (ret == 0) ret = bcf_record_check(h, v);
2263
166
    if ( ret!=0 || !h->keep_samples ) return ret;
2264
0
    return bcf_subset_format(h,v);
2265
166
}
2266
2267
int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, hts_pos_t *beg, hts_pos_t *end)
2268
0
{
2269
0
    bcf1_t *v = (bcf1_t *) vv;
2270
0
    const bcf_hdr_t *hdr = (const bcf_hdr_t *) bgzf_get_private_data(fp);
2271
0
    int ret = bcf_read1_core(fp, v);
2272
0
    if (ret == 0) ret = bcf_record_check(hdr, v);
2273
0
    if (ret  >= 0)
2274
0
        *tid = v->rid, *beg = v->pos, *end = v->pos + v->rlen;
2275
0
    return ret;
2276
0
}
2277
2278
static inline int bcf1_sync_id(bcf1_t *line, kstring_t *str)
2279
0
{
2280
    // single typed string
2281
0
    if ( line->d.id && strcmp(line->d.id, ".") ) {
2282
0
        return bcf_enc_vchar(str, strlen(line->d.id), line->d.id);
2283
0
    } else {
2284
0
        return bcf_enc_size(str, 0, BCF_BT_CHAR);
2285
0
    }
2286
0
}
2287
static inline int bcf1_sync_alleles(bcf1_t *line, kstring_t *str)
2288
0
{
2289
    // list of typed strings
2290
0
    int i;
2291
0
    for (i=0; i<line->n_allele; i++) {
2292
0
        if (bcf_enc_vchar(str, strlen(line->d.allele[i]), line->d.allele[i]) < 0)
2293
0
            return -1;
2294
0
    }
2295
0
    if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
2296
0
    return 0;
2297
0
}
2298
static inline int bcf1_sync_filter(bcf1_t *line, kstring_t *str)
2299
0
{
2300
    // typed vector of integers
2301
0
    if ( line->d.n_flt ) {
2302
0
        return bcf_enc_vint(str, line->d.n_flt, line->d.flt, -1);
2303
0
    } else {
2304
0
        return bcf_enc_vint(str, 0, 0, -1);
2305
0
    }
2306
0
}
2307
2308
static inline int bcf1_sync_info(bcf1_t *line, kstring_t *str)
2309
0
{
2310
    // pairs of typed vectors
2311
0
    int i, irm = -1, e = 0;
2312
0
    for (i=0; i<line->n_info; i++)
2313
0
    {
2314
0
        bcf_info_t *info = &line->d.info[i];
2315
0
        if ( !info->vptr )
2316
0
        {
2317
            // marked for removal
2318
0
            if ( irm < 0 ) irm = i;
2319
0
            continue;
2320
0
        }
2321
0
        e |= kputsn_(info->vptr - info->vptr_off, info->vptr_len + info->vptr_off, str) < 0;
2322
0
        if ( irm >=0 )
2323
0
        {
2324
0
            bcf_info_t tmp = line->d.info[irm]; line->d.info[irm] = line->d.info[i]; line->d.info[i] = tmp;
2325
0
            while ( irm<=i && line->d.info[irm].vptr ) irm++;
2326
0
        }
2327
0
    }
2328
0
    if ( irm>=0 ) line->n_info = irm;
2329
0
    return e == 0 ? 0 : -1;
2330
0
}
2331
2332
static int bcf1_sync(bcf1_t *line)
2333
0
{
2334
0
    char *shared_ori = line->shared.s;
2335
0
    size_t prev_len;
2336
2337
0
    kstring_t tmp = {0,0,0};
2338
0
    if ( !line->shared.l )
2339
0
    {
2340
        // New line created via API, BCF data blocks do not exist. Get it ready for BCF output
2341
0
        tmp = line->shared;
2342
0
        bcf1_sync_id(line, &tmp);
2343
0
        line->unpack_size[0] = tmp.l; prev_len = tmp.l;
2344
2345
0
        bcf1_sync_alleles(line, &tmp);
2346
0
        line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
2347
2348
0
        bcf1_sync_filter(line, &tmp);
2349
0
        line->unpack_size[2] = tmp.l - prev_len;
2350
2351
0
        bcf1_sync_info(line, &tmp);
2352
0
        line->shared = tmp;
2353
0
    }
2354
0
    else if ( line->d.shared_dirty )
2355
0
    {
2356
        // The line was edited, update the BCF data block.
2357
2358
0
        if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line,BCF_UN_STR);
2359
2360
        // ptr_ori points to the original unchanged BCF data.
2361
0
        uint8_t *ptr_ori = (uint8_t *) line->shared.s;
2362
2363
        // ID: single typed string
2364
0
        if ( line->d.shared_dirty & BCF1_DIRTY_ID )
2365
0
            bcf1_sync_id(line, &tmp);
2366
0
        else
2367
0
            kputsn_(ptr_ori, line->unpack_size[0], &tmp);
2368
0
        ptr_ori += line->unpack_size[0];
2369
0
        line->unpack_size[0] = tmp.l; prev_len = tmp.l;
2370
2371
        // REF+ALT: list of typed strings
2372
0
        if ( line->d.shared_dirty & BCF1_DIRTY_ALS )
2373
0
            bcf1_sync_alleles(line, &tmp);
2374
0
        else
2375
0
        {
2376
0
            kputsn_(ptr_ori, line->unpack_size[1], &tmp);
2377
0
            if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
2378
0
        }
2379
0
        ptr_ori += line->unpack_size[1];
2380
0
        line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
2381
2382
0
        if ( line->unpacked & BCF_UN_FLT )
2383
0
        {
2384
            // FILTER: typed vector of integers
2385
0
            if ( line->d.shared_dirty & BCF1_DIRTY_FLT )
2386
0
                bcf1_sync_filter(line, &tmp);
2387
0
            else if ( line->d.n_flt )
2388
0
                kputsn_(ptr_ori, line->unpack_size[2], &tmp);
2389
0
            else
2390
0
                bcf_enc_vint(&tmp, 0, 0, -1);
2391
0
            ptr_ori += line->unpack_size[2];
2392
0
            line->unpack_size[2] = tmp.l - prev_len;
2393
2394
0
            if ( line->unpacked & BCF_UN_INFO )
2395
0
            {
2396
                // INFO: pairs of typed vectors
2397
0
                if ( line->d.shared_dirty & BCF1_DIRTY_INF )
2398
0
                {
2399
0
                    bcf1_sync_info(line, &tmp);
2400
0
                    ptr_ori = (uint8_t*)line->shared.s + line->shared.l;
2401
0
                }
2402
0
            }
2403
0
        }
2404
2405
0
        int size = line->shared.l - (size_t)ptr_ori + (size_t)line->shared.s;
2406
0
        if ( size ) kputsn_(ptr_ori, size, &tmp);
2407
2408
0
        free(line->shared.s);
2409
0
        line->shared = tmp;
2410
0
    }
2411
0
    if ( line->shared.s != shared_ori && line->unpacked & BCF_UN_INFO )
2412
0
    {
2413
        // Reallocated line->shared.s block invalidated line->d.info[].vptr pointers
2414
0
        size_t off_new = line->unpack_size[0] + line->unpack_size[1] + line->unpack_size[2];
2415
0
        int i;
2416
0
        for (i=0; i<line->n_info; i++)
2417
0
        {
2418
0
            uint8_t *vptr_free = line->d.info[i].vptr_free ? line->d.info[i].vptr - line->d.info[i].vptr_off : NULL;
2419
0
            line->d.info[i].vptr = (uint8_t*) line->shared.s + off_new + line->d.info[i].vptr_off;
2420
0
            off_new += line->d.info[i].vptr_len + line->d.info[i].vptr_off;
2421
0
            if ( vptr_free )
2422
0
            {
2423
0
                free(vptr_free);
2424
0
                line->d.info[i].vptr_free = 0;
2425
0
            }
2426
0
        }
2427
0
    }
2428
2429
0
    if ( line->n_sample && line->n_fmt && (!line->indiv.l || line->d.indiv_dirty) )
2430
0
    {
2431
        // The genotype fields changed or are not present
2432
0
        tmp.l = tmp.m = 0; tmp.s = NULL;
2433
0
        int i, irm = -1;
2434
0
        for (i=0; i<line->n_fmt; i++)
2435
0
        {
2436
0
            bcf_fmt_t *fmt = &line->d.fmt[i];
2437
0
            if ( !fmt->p )
2438
0
            {
2439
                // marked for removal
2440
0
                if ( irm < 0 ) irm = i;
2441
0
                continue;
2442
0
            }
2443
0
            kputsn_(fmt->p - fmt->p_off, fmt->p_len + fmt->p_off, &tmp);
2444
0
            if ( irm >=0 )
2445
0
            {
2446
0
                bcf_fmt_t tfmt = line->d.fmt[irm]; line->d.fmt[irm] = line->d.fmt[i]; line->d.fmt[i] = tfmt;
2447
0
                while ( irm<=i && line->d.fmt[irm].p ) irm++;
2448
0
            }
2449
2450
0
        }
2451
0
        if ( irm>=0 ) line->n_fmt = irm;
2452
0
        free(line->indiv.s);
2453
0
        line->indiv = tmp;
2454
2455
        // Reallocated line->indiv.s block invalidated line->d.fmt[].p pointers
2456
0
        size_t off_new = 0;
2457
0
        for (i=0; i<line->n_fmt; i++)
2458
0
        {
2459
0
            uint8_t *p_free = line->d.fmt[i].p_free ? line->d.fmt[i].p - line->d.fmt[i].p_off : NULL;
2460
0
            line->d.fmt[i].p = (uint8_t*) line->indiv.s + off_new + line->d.fmt[i].p_off;
2461
0
            off_new += line->d.fmt[i].p_len + line->d.fmt[i].p_off;
2462
0
            if ( p_free )
2463
0
            {
2464
0
                free(p_free);
2465
0
                line->d.fmt[i].p_free = 0;
2466
0
            }
2467
0
        }
2468
0
    }
2469
0
    if ( !line->n_sample ) line->n_fmt = 0;
2470
0
    line->d.shared_dirty = line->d.indiv_dirty = 0;
2471
0
    return 0;
2472
0
}
2473
2474
bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src)
2475
0
{
2476
0
    bcf1_sync(src);
2477
2478
0
    bcf_clear(dst);
2479
0
    dst->rid  = src->rid;
2480
0
    dst->pos  = src->pos;
2481
0
    dst->rlen = src->rlen;
2482
0
    dst->qual = src->qual;
2483
0
    dst->n_info = src->n_info; dst->n_allele = src->n_allele;
2484
0
    dst->n_fmt = src->n_fmt; dst->n_sample = src->n_sample;
2485
2486
0
    if ( dst->shared.m < src->shared.l )
2487
0
    {
2488
0
        dst->shared.s = (char*) realloc(dst->shared.s, src->shared.l);
2489
0
        dst->shared.m = src->shared.l;
2490
0
    }
2491
0
    dst->shared.l = src->shared.l;
2492
0
    memcpy(dst->shared.s,src->shared.s,dst->shared.l);
2493
2494
0
    if ( dst->indiv.m < src->indiv.l )
2495
0
    {
2496
0
        dst->indiv.s = (char*) realloc(dst->indiv.s, src->indiv.l);
2497
0
        dst->indiv.m = src->indiv.l;
2498
0
    }
2499
0
    dst->indiv.l = src->indiv.l;
2500
0
    memcpy(dst->indiv.s,src->indiv.s,dst->indiv.l);
2501
2502
0
    return dst;
2503
0
}
2504
bcf1_t *bcf_dup(bcf1_t *src)
2505
0
{
2506
0
    bcf1_t *out = bcf_init1();
2507
0
    return bcf_copy(out, src);
2508
0
}
2509
2510
int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v)
2511
42.0k
{
2512
42.0k
    if ( h->dirty ) {
2513
0
        if (bcf_hdr_sync(h) < 0) return -1;
2514
0
    }
2515
42.0k
    if ( bcf_hdr_nsamples(h)!=v->n_sample )
2516
25
    {
2517
25
        hts_log_error("Broken VCF record, the number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)",
2518
25
            bcf_seqname_safe(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h));
2519
25
        return -1;
2520
25
    }
2521
2522
41.9k
    if ( hfp->format.format == vcf || hfp->format.format == text_format )
2523
41.0k
        return vcf_write(hfp,h,v);
2524
2525
968
    if ( v->errcode & ~BCF_ERR_LIMITS ) // todo: unsure about the other BCF_ERR_LIMITS branches in vcf_parse_format_alloc4()
2526
968
    {
2527
        // vcf_parse1() encountered a new contig or tag, undeclared in the
2528
        // header.  At this point, the header must have been printed,
2529
        // proceeding would lead to a broken BCF file. Errors must be checked
2530
        // and cleared by the caller before we can proceed.
2531
968
        char errdescription[1024] = "";
2532
968
        hts_log_error("Unchecked error (%d %s) at %s:%"PRIhts_pos, v->errcode, bcf_strerror(v->errcode, errdescription, sizeof(errdescription)), bcf_seqname_safe(h,v), v->pos+1);
2533
968
        return -1;
2534
968
    }
2535
0
    bcf1_sync(v);   // check if the BCF record was modified
2536
2537
0
    if ( v->unpacked & BCF_IS_64BIT )
2538
0
    {
2539
0
        hts_log_error("Data at %s:%"PRIhts_pos" contains 64-bit values not representable in BCF. Please use VCF instead", bcf_seqname_safe(h,v), v->pos+1);
2540
0
        return -1;
2541
0
    }
2542
2543
0
    BGZF *fp = hfp->fp.bgzf;
2544
0
    uint8_t x[32];
2545
0
    u32_to_le(v->shared.l + 24, x); // to include six 32-bit integers
2546
0
    u32_to_le(v->indiv.l, x + 4);
2547
0
    i32_to_le(v->rid, x + 8);
2548
0
    u32_to_le(v->pos, x + 12);
2549
0
    u32_to_le(v->rlen, x + 16);
2550
0
    float_to_le(v->qual, x + 20);
2551
0
    u16_to_le(v->n_info, x + 24);
2552
0
    u16_to_le(v->n_allele, x + 26);
2553
0
    u32_to_le((uint32_t)v->n_fmt<<24 | (v->n_sample & 0xffffff), x + 28);
2554
0
    if ( bgzf_write(fp, x, 32) != 32 ) return -1;
2555
0
    if ( bgzf_write(fp, v->shared.s, v->shared.l) != v->shared.l ) return -1;
2556
0
    if ( bgzf_write(fp, v->indiv.s, v->indiv.l) != v->indiv.l ) return -1;
2557
2558
0
    if (hfp->idx) {
2559
0
        if (bgzf_idx_push(fp, hfp->idx, v->rid, v->pos, v->pos + v->rlen,
2560
0
                          bgzf_tell(fp), 1) < 0)
2561
0
            return -1;
2562
0
    }
2563
2564
0
    return 0;
2565
0
}
2566
2567
/**********************
2568
 *** VCF header I/O ***
2569
 **********************/
2570
2571
0
static int add_missing_contig_hrec(bcf_hdr_t *h, const char *name) {
2572
0
    bcf_hrec_t *hrec = calloc(1, sizeof(bcf_hrec_t));
2573
0
    int save_errno;
2574
0
    if (!hrec) goto fail;
2575
2576
0
    hrec->key = strdup("contig");
2577
0
    if (!hrec->key) goto fail;
2578
2579
0
    if (bcf_hrec_add_key(hrec, "ID", strlen("ID")) < 0) goto fail;
2580
0
    if (bcf_hrec_set_val(hrec, hrec->nkeys-1, name, strlen(name), 0) < 0)
2581
0
        goto fail;
2582
0
    if (bcf_hdr_add_hrec(h, hrec) < 0)
2583
0
        goto fail;
2584
0
    return 0;
2585
2586
0
 fail:
2587
0
    save_errno = errno;
2588
0
    hts_log_error("%s", strerror(errno));
2589
0
    if (hrec) bcf_hrec_destroy(hrec);
2590
0
    errno = save_errno;
2591
0
    return -1;
2592
0
}
2593
2594
bcf_hdr_t *vcf_hdr_read(htsFile *fp)
2595
2.46k
{
2596
2.46k
    kstring_t txt, *s = &fp->line;
2597
2.46k
    int ret;
2598
2.46k
    bcf_hdr_t *h;
2599
2.46k
    tbx_t *idx = NULL;
2600
2.46k
    const char **names = NULL;
2601
2.46k
    h = bcf_hdr_init("r");
2602
2.46k
    if (!h) {
2603
0
        hts_log_error("Failed to allocate bcf header");
2604
0
        return NULL;
2605
0
    }
2606
2.46k
    txt.l = txt.m = 0; txt.s = 0;
2607
131k
    while ((ret = hts_getline(fp, KS_SEP_LINE, s)) >= 0) {
2608
131k
        int e = 0;
2609
131k
        if (s->l == 0) continue;
2610
117k
        if (s->s[0] != '#') {
2611
16
            hts_log_error("No sample line");
2612
16
            goto error;
2613
16
        }
2614
116k
        if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here
2615
0
            kstring_t tmp = { 0, 0, NULL };
2616
0
            hFILE *f = hopen(fp->fn_aux, "r");
2617
0
            if (f == NULL) {
2618
0
                hts_log_error("Couldn't open \"%s\"", fp->fn_aux);
2619
0
                goto error;
2620
0
            }
2621
0
            while (tmp.l = 0, kgetline(&tmp, (kgets_func *) hgets, f) >= 0) {
2622
0
                char *tab = strchr(tmp.s, '\t');
2623
0
                if (tab == NULL) continue;
2624
0
                e |= (kputs("##contig=<ID=", &txt) < 0);
2625
0
                e |= (kputsn(tmp.s, tab - tmp.s, &txt) < 0);
2626
0
                e |= (kputs(",length=", &txt) < 0);
2627
0
                e |= (kputl(atol(tab), &txt) < 0);
2628
0
                e |= (kputsn(">\n", 2, &txt) < 0);
2629
0
            }
2630
0
            free(tmp.s);
2631
0
            if (hclose(f) != 0) {
2632
0
                hts_log_error("Error on closing %s", fp->fn_aux);
2633
0
                goto error;
2634
0
            }
2635
0
            if (e) goto error;
2636
0
        }
2637
116k
        if (kputsn(s->s, s->l, &txt) < 0) goto error;
2638
116k
        if (kputc('\n', &txt) < 0) goto error;
2639
116k
        if (s->s[1] != '#') break;
2640
116k
    }
2641
2.44k
    if ( ret < -1 ) goto error;
2642
2.43k
    if ( !txt.s )
2643
0
    {
2644
0
        hts_log_error("Could not read the header");
2645
0
        goto error;
2646
0
    }
2647
2.43k
    if ( bcf_hdr_parse(h, txt.s) < 0 ) goto error;
2648
2649
    // check tabix index, are all contigs listed in the header? add the missing ones
2650
2.08k
    idx = tbx_index_load3(fp->fn, NULL, HTS_IDX_SILENT_FAIL);
2651
2.08k
    if ( idx )
2652
0
    {
2653
0
        int i, n, need_sync = 0;
2654
0
        names = tbx_seqnames(idx, &n);
2655
0
        if (!names) goto error;
2656
0
        for (i=0; i<n; i++)
2657
0
        {
2658
0
            bcf_hrec_t *hrec = bcf_hdr_get_hrec(h, BCF_HL_CTG, "ID", (char*) names[i], NULL);
2659
0
            if ( hrec ) continue;
2660
0
            if (add_missing_contig_hrec(h, names[i]) < 0) goto error;
2661
0
            need_sync = 1;
2662
0
        }
2663
0
        if ( need_sync ) {
2664
0
            if (bcf_hdr_sync(h) < 0) goto error;
2665
0
        }
2666
0
        free(names);
2667
0
        tbx_destroy(idx);
2668
0
    }
2669
2.08k
    free(txt.s);
2670
2.08k
    return h;
2671
2672
380
 error:
2673
380
    if (idx) tbx_destroy(idx);
2674
380
    free(names);
2675
380
    free(txt.s);
2676
380
    if (h) bcf_hdr_destroy(h);
2677
380
    return NULL;
2678
2.08k
}
2679
2680
int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname)
2681
0
{
2682
0
    int i = 0, n = 0, save_errno;
2683
0
    char **lines = hts_readlines(fname, &n);
2684
0
    if ( !lines ) return 1;
2685
0
    for (i=0; i<n-1; i++)
2686
0
    {
2687
0
        int k;
2688
0
        bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,lines[i],&k);
2689
0
        if (!hrec) goto fail;
2690
0
        if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
2691
0
            bcf_hrec_destroy(hrec);
2692
0
            goto fail;
2693
0
        }
2694
0
        free(lines[i]);
2695
0
        lines[i] = NULL;
2696
0
    }
2697
0
    if (bcf_hdr_parse_sample_line(hdr, lines[n-1]) < 0) goto fail;
2698
0
    if (bcf_hdr_sync(hdr) < 0) goto fail;
2699
0
    free(lines[n-1]);
2700
0
    free(lines);
2701
0
    return 0;
2702
2703
0
 fail:
2704
0
    save_errno = errno;
2705
0
    for (; i < n; i++)
2706
0
        free(lines[i]);
2707
0
    free(lines);
2708
0
    errno = save_errno;
2709
0
    return 1;
2710
0
}
2711
2712
static int _bcf_hrec_format(const bcf_hrec_t *hrec, int is_bcf, kstring_t *str)
2713
13.4k
{
2714
13.4k
    uint32_t e = 0;
2715
13.4k
    if ( !hrec->value )
2716
7.61k
    {
2717
7.61k
        int j, nout = 0;
2718
7.61k
        e |= ksprintf(str, "##%s=<", hrec->key) < 0;
2719
26.4k
        for (j=0; j<hrec->nkeys; j++)
2720
18.8k
        {
2721
            // do not output IDX if output is VCF
2722
18.8k
            if ( !is_bcf && !strcmp("IDX",hrec->keys[j]) ) continue;
2723
16.3k
            if ( nout ) e |= kputc(',',str) < 0;
2724
16.3k
            e |= ksprintf(str,"%s=%s", hrec->keys[j], hrec->vals[j]) < 0;
2725
16.3k
            nout++;
2726
16.3k
        }
2727
7.61k
        e |= ksprintf(str,">\n") < 0;
2728
7.61k
    }
2729
5.87k
    else
2730
5.87k
        e |= ksprintf(str,"##%s=%s\n", hrec->key,hrec->value) < 0;
2731
2732
13.4k
    return e == 0 ? 0 : -1;
2733
13.4k
}
2734
2735
int bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str)
2736
0
{
2737
0
    return _bcf_hrec_format(hrec,0,str);
2738
0
}
2739
2740
int bcf_hdr_format(const bcf_hdr_t *hdr, int is_bcf, kstring_t *str)
2741
2.24k
{
2742
2.24k
    int i, r = 0;
2743
15.7k
    for (i=0; i<hdr->nhrec; i++)
2744
13.4k
        r |= _bcf_hrec_format(hdr->hrec[i], is_bcf, str) < 0;
2745
2746
2.24k
    r |= ksprintf(str, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO") < 0;
2747
2.24k
    if ( bcf_hdr_nsamples(hdr) )
2748
594
    {
2749
594
        r |= ksprintf(str, "\tFORMAT") < 0;
2750
5.23k
        for (i=0; i<bcf_hdr_nsamples(hdr); i++)
2751
4.64k
            r |= ksprintf(str, "\t%s", hdr->samples[i]) < 0;
2752
594
    }
2753
2.24k
    r |= ksprintf(str, "\n") < 0;
2754
2755
2.24k
    return r ? -1 : 0;
2756
2.24k
}
2757
2758
char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len)
2759
0
{
2760
0
    kstring_t txt = {0,0,0};
2761
0
    if (bcf_hdr_format(hdr, is_bcf, &txt) < 0)
2762
0
        return NULL;
2763
0
    if ( len ) *len = txt.l;
2764
0
    return txt.s;
2765
0
}
2766
2767
const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *n)
2768
0
{
2769
0
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
2770
0
    int i, tid, m = kh_size(d);
2771
0
    const char **names = (const char**) calloc(m,sizeof(const char*));
2772
0
    if ( !names )
2773
0
    {
2774
0
        hts_log_error("Failed to allocate memory");
2775
0
        *n = 0;
2776
0
        return NULL;
2777
0
    }
2778
0
    khint_t k;
2779
0
    for (k=kh_begin(d); k<kh_end(d); k++)
2780
0
    {
2781
0
        if ( !kh_exist(d,k) ) continue;
2782
0
        if ( !kh_val(d, k).hrec[0] ) continue;  // removed via bcf_hdr_remove
2783
0
        tid = kh_val(d,k).id;
2784
0
        if ( tid >= m )
2785
0
        {
2786
            // This can happen after a contig has been removed from BCF header via bcf_hdr_remove()
2787
0
            if ( hts_resize(const char*, tid + 1, &m, &names, HTS_RESIZE_CLEAR)<0 )
2788
0
            {
2789
0
                hts_log_error("Failed to allocate memory");
2790
0
                *n = 0;
2791
0
                free(names);
2792
0
                return NULL;
2793
0
            }
2794
0
            m = tid + 1;
2795
0
        }
2796
0
        names[tid] = kh_key(d,k);
2797
0
    }
2798
    // ensure there are no gaps
2799
0
    for (i=0,tid=0; tid<m; i++,tid++)
2800
0
    {
2801
0
        while ( tid<m && !names[tid] ) tid++;
2802
0
        if ( tid==m ) break;
2803
0
        if ( i==tid ) continue;
2804
0
        names[i] = names[tid];
2805
0
        names[tid] = 0;
2806
0
    }
2807
0
    *n = i;
2808
0
    return names;
2809
0
}
2810
2811
int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h)
2812
1.12k
{
2813
1.12k
    kstring_t htxt = {0,0,0};
2814
1.12k
    if (bcf_hdr_format(h, 0, &htxt) < 0) {
2815
0
        free(htxt.s);
2816
0
        return -1;
2817
0
    }
2818
1.12k
    while (htxt.l && htxt.s[htxt.l-1] == '\0') --htxt.l; // kill trailing zeros
2819
1.12k
    int ret;
2820
1.12k
    if ( fp->format.compression!=no_compression ) {
2821
0
        ret = bgzf_write(fp->fp.bgzf, htxt.s, htxt.l);
2822
0
        if (bgzf_flush(fp->fp.bgzf) != 0) return -1;
2823
1.12k
    } else {
2824
1.12k
        ret = hwrite(fp->fp.hfile, htxt.s, htxt.l);
2825
1.12k
    }
2826
1.12k
    free(htxt.s);
2827
1.12k
    return ret<0 ? -1 : 0;
2828
1.12k
}
2829
2830
/***********************
2831
 *** Typed value I/O ***
2832
 ***********************/
2833
2834
int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
2835
182k
{
2836
182k
    int32_t max = INT32_MIN, min = INT32_MAX;
2837
182k
    int i;
2838
182k
    if (n <= 0) {
2839
3.54k
        return bcf_enc_size(s, 0, BCF_BT_NULL);
2840
179k
    } else if (n == 1) {
2841
37.3k
        return bcf_enc_int1(s, a[0]);
2842
142k
    } else {
2843
142k
        if (wsize <= 0) wsize = n;
2844
2845
        // Equivalent to:
2846
        // for (i = 0; i < n; ++i) {
2847
        //     if (a[i] == bcf_int32_missing || a[i] == bcf_int32_vector_end )
2848
        //         continue;
2849
        //     if (max < a[i]) max = a[i];
2850
        //     if (min > a[i]) min = a[i];
2851
        // }
2852
142k
        int max4[4] = {INT32_MIN, INT32_MIN, INT32_MIN, INT32_MIN};
2853
142k
        int min4[4] = {INT32_MAX, INT32_MAX, INT32_MAX, INT32_MAX};
2854
36.4M
        for (i = 0; i < (n&~3); i+=4) {
2855
            // bcf_int32_missing    == INT32_MIN and
2856
            // bcf_int32_vector_end == INT32_MIN+1.
2857
            // We skip these, but can mostly avoid explicit checking
2858
36.3M
            if (max4[0] < a[i+0]) max4[0] = a[i+0];
2859
36.3M
            if (max4[1] < a[i+1]) max4[1] = a[i+1];
2860
36.3M
            if (max4[2] < a[i+2]) max4[2] = a[i+2];
2861
36.3M
            if (max4[3] < a[i+3]) max4[3] = a[i+3];
2862
36.3M
            if (min4[0] > a[i+0] && a[i+0] > INT32_MIN+1) min4[0] = a[i+0];
2863
36.3M
            if (min4[1] > a[i+1] && a[i+1] > INT32_MIN+1) min4[1] = a[i+1];
2864
36.3M
            if (min4[2] > a[i+2] && a[i+2] > INT32_MIN+1) min4[2] = a[i+2];
2865
36.3M
            if (min4[3] > a[i+3] && a[i+3] > INT32_MIN+1) min4[3] = a[i+3];
2866
36.3M
        }
2867
142k
        min = min4[0];
2868
142k
        if (min > min4[1]) min = min4[1];
2869
142k
        if (min > min4[2]) min = min4[2];
2870
142k
        if (min > min4[3]) min = min4[3];
2871
142k
        max = max4[0];
2872
142k
        if (max < max4[1]) max = max4[1];
2873
142k
        if (max < max4[2]) max = max4[2];
2874
142k
        if (max < max4[3]) max = max4[3];
2875
363k
        for (; i < n; ++i) {
2876
221k
            if (max < a[i]) max = a[i];
2877
221k
            if (min > a[i] && a[i] > INT32_MIN+1) min = a[i];
2878
221k
        }
2879
2880
142k
        if (max <= BCF_MAX_BT_INT8 && min >= BCF_MIN_BT_INT8) {
2881
16.4k
            if (bcf_enc_size(s, wsize, BCF_BT_INT8) < 0 ||
2882
16.4k
                ks_resize(s, s->l + n) < 0)
2883
0
                return -1;
2884
16.4k
            uint8_t *p = (uint8_t *) s->s + s->l;
2885
3.28M
            for (i = 0; i < n; ++i, p++) {
2886
3.27M
                if ( a[i]==bcf_int32_vector_end )   *p = bcf_int8_vector_end;
2887
3.26M
                else if ( a[i]==bcf_int32_missing ) *p = bcf_int8_missing;
2888
104k
                else *p = a[i];
2889
3.27M
            }
2890
16.4k
            s->l += n;
2891
125k
        } else if (max <= BCF_MAX_BT_INT16 && min >= BCF_MIN_BT_INT16) {
2892
88.4k
            uint8_t *p;
2893
88.4k
            if (bcf_enc_size(s, wsize, BCF_BT_INT16) < 0 ||
2894
88.4k
                ks_resize(s, s->l + n * sizeof(int16_t)) < 0)
2895
0
                return -1;
2896
88.4k
            p = (uint8_t *) s->s + s->l;
2897
32.7M
            for (i = 0; i < n; ++i)
2898
32.6M
            {
2899
32.6M
                int16_t x;
2900
32.6M
                if ( a[i]==bcf_int32_vector_end ) x = bcf_int16_vector_end;
2901
32.6M
                else if ( a[i]==bcf_int32_missing ) x = bcf_int16_missing;
2902
779k
                else x = a[i];
2903
32.6M
                i16_to_le(x, p);
2904
32.6M
                p += sizeof(int16_t);
2905
32.6M
            }
2906
88.4k
            s->l += n * sizeof(int16_t);
2907
88.4k
        } else {
2908
37.2k
            uint8_t *p;
2909
37.2k
            if (bcf_enc_size(s, wsize, BCF_BT_INT32) < 0 ||
2910
37.2k
                ks_resize(s, s->l + n * sizeof(int32_t)) < 0)
2911
0
                return -1;
2912
37.2k
            p = (uint8_t *) s->s + s->l;
2913
109M
            for (i = 0; i < n; ++i) {
2914
109M
                i32_to_le(a[i], p);
2915
109M
                p += sizeof(int32_t);
2916
109M
            }
2917
37.2k
            s->l += n * sizeof(int32_t);
2918
37.2k
        }
2919
142k
    }
2920
2921
142k
    return 0;
2922
182k
}
2923
2924
#ifdef VCF_ALLOW_INT64
2925
static int bcf_enc_long1(kstring_t *s, int64_t x) {
2926
    uint32_t e = 0;
2927
    if (x <= BCF_MAX_BT_INT32 && x >= BCF_MIN_BT_INT32)
2928
        return bcf_enc_int1(s, x);
2929
    if (x == bcf_int64_vector_end) {
2930
        e |= bcf_enc_size(s, 1, BCF_BT_INT8);
2931
        e |= kputc(bcf_int8_vector_end, s) < 0;
2932
    } else if (x == bcf_int64_missing) {
2933
        e |= bcf_enc_size(s, 1, BCF_BT_INT8);
2934
        e |= kputc(bcf_int8_missing, s) < 0;
2935
    } else {
2936
        e |= bcf_enc_size(s, 1, BCF_BT_INT64);
2937
        e |= ks_expand(s, 8);
2938
        if (e == 0) { u64_to_le(x, (uint8_t *) s->s + s->l); s->l += 8; }
2939
    }
2940
    return e == 0 ? 0 : -1;
2941
}
2942
#endif
2943
2944
331k
static inline int serialize_float_array(kstring_t *s, size_t n, const float *a) {
2945
331k
    uint8_t *p;
2946
331k
    size_t i;
2947
331k
    size_t bytes = n * sizeof(float);
2948
2949
331k
    if (bytes / sizeof(float) != n) return -1;
2950
331k
    if (ks_resize(s, s->l + bytes) < 0) return -1;
2951
2952
331k
    p = (uint8_t *) s->s + s->l;
2953
75.1M
    for (i = 0; i < n; i++) {
2954
74.8M
        float_to_le(a[i], p);
2955
74.8M
        p += sizeof(float);
2956
74.8M
    }
2957
331k
    s->l += bytes;
2958
2959
331k
    return 0;
2960
331k
}
2961
2962
int bcf_enc_vfloat(kstring_t *s, int n, float *a)
2963
331k
{
2964
331k
    assert(n >= 0);
2965
331k
    bcf_enc_size(s, n, BCF_BT_FLOAT);
2966
331k
    serialize_float_array(s, n, a);
2967
331k
    return 0; // FIXME: check for errs in this function
2968
331k
}
2969
2970
int bcf_enc_vchar(kstring_t *s, int l, const char *a)
2971
2.09M
{
2972
2.09M
    bcf_enc_size(s, l, BCF_BT_CHAR);
2973
2.09M
    kputsn(a, l, s);
2974
2.09M
    return 0; // FIXME: check for errs in this function
2975
2.09M
}
2976
2977
// Special case of n==1 as it also occurs quite often in FORMAT data.
2978
// This version is also small enough to get inlined.
2979
6.30k
static inline int bcf_fmt_array1(kstring_t *s, int type, void *data) {
2980
6.30k
    uint32_t e = 0;
2981
6.30k
    uint8_t *p = (uint8_t *)data;
2982
6.30k
    int32_t v;
2983
2984
    // helps gcc more than clang here. In billions of cycles:
2985
    //          bcf_fmt_array1  bcf_fmt_array
2986
    // gcc7:    23.2            24.3
2987
    // gcc13:   21.6            23.0
2988
    // clang13: 27.1            27.8
2989
6.30k
    switch (type) {
2990
6.30k
    case BCF_BT_CHAR:
2991
6.30k
        e |= kputc_(*p == bcf_str_missing ? '.' : *p, s) < 0;
2992
6.30k
        break;
2993
2994
0
    case BCF_BT_INT8:
2995
0
        if (*(int8_t *)p != bcf_int8_vector_end) {
2996
0
            e |= ((*(int8_t *)p == bcf_int8_missing)
2997
0
                  ? kputc_('.', s)
2998
0
                  : kputw(*(int8_t *)p, s)) < 0;
2999
0
        }
3000
0
        break;
3001
0
    case BCF_BT_INT16:
3002
0
        v = le_to_i16(p);
3003
0
        if (v != bcf_int16_vector_end) {
3004
0
            e |= (v == bcf_int16_missing
3005
0
                  ? kputc_('.', s)
3006
0
                  : kputw(v, s)) < 0;
3007
0
        }
3008
0
        break;
3009
3010
0
    case BCF_BT_INT32:
3011
0
        v = le_to_i32(p);
3012
0
        if (v != bcf_int32_vector_end) {
3013
0
            e |= (v == bcf_int32_missing
3014
0
                  ? kputc_('.', s)
3015
0
                  : kputw(v, s)) < 0;
3016
0
        }
3017
0
        break;
3018
3019
0
    case BCF_BT_FLOAT:
3020
0
        v = le_to_u32(p);
3021
0
        if (v != bcf_float_vector_end) {
3022
0
            e |= (v == bcf_float_missing
3023
0
                  ? kputc_('.', s)
3024
0
                  : kputd(le_to_float(p), s)) < 0;
3025
0
        }
3026
0
        break;
3027
3028
0
    default:
3029
0
        hts_log_error("Unexpected type %d", type);
3030
0
        return -1;
3031
6.30k
    }
3032
3033
6.30k
    return e == 0 ? 0 : -1;
3034
6.30k
}
3035
3036
int bcf_fmt_array(kstring_t *s, int n, int type, void *data)
3037
1.75M
{
3038
1.75M
    int j = 0;
3039
1.75M
    uint32_t e = 0;
3040
1.75M
    if (n == 0) {
3041
1.03M
        return kputc_('.', s) >= 0 ? 0 : -1;
3042
1.03M
    }
3043
3044
721k
    if (type == BCF_BT_CHAR)
3045
257k
    {
3046
257k
        char *p = (char *)data;
3047
3048
        // Note bcf_str_missing is already accounted for in n==0 above.
3049
257k
        if (n >= 8) {
3050
64.5k
            char *p_end = memchr(p, 0, n);
3051
64.5k
            e |= kputsn(p, p_end ? p_end-p : n, s) < 0;
3052
193k
        } else {
3053
736k
            for (j = 0; j < n && *p; ++j, ++p)
3054
543k
               e |= kputc(*p, s) < 0;
3055
193k
        }
3056
257k
    }
3057
463k
    else
3058
463k
    {
3059
463k
        #define BRANCH(type_t, convert, is_missing, is_vector_end, kprint) { \
3060
463k
            uint8_t *p = (uint8_t *) data; \
3061
110M
            for (j=0; j<n; j++, p += sizeof(type_t))    \
3062
109M
            { \
3063
109M
                type_t v = convert(p); \
3064
109M
                if ( is_vector_end ) break; \
3065
109M
                if ( j ) e |= kputc_(',', s) < 0; \
3066
109M
                e |= (is_missing ? kputc('.', s) : kprint) < 0; \
3067
109M
            } \
3068
463k
        }
3069
463k
        switch (type) {
3070
126k
            case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8, v==bcf_int8_missing,  v==bcf_int8_vector_end,  kputw(v, s)); break;
3071
83.2k
            case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, v==bcf_int16_missing, v==bcf_int16_vector_end, kputw(v, s)); break;
3072
88.4k
            case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, v==bcf_int32_missing, v==bcf_int32_vector_end, kputw(v, s)); break;
3073
165k
            case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, v==bcf_float_missing, v==bcf_float_vector_end, kputd(le_to_float(p), s)); break;
3074
0
            default: hts_log_error("Unexpected type %d", type); exit(1); break;
3075
463k
        }
3076
463k
        #undef BRANCH
3077
463k
    }
3078
721k
    return e == 0 ? 0 : -1;
3079
721k
}
3080
3081
uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr)
3082
1.19M
{
3083
1.19M
    int x, type;
3084
1.19M
    x = bcf_dec_size(ptr, &ptr, &type);
3085
1.19M
    bcf_fmt_array(s, x, type, ptr);
3086
1.19M
    return ptr + (x << bcf_type_shift[type]);
3087
1.19M
}
3088
3089
/********************
3090
 *** VCF site I/O ***
3091
 ********************/
3092
3093
typedef struct {
3094
    int key;            // Key for h->id[BCF_DT_ID][key] vdict
3095
    int max_m;          // number of elements in field array (ie commas)
3096
    int size;           // field size (max_l or max_g*4 if is_gt)
3097
    int offset;         // offset of buf into h->mem
3098
    uint32_t is_gt:1,   // is genotype
3099
             max_g:31;  // maximum number of genotypes
3100
    uint32_t max_l;     // length of field
3101
    uint32_t y;         // h->id[0][fmt[j].key].val->info[BCF_HL_FMT]
3102
    uint8_t *buf;       // Pointer into h->mem
3103
} fmt_aux_t;
3104
3105
// fmt_aux_t field notes:
3106
// max_* are biggest sizes of the various FORMAT fields across all samples.
3107
// We use these after pivoting the data to ensure easy random access
3108
// of a specific sample.
3109
//
3110
// max_m is only used for type BCF_HT_REAL or BCF_HT_INT
3111
// max_g is only used for is_gt == 1 (will be BCF_HT_STR)
3112
// max_l is only used for is_gt == 0 (will be BCF_HT_STR)
3113
//
3114
// These are computed in vcf_parse_format_max3 and used in
3115
// vcf_parse_format_alloc4 to get the size.
3116
//
3117
// size is computed from max_g, max_l, max_m and is_gt.  Once computed
3118
// the max values are never accessed again.
3119
//
3120
// In theory all 4 vars could be coalesced into a single variable, but this
3121
// significantly harms speed (even if done via a union).  It's about 25-30%
3122
// slower.
3123
3124
static inline int align_mem(kstring_t *s)
3125
51.8k
{
3126
51.8k
    int e = 0;
3127
51.8k
    if (s->l&7) {
3128
12.2k
        uint64_t zero = 0;
3129
12.2k
        e = kputsn((char*)&zero, 8 - (s->l&7), s) < 0;
3130
12.2k
    }
3131
51.8k
    return e == 0 ? 0 : -1;
3132
51.8k
}
3133
3134
52.3k
#define MAX_N_FMT 255   /* Limited by size of bcf1_t n_fmt field */
3135
3136
// detect FORMAT "."
3137
static int vcf_parse_format_empty1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3138
18.2k
                                   const char *p, const char *q) {
3139
18.2k
    const char *end = s->s + s->l;
3140
18.2k
    if ( q>=end )
3141
1
    {
3142
1
        hts_log_error("FORMAT column with no sample columns starting at %s:%"PRIhts_pos"", bcf_seqname_safe(h,v), v->pos+1);
3143
1
        v->errcode |= BCF_ERR_NCOLS;
3144
1
        return -1;
3145
1
    }
3146
3147
18.2k
    v->n_fmt = 0;
3148
18.2k
    if ( p[0]=='.' && p[1]==0 ) // FORMAT field is empty "."
3149
8
    {
3150
8
        v->n_sample = bcf_hdr_nsamples(h);
3151
8
        return 1;
3152
8
    }
3153
3154
18.2k
    return 0;
3155
18.2k
}
3156
3157
// get format information from the dictionary
3158
static int vcf_parse_format_dict2(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3159
18.2k
                                  const char *p, const char *q, fmt_aux_t *fmt) {
3160
18.2k
    const vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
3161
18.2k
    char *t;
3162
18.2k
    int j;
3163
18.2k
    ks_tokaux_t aux1;
3164
3165
70.5k
    for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) {
3166
52.3k
        if (j >= MAX_N_FMT) {
3167
2
            v->errcode |= BCF_ERR_LIMITS;
3168
2
            hts_log_error("FORMAT column at %s:%"PRIhts_pos" lists more identifiers than htslib can handle",
3169
2
                bcf_seqname_safe(h,v), v->pos+1);
3170
2
            return -1;
3171
2
        }
3172
3173
52.3k
        *(char*)aux1.p = 0;
3174
52.3k
        khint_t k = kh_get(vdict, d, t);
3175
52.3k
        if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_FMT] == 15) {
3176
3.20k
            if ( t[0]=='.' && t[1]==0 )
3177
0
            {
3178
0
                hts_log_error("Invalid FORMAT tag name '.' at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3179
0
                v->errcode |= BCF_ERR_TAG_INVALID;
3180
0
                return -1;
3181
0
            }
3182
3.20k
            hts_log_warning("FORMAT '%s' at %s:%"PRIhts_pos" is not defined in the header, assuming Type=String", t, bcf_seqname_safe(h,v), v->pos+1);
3183
3.20k
            kstring_t tmp = {0,0,0};
3184
3.20k
            int l;
3185
3.20k
            ksprintf(&tmp, "##FORMAT=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", t);
3186
3.20k
            bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
3187
3.20k
            free(tmp.s);
3188
3.20k
            int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
3189
3.20k
            if (res < 0) bcf_hrec_destroy(hrec);
3190
3.20k
            if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
3191
3192
3.20k
            k = kh_get(vdict, d, t);
3193
3.20k
            v->errcode |= BCF_ERR_TAG_UNDEF;
3194
3.20k
            if (res || k == kh_end(d)) {
3195
4
                hts_log_error("Could not add dummy header for FORMAT '%s' at %s:%"PRIhts_pos, t, bcf_seqname_safe(h,v), v->pos+1);
3196
4
                v->errcode |= BCF_ERR_TAG_INVALID;
3197
4
                return -1;
3198
4
            }
3199
3.20k
        }
3200
52.3k
        fmt[j].max_l = fmt[j].max_m = fmt[j].max_g = 0;
3201
52.3k
        fmt[j].key = kh_val(d, k).id;
3202
52.3k
        fmt[j].is_gt = (t[0] == 'G' && t[1] == 'T' && !t[2]);
3203
52.3k
        fmt[j].y = h->id[0][fmt[j].key].val->info[BCF_HL_FMT];
3204
52.3k
        v->n_fmt++;
3205
52.3k
    }
3206
18.1k
    return 0;
3207
18.2k
}
3208
3209
// compute max
3210
static int vcf_parse_format_max3(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3211
18.1k
                                 char *p, char *q, fmt_aux_t *fmt) {
3212
18.1k
    int n_sample_ori = -1;
3213
18.1k
    char *r = q + 1;  // r: position in the format string
3214
18.1k
    int l = 0, m = 1, g = 1, j;
3215
18.1k
    v->n_sample = 0;  // m: max vector size, l: max field len, g: max number of alleles
3216
18.1k
    const char *end = s->s + s->l;
3217
3218
33.6k
    while ( r<end )
3219
33.6k
    {
3220
        // can we skip some samples?
3221
33.6k
        if ( h->keep_samples )
3222
0
        {
3223
0
            n_sample_ori++;
3224
0
            if ( !bit_array_test(h->keep_samples,n_sample_ori) )
3225
0
            {
3226
0
                while ( *r!='\t' && r<end ) r++;
3227
0
                if ( *r=='\t' ) { *r = 0; r++; }
3228
0
                continue;
3229
0
            }
3230
0
        }
3231
3232
        // collect fmt stats: max vector size, length, number of alleles
3233
33.6k
        j = 0;  // j-th format field
3234
33.6k
        fmt_aux_t *f = fmt;
3235
33.6k
        static char meta[256] = {
3236
            // \0 \t , / : |
3237
33.6k
            1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3238
33.6k
            0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1, 0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,
3239
33.6k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3240
33.6k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
3241
33.6k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3242
33.6k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3243
33.6k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3244
33.6k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3245
33.6k
        };
3246
3247
33.6k
        char *r_start = r;
3248
4.78M
        for (;;) {
3249
            // Quickly skip ahead to an appropriate meta-character
3250
5.55M
            while (!meta[(unsigned char)*r]) r++;
3251
3252
4.78M
            switch (*r) {
3253
4.72M
            case ',':
3254
4.72M
                m++;
3255
4.72M
                break;
3256
3257
1.99k
            case '|':
3258
10.3k
            case '/':
3259
10.3k
                if (f->is_gt) g++;
3260
10.3k
                break;
3261
3262
18.4k
            case '\t':
3263
18.4k
                *r = 0; // fall through
3264
3265
18.4k
            default: // valid due to while loop above.
3266
33.6k
            case '\0':
3267
45.6k
            case ':':
3268
45.6k
                l = r - r_start; r_start = r;
3269
45.6k
                if (f->max_m < m) f->max_m = m;
3270
45.6k
                if (f->max_l < l) f->max_l = l;
3271
45.6k
                if (f->is_gt && f->max_g < g) f->max_g = g;
3272
45.6k
                l = 0, m = g = 1;
3273
45.6k
                if ( *r==':' ) {
3274
12.0k
                    j++; f++;
3275
12.0k
                    if ( j>=v->n_fmt ) {
3276
10
                        hts_log_error("Incorrect number of FORMAT fields at %s:%"PRIhts_pos"",
3277
10
                                      h->id[BCF_DT_CTG][v->rid].key, v->pos+1);
3278
10
                        v->errcode |= BCF_ERR_NCOLS;
3279
10
                        return -1;
3280
10
                    }
3281
33.6k
                } else goto end_for;
3282
12.0k
                break;
3283
4.78M
            }
3284
4.74M
            if ( r>=end ) break;
3285
4.74M
            r++;
3286
4.74M
        }
3287
33.6k
    end_for:
3288
33.6k
        v->n_sample++;
3289
33.6k
        if ( v->n_sample == bcf_hdr_nsamples(h) ) break;
3290
15.4k
        r++;
3291
15.4k
    }
3292
3293
18.1k
    return 0;
3294
18.1k
}
3295
3296
// allocate memory for arrays
3297
static int vcf_parse_format_alloc4(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3298
                                   const char *p, const char *q,
3299
18.1k
                                   fmt_aux_t *fmt) {
3300
18.1k
    kstring_t *mem = (kstring_t*)&h->mem;
3301
3302
18.1k
    int j;
3303
69.9k
    for (j = 0; j < v->n_fmt; ++j) {
3304
51.8k
        fmt_aux_t *f = &fmt[j];
3305
51.8k
        if ( !f->max_m ) f->max_m = 1;  // omitted trailing format field
3306
3307
51.8k
        if ((f->y>>4&0xf) == BCF_HT_STR) {
3308
51.8k
            f->size = f->is_gt? f->max_g << 2 : f->max_l;
3309
51.8k
        } else if ((f->y>>4&0xf) == BCF_HT_REAL || (f->y>>4&0xf) == BCF_HT_INT) {
3310
0
            f->size = f->max_m << 2;
3311
0
        } else {
3312
0
            hts_log_error("The format type %d at %s:%"PRIhts_pos" is currently not supported", f->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1);
3313
0
            v->errcode |= BCF_ERR_TAG_INVALID;
3314
0
            return -1;
3315
0
        }
3316
3317
51.8k
        if (align_mem(mem) < 0) {
3318
0
            hts_log_error("Memory allocation failure at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3319
0
            v->errcode |= BCF_ERR_LIMITS;
3320
0
            return -1;
3321
0
        }
3322
3323
        // Limit the total memory to ~2Gb per VCF row.  This should mean
3324
        // malformed VCF data is less likely to take excessive memory and/or
3325
        // time.
3326
51.8k
        if ((uint64_t) mem->l + v->n_sample * (uint64_t)f->size > INT_MAX) {
3327
0
            static int warned = 0;
3328
0
            if ( !warned ) hts_log_warning("Excessive memory required by FORMAT fields at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3329
0
            warned = 1;
3330
0
            v->errcode |= BCF_ERR_LIMITS;
3331
0
            f->size = -1;
3332
0
            f->offset = 0;
3333
0
            continue;
3334
0
        }
3335
3336
51.8k
        f->offset = mem->l;
3337
51.8k
        if (ks_resize(mem, mem->l + v->n_sample * (size_t)f->size) < 0) {
3338
0
            hts_log_error("Memory allocation failure at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3339
0
            v->errcode |= BCF_ERR_LIMITS;
3340
0
            return -1;
3341
0
        }
3342
51.8k
        mem->l += v->n_sample * f->size;
3343
51.8k
    }
3344
3345
18.1k
    {
3346
18.1k
        int j;
3347
69.9k
        for (j = 0; j < v->n_fmt; ++j)
3348
51.8k
            fmt[j].buf = (uint8_t*)mem->s + fmt[j].offset;
3349
18.1k
    }
3350
3351
    // check for duplicate tags
3352
18.1k
    int i;
3353
51.8k
    for (i=1; i<v->n_fmt; i++)
3354
33.6k
    {
3355
33.6k
        fmt_aux_t *ifmt = &fmt[i];
3356
33.6k
        if ( ifmt->size==-1 ) continue; // already marked for removal
3357
110k
        for (j=0; j<i; j++)
3358
92.7k
        {
3359
92.7k
            fmt_aux_t *jfmt = &fmt[j];
3360
92.7k
            if ( jfmt->size==-1 ) continue; // already marked for removal
3361
53.1k
            if ( ifmt->key!=jfmt->key ) continue;
3362
16.3k
            static int warned = 0;
3363
16.3k
            if ( !warned ) hts_log_warning("Duplicate FORMAT tag %s at %s:%"PRIhts_pos, bcf_hdr_int2id(h,BCF_DT_ID,ifmt->key), bcf_seqname_safe(h,v), v->pos+1);
3364
16.3k
            warned = 1;
3365
16.3k
            v->errcode |= BCF_ERR_TAG_INVALID;
3366
16.3k
            ifmt->size = -1;
3367
16.3k
            ifmt->offset = 0;
3368
16.3k
            break;
3369
53.1k
        }
3370
33.6k
    }
3371
18.1k
    return 0;
3372
18.1k
}
3373
3374
// Fill the sample fields
3375
static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3376
18.1k
                                  const char *p, const char *q, fmt_aux_t *fmt) {
3377
18.1k
    static int extreme_val_warned = 0;
3378
18.1k
    int n_sample_ori = -1;
3379
    // At beginning of the loop t points to the first char of a format
3380
18.1k
    const char *t = q + 1;
3381
18.1k
    int m = 0;   // m: sample id
3382
18.1k
    const int nsamples = bcf_hdr_nsamples(h);
3383
18.1k
    const char *end = s->s + s->l;
3384
3385
18.1k
    int ver = bcf_get_version(h, NULL);
3386
3387
51.6k
    while ( t<end )
3388
50.6k
    {
3389
        // can we skip some samples?
3390
50.6k
        if ( h->keep_samples )
3391
0
        {
3392
0
            n_sample_ori++;
3393
0
            if ( !bit_array_test(h->keep_samples,n_sample_ori) )
3394
0
            {
3395
0
                while ( *t && t<end ) t++;
3396
0
                t++;
3397
0
                continue;
3398
0
            }
3399
0
        }
3400
50.6k
        if ( m == nsamples ) break;
3401
3402
33.5k
        int j = 0; // j-th format field, m-th sample
3403
45.2k
        while ( t < end )
3404
45.2k
        {
3405
45.2k
            fmt_aux_t *z = &fmt[j++];
3406
45.2k
            const int htype = z->y>>4&0xf;
3407
45.2k
            if (!z->buf) {
3408
0
                hts_log_error("Memory allocation failure for FORMAT field type %d at %s:%"PRIhts_pos,
3409
0
                              z->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1);
3410
0
                v->errcode |= BCF_ERR_LIMITS;
3411
0
                return -1;
3412
0
            }
3413
3414
45.2k
            if ( z->size==-1 )
3415
3.88k
            {
3416
                // this field is to be ignored, it's either too big or a duplicate
3417
40.7k
                while ( *t != ':' && *t ) t++;
3418
3.88k
            }
3419
41.3k
            else if (htype == BCF_HT_STR) {
3420
41.3k
                int l;
3421
41.3k
                if (z->is_gt) {
3422
                    // Genotypes.
3423
                    //([/|])?<val>)([|/]<val>)+... where <val> is [0-9]+ or ".".
3424
5.36k
                    int32_t is_phased = 0;
3425
5.36k
                    uint32_t *x = (uint32_t*)(z->buf + z->size * (size_t)m);
3426
5.36k
                    uint32_t unreadable = 0;
3427
5.36k
                    uint32_t max = 0;
3428
5.36k
                    int overflow = 0, ploidy = 0, anyunphased = 0, \
3429
5.36k
                        phasingprfx = 0, unknown1 = 0;
3430
3431
                    /* with prefixed phasing, it is explicitly given for 1st one
3432
                    with non-prefixed, set based on ploidy and phasing of other
3433
                    alleles. */
3434
5.36k
                    if (ver >= VCF44 && (*t == '|' || *t == '/')) {
3435
                        // cache prefix and phasing status
3436
63
                        is_phased = *t++ == '|';
3437
63
                        phasingprfx = 1;
3438
63
                    }
3439
3440
12.1k
                    for (l = 0;; ++t) {
3441
12.1k
                        ploidy++;
3442
12.1k
                        if (*t == '.') {
3443
1.28k
                            ++t, x[l++] = is_phased;
3444
1.28k
                            if (l==1) {   //for 1st allele only
3445
566
                                unknown1 = 1;
3446
566
                            }
3447
10.8k
                        } else {
3448
10.8k
                            const char *tt = t;
3449
10.8k
                            uint32_t val;
3450
                            // Or "v->n_allele < 10", but it doesn't
3451
                            // seem to be any faster and this feels safer.
3452
10.8k
                            if (*t >= '0' && *t <= '9' &&
3453
10.6k
                                !(t[1] >= '0' && t[1] <= '9')) {
3454
4.11k
                                val = *t++ - '0';
3455
6.73k
                            } else {
3456
6.73k
                                val = hts_str2uint(t, (char **)&t,
3457
6.73k
                                                   sizeof(val) * CHAR_MAX - 2,
3458
6.73k
                                                   &overflow);
3459
6.73k
                                unreadable |= tt == t;
3460
6.73k
                            }
3461
10.8k
                            if (max < val) max = val;
3462
10.8k
                            x[l++] = (val + 1) << 1 | is_phased;
3463
10.8k
                        }
3464
12.1k
                        anyunphased |= (ploidy != 1) && !is_phased;
3465
12.1k
                        is_phased = (*t == '|');
3466
12.1k
                        if (*t != '|' && *t != '/') break;
3467
12.1k
                    }
3468
5.36k
                    if (!phasingprfx) { //get GT in v44 way when no prefixed phasing
3469
                        /* no explicit phasing for 1st allele, set based on
3470
                         other alleles and ploidy */
3471
5.30k
                        if (ploidy == 1) {  //implicitly phased
3472
1.19k
                            if (!unknown1) {
3473
1.12k
                                x[0] |= 1;
3474
1.12k
                            }
3475
4.10k
                        } else {            //set by other unphased alleles
3476
4.10k
                            x[0] |= (anyunphased)? 0 : 1;
3477
4.10k
                        }
3478
5.30k
                    }
3479
                    // Possibly check max against v->n_allele instead?
3480
5.36k
                    if (overflow || max > (INT32_MAX >> 1) - 1) {
3481
25
                        hts_log_error("Couldn't read GT data: value too large at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3482
25
                        return -1;
3483
25
                    }
3484
5.33k
                    if (unreadable) {
3485
9
                        hts_log_error("Couldn't read GT data: value not a number or '.' at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3486
9
                        return -1;
3487
9
                    }
3488
5.32k
                    if ( !l ) x[l++] = 0;   // An empty field, insert missing value
3489
5.82k
                    for (; l < z->size>>2; ++l)
3490
494
                        x[l] = bcf_int32_vector_end;
3491
3492
35.9k
                } else {
3493
                    // Otherwise arbitrary strings
3494
35.9k
                    char *x = (char*)z->buf + z->size * (size_t)m;
3495
5.29M
                    for (l = 0; *t != ':' && *t; ++t)
3496
5.25M
                        x[l++] = *t;
3497
35.9k
                    if (z->size > l)
3498
15.7k
                        memset(&x[l], 0, (z->size-l) * sizeof(*x));
3499
35.9k
                }
3500
3501
41.3k
            } else if (htype == BCF_HT_INT) {
3502
                // One or more integers in an array
3503
0
                int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
3504
0
                int l;
3505
0
                for (l = 0;; ++t) {
3506
0
                    if (*t == '.') {
3507
0
                        x[l++] = bcf_int32_missing, ++t; // ++t to skip "."
3508
0
                    } else {
3509
0
                        int overflow = 0;
3510
0
                        char *te;
3511
0
                        long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
3512
0
                        if ( te==t || overflow || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 )
3513
0
                        {
3514
0
                            if ( !extreme_val_warned )
3515
0
                            {
3516
0
                                hts_log_warning("Extreme FORMAT/%s value encountered and set to missing at %s:%"PRIhts_pos,
3517
0
                                                h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname_safe(h,v), v->pos+1);
3518
0
                                extreme_val_warned = 1;
3519
0
                            }
3520
0
                            tmp_val = bcf_int32_missing;
3521
0
                        }
3522
0
                        x[l++] = tmp_val;
3523
0
                        t = te;
3524
0
                    }
3525
0
                    if (*t != ',') break;
3526
0
                }
3527
0
                if ( !l )
3528
0
                    x[l++] = bcf_int32_missing;
3529
0
                for (; l < z->size>>2; ++l)
3530
0
                    x[l] = bcf_int32_vector_end;
3531
3532
0
            } else if (htype == BCF_HT_REAL) {
3533
                // One of more floating point values in an array
3534
0
                float *x = (float*)(z->buf + z->size * (size_t)m);
3535
0
                int l;
3536
0
                for (l = 0;; ++t) {
3537
0
                    if (*t == '.' && !isdigit_c(t[1])) {
3538
0
                        bcf_float_set_missing(x[l++]), ++t; // ++t to skip "."
3539
0
                    } else {
3540
0
                        int overflow = 0;
3541
0
                        char *te;
3542
0
                        float tmp_val = hts_str2dbl(t, &te, &overflow);
3543
0
                        if ( (te==t || overflow) && !extreme_val_warned )
3544
0
                        {
3545
0
                            hts_log_warning("Extreme FORMAT/%s value encountered at %s:%"PRIhts_pos, h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname(h,v), v->pos+1);
3546
0
                            extreme_val_warned = 1;
3547
0
                        }
3548
0
                        x[l++] = tmp_val;
3549
0
                        t = te;
3550
0
                    }
3551
0
                    if (*t != ',') break;
3552
0
                }
3553
0
                if ( !l )
3554
                    // An empty field, insert missing value
3555
0
                    bcf_float_set_missing(x[l++]);
3556
0
                for (; l < z->size>>2; ++l)
3557
0
                    bcf_float_set_vector_end(x[l]);
3558
0
            } else {
3559
0
                hts_log_error("Unknown FORMAT field type %d at %s:%"PRIhts_pos, htype, bcf_seqname_safe(h,v), v->pos+1);
3560
0
                v->errcode |= BCF_ERR_TAG_INVALID;
3561
0
                return -1;
3562
0
            }
3563
3564
45.2k
            if (*t == '\0') {
3565
33.4k
                break;
3566
33.4k
            }
3567
11.7k
            else if (*t == ':') {
3568
11.7k
                t++;
3569
11.7k
            }
3570
10
            else {
3571
10
                char buffer[8];
3572
10
                hts_log_error("Invalid character %s in '%s' FORMAT field at %s:%"PRIhts_pos"",
3573
10
                    hts_strprint(buffer, sizeof buffer, '\'', t, 1),
3574
10
                    h->id[BCF_DT_ID][z->key].key, bcf_seqname_safe(h,v), v->pos+1);
3575
10
                v->errcode |= BCF_ERR_CHAR;
3576
10
                return -1;
3577
10
            }
3578
45.2k
        }
3579
3580
        // fill end-of-vector values
3581
353k
        for (; j < v->n_fmt; ++j) {
3582
319k
            fmt_aux_t *z = &fmt[j];
3583
319k
            const int htype = z->y>>4&0xf;
3584
319k
            int l;
3585
3586
319k
            if (z->size == -1) // this field is to be ignored
3587
251k
                continue;
3588
3589
68.2k
            if (htype == BCF_HT_STR) {
3590
68.2k
                if (z->is_gt) {
3591
9.21k
                    int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
3592
9.21k
                    if (z->size) x[0] = bcf_int32_missing;
3593
17.9k
                    for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
3594
59.0k
                } else {
3595
59.0k
                    char *x = (char*)z->buf + z->size * (size_t)m;
3596
59.0k
                    if ( z->size ) {
3597
12.9k
                        x[0] = '.';
3598
12.9k
                        memset(&x[1], 0, (z->size-1) * sizeof(*x));
3599
12.9k
                    }
3600
59.0k
                }
3601
68.2k
            } else if (htype == BCF_HT_INT) {
3602
0
                int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
3603
0
                x[0] = bcf_int32_missing;
3604
0
                for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
3605
0
            } else if (htype == BCF_HT_REAL) {
3606
0
                float *x = (float*)(z->buf + z->size * (size_t)m);
3607
0
                bcf_float_set_missing(x[0]);
3608
0
                for (l = 1; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]);
3609
0
            }
3610
68.2k
        }
3611
3612
33.4k
        m++; t++;
3613
33.4k
    }
3614
3615
18.1k
    return 0;
3616
18.1k
}
3617
3618
// write individual genotype information
3619
static int vcf_parse_format_gt6(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3620
18.1k
                                const char *p, const char *q, fmt_aux_t *fmt) {
3621
18.1k
    kstring_t *str = &v->indiv;
3622
18.1k
    int i, need_downsize = 0;
3623
18.1k
    if (v->n_sample > 0) {
3624
69.6k
        for (i = 0; i < v->n_fmt; ++i) {
3625
51.4k
            fmt_aux_t *z = &fmt[i];
3626
51.4k
            if ( z->size==-1 ) {
3627
16.1k
                need_downsize = 1;
3628
16.1k
                continue;
3629
16.1k
            }
3630
35.3k
            bcf_enc_int1(str, z->key);
3631
35.3k
            if ((z->y>>4&0xf) == BCF_HT_STR && !z->is_gt) {
3632
30.2k
                bcf_enc_size(str, z->size, BCF_BT_CHAR);
3633
30.2k
                kputsn((char*)z->buf, z->size * (size_t)v->n_sample, str);
3634
30.2k
            } else if ((z->y>>4&0xf) == BCF_HT_INT || z->is_gt) {
3635
5.09k
                bcf_enc_vint(str, (z->size>>2) * v->n_sample, (int32_t*)z->buf, z->size>>2);
3636
5.09k
            } else {
3637
0
                bcf_enc_size(str, z->size>>2, BCF_BT_FLOAT);
3638
0
                if (serialize_float_array(str, (z->size>>2) * (size_t)v->n_sample,
3639
0
                                          (float *) z->buf) != 0) {
3640
0
                    v->errcode |= BCF_ERR_LIMITS;
3641
0
                    hts_log_error("Out of memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3642
0
                    return -1;
3643
0
                }
3644
0
            }
3645
35.3k
        }
3646
3647
18.1k
    }
3648
18.1k
    if ( need_downsize ) {
3649
5.33k
        i = 0;
3650
33.7k
        while ( i < v->n_fmt ) {
3651
28.4k
            if ( fmt[i].size==-1 )
3652
16.1k
            {
3653
16.1k
                v->n_fmt--;
3654
16.1k
                if ( i < v->n_fmt ) memmove(&fmt[i],&fmt[i+1],sizeof(*fmt)*(v->n_fmt-i));
3655
16.1k
            }
3656
12.2k
            else
3657
12.2k
                i++;
3658
28.4k
        }
3659
5.33k
    }
3660
18.1k
    return 0;
3661
18.1k
}
3662
3663
// validity checking
3664
18.1k
static int vcf_parse_format_check7(const bcf_hdr_t *h, bcf1_t *v) {
3665
18.1k
    if ( v->n_sample!=bcf_hdr_nsamples(h) )
3666
43
    {
3667
43
        hts_log_error("Number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)",
3668
43
            bcf_seqname_safe(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h));
3669
43
        v->errcode |= BCF_ERR_NCOLS;
3670
43
        return -1;
3671
43
    }
3672
18.0k
    if ( v->indiv.l > 0xffffffff )
3673
0
    {
3674
0
        hts_log_error("The FORMAT at %s:%"PRIhts_pos" is too long", bcf_seqname_safe(h,v), v->pos+1);
3675
0
        v->errcode |= BCF_ERR_LIMITS;
3676
3677
        // Error recovery: return -1 if this is a critical error or 0 if we want to ignore the FORMAT and proceed
3678
0
        v->n_fmt = 0;
3679
0
        return -1;
3680
0
    }
3681
3682
18.0k
    return 0;
3683
18.0k
}
3684
3685
// p,q is the start and the end of the FORMAT field
3686
static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3687
                            char *p, char *q)
3688
37.7k
{
3689
37.7k
    if ( !bcf_hdr_nsamples(h) ) return 0;
3690
18.2k
    kstring_t *mem = (kstring_t*)&h->mem;
3691
18.2k
    mem->l = 0;
3692
3693
18.2k
    fmt_aux_t fmt[MAX_N_FMT];
3694
3695
    // detect FORMAT "."
3696
18.2k
    int ret; // +ve = ok, -ve = err
3697
18.2k
    if ((ret = vcf_parse_format_empty1(s, h, v, p, q)))
3698
9
        return ret ? 0 : -1;
3699
3700
    // get format information from the dictionary
3701
18.2k
    if (vcf_parse_format_dict2(s, h, v, p, q, fmt) < 0)
3702
6
        return -1;
3703
3704
    // FORMAT data is per-sample A:B:C A:B:C A:B:C ... but in memory it is
3705
    // stored as per-type arrays AAA... BBB... CCC...  This is basically
3706
    // a data rotation or pivot.
3707
3708
    // The size of elements in the array grow to their maximum needed,
3709
    // permitting fast random access.  This means however we have to first
3710
    // scan the whole FORMAT line to find the maximum of each type, and
3711
    // then scan it again to find the store the data.
3712
    // We break this down into compute-max, allocate, fill-out-buffers
3713
3714
    // TODO: ?
3715
    // The alternative would be to pivot on the first pass, with fixed
3716
    // size entries for numerics and concatenated strings otherwise, also
3717
    // tracking maximum sizes.  Then on a second pass we reallocate and
3718
    // copy the data again to a uniformly sized array.  Two passes through
3719
    // memory, but without doubling string parsing.
3720
3721
    // compute max
3722
18.1k
    if (vcf_parse_format_max3(s, h, v, p, q, fmt) < 0)
3723
10
        return -1;
3724
3725
    // allocate memory for arrays
3726
18.1k
    if (vcf_parse_format_alloc4(s, h, v, p, q, fmt) < 0)
3727
0
        return -1;
3728
3729
    // fill the sample fields; at beginning of the loop
3730
18.1k
    if (vcf_parse_format_fill5(s, h, v, p, q, fmt) < 0)
3731
44
        return -1;
3732
3733
    // write individual genotype information
3734
18.1k
    if (vcf_parse_format_gt6(s, h, v, p, q, fmt) < 0)
3735
0
        return -1;
3736
3737
    // validity checking
3738
18.1k
    if (vcf_parse_format_check7(h, v) < 0)
3739
43
        return -1;
3740
3741
18.0k
    return 0;
3742
18.1k
}
3743
3744
2.74k
static khint_t fix_chromosome(const bcf_hdr_t *h, vdict_t *d, const char *p) {
3745
    // Simple error recovery for chromosomes not defined in the header. It will not help when VCF header has
3746
    // been already printed, but will enable tools like vcfcheck to proceed.
3747
3748
2.74k
    kstring_t tmp = {0,0,0};
3749
2.74k
    khint_t k;
3750
2.74k
    int l;
3751
2.74k
    if (ksprintf(&tmp, "##contig=<ID=%s>", p) < 0)
3752
0
        return kh_end(d);
3753
2.74k
    bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
3754
2.74k
    free(tmp.s);
3755
2.74k
    int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
3756
2.74k
    if (res < 0) bcf_hrec_destroy(hrec);
3757
2.74k
    if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
3758
2.74k
    k = kh_get(vdict, d, p);
3759
3760
2.74k
    return k;
3761
2.74k
}
3762
3763
38.9k
static int vcf_parse_filter(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) {
3764
38.9k
    int i, n_flt = 1, max_n_flt = 0;
3765
38.9k
    char *r, *t;
3766
38.9k
    int32_t *a_flt = NULL;
3767
38.9k
    ks_tokaux_t aux1;
3768
38.9k
    khint_t k;
3769
38.9k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
3770
    // count the number of filters
3771
38.9k
    if (*(q-1) == ';') *(q-1) = 0;
3772
224M
    for (r = p; *r; ++r)
3773
224M
        if (*r == ';') ++n_flt;
3774
38.9k
    if (n_flt > max_n_flt) {
3775
38.9k
        a_flt = malloc(n_flt * sizeof(*a_flt));
3776
38.9k
        if (!a_flt) {
3777
0
            hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3778
0
            v->errcode |= BCF_ERR_LIMITS; // No appropriate code?
3779
0
            return -1;
3780
0
        }
3781
38.9k
        max_n_flt = n_flt;
3782
38.9k
    }
3783
    // add filters
3784
1.05M
    for (t = kstrtok(p, ";", &aux1), i = 0; t; t = kstrtok(0, 0, &aux1)) {
3785
1.01M
        *(char*)aux1.p = 0;
3786
1.01M
        k = kh_get(vdict, d, t);
3787
1.01M
        if (k == kh_end(d))
3788
28.6k
        {
3789
            // Simple error recovery for FILTERs not defined in the header. It will not help when VCF header has
3790
            // been already printed, but will enable tools like vcfcheck to proceed.
3791
28.6k
            hts_log_warning("FILTER '%s' is not defined in the header", t);
3792
28.6k
            kstring_t tmp = {0,0,0};
3793
28.6k
            int l;
3794
28.6k
            ksprintf(&tmp, "##FILTER=<ID=%s,Description=\"Dummy\">", t);
3795
28.6k
            bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
3796
28.6k
            free(tmp.s);
3797
28.6k
            int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
3798
28.6k
            if (res < 0) bcf_hrec_destroy(hrec);
3799
28.6k
            if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
3800
28.6k
            k = kh_get(vdict, d, t);
3801
28.6k
            v->errcode |= BCF_ERR_TAG_UNDEF;
3802
28.6k
            if (res || k == kh_end(d)) {
3803
36
                hts_log_error("Could not add dummy header for FILTER '%s' at %s:%"PRIhts_pos, t, bcf_seqname_safe(h,v), v->pos+1);
3804
36
                v->errcode |= BCF_ERR_TAG_INVALID;
3805
36
                free(a_flt);
3806
36
                return -1;
3807
36
            }
3808
28.6k
        }
3809
1.01M
        a_flt[i++] = kh_val(d, k).id;
3810
1.01M
    }
3811
3812
38.9k
    bcf_enc_vint(str, n_flt, a_flt, -1);
3813
38.9k
    free(a_flt);
3814
3815
38.9k
    return 0;
3816
38.9k
}
3817
3818
41.6k
static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) {
3819
41.6k
    static int extreme_int_warned = 0, negative_rlen_warned = 0;
3820
41.6k
    int max_n_val = 0, overflow = 0;
3821
41.6k
    char *r, *key;
3822
41.6k
    khint_t k;
3823
41.6k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
3824
41.6k
    int32_t *a_val = NULL;
3825
3826
41.6k
    v->n_info = 0;
3827
41.6k
    if (*(q-1) == ';') *(q-1) = 0;
3828
2.70M
    for (r = key = p;; ++r) {
3829
2.70M
        int c;
3830
2.70M
        char *val, *end;
3831
246M
        while (*r > '=' || (*r != ';' && *r != '=' && *r != 0)) r++;
3832
2.70M
        if (v->n_info == UINT16_MAX) {
3833
1
            hts_log_error("Too many INFO entries at %s:%"PRIhts_pos,
3834
1
                          bcf_seqname_safe(h,v), v->pos+1);
3835
1
            v->errcode |= BCF_ERR_LIMITS;
3836
1
            goto fail;
3837
1
        }
3838
2.70M
        val = end = NULL;
3839
2.70M
        c = *r; *r = 0;
3840
2.70M
        if (c == '=') {
3841
991k
            val = r + 1;
3842
3843
325M
            for (end = val; *end != ';' && *end != 0; ++end);
3844
991k
            c = *end; *end = 0;
3845
1.71M
        } else end = r;
3846
2.70M
        if ( !*key ) { if (c==0) break; r = end; key = r + 1; continue; }  // faulty VCF, ";;" in the INFO
3847
2.64M
        k = kh_get(vdict, d, key);
3848
2.64M
        if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_INFO] == 15)
3849
27.8k
        {
3850
27.8k
            hts_log_warning("INFO '%s' is not defined in the header, assuming Type=String", key);
3851
27.8k
            kstring_t tmp = {0,0,0};
3852
27.8k
            int l;
3853
27.8k
            ksprintf(&tmp, "##INFO=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", key);
3854
27.8k
            bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
3855
27.8k
            free(tmp.s);
3856
27.8k
            int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
3857
27.8k
            if (res < 0) bcf_hrec_destroy(hrec);
3858
27.8k
            if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
3859
27.8k
            k = kh_get(vdict, d, key);
3860
27.8k
            v->errcode |= BCF_ERR_TAG_UNDEF;
3861
27.8k
            if (res || k == kh_end(d)) {
3862
55
                hts_log_error("Could not add dummy header for INFO '%s' at %s:%"PRIhts_pos, key, bcf_seqname_safe(h,v), v->pos+1);
3863
55
                v->errcode |= BCF_ERR_TAG_INVALID;
3864
55
                goto fail;
3865
55
            }
3866
27.8k
        }
3867
2.64M
        uint32_t y = kh_val(d, k).info[BCF_HL_INFO];
3868
2.64M
        ++v->n_info;
3869
2.64M
        bcf_enc_int1(str, kh_val(d, k).id);
3870
2.64M
        if (val == 0) {
3871
1.65M
            bcf_enc_size(str, 0, BCF_BT_NULL);
3872
1.65M
        } else if ((y>>4&0xf) == BCF_HT_FLAG || (y>>4&0xf) == BCF_HT_STR) { // if Flag has a value, treat it as a string
3873
55.0k
            bcf_enc_vchar(str, end - val, val);
3874
936k
        } else { // int/float value/array
3875
936k
            int i, n_val;
3876
936k
            char *t, *te;
3877
314M
            for (t = val, n_val = 1; *t; ++t) // count the number of values
3878
313M
                if (*t == ',') ++n_val;
3879
            // Check both int and float size in one step for simplicity
3880
936k
            if (n_val > max_n_val) {
3881
2.99k
                int32_t *a_tmp = (int32_t *)realloc(a_val, n_val * sizeof(*a_val));
3882
2.99k
                if (!a_tmp) {
3883
0
                    hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3884
0
                    v->errcode |= BCF_ERR_LIMITS; // No appropriate code?
3885
0
                    goto fail;
3886
0
                }
3887
2.99k
                a_val = a_tmp;
3888
2.99k
                max_n_val = n_val;
3889
2.99k
            }
3890
936k
            if ((y>>4&0xf) == BCF_HT_INT) {
3891
604k
                i = 0, t = val;
3892
604k
                int64_t val1;
3893
604k
                int is_int64 = 0;
3894
#ifdef VCF_ALLOW_INT64
3895
                if ( n_val==1 )
3896
                {
3897
                    overflow = 0;
3898
                    long long int tmp_val = hts_str2int(val, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
3899
                    if ( te==val ) tmp_val = bcf_int32_missing;
3900
                    else if ( overflow || tmp_val<BCF_MIN_BT_INT64 || tmp_val>BCF_MAX_BT_INT64 )
3901
                    {
3902
                        if ( !extreme_int_warned )
3903
                        {
3904
                            hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1);
3905
                            extreme_int_warned = 1;
3906
                        }
3907
                        tmp_val = bcf_int32_missing;
3908
                    }
3909
                    else
3910
                        is_int64 = 1;
3911
                    val1 = tmp_val;
3912
                    t = te;
3913
                    i = 1;  // this is just to avoid adding another nested block...
3914
                }
3915
#endif
3916
145M
                for (; i < n_val; ++i, ++t)
3917
145M
                {
3918
145M
                    overflow = 0;
3919
145M
                    long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
3920
145M
                    if ( te==t ) tmp_val = bcf_int32_missing;
3921
1.15M
                    else if ( overflow || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 )
3922
137k
                    {
3923
137k
                        if ( !extreme_int_warned )
3924
1
                        {
3925
1
                            hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1);
3926
1
                            extreme_int_warned = 1;
3927
1
                        }
3928
137k
                        tmp_val = bcf_int32_missing;
3929
137k
                    }
3930
145M
                    a_val[i] = tmp_val;
3931
197M
                    for (t = te; *t && *t != ','; t++);
3932
145M
                }
3933
604k
                if (n_val == 1) {
3934
#ifdef VCF_ALLOW_INT64
3935
                    if ( is_int64 )
3936
                    {
3937
                        v->unpacked |= BCF_IS_64BIT;
3938
                        bcf_enc_long1(str, val1);
3939
                    }
3940
                    else
3941
                        bcf_enc_int1(str, (int32_t)val1);
3942
#else
3943
469k
                    val1 = a_val[0];
3944
469k
                    bcf_enc_int1(str, (int32_t)val1);
3945
469k
#endif
3946
469k
                } else {
3947
135k
                    bcf_enc_vint(str, n_val, a_val, -1);
3948
135k
                }
3949
604k
                if (n_val==1 && (val1!=bcf_int32_missing || is_int64)
3950
388k
                    && memcmp(key, "END", 4) == 0)
3951
0
                {
3952
0
                    if ( val1 <= v->pos )
3953
0
                    {
3954
0
                        if ( !negative_rlen_warned )
3955
0
                        {
3956
0
                            hts_log_warning("INFO/END=%"PRIhts_pos" is smaller than POS at %s:%"PRIhts_pos,val1,bcf_seqname_safe(h,v),v->pos+1);
3957
0
                            negative_rlen_warned = 1;
3958
0
                        }
3959
0
                    }
3960
0
                }
3961
604k
            } else if ((y>>4&0xf) == BCF_HT_REAL) {
3962
331k
                float *val_f = (float *)a_val;
3963
75.1M
                for (i = 0, t = val; i < n_val; ++i, ++t)
3964
74.8M
                {
3965
74.8M
                    overflow = 0;
3966
74.8M
                    val_f[i] = hts_str2dbl(t, &te, &overflow);
3967
74.8M
                    if ( te==t || overflow ) // conversion failed
3968
73.5M
                        bcf_float_set_missing(val_f[i]);
3969
106M
                    for (t = te; *t && *t != ','; t++);
3970
74.8M
                }
3971
331k
                bcf_enc_vfloat(str, n_val, val_f);
3972
331k
            }
3973
936k
        }
3974
2.64M
        if (c == 0) break;
3975
2.62M
        r = end;
3976
2.62M
        key = r + 1;
3977
2.62M
    }
3978
3979
41.6k
    free(a_val);
3980
41.6k
    return 0;
3981
3982
56
 fail:
3983
56
    free(a_val);
3984
56
    return -1;
3985
41.6k
}
3986
3987
int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
3988
42.7k
{
3989
42.7k
    int ret = -2, overflow = 0;
3990
42.7k
    char *p, *q, *r, *t;
3991
42.7k
    kstring_t *str;
3992
42.7k
    khint_t k;
3993
42.7k
    ks_tokaux_t aux;
3994
3995
//#define NOT_DOT(p) strcmp((p), ".")
3996
//#define NOT_DOT(p) (!(*p == '.' && !p[1]))
3997
//#define NOT_DOT(p) ((*p) != '.' || (p)[1])
3998
//#define NOT_DOT(p) (q-p != 1 || memcmp(p, ".\0", 2))
3999
211k
#define NOT_DOT(p) (memcmp(p, ".\0", 2))
4000
4001
42.7k
    if (!s || !h || !v || !(s->s))
4002
0
        return ret;
4003
4004
    // Assumed in lots of places, but we may as well spot this early
4005
42.7k
    assert(sizeof(float) == sizeof(int32_t));
4006
4007
    // Ensure string we parse has space to permit some over-flow when during
4008
    // parsing.  Eg to do memcmp(key, "END", 4) in vcf_parse_info over
4009
    // the more straight forward looking strcmp, giving a speed advantage.
4010
42.7k
    if (ks_resize(s, s->l+4) < 0)
4011
0
        return -2;
4012
4013
    // Force our memory to be initialised so we avoid the technicality of
4014
    // undefined behaviour in using a 4-byte memcmp.  (The reality is this
4015
    // almost certainly is never detected by the compiler so has no impact,
4016
    // but equally so this code has minimal (often beneficial) impact on
4017
    // performance too.)
4018
42.7k
    s->s[s->l+0] = 0;
4019
42.7k
    s->s[s->l+1] = 0;
4020
42.7k
    s->s[s->l+2] = 0;
4021
42.7k
    s->s[s->l+3] = 0;
4022
4023
42.7k
    bcf_clear1(v);
4024
42.7k
    str = &v->shared;
4025
42.7k
    memset(&aux, 0, sizeof(ks_tokaux_t));
4026
4027
    // CHROM
4028
42.7k
    if (!(p = kstrtok(s->s, "\t", &aux)))
4029
0
        goto err;
4030
42.7k
    *(q = (char*)aux.p) = 0;
4031
4032
42.7k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
4033
42.7k
    k = kh_get(vdict, d, p);
4034
42.7k
    if (k == kh_end(d)) {
4035
2.74k
        hts_log_warning("Contig '%s' is not defined in the header. (Quick workaround: index the file with tabix.)", p);
4036
2.74k
        v->errcode = BCF_ERR_CTG_UNDEF;
4037
2.74k
        if ((k = fix_chromosome(h, d, p)) == kh_end(d)) {
4038
77
            hts_log_error("Could not add dummy header for contig '%s'", p);
4039
77
            v->errcode |= BCF_ERR_CTG_INVALID;
4040
77
            goto err;
4041
77
        }
4042
2.74k
    }
4043
42.6k
    v->rid = kh_val(d, k).id;
4044
4045
    // POS
4046
42.6k
    if (!(p = kstrtok(0, 0, &aux)))
4047
258
        goto err;
4048
42.4k
    *(q = (char*)aux.p) = 0;
4049
4050
42.4k
    overflow = 0;
4051
42.4k
    char *tmp = p;
4052
42.4k
    v->pos = hts_str2uint(p, &p, 62, &overflow);
4053
42.4k
    if (overflow) {
4054
4
        hts_log_error("Position value '%s' is too large", tmp);
4055
4
        goto err;
4056
42.4k
    } else if ( *p ) {
4057
56
        hts_log_error("Could not parse the position '%s'", tmp);
4058
56
        goto err;
4059
42.3k
    } else {
4060
42.3k
        v->pos -= 1;
4061
42.3k
    }
4062
42.3k
    if (v->pos >= INT32_MAX)
4063
440
        v->unpacked |= BCF_IS_64BIT;
4064
4065
    // ID
4066
42.3k
    if (!(p = kstrtok(0, 0, &aux)))
4067
2
        goto err;
4068
42.3k
    *(q = (char*)aux.p) = 0;
4069
4070
42.3k
    if (NOT_DOT(p)) bcf_enc_vchar(str, q - p, p);
4071
77
    else bcf_enc_size(str, 0, BCF_BT_CHAR);
4072
4073
    // REF
4074
42.3k
    if (!(p = kstrtok(0, 0, &aux)))
4075
12
        goto err;
4076
42.3k
    *(q = (char*)aux.p) = 0;
4077
4078
42.3k
    bcf_enc_vchar(str, q - p, p);
4079
42.3k
    v->n_allele = 1, v->rlen = q - p;
4080
4081
    // ALT
4082
42.3k
    if (!(p = kstrtok(0, 0, &aux)))
4083
14
        goto err;
4084
42.3k
    *(q = (char*)aux.p) = 0;
4085
4086
42.3k
    if (NOT_DOT(p)) {
4087
91.9M
        for (r = t = p;; ++r) {
4088
91.9M
            if (*r == ',' || *r == 0) {
4089
1.95M
                if (v->n_allele == UINT16_MAX) {
4090
3
                    hts_log_error("Too many ALT alleles at %s:%"PRIhts_pos,
4091
3
                                  bcf_seqname_safe(h,v), v->pos+1);
4092
3
                    v->errcode |= BCF_ERR_LIMITS;
4093
3
                    goto err;
4094
3
                }
4095
1.95M
                bcf_enc_vchar(str, r - t, t);
4096
1.95M
                t = r + 1;
4097
1.95M
                ++v->n_allele;
4098
1.95M
            }
4099
91.9M
            if (r == q) break;
4100
91.9M
        }
4101
41.2k
    }
4102
4103
    // QUAL
4104
42.3k
    if (!(p = kstrtok(0, 0, &aux)))
4105
43
        goto err;
4106
42.2k
    *(q = (char*)aux.p) = 0;
4107
4108
42.2k
    if (NOT_DOT(p)) v->qual = atof(p);
4109
1.13k
    else bcf_float_set_missing(v->qual);
4110
42.2k
    if ( v->max_unpack && !(v->max_unpack>>1) ) goto end; // BCF_UN_STR
4111
4112
    // FILTER
4113
42.2k
    if (!(p = kstrtok(0, 0, &aux)))
4114
30
        goto err;
4115
42.2k
    *(q = (char*)aux.p) = 0;
4116
4117
42.2k
    if (NOT_DOT(p)) {
4118
38.9k
        if (vcf_parse_filter(str, h, v, p, q)) {
4119
36
            goto err;
4120
36
        }
4121
38.9k
    } else bcf_enc_vint(str, 0, 0, -1);
4122
42.2k
    if ( v->max_unpack && !(v->max_unpack>>2) ) goto end; // BCF_UN_FLT
4123
4124
    // INFO
4125
42.2k
    if (!(p = kstrtok(0, 0, &aux)))
4126
42
        goto err;
4127
42.1k
    *(q = (char*)aux.p) = 0;
4128
4129
42.1k
    if (NOT_DOT(p)) {
4130
41.6k
        if (vcf_parse_info(str, h, v, p, q)) {
4131
56
            goto err;
4132
56
        }
4133
41.6k
    }
4134
42.1k
    if ( v->max_unpack && !(v->max_unpack>>3) ) goto end;
4135
4136
    // FORMAT; optional
4137
42.1k
    p = kstrtok(0, 0, &aux);
4138
42.1k
    if (p) {
4139
37.7k
        *(q = (char*)aux.p) = 0;
4140
4141
37.7k
        if (vcf_parse_format(s, h, v, p, q)) {
4142
103
            goto err;
4143
103
        }
4144
37.7k
    }
4145
4146
42.0k
 end:
4147
42.0k
    v->rlen = get_rlen(h, v);    //set rlen based on version
4148
42.0k
    ret = 0;
4149
4150
42.7k
 err:
4151
42.7k
    return ret;
4152
42.0k
}
4153
4154
int vcf_open_mode(char *mode, const char *fn, const char *format)
4155
0
{
4156
0
    if (format == NULL) {
4157
        // Try to pick a format based on the filename extension
4158
0
        char extension[HTS_MAX_EXT_LEN];
4159
0
        if (find_file_extension(fn, extension) < 0) return -1;
4160
0
        return vcf_open_mode(mode, fn, extension);
4161
0
    }
4162
0
    else if (strcasecmp(format, "bcf") == 0) strcpy(mode, "b");
4163
0
    else if (strcasecmp(format, "vcf") == 0) strcpy(mode, "");
4164
0
    else if (strcasecmp(format, "vcf.gz") == 0 || strcasecmp(format, "vcf.bgz") == 0) strcpy(mode, "z");
4165
0
    else return -1;
4166
4167
0
    return 0;
4168
0
}
4169
4170
int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
4171
43.1k
{
4172
43.1k
    int ret;
4173
43.1k
    ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
4174
43.1k
    if (ret < 0) return ret;
4175
42.7k
    return vcf_parse1(&fp->line, h, v);
4176
43.1k
}
4177
4178
static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt)
4179
0
{
4180
0
    uint8_t *ptr_start = ptr;
4181
0
    fmt->id = bcf_dec_typed_int1(ptr, &ptr);
4182
0
    fmt->n = bcf_dec_size(ptr, &ptr, &fmt->type);
4183
0
    fmt->size = fmt->n << bcf_type_shift[fmt->type];
4184
0
    fmt->p = ptr;
4185
0
    fmt->p_off  = ptr - ptr_start;
4186
0
    fmt->p_free = 0;
4187
0
    ptr += n_sample * fmt->size;
4188
0
    fmt->p_len = ptr - fmt->p;
4189
0
    return ptr;
4190
0
}
4191
4192
static inline uint8_t *bcf_unpack_info_core1(uint8_t *ptr, bcf_info_t *info)
4193
693
{
4194
693
    uint8_t *ptr_start = ptr;
4195
693
    int64_t len = 0;
4196
693
    info->key = bcf_dec_typed_int1(ptr, &ptr);
4197
693
    len = info->len = bcf_dec_size(ptr, &ptr, &info->type);
4198
693
    info->vptr = ptr;
4199
693
    info->vptr_off  = ptr - ptr_start;
4200
693
    info->vptr_free = 0;
4201
693
    info->v1.i = 0;
4202
693
    if (info->len == 1) {
4203
134
        switch(info->type) {
4204
0
        case BCF_BT_INT8:
4205
134
        case BCF_BT_CHAR:
4206
134
            info->v1.i = *(int8_t*)ptr;
4207
134
            break;
4208
0
        case BCF_BT_INT16:
4209
0
            info->v1.i = le_to_i16(ptr);
4210
0
            len <<= 1;
4211
0
            break;
4212
0
        case BCF_BT_INT32:
4213
0
            info->v1.i = le_to_i32(ptr);
4214
0
            len <<= 2;
4215
0
            break;
4216
0
        case BCF_BT_FLOAT:
4217
0
            info->v1.f = le_to_float(ptr);
4218
0
            len <<= 2;
4219
0
            break;
4220
0
        case BCF_BT_INT64:
4221
0
            info->v1.i = le_to_i64(ptr);
4222
0
            len <<= 3;
4223
0
            break;
4224
134
        }
4225
559
    } else {
4226
559
        len <<= bcf_type_shift[info->type];
4227
559
    }
4228
693
    ptr += len;
4229
4230
693
    info->vptr_len = ptr - info->vptr;
4231
693
    return ptr;
4232
693
}
4233
4234
int bcf_unpack(bcf1_t *b, int which)
4235
41.0k
{
4236
41.0k
    if ( !b->shared.l ) return 0; // Building a new BCF record from scratch
4237
41.0k
    uint8_t *ptr = (uint8_t*)b->shared.s, *ptr_ori;
4238
41.0k
    int i;
4239
41.0k
    bcf_dec_t *d = &b->d;
4240
41.0k
    if (which & BCF_UN_FLT) which |= BCF_UN_STR;
4241
41.0k
    if (which & BCF_UN_INFO) which |= BCF_UN_SHR;
4242
41.0k
    if ((which&BCF_UN_STR) && !(b->unpacked&BCF_UN_STR))
4243
41.0k
    {
4244
41.0k
        kstring_t tmp;
4245
4246
        // ID
4247
41.0k
        tmp.l = 0; tmp.s = d->id; tmp.m = d->m_id;
4248
41.0k
        ptr_ori = ptr;
4249
41.0k
        ptr = bcf_fmt_sized_array(&tmp, ptr);
4250
41.0k
        b->unpack_size[0] = ptr - ptr_ori;
4251
41.0k
        kputc_('\0', &tmp);
4252
41.0k
        d->id = tmp.s; d->m_id = tmp.m;
4253
4254
        // REF and ALT are in a single block (d->als) and d->alleles are pointers into this block
4255
41.0k
        hts_expand(char*, b->n_allele, d->m_allele, d->allele); // NM: hts_expand() is a macro
4256
41.0k
        tmp.l = 0; tmp.s = d->als; tmp.m = d->m_als;
4257
41.0k
        ptr_ori = ptr;
4258
1.19M
        for (i = 0; i < b->n_allele; ++i) {
4259
            // Use offset within tmp.s as realloc may change pointer
4260
1.15M
            d->allele[i] = (char *)(intptr_t)tmp.l;
4261
1.15M
            ptr = bcf_fmt_sized_array(&tmp, ptr);
4262
1.15M
            kputc_('\0', &tmp);
4263
1.15M
        }
4264
41.0k
        b->unpack_size[1] = ptr - ptr_ori;
4265
41.0k
        d->als = tmp.s; d->m_als = tmp.m;
4266
4267
        // Convert our offsets within tmp.s back to pointers again
4268
1.19M
        for (i = 0; i < b->n_allele; ++i)
4269
1.15M
            d->allele[i] = d->als + (ptrdiff_t)d->allele[i];
4270
41.0k
        b->unpacked |= BCF_UN_STR;
4271
41.0k
    }
4272
41.0k
    if ((which&BCF_UN_FLT) && !(b->unpacked&BCF_UN_FLT)) { // FILTER
4273
41.0k
        ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1];
4274
41.0k
        ptr_ori = ptr;
4275
41.0k
        if (*ptr>>4) {
4276
37.8k
            int type;
4277
37.8k
            d->n_flt = bcf_dec_size(ptr, &ptr, &type);
4278
37.8k
            hts_expand(int, d->n_flt, d->m_flt, d->flt);
4279
583k
            for (i = 0; i < d->n_flt; ++i)
4280
545k
                d->flt[i] = bcf_dec_int1(ptr, type, &ptr);
4281
37.8k
        } else ++ptr, d->n_flt = 0;
4282
41.0k
        b->unpack_size[2] = ptr - ptr_ori;
4283
41.0k
        b->unpacked |= BCF_UN_FLT;
4284
41.0k
    }
4285
41.0k
    if ((which&BCF_UN_INFO) && !(b->unpacked&BCF_UN_INFO)) { // INFO
4286
0
        ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1] + b->unpack_size[2];
4287
0
        hts_expand(bcf_info_t, b->n_info, d->m_info, d->info);
4288
0
        for (i = 0; i < d->m_info; ++i) d->info[i].vptr_free = 0;
4289
0
        for (i = 0; i < b->n_info; ++i)
4290
0
            ptr = bcf_unpack_info_core1(ptr, &d->info[i]);
4291
0
        b->unpacked |= BCF_UN_INFO;
4292
0
    }
4293
41.0k
    if ((which&BCF_UN_FMT) && b->n_sample && !(b->unpacked&BCF_UN_FMT)) { // FORMAT
4294
0
        ptr = (uint8_t*)b->indiv.s;
4295
0
        hts_expand(bcf_fmt_t, b->n_fmt, d->m_fmt, d->fmt);
4296
0
        for (i = 0; i < d->m_fmt; ++i) d->fmt[i].p_free = 0;
4297
0
        for (i = 0; i < b->n_fmt; ++i)
4298
0
            ptr = bcf_unpack_fmt_core1(ptr, b->n_sample, &d->fmt[i]);
4299
0
        b->unpacked |= BCF_UN_FMT;
4300
0
    }
4301
41.0k
    return 0;
4302
41.0k
}
4303
4304
int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
4305
41.0k
{
4306
41.0k
    int i;
4307
41.0k
    int32_t max_dt_id = h->n[BCF_DT_ID];
4308
41.0k
    const char *chrom = bcf_seqname(h, v);
4309
41.0k
    if (!chrom) {
4310
0
        hts_log_error("Invalid BCF, CONTIG id=%d not present in the header",
4311
0
                      v->rid);
4312
0
        errno = EINVAL;
4313
0
        return -1;
4314
0
    }
4315
4316
41.0k
    bcf_unpack((bcf1_t*)v, BCF_UN_ALL & ~(BCF_UN_INFO|BCF_UN_FMT));
4317
4318
    // Cache of key lengths so we don't keep repeatedly using them.
4319
    // This assumes we're not modifying the header between successive calls
4320
    // to vcf_format, but that would lead to many other forms of breakage
4321
    // so it feels like a valid assumption to make.
4322
    //
4323
    // We cannot just do this in bcf_hdr_sync as some code (eg bcftools
4324
    // annotate) manipulates the headers directly without calling sync to
4325
    // refresh the data structures.  So we must do just-in-time length
4326
    // calculation during writes instead.
4327
41.0k
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
4328
41.0k
    if (!aux->key_len) {
4329
3.57k
        if (!(aux->key_len = calloc(h->n[BCF_DT_ID]+1, sizeof(*aux->key_len))))
4330
0
            return -1;
4331
3.57k
    }
4332
41.0k
    size_t *key_len = aux->key_len;
4333
4334
41.0k
    kputs(chrom, s); // CHROM
4335
41.0k
    kputc_('\t', s); kputll(v->pos + 1, s); // POS
4336
41.0k
    kputc_('\t', s); kputs(v->d.id ? v->d.id : ".", s); // ID
4337
41.0k
    kputc_('\t', s); // REF
4338
41.0k
    if (v->n_allele > 0) kputs(v->d.allele[0], s);
4339
0
    else kputc_('.', s);
4340
41.0k
    kputc_('\t', s); // ALT
4341
41.0k
    if (v->n_allele > 1) {
4342
1.14M
        for (i = 1; i < v->n_allele; ++i) {
4343
1.10M
            if (i > 1) kputc_(',', s);
4344
1.10M
            kputs(v->d.allele[i], s);
4345
1.10M
        }
4346
39.9k
    } else kputc_('.', s);
4347
41.0k
    kputc_('\t', s); // QUAL
4348
41.0k
    if ( bcf_float_is_missing(v->qual) ) kputc_('.', s); // QUAL
4349
39.9k
    else kputd(v->qual, s);
4350
41.0k
    kputc_('\t', s); // FILTER
4351
41.0k
    if (v->d.n_flt) {
4352
583k
        for (i = 0; i < v->d.n_flt; ++i) {
4353
545k
            int32_t idx = v->d.flt[i];
4354
545k
            if (idx < 0 || idx >= max_dt_id
4355
545k
                || h->id[BCF_DT_ID][idx].key == NULL) {
4356
0
                hts_log_error("Invalid BCF, the FILTER tag id=%d at %s:%"PRIhts_pos" not present in the header",
4357
0
                              idx, bcf_seqname_safe(h, v), v->pos + 1);
4358
0
                errno = EINVAL;
4359
0
                return -1;
4360
0
            }
4361
545k
            if (i) kputc_(';', s);
4362
545k
            if (!key_len[idx])
4363
64.2k
                key_len[idx] = strlen(h->id[BCF_DT_ID][idx].key);
4364
545k
            kputsn(h->id[BCF_DT_ID][idx].key, key_len[idx], s);
4365
545k
        }
4366
37.8k
    } else kputc_('.', s);
4367
4368
41.0k
    kputc_('\t', s); // INFO
4369
41.0k
    if (v->n_info) {
4370
14.7k
        uint8_t *ptr = v->shared.s
4371
14.7k
            ? (uint8_t *)v->shared.s + v->unpack_size[0] +
4372
14.7k
               v->unpack_size[1] + v->unpack_size[2]
4373
14.7k
            : NULL;
4374
14.7k
        int first = 1;
4375
14.7k
        bcf_info_t *info = v->d.info;
4376
4377
        // Note if we duplicate this code into custom packed and unpacked
4378
        // implementations then we gain a bit more speed, particularly with
4379
        // clang 13 (up to 5%).  Not sure why this is, but code duplication
4380
        // isn't pleasant and it's still faster adding packed support than
4381
        // not so it's a win, just not as good as it should be.
4382
14.7k
        const int info_packed = !(v->unpacked & BCF_UN_INFO) && v->shared.l;
4383
1.39M
        for (i = 0; i < v->n_info; ++i) {
4384
1.38M
            bcf_info_t in, *z;
4385
1.38M
            if (info_packed) {
4386
                // Use a local bcf_info_t when data is packed
4387
1.38M
                z = &in;
4388
1.38M
                z->key  = bcf_dec_typed_int1(ptr, &ptr);
4389
1.38M
                z->len  = bcf_dec_size(ptr, &ptr, &z->type);
4390
1.38M
                z->vptr = ptr;
4391
1.38M
                ptr += z->len << bcf_type_shift[z->type];
4392
1.38M
            } else {
4393
                // Else previously unpacked INFO struct
4394
0
                z = &info[i];
4395
4396
                // Also potentially since deleted
4397
0
                if ( !z->vptr ) continue;
4398
0
            }
4399
4400
1.38M
            bcf_idpair_t *id = z->key >= 0 && z->key < max_dt_id
4401
1.38M
                ? &h->id[BCF_DT_ID][z->key]
4402
1.38M
                : NULL;
4403
4404
1.38M
            if (!id || !id->key) {
4405
0
                hts_log_error("Invalid BCF, the INFO tag id=%d is %s at %s:%"PRIhts_pos,
4406
0
                              z->key,
4407
0
                              z->key < 0 ? "negative"
4408
0
                              : (z->key >= max_dt_id ? "too large" : "not present in the header"),
4409
0
                              bcf_seqname_safe(h, v), v->pos+1);
4410
0
                errno = EINVAL;
4411
0
                return -1;
4412
0
            }
4413
4414
            // KEY
4415
1.38M
            if (!key_len[z->key])
4416
20.7k
                key_len[z->key] = strlen(id->key);
4417
1.38M
            size_t id_len = key_len[z->key];
4418
1.38M
            if (ks_resize(s, s->l + 3 + id_len) < 0)
4419
0
                return -1;
4420
1.38M
            char *sptr = s->s + s->l;
4421
1.38M
            if ( !first ) {
4422
1.36M
                *sptr++ = ';';
4423
1.36M
                s->l++;
4424
1.36M
            }
4425
1.38M
            first = 0;
4426
1.38M
            memcpy(sptr, id->key, id_len);
4427
1.38M
            s->l += id_len;
4428
4429
            // VALUE
4430
1.38M
            if (z->len <= 0) continue;
4431
491k
            sptr[id_len] = '=';
4432
491k
            s->l++;
4433
4434
491k
            if (z->len != 1 || info_packed) {
4435
491k
                bcf_fmt_array(s, z->len, z->type, z->vptr);
4436
491k
            } else {
4437
                // Single length vectors are unpacked into their
4438
                // own info.v1 union and handled separately.
4439
0
                if (z->type == BCF_BT_FLOAT) {
4440
0
                    if ( bcf_float_is_missing(z->v1.f) )
4441
0
                        kputc_('.', s);
4442
0
                    else
4443
0
                        kputd(z->v1.f, s);
4444
0
                } else if (z->type == BCF_BT_CHAR) {
4445
0
                    kputc_(z->v1.i, s);
4446
0
                } else if (z->type < BCF_BT_INT64) {
4447
0
                    int64_t missing[] = {
4448
0
                        0, // BCF_BT_NULL
4449
0
                        bcf_int8_missing,
4450
0
                        bcf_int16_missing,
4451
0
                        bcf_int32_missing,
4452
0
                    };
4453
0
                    if (z->v1.i == missing[z->type])
4454
0
                        kputc_('.', s);
4455
0
                    else
4456
0
                        kputw(z->v1.i, s);
4457
0
                } else if (z->type == BCF_BT_INT64) {
4458
0
                    if (z->v1.i == bcf_int64_missing)
4459
0
                        kputc_('.', s);
4460
0
                    else
4461
0
                        kputll(z->v1.i, s);
4462
0
                } else {
4463
0
                    hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, z->type, bcf_seqname_safe(h, v), v->pos+1);
4464
0
                    errno = EINVAL;
4465
0
                    return -1;
4466
0
                }
4467
0
            }
4468
491k
        }
4469
14.7k
        if ( first ) kputc_('.', s);
4470
26.2k
    } else kputc_('.', s);
4471
4472
    // FORMAT and individual information
4473
41.0k
    if (v->n_sample) {
4474
17.8k
        int i,j;
4475
17.8k
        if ( v->n_fmt) {
4476
17.8k
            uint8_t *ptr = (uint8_t *)v->indiv.s;
4477
17.8k
            int gt_i = -1;
4478
17.8k
            bcf_fmt_t *fmt = v->d.fmt;
4479
17.8k
            int first = 1, ret = 0;
4480
17.8k
            int fmt_packed = !(v->unpacked & BCF_UN_FMT);
4481
4482
17.8k
            if (fmt_packed) {
4483
                // Local fmt as we have an array of num FORMAT keys,
4484
                // each of which points to N.Sample values.
4485
4486
                // No real gain to be had in handling unpacked data here,
4487
                // but it doesn't cost us much in complexity either and
4488
                // it gives us flexibility.
4489
17.8k
                fmt = malloc(v->n_fmt * sizeof(*fmt));
4490
17.8k
                if (!fmt)
4491
0
                    return -1;
4492
17.8k
            }
4493
4494
            // KEYS
4495
52.0k
            for (i = 0; i < (int)v->n_fmt; ++i) {
4496
34.1k
                bcf_fmt_t *z;
4497
34.1k
                z = &fmt[i];
4498
34.1k
                if (fmt_packed) {
4499
34.1k
                    z->id   = bcf_dec_typed_int1(ptr, &ptr);
4500
34.1k
                    z->n    = bcf_dec_size(ptr, &ptr, &z->type);
4501
34.1k
                    z->p    = ptr;
4502
34.1k
                    z->size = z->n << bcf_type_shift[z->type];
4503
34.1k
                    ptr += v->n_sample * z->size;
4504
34.1k
                }
4505
34.1k
                if ( !z->p ) continue;
4506
34.1k
                kputc_(!first ? ':' : '\t', s); first = 0;
4507
4508
34.1k
                bcf_idpair_t *id = z->id >= 0 && z->id < max_dt_id
4509
34.1k
                    ? &h->id[BCF_DT_ID][z->id]
4510
34.1k
                    : NULL;
4511
4512
34.1k
                if (!id || !id->key) {
4513
0
                    hts_log_error("Invalid BCF, the FORMAT tag id=%d at %s:%"PRIhts_pos" not present in the header", z->id, bcf_seqname_safe(h, v), v->pos+1);
4514
0
                    errno = EINVAL;
4515
0
                    if (fmt_packed)
4516
0
                        free(fmt);
4517
0
                    return -1;
4518
0
                }
4519
4520
34.1k
                if (!key_len[z->id])
4521
17.6k
                    key_len[z->id] = strlen(id->key);
4522
34.1k
                size_t id_len = key_len[z->id];
4523
34.1k
                kputsn(id->key, id_len, s);
4524
34.1k
                if (id_len == 2 && id->key[0] == 'G' && id->key[1] == 'T')
4525
4.98k
                    gt_i = i;
4526
34.1k
            }
4527
17.8k
            if ( first ) kputsn("\t.", 2, s);
4528
4529
            // VALUES per sample
4530
49.0k
            for (j = 0; j < v->n_sample; ++j) {
4531
31.2k
                kputc_('\t', s);
4532
31.2k
                first = 1;
4533
31.2k
                bcf_fmt_t *f = fmt;
4534
94.4k
                for (i = 0; i < (int)v->n_fmt; i++, f++) {
4535
77.0k
                    if ( !f->p ) continue;
4536
77.0k
                    if (!first) kputc_(':', s);
4537
77.0k
                    first = 0;
4538
77.0k
                    if (gt_i == i) {
4539
13.8k
                        if ((ret = bcf_format_gt_v2(h, f,j,s)) < 0) {
4540
0
                            hts_log_error("Failed to format GT value for sample %d, returned %d", i, ret);
4541
0
                            errno = EINVAL;
4542
0
                            if (fmt_packed)
4543
0
                                free(fmt);
4544
0
                            return -1;
4545
0
                        }
4546
13.8k
                        break;
4547
13.8k
                    }
4548
63.2k
                    else if (f->n == 1)
4549
4.59k
                        bcf_fmt_array1(s, f->type, f->p + j * (size_t)f->size);
4550
58.6k
                    else
4551
58.6k
                        bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size);
4552
77.0k
                }
4553
4554
                // Simpler loop post GT and at least 1 iteration
4555
45.6k
                for (i++, f++; i < (int)v->n_fmt; i++, f++) {
4556
14.4k
                    if ( !f->p ) continue;
4557
14.4k
                    kputc_(':', s);
4558
14.4k
                    if (f->n == 1)
4559
1.71k
                        bcf_fmt_array1(s, f->type, f->p + j * (size_t)f->size);
4560
12.7k
                    else
4561
12.7k
                        bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size);
4562
14.4k
                }
4563
31.2k
                if ( first ) kputc_('.', s);
4564
31.2k
            }
4565
17.8k
            if (fmt_packed)
4566
17.8k
                free(fmt);
4567
17.8k
        }
4568
7
        else
4569
21
            for (j=0; j<=v->n_sample; j++)
4570
14
                kputsn("\t.", 2, s);
4571
17.8k
    }
4572
41.0k
    kputc('\n', s);
4573
41.0k
    return 0;
4574
41.0k
}
4575
4576
int vcf_write_line(htsFile *fp, kstring_t *line)
4577
0
{
4578
0
    int ret;
4579
0
    if ( line->s[line->l-1]!='\n' ) kputc('\n',line);
4580
0
    if ( fp->format.compression!=no_compression )
4581
0
        ret = bgzf_write(fp->fp.bgzf, line->s, line->l);
4582
0
    else
4583
0
        ret = hwrite(fp->fp.hfile, line->s, line->l);
4584
0
    return ret==line->l ? 0 : -1;
4585
0
}
4586
4587
int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
4588
41.0k
{
4589
41.0k
    ssize_t ret;
4590
41.0k
    fp->line.l = 0;
4591
41.0k
    if (vcf_format1(h, v, &fp->line) != 0)
4592
0
        return -1;
4593
41.0k
    if ( fp->format.compression!=no_compression ) {
4594
0
        if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
4595
0
            return -1;
4596
0
        if (fp->idx && !fp->fp.bgzf->mt)
4597
0
            hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf));
4598
0
        ret = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
4599
41.0k
    } else {
4600
41.0k
        ret = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
4601
41.0k
    }
4602
4603
41.0k
    if (fp->idx && fp->format.compression == bgzf) {
4604
0
        int tid;
4605
0
        if ((tid = hts_idx_tbi_name(fp->idx, v->rid, bcf_seqname_safe(h, v))) < 0)
4606
0
            return -1;
4607
4608
0
        if (bgzf_idx_push(fp->fp.bgzf, fp->idx,
4609
0
                          tid, v->pos, v->pos + v->rlen,
4610
0
                          bgzf_tell(fp->fp.bgzf), 1) < 0)
4611
0
            return -1;
4612
0
    }
4613
4614
41.0k
    return ret==fp->line.l ? 0 : -1;
4615
41.0k
}
4616
4617
/************************
4618
 * Data access routines *
4619
 ************************/
4620
4621
int bcf_hdr_id2int(const bcf_hdr_t *h, int which, const char *id)
4622
152k
{
4623
152k
    khint_t k;
4624
152k
    vdict_t *d = (vdict_t*)h->dict[which];
4625
152k
    k = kh_get(vdict, d, id);
4626
152k
    return k == kh_end(d)? -1 : kh_val(d, k).id;
4627
152k
}
4628
4629
4630
/********************
4631
 *** BCF indexing ***
4632
 ********************/
4633
4634
// Calculate number of index levels given min_shift and the header contig
4635
// list.  Also returns number of contigs in *nids_out.
4636
static int idx_calc_n_lvls_ids(const bcf_hdr_t *h, int min_shift,
4637
                               int starting_n_lvls, int *nids_out)
4638
0
{
4639
0
    int n_lvls, i, nids = 0;
4640
0
    int64_t max_len = 0, s;
4641
4642
0
    for (i = 0; i < h->n[BCF_DT_CTG]; ++i)
4643
0
    {
4644
0
        if ( !h->id[BCF_DT_CTG][i].val ) continue;
4645
0
        if ( max_len < h->id[BCF_DT_CTG][i].val->info[0] )
4646
0
            max_len = h->id[BCF_DT_CTG][i].val->info[0];
4647
0
        nids++;
4648
0
    }
4649
0
    if ( !max_len ) max_len = (1LL<<31) - 1;  // In case contig line is broken.
4650
0
    max_len += 256;
4651
0
    s = hts_bin_maxpos(min_shift, starting_n_lvls);
4652
0
    for (n_lvls = starting_n_lvls; max_len > s; ++n_lvls, s <<= 3);
4653
4654
0
    if (nids_out) *nids_out = nids;
4655
0
    return n_lvls;
4656
0
}
4657
4658
hts_idx_t *bcf_index(htsFile *fp, int min_shift)
4659
0
{
4660
0
    int n_lvls;
4661
0
    bcf1_t *b = NULL;
4662
0
    hts_idx_t *idx = NULL;
4663
0
    bcf_hdr_t *h;
4664
0
    int r;
4665
0
    h = bcf_hdr_read(fp);
4666
0
    if ( !h ) return NULL;
4667
0
    int nids = 0;
4668
0
    n_lvls = idx_calc_n_lvls_ids(h, min_shift, 0, &nids);
4669
0
    idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
4670
0
    if (!idx) goto fail;
4671
0
    b = bcf_init1();
4672
0
    if (!b) goto fail;
4673
0
    while ((r = bcf_read1(fp,h, b)) >= 0) {
4674
0
        int ret;
4675
0
        ret = hts_idx_push(idx, b->rid, b->pos, b->pos + b->rlen, bgzf_tell(fp->fp.bgzf), 1);
4676
0
        if (ret < 0) goto fail;
4677
0
    }
4678
0
    if (r < -1) goto fail;
4679
0
    hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf));
4680
0
    bcf_destroy1(b);
4681
0
    bcf_hdr_destroy(h);
4682
0
    return idx;
4683
4684
0
 fail:
4685
0
    hts_idx_destroy(idx);
4686
0
    bcf_destroy1(b);
4687
0
    bcf_hdr_destroy(h);
4688
0
    return NULL;
4689
0
}
4690
4691
hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx)
4692
0
{
4693
0
    return fnidx? hts_idx_load2(fn, fnidx) : bcf_index_load(fn);
4694
0
}
4695
4696
hts_idx_t *bcf_index_load3(const char *fn, const char *fnidx, int flags)
4697
0
{
4698
0
    return hts_idx_load3(fn, fnidx, HTS_FMT_CSI, flags);
4699
0
}
4700
4701
int bcf_index_build3(const char *fn, const char *fnidx, int min_shift, int n_threads)
4702
0
{
4703
0
    htsFile *fp;
4704
0
    hts_idx_t *idx;
4705
0
    tbx_t *tbx;
4706
0
    int ret;
4707
0
    if ((fp = hts_open(fn, "rb")) == 0) return -2;
4708
0
    if (n_threads)
4709
0
        hts_set_threads(fp, n_threads);
4710
0
    if ( fp->format.compression!=bgzf ) { hts_close(fp); return -3; }
4711
0
    switch (fp->format.format) {
4712
0
        case bcf:
4713
0
            if (!min_shift) {
4714
0
                hts_log_error("TBI indices for BCF files are not supported");
4715
0
                ret = -1;
4716
0
            } else {
4717
0
                idx = bcf_index(fp, min_shift);
4718
0
                if (idx) {
4719
0
                    ret = hts_idx_save_as(idx, fn, fnidx, HTS_FMT_CSI);
4720
0
                    if (ret < 0) ret = -4;
4721
0
                    hts_idx_destroy(idx);
4722
0
                }
4723
0
                else ret = -1;
4724
0
            }
4725
0
            break;
4726
4727
0
        case vcf:
4728
0
            tbx = tbx_index(hts_get_bgzfp(fp), min_shift, &tbx_conf_vcf);
4729
0
            if (tbx) {
4730
0
                ret = hts_idx_save_as(tbx->idx, fn, fnidx, min_shift > 0 ? HTS_FMT_CSI : HTS_FMT_TBI);
4731
0
                if (ret < 0) ret = -4;
4732
0
                tbx_destroy(tbx);
4733
0
            }
4734
0
            else ret = -1;
4735
0
            break;
4736
4737
0
        default:
4738
0
            ret = -3;
4739
0
            break;
4740
0
    }
4741
0
    hts_close(fp);
4742
0
    return ret;
4743
0
}
4744
4745
int bcf_index_build2(const char *fn, const char *fnidx, int min_shift)
4746
0
{
4747
0
    return bcf_index_build3(fn, fnidx, min_shift, 0);
4748
0
}
4749
4750
int bcf_index_build(const char *fn, int min_shift)
4751
0
{
4752
0
    return bcf_index_build3(fn, NULL, min_shift, 0);
4753
0
}
4754
4755
// Initialise fp->idx for the current format type.
4756
// This must be called after the header has been written but no other data.
4757
0
static int vcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) {
4758
0
    int n_lvls, fmt;
4759
4760
0
    if (min_shift == 0) {
4761
0
        min_shift = 14;
4762
0
        n_lvls = 5;
4763
0
        fmt = HTS_FMT_TBI;
4764
0
    } else {
4765
        // Set initial n_lvls to match tbx_index()
4766
0
        int starting_n_lvls = (TBX_MAX_SHIFT - min_shift + 2) / 3;
4767
        // Increase if necessary
4768
0
        n_lvls = idx_calc_n_lvls_ids(h, min_shift, starting_n_lvls, NULL);
4769
0
        fmt = HTS_FMT_CSI;
4770
0
    }
4771
4772
0
    fp->idx = hts_idx_init(0, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
4773
0
    if (!fp->idx) return -1;
4774
4775
    // Tabix meta data, added even in CSI for VCF
4776
0
    uint8_t conf[4*7];
4777
0
    u32_to_le(TBX_VCF, conf+0);  // fmt
4778
0
    u32_to_le(1,       conf+4);  // name col
4779
0
    u32_to_le(2,       conf+8);  // beg col
4780
0
    u32_to_le(0,       conf+12); // end col
4781
0
    u32_to_le('#',     conf+16); // comment
4782
0
    u32_to_le(0,       conf+20); // n.skip
4783
0
    u32_to_le(0,       conf+24); // ref name len
4784
0
    if (hts_idx_set_meta(fp->idx, sizeof(conf)*sizeof(*conf), (uint8_t *)conf, 1) < 0) {
4785
0
        hts_idx_destroy(fp->idx);
4786
0
        fp->idx = NULL;
4787
0
        return -1;
4788
0
    }
4789
0
    fp->fnidx = fnidx;
4790
4791
0
    return 0;
4792
0
}
4793
4794
// Initialise fp->idx for the current format type.
4795
// This must be called after the header has been written but no other data.
4796
0
int bcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) {
4797
0
    int n_lvls, nids = 0;
4798
4799
0
    if (fp->format.compression != bgzf) {
4800
0
        hts_log_error("Indexing is only supported on BGZF-compressed files");
4801
0
        return -3; // Matches no-compression return for bcf_index_build3()
4802
0
    }
4803
4804
0
    if (fp->format.format == vcf)
4805
0
        return vcf_idx_init(fp, h, min_shift, fnidx);
4806
4807
0
    if (!min_shift)
4808
0
        min_shift = 14;
4809
4810
0
    n_lvls = idx_calc_n_lvls_ids(h, min_shift, 0, &nids);
4811
4812
0
    fp->idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
4813
0
    if (!fp->idx) return -1;
4814
0
    fp->fnidx = fnidx;
4815
4816
0
    return 0;
4817
0
}
4818
4819
// Finishes an index. Call after the last record has been written.
4820
// Returns 0 on success, <0 on failure.
4821
//
4822
// NB: same format as SAM/BAM as it uses bgzf.
4823
0
int bcf_idx_save(htsFile *fp) {
4824
0
    return sam_idx_save(fp);
4825
0
}
4826
4827
/*****************
4828
 *** Utilities ***
4829
 *****************/
4830
4831
int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src)
4832
0
{
4833
0
    int i, ndst_ori = dst->nhrec, need_sync = 0, ret = 0, res;
4834
0
    for (i=0; i<src->nhrec; i++)
4835
0
    {
4836
0
        if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value )
4837
0
        {
4838
0
            int j;
4839
0
            for (j=0; j<ndst_ori; j++)
4840
0
            {
4841
0
                if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
4842
4843
                // Checking only the key part of generic lines, otherwise
4844
                // the VCFs are too verbose. Should we perhaps add a flag
4845
                // to bcf_hdr_combine() and make this optional?
4846
0
                if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break;
4847
0
            }
4848
0
            if ( j>=ndst_ori ) {
4849
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4850
0
                if (res < 0) return -1;
4851
0
                need_sync += res;
4852
0
            }
4853
0
        }
4854
0
        else if ( src->hrec[i]->type==BCF_HL_STR )
4855
0
        {
4856
            // NB: we are ignoring fields without ID
4857
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4858
0
            if ( j>=0 )
4859
0
            {
4860
0
                bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key);
4861
0
                if ( !rec ) {
4862
0
                    res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4863
0
                    if (res < 0) return -1;
4864
0
                    need_sync += res;
4865
0
                }
4866
0
            }
4867
0
        }
4868
0
        else
4869
0
        {
4870
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4871
0
            assert( j>=0 ); // this should always be true for valid VCFs
4872
4873
0
            bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL);
4874
0
            if ( !rec ) {
4875
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4876
0
                if (res < 0) return -1;
4877
0
                need_sync += res;
4878
0
            } else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT )
4879
0
            {
4880
                // Check that both records are of the same type. The bcf_hdr_id2length
4881
                // macro cannot be used here because dst header is not synced yet.
4882
0
                vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID];
4883
0
                vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID];
4884
0
                khint_t k_src  = kh_get(vdict, d_src, src->hrec[i]->vals[0]);
4885
0
                khint_t k_dst  = kh_get(vdict, d_dst, src->hrec[i]->vals[0]);
4886
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) )
4887
0
                {
4888
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths",
4889
0
                        src->hrec[i]->vals[0]);
4890
0
                    ret |= 1;
4891
0
                }
4892
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) )
4893
0
                {
4894
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different types",
4895
0
                        src->hrec[i]->vals[0]);
4896
0
                    ret |= 1;
4897
0
                }
4898
0
            }
4899
0
        }
4900
0
    }
4901
0
    if ( need_sync ) {
4902
0
        if (bcf_hdr_sync(dst) < 0) return -1;
4903
0
    }
4904
0
    return ret;
4905
0
}
4906
4907
bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src)
4908
0
{
4909
0
    if ( !dst )
4910
0
    {
4911
        // this will effectively strip existing IDX attributes from src to become dst
4912
0
        dst = bcf_hdr_init("r");
4913
0
        kstring_t htxt = {0,0,0};
4914
0
        if (bcf_hdr_format(src, 0, &htxt) < 0) {
4915
0
            free(htxt.s);
4916
0
            return NULL;
4917
0
        }
4918
0
        if ( bcf_hdr_parse(dst, htxt.s) < 0 ) {
4919
0
            bcf_hdr_destroy(dst);
4920
0
            dst = NULL;
4921
0
        }
4922
0
        free(htxt.s);
4923
0
        return dst;
4924
0
    }
4925
4926
0
    int i, ndst_ori = dst->nhrec, need_sync = 0, res;
4927
0
    for (i=0; i<src->nhrec; i++)
4928
0
    {
4929
0
        if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value )
4930
0
        {
4931
0
            int j;
4932
0
            for (j=0; j<ndst_ori; j++)
4933
0
            {
4934
0
                if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
4935
4936
                // Checking only the key part of generic lines, otherwise
4937
                // the VCFs are too verbose. Should we perhaps add a flag
4938
                // to bcf_hdr_combine() and make this optional?
4939
0
                if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break;
4940
0
            }
4941
0
            if ( j>=ndst_ori ) {
4942
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4943
0
                if (res < 0) return NULL;
4944
0
                need_sync += res;
4945
0
            }
4946
0
            else if ( !strcmp(src->hrec[i]->key,"fileformat") )
4947
0
            {
4948
0
                int ver_src = bcf_get_version(src,src->hrec[i]->value);
4949
0
                int ver_dst = bcf_get_version(dst,dst->hrec[j]->value);
4950
0
                if ( ver_src > ver_dst )
4951
0
                {
4952
0
                    if (bcf_hdr_set_version(dst,src->hrec[i]->value) < 0)
4953
0
                        return NULL;
4954
0
                    need_sync = 1;
4955
0
                }
4956
0
            }
4957
0
        }
4958
0
        else if ( src->hrec[i]->type==BCF_HL_STR )
4959
0
        {
4960
            // NB: we are ignoring fields without ID
4961
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4962
0
            if ( j>=0 )
4963
0
            {
4964
0
                bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key);
4965
0
                if ( !rec ) {
4966
0
                    res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4967
0
                    if (res < 0) return NULL;
4968
0
                    need_sync += res;
4969
0
                }
4970
0
            }
4971
0
        }
4972
0
        else
4973
0
        {
4974
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4975
0
            assert( j>=0 ); // this should always be true for valid VCFs
4976
4977
0
            bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL);
4978
0
            if ( !rec ) {
4979
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4980
0
                if (res < 0) return NULL;
4981
0
                need_sync += res;
4982
0
            } else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT )
4983
0
            {
4984
                // Check that both records are of the same type. The bcf_hdr_id2length
4985
                // macro cannot be used here because dst header is not synced yet.
4986
0
                vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID];
4987
0
                vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID];
4988
0
                khint_t k_src  = kh_get(vdict, d_src, src->hrec[i]->vals[0]);
4989
0
                khint_t k_dst  = kh_get(vdict, d_dst, src->hrec[i]->vals[0]);
4990
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) )
4991
0
                {
4992
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths",
4993
0
                        src->hrec[i]->vals[0]);
4994
0
                }
4995
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) )
4996
0
                {
4997
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different types",
4998
0
                        src->hrec[i]->vals[0]);
4999
0
                }
5000
0
            }
5001
0
        }
5002
0
    }
5003
0
    if ( need_sync ) {
5004
0
        if (bcf_hdr_sync(dst) < 0) return NULL;
5005
0
    }
5006
0
    return dst;
5007
0
}
5008
5009
int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *line)
5010
0
{
5011
0
    int i;
5012
0
    if ( line->errcode )
5013
0
    {
5014
0
        char errordescription[1024] = "";
5015
0
        hts_log_error("Unchecked error (%d %s) at %s:%"PRIhts_pos", exiting", line->errcode, bcf_strerror(line->errcode, errordescription, sizeof(errordescription)),  bcf_seqname_safe(src_hdr,line), line->pos+1);
5016
0
        exit(1);
5017
0
    }
5018
0
    if ( src_hdr->ntransl==-1 ) return 0;    // no need to translate, all tags have the same id
5019
0
    if ( !src_hdr->ntransl )  // called for the first time, see what needs translating
5020
0
    {
5021
0
        int dict;
5022
0
        for (dict=0; dict<2; dict++)    // BCF_DT_ID and BCF_DT_CTG
5023
0
        {
5024
0
            src_hdr->transl[dict] = (int*) malloc(src_hdr->n[dict]*sizeof(int));
5025
0
            for (i=0; i<src_hdr->n[dict]; i++)
5026
0
            {
5027
0
                if ( !src_hdr->id[dict][i].key ) // gap left after removed BCF header lines
5028
0
                {
5029
0
                    src_hdr->transl[dict][i] = -1;
5030
0
                    continue;
5031
0
                }
5032
0
                src_hdr->transl[dict][i] = bcf_hdr_id2int(dst_hdr,dict,src_hdr->id[dict][i].key);
5033
0
                if ( src_hdr->transl[dict][i]!=-1 && i!=src_hdr->transl[dict][i] ) src_hdr->ntransl++;
5034
0
            }
5035
0
        }
5036
0
        if ( !src_hdr->ntransl )
5037
0
        {
5038
0
            free(src_hdr->transl[0]); src_hdr->transl[0] = NULL;
5039
0
            free(src_hdr->transl[1]); src_hdr->transl[1] = NULL;
5040
0
            src_hdr->ntransl = -1;
5041
0
        }
5042
0
        if ( src_hdr->ntransl==-1 ) return 0;
5043
0
    }
5044
0
    bcf_unpack(line,BCF_UN_ALL);
5045
5046
    // CHROM
5047
0
    if ( src_hdr->transl[BCF_DT_CTG][line->rid] >=0 ) line->rid = src_hdr->transl[BCF_DT_CTG][line->rid];
5048
5049
    // FILTER
5050
0
    for (i=0; i<line->d.n_flt; i++)
5051
0
    {
5052
0
        int src_id = line->d.flt[i];
5053
0
        if ( src_hdr->transl[BCF_DT_ID][src_id] >=0 )
5054
0
            line->d.flt[i] = src_hdr->transl[BCF_DT_ID][src_id];
5055
0
        line->d.shared_dirty |= BCF1_DIRTY_FLT;
5056
0
    }
5057
5058
    // INFO
5059
0
    for (i=0; i<line->n_info; i++)
5060
0
    {
5061
0
        int src_id = line->d.info[i].key;
5062
0
        int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
5063
0
        if ( dst_id<0 ) continue;
5064
0
        line->d.info[i].key = dst_id;
5065
0
        if ( !line->d.info[i].vptr ) continue;  // skip deleted
5066
0
        int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
5067
0
        int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
5068
0
        if ( src_size==dst_size )   // can overwrite
5069
0
        {
5070
0
            uint8_t *vptr = line->d.info[i].vptr - line->d.info[i].vptr_off;
5071
0
            if ( dst_size==BCF_BT_INT8 ) { vptr[1] = (uint8_t)dst_id; }
5072
0
            else if ( dst_size==BCF_BT_INT16 ) { *(uint16_t*)vptr = (uint16_t)dst_id; }
5073
0
            else { *(uint32_t*)vptr = (uint32_t)dst_id; }
5074
0
        }
5075
0
        else    // must realloc
5076
0
        {
5077
0
            bcf_info_t *info = &line->d.info[i];
5078
0
            kstring_t str = {0,0,0};
5079
0
            bcf_enc_int1(&str, dst_id);
5080
0
            bcf_enc_size(&str, info->len,info->type);
5081
0
            uint32_t vptr_off = str.l;
5082
0
            kputsn((char*)info->vptr, info->vptr_len, &str);
5083
0
            if( info->vptr_free ) free(info->vptr - info->vptr_off);
5084
0
            info->vptr_off = vptr_off;
5085
0
            info->vptr = (uint8_t*)str.s + info->vptr_off;
5086
0
            info->vptr_free = 1;
5087
0
            line->d.shared_dirty |= BCF1_DIRTY_INF;
5088
0
        }
5089
0
    }
5090
5091
    // FORMAT
5092
0
    for (i=0; i<line->n_fmt; i++)
5093
0
    {
5094
0
        int src_id = line->d.fmt[i].id;
5095
0
        int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
5096
0
        if ( dst_id<0 ) continue;
5097
0
        line->d.fmt[i].id = dst_id;
5098
0
        if( !line->d.fmt[i].p ) continue;  // skip deleted
5099
0
        int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
5100
0
        int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
5101
0
        if ( src_size==dst_size )   // can overwrite
5102
0
        {
5103
0
            uint8_t *p = line->d.fmt[i].p - line->d.fmt[i].p_off;    // pointer to the vector size (4bits) and BT type (4bits)
5104
0
            if ( dst_size==BCF_BT_INT8 ) { p[1] = dst_id; }
5105
0
            else if ( dst_size==BCF_BT_INT16 ) { i16_to_le(dst_id, p + 1); }
5106
0
            else { i32_to_le(dst_id, p + 1); }
5107
0
        }
5108
0
        else    // must realloc
5109
0
        {
5110
0
            bcf_fmt_t *fmt = &line->d.fmt[i];
5111
0
            kstring_t str = {0,0,0};
5112
0
            bcf_enc_int1(&str, dst_id);
5113
0
            bcf_enc_size(&str, fmt->n, fmt->type);
5114
0
            uint32_t p_off = str.l;
5115
0
            kputsn((char*)fmt->p, fmt->p_len, &str);
5116
0
            if( fmt->p_free ) free(fmt->p - fmt->p_off);
5117
0
            fmt->p_off = p_off;
5118
0
            fmt->p = (uint8_t*)str.s + fmt->p_off;
5119
0
            fmt->p_free = 1;
5120
0
            line->d.indiv_dirty = 1;
5121
0
        }
5122
0
    }
5123
0
    return 0;
5124
0
}
5125
5126
bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr)
5127
0
{
5128
0
    bcf_hdr_t *hout = bcf_hdr_init("r");
5129
0
    if (!hout) {
5130
0
        hts_log_error("Failed to allocate bcf header");
5131
0
        return NULL;
5132
0
    }
5133
0
    kstring_t htxt = {0,0,0};
5134
0
    if (bcf_hdr_format(hdr, 1, &htxt) < 0) {
5135
0
        free(htxt.s);
5136
0
        return NULL;
5137
0
    }
5138
0
    if ( bcf_hdr_parse(hout, htxt.s) < 0 ) {
5139
0
        bcf_hdr_destroy(hout);
5140
0
        hout = NULL;
5141
0
    }
5142
0
    free(htxt.s);
5143
0
    return hout;
5144
0
}
5145
5146
bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap)
5147
0
{
5148
0
    void *names_hash = khash_str2int_init();
5149
0
    kstring_t htxt = {0,0,0};
5150
0
    kstring_t str = {0,0,0};
5151
0
    bcf_hdr_t *h = bcf_hdr_init("w");
5152
0
    int r = 0;
5153
0
    if (!h || !names_hash) {
5154
0
        hts_log_error("Failed to allocate bcf header");
5155
0
        goto err;
5156
0
    }
5157
0
    if (bcf_hdr_format(h0, 1, &htxt) < 0) {
5158
0
        hts_log_error("Failed to get header text");
5159
0
        goto err;
5160
0
    }
5161
0
    bcf_hdr_set_version(h,bcf_hdr_get_version(h0));
5162
0
    int j;
5163
0
    for (j=0; j<n; j++) imap[j] = -1;
5164
0
    if ( bcf_hdr_nsamples(h0) > 0) {
5165
0
        char *p = find_chrom_header_line(htxt.s);
5166
0
        int i = 0, end = n? 8 : 7;
5167
0
        while ((p = strchr(p, '\t')) != 0 && i < end) ++i, ++p;
5168
0
        if (i != end) {
5169
0
            hts_log_error("Wrong number of columns in header #CHROM line");
5170
0
            goto err;
5171
0
        }
5172
0
        r |= kputsn(htxt.s, p - htxt.s, &str) < 0;
5173
0
        for (i = 0; i < n; ++i) {
5174
0
            if ( khash_str2int_has_key(names_hash,samples[i]) )
5175
0
            {
5176
0
                hts_log_error("Duplicate sample name \"%s\"", samples[i]);
5177
0
                goto err;
5178
0
            }
5179
0
            imap[i] = bcf_hdr_id2int(h0, BCF_DT_SAMPLE, samples[i]);
5180
0
            if (imap[i] < 0) continue;
5181
0
            r |= kputc('\t', &str) < 0;
5182
0
            r |= kputs(samples[i], &str) < 0;
5183
0
            r |= khash_str2int_inc(names_hash,samples[i]) < 0;
5184
0
        }
5185
0
    } else r |= kputsn(htxt.s, htxt.l, &str) < 0;
5186
0
    while (str.l && (!str.s[str.l-1] || str.s[str.l-1]=='\n') ) str.l--; // kill trailing zeros and newlines
5187
0
    r |= kputc('\n',&str) < 0;
5188
0
    if (r) {
5189
0
        hts_log_error("%s", strerror(errno));
5190
0
        goto err;
5191
0
    }
5192
0
    if ( bcf_hdr_parse(h, str.s) < 0 ) {
5193
0
        bcf_hdr_destroy(h);
5194
0
        h = NULL;
5195
0
    }
5196
0
    free(str.s);
5197
0
    free(htxt.s);
5198
0
    khash_str2int_destroy(names_hash);
5199
0
    return h;
5200
5201
0
 err:
5202
0
    ks_free(&str);
5203
0
    ks_free(&htxt);
5204
0
    khash_str2int_destroy(names_hash);
5205
0
    bcf_hdr_destroy(h);
5206
0
    return NULL;
5207
0
}
5208
5209
int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file)
5210
0
{
5211
0
    if ( samples && !strcmp("-",samples) ) return 0;            // keep all samples
5212
5213
0
    int i, narr = bit_array_size(bcf_hdr_nsamples(hdr));
5214
0
    hdr->keep_samples = (uint8_t*) calloc(narr,1);
5215
0
    if (!hdr->keep_samples) return -1;
5216
5217
0
    hdr->nsamples_ori = bcf_hdr_nsamples(hdr);
5218
0
    if ( !samples )
5219
0
    {
5220
        // exclude all samples
5221
0
        khint_t k;
5222
0
        vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE], *new_dict;
5223
0
        new_dict = kh_init(vdict);
5224
0
        if (!new_dict) return -1;
5225
5226
0
        bcf_hdr_nsamples(hdr) = 0;
5227
5228
0
        for (k = kh_begin(d); k != kh_end(d); ++k)
5229
0
            if (kh_exist(d, k)) free((char*)kh_key(d, k));
5230
0
        kh_destroy(vdict, d);
5231
0
        hdr->dict[BCF_DT_SAMPLE] = new_dict;
5232
0
        if (bcf_hdr_sync(hdr) < 0) return -1;
5233
5234
0
        return 0;
5235
0
    }
5236
5237
0
    if ( samples[0]=='^' )
5238
0
        for (i=0; i<bcf_hdr_nsamples(hdr); i++) bit_array_set(hdr->keep_samples,i);
5239
5240
0
    int idx, n, ret = 0;
5241
0
    char **smpls = hts_readlist(samples[0]=='^'?samples+1:samples, is_file, &n);
5242
0
    if ( !smpls ) return -1;
5243
0
    for (i=0; i<n; i++)
5244
0
    {
5245
0
        idx = bcf_hdr_id2int(hdr,BCF_DT_SAMPLE,smpls[i]);
5246
0
        if ( idx<0 )
5247
0
        {
5248
0
            if ( !ret ) ret = i+1;
5249
0
            continue;
5250
0
        }
5251
0
        assert( idx<bcf_hdr_nsamples(hdr) );
5252
0
        if (  samples[0]=='^' )
5253
0
            bit_array_clear(hdr->keep_samples, idx);
5254
0
        else
5255
0
            bit_array_set(hdr->keep_samples, idx);
5256
0
    }
5257
0
    for (i=0; i<n; i++) free(smpls[i]);
5258
0
    free(smpls);
5259
5260
0
    bcf_hdr_nsamples(hdr) = 0;
5261
0
    for (i=0; i<hdr->nsamples_ori; i++)
5262
0
        if ( bit_array_test(hdr->keep_samples,i) ) bcf_hdr_nsamples(hdr)++;
5263
5264
0
    if ( !bcf_hdr_nsamples(hdr) ) { free(hdr->keep_samples); hdr->keep_samples=NULL; }
5265
0
    else
5266
0
    {
5267
        // Make new list and dictionary with desired samples
5268
0
        char **samples = (char**) malloc(sizeof(char*)*bcf_hdr_nsamples(hdr));
5269
0
        vdict_t *new_dict, *d;
5270
0
        int k, res;
5271
0
        if (!samples) return -1;
5272
5273
0
        new_dict = kh_init(vdict);
5274
0
        if (!new_dict) {
5275
0
            free(samples);
5276
0
            return -1;
5277
0
        }
5278
0
        idx = 0;
5279
0
        for (i=0; i<hdr->nsamples_ori; i++) {
5280
0
            if ( bit_array_test(hdr->keep_samples,i) ) {
5281
0
                samples[idx] = hdr->samples[i];
5282
0
                k = kh_put(vdict, new_dict, hdr->samples[i], &res);
5283
0
                if (res < 0) {
5284
0
                    free(samples);
5285
0
                    kh_destroy(vdict, new_dict);
5286
0
                    return -1;
5287
0
                }
5288
0
                kh_val(new_dict, k) = bcf_idinfo_def;
5289
0
                kh_val(new_dict, k).id = idx;
5290
0
                idx++;
5291
0
            }
5292
0
        }
5293
5294
        // Delete desired samples from old dictionary, so we don't free them
5295
0
        d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE];
5296
0
        for (i=0; i < idx; i++) {
5297
0
            int k = kh_get(vdict, d, samples[i]);
5298
0
            if (k < kh_end(d)) kh_del(vdict, d, k);
5299
0
        }
5300
5301
        // Free everything else
5302
0
        for (k = kh_begin(d); k != kh_end(d); ++k)
5303
0
            if (kh_exist(d, k)) free((char*)kh_key(d, k));
5304
0
        kh_destroy(vdict, d);
5305
0
        hdr->dict[BCF_DT_SAMPLE] = new_dict;
5306
5307
0
        free(hdr->samples);
5308
0
        hdr->samples = samples;
5309
5310
0
        if (bcf_hdr_sync(hdr) < 0)
5311
0
            return -1;
5312
0
    }
5313
5314
0
    return ret;
5315
0
}
5316
5317
int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap)
5318
0
{
5319
0
    kstring_t ind;
5320
0
    ind.s = 0; ind.l = ind.m = 0;
5321
0
    if (n) {
5322
0
        bcf_fmt_t fmt[MAX_N_FMT];
5323
0
        int i, j;
5324
0
        uint8_t *ptr = (uint8_t*)v->indiv.s;
5325
0
        for (i = 0; i < v->n_fmt; ++i)
5326
0
            ptr = bcf_unpack_fmt_core1(ptr, v->n_sample, &fmt[i]);
5327
0
        for (i = 0; i < (int)v->n_fmt; ++i) {
5328
0
            bcf_fmt_t *f = &fmt[i];
5329
0
            bcf_enc_int1(&ind, f->id);
5330
0
            bcf_enc_size(&ind, f->n, f->type);
5331
0
            for (j = 0; j < n; ++j)
5332
0
                if (imap[j] >= 0) kputsn((char*)(f->p + imap[j] * f->size), f->size, &ind);
5333
0
        }
5334
0
        for (i = j = 0; j < n; ++j) if (imap[j] >= 0) ++i;
5335
0
        v->n_sample = i;
5336
0
    } else v->n_sample = 0;
5337
0
    if ( !v->n_sample ) v->n_fmt = 0;
5338
0
    free(v->indiv.s);
5339
0
    v->indiv = ind;
5340
0
    v->unpacked &= ~BCF_UN_FMT;    // only BCF is ready for output, VCF will need to unpack again
5341
0
    return 0;
5342
0
}
5343
5344
int bcf_is_snp(bcf1_t *v)
5345
0
{
5346
0
    int i;
5347
0
    bcf_unpack(v, BCF_UN_STR);
5348
0
    for (i = 0; i < v->n_allele; ++i)
5349
0
    {
5350
0
        if ( v->d.allele[i][1]==0 && v->d.allele[i][0]!='*' ) continue;
5351
5352
        // mpileup's <X> allele, see also below. This is not completely satisfactory,
5353
        // a general library is here narrowly tailored to fit samtools.
5354
0
        if ( v->d.allele[i][0]=='<' && v->d.allele[i][1]=='X' && v->d.allele[i][2]=='>' ) continue;
5355
0
        if ( v->d.allele[i][0]=='<' && v->d.allele[i][1]=='*' && v->d.allele[i][2]=='>' ) continue;
5356
5357
0
        break;
5358
0
    }
5359
0
    return i == v->n_allele;
5360
0
}
5361
5362
static void bcf_set_variant_type(const char *ref, const char *alt, bcf_variant_t *var)
5363
0
{
5364
0
    if ( *alt == '*' && !alt[1] ) { var->n = 0; var->type = VCF_OVERLAP; return; }  // overlapping variant
5365
5366
    // The most frequent case
5367
0
    if ( !ref[1] && !alt[1] )
5368
0
    {
5369
0
        if ( *alt == '.' || *ref==*alt ) { var->n = 0; var->type = VCF_REF; return; }
5370
0
        if ( *alt == 'X' ) { var->n = 0; var->type = VCF_REF; return; }  // mpileup's X allele shouldn't be treated as variant
5371
0
        var->n = 1; var->type = VCF_SNP; return;
5372
0
    }
5373
0
    if ( alt[0]=='<' )
5374
0
    {
5375
0
        if ( alt[1]=='X' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; }  // mpileup's X allele shouldn't be treated as variant
5376
0
        if ( alt[1]=='*' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; }
5377
0
        if ( !strcmp("NON_REF>",alt+1) ) { var->n = 0; var->type = VCF_REF; return; }
5378
0
        var->type = VCF_OTHER;
5379
0
        return;
5380
0
    }
5381
5382
    // Catch "joined before" breakend case
5383
0
    if ( alt[0]==']' || alt[0] == '[' )
5384
0
    {
5385
0
        var->type = VCF_BND; return;
5386
0
    }
5387
5388
    // Iterate through alt characters that match the reference
5389
0
    const char *r = ref, *a = alt;
5390
0
    while (*r && *a && toupper_c(*r)==toupper_c(*a) ) { r++; a++; }     // unfortunately, matching REF,ALT case is not guaranteed
5391
5392
0
    if ( *a && !*r )
5393
0
    {
5394
0
        while ( *a ) a++;
5395
0
        if ( *(a-1)==']' || *(a-1)=='[' ) { var->type = VCF_BND; return; } // "joined after" breakend
5396
0
        var->n = (a-alt)-(r-ref); var->type = VCF_INDEL | VCF_INS; return;
5397
0
    }
5398
0
    else if ( *r && !*a )
5399
0
    {
5400
0
        while ( *r ) r++;
5401
0
        var->n = (a-alt)-(r-ref); var->type = VCF_INDEL | VCF_DEL; return;
5402
0
    }
5403
0
    else if ( !*r && !*a )
5404
0
    {
5405
0
        var->n = 0; var->type = VCF_REF; return;
5406
0
    }
5407
5408
0
    const char *re = r, *ae = a;
5409
0
    while ( re[1] ) re++;
5410
0
    while ( ae[1] ) ae++;
5411
0
    if ( ae[0]==']' || ae[0]=='[' ) { var->type = VCF_BND; return; }    // "joined after" breakend
5412
0
    while ( re>r && ae>a && toupper_c(*re)==toupper_c(*ae) ) { re--; ae--; }
5413
0
    if ( ae==a )
5414
0
    {
5415
0
        if ( re==r ) { var->n = 1; var->type = VCF_SNP; return; }
5416
0
        var->n = -(re-r);
5417
0
        if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL | VCF_DEL; return; }
5418
0
        var->type = VCF_OTHER; return;
5419
0
    }
5420
0
    else if ( re==r )
5421
0
    {
5422
0
        var->n = ae-a;
5423
0
        if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL | VCF_INS; return; }
5424
0
        var->type = VCF_OTHER; return;
5425
0
    }
5426
5427
0
    var->type = ( re-r == ae-a ) ? VCF_MNP : VCF_OTHER;
5428
0
    var->n = ( re-r > ae-a ) ? -(re-r+1) : ae-a+1;
5429
5430
    // should do also complex events, SVs, etc...
5431
0
}
5432
5433
static int bcf_set_variant_types(bcf1_t *b)
5434
0
{
5435
0
    if ( !(b->unpacked & BCF_UN_STR) ) bcf_unpack(b, BCF_UN_STR);
5436
0
    bcf_dec_t *d = &b->d;
5437
0
    if ( d->n_var < b->n_allele )
5438
0
    {
5439
0
        bcf_variant_t *new_var = realloc(d->var, sizeof(bcf_variant_t)*b->n_allele);
5440
0
        if (!new_var)
5441
0
            return -1;
5442
0
        d->var = new_var;
5443
0
        d->n_var = b->n_allele;
5444
0
    }
5445
0
    int i;
5446
0
    b->d.var_type = 0;
5447
0
    d->var[0].type = VCF_REF;
5448
0
    d->var[0].n    = 0;
5449
0
    for (i=1; i<b->n_allele; i++)
5450
0
    {
5451
0
        bcf_set_variant_type(d->allele[0],d->allele[i], &d->var[i]);
5452
0
        b->d.var_type |= d->var[i].type;
5453
        //fprintf(stderr,"[set_variant_type] %d   %s %s -> %d %d .. %d\n", b->pos+1,d->allele[0],d->allele[i],d->var[i].type,d->var[i].n, b->d.var_type);
5454
0
    }
5455
0
    return 0;
5456
0
}
5457
5458
// bcf_get_variant_type/bcf_get_variant_types should only return the following,
5459
// to be compatible with callers that are not expecting newer values
5460
// like VCF_INS, VCF_DEL.  The full set is available from the newer
5461
// vcf_has_variant_type* interfaces.
5462
0
#define ORIG_VAR_TYPES (VCF_SNP|VCF_MNP|VCF_INDEL|VCF_OTHER|VCF_BND|VCF_OVERLAP)
5463
int bcf_get_variant_types(bcf1_t *rec)
5464
0
{
5465
0
    if ( rec->d.var_type==-1 ) {
5466
0
        if (bcf_set_variant_types(rec) != 0) {
5467
0
            hts_log_error("Couldn't get variant types: %s", strerror(errno));
5468
0
            exit(1); // Due to legacy API having no way to report failures
5469
0
        }
5470
0
    }
5471
0
    return rec->d.var_type & ORIG_VAR_TYPES;
5472
0
}
5473
5474
int bcf_get_variant_type(bcf1_t *rec, int ith_allele)
5475
0
{
5476
0
    if ( rec->d.var_type==-1 ) {
5477
0
        if (bcf_set_variant_types(rec) != 0) {
5478
0
            hts_log_error("Couldn't get variant types: %s", strerror(errno));
5479
0
            exit(1); // Due to legacy API having no way to report failures
5480
0
        }
5481
0
    }
5482
0
    if (ith_allele < 0 || ith_allele >= rec->n_allele) {
5483
0
        hts_log_error("Requested allele outside valid range");
5484
0
        exit(1);
5485
0
    }
5486
0
    return rec->d.var[ith_allele].type & ORIG_VAR_TYPES;
5487
0
}
5488
#undef ORIG_VAR_TYPES
5489
5490
int bcf_has_variant_type(bcf1_t *rec, int ith_allele, uint32_t bitmask)
5491
0
{
5492
0
    if ( rec->d.var_type==-1 ) {
5493
0
        if (bcf_set_variant_types(rec) != 0) return -1;
5494
0
    }
5495
0
    if (ith_allele < 0 || ith_allele >= rec->n_allele) return -1;
5496
0
    if (bitmask == VCF_REF) {  // VCF_REF is 0, so handled as a special case
5497
0
        return rec->d.var[ith_allele].type == VCF_REF;
5498
0
    }
5499
0
    return bitmask & rec->d.var[ith_allele].type;
5500
0
}
5501
5502
int bcf_variant_length(bcf1_t *rec, int ith_allele)
5503
0
{
5504
0
    if ( rec->d.var_type==-1 ) {
5505
0
        if (bcf_set_variant_types(rec) != 0) return bcf_int32_missing;
5506
0
    }
5507
0
    if (ith_allele < 0 || ith_allele >= rec->n_allele) return bcf_int32_missing;
5508
0
    return rec->d.var[ith_allele].n;
5509
0
}
5510
5511
int bcf_has_variant_types(bcf1_t *rec, uint32_t bitmask,
5512
                          enum bcf_variant_match mode)
5513
0
{
5514
0
    if ( rec->d.var_type==-1 ) {
5515
0
        if (bcf_set_variant_types(rec) != 0) return -1;
5516
0
    }
5517
0
    uint32_t type = rec->d.var_type;
5518
0
    if ( mode==bcf_match_overlap ) return bitmask & type;
5519
5520
    // VCF_INDEL is always set with VCF_INS and VCF_DEL by bcf_set_variant_type[s], but the bitmask may
5521
    // ask for say `VCF_INS` or `VCF_INDEL` only
5522
0
    if ( bitmask&(VCF_INS|VCF_DEL) && !(bitmask&VCF_INDEL) ) type &= ~VCF_INDEL;
5523
0
    else if ( bitmask&VCF_INDEL && !(bitmask&(VCF_INS|VCF_DEL)) ) type &= ~(VCF_INS|VCF_DEL);
5524
5525
0
    if ( mode==bcf_match_subset )
5526
0
    {
5527
0
        if ( ~bitmask & type ) return 0;
5528
0
        else return bitmask & type;
5529
0
    }
5530
    // mode == bcf_match_exact
5531
0
    if ( bitmask==VCF_REF ) return type==bitmask ? 1 : 0;
5532
0
    return type==bitmask ? type : 0;
5533
0
}
5534
5535
int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
5536
0
{
5537
0
    static int negative_rlen_warned = 0;
5538
0
    int is_end_tag, is_svlen_tag = 0;
5539
5540
    // Is the field already present?
5541
0
    int i, inf_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
5542
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,inf_id) ) return -1;    // No such INFO field in the header
5543
0
    if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
5544
5545
0
    is_end_tag = strcmp(key, "END") == 0;
5546
0
    is_svlen_tag = strcmp(key, "SVLEN") == 0;
5547
5548
0
    for (i=0; i<line->n_info; i++)
5549
0
        if ( inf_id==line->d.info[i].key ) break;
5550
0
    bcf_info_t *inf = i==line->n_info ? NULL : &line->d.info[i];
5551
5552
0
    if ( !n || (type==BCF_HT_STR && !values) )
5553
0
    {
5554
0
        if ( inf )
5555
0
        {
5556
            // Mark the tag for removal, free existing memory if necessary
5557
0
            if ( inf->vptr_free )
5558
0
            {
5559
0
                free(inf->vptr - inf->vptr_off);
5560
0
                inf->vptr_free = 0;
5561
0
            }
5562
0
            line->d.shared_dirty |= BCF1_DIRTY_INF;
5563
0
            inf->vptr = NULL;
5564
0
            inf->vptr_off = inf->vptr_len = 0;
5565
0
        }
5566
0
        if ( n==0 && (is_end_tag || is_svlen_tag) ) {
5567
0
            line->rlen = get_rlen(hdr, line);
5568
0
        }
5569
0
        return 0;
5570
0
    }
5571
5572
0
    if (is_end_tag)
5573
0
    {
5574
0
        if (n != 1)
5575
0
        {
5576
0
            hts_log_error("END info tag should only have one value at %s:%"PRIhts_pos, bcf_seqname_safe(hdr,line), line->pos+1);
5577
0
            line->errcode |= BCF_ERR_TAG_INVALID;
5578
0
            return -1;
5579
0
        }
5580
0
        if (type != BCF_HT_INT && type != BCF_HT_LONG)
5581
0
        {
5582
0
            hts_log_error("Wrong type (%d) for END info tag at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
5583
0
            line->errcode |= BCF_ERR_TAG_INVALID;
5584
0
            return -1;
5585
0
        }
5586
0
    }
5587
5588
    // Encode the values and determine the size required to accommodate the values
5589
0
    kstring_t str = {0,0,0};
5590
0
    bcf_enc_int1(&str, inf_id);
5591
0
    if ( type==BCF_HT_INT )
5592
0
        bcf_enc_vint(&str, n, (int32_t*)values, -1);
5593
0
    else if ( type==BCF_HT_REAL )
5594
0
        bcf_enc_vfloat(&str, n, (float*)values);
5595
0
    else if ( type==BCF_HT_FLAG || type==BCF_HT_STR )
5596
0
    {
5597
0
        if ( values==NULL )
5598
0
            bcf_enc_size(&str, 0, BCF_BT_NULL);
5599
0
        else
5600
0
            bcf_enc_vchar(&str, strlen((char*)values), (char*)values);
5601
0
    }
5602
#ifdef VCF_ALLOW_INT64
5603
    else if ( type==BCF_HT_LONG )
5604
    {
5605
        if (n != 1) {
5606
            hts_log_error("Only storing a single BCF_HT_LONG value is supported at %s:%"PRIhts_pos, bcf_seqname_safe(hdr,line), line->pos+1);
5607
            abort();
5608
        }
5609
        bcf_enc_long1(&str, *(int64_t *) values);
5610
    }
5611
#endif
5612
0
    else
5613
0
    {
5614
0
        hts_log_error("The type %d not implemented yet at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
5615
0
        abort();
5616
0
    }
5617
5618
    // Is the INFO tag already present
5619
0
    if ( inf )
5620
0
    {
5621
        // Is it big enough to accommodate new block?
5622
0
        if ( inf->vptr && str.l <= inf->vptr_len + inf->vptr_off )
5623
0
        {
5624
0
            if ( str.l != inf->vptr_len + inf->vptr_off ) line->d.shared_dirty |= BCF1_DIRTY_INF;
5625
0
            uint8_t *ptr = inf->vptr - inf->vptr_off;
5626
0
            memcpy(ptr, str.s, str.l);
5627
0
            free(str.s);
5628
0
            int vptr_free = inf->vptr_free;
5629
0
            bcf_unpack_info_core1(ptr, inf);
5630
0
            inf->vptr_free = vptr_free;
5631
0
        }
5632
0
        else
5633
0
        {
5634
0
            if ( inf->vptr_free )
5635
0
                free(inf->vptr - inf->vptr_off);
5636
0
            bcf_unpack_info_core1((uint8_t*)str.s, inf);
5637
0
            inf->vptr_free = 1;
5638
0
            line->d.shared_dirty |= BCF1_DIRTY_INF;
5639
0
        }
5640
0
    }
5641
0
    else
5642
0
    {
5643
        // The tag is not present, create new one
5644
0
        line->n_info++;
5645
0
        hts_expand0(bcf_info_t, line->n_info, line->d.m_info , line->d.info);
5646
0
        inf = &line->d.info[line->n_info-1];
5647
0
        bcf_unpack_info_core1((uint8_t*)str.s, inf);
5648
0
        inf->vptr_free = 1;
5649
0
        line->d.shared_dirty |= BCF1_DIRTY_INF;
5650
0
    }
5651
0
    line->unpacked |= BCF_UN_INFO;
5652
5653
0
   if ( n==1 && is_end_tag) {
5654
0
        hts_pos_t end = type == BCF_HT_INT ? *(int32_t *) values : *(int64_t *) values;
5655
0
        if ( (type == BCF_HT_INT && end!=bcf_int32_missing) || (type == BCF_HT_LONG && end!=bcf_int64_missing) )
5656
0
        {
5657
0
            if ( end <= line->pos )
5658
0
            {
5659
0
                if ( !negative_rlen_warned )
5660
0
                {
5661
0
                    hts_log_warning("INFO/END=%"PRIhts_pos" is smaller than POS at %s:%"PRIhts_pos,end,bcf_seqname_safe(hdr,line),line->pos+1);
5662
0
                    negative_rlen_warned = 1;
5663
0
                }
5664
0
            }
5665
0
        }
5666
0
    }
5667
0
    if (is_svlen_tag || is_end_tag) {
5668
0
        line->rlen = get_rlen(hdr, line);
5669
0
    }
5670
0
    return 0;
5671
0
}
5672
5673
int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n)
5674
0
{
5675
0
    if ( !n )
5676
0
        return bcf_update_format(hdr,line,key,NULL,0,BCF_HT_STR);
5677
5678
0
    int i, max_len = 0;
5679
0
    for (i=0; i<n; i++)
5680
0
    {
5681
0
        int len = strlen(values[i]);
5682
0
        if ( len > max_len ) max_len = len;
5683
0
    }
5684
0
    char *out = (char*) malloc(max_len*n);
5685
0
    if ( !out ) return -2;
5686
0
    for (i=0; i<n; i++)
5687
0
    {
5688
0
        char *dst = out+i*max_len;
5689
0
        const char *src = values[i];
5690
0
        int j = 0;
5691
0
        while ( src[j] ) { dst[j] = src[j]; j++; }
5692
0
        for (; j<max_len; j++) dst[j] = 0;
5693
0
    }
5694
0
    int ret = bcf_update_format(hdr,line,key,out,max_len*n,BCF_HT_STR);
5695
0
    free(out);
5696
0
    return ret;
5697
0
}
5698
5699
int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
5700
0
{
5701
    // Is the field already present?
5702
0
    int i, fmt_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
5703
0
    int is_len = 0;
5704
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,fmt_id) )
5705
0
    {
5706
0
        if ( !n ) return 0;
5707
0
        return -1;  // the key not present in the header
5708
0
    }
5709
5710
0
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
5711
5712
0
    for (i=0; i<line->n_fmt; i++)
5713
0
        if ( line->d.fmt[i].id==fmt_id ) break;
5714
0
    bcf_fmt_t *fmt = i==line->n_fmt ? NULL : &line->d.fmt[i];
5715
5716
0
    is_len = strcmp(key, "LEN") == 0;
5717
0
    if ( !n )
5718
0
    {
5719
0
        if ( fmt )
5720
0
        {
5721
            // Mark the tag for removal, free existing memory if necessary
5722
0
            if ( fmt->p_free )
5723
0
            {
5724
0
                free(fmt->p - fmt->p_off);
5725
0
                fmt->p_free = 0;
5726
0
            }
5727
0
            line->d.indiv_dirty = 1;
5728
0
            fmt->p = NULL;
5729
0
        }
5730
0
        if (is_len) {
5731
0
            line->rlen = get_rlen(hdr, line);
5732
0
        }
5733
0
        return 0;
5734
0
    }
5735
5736
0
    line->n_sample = bcf_hdr_nsamples(hdr);
5737
0
    int nps = n / line->n_sample;  // number of values per sample
5738
0
    assert( nps && nps*line->n_sample==n );     // must be divisible by n_sample
5739
5740
    // Encode the values and determine the size required to accommodate the values
5741
0
    kstring_t str = {0,0,0};
5742
0
    bcf_enc_int1(&str, fmt_id);
5743
0
    assert(values != NULL);
5744
0
    if ( type==BCF_HT_INT )
5745
0
        bcf_enc_vint(&str, n, (int32_t*)values, nps);
5746
0
    else if ( type==BCF_HT_REAL )
5747
0
    {
5748
0
        bcf_enc_size(&str, nps, BCF_BT_FLOAT);
5749
0
        serialize_float_array(&str, nps*line->n_sample, (float *) values);
5750
0
    }
5751
0
    else if ( type==BCF_HT_STR )
5752
0
    {
5753
0
        bcf_enc_size(&str, nps, BCF_BT_CHAR);
5754
0
        kputsn((char*)values, nps*line->n_sample, &str);
5755
0
    }
5756
0
    else
5757
0
    {
5758
0
        hts_log_error("The type %d not implemented yet at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
5759
0
        abort();
5760
0
    }
5761
5762
0
    if ( !fmt )
5763
0
    {
5764
        // Not present, new format field
5765
0
        line->n_fmt++;
5766
0
        hts_expand0(bcf_fmt_t, line->n_fmt, line->d.m_fmt, line->d.fmt);
5767
5768
        // Special case: VCF specification requires that GT is always first
5769
0
        if ( line->n_fmt > 1 && key[0]=='G' && key[1]=='T' && !key[2] )
5770
0
        {
5771
0
            for (i=line->n_fmt-1; i>0; i--)
5772
0
                line->d.fmt[i] = line->d.fmt[i-1];
5773
0
            fmt = &line->d.fmt[0];
5774
0
        }
5775
0
        else
5776
0
            fmt = &line->d.fmt[line->n_fmt-1];
5777
0
        bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
5778
0
        line->d.indiv_dirty = 1;
5779
0
        fmt->p_free = 1;
5780
0
    }
5781
0
    else
5782
0
    {
5783
        // The tag is already present, check if it is big enough to accommodate the new block
5784
0
        if ( fmt->p && str.l <= fmt->p_len + fmt->p_off )
5785
0
        {
5786
            // good, the block is big enough
5787
0
            if ( str.l != fmt->p_len + fmt->p_off ) line->d.indiv_dirty = 1;
5788
0
            uint8_t *ptr = fmt->p - fmt->p_off;
5789
0
            memcpy(ptr, str.s, str.l);
5790
0
            free(str.s);
5791
0
            int p_free = fmt->p_free;
5792
0
            bcf_unpack_fmt_core1(ptr, line->n_sample, fmt);
5793
0
            fmt->p_free = p_free;
5794
0
        }
5795
0
        else
5796
0
        {
5797
0
            if ( fmt->p_free )
5798
0
                free(fmt->p - fmt->p_off);
5799
0
            bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
5800
0
            fmt->p_free = 1;
5801
0
            line->d.indiv_dirty = 1;
5802
0
        }
5803
0
    }
5804
0
    line->unpacked |= BCF_UN_FMT;
5805
5806
0
    if (is_len) {
5807
0
        line->rlen = get_rlen(hdr, line);
5808
0
    }
5809
0
    return 0;
5810
0
}
5811
5812
5813
int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n)
5814
0
{
5815
0
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
5816
0
    line->d.shared_dirty |= BCF1_DIRTY_FLT;
5817
0
    line->d.n_flt = n;
5818
0
    if ( !n ) return 0;
5819
0
    hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);
5820
0
    int i;
5821
0
    for (i=0; i<n; i++)
5822
0
        line->d.flt[i] = flt_ids[i];
5823
0
    return 0;
5824
0
}
5825
5826
int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id)
5827
0
{
5828
0
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
5829
0
    int i;
5830
0
    for (i=0; i<line->d.n_flt; i++)
5831
0
        if ( flt_id==line->d.flt[i] ) break;
5832
0
    if ( i<line->d.n_flt ) return 0;    // this filter is already set
5833
0
    line->d.shared_dirty |= BCF1_DIRTY_FLT;
5834
0
    if ( flt_id==0 )    // set to PASS
5835
0
        line->d.n_flt = 1;
5836
0
    else if ( line->d.n_flt==1 && line->d.flt[0]==0 )
5837
0
        line->d.n_flt = 1;
5838
0
    else
5839
0
        line->d.n_flt++;
5840
0
    hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);
5841
0
    line->d.flt[line->d.n_flt-1] = flt_id;
5842
0
    return 1;
5843
0
}
5844
int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int pass)
5845
0
{
5846
0
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
5847
0
    int i;
5848
0
    for (i=0; i<line->d.n_flt; i++)
5849
0
        if ( flt_id==line->d.flt[i] ) break;
5850
0
    if ( i==line->d.n_flt ) return 0;   // the filter is not present
5851
0
    line->d.shared_dirty |= BCF1_DIRTY_FLT;
5852
0
    if ( i!=line->d.n_flt-1 ) memmove(line->d.flt+i,line->d.flt+i+1,(line->d.n_flt-i-1)*sizeof(*line->d.flt));
5853
0
    line->d.n_flt--;
5854
0
    if ( !line->d.n_flt && pass ) bcf_add_filter(hdr,line,0);
5855
0
    return 0;
5856
0
}
5857
5858
int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter)
5859
0
{
5860
0
    if ( filter[0]=='.' && !filter[1] ) filter = "PASS";
5861
0
    int id = bcf_hdr_id2int(hdr, BCF_DT_ID, filter);
5862
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FLT,id) ) return -1;  // not defined in the header
5863
5864
0
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
5865
0
    if ( id==0 && !line->d.n_flt) return 1; // PASS
5866
5867
0
    int i;
5868
0
    for (i=0; i<line->d.n_flt; i++)
5869
0
        if ( line->d.flt[i]==id ) return 1;
5870
0
    return 0;
5871
0
}
5872
5873
static inline int _bcf1_sync_alleles(const bcf_hdr_t *hdr, bcf1_t *line, int nals)
5874
0
{
5875
0
    line->d.shared_dirty |= BCF1_DIRTY_ALS;
5876
0
    line->d.var_type = -1;
5877
5878
0
    line->n_allele = nals;
5879
0
    hts_expand(char*, line->n_allele, line->d.m_allele, line->d.allele);
5880
5881
0
    char *als = line->d.als;
5882
0
    int n = 0;
5883
0
    while (n<nals)
5884
0
    {
5885
0
        line->d.allele[n] = als;
5886
0
        while ( *als ) als++;
5887
0
        als++;
5888
0
        n++;
5889
0
    }
5890
    // Update REF length. Note that END is 1-based while line->pos 0-based
5891
0
    line->rlen = get_rlen(hdr, line);
5892
5893
0
    return 0;
5894
0
}
5895
int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals)
5896
0
{
5897
0
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
5898
0
    char *free_old = NULL;
5899
0
    char buffer[256];
5900
0
    size_t used = 0;
5901
5902
    // The pointers in alleles may point into the existing line->d.als memory,
5903
    // so care needs to be taken not to clobber them while updating.  Usually
5904
    // they will be short so we can copy through an intermediate buffer.
5905
    // If they're longer, or won't fit in the existing allocation we
5906
    // can allocate a new buffer to write into.  Note that in either case
5907
    // pointers to line->d.als memory in alleles may not be valid when we've
5908
    // finished.
5909
0
    int i;
5910
0
    size_t avail = line->d.m_als < sizeof(buffer) ? line->d.m_als : sizeof(buffer);
5911
0
    for (i=0; i<nals; i++) {
5912
0
        size_t sz = strlen(alleles[i]) + 1;
5913
0
        if (avail - used < sz)
5914
0
            break;
5915
0
        memcpy(buffer + used, alleles[i], sz);
5916
0
        used += sz;
5917
0
    }
5918
5919
    // Did we miss anything?
5920
0
    if (i < nals) {
5921
0
        int j;
5922
0
        size_t needed = used;
5923
0
        char *new_als;
5924
0
        for (j = i; j < nals; j++)
5925
0
            needed += strlen(alleles[j]) + 1;
5926
0
        if (needed < line->d.m_als) // Don't shrink the buffer
5927
0
            needed = line->d.m_als;
5928
0
        if (needed > INT_MAX) {
5929
0
            hts_log_error("REF + alleles too long to fit in a BCF record");
5930
0
            return -1;
5931
0
        }
5932
0
        new_als = malloc(needed);
5933
0
        if (!new_als)
5934
0
            return -1;
5935
0
        free_old = line->d.als;
5936
0
        line->d.als = new_als;
5937
0
        line->d.m_als = needed;
5938
0
    }
5939
5940
    // Copy from the temp buffer to the destination
5941
0
    if (used) {
5942
0
        assert(used <= line->d.m_als);
5943
0
        memcpy(line->d.als, buffer, used);
5944
0
    }
5945
5946
    // Add in any remaining entries - if this happens we will always be
5947
    // writing to a newly-allocated buffer.
5948
0
    for (; i < nals; i++) {
5949
0
        size_t sz = strlen(alleles[i]) + 1;
5950
0
        memcpy(line->d.als + used, alleles[i], sz);
5951
0
        used += sz;
5952
0
    }
5953
5954
0
    if (free_old)
5955
0
        free(free_old);
5956
0
    return _bcf1_sync_alleles(hdr,line,nals);
5957
0
}
5958
5959
int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string)
5960
0
{
5961
0
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
5962
0
    kstring_t tmp;
5963
0
    tmp.l = 0; tmp.s = line->d.als; tmp.m = line->d.m_als;
5964
0
    kputs(alleles_string, &tmp);
5965
0
    line->d.als = tmp.s; line->d.m_als = tmp.m;
5966
5967
0
    int nals = 1;
5968
0
    char *t = line->d.als;
5969
0
    while (*t)
5970
0
    {
5971
0
        if ( *t==',' ) { *t = 0; nals++; }
5972
0
        t++;
5973
0
    }
5974
0
    return _bcf1_sync_alleles(hdr, line, nals);
5975
0
}
5976
5977
int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
5978
0
{
5979
0
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
5980
0
    kstring_t tmp;
5981
0
    tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id;
5982
0
    if ( id )
5983
0
        kputs(id, &tmp);
5984
0
    else
5985
0
        kputs(".", &tmp);
5986
0
    line->d.id = tmp.s; line->d.m_id = tmp.m;
5987
0
    line->d.shared_dirty |= BCF1_DIRTY_ID;
5988
0
    return 0;
5989
0
}
5990
5991
int bcf_add_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
5992
0
{
5993
0
    if ( !id ) return 0;
5994
0
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
5995
5996
0
    kstring_t tmp;
5997
0
    tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id;
5998
5999
0
    int len = strlen(id);
6000
0
    char *dst = line->d.id;
6001
0
    while ( *dst && (dst=strstr(dst,id)) )
6002
0
    {
6003
0
        if ( dst[len]!=0 && dst[len]!=';' ) dst++;              // a prefix, not a match
6004
0
        else if ( dst==line->d.id || dst[-1]==';' ) return 0;   // already present
6005
0
        dst++;  // a suffix, not a match
6006
0
    }
6007
0
    if ( line->d.id && (line->d.id[0]!='.' || line->d.id[1]) )
6008
0
    {
6009
0
        tmp.l = strlen(line->d.id);
6010
0
        kputc(';',&tmp);
6011
0
    }
6012
0
    kputs(id,&tmp);
6013
6014
0
    line->d.id = tmp.s; line->d.m_id = tmp.m;
6015
0
    line->d.shared_dirty |= BCF1_DIRTY_ID;
6016
0
    return 0;
6017
6018
0
}
6019
6020
bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
6021
0
{
6022
0
    int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
6023
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) ) return NULL;   // no such FMT field in the header
6024
0
    return bcf_get_fmt_id(line, id);
6025
0
}
6026
6027
bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
6028
0
{
6029
0
    int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
6030
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,id) ) return NULL;   // no such INFO field in the header
6031
0
    return bcf_get_info_id(line, id);
6032
0
}
6033
6034
bcf_fmt_t *bcf_get_fmt_id(bcf1_t *line, const int id)
6035
0
{
6036
0
    int i;
6037
0
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
6038
0
    for (i=0; i<line->n_fmt; i++)
6039
0
    {
6040
0
        if ( line->d.fmt[i].id==id ) return &line->d.fmt[i];
6041
0
    }
6042
0
    return NULL;
6043
0
}
6044
6045
bcf_info_t *bcf_get_info_id(bcf1_t *line, const int id)
6046
0
{
6047
0
    int i;
6048
0
    if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
6049
0
    for (i=0; i<line->n_info; i++)
6050
0
    {
6051
0
        if ( line->d.info[i].key==id ) return &line->d.info[i];
6052
0
    }
6053
0
    return NULL;
6054
0
}
6055
6056
6057
int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
6058
0
{
6059
0
    int i, ret = -4, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
6060
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,tag_id) ) return -1;    // no such INFO field in the header
6061
0
    if ( bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id)!=(type & 0xff) ) return -2;     // expected different type
6062
6063
0
    if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
6064
6065
0
    for (i=0; i<line->n_info; i++)
6066
0
        if ( line->d.info[i].key==tag_id ) break;
6067
0
    if ( i==line->n_info ) return ( type==BCF_HT_FLAG ) ? 0 : -3;       // the tag is not present in this record
6068
0
    if ( type==BCF_HT_FLAG ) return 1;
6069
6070
0
    bcf_info_t *info = &line->d.info[i];
6071
0
    if ( !info->vptr ) return -3;           // the tag was marked for removal
6072
0
    if ( type==BCF_HT_STR )
6073
0
    {
6074
0
        if ( *ndst < info->len+1 )
6075
0
        {
6076
0
            *ndst = info->len + 1;
6077
0
            *dst  = realloc(*dst, *ndst);
6078
0
        }
6079
0
        memcpy(*dst,info->vptr,info->len);
6080
0
        ((uint8_t*)*dst)[info->len] = 0;
6081
0
        return info->len;
6082
0
    }
6083
6084
    // Make sure the buffer is big enough
6085
0
    int size1;
6086
0
    switch (type) {
6087
0
        case BCF_HT_INT:  size1 = sizeof(int32_t); break;
6088
0
        case BCF_HT_LONG: size1 = sizeof(int64_t); break;
6089
0
        case BCF_HT_REAL: size1 = sizeof(float); break;
6090
0
        default:
6091
0
            hts_log_error("Unexpected output type %d at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
6092
0
            return -2;
6093
0
    }
6094
0
    if ( *ndst < info->len )
6095
0
    {
6096
0
        *ndst = info->len;
6097
0
        *dst  = realloc(*dst, *ndst * size1);
6098
0
    }
6099
6100
0
    #define BRANCH(type_t, convert, is_missing, is_vector_end, set_missing, set_regular, out_type_t) do { \
6101
0
        out_type_t *tmp = (out_type_t *) *dst; \
6102
0
        int j; \
6103
0
        for (j=0; j<info->len; j++) \
6104
0
        { \
6105
0
            type_t p = convert(info->vptr + j * sizeof(type_t)); \
6106
0
            if ( is_vector_end ) break; \
6107
0
            if ( is_missing ) set_missing; \
6108
0
            else set_regular; \
6109
0
            tmp++; \
6110
0
        } \
6111
0
        ret = j; \
6112
0
    } while (0)
6113
0
    switch (info->type) {
6114
0
        case BCF_BT_INT8:
6115
0
            if (type == BCF_HT_LONG) {
6116
0
                BRANCH(int8_t,  le_to_i8,  p==bcf_int8_missing,  p==bcf_int8_vector_end,  *tmp=bcf_int64_missing, *tmp=p, int64_t);
6117
0
            } else {
6118
0
                BRANCH(int8_t,  le_to_i8,  p==bcf_int8_missing,  p==bcf_int8_vector_end,  *tmp=bcf_int32_missing, *tmp=p, int32_t);
6119
0
            }
6120
0
            break;
6121
0
        case BCF_BT_INT16:
6122
0
            if (type == BCF_HT_LONG) {
6123
0
                BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t);
6124
0
            } else {
6125
0
                BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t);
6126
0
            }
6127
0
            break;
6128
0
        case BCF_BT_INT32:
6129
0
            if (type == BCF_HT_LONG) {
6130
0
                BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t); break;
6131
0
            } else {
6132
0
                BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); break;
6133
0
            }
6134
0
        case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set(tmp, p), float); break;
6135
0
        default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, info->type, bcf_seqname_safe(hdr,line), line->pos+1); return -2;
6136
0
    }
6137
0
    #undef BRANCH
6138
0
    return ret;  // set by BRANCH
6139
0
}
6140
6141
int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst)
6142
0
{
6143
0
    int i,tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
6144
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1;    // no such FORMAT field in the header
6145
0
    if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2;     // expected different type
6146
6147
0
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
6148
6149
0
    for (i=0; i<line->n_fmt; i++)
6150
0
        if ( line->d.fmt[i].id==tag_id ) break;
6151
0
    if ( i==line->n_fmt ) return -3;                               // the tag is not present in this record
6152
0
    bcf_fmt_t *fmt = &line->d.fmt[i];
6153
0
    if ( !fmt->p ) return -3;                                      // the tag was marked for removal
6154
6155
0
    int nsmpl = bcf_hdr_nsamples(hdr);
6156
0
    if ( !*dst )
6157
0
    {
6158
0
        *dst = (char**) malloc(sizeof(char*)*nsmpl);
6159
0
        if ( !*dst ) return -4;     // could not alloc
6160
0
        (*dst)[0] = NULL;
6161
0
    }
6162
0
    int n = (fmt->n+1)*nsmpl;
6163
0
    if ( *ndst < n )
6164
0
    {
6165
0
        (*dst)[0] = realloc((*dst)[0], n);
6166
0
        if ( !(*dst)[0] ) return -4;    // could not alloc
6167
0
        *ndst = n;
6168
0
    }
6169
0
    for (i=0; i<nsmpl; i++)
6170
0
    {
6171
0
        uint8_t *src = fmt->p + i*fmt->n;
6172
0
        uint8_t *tmp = (uint8_t*)(*dst)[0] + i*(fmt->n+1);
6173
0
        memcpy(tmp,src,fmt->n);
6174
0
        tmp[fmt->n] = 0;
6175
0
        (*dst)[i] = (char*) tmp;
6176
0
    }
6177
0
    return n;
6178
0
}
6179
6180
int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
6181
0
{
6182
0
    int i,j, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
6183
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1;    // no such FORMAT field in the header
6184
0
    if ( tag[0]=='G' && tag[1]=='T' && tag[2]==0 )
6185
0
    {
6186
        // Ugly: GT field is considered to be a string by the VCF header but BCF represents it as INT.
6187
0
        if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2;
6188
0
    }
6189
0
    else if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=type ) return -2;     // expected different type
6190
6191
0
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
6192
6193
0
    for (i=0; i<line->n_fmt; i++)
6194
0
        if ( line->d.fmt[i].id==tag_id ) break;
6195
0
    if ( i==line->n_fmt ) return -3;                               // the tag is not present in this record
6196
0
    bcf_fmt_t *fmt = &line->d.fmt[i];
6197
0
    if ( !fmt->p ) return -3;                                      // the tag was marked for removal
6198
6199
0
    if ( type==BCF_HT_STR )
6200
0
    {
6201
0
        int n = fmt->n*bcf_hdr_nsamples(hdr);
6202
0
        if ( *ndst < n )
6203
0
        {
6204
0
            *dst  = realloc(*dst, n);
6205
0
            if ( !*dst ) return -4;     // could not alloc
6206
0
            *ndst = n;
6207
0
        }
6208
0
        memcpy(*dst,fmt->p,n);
6209
0
        return n;
6210
0
    }
6211
6212
    // Make sure the buffer is big enough
6213
0
    int nsmpl = bcf_hdr_nsamples(hdr);
6214
0
    int size1 = type==BCF_HT_INT ? sizeof(int32_t) : sizeof(float);
6215
0
    if ( *ndst < fmt->n*nsmpl )
6216
0
    {
6217
0
        *ndst = fmt->n*nsmpl;
6218
0
        *dst  = realloc(*dst, *ndst*size1);
6219
0
        if ( !*dst ) return -4;     // could not alloc
6220
0
    }
6221
6222
0
    #define BRANCH(type_t, convert, is_missing, is_vector_end, set_missing, set_vector_end, set_regular, out_type_t) { \
6223
0
        out_type_t *tmp = (out_type_t *) *dst; \
6224
0
        uint8_t *fmt_p = fmt->p; \
6225
0
        for (i=0; i<nsmpl; i++) \
6226
0
        { \
6227
0
            for (j=0; j<fmt->n; j++) \
6228
0
            { \
6229
0
                type_t p = convert(fmt_p + j * sizeof(type_t)); \
6230
0
                if ( is_missing ) set_missing; \
6231
0
                else if ( is_vector_end ) { set_vector_end; break; } \
6232
0
                else set_regular; \
6233
0
                tmp++; \
6234
0
            } \
6235
0
            for (; j<fmt->n; j++) { set_vector_end; tmp++; } \
6236
0
            fmt_p += fmt->size; \
6237
0
        } \
6238
0
    }
6239
0
    switch (fmt->type) {
6240
0
        case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8, p==bcf_int8_missing,  p==bcf_int8_vector_end,  *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
6241
0
        case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
6242
0
        case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
6243
0
        case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set_vector_end(*tmp), bcf_float_set(tmp, p), float); break;
6244
0
        default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, fmt->type, bcf_seqname_safe(hdr,line), line->pos+1); exit(1);
6245
0
    }
6246
0
    #undef BRANCH
6247
6248
0
    return nsmpl*fmt->n;
6249
0
}
6250
6251
//error description structure definition
6252
typedef struct err_desc {
6253
    int  errorcode;
6254
    const char *description;
6255
}err_desc;
6256
6257
// error descriptions
6258
static const err_desc errdesc_bcf[] = {
6259
    { BCF_ERR_CTG_UNDEF, "Contig not defined in header"},
6260
    { BCF_ERR_TAG_UNDEF, "Tag not defined in header" },
6261
    { BCF_ERR_NCOLS, "Incorrect number of columns" },
6262
    { BCF_ERR_LIMITS, "Limits reached" },
6263
    { BCF_ERR_CHAR, "Invalid character" },
6264
    { BCF_ERR_CTG_INVALID, "Invalid contig" },
6265
    { BCF_ERR_TAG_INVALID, "Invalid tag" },
6266
};
6267
6268
/// append given description to buffer based on available size and add ... when not enough space
6269
    /** @param buffer       buffer to which description to be appended
6270
        @param offset       offset at which to be appended
6271
        @param maxbuffer    maximum size of the buffer
6272
        @param description  the description to be appended
6273
on failure returns -1 - when buffer is not big enough; returns -1 on invalid params and on too small buffer which are improbable due to validation at caller site
6274
on success returns 0
6275
    */
6276
2.00k
static int add_desc_to_buffer(char *buffer, size_t *offset, size_t maxbuffer, const char *description) {
6277
6278
2.00k
    if (!description || !buffer || !offset || (maxbuffer < 4))
6279
0
        return -1;
6280
6281
2.00k
    size_t rembuffer = maxbuffer - *offset;
6282
2.00k
    if (rembuffer > (strlen(description) + (rembuffer == maxbuffer ? 0 : 1))) {    //add description with optionally required ','
6283
2.00k
        *offset += snprintf(buffer + *offset, rembuffer, "%s%s", (rembuffer == maxbuffer)? "": ",", description);
6284
2.00k
    } else {    //not enough space for description, put ...
6285
0
        size_t tmppos = (rembuffer <= 4) ? maxbuffer - 4 : *offset;
6286
0
        snprintf(buffer + tmppos, 4, "...");    //ignore offset update
6287
0
        return -1;
6288
0
    }
6289
2.00k
    return 0;
6290
2.00k
}
6291
6292
//get description for given error code. return NULL on error
6293
968
const char *bcf_strerror(int errorcode, char *buffer, size_t maxbuffer) {
6294
968
    size_t usedup = 0;
6295
968
    int ret = 0;
6296
968
    int idx;
6297
6298
968
    if (!buffer || maxbuffer < 4)
6299
0
        return NULL;           //invalid / insufficient buffer
6300
6301
968
    if (!errorcode) {
6302
0
        buffer[0] = '\0';      //no error, set null
6303
0
        return buffer;
6304
0
    }
6305
6306
7.74k
    for (idx = 0; idx < sizeof(errdesc_bcf) / sizeof(err_desc); ++idx) {
6307
6.77k
        if (errorcode & errdesc_bcf[idx].errorcode) {    //error is set, add description
6308
2.00k
            ret = add_desc_to_buffer(buffer, &usedup, maxbuffer, errdesc_bcf[idx].description);
6309
2.00k
            if (ret < 0)
6310
0
                break;         //not enough space, ... added, no need to continue
6311
6312
2.00k
            errorcode &= ~errdesc_bcf[idx].errorcode;    //reset the error
6313
2.00k
        }
6314
6.77k
    }
6315
6316
968
    if (errorcode && (ret >= 0))  {     //undescribed error is present in error code and had enough buffer, try to add unkonwn error as well§
6317
0
        add_desc_to_buffer(buffer, &usedup, maxbuffer, "Unknown error");
6318
0
    }
6319
968
    return buffer;
6320
968
}
6321
6322
/**
6323
 *  bcf_format_gt_v2 - formats GT information on a string
6324
 *  @param hdr - bcf header, to get version
6325
 *  @param fmt - pointer to bcf format data
6326
 *  @param isample - position of interested sample in data
6327
 *  @param str - pointer to output string
6328
 *  Returns 0 on success and -1 on failure
6329
 *  This method is preferred over bcf_format_gt as this supports vcf4.4 and
6330
 *  prefixed phasing. Explicit / prefixed phasing for 1st allele is used only
6331
 *  when it is a must to correctly express phasing.
6332
 * correctly express phasing.
6333
 */
6334
int bcf_format_gt_v2(const bcf_hdr_t *hdr, bcf_fmt_t *fmt, int isample, kstring_t *str)
6335
13.8k
{
6336
13.8k
    uint32_t e = 0;
6337
13.8k
    int ploidy = 1, anyunphased = 0;
6338
13.8k
    int32_t val0 = 0;
6339
13.8k
    size_t pos = str ? str->l : 0;
6340
6341
13.8k
    #define BRANCH(type_t, convert, missing, vector_end) { \
6342
13.2k
        uint8_t *ptr = fmt->p + isample*fmt->size; \
6343
13.2k
        int i; \
6344
30.9k
        for (i=0; i<fmt->n; i++, ptr += sizeof(type_t)) \
6345
24.9k
        { \
6346
24.9k
            type_t val = convert(ptr); \
6347
24.9k
            if ( val == vector_end ) break; \
6348
24.9k
            if (!i) { val0 = val; } \
6349
17.7k
            if (i) { \
6350
4.43k
                e |= kputc("/|"[val & 1], str) < 0; \
6351
4.43k
                anyunphased |= !(val & 1); \
6352
4.43k
            } \
6353
17.7k
            if (!(val >> 1)) e |= kputc('.', str) < 0; \
6354
17.7k
            else e |= kputw((val >> 1) - 1, str) < 0; \
6355
17.7k
        } \
6356
13.2k
        if (i == 0) e |= kputc('.', str) < 0; \
6357
13.2k
        ploidy = i; \
6358
13.2k
    }
6359
13.8k
    switch (fmt->type) {
6360
6.46k
        case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8,  bcf_int8_missing,
6361
6.46k
            bcf_int8_vector_end); break;
6362
2.20k
        case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, bcf_int16_missing,
6363
2.20k
            bcf_int16_vector_end); break;
6364
4.60k
        case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, bcf_int32_missing,
6365
4.60k
            bcf_int32_vector_end); break;
6366
606
        case BCF_BT_NULL:  e |= kputc('.', str) < 0; break;
6367
0
        default: hts_log_error("Unexpected type %d", fmt->type); return -2;
6368
13.8k
    }
6369
13.8k
    #undef BRANCH
6370
6371
13.8k
    if (hdr && get_hdr_aux(hdr)->version >= VCF44) {
6372
        //output which supports prefixed phasing
6373
6374
        /* update 1st allele's phasing if required and append rest to it.
6375
        use prefixed phasing only when it is a must. i.e. without which the
6376
        inferred value will be incorrect */
6377
6.38k
        if (val0 & 1) {
6378
            /* 1st one is phased, if ploidy is > 1 and an unphased allele exists
6379
             need to specify explicitly */
6380
810
            e |= (ploidy > 1 && anyunphased) ?
6381
49
                    (kinsert_char('|', pos, str) < 0) :
6382
810
                        (ploidy <= 1 && !((val0 >> 1)) ? //|. needs explicit o/p
6383
0
                            (kinsert_char('|', pos, str) < 0) :
6384
761
                            0);
6385
5.57k
        } else {
6386
            /* 1st allele is unphased, if ploidy is = 1 or allele is '.' or
6387
             ploidy > 1 and no other unphased allele exist, need to specify
6388
             explicitly */
6389
5.57k
            e |= ((ploidy <= 1 && val0 != 0) || (ploidy > 1 && !anyunphased)) ?
6390
2.93k
                    (kinsert_char('/', pos, str) < 0) :
6391
5.57k
                    0;
6392
5.57k
        }
6393
6.38k
    }
6394
13.8k
    return e == 0 ? 0 : -1;
6395
13.8k
}
6396
6397
/**
6398
 *  get_rlen - calculates and returns rlen value
6399
 *  @param h - bcf header
6400
 *  @param v - bcf data
6401
 *  Returns rlen calculated on success and -1 on failure.
6402
 *  rlen calculation is dependent on vcf version and a few other field data.
6403
 *  When bcf decoded data is available, refers it. When not available, retrieves
6404
 *  required field data by seeking on the data stream.
6405
 *  Ideally pos & version be set appropriately before any info/format field
6406
 *  update to have proper rlen calculation.
6407
 *  As version is not kept properly updated in practice, it is ignored in calcs.
6408
 */
6409
static int64_t get_rlen(const bcf_hdr_t *h, bcf1_t *v)
6410
42.0k
{
6411
42.0k
    uint8_t *f = (uint8_t*)v->shared.s, *t = NULL,
6412
42.0k
        *e = (uint8_t*)v->shared.s + v->shared.l;
6413
42.0k
    int size, type, id, lenid, endid, svlenid, i, bad, gvcf = 0, use_svlen = 0;
6414
42.0k
    bcf_info_t *endinfo = NULL, *svleninfo = NULL, end_lcl, svlen_lcl;
6415
42.0k
    bcf_fmt_t *lenfmt = NULL, len_lcl;
6416
6417
    //holds SVLEN allele status for the max no of alleles
6418
42.0k
    uint8_t svlenals[8192];
6419
    //pos from info END, fmt LEN, info SVLEN
6420
42.0k
    hts_pos_t end = 0, end_fmtlen = 0, end_svlen = 0, hpos;
6421
42.0k
    int64_t len_ref = 0, len = 0, tmp;
6422
42.0k
    endid = bcf_hdr_id2int(h, BCF_DT_ID, "END");
6423
6424
    //initialise bytes which are to be used
6425
42.0k
    memset(svlenals, 0, 1 + v->n_allele / 8);
6426
6427
    //use decoded data where ever available and where not, get from stream
6428
42.0k
    if (v->unpacked & BCF_UN_STR || v->d.shared_dirty & BCF1_DIRTY_ALS) {
6429
0
        for (i = 1; i < v->n_allele; ++i) {
6430
            // check only symbolic alt alleles
6431
0
            if (v->d.allele[i][0] != '<')
6432
0
                continue;
6433
0
            if (svlen_on_ref_for_vcf_alt(v->d.allele[i], -1)) {
6434
                // del, dup or cnv allele, note to check corresponding svlen val
6435
0
                svlenals[i >> 3] |= 1 << (i & 7);
6436
0
                use_svlen = 1;
6437
0
            } else if (!strcmp(v->d.allele[i], "<*>") ||
6438
0
                         !strcmp(v->d.allele[i], "<NON_REF>")) {
6439
0
                gvcf = 1;   //gvcf present, have to check for LEN field
6440
0
            }
6441
0
        }
6442
0
        f += v->unpack_size[0] + v->unpack_size[1];
6443
0
        len_ref = v->n_allele ? strlen(v->d.allele[0]) : 0;
6444
42.0k
    } else if (f < e) {
6445
        //skip ID
6446
42.0k
        size = bcf_dec_size(f, &f, &type);
6447
42.0k
        f += size << bcf_type_shift[type];
6448
        // REF, ALT
6449
1.69M
        for (i = 0; i < v->n_allele; ++i) {
6450
            //check all alleles, w/o NUL
6451
1.64M
            size = bcf_dec_size(f, &f, &type);
6452
1.64M
            if (!i) {   //REF length
6453
42.0k
                len_ref = size;
6454
1.60M
            } else if (size > 0 && *f == '<') {
6455
1.95k
                if (svlen_on_ref_for_vcf_alt((char *) f, size)) {
6456
                    // del, dup or cnv allele, note to check corresponding svlen val
6457
0
                    svlenals[i >> 3] |= 1 << (i & 7);
6458
0
                    use_svlen = 1;
6459
1.95k
                } else if ((size == 3 && !strncmp((char*)f, "<*>", size)) ||
6460
1.95k
                    (size == 9 && !strncmp((char*)f, "<NON_REF>", size))) {
6461
47
                    gvcf = 1;   //gvcf present, have to check for LEN field
6462
47
                }
6463
1.95k
            }
6464
1.64M
            f += size << bcf_type_shift[type];
6465
1.64M
        }
6466
42.0k
    }
6467
    // FILTER
6468
42.0k
    if (v->unpacked & BCF_UN_FLT) {
6469
0
        f += v->unpack_size[2];
6470
42.0k
    } else if (f < e) {
6471
42.0k
        size = bcf_dec_size(f, &f, &type);
6472
42.0k
        f += size << bcf_type_shift[type];
6473
42.0k
    }
6474
6475
    // Only do SVLEN lookup if there are suitable symbolic alleles
6476
42.0k
    svlenid = use_svlen ? bcf_hdr_id2int(h, BCF_DT_ID, "SVLEN") : -1;
6477
6478
    // INFO
6479
42.0k
    if (svlenid >= 0 || endid >= 0 ) {  //only if end/svlen present
6480
9.46k
        if (v->unpacked & BCF_UN_INFO || v->d.shared_dirty & BCF1_DIRTY_INF) {
6481
0
            endinfo = bcf_get_info(h, v, "END");
6482
0
            svleninfo = bcf_get_info(h, v, "SVLEN");
6483
9.46k
        } else if (f < e) {
6484
11.6k
            for (i = 0; i < v->n_info; ++i) {
6485
6.93k
                id = bcf_dec_typed_int1(f, &t);
6486
6.93k
                if (id == endid) {  //END
6487
693
                    t = bcf_unpack_info_core1(f, &end_lcl);
6488
693
                    endinfo = &end_lcl;
6489
693
                    if (svleninfo || svlenid < 0) {
6490
693
                        break;  //already got svlen or no need to search further
6491
693
                    }
6492
6.24k
                } else if (id == svlenid) { //SVLEN
6493
0
                    t = bcf_unpack_info_core1(f, &svlen_lcl);
6494
0
                    svleninfo = &svlen_lcl;
6495
0
                    if (endinfo || endid < 0 ) {
6496
0
                        break;  //already got end or no need to search further
6497
0
                    }
6498
6.24k
                } else {
6499
6.24k
                    f = t;
6500
6.24k
                    size = bcf_dec_size(f, &t, &type);
6501
6.24k
                    t += size << bcf_type_shift[type];
6502
6.24k
                }
6503
6.24k
                f = t;
6504
6.24k
            }
6505
5.41k
        }
6506
9.46k
    }
6507
6508
    // Only do LEN lookup if a <*> allele was found
6509
42.0k
    lenid = gvcf ? bcf_hdr_id2int(h, BCF_DT_ID, "LEN") : -1;
6510
6511
    // FORMAT
6512
42.0k
    if (lenid >= 0) {
6513
        //with LEN and has gvcf allele
6514
0
        f = (uint8_t*)v->indiv.s; t = NULL; e = (uint8_t*)v->indiv.s + v->indiv.l;
6515
0
        if (v->unpacked & BCF_UN_FMT || v->d.indiv_dirty) {
6516
0
            lenfmt = bcf_get_fmt(h, v, "LEN");
6517
0
        } else if (f < e) {
6518
0
            for (i = 0; i < v->n_fmt; ++i) {
6519
0
                id = bcf_dec_typed_int1(f, &t);
6520
0
                if (id == lenid) {
6521
0
                        t = bcf_unpack_fmt_core1(f, v->n_sample, &len_lcl);
6522
0
                    lenfmt = &len_lcl;
6523
0
                    break;  //that's all needed
6524
0
                } else {
6525
0
                    f = t;
6526
0
                    size = bcf_dec_size(f, &t, &type);
6527
0
                    t += size * v->n_sample << bcf_type_shift[type];
6528
0
                }
6529
0
                f = t;
6530
0
            }
6531
0
        }
6532
0
    }
6533
    //got required data, find end and rlen
6534
42.0k
    if (endinfo && endinfo->vptr) { //end position given by info END
6535
        //end info exists, not being deleted
6536
693
        end = endinfo->v1.i;
6537
693
        switch(endinfo->type) {
6538
0
            case BCF_BT_INT8:  end = end == bcf_int8_missing ? 0 : end;  break;
6539
0
            case BCF_BT_INT16: end = end == bcf_int16_missing ? 0 : end; break;
6540
0
            case BCF_BT_INT32: end = end == bcf_int32_missing ? 0 : end; break;
6541
0
            case BCF_BT_INT64: end = end == bcf_int64_missing ? 0 : end; break;
6542
693
            default: end = 0; break; //invalid
6543
693
        }
6544
693
    }
6545
6546
42.0k
    if (svleninfo && svleninfo->vptr) {
6547
        //svlen info exists, not being deleted
6548
0
        bad = 0;
6549
        //get largest svlen corresponding to a <DEL> symbolic allele
6550
0
        for (i = 0; i < svleninfo->len && i + 1 < v->n_allele; ++i) {
6551
0
            if (!(svlenals[i >> 3] & (1 << ((i + 1) & 7))))
6552
0
                continue;
6553
6554
0
            switch(svleninfo->type) {
6555
0
                case BCF_BT_INT8:
6556
0
                    tmp = le_to_i8(&svleninfo->vptr[i]);
6557
0
                    tmp = tmp == bcf_int8_missing ? 0 : tmp;
6558
0
                break;
6559
0
                case BCF_BT_INT16:
6560
0
                    tmp = le_to_i16(&svleninfo->vptr[i * 2]);
6561
0
                    tmp = tmp == bcf_int16_missing ? 0 : tmp;
6562
0
                break;
6563
0
                case BCF_BT_INT32:
6564
0
                    tmp = le_to_i32(&svleninfo->vptr[i * 4]);
6565
0
                    tmp = tmp == bcf_int32_missing ? 0 : tmp;
6566
0
                break;
6567
0
                case BCF_BT_INT64:
6568
0
                    tmp = le_to_i64(&svleninfo->vptr[i * 8]);
6569
0
                    tmp = tmp == bcf_int64_missing ? 0 : tmp;
6570
0
                break;
6571
0
                default: //invalid
6572
0
                    tmp = 0;
6573
0
                    bad = 1;
6574
0
                break;
6575
0
            }
6576
0
            if (bad) {  //stop svlen check
6577
0
                len = 0;
6578
0
                break;
6579
0
            }
6580
6581
0
            tmp = tmp < 0 ? llabs(tmp) : tmp;
6582
0
            if (len < tmp) len = tmp;
6583
0
        }
6584
0
    }
6585
42.0k
    if ((!svleninfo || !len) && end) { //no svlen, infer from end
6586
0
        len = end > v->pos ? end - v->pos - 1 : 0;
6587
0
    }
6588
42.0k
    end_svlen = v->pos + len + 1;   //end position found from SVLEN
6589
6590
42.0k
    len = 0;
6591
42.0k
    if (lenfmt && lenfmt->p) {
6592
        //fmt len exists, not being deleted, has gvcf and version >= 4.5
6593
0
        int j = 0;
6594
0
        int64_t offset = 0;
6595
0
        bad = 0;
6596
0
        for (i = 0; i < v->n_sample; ++i) {
6597
0
            for (j = 0; j < lenfmt->n; ++j) {
6598
0
                switch(lenfmt->type) {
6599
0
                case BCF_BT_INT8:
6600
0
                    tmp = le_to_i8(lenfmt->p + offset + j);
6601
0
                    tmp = tmp == bcf_int8_missing ? 0 : tmp;
6602
0
                break;
6603
0
                case BCF_BT_INT16:
6604
0
                    tmp = le_to_i16(lenfmt->p + offset + j * 2);
6605
0
                    tmp = tmp == bcf_int16_missing ? 0 : tmp;
6606
0
                break;
6607
0
                case BCF_BT_INT32:
6608
0
                    tmp = le_to_i32(lenfmt->p + offset + j * 4);
6609
0
                    tmp = tmp == bcf_int32_missing ? 0 : tmp;
6610
0
                break;
6611
0
                case BCF_BT_INT64:
6612
0
                    tmp = le_to_i64(lenfmt->p + offset + j * 8);
6613
0
                    tmp = tmp == bcf_int64_missing ? 0 : tmp;
6614
0
                break;
6615
0
                default: //invalid
6616
0
                    bad = 1;
6617
0
                break;
6618
0
                }
6619
0
                if (bad) {  //stop LEN check
6620
0
                    len = 0;
6621
0
                    break;
6622
0
                }
6623
                //assumes only gvcf have valid LEN
6624
0
                if (len < tmp) len = tmp;
6625
0
            }
6626
0
            offset += j << bcf_type_shift[lenfmt->type];
6627
0
        }
6628
0
    }
6629
42.0k
    if ((!lenfmt || !len) && end) { //no fmt len, infer from end
6630
0
        len = end > v->pos ? end - v->pos : 0;
6631
0
    }
6632
42.0k
    end_fmtlen = v->pos + len;  //end position found from LEN
6633
6634
    //get largest pos, based on END, SVLEN, fmt LEN and length using it
6635
42.0k
    hpos = end < end_svlen ?
6636
6.40k
            end_svlen < end_fmtlen ? end_fmtlen : end_svlen :
6637
42.0k
            end < end_fmtlen ? end_fmtlen : end;
6638
42.0k
    len = hpos - v->pos;
6639
6640
    //NOTE: 'end' calculation be in sync with tbx.c:tbx_parse1
6641
6642
    /* rlen to be calculated based on version, END, SVLEN, fmt LEN, ref len.
6643
    Relevance of these fields vary across different vcf versions.
6644
    Many times, these info/fmt fields are used without version updates;
6645
    hence these fields are used for calculation disregarding vcf version */
6646
42.0k
    return len < len_ref ? len_ref : len;
6647
42.0k
}