Coverage Report

Created: 2026-01-09 06:27

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/vcf.c
Line
Count
Source
1
/*  vcf.c -- VCF/BCF API functions.
2
3
    Copyright (C) 2012, 2013 Broad Institute.
4
    Copyright (C) 2012-2025 Genome Research Ltd.
5
    Portions copyright (C) 2014 Intel Corporation.
6
7
    Author: Heng Li <lh3@sanger.ac.uk>
8
9
Permission is hereby granted, free of charge, to any person obtaining a copy
10
of this software and associated documentation files (the "Software"), to deal
11
in the Software without restriction, including without limitation the rights
12
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13
copies of the Software, and to permit persons to whom the Software is
14
furnished to do so, subject to the following conditions:
15
16
The above copyright notice and this permission notice shall be included in
17
all copies or substantial portions of the Software.
18
19
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25
DEALINGS IN THE SOFTWARE.  */
26
27
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
28
#include <config.h>
29
30
#include <stdio.h>
31
#include <assert.h>
32
#include <string.h>
33
#include <strings.h>
34
#include <stdlib.h>
35
#include <limits.h>
36
#include <stdint.h>
37
#include <inttypes.h>
38
#include <errno.h>
39
40
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
41
#include "fuzz_settings.h"
42
#endif
43
44
#include "htslib/vcf.h"
45
#include "htslib/bgzf.h"
46
#include "htslib/tbx.h"
47
#include "htslib/hfile.h"
48
#include "hts_internal.h"
49
#include "htslib/hts_endian.h"
50
#include "htslib/khash_str2int.h"
51
#include "htslib/kstring.h"
52
#include "htslib/sam.h"
53
#include "htslib/khash.h"
54
#include "bgzf_internal.h"
55
56
#if 0
57
// This helps on Intel a bit, often 6-7% faster VCF parsing.
58
// Conversely sometimes harms AMD Zen4 as ~9% slower.
59
// Possibly related to IPC differences.  However for now it's just a
60
// curiousity we ignore and stick with the simpler code.
61
//
62
// Left here as a hint for future explorers.
63
static inline int xstreq(const char *a, const char *b) {
64
    while (*a && *a == *b)
65
        a++, b++;
66
    return *a == *b;
67
}
68
69
#define KHASH_MAP_INIT_XSTR(name, khval_t) \
70
  KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, xstreq)
71
72
KHASH_MAP_INIT_XSTR(vdict, bcf_idinfo_t)
73
#else
74
KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t)
75
#endif
76
77
typedef khash_t(vdict) vdict_t;
78
79
KHASH_MAP_INIT_STR(hdict, bcf_hrec_t*)
80
typedef khash_t(hdict) hdict_t;
81
82
83
#include "htslib/kseq.h"
84
HTSLIB_EXPORT
85
uint32_t bcf_float_missing    = 0x7F800001;
86
87
HTSLIB_EXPORT
88
uint32_t bcf_float_vector_end = 0x7F800002;
89
90
HTSLIB_EXPORT
91
uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
92
93
static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, NULL, NULL}, .id = -1 };
94
95
/*
96
    Partial support for 64-bit POS and Number=1 INFO tags.
97
    Notes:
98
     - the support for 64-bit values is motivated by POS and INFO/END for large genomes
99
     - the use of 64-bit values does not conform to the specification
100
     - cannot output 64-bit BCF and if it does, it is not compatible with anything
101
     - experimental, use at your risk
102
*/
103
#ifdef VCF_ALLOW_INT64
104
    #define BCF_MAX_BT_INT64 (0x7fffffffffffffff)       /* INT64_MAX, for internal use only */
105
    #define BCF_MIN_BT_INT64 -9223372036854775800LL     /* INT64_MIN + 8, for internal use only */
106
#endif
107
108
1.31k
#define BCF_IS_64BIT (1<<30)
109
110
111
// Opaque structure with auxilary data which allows to extend bcf_hdr_t without breaking ABI.
112
// Note that this preserving API and ABI requires that the first element is vdict_t struct
113
// rather than a pointer, as user programs may (and in some cases do) access the dictionary
114
// directly as (vdict_t*)hdr->dict.
115
typedef struct
116
{
117
    vdict_t dict;   // bcf_hdr_t.dict[0] vdict_t dictionary which keeps bcf_idinfo_t for BCF_HL_FLT,BCF_HL_INFO,BCF_HL_FMT
118
    hdict_t *gen;   // hdict_t dictionary which keeps bcf_hrec_t* pointers for generic and structured fields
119
    size_t *key_len;// length of h->id[BCF_DT_ID] strings
120
    int version;    //cached version
121
    uint32_t ref_count; // reference count, low bit indicates bcf_hdr_destroy() has been called
122
}
123
bcf_hdr_aux_t;
124
125
static inline bcf_hdr_aux_t *get_hdr_aux(const bcf_hdr_t *hdr)
126
364k
{
127
364k
    return (bcf_hdr_aux_t *)hdr->dict[0];
128
364k
}
129
130
//version macros
131
100k
#define VCF_DEF 4002000
132
34.0k
#define VCF44   4004000
133
20.3k
#define VCF45   4005000
134
135
#define VCF_MAJOR_VER(x) ( (x) / 10000 / 100 )
136
#define VCF_MINOR_VER(x) ( ((x) % 1000000) / 1000 )
137
138
/**
139
 *  bcf_get_version - get the version as int
140
 *  @param hdr   - bcf header, to get version
141
 *  @param verstr- version string, which is already available
142
 *  Returns version on success and default version on failure
143
 *  version = major * 100 * 10000 + minor * 1000
144
 */
145
static int bcf_get_version(const bcf_hdr_t *hdr, const char *verstr)
146
20.4k
{
147
20.4k
    const char *version = NULL, vcf[] = "VCFv";
148
20.4k
    char *major = NULL, *minor = NULL;
149
20.4k
    int ver = -1;
150
20.4k
    long tmp = 0;
151
20.4k
    bcf_hdr_aux_t *aux = NULL;
152
153
20.4k
    if (!hdr && !verstr) {  //invalid input
154
0
        goto fail;
155
0
    }
156
157
20.4k
    if (hdr) {
158
16.5k
        if ((aux = get_hdr_aux(hdr)) && aux->version != 0) {    //use cached version
159
15.8k
            return aux->version;
160
15.8k
        }
161
        //get from header
162
730
        version = bcf_hdr_get_version(hdr);
163
3.84k
    } else {
164
        //get from version string
165
3.84k
        version = verstr;
166
3.84k
    }
167
4.57k
    if (!(major = strstr(version, vcf))) {  //bad format
168
2.81k
        goto fail;
169
2.81k
    }
170
1.76k
    major += sizeof(vcf) - 1;
171
1.76k
    if (!(minor = strchr(major, '.'))) {    //bad format
172
338
        goto fail;
173
338
    }
174
1.42k
    tmp = strtol(major, NULL, 10);
175
1.42k
    if ((!tmp && errno == EINVAL) ||
176
1.22k
        ((tmp == LONG_MIN || tmp == LONG_MAX) && errno == ERANGE)) {    //failed
177
206
        goto fail;
178
206
    }
179
1.21k
    ver = tmp * 100 * 10000;
180
1.21k
    tmp = strtol(++minor, NULL, 10);
181
1.21k
    if ((!tmp && errno == EINVAL) ||
182
1.09k
        ((tmp == LONG_MIN || tmp == LONG_MAX) && errno == ERANGE)) {    //failed
183
522
        goto fail;
184
522
    }
185
696
    ver += tmp * 1000;
186
696
    return ver;
187
188
3.88k
fail:
189
3.88k
    hts_log_warning("Couldn't get VCF version, considering as %d.%d",
190
3.88k
        VCF_MAJOR_VER(VCF_DEF), VCF_MINOR_VER(VCF_DEF));
191
3.88k
    return VCF_DEF;
192
1.21k
}
193
194
// Header reference counting
195
196
static void bcf_hdr_incr_ref(bcf_hdr_t *h)
197
1.49k
{
198
1.49k
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
199
1.49k
    aux->ref_count += 2;
200
1.49k
}
201
202
static void bcf_hdr_decr_ref(bcf_hdr_t *h)
203
1.49k
{
204
1.49k
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
205
1.49k
    if (aux->ref_count >= 2)
206
1.49k
        aux->ref_count -= 2;
207
208
1.49k
    if (aux->ref_count == 0)
209
1.38k
        bcf_hdr_destroy(h);
210
1.49k
}
211
212
static void hdr_bgzf_private_data_cleanup(void *data)
213
1.49k
{
214
1.49k
    bcf_hdr_t *h = (bcf_hdr_t *) data;
215
1.49k
    bcf_hdr_decr_ref(h);
216
1.49k
}
217
218
static char *find_chrom_header_line(char *s)
219
0
{
220
0
    char *nl;
221
0
    if (strncmp(s, "#CHROM\t", 7) == 0) return s;
222
0
    else if ((nl = strstr(s, "\n#CHROM\t")) != NULL) return nl+1;
223
0
    else return NULL;
224
0
}
225
226
static int64_t get_rlen(const bcf_hdr_t *h, bcf1_t *v);
227
228
/*************************
229
 *** VCF header parser ***
230
 *************************/
231
232
static int bcf_hdr_add_sample_len(bcf_hdr_t *h, const char *s, size_t len)
233
5.23k
{
234
5.23k
    const char *ss = s;
235
5.43k
    while ( *ss && isspace_c(*ss) && ss - s < len) ss++;
236
5.23k
    if ( !*ss || ss - s == len)
237
2
    {
238
2
        hts_log_error("Empty sample name: trailing spaces/tabs in the header line?");
239
2
        return -1;
240
2
    }
241
242
5.23k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_SAMPLE];
243
5.23k
    int ret;
244
5.23k
    char *sdup = malloc(len + 1);
245
5.23k
    if (!sdup) return -1;
246
5.23k
    memcpy(sdup, s, len);
247
5.23k
    sdup[len] = 0;
248
249
    // Ensure space is available in h->samples
250
5.23k
    size_t n = kh_size(d);
251
5.23k
    char **new_samples = realloc(h->samples, sizeof(char*) * (n + 1));
252
5.23k
    if (!new_samples) {
253
0
        free(sdup);
254
0
        return -1;
255
0
    }
256
5.23k
    h->samples = new_samples;
257
258
5.23k
    int k = kh_put(vdict, d, sdup, &ret);
259
5.23k
    if (ret < 0) {
260
0
        free(sdup);
261
0
        return -1;
262
0
    }
263
5.23k
    if (ret) { // absent
264
5.23k
        kh_val(d, k) = bcf_idinfo_def;
265
5.23k
        kh_val(d, k).id = n;
266
5.23k
    } else {
267
0
        hts_log_error("Duplicated sample name '%s'", sdup);
268
0
        free(sdup);
269
0
        return -1;
270
0
    }
271
5.23k
    h->samples[n] = sdup;
272
5.23k
    h->dirty = 1;
273
5.23k
    return 0;
274
5.23k
}
275
276
int bcf_hdr_add_sample(bcf_hdr_t *h, const char *s)
277
0
{
278
0
    if (!s) {
279
        // Allowed for backwards-compatibility, calling with s == NULL
280
        // used to trigger bcf_hdr_sync(h);
281
0
        return 0;
282
0
    }
283
0
    return bcf_hdr_add_sample_len(h, s, strlen(s));
284
0
}
285
286
int HTS_RESULT_USED bcf_hdr_parse_sample_line(bcf_hdr_t *hdr, const char *str)
287
2.57k
{
288
2.57k
    const char *mandatory = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO";
289
2.57k
    if ( strncmp(str,mandatory,strlen(mandatory)) )
290
18
    {
291
18
        hts_log_error("Could not parse the \"#CHROM..\" line, either the fields are incorrect or spaces are present instead of tabs:\n\t%s",str);
292
18
        return -1;
293
18
    }
294
295
2.55k
    const char *beg = str + strlen(mandatory), *end;
296
2.55k
    if ( !*beg || *beg=='\n' ) return 0;
297
654
    if ( strncmp(beg,"\tFORMAT\t",8) )
298
6
    {
299
6
        hts_log_error("Could not parse the \"#CHROM..\" line, either FORMAT is missing or spaces are present instead of tabs:\n\t%s",str);
300
6
        return -1;
301
6
    }
302
648
    beg += 8;
303
304
648
    int ret = 0;
305
5.23k
    while ( *beg )
306
5.23k
    {
307
5.23k
        end = beg;
308
687M
        while ( *end && *end!='\t' && *end!='\n' ) end++;
309
5.23k
        if ( bcf_hdr_add_sample_len(hdr, beg, end-beg) < 0 ) ret = -1;
310
5.23k
        if ( !*end || *end=='\n' || ret<0 ) break;
311
4.58k
        beg = end + 1;
312
4.58k
    }
313
648
    return ret;
314
654
}
315
316
int bcf_hdr_sync(bcf_hdr_t *h)
317
98.9k
{
318
98.9k
    int i;
319
395k
    for (i = 0; i < 3; i++)
320
296k
    {
321
296k
        vdict_t *d = (vdict_t*)h->dict[i];
322
296k
        khint_t k;
323
296k
        if ( h->n[i] < kh_size(d) )
324
646
        {
325
646
            bcf_idpair_t *new_idpair;
326
            // this should be true only for i=2, BCF_DT_SAMPLE
327
646
            new_idpair = (bcf_idpair_t*) realloc(h->id[i], kh_size(d)*sizeof(bcf_idpair_t));
328
646
            if (!new_idpair) return -1;
329
646
            h->n[i] = kh_size(d);
330
646
            h->id[i] = new_idpair;
331
646
        }
332
3.44G
        for (k=kh_begin(d); k<kh_end(d); k++)
333
3.44G
        {
334
3.44G
            if (!kh_exist(d,k)) continue;
335
29.2M
            h->id[i][kh_val(d,k).id].key = kh_key(d,k);
336
29.2M
            h->id[i][kh_val(d,k).id].val = &kh_val(d,k);
337
29.2M
        }
338
296k
    }
339
340
    // Invalidate key length cache
341
98.9k
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
342
98.9k
    if (aux && aux->key_len) {
343
3.74k
        free(aux->key_len);
344
3.74k
        aux->key_len = NULL;
345
3.74k
    }
346
347
98.9k
    h->dirty = 0;
348
98.9k
    return 0;
349
98.9k
}
350
351
void bcf_hrec_destroy(bcf_hrec_t *hrec)
352
191k
{
353
191k
    if (!hrec) return;
354
184k
    free(hrec->key);
355
184k
    if ( hrec->value ) free(hrec->value);
356
184k
    int i;
357
639k
    for (i=0; i<hrec->nkeys; i++)
358
454k
    {
359
454k
        free(hrec->keys[i]);
360
454k
        free(hrec->vals[i]);
361
454k
    }
362
184k
    free(hrec->keys);
363
184k
    free(hrec->vals);
364
184k
    free(hrec);
365
184k
}
366
367
// Copies all fields except IDX.
368
bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec)
369
0
{
370
0
    int save_errno;
371
0
    bcf_hrec_t *out = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
372
0
    if (!out) return NULL;
373
374
0
    out->type = hrec->type;
375
0
    if ( hrec->key ) {
376
0
        out->key = strdup(hrec->key);
377
0
        if (!out->key) goto fail;
378
0
    }
379
0
    if ( hrec->value ) {
380
0
        out->value = strdup(hrec->value);
381
0
        if (!out->value) goto fail;
382
0
    }
383
0
    out->nkeys = hrec->nkeys;
384
0
    out->keys = (char**) malloc(sizeof(char*)*hrec->nkeys);
385
0
    if (!out->keys) goto fail;
386
0
    out->vals = (char**) malloc(sizeof(char*)*hrec->nkeys);
387
0
    if (!out->vals) goto fail;
388
0
    int i, j = 0;
389
0
    for (i=0; i<hrec->nkeys; i++)
390
0
    {
391
0
        if ( hrec->keys[i] && !strcmp("IDX",hrec->keys[i]) ) continue;
392
0
        if ( hrec->keys[i] ) {
393
0
            out->keys[j] = strdup(hrec->keys[i]);
394
0
            if (!out->keys[j]) goto fail;
395
0
        }
396
0
        if ( hrec->vals[i] ) {
397
0
            out->vals[j] = strdup(hrec->vals[i]);
398
0
            if (!out->vals[j]) goto fail;
399
0
        }
400
0
        j++;
401
0
    }
402
0
    if ( i!=j ) out->nkeys -= i-j;   // IDX was omitted
403
0
    return out;
404
405
0
 fail:
406
0
    save_errno = errno;
407
0
    hts_log_error("%s", strerror(errno));
408
0
    bcf_hrec_destroy(out);
409
0
    errno = save_errno;
410
0
    return NULL;
411
0
}
412
413
void bcf_hrec_debug(FILE *fp, bcf_hrec_t *hrec)
414
0
{
415
0
    fprintf(fp, "key=[%s] value=[%s]", hrec->key, hrec->value?hrec->value:"");
416
0
    int i;
417
0
    for (i=0; i<hrec->nkeys; i++)
418
0
        fprintf(fp, "\t[%s]=[%s]", hrec->keys[i],hrec->vals[i]);
419
0
    fprintf(fp, "\n");
420
0
}
421
422
void bcf_header_debug(bcf_hdr_t *hdr)
423
0
{
424
0
    int i, j;
425
0
    for (i=0; i<hdr->nhrec; i++)
426
0
    {
427
0
        if ( !hdr->hrec[i]->value )
428
0
        {
429
0
            fprintf(stderr, "##%s=<", hdr->hrec[i]->key);
430
0
            fprintf(stderr,"%s=%s", hdr->hrec[i]->keys[0], hdr->hrec[i]->vals[0]);
431
0
            for (j=1; j<hdr->hrec[i]->nkeys; j++)
432
0
                fprintf(stderr,",%s=%s", hdr->hrec[i]->keys[j], hdr->hrec[i]->vals[j]);
433
0
            fprintf(stderr,">\n");
434
0
        }
435
0
        else
436
0
            fprintf(stderr,"##%s=%s\n", hdr->hrec[i]->key,hdr->hrec[i]->value);
437
0
    }
438
0
}
439
440
int bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, size_t len)
441
353k
{
442
353k
    char **tmp;
443
353k
    size_t n = hrec->nkeys + 1;
444
353k
    assert(len > 0 && len < SIZE_MAX);
445
353k
    tmp = realloc(hrec->keys, sizeof(char*)*n);
446
353k
    if (!tmp) return -1;
447
353k
    hrec->keys = tmp;
448
353k
    tmp = realloc(hrec->vals, sizeof(char*)*n);
449
353k
    if (!tmp) return -1;
450
353k
    hrec->vals = tmp;
451
452
353k
    hrec->keys[hrec->nkeys] = (char*) malloc((len+1)*sizeof(char));
453
353k
    if (!hrec->keys[hrec->nkeys]) return -1;
454
353k
    memcpy(hrec->keys[hrec->nkeys],str,len);
455
353k
    hrec->keys[hrec->nkeys][len] = 0;
456
353k
    hrec->vals[hrec->nkeys] = NULL;
457
353k
    hrec->nkeys = n;
458
353k
    return 0;
459
353k
}
460
461
int bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, size_t len, int is_quoted)
462
353k
{
463
353k
    if ( hrec->vals[i] ) {
464
0
        free(hrec->vals[i]);
465
0
        hrec->vals[i] = NULL;
466
0
    }
467
353k
    if ( !str ) return 0;
468
353k
    if ( is_quoted )
469
99.6k
    {
470
99.6k
        if (len >= SIZE_MAX - 3) {
471
0
            errno = ENOMEM;
472
0
            return -1;
473
0
        }
474
99.6k
        hrec->vals[i] = (char*) malloc((len+3)*sizeof(char));
475
99.6k
        if (!hrec->vals[i]) return -1;
476
99.6k
        hrec->vals[i][0] = '"';
477
99.6k
        memcpy(&hrec->vals[i][1],str,len);
478
99.6k
        hrec->vals[i][len+1] = '"';
479
99.6k
        hrec->vals[i][len+2] = 0;
480
99.6k
    }
481
253k
    else
482
253k
    {
483
253k
        if (len == SIZE_MAX) {
484
0
            errno = ENOMEM;
485
0
            return -1;
486
0
        }
487
253k
        hrec->vals[i] = (char*) malloc((len+1)*sizeof(char));
488
253k
        if (!hrec->vals[i]) return -1;
489
253k
        memcpy(hrec->vals[i],str,len);
490
253k
        hrec->vals[i][len] = 0;
491
253k
    }
492
353k
    return 0;
493
353k
}
494
495
int hrec_add_idx(bcf_hrec_t *hrec, int idx)
496
101k
{
497
101k
    int n = hrec->nkeys + 1;
498
101k
    char **tmp = (char**) realloc(hrec->keys, sizeof(char*)*n);
499
101k
    if (!tmp) return -1;
500
101k
    hrec->keys = tmp;
501
502
101k
    tmp = (char**) realloc(hrec->vals, sizeof(char*)*n);
503
101k
    if (!tmp) return -1;
504
101k
    hrec->vals = tmp;
505
506
101k
    hrec->keys[hrec->nkeys] = strdup("IDX");
507
101k
    if (!hrec->keys[hrec->nkeys]) return -1;
508
509
101k
    kstring_t str = {0,0,0};
510
101k
    if (kputw(idx, &str) < 0) {
511
0
        free(hrec->keys[hrec->nkeys]);
512
0
        return -1;
513
0
    }
514
101k
    hrec->vals[hrec->nkeys] = str.s;
515
101k
    hrec->nkeys = n;
516
101k
    return 0;
517
101k
}
518
519
int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key)
520
118k
{
521
118k
    int i;
522
178k
    for (i=0; i<hrec->nkeys; i++)
523
136k
        if ( !strcasecmp(key,hrec->keys[i]) ) return i;
524
42.3k
    return -1;
525
118k
}
526
527
static void bcf_hrec_set_type(bcf_hrec_t *hrec)
528
345k
{
529
345k
    if ( !strcmp(hrec->key, "contig") ) hrec->type = BCF_HL_CTG;
530
324k
    else if ( !strcmp(hrec->key, "INFO") ) hrec->type = BCF_HL_INFO;
531
192k
    else if ( !strcmp(hrec->key, "FILTER") ) hrec->type = BCF_HL_FLT;
532
94.1k
    else if ( !strcmp(hrec->key, "FORMAT") ) hrec->type = BCF_HL_FMT;
533
78.3k
    else if ( hrec->nkeys>0 ) hrec->type = BCF_HL_STR;
534
56.6k
    else hrec->type = BCF_HL_GEN;
535
345k
}
536
537
538
/**
539
    The arrays were generated with
540
541
    valid_ctg:
542
        perl -le '@v = (split(//,q[!#$%&*+./:;=?@^_|~-]),"a"..."z","A"..."Z","0"..."9"); @a = (0) x 256; foreach $c (@v) { $a[ord($c)] = 1; } print join(", ",@a)' | fold -w 48
543
544
    valid_tag:
545
        perl -le '@v = (split(//,q[_.]),"a"..."z","A"..."Z","0"..."9"); @a = (0) x 256; foreach $c (@v) { $a[ord($c)] = 1; } print join(", ",@a)' | fold -w 48
546
*/
547
static const uint8_t valid_ctg[256] =
548
{
549
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
550
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
551
    0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1,
552
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
553
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
554
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
555
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
556
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,
557
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
558
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
559
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
560
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
561
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
562
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
563
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
564
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
565
};
566
static const uint8_t valid_tag[256] =
567
{
568
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
569
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
570
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
571
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
572
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
573
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
574
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
575
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
576
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
577
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
578
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
579
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
580
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
581
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
582
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
583
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
584
};
585
586
/**
587
    bcf_hrec_check() - check the validity of structured header lines
588
589
    Returns 0 on success or negative value on error.
590
591
    Currently the return status is not checked by the caller
592
    and only a warning is printed on stderr. This should be improved
593
    to propagate the error all the way up to the caller and let it
594
    decide what to do: throw an error or proceed anyway.
595
 */
596
static int bcf_hrec_check(bcf_hrec_t *hrec)
597
172k
{
598
172k
    int i;
599
172k
    bcf_hrec_set_type(hrec);
600
601
172k
    if ( hrec->type==BCF_HL_CTG )
602
10.3k
    {
603
10.3k
        i = bcf_hrec_find_key(hrec,"ID");
604
10.3k
        if ( i<0 ) goto err_missing_id;
605
5.03k
        char *val = hrec->vals[i];
606
5.03k
        if ( val[0]=='*' || val[0]=='=' || !valid_ctg[(uint8_t)val[0]] ) goto err_invalid_ctg;
607
87.5k
        while ( *(++val) )
608
87.4k
            if ( !valid_ctg[(uint8_t)*val] ) goto err_invalid_ctg;
609
112
        return 0;
610
785
    }
611
162k
    if ( hrec->type==BCF_HL_INFO )
612
66.2k
    {
613
66.2k
        i = bcf_hrec_find_key(hrec,"ID");
614
66.2k
        if ( i<0 ) goto err_missing_id;
615
50.4k
        char *val = hrec->vals[i];
616
50.4k
        if ( !strcmp(val,"1000G") ) return 0;
617
50.3k
        if ( val[0]=='.' || (val[0]>='0' && val[0]<='9') || !valid_tag[(uint8_t)val[0]] ) goto err_invalid_tag;
618
18.3k
        while ( *(++val) )
619
15.5k
            if ( !valid_tag[(uint8_t)*val] ) goto err_invalid_tag;
620
2.85k
        return 0;
621
4.92k
    }
622
96.0k
    if ( hrec->type==BCF_HL_FMT )
623
7.93k
    {
624
7.93k
        i = bcf_hrec_find_key(hrec,"ID");
625
7.93k
        if ( i<0 ) goto err_missing_id;
626
6.48k
        char *val = hrec->vals[i];
627
6.48k
        if ( val[0]=='.' || (val[0]>='0' && val[0]<='9') || !valid_tag[(uint8_t)val[0]] ) goto err_invalid_tag;
628
7.87k
        while ( *(++val) )
629
6.40k
            if ( !valid_tag[(uint8_t)*val] ) goto err_invalid_tag;
630
1.47k
        return 0;
631
3.06k
    }
632
88.0k
    return 0;
633
634
22.5k
  err_missing_id:
635
22.5k
    hts_log_warning("Missing ID attribute in one or more header lines");
636
22.5k
    return -1;
637
638
4.92k
  err_invalid_ctg:
639
4.92k
    hts_log_warning("Invalid contig name: \"%s\"", hrec->vals[i]);
640
4.92k
    return -1;
641
642
52.5k
  err_invalid_tag:
643
52.5k
    hts_log_warning("Invalid tag name: \"%s\"", hrec->vals[i]);
644
52.5k
    return -1;
645
96.0k
}
646
647
static inline int is_escaped(const char *min, const char *str)
648
99.0k
{
649
99.0k
    int n = 0;
650
99.0k
    while ( --str>=min && *str=='\\' ) n++;
651
99.0k
    return n%2;
652
99.0k
}
653
654
bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len)
655
195k
{
656
195k
    bcf_hrec_t *hrec = NULL;
657
195k
    const char *p = line;
658
195k
    if (p[0] != '#' || p[1] != '#') { *len = 0; return NULL; }
659
191k
    p += 2;
660
661
191k
    const char *q = p;
662
1.43M
    while ( *q && *q!='=' && *q != '\n' ) q++;
663
191k
    ptrdiff_t n = q-p;
664
191k
    if ( *q!='=' || !n ) // wrong format
665
7.53k
        goto malformed_line;
666
667
184k
    hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
668
184k
    if (!hrec) { *len = -1; return NULL; }
669
184k
    hrec->key = (char*) malloc(sizeof(char)*(n+1));
670
184k
    if (!hrec->key) goto fail;
671
184k
    memcpy(hrec->key,p,n);
672
184k
    hrec->key[n] = 0;
673
184k
    hrec->type = -1;
674
675
184k
    p = ++q;
676
184k
    if ( *p!='<' ) // generic field, e.g. ##samtoolsVersion=0.1.18-r579
677
33.0k
    {
678
9.83M
        while ( *q && *q!='\n' ) q++;
679
33.0k
        hrec->value = (char*) malloc((q-p+1)*sizeof(char));
680
33.0k
        if (!hrec->value) goto fail;
681
33.0k
        memcpy(hrec->value, p, q-p);
682
33.0k
        hrec->value[q-p] = 0;
683
33.0k
        *len = q - line + (*q ? 1 : 0); // Skip \n but not \0
684
33.0k
        return hrec;
685
33.0k
    }
686
687
    // structured line, e.g.
688
    // ##INFO=<ID=PV1,Number=1,Type=Float,Description="P-value for baseQ bias">
689
    // ##PEDIGREE=<Name_0=G0-ID,Name_1=G1-ID,Name_3=GN-ID>
690
151k
    int nopen = 1;
691
504k
    while ( *q && *q!='\n' && nopen>0 )
692
364k
    {
693
364k
        p = ++q;
694
364k
        while ( *q && *q==' ' ) { p++; q++; }
695
        // ^[A-Za-z_][0-9A-Za-z_.]*$
696
364k
        if (p==q && *q && (isalpha_c(*q) || *q=='_'))
697
361k
        {
698
361k
            q++;
699
2.00M
            while ( *q && (isalnum_c(*q) || *q=='_' || *q=='.') ) q++;
700
361k
        }
701
364k
        n = q-p;
702
364k
        int m = 0;
703
365k
        while ( *q && *q==' ' ) { q++; m++; }
704
364k
        if ( *q!='=' || !n )
705
11.6k
            goto malformed_line;
706
707
353k
        if (bcf_hrec_add_key(hrec, p, q-p-m) < 0) goto fail;
708
353k
        p = ++q;
709
355k
        while ( *q && *q==' ' ) { p++; q++; }
710
711
353k
        int quoted = 0;
712
353k
        char ending = '\0';
713
353k
        switch (*p) {
714
99.6k
        case '"':
715
99.6k
            quoted = 1;
716
99.6k
            ending = '"';
717
99.6k
            p++;
718
99.6k
            break;
719
23
        case '[':
720
23
            quoted = 1;
721
23
            ending = ']';
722
23
            break;
723
353k
        }
724
353k
        if ( quoted ) q++;
725
290M
        while ( *q && *q != '\n' )
726
290M
        {
727
290M
            if ( quoted ) { if ( *q==ending && !is_escaped(p,q) ) break; }
728
289M
            else
729
289M
            {
730
289M
                if ( *q=='<' ) nopen++;
731
289M
                if ( *q=='>' ) nopen--;
732
289M
                if ( !nopen ) break;
733
289M
                if ( *q==',' && nopen==1 ) break;
734
289M
            }
735
289M
            q++;
736
289M
        }
737
353k
        const char *r = q;
738
353k
        if (quoted && ending == ']') {
739
23
            if (*q == ending) {
740
3
                r++;
741
3
                q++;
742
3
                quoted = 0;
743
20
            } else {
744
20
                char buffer[320];
745
20
                hts_log_error("Missing ']' in header line %s",
746
20
                              hts_strprint(buffer, sizeof(buffer), '"',
747
20
                                           line, q-line));
748
20
                goto fail;
749
20
            }
750
23
        }
751
353k
        while ( r > p && r[-1] == ' ' ) r--;
752
353k
        if (bcf_hrec_set_val(hrec, hrec->nkeys-1, p, r-p, quoted) < 0)
753
0
            goto fail;
754
353k
        if ( quoted && *q==ending ) q++;
755
353k
        if ( *q=='>' )
756
103k
        {
757
103k
            if (nopen) nopen--;     // this can happen with nested angle brackets <>
758
103k
            q++;
759
103k
        }
760
353k
    }
761
139k
    if ( nopen )
762
36.3k
        hts_log_warning("Incomplete header line, trying to proceed anyway:\n\t[%s]\n\t[%d]",line,q[0]);
763
764
    // Skip to end of line
765
139k
    int nonspace = 0;
766
139k
    p = q;
767
2.77M
    while ( *q && *q!='\n' ) { nonspace |= !isspace_c(*q); q++; }
768
139k
    if (nonspace) {
769
981
        char buffer[320];
770
981
        hts_log_warning("Dropped trailing junk from header line '%s'",
771
981
                        hts_strprint(buffer, sizeof(buffer),
772
981
                                     '"', line, q - line));
773
981
    }
774
775
139k
    *len = q - line + (*q ? 1 : 0);
776
139k
    return hrec;
777
778
20
 fail:
779
20
    *len = -1;
780
20
    bcf_hrec_destroy(hrec);
781
20
    return NULL;
782
783
19.1k
 malformed_line:
784
19.1k
    {
785
19.1k
        char buffer[320];
786
31.0M
        while ( *q && *q!='\n' ) q++;  // Ensure *len includes full line
787
19.1k
        hts_log_error("Could not parse the header line: %s",
788
19.1k
                      hts_strprint(buffer, sizeof(buffer),
789
19.1k
                                   '"', line, q - line));
790
19.1k
        *len = q - line + (*q ? 1 : 0);
791
19.1k
        bcf_hrec_destroy(hrec);
792
19.1k
        return NULL;
793
151k
    }
794
151k
}
795
796
static int bcf_hdr_set_idx(bcf_hdr_t *hdr, int dict_type, const char *tag, bcf_idinfo_t *idinfo)
797
100k
{
798
100k
    size_t new_n;
799
800
    // If available, preserve existing IDX
801
100k
    if ( idinfo->id==-1 )
802
100k
        idinfo->id = hdr->n[dict_type];
803
314
    else if ( idinfo->id < hdr->n[dict_type] && hdr->id[dict_type][idinfo->id].key )
804
2
    {
805
2
        hts_log_error("Conflicting IDX=%d lines in the header dictionary, the new tag is %s",
806
2
            idinfo->id, tag);
807
2
        errno = EINVAL;
808
2
        return -1;
809
2
    }
810
811
100k
    new_n = idinfo->id >= hdr->n[dict_type] ? idinfo->id+1 : hdr->n[dict_type];
812
100k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
813
    // hts_resize() can attempt to allocate up to 2 * requested items
814
100k
    if (new_n > FUZZ_ALLOC_LIMIT/(2 * sizeof(bcf_idpair_t)))
815
4
        return -1;
816
100k
#endif
817
100k
    if (hts_resize(bcf_idpair_t, new_n, &hdr->m[dict_type],
818
100k
                   &hdr->id[dict_type], HTS_RESIZE_CLEAR)) {
819
0
        return -1;
820
0
    }
821
100k
    hdr->n[dict_type] = new_n;
822
823
    // NB: the next kh_put call can invalidate the idinfo pointer, therefore
824
    // we leave it unassigned here. It must be set explicitly in bcf_hdr_sync.
825
100k
    hdr->id[dict_type][idinfo->id].key = tag;
826
827
100k
    return 0;
828
100k
}
829
830
// returns: 1 when hdr needs to be synced, -1 on error, 0 otherwise
831
static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
832
172k
{
833
    // contig
834
172k
    int i, ret, replacing = 0;
835
172k
    khint_t k;
836
172k
    char *str = NULL;
837
838
172k
    bcf_hrec_set_type(hrec);
839
840
172k
    if ( hrec->type==BCF_HL_CTG )
841
10.3k
    {
842
10.3k
        hts_pos_t len = 0;
843
844
        // Get the contig ID ($str) and length ($j)
845
10.3k
        i = bcf_hrec_find_key(hrec,"length");
846
10.3k
        if ( i<0 ) len = 0;
847
2.46k
        else {
848
2.46k
            char *end = hrec->vals[i];
849
2.46k
            len = strtoll(hrec->vals[i], &end, 10);
850
2.46k
            if (end == hrec->vals[i] || len < 0) return 0;
851
2.46k
        }
852
853
9.12k
        i = bcf_hrec_find_key(hrec,"ID");
854
9.12k
        if ( i<0 ) return 0;
855
5.03k
        str = strdup(hrec->vals[i]);
856
5.03k
        if (!str) return -1;
857
858
        // Register in the dictionary
859
5.03k
        vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_CTG];
860
5.03k
        khint_t k = kh_get(vdict, d, str);
861
5.03k
        if ( k != kh_end(d) ) { // already present
862
1.16k
            free(str); str=NULL;
863
1.16k
            if (kh_val(d, k).hrec[0] != NULL) // and not removed
864
1.16k
                return 0;
865
0
            replacing = 1;
866
3.87k
        } else {
867
3.87k
            k = kh_put(vdict, d, str, &ret);
868
3.87k
            if (ret < 0) { free(str); return -1; }
869
3.87k
        }
870
871
3.87k
        int idx = bcf_hrec_find_key(hrec,"IDX");
872
3.87k
        if ( idx!=-1 )
873
652
        {
874
652
            char *tmp = hrec->vals[idx];
875
652
            idx = strtol(hrec->vals[idx], &tmp, 10);
876
652
            if ( *tmp || idx < 0 || idx >= INT_MAX - 1)
877
644
            {
878
644
                if (!replacing) {
879
644
                    kh_del(vdict, d, k);
880
644
                    free(str);
881
644
                }
882
644
                hts_log_warning("Error parsing the IDX tag, skipping");
883
644
                return 0;
884
644
            }
885
652
        }
886
887
3.22k
        kh_val(d, k) = bcf_idinfo_def;
888
3.22k
        kh_val(d, k).id = idx;
889
3.22k
        kh_val(d, k).info[0] = len;
890
3.22k
        kh_val(d, k).hrec[0] = hrec;
891
3.22k
        if (bcf_hdr_set_idx(hdr, BCF_DT_CTG, kh_key(d,k), &kh_val(d,k)) < 0) {
892
4
            if (!replacing) {
893
4
                kh_del(vdict, d, k);
894
4
                free(str);
895
4
            }
896
4
            return -1;
897
4
        }
898
3.22k
        if ( idx==-1 ) {
899
3.22k
            if (hrec_add_idx(hrec, kh_val(d,k).id) < 0) {
900
0
               return -1;
901
0
            }
902
3.22k
        }
903
904
3.22k
        return 1;
905
3.22k
    }
906
907
162k
    if ( hrec->type==BCF_HL_STR ) return 1;
908
151k
    if ( hrec->type!=BCF_HL_INFO && hrec->type!=BCF_HL_FLT && hrec->type!=BCF_HL_FMT ) return 0;
909
910
    // INFO/FILTER/FORMAT
911
123k
    char *id = NULL;
912
123k
    uint32_t type = UINT32_MAX, var = UINT32_MAX;
913
123k
    int num = -1, idx = -1;
914
444k
    for (i=0; i<hrec->nkeys; i++)
915
323k
    {
916
323k
        if ( !strcmp(hrec->keys[i], "ID") ) id = hrec->vals[i];
917
218k
        else if ( !strcmp(hrec->keys[i], "IDX") )
918
4.06k
        {
919
4.06k
            char *tmp = hrec->vals[i];
920
4.06k
            idx = strtol(hrec->vals[i], &tmp, 10);
921
4.06k
            if ( *tmp || idx < 0 || idx >= INT_MAX - 1)
922
1.27k
            {
923
1.27k
                hts_log_warning("Error parsing the IDX tag, skipping");
924
1.27k
                return 0;
925
1.27k
            }
926
4.06k
        }
927
213k
        else if ( !strcmp(hrec->keys[i], "Type") )
928
59.8k
        {
929
59.8k
            if ( !strcmp(hrec->vals[i], "Integer") ) type = BCF_HT_INT;
930
57.4k
            else if ( !strcmp(hrec->vals[i], "Float") ) type = BCF_HT_REAL;
931
56.6k
            else if ( !strcmp(hrec->vals[i], "String") ) type = BCF_HT_STR;
932
9.24k
            else if ( !strcmp(hrec->vals[i], "Character") ) type = BCF_HT_STR;
933
8.86k
            else if ( !strcmp(hrec->vals[i], "Flag") ) type = BCF_HT_FLAG;
934
7.01k
            else
935
7.01k
            {
936
7.01k
                hts_log_warning("The type \"%s\" is not supported, assuming \"String\"", hrec->vals[i]);
937
7.01k
                type = BCF_HT_STR;
938
7.01k
            }
939
59.8k
        }
940
154k
        else if ( !strcmp(hrec->keys[i], "Number") )
941
51.2k
        {
942
51.2k
            int is_fmt = hrec->type == BCF_HL_FMT;
943
51.2k
            if ( !strcmp(hrec->vals[i],"A") ) var = BCF_VL_A;
944
49.9k
            else if ( !strcmp(hrec->vals[i],"R") ) var = BCF_VL_R;
945
49.8k
            else if ( !strcmp(hrec->vals[i],"G") ) var = BCF_VL_G;
946
49.0k
            else if ( !strcmp(hrec->vals[i],".") ) var = BCF_VL_VAR;
947
49.0k
            else if ( is_fmt && !strcmp(hrec->vals[i],"P") )  var = BCF_VL_P;
948
49.0k
            else if ( is_fmt && !strcmp(hrec->vals[i],"LA") ) var = BCF_VL_LA;
949
49.0k
            else if ( is_fmt && !strcmp(hrec->vals[i],"LR") ) var = BCF_VL_LR;
950
49.0k
            else if ( is_fmt && !strcmp(hrec->vals[i],"LG") ) var = BCF_VL_LG;
951
49.0k
            else if ( is_fmt && !strcmp(hrec->vals[i],"M") )  var = BCF_VL_M;
952
49.0k
            else
953
49.0k
            {
954
49.0k
                if (sscanf(hrec->vals[i],"%d",&num) == 1)
955
48.4k
                    var = BCF_VL_FIXED;
956
49.0k
            }
957
51.2k
            if (var != BCF_VL_FIXED) num = 0xfffff;
958
51.2k
        }
959
323k
    }
960
121k
    if (hrec->type == BCF_HL_INFO || hrec->type == BCF_HL_FMT) {
961
72.9k
        if (type == -1) {
962
14.5k
            hts_log_warning("%s %s field has no Type defined. Assuming String",
963
14.5k
                *hrec->key == 'I' ? "An" : "A", hrec->key);
964
14.5k
            type = BCF_HT_STR;
965
14.5k
        }
966
72.9k
        if (var == UINT32_MAX) {
967
22.3k
            hts_log_warning("%s %s field has no Number defined. Assuming '.'",
968
22.3k
                *hrec->key == 'I' ? "An" : "A", hrec->key);
969
22.3k
            var = BCF_VL_VAR;
970
22.3k
        }
971
72.9k
        if ( type==BCF_HT_FLAG && (var!=BCF_VL_FIXED || num!=0) )
972
1.29k
        {
973
1.29k
            hts_log_warning("The definition of Flag \"%s/%s\" is invalid, forcing Number=0", hrec->key,id);
974
1.29k
            var = BCF_VL_FIXED;
975
1.29k
            num = 0;
976
1.29k
        }
977
72.9k
    }
978
121k
    uint32_t info = ((((uint32_t)num) & 0xfffff)<<12 |
979
121k
                     (var & 0xf) << 8 |
980
121k
                     (type & 0xf) << 4 |
981
121k
                     (((uint32_t) hrec->type) & 0xf));
982
983
121k
    if ( !id ) return 0;
984
104k
    str = strdup(id);
985
104k
    if (!str) return -1;
986
987
104k
    vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_ID];
988
104k
    k = kh_get(vdict, d, str);
989
104k
    if ( k != kh_end(d) )
990
7.74k
    {
991
        // already present
992
7.74k
        free(str);
993
7.74k
        if ( kh_val(d, k).hrec[info&0xf] ) return 0;
994
1.73k
        kh_val(d, k).info[info&0xf] = info;
995
1.73k
        kh_val(d, k).hrec[info&0xf] = hrec;
996
1.73k
        if ( idx==-1 ) {
997
1.73k
            if (hrec_add_idx(hrec, kh_val(d, k).id) < 0) {
998
0
                return -1;
999
0
            }
1000
1.73k
        }
1001
1.73k
        return 1;
1002
1.73k
    }
1003
97.1k
    k = kh_put(vdict, d, str, &ret);
1004
97.1k
    if (ret < 0) {
1005
0
        free(str);
1006
0
        return -1;
1007
0
    }
1008
97.1k
    kh_val(d, k) = bcf_idinfo_def;
1009
97.1k
    kh_val(d, k).info[info&0xf] = info;
1010
97.1k
    kh_val(d, k).hrec[info&0xf] = hrec;
1011
97.1k
    kh_val(d, k).id = idx;
1012
97.1k
    if (bcf_hdr_set_idx(hdr, BCF_DT_ID, kh_key(d,k), &kh_val(d,k)) < 0) {
1013
2
        kh_del(vdict, d, k);
1014
2
        free(str);
1015
2
        return -1;
1016
2
    }
1017
97.1k
    if ( idx==-1 ) {
1018
96.8k
        if (hrec_add_idx(hrec, kh_val(d,k).id) < 0) {
1019
0
            return -1;
1020
0
        }
1021
96.8k
    }
1022
1023
97.1k
    return 1;
1024
97.1k
}
1025
1026
static void bcf_hdr_unregister_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
1027
0
{
1028
0
    if (hrec->type == BCF_HL_FLT ||
1029
0
        hrec->type == BCF_HL_INFO ||
1030
0
        hrec->type == BCF_HL_FMT ||
1031
0
        hrec->type == BCF_HL_CTG) {
1032
0
        int id = bcf_hrec_find_key(hrec, "ID");
1033
0
        if (id < 0 || !hrec->vals[id])
1034
0
            return;
1035
0
        vdict_t *dict = (hrec->type == BCF_HL_CTG
1036
0
                         ? (vdict_t*)hdr->dict[BCF_DT_CTG]
1037
0
                         : (vdict_t*)hdr->dict[BCF_DT_ID]);
1038
0
        khint_t k = kh_get(vdict, dict, hrec->vals[id]);
1039
0
        if (k != kh_end(dict))
1040
0
            kh_val(dict, k).hrec[hrec->type==BCF_HL_CTG ? 0 : hrec->type] = NULL;
1041
0
    }
1042
0
}
1043
1044
static void bcf_hdr_remove_from_hdict(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
1045
0
{
1046
0
    kstring_t str = KS_INITIALIZE;
1047
0
    bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1048
0
    khint_t k;
1049
0
    int id;
1050
1051
0
    switch (hrec->type) {
1052
0
    case BCF_HL_GEN:
1053
0
        if (ksprintf(&str, "##%s=%s", hrec->key,hrec->value) < 0)
1054
0
            str.l = 0;
1055
0
        break;
1056
0
    case BCF_HL_STR:
1057
0
        id = bcf_hrec_find_key(hrec, "ID");
1058
0
        if (id < 0)
1059
0
            return;
1060
0
        if (!hrec->vals[id] ||
1061
0
            ksprintf(&str, "##%s=<ID=%s>", hrec->key, hrec->vals[id]) < 0)
1062
0
            str.l = 0;
1063
0
        break;
1064
0
    default:
1065
0
        return;
1066
0
    }
1067
0
    if (str.l) {
1068
0
        k = kh_get(hdict, aux->gen, str.s);
1069
0
    } else {
1070
        // Couldn't get a string for some reason, so try the hard way...
1071
0
        for (k = kh_begin(aux->gen); k < kh_end(aux->gen); k++) {
1072
0
            if (kh_exist(aux->gen, k) && kh_val(aux->gen, k) == hrec)
1073
0
                break;
1074
0
        }
1075
0
    }
1076
0
    if (k != kh_end(aux->gen) && kh_val(aux->gen, k) == hrec) {
1077
0
        kh_val(aux->gen, k) = NULL;
1078
0
        free((char *) kh_key(aux->gen, k));
1079
0
        kh_key(aux->gen, k) = NULL;
1080
0
        kh_del(hdict, aux->gen, k);
1081
0
    }
1082
0
    free(str.s);
1083
0
}
1084
1085
int bcf_hdr_update_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec, const bcf_hrec_t *tmp)
1086
0
{
1087
0
    assert( hrec->type==BCF_HL_GEN );
1088
0
    int ret;
1089
0
    khint_t k;
1090
0
    bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1091
0
    for (k=kh_begin(aux->gen); k<kh_end(aux->gen); k++)
1092
0
    {
1093
0
        if ( !kh_exist(aux->gen,k) ) continue;
1094
0
        if ( hrec!=(bcf_hrec_t*)kh_val(aux->gen,k) ) continue;
1095
0
        break;
1096
0
    }
1097
0
    assert( k<kh_end(aux->gen) );   // something went wrong, should never happen
1098
0
    free((char*)kh_key(aux->gen,k));
1099
0
    kh_del(hdict,aux->gen,k);
1100
0
    kstring_t str = {0,0,0};
1101
0
    if ( ksprintf(&str, "##%s=%s", tmp->key,tmp->value) < 0 )
1102
0
    {
1103
0
        free(str.s);
1104
0
        return -1;
1105
0
    }
1106
0
    k = kh_put(hdict, aux->gen, str.s, &ret);
1107
0
    if ( ret<0 )
1108
0
    {
1109
0
        free(str.s);
1110
0
        return -1;
1111
0
    }
1112
0
    free(hrec->value);
1113
0
    hrec->value = strdup(tmp->value);
1114
0
    if ( !hrec->value ) return -1;
1115
0
    kh_val(aux->gen,k) = hrec;
1116
1117
0
    if (!strcmp(hrec->key,"fileformat")) {
1118
        //update version
1119
0
        get_hdr_aux(hdr)->version = bcf_get_version(NULL, hrec->value);
1120
0
    }
1121
0
    return 0;
1122
0
}
1123
1124
int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
1125
172k
{
1126
172k
    kstring_t str = {0,0,0};
1127
172k
    bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1128
1129
172k
    int res;
1130
172k
    if ( !hrec ) return 0;
1131
1132
172k
    bcf_hrec_check(hrec);   // todo: check return status and propagate errors up
1133
1134
172k
    res = bcf_hdr_register_hrec(hdr,hrec);
1135
172k
    if (res < 0) return -1;
1136
172k
    if ( !res )
1137
59.6k
    {
1138
        // If one of the hashed field, then it is already present
1139
59.6k
        if ( hrec->type != BCF_HL_GEN )
1140
31.2k
        {
1141
31.2k
            bcf_hrec_destroy(hrec);
1142
31.2k
            return 0;
1143
31.2k
        }
1144
        // Is one of the generic fields and already present?
1145
28.3k
        if ( ksprintf(&str, "##%s=%s", hrec->key,hrec->value) < 0 )
1146
0
        {
1147
0
            free(str.s);
1148
0
            return -1;
1149
0
        }
1150
28.3k
        khint_t k = kh_get(hdict, aux->gen, str.s);
1151
28.3k
        if ( k != kh_end(aux->gen) )
1152
17.2k
        {
1153
            // duplicate record
1154
17.2k
            bcf_hrec_destroy(hrec);
1155
17.2k
            free(str.s);
1156
17.2k
            return 0;
1157
17.2k
        }
1158
11.0k
        if (!strcmp(hrec->key, "fileformat")) {
1159
3.84k
            aux->version = bcf_get_version(NULL, hrec->value);
1160
3.84k
        }
1161
11.0k
    }
1162
1163
124k
    int i;
1164
124k
    if ( hrec->type==BCF_HL_STR && (i=bcf_hrec_find_key(hrec,"ID"))>=0 )
1165
6.14k
    {
1166
6.14k
        if ( ksprintf(&str, "##%s=<ID=%s>", hrec->key,hrec->vals[i]) < 0 )
1167
0
        {
1168
0
            free(str.s);
1169
0
            return -1;
1170
0
        }
1171
6.14k
        khint_t k = kh_get(hdict, aux->gen, str.s);
1172
6.14k
        if ( k != kh_end(aux->gen) )
1173
4.61k
        {
1174
            // duplicate record
1175
4.61k
            bcf_hrec_destroy(hrec);
1176
4.61k
            free(str.s);
1177
4.61k
            return 0;
1178
4.61k
        }
1179
6.14k
    }
1180
1181
    // New record, needs to be added
1182
119k
    int n = hdr->nhrec + 1;
1183
119k
    bcf_hrec_t **new_hrec = realloc(hdr->hrec, n*sizeof(bcf_hrec_t*));
1184
119k
    if (!new_hrec) {
1185
0
        free(str.s);
1186
0
        bcf_hdr_unregister_hrec(hdr, hrec);
1187
0
        return -1;
1188
0
    }
1189
119k
    hdr->hrec = new_hrec;
1190
1191
119k
    if ( str.s )
1192
12.5k
    {
1193
12.5k
        khint_t k = kh_put(hdict, aux->gen, str.s, &res);
1194
12.5k
        if ( res<0 )
1195
0
        {
1196
0
            free(str.s);
1197
0
            return -1;
1198
0
        }
1199
12.5k
        kh_val(aux->gen,k) = hrec;
1200
12.5k
    }
1201
1202
119k
    hdr->hrec[hdr->nhrec] = hrec;
1203
119k
    hdr->dirty = 1;
1204
119k
    hdr->nhrec = n;
1205
1206
119k
    return hrec->type==BCF_HL_GEN ? 0 : 1;
1207
119k
}
1208
1209
bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class)
1210
730
{
1211
730
    int i;
1212
730
    if ( type==BCF_HL_GEN )
1213
730
    {
1214
        // e.g. ##fileformat=VCFv4.2
1215
        //      ##source=GenomicsDBImport
1216
        //      ##bcftools_viewVersion=1.16-80-gdfdb0923+htslib-1.16-34-g215d364
1217
730
        if ( value )
1218
0
        {
1219
0
            kstring_t str = {0,0,0};
1220
0
            ksprintf(&str, "##%s=%s", key,value);
1221
0
            bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1222
0
            khint_t k = kh_get(hdict, aux->gen, str.s);
1223
0
            free(str.s);
1224
0
            if ( k == kh_end(aux->gen) ) return NULL;
1225
0
            return kh_val(aux->gen, k);
1226
0
        }
1227
1.46k
        for (i=0; i<hdr->nhrec; i++)
1228
1.04k
        {
1229
1.04k
            if ( hdr->hrec[i]->type!=type ) continue;
1230
386
            if ( strcmp(hdr->hrec[i]->key,key) ) continue;
1231
312
            return hdr->hrec[i];
1232
386
        }
1233
418
        return NULL;
1234
730
    }
1235
0
    else if ( type==BCF_HL_STR )
1236
0
    {
1237
        // e.g. ##GATKCommandLine=<ID=GenomicsDBImport,CommandLine="GenomicsDBImport....">
1238
        //      ##ALT=<ID=NON_REF,Description="Represents any possible alternative allele not already represented at this location by REF and ALT">
1239
0
        if (!str_class) return NULL;
1240
0
        if ( !strcmp("ID",key) )
1241
0
        {
1242
0
            kstring_t str = {0,0,0};
1243
0
            ksprintf(&str, "##%s=<%s=%s>",str_class,key,value);
1244
0
            bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1245
0
            khint_t k = kh_get(hdict, aux->gen, str.s);
1246
0
            free(str.s);
1247
0
            if ( k == kh_end(aux->gen) ) return NULL;
1248
0
            return kh_val(aux->gen, k);
1249
0
        }
1250
0
        for (i=0; i<hdr->nhrec; i++)
1251
0
        {
1252
0
            if ( hdr->hrec[i]->type!=type ) continue;
1253
0
            if ( strcmp(hdr->hrec[i]->key,str_class) ) continue;
1254
0
            int j = bcf_hrec_find_key(hdr->hrec[i],key);
1255
0
            if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],value) ) return hdr->hrec[i];
1256
0
        }
1257
0
        return NULL;
1258
0
    }
1259
0
    vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
1260
0
    khint_t k = kh_get(vdict, d, value);
1261
0
    if ( k == kh_end(d) ) return NULL;
1262
0
    return kh_val(d, k).hrec[type==BCF_HL_CTG?0:type];
1263
0
}
1264
1265
// Check the VCF header is correctly formatted as per the specification.
1266
// Note the code that calls this doesn't bother to check return values and
1267
// we have so many broken VCFs in the wild that for now we just reprt a
1268
// warning and continue anyway.  So currently this is a void function.
1269
void bcf_hdr_check_sanity(bcf_hdr_t *hdr)
1270
2.54k
{
1271
2.54k
    int version = bcf_get_version(hdr, NULL);
1272
1273
2.54k
    struct tag {
1274
2.54k
        char name[10];
1275
2.54k
        char number_str[3];
1276
2.54k
        int number;
1277
2.54k
        int version;
1278
2.54k
        int type;
1279
2.54k
    };
1280
1281
2.54k
    char type_str[][8] = {"Flag", "Integer", "Float", "String"};
1282
1283
2.54k
    struct tag info_tags[] = {
1284
2.54k
        {"AD",        "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
1285
2.54k
        {"ADF",       "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
1286
2.54k
        {"ADR",       "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
1287
2.54k
        {"AC",        "A",  BCF_VL_A,     VCF_DEF, BCF_HT_INT},
1288
2.54k
        {"AF",        "A",  BCF_VL_A,     VCF_DEF, BCF_HT_REAL},
1289
2.54k
        {"CIGAR",     "A",  BCF_VL_A,     VCF_DEF, BCF_HT_STR},
1290
2.54k
        {"AA",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_STR},
1291
2.54k
        {"AN",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1292
2.54k
        {"BQ",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_REAL},
1293
2.54k
        {"DB",        "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
1294
2.54k
        {"DP",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1295
2.54k
        {"END",       "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1296
2.54k
        {"H2",        "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
1297
2.54k
        {"H3",        "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
1298
2.54k
        {"MQ",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_REAL},
1299
2.54k
        {"MQ0",       "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1300
2.54k
        {"NS",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1301
2.54k
        {"SB",        "4",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1302
2.54k
        {"SOMATIC",   "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
1303
2.54k
        {"VALIDATED", "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
1304
2.54k
        {"1000G",     "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
1305
2.54k
    };
1306
2.54k
    static int info_warned[sizeof(info_tags)/sizeof(*info_tags)] = {0};
1307
1308
2.54k
    struct tag fmt_tags[] = {
1309
2.54k
        {"AD",   "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
1310
2.54k
        {"ADF",  "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
1311
2.54k
        {"ADR",  "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
1312
2.54k
        {"EC",   "A",  BCF_VL_A,     VCF_DEF, BCF_HT_INT},
1313
2.54k
        {"GL",   "G",  BCF_VL_G,     VCF_DEF, BCF_HT_REAL},
1314
2.54k
        {"GP",   "G",  BCF_VL_G,     VCF_DEF, BCF_HT_REAL},
1315
2.54k
        {"PL",   "G",  BCF_VL_G,     VCF_DEF, BCF_HT_INT},
1316
2.54k
        {"PP",   "G",  BCF_VL_G,     VCF_DEF, BCF_HT_INT},
1317
2.54k
        {"DP",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1318
2.54k
        {"LEN",  "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1319
2.54k
        {"FT",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_STR},
1320
2.54k
        {"GQ",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1321
2.54k
        {"GT",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_STR},
1322
2.54k
        {"HQ",   "2",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1323
2.54k
        {"MQ",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1324
2.54k
        {"PQ",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1325
2.54k
        {"PS",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
1326
2.54k
        {"PSL",  "P",  BCF_VL_P,     VCF44,   BCF_HT_STR},
1327
2.54k
        {"PSO",  "P",  BCF_VL_P,     VCF44,   BCF_HT_INT},
1328
2.54k
        {"PSQ",  "P",  BCF_VL_P,     VCF44,   BCF_HT_INT},
1329
2.54k
        {"LGL",  "LG", BCF_VL_LG,    VCF45,   BCF_HT_INT},
1330
2.54k
        {"LGP",  "LG", BCF_VL_LG,    VCF45,   BCF_HT_INT},
1331
2.54k
        {"LPL",  "LG", BCF_VL_LG,    VCF45,   BCF_HT_INT},
1332
2.54k
        {"LPP",  "LG", BCF_VL_LG,    VCF45,   BCF_HT_INT},
1333
2.54k
        {"LEC",  "LA", BCF_VL_LA,    VCF45,   BCF_HT_INT},
1334
2.54k
        {"LAD",  "LR", BCF_VL_LR,    VCF45,   BCF_HT_INT},
1335
2.54k
        {"LADF", "LR", BCF_VL_LR,    VCF45,   BCF_HT_INT},
1336
2.54k
        {"LADR", "LR", BCF_VL_LR,    VCF45,   BCF_HT_INT},
1337
2.54k
    };
1338
2.54k
    static int fmt_warned[sizeof(fmt_tags)/sizeof(*fmt_tags)] = {0};
1339
1340
    // Check INFO tag numbers.  We shouldn't really permit ".", but it's
1341
    // commonly misused so we let it slide unless it's a new tag and the
1342
    // file format claims to be new also.  We also cannot distinguish between
1343
    // Number=1 and Number=2, but we at least report the correct term if we
1344
    // get, say, Number=G in its place.
1345
    // Also check the types.
1346
2.54k
    int i;
1347
56.0k
    for (i = 0; i < sizeof(info_tags)/sizeof(*info_tags); i++) {
1348
53.4k
        if (info_warned[i])
1349
0
            continue;
1350
53.4k
        int id = bcf_hdr_id2int(hdr, BCF_DT_ID, info_tags[i].name);
1351
53.4k
        if (bcf_hdr_idinfo_exists(hdr, BCF_HL_INFO, id)) {
1352
0
            if (bcf_hdr_id2length(hdr, BCF_HL_INFO, id) != info_tags[i].number &&
1353
0
                bcf_hdr_id2length(hdr, BCF_HL_INFO, id) != BCF_VL_VAR) {
1354
0
                info_warned[i] = 1;
1355
0
            } else if (bcf_hdr_id2length(hdr, BCF_HL_INFO, id) == BCF_VL_FIXED &&
1356
0
                       bcf_hdr_id2number(hdr, BCF_HL_INFO, id) != atoi(info_tags[i].number_str)) {
1357
0
                info_warned[i] = 1;
1358
0
            }
1359
1360
0
            if (info_warned[i]) {
1361
0
                hts_log_warning("%s should be declared as Number=%s",
1362
0
                                info_tags[i].name, info_tags[i].number_str);
1363
0
            }
1364
1365
0
            if (bcf_hdr_id2type(hdr, BCF_HL_INFO, id) != info_tags[i].type) {
1366
0
                hts_log_warning("%s should be declared as Type=%s",
1367
0
                                info_tags[i].name, type_str[info_tags[i].type]);
1368
0
                info_warned[i] = 1;
1369
0
            }
1370
0
        }
1371
53.4k
    }
1372
1373
    // Check FORMAT tag numbers and types.
1374
73.8k
    for (i = 0; i < sizeof(fmt_tags)/sizeof(*fmt_tags); i++) {
1375
71.2k
        if (fmt_warned[i])
1376
0
            continue;
1377
71.2k
        int id = bcf_hdr_id2int(hdr, BCF_DT_ID, fmt_tags[i].name);
1378
71.2k
        if (bcf_hdr_idinfo_exists(hdr, BCF_HL_FMT, id)) {
1379
0
            if (bcf_hdr_id2length(hdr, BCF_HL_FMT, id) != fmt_tags[i].number) {
1380
                // Permit "Number=." if this tag predates the vcf version it is
1381
                // defined within.  This is a common tactic for callers to use
1382
                // new tags with older formats in order to avoid parsing failures
1383
                // with some software.
1384
                // We don't care for 4.3 and earlier as that's more of a wild-west
1385
                // and it's not abnormal to see incorrect usage of Number=. there.
1386
0
                if ((version < VCF44 &&
1387
0
                     bcf_hdr_id2length(hdr, BCF_HL_FMT, id) != BCF_VL_VAR) ||
1388
0
                    (version >= VCF44 && version >= fmt_tags[i].version)) {
1389
0
                    fmt_warned[i] = 1;
1390
0
                }
1391
0
            } else if (bcf_hdr_id2length(hdr, BCF_HL_FMT, id) == BCF_VL_FIXED &&
1392
0
                       bcf_hdr_id2number(hdr, BCF_HL_FMT, id) != atoi(fmt_tags[i].number_str)) {
1393
0
                fmt_warned[i] = 1;
1394
0
            }
1395
1396
0
            if (fmt_warned[i]) {
1397
0
                hts_log_warning("%s should be declared as Number=%s",
1398
0
                                fmt_tags[i].name, fmt_tags[i].number_str);
1399
0
            }
1400
1401
0
            if (bcf_hdr_id2type(hdr, BCF_HL_FMT, id) != fmt_tags[i].type) {
1402
0
                hts_log_warning("%s should be declared as Type=%s",
1403
0
                                fmt_tags[i].name, type_str[fmt_tags[i].type]);
1404
0
                fmt_warned[i] = 1;
1405
0
            }
1406
0
        }
1407
71.2k
    }
1408
2.54k
}
1409
1410
int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt)
1411
2.94k
{
1412
2.94k
    int len, done = 0;
1413
2.94k
    char *p = htxt;
1414
1415
    // Check sanity: "fileformat" string must come as first
1416
2.94k
    bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,p,&len);
1417
2.94k
    if ( !hrec || !hrec->key || strcasecmp(hrec->key,"fileformat") )
1418
224
        hts_log_warning("The first line should be ##fileformat; is the VCF/BCF header broken?");
1419
2.94k
    if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
1420
0
        bcf_hrec_destroy(hrec);
1421
0
        return -1;
1422
0
    }
1423
1424
    // The filter PASS must appear first in the dictionary
1425
2.94k
    hrec = bcf_hdr_parse_line(hdr,"##FILTER=<ID=PASS,Description=\"All filters passed\">",&len);
1426
2.94k
    if (!hrec || bcf_hdr_add_hrec(hdr, hrec) < 0) {
1427
0
        bcf_hrec_destroy(hrec);
1428
0
        return -1;
1429
0
    }
1430
1431
    // Parse the whole header
1432
22.6k
    do {
1433
93.0k
        while (NULL != (hrec = bcf_hdr_parse_line(hdr, p, &len))) {
1434
70.4k
            if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
1435
6
                bcf_hrec_destroy(hrec);
1436
6
                return -1;
1437
6
            }
1438
70.4k
            p += len;
1439
70.4k
        }
1440
22.6k
        assert(hrec == NULL);
1441
22.6k
        if (len < 0) {
1442
            // len < 0 indicates out-of-memory, or similar error
1443
2
            hts_log_error("Could not parse header line: %s", strerror(errno));
1444
2
            return -1;
1445
22.6k
        } else if (len > 0) {
1446
            // Bad header line.  bcf_hdr_parse_line() will have logged it.
1447
            // Skip and try again on the next line (p + len will be the start
1448
            // of the next one).
1449
19.0k
            p += len;
1450
19.0k
            continue;
1451
19.0k
        }
1452
1453
        // Next should be the sample line.  If not, it was a malformed
1454
        // header, in which case print a warning and skip (many VCF
1455
        // operations do not really care about a few malformed lines).
1456
        // In the future we may want to add a strict mode that errors in
1457
        // this case.
1458
3.59k
        if ( strncmp("#CHROM\t",p,7) && strncmp("#CHROM ",p,7) ) {
1459
1.01k
            char *eol = strchr(p, '\n');
1460
1.01k
            if (*p != '\0') {
1461
664
                char buffer[320];
1462
664
                hts_log_warning("Could not parse header line: %s",
1463
664
                                hts_strprint(buffer, sizeof(buffer),
1464
664
                                               '"', p,
1465
664
                                               eol ? (eol - p) : SIZE_MAX));
1466
664
            }
1467
1.01k
            if (eol) {
1468
656
                p = eol + 1; // Try from the next line.
1469
656
            } else {
1470
362
                done = -1; // No more lines left, give up.
1471
362
            }
1472
2.57k
        } else {
1473
2.57k
            done = 1; // Sample line found
1474
2.57k
        }
1475
22.6k
    } while (!done);
1476
1477
2.93k
    if (done < 0) {
1478
        // No sample line is fatal.
1479
362
        hts_log_error("Could not parse the header, sample line not found");
1480
362
        return -1;
1481
362
    }
1482
1483
2.57k
    if (bcf_hdr_parse_sample_line(hdr,p) < 0)
1484
26
        return -1;
1485
2.54k
    if (bcf_hdr_sync(hdr) < 0)
1486
0
        return -1;
1487
2.54k
    bcf_hdr_check_sanity(hdr);
1488
2.54k
    return 0;
1489
2.54k
}
1490
1491
int bcf_hdr_append(bcf_hdr_t *hdr, const char *line)
1492
0
{
1493
0
    int len;
1494
0
    bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr, (char*) line, &len);
1495
0
    if ( !hrec ) return -1;
1496
0
    if (bcf_hdr_add_hrec(hdr, hrec) < 0)
1497
0
        return -1;
1498
0
    return 0;
1499
0
}
1500
1501
void bcf_hdr_remove(bcf_hdr_t *hdr, int type, const char *key)
1502
0
{
1503
0
    int i = 0;
1504
0
    bcf_hrec_t *hrec;
1505
0
    if ( !key )
1506
0
    {
1507
        // no key, remove all entries of this type
1508
0
        while ( i<hdr->nhrec )
1509
0
        {
1510
0
            if ( hdr->hrec[i]->type!=type ) { i++; continue; }
1511
0
            hrec = hdr->hrec[i];
1512
0
            bcf_hdr_unregister_hrec(hdr, hrec);
1513
0
            bcf_hdr_remove_from_hdict(hdr, hrec);
1514
0
            hdr->dirty = 1;
1515
0
            hdr->nhrec--;
1516
0
            if ( i < hdr->nhrec )
1517
0
                memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
1518
0
            bcf_hrec_destroy(hrec);
1519
0
        }
1520
0
        return;
1521
0
    }
1522
0
    while (1)
1523
0
    {
1524
0
        if ( type==BCF_HL_FLT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG )
1525
0
        {
1526
0
            hrec = bcf_hdr_get_hrec(hdr, type, "ID", key, NULL);
1527
0
            if ( !hrec ) return;
1528
1529
0
            for (i=0; i<hdr->nhrec; i++)
1530
0
                if ( hdr->hrec[i]==hrec ) break;
1531
0
            assert( i<hdr->nhrec );
1532
1533
0
            vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
1534
0
            khint_t k = kh_get(vdict, d, key);
1535
0
            kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL;
1536
0
        }
1537
0
        else
1538
0
        {
1539
0
            for (i=0; i<hdr->nhrec; i++)
1540
0
            {
1541
0
                if ( hdr->hrec[i]->type!=type ) continue;
1542
0
                if ( type==BCF_HL_GEN )
1543
0
                {
1544
0
                    if ( !strcmp(hdr->hrec[i]->key,key) ) break;
1545
0
                }
1546
0
                else
1547
0
                {
1548
                    // not all structured lines have ID, we could be more sophisticated as in bcf_hdr_get_hrec()
1549
0
                    int j = bcf_hrec_find_key(hdr->hrec[i], "ID");
1550
0
                    if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],key) ) break;
1551
0
                }
1552
0
            }
1553
0
            if ( i==hdr->nhrec ) return;
1554
0
            hrec = hdr->hrec[i];
1555
0
            bcf_hdr_remove_from_hdict(hdr, hrec);
1556
0
        }
1557
1558
0
        hdr->nhrec--;
1559
0
        if ( i < hdr->nhrec )
1560
0
            memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
1561
0
        bcf_hrec_destroy(hrec);
1562
0
        hdr->dirty = 1;
1563
0
    }
1564
0
}
1565
1566
int bcf_hdr_printf(bcf_hdr_t *hdr, const char *fmt, ...)
1567
0
{
1568
0
    char tmp[256], *line = tmp;
1569
0
    va_list ap;
1570
0
    va_start(ap, fmt);
1571
0
    int n = vsnprintf(line, sizeof(tmp), fmt, ap);
1572
0
    va_end(ap);
1573
1574
0
    if (n >= sizeof(tmp)) {
1575
0
        n++; // For trailing NUL
1576
0
        line = (char*)malloc(n);
1577
0
        if (!line)
1578
0
            return -1;
1579
1580
0
        va_start(ap, fmt);
1581
0
        vsnprintf(line, n, fmt, ap);
1582
0
        va_end(ap);
1583
0
    }
1584
1585
0
    int ret = bcf_hdr_append(hdr, line);
1586
1587
0
    if (line != tmp) free(line);
1588
0
    return ret;
1589
0
}
1590
1591
1592
/**********************
1593
 *** BCF header I/O ***
1594
 **********************/
1595
1596
const char *bcf_hdr_get_version(const bcf_hdr_t *hdr)
1597
730
{
1598
730
    bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
1599
730
    if ( !hrec )
1600
418
    {
1601
418
        hts_log_warning("No version string found, assuming VCFv4.2");
1602
418
        return "VCFv4.2";
1603
418
    }
1604
312
    return hrec->value;
1605
730
}
1606
1607
int bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version)
1608
0
{
1609
0
    bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
1610
0
    if ( !hrec )
1611
0
    {
1612
0
        int len;
1613
0
        kstring_t str = {0,0,0};
1614
0
        if ( ksprintf(&str,"##fileformat=%s", version) < 0 ) return -1;
1615
0
        hrec = bcf_hdr_parse_line(hdr, str.s, &len);
1616
0
        free(str.s);
1617
1618
0
        get_hdr_aux(hdr)->version = bcf_get_version(NULL, hrec->value);
1619
0
    }
1620
0
    else
1621
0
    {
1622
0
        bcf_hrec_t *tmp = bcf_hrec_dup(hrec);
1623
0
        if ( !tmp ) return -1;
1624
0
        free(tmp->value);
1625
0
        tmp->value = strdup(version);
1626
0
        if ( !tmp->value ) return -1;
1627
0
        bcf_hdr_update_hrec(hdr, hrec, tmp);
1628
0
        bcf_hrec_destroy(tmp);
1629
0
    }
1630
0
    hdr->dirty = 1;
1631
    //TODO rlen may change, deal with it
1632
0
    return 0; // FIXME: check for errs in this function (return < 0 if so)
1633
0
}
1634
1635
bcf_hdr_t *bcf_hdr_init(const char *mode)
1636
2.97k
{
1637
2.97k
    int i;
1638
2.97k
    bcf_hdr_t *h;
1639
2.97k
    h = (bcf_hdr_t*)calloc(1, sizeof(bcf_hdr_t));
1640
2.97k
    if (!h) return NULL;
1641
11.9k
    for (i = 0; i < 3; ++i) {
1642
8.93k
        if ((h->dict[i] = kh_init(vdict)) == NULL) goto fail;
1643
        // Supersize the hash to make collisions very unlikely
1644
8.93k
        static int dsize[3] = {16384,16384,2048}; // info, contig, format
1645
8.93k
        if (kh_resize(vdict, h->dict[i], dsize[i]) < 0) goto fail;
1646
8.93k
    }
1647
1648
2.97k
    bcf_hdr_aux_t *aux = (bcf_hdr_aux_t*)calloc(1,sizeof(bcf_hdr_aux_t));
1649
2.97k
    if ( !aux ) goto fail;
1650
2.97k
    if ( (aux->gen = kh_init(hdict))==NULL ) { free(aux); goto fail; }
1651
2.97k
    aux->key_len = NULL;
1652
2.97k
    aux->dict = *((vdict_t*)h->dict[0]);
1653
2.97k
    aux->version = 0;
1654
2.97k
    aux->ref_count = 1;
1655
2.97k
    free(h->dict[0]);
1656
2.97k
    h->dict[0] = aux;
1657
1658
2.97k
    if ( strchr(mode,'w') )
1659
0
    {
1660
0
        bcf_hdr_append(h, "##fileformat=VCFv4.2");
1661
        // The filter PASS must appear first in the dictionary
1662
0
        bcf_hdr_append(h, "##FILTER=<ID=PASS,Description=\"All filters passed\">");
1663
0
        aux->version = VCF_DEF;
1664
0
    }
1665
2.97k
    return h;
1666
1667
0
 fail:
1668
0
    for (i = 0; i < 3; ++i)
1669
0
        kh_destroy(vdict, h->dict[i]);
1670
0
    free(h);
1671
0
    return NULL;
1672
2.97k
}
1673
1674
void bcf_hdr_destroy(bcf_hdr_t *h)
1675
4.36k
{
1676
4.36k
    int i;
1677
4.36k
    khint_t k;
1678
4.36k
    if (!h) return;
1679
4.36k
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
1680
4.36k
    if (aux->ref_count > 1) // Refs still held, so delay destruction
1681
1.38k
    {
1682
1.38k
        aux->ref_count &= ~1;
1683
1.38k
        return;
1684
1.38k
    }
1685
11.9k
    for (i = 0; i < 3; ++i) {
1686
8.93k
        vdict_t *d = (vdict_t*)h->dict[i];
1687
8.93k
        if (d == 0) continue;
1688
103M
        for (k = kh_begin(d); k != kh_end(d); ++k)
1689
103M
            if (kh_exist(d, k)) free((char*)kh_key(d, k));
1690
8.93k
        if ( i==0 )
1691
2.97k
        {
1692
28.2k
            for (k=kh_begin(aux->gen); k<kh_end(aux->gen); k++)
1693
25.3k
                if ( kh_exist(aux->gen,k) ) free((char*)kh_key(aux->gen,k));
1694
2.97k
            kh_destroy(hdict, aux->gen);
1695
2.97k
            free(aux->key_len); // may exist for dict[0] only
1696
2.97k
        }
1697
8.93k
        kh_destroy(vdict, d);
1698
8.93k
        free(h->id[i]);
1699
8.93k
    }
1700
122k
    for (i=0; i<h->nhrec; i++)
1701
119k
        bcf_hrec_destroy(h->hrec[i]);
1702
2.97k
    if (h->nhrec) free(h->hrec);
1703
2.97k
    if (h->samples) free(h->samples);
1704
2.97k
    free(h->keep_samples);
1705
2.97k
    free(h->transl[0]); free(h->transl[1]);
1706
2.97k
    free(h->mem.s);
1707
2.97k
    free(h);
1708
2.97k
}
1709
1710
bcf_hdr_t *bcf_hdr_read(htsFile *hfp)
1711
2.97k
{
1712
2.97k
    if (hfp->format.format == vcf)
1713
2.75k
        return vcf_hdr_read(hfp);
1714
228
    if (hfp->format.format != bcf) {
1715
0
        hts_log_error("Input is not detected as bcf or vcf format");
1716
0
        return NULL;
1717
0
    }
1718
1719
228
    assert(hfp->is_bgzf);
1720
1721
228
    BGZF *fp = hfp->fp.bgzf;
1722
228
    uint8_t magic[5];
1723
228
    bcf_hdr_t *h;
1724
228
    h = bcf_hdr_init("r");
1725
228
    if (!h) {
1726
0
        hts_log_error("Failed to allocate bcf header");
1727
0
        return NULL;
1728
0
    }
1729
228
    if (bgzf_read(fp, magic, 5) != 5)
1730
0
    {
1731
0
        hts_log_error("Failed to read the header (reading BCF in text mode?)");
1732
0
        bcf_hdr_destroy(h);
1733
0
        return NULL;
1734
0
    }
1735
228
    if (strncmp((char*)magic, "BCF\2\2", 5) != 0)
1736
4
    {
1737
4
        if (!strncmp((char*)magic, "BCF", 3))
1738
4
            hts_log_error("Invalid BCF2 magic string: only BCFv2.2 is supported");
1739
0
        else
1740
0
            hts_log_error("Invalid BCF2 magic string");
1741
4
        bcf_hdr_destroy(h);
1742
4
        return NULL;
1743
4
    }
1744
224
    uint8_t buf[4];
1745
224
    size_t hlen;
1746
224
    char *htxt = NULL;
1747
224
    if (bgzf_read(fp, buf, 4) != 4) goto fail;
1748
224
    hlen = buf[0] | (buf[1] << 8) | (buf[2] << 16) | ((size_t) buf[3] << 24);
1749
224
    if (hlen >= SIZE_MAX) { errno = ENOMEM; goto fail; }
1750
224
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
1751
224
    if (hlen > FUZZ_ALLOC_LIMIT/2) { errno = ENOMEM; goto fail; }
1752
224
#endif
1753
224
    htxt = (char*)malloc(hlen + 1);
1754
224
    if (!htxt) goto fail;
1755
224
    if (bgzf_read(fp, htxt, hlen) != hlen) goto fail;
1756
224
    htxt[hlen] = '\0'; // Ensure htxt is terminated
1757
224
    if ( bcf_hdr_parse(h, htxt) < 0 ) goto fail;
1758
220
    free(htxt);
1759
1760
220
    bcf_hdr_incr_ref(h);
1761
220
    bgzf_set_private_data(fp, h, hdr_bgzf_private_data_cleanup);
1762
1763
220
    return h;
1764
4
 fail:
1765
4
    hts_log_error("Failed to read BCF header");
1766
4
    free(htxt);
1767
4
    bcf_hdr_destroy(h);
1768
4
    return NULL;
1769
224
}
1770
1771
int bcf_hdr_write(htsFile *hfp, bcf_hdr_t *h)
1772
2.54k
{
1773
2.54k
    if (!h) {
1774
0
        errno = EINVAL;
1775
0
        return -1;
1776
0
    }
1777
2.54k
    if ( h->dirty ) {
1778
0
        if (bcf_hdr_sync(h) < 0) return -1;
1779
0
    }
1780
2.54k
    hfp->format.category = variant_data;
1781
2.54k
    if (hfp->format.format == vcf || hfp->format.format == text_format) {
1782
1.27k
        hfp->format.format = vcf;
1783
1.27k
        return vcf_hdr_write(hfp, h);
1784
1.27k
    }
1785
1786
1.27k
    if (hfp->format.format == binary_format)
1787
1.27k
        hfp->format.format = bcf;
1788
1789
1.27k
    kstring_t htxt = {0,0,0};
1790
1.27k
    if (bcf_hdr_format(h, 1, &htxt) < 0) {
1791
0
        free(htxt.s);
1792
0
        return -1;
1793
0
    }
1794
1.27k
    kputc('\0', &htxt); // include the \0 byte
1795
1796
1.27k
    BGZF *fp = hfp->fp.bgzf;
1797
1.27k
    if ( bgzf_write(fp, "BCF\2\2", 5) !=5 ) return -1;
1798
1.27k
    uint8_t hlen[4];
1799
1.27k
    u32_to_le(htxt.l, hlen);
1800
1.27k
    if ( bgzf_write(fp, hlen, 4) !=4 ) return -1;
1801
1.27k
    if ( bgzf_write(fp, htxt.s, htxt.l) != htxt.l ) return -1;
1802
1.27k
    if ( bgzf_flush(fp) < 0) return -1;
1803
1804
1.27k
    bcf_hdr_incr_ref(h);
1805
1.27k
    bgzf_set_private_data(fp, h, hdr_bgzf_private_data_cleanup);
1806
1807
1.27k
    free(htxt.s);
1808
1.27k
    return 0;
1809
1.27k
}
1810
1811
/********************
1812
 *** BCF site I/O ***
1813
 ********************/
1814
1815
bcf1_t *bcf_init(void)
1816
2.54k
{
1817
2.54k
    bcf1_t *v;
1818
2.54k
    v = (bcf1_t*)calloc(1, sizeof(bcf1_t));
1819
2.54k
    return v;
1820
2.54k
}
1821
1822
void bcf_clear(bcf1_t *v)
1823
58.3k
{
1824
58.3k
    int i;
1825
58.3k
    for (i=0; i<v->d.m_info; i++)
1826
0
    {
1827
0
        if ( v->d.info[i].vptr_free )
1828
0
        {
1829
0
            free(v->d.info[i].vptr - v->d.info[i].vptr_off);
1830
0
            v->d.info[i].vptr_free = 0;
1831
0
        }
1832
0
    }
1833
58.3k
    for (i=0; i<v->d.m_fmt; i++)
1834
0
    {
1835
0
        if ( v->d.fmt[i].p_free )
1836
0
        {
1837
0
            free(v->d.fmt[i].p - v->d.fmt[i].p_off);
1838
0
            v->d.fmt[i].p_free = 0;
1839
0
        }
1840
0
    }
1841
58.3k
    v->rid = v->pos = v->rlen = v->unpacked = 0;
1842
58.3k
    bcf_float_set_missing(v->qual);
1843
58.3k
    v->n_info = v->n_allele = v->n_fmt = v->n_sample = 0;
1844
58.3k
    v->shared.l = v->indiv.l = 0;
1845
58.3k
    v->d.var_type = -1;
1846
58.3k
    v->d.shared_dirty = 0;
1847
58.3k
    v->d.indiv_dirty  = 0;
1848
58.3k
    v->d.n_flt = 0;
1849
58.3k
    v->errcode = 0;
1850
58.3k
    if (v->d.m_als) v->d.als[0] = 0;
1851
58.3k
    if (v->d.m_id) v->d.id[0] = 0;
1852
58.3k
}
1853
1854
void bcf_empty(bcf1_t *v)
1855
2.54k
{
1856
2.54k
    bcf_clear1(v);
1857
2.54k
    free(v->d.id);
1858
2.54k
    free(v->d.als);
1859
2.54k
    free(v->d.allele); free(v->d.flt); free(v->d.info); free(v->d.fmt);
1860
2.54k
    if (v->d.var ) free(v->d.var);
1861
2.54k
    free(v->shared.s); free(v->indiv.s);
1862
2.54k
    memset(&v->d,0,sizeof(v->d));
1863
2.54k
    memset(&v->shared,0,sizeof(v->shared));
1864
2.54k
    memset(&v->indiv,0,sizeof(v->indiv));
1865
2.54k
}
1866
1867
void bcf_destroy(bcf1_t *v)
1868
2.54k
{
1869
2.54k
    if (!v) return;
1870
2.54k
    bcf_empty1(v);
1871
2.54k
    free(v);
1872
2.54k
}
1873
1874
static inline int bcf_read1_core(BGZF *fp, bcf1_t *v)
1875
220
{
1876
220
    uint8_t x[32];
1877
220
    ssize_t ret;
1878
220
    uint32_t shared_len, indiv_len;
1879
220
    if ((ret = bgzf_read(fp, x, 32)) != 32) {
1880
0
        if (ret == 0) return -1;
1881
0
        return -2;
1882
0
    }
1883
220
    bcf_clear1(v);
1884
220
    shared_len = le_to_u32(x);
1885
220
    if (shared_len < 24) return -2;
1886
220
    shared_len -= 24; // to exclude six 32-bit integers
1887
220
    indiv_len = le_to_u32(x + 4);
1888
220
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
1889
    // ks_resize() normally allocates 1.5 * requested size to allow for growth
1890
220
    if ((uint64_t) shared_len + indiv_len > FUZZ_ALLOC_LIMIT / 3 * 2) return -2;
1891
214
#endif
1892
214
    if (ks_resize(&v->shared, shared_len ? shared_len : 1) != 0) return -2;
1893
214
    if (ks_resize(&v->indiv, indiv_len ? indiv_len : 1) != 0) return -2;
1894
214
    v->rid  = le_to_i32(x + 8);
1895
214
    v->pos  = le_to_u32(x + 12);
1896
214
    if ( v->pos==UINT32_MAX ) v->pos = -1;  // this is for telomere coordinate, e.g. MT:0
1897
214
    v->rlen = le_to_i32(x + 16);
1898
214
    v->qual = le_to_float(x + 20);
1899
214
    v->n_info = le_to_u16(x + 24);
1900
214
    v->n_allele = le_to_u16(x + 26);
1901
214
    v->n_sample = le_to_u32(x + 28) & 0xffffff;
1902
214
    v->n_fmt = x[31];
1903
214
    v->shared.l = shared_len;
1904
214
    v->indiv.l = indiv_len;
1905
    // silent fix of broken BCFs produced by earlier versions of bcf_subset, prior to and including bd6ed8b4
1906
214
    if ( (!v->indiv.l || !v->n_sample) && v->n_fmt ) v->n_fmt = 0;
1907
1908
214
    if (bgzf_read(fp, v->shared.s, v->shared.l) != v->shared.l) return -2;
1909
206
    if (bgzf_read(fp, v->indiv.s, v->indiv.l) != v->indiv.l) return -2;
1910
198
    return 0;
1911
206
}
1912
1913
0
#define bit_array_size(n) ((n)/8+1)
1914
0
#define bit_array_set(a,i)   ((a)[(i)/8] |=   1 << ((i)%8))
1915
0
#define bit_array_clear(a,i) ((a)[(i)/8] &= ~(1 << ((i)%8)))
1916
0
#define bit_array_test(a,i)  ((a)[(i)/8] &   (1 << ((i)%8)))
1917
1918
static int bcf_dec_typed_int1_safe(uint8_t *p, uint8_t *end, uint8_t **q,
1919
6.89k
                                   int32_t *val) {
1920
6.89k
    uint32_t t;
1921
6.89k
    if (end - p < 2) return -1;
1922
6.89k
    t = *p++ & 0xf;
1923
    /* Use if .. else if ... else instead of switch to force order.  Assumption
1924
       is that small integers are more frequent than big ones. */
1925
6.89k
    if (t == BCF_BT_INT8) {
1926
3.61k
        *val = *(int8_t *) p++;
1927
3.61k
    } else {
1928
3.28k
        if (end - p < (1<<bcf_type_shift[t])) return -1;
1929
3.27k
        if (t == BCF_BT_INT16) {
1930
2.15k
            *val = le_to_i16(p);
1931
2.15k
            p += 2;
1932
2.15k
        } else if (t == BCF_BT_INT32) {
1933
984
            *val = le_to_i32(p);
1934
984
            p += 4;
1935
#ifdef VCF_ALLOW_INT64
1936
        } else if (t == BCF_BT_INT64) {
1937
            // This case should never happen because there should be no
1938
            // 64-bit BCFs at all, definitely not coming from htslib
1939
            *val = le_to_i64(p);
1940
            p += 8;
1941
#endif
1942
984
        } else {
1943
136
            return -1;
1944
136
        }
1945
3.27k
    }
1946
6.75k
    *q = p;
1947
6.75k
    return 0;
1948
6.89k
}
1949
1950
static int bcf_dec_size_safe(uint8_t *p, uint8_t *end, uint8_t **q,
1951
28.9k
                             int *num, int *type) {
1952
28.9k
    int r;
1953
28.9k
    if (p >= end) return -1;
1954
28.9k
    *type = *p & 0xf;
1955
28.9k
    if (*p>>4 != 15) {
1956
28.4k
        *q = p + 1;
1957
28.4k
        *num = *p >> 4;
1958
28.4k
        return 0;
1959
28.4k
    }
1960
452
    r = bcf_dec_typed_int1_safe(p + 1, end, q, num);
1961
452
    if (r) return r;
1962
428
    return *num >= 0 ? 0 : -1;
1963
452
}
1964
1965
346
static const char *get_type_name(int type) {
1966
346
    const char *types[9] = {
1967
346
        "null", "int (8-bit)", "int (16 bit)", "int (32 bit)",
1968
346
        "unknown", "float", "unknown", "char", "unknown"
1969
346
    };
1970
346
    int t = (type >= 0 && type < 8) ? type : 8;
1971
346
    return types[t];
1972
346
}
1973
1974
/**
1975
 *  updatephasing - updates 1st phasing based on other phasing status
1976
 *  @param p - pointer to phase value array
1977
 *  @param end - end of array
1978
 *  @param q - pointer to consumed data
1979
 *  @param samples - no. of samples in array
1980
 *  @param ploidy - no. of phasing values per sample
1981
 *  @param type - value type (one of BCF_BT_...)
1982
 *  Returns 0 on success and 1 on failure
1983
 *  Update for haploids made only if it is not unknown (.)
1984
 */
1985
static int updatephasing(uint8_t *p, uint8_t *end, uint8_t **q, int samples, int ploidy, int type)
1986
0
{
1987
0
    int j, k;
1988
0
    unsigned int inc = 1 << bcf_type_shift[type];
1989
0
    ptrdiff_t bytes = samples * ploidy * inc;
1990
1991
0
    if (samples < 0 || ploidy < 0 || end - p < bytes)
1992
0
        return 1;
1993
1994
    /*
1995
     * This works because phasing is stored in the least-significant bit
1996
     * of the GT encoding, and the data is always stored little-endian.
1997
     * Thus it's possible to get the desired result by doing bit operations
1998
     * on the least-significant byte of each value and ignoring the
1999
     * higher bytes (for 16-bit and 32-bit values).
2000
     */
2001
2002
0
    switch (ploidy) {
2003
0
    case 1:
2004
        // Trivial case - haploid data is phased by default
2005
0
        for (j = 0; j < samples; ++j) {
2006
0
            if (*p) *p |= 1;    //only if not unknown (.)
2007
0
            p += inc;
2008
0
        }
2009
0
        break;
2010
0
    case 2:
2011
        // Mostly trivial case - first is phased if second is.
2012
0
        for (j = 0; j < samples; ++j) {
2013
0
            *p |= (p[inc] & 1);
2014
0
            p += 2 * inc;
2015
0
        }
2016
0
        break;
2017
0
    default:
2018
        // Generic case - first is phased if all other alleles are.
2019
0
        for (j = 0; j < samples; ++j) {
2020
0
            uint8_t allphased = 1;
2021
0
            for (k = 1; k < ploidy; ++k)
2022
0
                allphased &= (p[inc * k]);
2023
0
            *p |= allphased;
2024
0
            p += ploidy * inc;
2025
0
        }
2026
0
    }
2027
0
    *q = p;
2028
0
    return 0;
2029
0
}
2030
2031
static void bcf_record_check_err(const bcf_hdr_t *hdr, bcf1_t *rec,
2032
3.31k
                                 char *type, uint32_t *reports, int i) {
2033
3.31k
    if (*reports == 0 || hts_verbose >= HTS_LOG_DEBUG)
2034
64
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos
2035
3.31k
                        ": Invalid FORMAT %s %d",
2036
3.31k
                        bcf_seqname_safe(hdr,rec), rec->pos+1, type, i);
2037
3.31k
    (*reports)++;
2038
3.31k
}
2039
2040
198
static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) {
2041
198
    uint8_t *ptr, *end;
2042
198
    size_t bytes;
2043
198
    uint32_t err = 0;
2044
198
    int type = 0;
2045
198
    int num  = 0;
2046
198
    uint32_t i, reports;
2047
198
    const uint32_t is_integer = ((1 << BCF_BT_INT8)  |
2048
198
                                 (1 << BCF_BT_INT16) |
2049
#ifdef VCF_ALLOW_INT64
2050
                                 (1 << BCF_BT_INT64) |
2051
#endif
2052
198
                                 (1 << BCF_BT_INT32));
2053
198
    const uint32_t is_valid_type = (is_integer          |
2054
198
                                    (1 << BCF_BT_NULL)  |
2055
198
                                    (1 << BCF_BT_FLOAT) |
2056
198
                                    (1 << BCF_BT_CHAR));
2057
198
    int32_t max_id = hdr ? hdr->n[BCF_DT_ID] : 0;
2058
    /* set phasing for 1st allele as in v44 for versions upto v43, to have
2059
    consistent binary values irrespective of version; not run for v >= v44,
2060
    to retain explicit phasing in v44 and higher */
2061
198
    int idgt = hdr ?
2062
198
                    bcf_get_version(hdr, NULL) < VCF44 ?
2063
198
                        bcf_hdr_id2int(hdr, BCF_DT_ID, "GT") : -1 :
2064
198
                    -1;
2065
2066
    // Check for valid contig ID
2067
198
    if (rec->rid < 0
2068
164
        || (hdr && (rec->rid >= hdr->n[BCF_DT_CTG]
2069
198
                    || hdr->id[BCF_DT_CTG][rec->rid].key == NULL))) {
2070
198
        hts_log_warning("Bad BCF record at %"PRIhts_pos": Invalid %s id %d", rec->pos+1, "CONTIG", rec->rid);
2071
198
        err |= BCF_ERR_CTG_INVALID;
2072
198
    }
2073
2074
    // Check ID
2075
198
    ptr = (uint8_t *) rec->shared.s;
2076
198
    end = ptr + rec->shared.l;
2077
198
    if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
2078
198
    if (type != BCF_BT_CHAR) {
2079
196
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "ID", type, get_type_name(type));
2080
196
        err |= BCF_ERR_TAG_INVALID;
2081
196
    }
2082
198
    bytes = (size_t) num << bcf_type_shift[type];
2083
198
    if (end - ptr < bytes) goto bad_shared;
2084
196
    ptr += bytes;
2085
2086
    // Check REF and ALT
2087
196
    if (rec->n_allele < 1) {
2088
74
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": No REF allele",
2089
74
                        bcf_seqname_safe(hdr,rec), rec->pos+1);
2090
74
        err |= BCF_ERR_TAG_UNDEF;
2091
74
    }
2092
2093
196
    reports = 0;
2094
22.3k
    for (i = 0; i < rec->n_allele; i++) {
2095
22.2k
        if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
2096
22.1k
        if (type != BCF_BT_CHAR) {
2097
21.7k
            if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
2098
122
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "REF/ALT", type, get_type_name(type));
2099
21.7k
            err |= BCF_ERR_CHAR;
2100
21.7k
        }
2101
22.1k
        bytes = (size_t) num << bcf_type_shift[type];
2102
22.1k
        if (end - ptr < bytes) goto bad_shared;
2103
22.1k
        ptr += bytes;
2104
22.1k
    }
2105
2106
    // Check FILTER
2107
174
    reports = 0;
2108
174
    if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
2109
174
    if (num > 0) {
2110
80
        bytes = (size_t) num << bcf_type_shift[type];
2111
80
        if (((1 << type) & is_integer) == 0) {
2112
20
            hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "FILTER", type, get_type_name(type));
2113
20
            err |= BCF_ERR_TAG_INVALID;
2114
20
            if (end - ptr < bytes) goto bad_shared;
2115
20
            ptr += bytes;
2116
60
        } else {
2117
60
            if (end - ptr < bytes) goto bad_shared;
2118
4.66k
            for (i = 0; i < num; i++) {
2119
4.60k
                int32_t key = bcf_dec_int1(ptr, type, &ptr);
2120
4.60k
                if (key < 0
2121
3.45k
                    || (hdr && (key >= max_id
2122
4.05k
                                || hdr->id[BCF_DT_ID][key].key == NULL))) {
2123
4.05k
                    if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
2124
60
                        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s id %d", bcf_seqname_safe(hdr,rec), rec->pos+1, "FILTER", key);
2125
4.05k
                    err |= BCF_ERR_TAG_UNDEF;
2126
4.05k
                }
2127
4.60k
            }
2128
60
        }
2129
80
    }
2130
2131
    // Check INFO
2132
174
    reports = 0;
2133
174
    bcf_idpair_t *id_tmp = hdr ? hdr->id[BCF_DT_ID] : NULL;
2134
3.28k
    for (i = 0; i < rec->n_info; i++) {
2135
3.21k
        int32_t key = -1;
2136
3.21k
        if (bcf_dec_typed_int1_safe(ptr, end, &ptr, &key) != 0) goto bad_shared;
2137
3.13k
        if (key < 0 || (hdr && (key >= max_id
2138
2.58k
                                || id_tmp[key].key == NULL))) {
2139
2.58k
            if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
2140
70
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s id %d", bcf_seqname_safe(hdr,rec), rec->pos+1, "INFO", key);
2141
2.58k
            err |= BCF_ERR_TAG_UNDEF;
2142
2.58k
        }
2143
3.13k
        if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
2144
3.11k
        if (((1 << type) & is_valid_type) == 0
2145
3.02k
            || (type == BCF_BT_NULL && num > 0)) {
2146
102
            if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
2147
8
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "INFO", type, get_type_name(type));
2148
102
            err |= BCF_ERR_TAG_INVALID;
2149
102
        }
2150
3.11k
        bytes = (size_t) num << bcf_type_shift[type];
2151
3.11k
        if (end - ptr < bytes) goto bad_shared;
2152
3.11k
        ptr += bytes;
2153
3.11k
    }
2154
2155
    // Check FORMAT and individual information
2156
66
    ptr = (uint8_t *) rec->indiv.s;
2157
66
    end = ptr + rec->indiv.l;
2158
66
    reports = 0;
2159
3.23k
    for (i = 0; i < rec->n_fmt; i++) {
2160
3.22k
        int32_t key = -1;
2161
3.22k
        if (bcf_dec_typed_int1_safe(ptr, end, &ptr, &key) != 0) goto bad_indiv;
2162
3.19k
        if (key < 0
2163
3.03k
            || (hdr && (key >= max_id
2164
3.12k
                        || id_tmp[key].key == NULL))) {
2165
3.12k
            bcf_record_check_err(hdr, rec, "id", &reports, key);
2166
3.12k
            err |= BCF_ERR_TAG_UNDEF;
2167
3.12k
        }
2168
3.19k
        if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_indiv;
2169
3.18k
        if (((1 << type) & is_valid_type) == 0
2170
3.06k
            || (type == BCF_BT_NULL && num > 0)) {
2171
190
            bcf_record_check_err(hdr, rec, "type", &reports, type);
2172
190
            err |= BCF_ERR_TAG_INVALID;
2173
190
        }
2174
3.18k
        if (idgt >= 0 && idgt == key) {
2175
            // check first GT phasing bit and fix up if necessary
2176
0
            if (updatephasing(ptr, end, &ptr, rec->n_sample, num, type)) {
2177
0
                err |= BCF_ERR_TAG_INVALID;
2178
0
            }
2179
3.18k
        } else {
2180
3.18k
            bytes = ((size_t) num << bcf_type_shift[type]) * rec->n_sample;
2181
3.18k
            if (end - ptr < bytes) goto bad_indiv;
2182
3.16k
            ptr += bytes;
2183
3.16k
        }
2184
3.18k
    }
2185
2186
2
    if (!err && rec->rlen < 0) {
2187
        // Treat bad rlen as a warning instead of an error, and try to
2188
        // fix up by using the length of the stored REF allele.
2189
0
        static int warned = 0;
2190
0
        if (!warned) {
2191
0
            hts_log_warning("BCF record at %s:%"PRIhts_pos" has invalid RLEN (%"PRIhts_pos"). "
2192
0
                            "Only one invalid RLEN will be reported.",
2193
0
                            bcf_seqname_safe(hdr,rec), rec->pos+1, rec->rlen);
2194
0
            warned = 1;
2195
0
        }
2196
        //find rlen considering reflen, END, SVLEN, fmt LEN
2197
0
        hts_pos_t len = get_rlen(hdr, rec);
2198
0
        rec->rlen = len >= 0 ? len : 0;
2199
0
    }
2200
2201
2
    rec->errcode |= err;
2202
2203
2
    return err ? -2 : 0; // Return -2 so bcf_read() reports an error
2204
2205
132
 bad_shared:
2206
132
    hts_log_error("Bad BCF record at %s:%"PRIhts_pos" - shared section malformed or too short", bcf_seqname_safe(hdr,rec), rec->pos+1);
2207
132
    return -2;
2208
2209
64
 bad_indiv:
2210
64
    hts_log_error("Bad BCF record at %s:%"PRIhts_pos" - individuals section malformed or too short", bcf_seqname_safe(hdr,rec), rec->pos+1);
2211
64
    return -2;
2212
66
}
2213
2214
static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt);
2215
int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec)
2216
0
{
2217
0
    if ( !hdr->keep_samples ) return 0;
2218
0
    if ( !bcf_hdr_nsamples(hdr) )
2219
0
    {
2220
0
        rec->indiv.l = rec->n_sample = 0;
2221
0
        return 0;
2222
0
    }
2223
2224
0
    int i, j;
2225
0
    uint8_t *ptr = (uint8_t*)rec->indiv.s, *dst = NULL, *src;
2226
0
    bcf_dec_t *dec = &rec->d;
2227
0
    hts_expand(bcf_fmt_t, rec->n_fmt, dec->m_fmt, dec->fmt);
2228
0
    for (i=0; i<dec->m_fmt; ++i) dec->fmt[i].p_free = 0;
2229
2230
0
    for (i=0; i<rec->n_fmt; i++)
2231
0
    {
2232
0
        ptr = bcf_unpack_fmt_core1(ptr, rec->n_sample, &dec->fmt[i]);
2233
0
        src = dec->fmt[i].p - dec->fmt[i].size;
2234
0
        if ( dst )
2235
0
        {
2236
0
            memmove(dec->fmt[i-1].p + dec->fmt[i-1].p_len, dec->fmt[i].p - dec->fmt[i].p_off, dec->fmt[i].p_off);
2237
0
            dec->fmt[i].p = dec->fmt[i-1].p + dec->fmt[i-1].p_len + dec->fmt[i].p_off;
2238
0
        }
2239
0
        dst = dec->fmt[i].p;
2240
0
        for (j=0; j<hdr->nsamples_ori; j++)
2241
0
        {
2242
0
            src += dec->fmt[i].size;
2243
0
            if ( !bit_array_test(hdr->keep_samples,j) ) continue;
2244
0
            memmove(dst, src, dec->fmt[i].size);
2245
0
            dst += dec->fmt[i].size;
2246
0
        }
2247
0
        rec->indiv.l -= dec->fmt[i].p_len - (dst - dec->fmt[i].p);
2248
0
        dec->fmt[i].p_len = dst - dec->fmt[i].p;
2249
0
    }
2250
0
    rec->unpacked |= BCF_UN_FMT;
2251
2252
0
    rec->n_sample = bcf_hdr_nsamples(hdr);
2253
0
    return 0;
2254
0
}
2255
2256
int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
2257
56.2k
{
2258
56.2k
    if (fp->format.format == vcf) return vcf_read(fp, h, v);
2259
220
    if (!h)
2260
0
        h = (const bcf_hdr_t *) bgzf_get_private_data(fp->fp.bgzf);
2261
220
    int ret = bcf_read1_core(fp->fp.bgzf, v);
2262
220
    if (ret == 0) ret = bcf_record_check(h, v);
2263
220
    if ( ret!=0 || !h->keep_samples ) return ret;
2264
0
    return bcf_subset_format(h,v);
2265
220
}
2266
2267
int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, hts_pos_t *beg, hts_pos_t *end)
2268
0
{
2269
0
    bcf1_t *v = (bcf1_t *) vv;
2270
0
    const bcf_hdr_t *hdr = (const bcf_hdr_t *) bgzf_get_private_data(fp);
2271
0
    int ret = bcf_read1_core(fp, v);
2272
0
    if (ret == 0) ret = bcf_record_check(hdr, v);
2273
0
    if (ret  >= 0)
2274
0
        *tid = v->rid, *beg = v->pos, *end = v->pos + v->rlen;
2275
0
    return ret;
2276
0
}
2277
2278
static inline int bcf1_sync_id(bcf1_t *line, kstring_t *str)
2279
0
{
2280
    // single typed string
2281
0
    if ( line->d.id && strcmp(line->d.id, ".") ) {
2282
0
        return bcf_enc_vchar(str, strlen(line->d.id), line->d.id);
2283
0
    } else {
2284
0
        return bcf_enc_size(str, 0, BCF_BT_CHAR);
2285
0
    }
2286
0
}
2287
static inline int bcf1_sync_alleles(bcf1_t *line, kstring_t *str)
2288
0
{
2289
    // list of typed strings
2290
0
    int i;
2291
0
    for (i=0; i<line->n_allele; i++) {
2292
0
        if (bcf_enc_vchar(str, strlen(line->d.allele[i]), line->d.allele[i]) < 0)
2293
0
            return -1;
2294
0
    }
2295
0
    if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
2296
0
    return 0;
2297
0
}
2298
static inline int bcf1_sync_filter(bcf1_t *line, kstring_t *str)
2299
0
{
2300
    // typed vector of integers
2301
0
    if ( line->d.n_flt ) {
2302
0
        return bcf_enc_vint(str, line->d.n_flt, line->d.flt, -1);
2303
0
    } else {
2304
0
        return bcf_enc_vint(str, 0, 0, -1);
2305
0
    }
2306
0
}
2307
2308
static inline int bcf1_sync_info(bcf1_t *line, kstring_t *str)
2309
0
{
2310
    // pairs of typed vectors
2311
0
    int i, irm = -1, e = 0;
2312
0
    for (i=0; i<line->n_info; i++)
2313
0
    {
2314
0
        bcf_info_t *info = &line->d.info[i];
2315
0
        if ( !info->vptr )
2316
0
        {
2317
            // marked for removal
2318
0
            if ( irm < 0 ) irm = i;
2319
0
            continue;
2320
0
        }
2321
0
        e |= kputsn_(info->vptr - info->vptr_off, info->vptr_len + info->vptr_off, str) < 0;
2322
0
        if ( irm >=0 )
2323
0
        {
2324
0
            bcf_info_t tmp = line->d.info[irm]; line->d.info[irm] = line->d.info[i]; line->d.info[i] = tmp;
2325
0
            while ( irm<=i && line->d.info[irm].vptr ) irm++;
2326
0
        }
2327
0
    }
2328
0
    if ( irm>=0 ) line->n_info = irm;
2329
0
    return e == 0 ? 0 : -1;
2330
0
}
2331
2332
static int bcf1_sync(bcf1_t *line)
2333
0
{
2334
0
    char *shared_ori = line->shared.s;
2335
0
    size_t prev_len;
2336
2337
0
    kstring_t tmp = {0,0,0};
2338
0
    if ( !line->shared.l )
2339
0
    {
2340
        // New line created via API, BCF data blocks do not exist. Get it ready for BCF output
2341
0
        tmp = line->shared;
2342
0
        bcf1_sync_id(line, &tmp);
2343
0
        line->unpack_size[0] = tmp.l; prev_len = tmp.l;
2344
2345
0
        bcf1_sync_alleles(line, &tmp);
2346
0
        line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
2347
2348
0
        bcf1_sync_filter(line, &tmp);
2349
0
        line->unpack_size[2] = tmp.l - prev_len;
2350
2351
0
        bcf1_sync_info(line, &tmp);
2352
0
        line->shared = tmp;
2353
0
    }
2354
0
    else if ( line->d.shared_dirty )
2355
0
    {
2356
        // The line was edited, update the BCF data block.
2357
2358
0
        if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line,BCF_UN_STR);
2359
2360
        // ptr_ori points to the original unchanged BCF data.
2361
0
        uint8_t *ptr_ori = (uint8_t *) line->shared.s;
2362
2363
        // ID: single typed string
2364
0
        if ( line->d.shared_dirty & BCF1_DIRTY_ID )
2365
0
            bcf1_sync_id(line, &tmp);
2366
0
        else
2367
0
            kputsn_(ptr_ori, line->unpack_size[0], &tmp);
2368
0
        ptr_ori += line->unpack_size[0];
2369
0
        line->unpack_size[0] = tmp.l; prev_len = tmp.l;
2370
2371
        // REF+ALT: list of typed strings
2372
0
        if ( line->d.shared_dirty & BCF1_DIRTY_ALS )
2373
0
            bcf1_sync_alleles(line, &tmp);
2374
0
        else
2375
0
        {
2376
0
            kputsn_(ptr_ori, line->unpack_size[1], &tmp);
2377
0
            if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
2378
0
        }
2379
0
        ptr_ori += line->unpack_size[1];
2380
0
        line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
2381
2382
0
        if ( line->unpacked & BCF_UN_FLT )
2383
0
        {
2384
            // FILTER: typed vector of integers
2385
0
            if ( line->d.shared_dirty & BCF1_DIRTY_FLT )
2386
0
                bcf1_sync_filter(line, &tmp);
2387
0
            else if ( line->d.n_flt )
2388
0
                kputsn_(ptr_ori, line->unpack_size[2], &tmp);
2389
0
            else
2390
0
                bcf_enc_vint(&tmp, 0, 0, -1);
2391
0
            ptr_ori += line->unpack_size[2];
2392
0
            line->unpack_size[2] = tmp.l - prev_len;
2393
2394
0
            if ( line->unpacked & BCF_UN_INFO )
2395
0
            {
2396
                // INFO: pairs of typed vectors
2397
0
                if ( line->d.shared_dirty & BCF1_DIRTY_INF )
2398
0
                {
2399
0
                    bcf1_sync_info(line, &tmp);
2400
0
                    ptr_ori = (uint8_t*)line->shared.s + line->shared.l;
2401
0
                }
2402
0
            }
2403
0
        }
2404
2405
0
        int size = line->shared.l - (size_t)ptr_ori + (size_t)line->shared.s;
2406
0
        if ( size ) kputsn_(ptr_ori, size, &tmp);
2407
2408
0
        free(line->shared.s);
2409
0
        line->shared = tmp;
2410
0
    }
2411
0
    if ( line->shared.s != shared_ori && line->unpacked & BCF_UN_INFO )
2412
0
    {
2413
        // Reallocated line->shared.s block invalidated line->d.info[].vptr pointers
2414
0
        size_t off_new = line->unpack_size[0] + line->unpack_size[1] + line->unpack_size[2];
2415
0
        int i;
2416
0
        for (i=0; i<line->n_info; i++)
2417
0
        {
2418
0
            uint8_t *vptr_free = line->d.info[i].vptr_free ? line->d.info[i].vptr - line->d.info[i].vptr_off : NULL;
2419
0
            line->d.info[i].vptr = (uint8_t*) line->shared.s + off_new + line->d.info[i].vptr_off;
2420
0
            off_new += line->d.info[i].vptr_len + line->d.info[i].vptr_off;
2421
0
            if ( vptr_free )
2422
0
            {
2423
0
                free(vptr_free);
2424
0
                line->d.info[i].vptr_free = 0;
2425
0
            }
2426
0
        }
2427
0
    }
2428
2429
0
    if ( line->n_sample && line->n_fmt && (!line->indiv.l || line->d.indiv_dirty) )
2430
0
    {
2431
        // The genotype fields changed or are not present
2432
0
        tmp.l = tmp.m = 0; tmp.s = NULL;
2433
0
        int i, irm = -1;
2434
0
        for (i=0; i<line->n_fmt; i++)
2435
0
        {
2436
0
            bcf_fmt_t *fmt = &line->d.fmt[i];
2437
0
            if ( !fmt->p )
2438
0
            {
2439
                // marked for removal
2440
0
                if ( irm < 0 ) irm = i;
2441
0
                continue;
2442
0
            }
2443
0
            kputsn_(fmt->p - fmt->p_off, fmt->p_len + fmt->p_off, &tmp);
2444
0
            if ( irm >=0 )
2445
0
            {
2446
0
                bcf_fmt_t tfmt = line->d.fmt[irm]; line->d.fmt[irm] = line->d.fmt[i]; line->d.fmt[i] = tfmt;
2447
0
                while ( irm<=i && line->d.fmt[irm].p ) irm++;
2448
0
            }
2449
2450
0
        }
2451
0
        if ( irm>=0 ) line->n_fmt = irm;
2452
0
        free(line->indiv.s);
2453
0
        line->indiv = tmp;
2454
2455
        // Reallocated line->indiv.s block invalidated line->d.fmt[].p pointers
2456
0
        size_t off_new = 0;
2457
0
        for (i=0; i<line->n_fmt; i++)
2458
0
        {
2459
0
            uint8_t *p_free = line->d.fmt[i].p_free ? line->d.fmt[i].p - line->d.fmt[i].p_off : NULL;
2460
0
            line->d.fmt[i].p = (uint8_t*) line->indiv.s + off_new + line->d.fmt[i].p_off;
2461
0
            off_new += line->d.fmt[i].p_len + line->d.fmt[i].p_off;
2462
0
            if ( p_free )
2463
0
            {
2464
0
                free(p_free);
2465
0
                line->d.fmt[i].p_free = 0;
2466
0
            }
2467
0
        }
2468
0
    }
2469
0
    if ( !line->n_sample ) line->n_fmt = 0;
2470
0
    line->d.shared_dirty = line->d.indiv_dirty = 0;
2471
0
    return 0;
2472
0
}
2473
2474
bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src)
2475
0
{
2476
0
    bcf1_sync(src);
2477
2478
0
    bcf_clear(dst);
2479
0
    dst->rid  = src->rid;
2480
0
    dst->pos  = src->pos;
2481
0
    dst->rlen = src->rlen;
2482
0
    dst->qual = src->qual;
2483
0
    dst->n_info = src->n_info; dst->n_allele = src->n_allele;
2484
0
    dst->n_fmt = src->n_fmt; dst->n_sample = src->n_sample;
2485
2486
0
    if ( dst->shared.m < src->shared.l )
2487
0
    {
2488
0
        dst->shared.s = (char*) realloc(dst->shared.s, src->shared.l);
2489
0
        dst->shared.m = src->shared.l;
2490
0
    }
2491
0
    dst->shared.l = src->shared.l;
2492
0
    memcpy(dst->shared.s,src->shared.s,dst->shared.l);
2493
2494
0
    if ( dst->indiv.m < src->indiv.l )
2495
0
    {
2496
0
        dst->indiv.s = (char*) realloc(dst->indiv.s, src->indiv.l);
2497
0
        dst->indiv.m = src->indiv.l;
2498
0
    }
2499
0
    dst->indiv.l = src->indiv.l;
2500
0
    memcpy(dst->indiv.s,src->indiv.s,dst->indiv.l);
2501
2502
0
    return dst;
2503
0
}
2504
bcf1_t *bcf_dup(bcf1_t *src)
2505
0
{
2506
0
    bcf1_t *out = bcf_init1();
2507
0
    return bcf_copy(out, src);
2508
0
}
2509
2510
int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v)
2511
54.8k
{
2512
54.8k
    if ( h->dirty ) {
2513
0
        if (bcf_hdr_sync(h) < 0) return -1;
2514
0
    }
2515
54.8k
    if ( bcf_hdr_nsamples(h)!=v->n_sample )
2516
21
    {
2517
21
        hts_log_error("Broken VCF record, the number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)",
2518
21
            bcf_seqname_safe(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h));
2519
21
        return -1;
2520
21
    }
2521
2522
54.7k
    if ( hfp->format.format == vcf || hfp->format.format == text_format )
2523
53.7k
        return vcf_write(hfp,h,v);
2524
2525
1.08k
    if ( v->errcode & ~BCF_ERR_LIMITS ) // todo: unsure about the other BCF_ERR_LIMITS branches in vcf_parse_format_alloc4()
2526
1.08k
    {
2527
        // vcf_parse1() encountered a new contig or tag, undeclared in the
2528
        // header.  At this point, the header must have been printed,
2529
        // proceeding would lead to a broken BCF file. Errors must be checked
2530
        // and cleared by the caller before we can proceed.
2531
1.08k
        char errdescription[1024] = "";
2532
1.08k
        hts_log_error("Unchecked error (%d %s) at %s:%"PRIhts_pos, v->errcode, bcf_strerror(v->errcode, errdescription, sizeof(errdescription)), bcf_seqname_safe(h,v), v->pos+1);
2533
1.08k
        return -1;
2534
1.08k
    }
2535
0
    bcf1_sync(v);   // check if the BCF record was modified
2536
2537
0
    if ( v->unpacked & BCF_IS_64BIT )
2538
0
    {
2539
0
        hts_log_error("Data at %s:%"PRIhts_pos" contains 64-bit values not representable in BCF. Please use VCF instead", bcf_seqname_safe(h,v), v->pos+1);
2540
0
        return -1;
2541
0
    }
2542
2543
0
    BGZF *fp = hfp->fp.bgzf;
2544
0
    uint8_t x[32];
2545
0
    u32_to_le(v->shared.l + 24, x); // to include six 32-bit integers
2546
0
    u32_to_le(v->indiv.l, x + 4);
2547
0
    i32_to_le(v->rid, x + 8);
2548
0
    u32_to_le(v->pos, x + 12);
2549
0
    u32_to_le(v->rlen, x + 16);
2550
0
    float_to_le(v->qual, x + 20);
2551
0
    u16_to_le(v->n_info, x + 24);
2552
0
    u16_to_le(v->n_allele, x + 26);
2553
0
    u32_to_le((uint32_t)v->n_fmt<<24 | (v->n_sample & 0xffffff), x + 28);
2554
0
    if ( bgzf_write(fp, x, 32) != 32 ) return -1;
2555
0
    if ( bgzf_write(fp, v->shared.s, v->shared.l) != v->shared.l ) return -1;
2556
0
    if ( bgzf_write(fp, v->indiv.s, v->indiv.l) != v->indiv.l ) return -1;
2557
2558
0
    if (hfp->idx) {
2559
0
        if (bgzf_idx_push(fp, hfp->idx, v->rid, v->pos, v->pos + v->rlen,
2560
0
                          bgzf_tell(fp), 1) < 0)
2561
0
            return -1;
2562
0
    }
2563
2564
0
    return 0;
2565
0
}
2566
2567
/**********************
2568
 *** VCF header I/O ***
2569
 **********************/
2570
2571
0
static int add_missing_contig_hrec(bcf_hdr_t *h, const char *name) {
2572
0
    bcf_hrec_t *hrec = calloc(1, sizeof(bcf_hrec_t));
2573
0
    int save_errno;
2574
0
    if (!hrec) goto fail;
2575
2576
0
    hrec->key = strdup("contig");
2577
0
    if (!hrec->key) goto fail;
2578
2579
0
    if (bcf_hrec_add_key(hrec, "ID", strlen("ID")) < 0) goto fail;
2580
0
    if (bcf_hrec_set_val(hrec, hrec->nkeys-1, name, strlen(name), 0) < 0)
2581
0
        goto fail;
2582
0
    if (bcf_hdr_add_hrec(h, hrec) < 0)
2583
0
        goto fail;
2584
0
    return 0;
2585
2586
0
 fail:
2587
0
    save_errno = errno;
2588
0
    hts_log_error("%s", strerror(errno));
2589
0
    if (hrec) bcf_hrec_destroy(hrec);
2590
0
    errno = save_errno;
2591
0
    return -1;
2592
0
}
2593
2594
bcf_hdr_t *vcf_hdr_read(htsFile *fp)
2595
2.75k
{
2596
2.75k
    kstring_t txt, *s = &fp->line;
2597
2.75k
    int ret;
2598
2.75k
    bcf_hdr_t *h;
2599
2.75k
    tbx_t *idx = NULL;
2600
2.75k
    const char **names = NULL;
2601
2.75k
    h = bcf_hdr_init("r");
2602
2.75k
    if (!h) {
2603
0
        hts_log_error("Failed to allocate bcf header");
2604
0
        return NULL;
2605
0
    }
2606
2.75k
    txt.l = txt.m = 0; txt.s = 0;
2607
103k
    while ((ret = hts_getline(fp, KS_SEP_LINE, s)) >= 0) {
2608
103k
        int e = 0;
2609
103k
        if (s->l == 0) continue;
2610
99.3k
        if (s->s[0] != '#') {
2611
18
            hts_log_error("No sample line");
2612
18
            goto error;
2613
18
        }
2614
99.3k
        if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here
2615
0
            kstring_t tmp = { 0, 0, NULL };
2616
0
            hFILE *f = hopen(fp->fn_aux, "r");
2617
0
            if (f == NULL) {
2618
0
                hts_log_error("Couldn't open \"%s\"", fp->fn_aux);
2619
0
                goto error;
2620
0
            }
2621
0
            while (tmp.l = 0, kgetline(&tmp, (kgets_func *) hgets, f) >= 0) {
2622
0
                char *tab = strchr(tmp.s, '\t');
2623
0
                if (tab == NULL) continue;
2624
0
                e |= (kputs("##contig=<ID=", &txt) < 0);
2625
0
                e |= (kputsn(tmp.s, tab - tmp.s, &txt) < 0);
2626
0
                e |= (kputs(",length=", &txt) < 0);
2627
0
                e |= (kputl(atol(tab), &txt) < 0);
2628
0
                e |= (kputsn(">\n", 2, &txt) < 0);
2629
0
            }
2630
0
            free(tmp.s);
2631
0
            if (hclose(f) != 0) {
2632
0
                hts_log_error("Error on closing %s", fp->fn_aux);
2633
0
                goto error;
2634
0
            }
2635
0
            if (e) goto error;
2636
0
        }
2637
99.3k
        if (kputsn(s->s, s->l, &txt) < 0) goto error;
2638
99.3k
        if (kputc('\n', &txt) < 0) goto error;
2639
99.3k
        if (s->s[1] != '#') break;
2640
99.3k
    }
2641
2.73k
    if ( ret < -1 ) goto error;
2642
2.71k
    if ( !txt.s )
2643
0
    {
2644
0
        hts_log_error("Could not read the header");
2645
0
        goto error;
2646
0
    }
2647
2.71k
    if ( bcf_hdr_parse(h, txt.s) < 0 ) goto error;
2648
2649
    // check tabix index, are all contigs listed in the header? add the missing ones
2650
2.32k
    idx = tbx_index_load3(fp->fn, NULL, HTS_IDX_SILENT_FAIL);
2651
2.32k
    if ( idx )
2652
0
    {
2653
0
        int i, n, need_sync = 0;
2654
0
        names = tbx_seqnames(idx, &n);
2655
0
        if (!names) goto error;
2656
0
        for (i=0; i<n; i++)
2657
0
        {
2658
0
            bcf_hrec_t *hrec = bcf_hdr_get_hrec(h, BCF_HL_CTG, "ID", (char*) names[i], NULL);
2659
0
            if ( hrec ) continue;
2660
0
            if (add_missing_contig_hrec(h, names[i]) < 0) goto error;
2661
0
            need_sync = 1;
2662
0
        }
2663
0
        if ( need_sync ) {
2664
0
            if (bcf_hdr_sync(h) < 0) goto error;
2665
0
        }
2666
0
        free(names);
2667
0
        tbx_destroy(idx);
2668
0
    }
2669
2.32k
    free(txt.s);
2670
2.32k
    return h;
2671
2672
424
 error:
2673
424
    if (idx) tbx_destroy(idx);
2674
424
    free(names);
2675
424
    free(txt.s);
2676
424
    if (h) bcf_hdr_destroy(h);
2677
424
    return NULL;
2678
2.32k
}
2679
2680
int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname)
2681
0
{
2682
0
    int i = 0, n = 0, save_errno;
2683
0
    char **lines = hts_readlines(fname, &n);
2684
0
    if ( !lines ) return 1;
2685
0
    for (i=0; i<n-1; i++)
2686
0
    {
2687
0
        int k;
2688
0
        bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,lines[i],&k);
2689
0
        if (!hrec) goto fail;
2690
0
        if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
2691
0
            bcf_hrec_destroy(hrec);
2692
0
            goto fail;
2693
0
        }
2694
0
        free(lines[i]);
2695
0
        lines[i] = NULL;
2696
0
    }
2697
0
    if (bcf_hdr_parse_sample_line(hdr, lines[n-1]) < 0) goto fail;
2698
0
    if (bcf_hdr_sync(hdr) < 0) goto fail;
2699
0
    free(lines[n-1]);
2700
0
    free(lines);
2701
0
    return 0;
2702
2703
0
 fail:
2704
0
    save_errno = errno;
2705
0
    for (; i < n; i++)
2706
0
        free(lines[i]);
2707
0
    free(lines);
2708
0
    errno = save_errno;
2709
0
    return 1;
2710
0
}
2711
2712
static int _bcf_hrec_format(const bcf_hrec_t *hrec, int is_bcf, kstring_t *str)
2713
13.2k
{
2714
13.2k
    uint32_t e = 0;
2715
13.2k
    if ( !hrec->value )
2716
7.41k
    {
2717
7.41k
        int j, nout = 0;
2718
7.41k
        e |= ksprintf(str, "##%s=<", hrec->key) < 0;
2719
26.8k
        for (j=0; j<hrec->nkeys; j++)
2720
19.4k
        {
2721
            // do not output IDX if output is VCF
2722
19.4k
            if ( !is_bcf && !strcmp("IDX",hrec->keys[j]) ) continue;
2723
16.9k
            if ( nout ) e |= kputc(',',str) < 0;
2724
16.9k
            e |= ksprintf(str,"%s=%s", hrec->keys[j], hrec->vals[j]) < 0;
2725
16.9k
            nout++;
2726
16.9k
        }
2727
7.41k
        e |= ksprintf(str,">\n") < 0;
2728
7.41k
    }
2729
5.83k
    else
2730
5.83k
        e |= ksprintf(str,"##%s=%s\n", hrec->key,hrec->value) < 0;
2731
2732
13.2k
    return e == 0 ? 0 : -1;
2733
13.2k
}
2734
2735
int bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str)
2736
0
{
2737
0
    return _bcf_hrec_format(hrec,0,str);
2738
0
}
2739
2740
int bcf_hdr_format(const bcf_hdr_t *hdr, int is_bcf, kstring_t *str)
2741
2.54k
{
2742
2.54k
    int i, r = 0;
2743
15.7k
    for (i=0; i<hdr->nhrec; i++)
2744
13.2k
        r |= _bcf_hrec_format(hdr->hrec[i], is_bcf, str) < 0;
2745
2746
2.54k
    r |= ksprintf(str, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO") < 0;
2747
2.54k
    if ( bcf_hdr_nsamples(hdr) )
2748
646
    {
2749
646
        r |= ksprintf(str, "\tFORMAT") < 0;
2750
5.87k
        for (i=0; i<bcf_hdr_nsamples(hdr); i++)
2751
5.22k
            r |= ksprintf(str, "\t%s", hdr->samples[i]) < 0;
2752
646
    }
2753
2.54k
    r |= ksprintf(str, "\n") < 0;
2754
2755
2.54k
    return r ? -1 : 0;
2756
2.54k
}
2757
2758
char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len)
2759
0
{
2760
0
    kstring_t txt = {0,0,0};
2761
0
    if (bcf_hdr_format(hdr, is_bcf, &txt) < 0)
2762
0
        return NULL;
2763
0
    if ( len ) *len = txt.l;
2764
0
    return txt.s;
2765
0
}
2766
2767
const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *n)
2768
0
{
2769
0
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
2770
0
    int i, tid, m = kh_size(d);
2771
0
    const char **names = (const char**) calloc(m,sizeof(const char*));
2772
0
    if ( !names )
2773
0
    {
2774
0
        hts_log_error("Failed to allocate memory");
2775
0
        *n = 0;
2776
0
        return NULL;
2777
0
    }
2778
0
    khint_t k;
2779
0
    for (k=kh_begin(d); k<kh_end(d); k++)
2780
0
    {
2781
0
        if ( !kh_exist(d,k) ) continue;
2782
0
        if ( !kh_val(d, k).hrec[0] ) continue;  // removed via bcf_hdr_remove
2783
0
        tid = kh_val(d,k).id;
2784
0
        if ( tid >= m )
2785
0
        {
2786
            // This can happen after a contig has been removed from BCF header via bcf_hdr_remove()
2787
0
            if ( hts_resize(const char*, tid + 1, &m, &names, HTS_RESIZE_CLEAR)<0 )
2788
0
            {
2789
0
                hts_log_error("Failed to allocate memory");
2790
0
                *n = 0;
2791
0
                free(names);
2792
0
                return NULL;
2793
0
            }
2794
0
            m = tid + 1;
2795
0
        }
2796
0
        names[tid] = kh_key(d,k);
2797
0
    }
2798
    // ensure there are no gaps
2799
0
    for (i=0,tid=0; tid<m; i++,tid++)
2800
0
    {
2801
0
        while ( tid<m && !names[tid] ) tid++;
2802
0
        if ( tid==m ) break;
2803
0
        if ( i==tid ) continue;
2804
0
        names[i] = names[tid];
2805
0
        names[tid] = 0;
2806
0
    }
2807
0
    *n = i;
2808
0
    return names;
2809
0
}
2810
2811
int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h)
2812
1.27k
{
2813
1.27k
    kstring_t htxt = {0,0,0};
2814
1.27k
    if (bcf_hdr_format(h, 0, &htxt) < 0) {
2815
0
        free(htxt.s);
2816
0
        return -1;
2817
0
    }
2818
1.27k
    while (htxt.l && htxt.s[htxt.l-1] == '\0') --htxt.l; // kill trailing zeros
2819
1.27k
    int ret;
2820
1.27k
    if ( fp->format.compression!=no_compression ) {
2821
0
        ret = bgzf_write(fp->fp.bgzf, htxt.s, htxt.l);
2822
0
        if (bgzf_flush(fp->fp.bgzf) != 0) return -1;
2823
1.27k
    } else {
2824
1.27k
        ret = hwrite(fp->fp.hfile, htxt.s, htxt.l);
2825
1.27k
    }
2826
1.27k
    free(htxt.s);
2827
1.27k
    return ret<0 ? -1 : 0;
2828
1.27k
}
2829
2830
/***********************
2831
 *** Typed value I/O ***
2832
 ***********************/
2833
2834
int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
2835
211k
{
2836
211k
    int32_t max = INT32_MIN, min = INT32_MAX;
2837
211k
    int i;
2838
211k
    if (n <= 0) {
2839
5.20k
        return bcf_enc_size(s, 0, BCF_BT_NULL);
2840
206k
    } else if (n == 1) {
2841
47.4k
        return bcf_enc_int1(s, a[0]);
2842
159k
    } else {
2843
159k
        if (wsize <= 0) wsize = n;
2844
2845
        // Equivalent to:
2846
        // for (i = 0; i < n; ++i) {
2847
        //     if (a[i] == bcf_int32_missing || a[i] == bcf_int32_vector_end )
2848
        //         continue;
2849
        //     if (max < a[i]) max = a[i];
2850
        //     if (min > a[i]) min = a[i];
2851
        // }
2852
159k
        int max4[4] = {INT32_MIN, INT32_MIN, INT32_MIN, INT32_MIN};
2853
159k
        int min4[4] = {INT32_MAX, INT32_MAX, INT32_MAX, INT32_MAX};
2854
27.7M
        for (i = 0; i < (n&~3); i+=4) {
2855
            // bcf_int32_missing    == INT32_MIN and
2856
            // bcf_int32_vector_end == INT32_MIN+1.
2857
            // We skip these, but can mostly avoid explicit checking
2858
27.5M
            if (max4[0] < a[i+0]) max4[0] = a[i+0];
2859
27.5M
            if (max4[1] < a[i+1]) max4[1] = a[i+1];
2860
27.5M
            if (max4[2] < a[i+2]) max4[2] = a[i+2];
2861
27.5M
            if (max4[3] < a[i+3]) max4[3] = a[i+3];
2862
27.5M
            if (min4[0] > a[i+0] && a[i+0] > INT32_MIN+1) min4[0] = a[i+0];
2863
27.5M
            if (min4[1] > a[i+1] && a[i+1] > INT32_MIN+1) min4[1] = a[i+1];
2864
27.5M
            if (min4[2] > a[i+2] && a[i+2] > INT32_MIN+1) min4[2] = a[i+2];
2865
27.5M
            if (min4[3] > a[i+3] && a[i+3] > INT32_MIN+1) min4[3] = a[i+3];
2866
27.5M
        }
2867
159k
        min = min4[0];
2868
159k
        if (min > min4[1]) min = min4[1];
2869
159k
        if (min > min4[2]) min = min4[2];
2870
159k
        if (min > min4[3]) min = min4[3];
2871
159k
        max = max4[0];
2872
159k
        if (max < max4[1]) max = max4[1];
2873
159k
        if (max < max4[2]) max = max4[2];
2874
159k
        if (max < max4[3]) max = max4[3];
2875
389k
        for (; i < n; ++i) {
2876
230k
            if (max < a[i]) max = a[i];
2877
230k
            if (min > a[i] && a[i] > INT32_MIN+1) min = a[i];
2878
230k
        }
2879
2880
159k
        if (max <= BCF_MAX_BT_INT8 && min >= BCF_MIN_BT_INT8) {
2881
16.5k
            if (bcf_enc_size(s, wsize, BCF_BT_INT8) < 0 ||
2882
16.5k
                ks_resize(s, s->l + n) < 0)
2883
0
                return -1;
2884
16.5k
            uint8_t *p = (uint8_t *) s->s + s->l;
2885
3.43M
            for (i = 0; i < n; ++i, p++) {
2886
3.41M
                if ( a[i]==bcf_int32_vector_end )   *p = bcf_int8_vector_end;
2887
3.41M
                else if ( a[i]==bcf_int32_missing ) *p = bcf_int8_missing;
2888
170k
                else *p = a[i];
2889
3.41M
            }
2890
16.5k
            s->l += n;
2891
142k
        } else if (max <= BCF_MAX_BT_INT16 && min >= BCF_MIN_BT_INT16) {
2892
109k
            uint8_t *p;
2893
109k
            if (bcf_enc_size(s, wsize, BCF_BT_INT16) < 0 ||
2894
109k
                ks_resize(s, s->l + n * sizeof(int16_t)) < 0)
2895
0
                return -1;
2896
109k
            p = (uint8_t *) s->s + s->l;
2897
47.3M
            for (i = 0; i < n; ++i)
2898
47.2M
            {
2899
47.2M
                int16_t x;
2900
47.2M
                if ( a[i]==bcf_int32_vector_end ) x = bcf_int16_vector_end;
2901
47.2M
                else if ( a[i]==bcf_int32_missing ) x = bcf_int16_missing;
2902
1.10M
                else x = a[i];
2903
47.2M
                i16_to_le(x, p);
2904
47.2M
                p += sizeof(int16_t);
2905
47.2M
            }
2906
109k
            s->l += n * sizeof(int16_t);
2907
109k
        } else {
2908
32.7k
            uint8_t *p;
2909
32.7k
            if (bcf_enc_size(s, wsize, BCF_BT_INT32) < 0 ||
2910
32.7k
                ks_resize(s, s->l + n * sizeof(int32_t)) < 0)
2911
0
                return -1;
2912
32.7k
            p = (uint8_t *) s->s + s->l;
2913
59.8M
            for (i = 0; i < n; ++i) {
2914
59.8M
                i32_to_le(a[i], p);
2915
59.8M
                p += sizeof(int32_t);
2916
59.8M
            }
2917
32.7k
            s->l += n * sizeof(int32_t);
2918
32.7k
        }
2919
159k
    }
2920
2921
159k
    return 0;
2922
211k
}
2923
2924
#ifdef VCF_ALLOW_INT64
2925
static int bcf_enc_long1(kstring_t *s, int64_t x) {
2926
    uint32_t e = 0;
2927
    if (x <= BCF_MAX_BT_INT32 && x >= BCF_MIN_BT_INT32)
2928
        return bcf_enc_int1(s, x);
2929
    if (x == bcf_int64_vector_end) {
2930
        e |= bcf_enc_size(s, 1, BCF_BT_INT8);
2931
        e |= kputc(bcf_int8_vector_end, s) < 0;
2932
    } else if (x == bcf_int64_missing) {
2933
        e |= bcf_enc_size(s, 1, BCF_BT_INT8);
2934
        e |= kputc(bcf_int8_missing, s) < 0;
2935
    } else {
2936
        e |= bcf_enc_size(s, 1, BCF_BT_INT64);
2937
        e |= ks_expand(s, 8);
2938
        if (e == 0) { u64_to_le(x, (uint8_t *) s->s + s->l); s->l += 8; }
2939
    }
2940
    return e == 0 ? 0 : -1;
2941
}
2942
#endif
2943
2944
530k
static inline int serialize_float_array(kstring_t *s, size_t n, const float *a) {
2945
530k
    uint8_t *p;
2946
530k
    size_t i;
2947
530k
    size_t bytes = n * sizeof(float);
2948
2949
530k
    if (bytes / sizeof(float) != n) return -1;
2950
530k
    if (ks_resize(s, s->l + bytes) < 0) return -1;
2951
2952
530k
    p = (uint8_t *) s->s + s->l;
2953
103M
    for (i = 0; i < n; i++) {
2954
103M
        float_to_le(a[i], p);
2955
103M
        p += sizeof(float);
2956
103M
    }
2957
530k
    s->l += bytes;
2958
2959
530k
    return 0;
2960
530k
}
2961
2962
int bcf_enc_vfloat(kstring_t *s, int n, float *a)
2963
530k
{
2964
530k
    assert(n >= 0);
2965
530k
    bcf_enc_size(s, n, BCF_BT_FLOAT);
2966
530k
    serialize_float_array(s, n, a);
2967
530k
    return 0; // FIXME: check for errs in this function
2968
530k
}
2969
2970
int bcf_enc_vchar(kstring_t *s, int l, const char *a)
2971
4.14M
{
2972
4.14M
    bcf_enc_size(s, l, BCF_BT_CHAR);
2973
4.14M
    kputsn(a, l, s);
2974
4.14M
    return 0; // FIXME: check for errs in this function
2975
4.14M
}
2976
2977
// Special case of n==1 as it also occurs quite often in FORMAT data.
2978
// This version is also small enough to get inlined.
2979
7.05k
static inline int bcf_fmt_array1(kstring_t *s, int type, void *data) {
2980
7.05k
    uint32_t e = 0;
2981
7.05k
    uint8_t *p = (uint8_t *)data;
2982
7.05k
    int32_t v;
2983
2984
    // helps gcc more than clang here. In billions of cycles:
2985
    //          bcf_fmt_array1  bcf_fmt_array
2986
    // gcc7:    23.2            24.3
2987
    // gcc13:   21.6            23.0
2988
    // clang13: 27.1            27.8
2989
7.05k
    switch (type) {
2990
7.05k
    case BCF_BT_CHAR:
2991
7.05k
        e |= kputc_(*p == bcf_str_missing ? '.' : *p, s) < 0;
2992
7.05k
        break;
2993
2994
0
    case BCF_BT_INT8:
2995
0
        if (*(int8_t *)p != bcf_int8_vector_end) {
2996
0
            e |= ((*(int8_t *)p == bcf_int8_missing)
2997
0
                  ? kputc_('.', s)
2998
0
                  : kputw(*(int8_t *)p, s)) < 0;
2999
0
        }
3000
0
        break;
3001
0
    case BCF_BT_INT16:
3002
0
        v = le_to_i16(p);
3003
0
        if (v != bcf_int16_vector_end) {
3004
0
            e |= (v == bcf_int16_missing
3005
0
                  ? kputc_('.', s)
3006
0
                  : kputw(v, s)) < 0;
3007
0
        }
3008
0
        break;
3009
3010
0
    case BCF_BT_INT32:
3011
0
        v = le_to_i32(p);
3012
0
        if (v != bcf_int32_vector_end) {
3013
0
            e |= (v == bcf_int32_missing
3014
0
                  ? kputc_('.', s)
3015
0
                  : kputw(v, s)) < 0;
3016
0
        }
3017
0
        break;
3018
3019
0
    case BCF_BT_FLOAT:
3020
0
        v = le_to_u32(p);
3021
0
        if (v != bcf_float_vector_end) {
3022
0
            e |= (v == bcf_float_missing
3023
0
                  ? kputc_('.', s)
3024
0
                  : kputd(le_to_float(p), s)) < 0;
3025
0
        }
3026
0
        break;
3027
3028
0
    default:
3029
0
        hts_log_error("Unexpected type %d", type);
3030
0
        return -1;
3031
7.05k
    }
3032
3033
7.05k
    return e == 0 ? 0 : -1;
3034
7.05k
}
3035
3036
int bcf_fmt_array(kstring_t *s, int n, int type, void *data)
3037
3.10M
{
3038
3.10M
    int j = 0;
3039
3.10M
    uint32_t e = 0;
3040
3.10M
    if (n == 0) {
3041
2.08M
        return kputc_('.', s) >= 0 ? 0 : -1;
3042
2.08M
    }
3043
3044
1.01M
    if (type == BCF_BT_CHAR)
3045
361k
    {
3046
361k
        char *p = (char *)data;
3047
3048
        // Note bcf_str_missing is already accounted for in n==0 above.
3049
361k
        if (n >= 8) {
3050
78.6k
            char *p_end = memchr(p, 0, n);
3051
78.6k
            e |= kputsn(p, p_end ? p_end-p : n, s) < 0;
3052
282k
        } else {
3053
1.05M
            for (j = 0; j < n && *p; ++j, ++p)
3054
769k
               e |= kputc(*p, s) < 0;
3055
282k
        }
3056
361k
    }
3057
652k
    else
3058
652k
    {
3059
652k
        #define BRANCH(type_t, convert, is_missing, is_vector_end, kprint) { \
3060
652k
            uint8_t *p = (uint8_t *) data; \
3061
107M
            for (j=0; j<n; j++, p += sizeof(type_t))    \
3062
106M
            { \
3063
106M
                type_t v = convert(p); \
3064
106M
                if ( is_vector_end ) break; \
3065
106M
                if ( j ) e |= kputc_(',', s) < 0; \
3066
106M
                e |= (is_missing ? kputc('.', s) : kprint) < 0; \
3067
106M
            } \
3068
652k
        }
3069
652k
        switch (type) {
3070
166k
            case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8, v==bcf_int8_missing,  v==bcf_int8_vector_end,  kputw(v, s)); break;
3071
106k
            case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, v==bcf_int16_missing, v==bcf_int16_vector_end, kputw(v, s)); break;
3072
112k
            case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, v==bcf_int32_missing, v==bcf_int32_vector_end, kputw(v, s)); break;
3073
266k
            case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, v==bcf_float_missing, v==bcf_float_vector_end, kputd(le_to_float(p), s)); break;
3074
0
            default: hts_log_error("Unexpected type %d", type); exit(1); break;
3075
652k
        }
3076
652k
        #undef BRANCH
3077
652k
    }
3078
1.01M
    return e == 0 ? 0 : -1;
3079
1.01M
}
3080
3081
uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr)
3082
2.32M
{
3083
2.32M
    int x, type;
3084
2.32M
    x = bcf_dec_size(ptr, &ptr, &type);
3085
2.32M
    bcf_fmt_array(s, x, type, ptr);
3086
2.32M
    return ptr + (x << bcf_type_shift[type]);
3087
2.32M
}
3088
3089
/********************
3090
 *** VCF site I/O ***
3091
 ********************/
3092
3093
typedef struct {
3094
    int key;            // Key for h->id[BCF_DT_ID][key] vdict
3095
    int max_m;          // number of elements in field array (ie commas)
3096
    int size;           // field size (max_l or max_g*4 if is_gt)
3097
    int offset;         // offset of buf into h->mem
3098
    uint32_t is_gt:1,   // is genotype
3099
             max_g:31;  // maximum number of genotypes
3100
    uint32_t max_l;     // length of field
3101
    uint32_t y;         // h->id[0][fmt[j].key].val->info[BCF_HL_FMT]
3102
    uint8_t *buf;       // Pointer into h->mem
3103
} fmt_aux_t;
3104
3105
// fmt_aux_t field notes:
3106
// max_* are biggest sizes of the various FORMAT fields across all samples.
3107
// We use these after pivoting the data to ensure easy random access
3108
// of a specific sample.
3109
//
3110
// max_m is only used for type BCF_HT_REAL or BCF_HT_INT
3111
// max_g is only used for is_gt == 1 (will be BCF_HT_STR)
3112
// max_l is only used for is_gt == 0 (will be BCF_HT_STR)
3113
//
3114
// These are computed in vcf_parse_format_max3 and used in
3115
// vcf_parse_format_alloc4 to get the size.
3116
//
3117
// size is computed from max_g, max_l, max_m and is_gt.  Once computed
3118
// the max values are never accessed again.
3119
//
3120
// In theory all 4 vars could be coalesced into a single variable, but this
3121
// significantly harms speed (even if done via a union).  It's about 25-30%
3122
// slower.
3123
3124
static inline int align_mem(kstring_t *s)
3125
62.2k
{
3126
62.2k
    int e = 0;
3127
62.2k
    if (s->l&7) {
3128
8.88k
        uint64_t zero = 0;
3129
8.88k
        e = kputsn((char*)&zero, 8 - (s->l&7), s) < 0;
3130
8.88k
    }
3131
62.2k
    return e == 0 ? 0 : -1;
3132
62.2k
}
3133
3134
62.6k
#define MAX_N_FMT 255   /* Limited by size of bcf1_t n_fmt field */
3135
3136
// detect FORMAT "."
3137
static int vcf_parse_format_empty1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3138
14.0k
                                   const char *p, const char *q) {
3139
14.0k
    const char *end = s->s + s->l;
3140
14.0k
    if ( q>=end )
3141
6
    {
3142
6
        hts_log_error("FORMAT column with no sample columns starting at %s:%"PRIhts_pos"", bcf_seqname_safe(h,v), v->pos+1);
3143
6
        v->errcode |= BCF_ERR_NCOLS;
3144
6
        return -1;
3145
6
    }
3146
3147
13.9k
    v->n_fmt = 0;
3148
13.9k
    if ( p[0]=='.' && p[1]==0 ) // FORMAT field is empty "."
3149
119
    {
3150
119
        v->n_sample = bcf_hdr_nsamples(h);
3151
119
        return 1;
3152
119
    }
3153
3154
13.8k
    return 0;
3155
13.9k
}
3156
3157
// get format information from the dictionary
3158
static int vcf_parse_format_dict2(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3159
13.8k
                                  const char *p, const char *q, fmt_aux_t *fmt) {
3160
13.8k
    const vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
3161
13.8k
    char *t;
3162
13.8k
    int j;
3163
13.8k
    ks_tokaux_t aux1;
3164
3165
76.5k
    for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) {
3166
62.6k
        if (j >= MAX_N_FMT) {
3167
0
            v->errcode |= BCF_ERR_LIMITS;
3168
0
            hts_log_error("FORMAT column at %s:%"PRIhts_pos" lists more identifiers than htslib can handle",
3169
0
                bcf_seqname_safe(h,v), v->pos+1);
3170
0
            return -1;
3171
0
        }
3172
3173
62.6k
        *(char*)aux1.p = 0;
3174
62.6k
        khint_t k = kh_get(vdict, d, t);
3175
62.6k
        if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_FMT] == 15) {
3176
3.48k
            if ( t[0]=='.' && t[1]==0 )
3177
0
            {
3178
0
                hts_log_error("Invalid FORMAT tag name '.' at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3179
0
                v->errcode |= BCF_ERR_TAG_INVALID;
3180
0
                return -1;
3181
0
            }
3182
3.48k
            hts_log_warning("FORMAT '%s' at %s:%"PRIhts_pos" is not defined in the header, assuming Type=String", t, bcf_seqname_safe(h,v), v->pos+1);
3183
3.48k
            kstring_t tmp = {0,0,0};
3184
3.48k
            int l;
3185
3.48k
            ksprintf(&tmp, "##FORMAT=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", t);
3186
3.48k
            bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
3187
3.48k
            free(tmp.s);
3188
3.48k
            int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
3189
3.48k
            if (res < 0) bcf_hrec_destroy(hrec);
3190
3.48k
            if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
3191
3192
3.48k
            k = kh_get(vdict, d, t);
3193
3.48k
            v->errcode |= BCF_ERR_TAG_UNDEF;
3194
3.48k
            if (res || k == kh_end(d)) {
3195
18
                hts_log_error("Could not add dummy header for FORMAT '%s' at %s:%"PRIhts_pos, t, bcf_seqname_safe(h,v), v->pos+1);
3196
18
                v->errcode |= BCF_ERR_TAG_INVALID;
3197
18
                return -1;
3198
18
            }
3199
3.48k
        }
3200
62.6k
        fmt[j].max_l = fmt[j].max_m = fmt[j].max_g = 0;
3201
62.6k
        fmt[j].key = kh_val(d, k).id;
3202
62.6k
        fmt[j].is_gt = (t[0] == 'G' && t[1] == 'T' && !t[2]);
3203
62.6k
        fmt[j].y = h->id[0][fmt[j].key].val->info[BCF_HL_FMT];
3204
62.6k
        v->n_fmt++;
3205
62.6k
    }
3206
13.8k
    return 0;
3207
13.8k
}
3208
3209
// compute max
3210
static int vcf_parse_format_max3(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3211
13.8k
                                 char *p, char *q, fmt_aux_t *fmt) {
3212
13.8k
    int n_sample_ori = -1;
3213
13.8k
    char *r = q + 1;  // r: position in the format string
3214
13.8k
    int l = 0, m = 1, g = 1, j;
3215
13.8k
    v->n_sample = 0;  // m: max vector size, l: max field len, g: max number of alleles
3216
13.8k
    const char *end = s->s + s->l;
3217
3218
29.9k
    while ( r<end )
3219
29.8k
    {
3220
        // can we skip some samples?
3221
29.8k
        if ( h->keep_samples )
3222
0
        {
3223
0
            n_sample_ori++;
3224
0
            if ( !bit_array_test(h->keep_samples,n_sample_ori) )
3225
0
            {
3226
0
                while ( *r!='\t' && r<end ) r++;
3227
0
                if ( *r=='\t' ) { *r = 0; r++; }
3228
0
                continue;
3229
0
            }
3230
0
        }
3231
3232
        // collect fmt stats: max vector size, length, number of alleles
3233
29.8k
        j = 0;  // j-th format field
3234
29.8k
        fmt_aux_t *f = fmt;
3235
29.8k
        static char meta[256] = {
3236
            // \0 \t , / : |
3237
29.8k
            1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3238
29.8k
            0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1, 0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,
3239
29.8k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3240
29.8k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
3241
29.8k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3242
29.8k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3243
29.8k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
3244
29.8k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3245
29.8k
        };
3246
3247
29.8k
        char *r_start = r;
3248
4.75M
        for (;;) {
3249
            // Quickly skip ahead to an appropriate meta-character
3250
5.37M
            while (!meta[(unsigned char)*r]) r++;
3251
3252
4.75M
            switch (*r) {
3253
4.70M
            case ',':
3254
4.70M
                m++;
3255
4.70M
                break;
3256
3257
766
            case '|':
3258
8.94k
            case '/':
3259
8.94k
                if (f->is_gt) g++;
3260
8.94k
                break;
3261
3262
13.8k
            case '\t':
3263
13.8k
                *r = 0; // fall through
3264
3265
13.8k
            default: // valid due to while loop above.
3266
29.8k
            case '\0':
3267
43.9k
            case ':':
3268
43.9k
                l = r - r_start; r_start = r;
3269
43.9k
                if (f->max_m < m) f->max_m = m;
3270
43.9k
                if (f->max_l < l) f->max_l = l;
3271
43.9k
                if (f->is_gt && f->max_g < g) f->max_g = g;
3272
43.9k
                l = 0, m = g = 1;
3273
43.9k
                if ( *r==':' ) {
3274
14.1k
                    j++; f++;
3275
14.1k
                    if ( j>=v->n_fmt ) {
3276
28
                        hts_log_error("Incorrect number of FORMAT fields at %s:%"PRIhts_pos"",
3277
28
                                      h->id[BCF_DT_CTG][v->rid].key, v->pos+1);
3278
28
                        v->errcode |= BCF_ERR_NCOLS;
3279
28
                        return -1;
3280
28
                    }
3281
29.8k
                } else goto end_for;
3282
14.0k
                break;
3283
4.75M
            }
3284
4.72M
            if ( r>=end ) break;
3285
4.72M
            r++;
3286
4.72M
        }
3287
29.8k
    end_for:
3288
29.8k
        v->n_sample++;
3289
29.8k
        if ( v->n_sample == bcf_hdr_nsamples(h) ) break;
3290
16.0k
        r++;
3291
16.0k
    }
3292
3293
13.8k
    return 0;
3294
13.8k
}
3295
3296
// allocate memory for arrays
3297
static int vcf_parse_format_alloc4(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3298
                                   const char *p, const char *q,
3299
13.8k
                                   fmt_aux_t *fmt) {
3300
13.8k
    kstring_t *mem = (kstring_t*)&h->mem;
3301
3302
13.8k
    int j;
3303
76.1k
    for (j = 0; j < v->n_fmt; ++j) {
3304
62.2k
        fmt_aux_t *f = &fmt[j];
3305
62.2k
        if ( !f->max_m ) f->max_m = 1;  // omitted trailing format field
3306
3307
62.2k
        if ((f->y>>4&0xf) == BCF_HT_STR) {
3308
62.2k
            f->size = f->is_gt? f->max_g << 2 : f->max_l;
3309
62.2k
        } else if ((f->y>>4&0xf) == BCF_HT_REAL || (f->y>>4&0xf) == BCF_HT_INT) {
3310
0
            f->size = f->max_m << 2;
3311
0
        } else {
3312
0
            hts_log_error("The format type %d at %s:%"PRIhts_pos" is currently not supported", f->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1);
3313
0
            v->errcode |= BCF_ERR_TAG_INVALID;
3314
0
            return -1;
3315
0
        }
3316
3317
62.2k
        if (align_mem(mem) < 0) {
3318
0
            hts_log_error("Memory allocation failure at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3319
0
            v->errcode |= BCF_ERR_LIMITS;
3320
0
            return -1;
3321
0
        }
3322
3323
        // Limit the total memory to ~2Gb per VCF row.  This should mean
3324
        // malformed VCF data is less likely to take excessive memory and/or
3325
        // time.
3326
62.2k
        if ((uint64_t) mem->l + v->n_sample * (uint64_t)f->size > INT_MAX) {
3327
0
            static int warned = 0;
3328
0
            if ( !warned ) hts_log_warning("Excessive memory required by FORMAT fields at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3329
0
            warned = 1;
3330
0
            v->errcode |= BCF_ERR_LIMITS;
3331
0
            f->size = -1;
3332
0
            f->offset = 0;
3333
0
            continue;
3334
0
        }
3335
3336
62.2k
        f->offset = mem->l;
3337
62.2k
        if (ks_resize(mem, mem->l + v->n_sample * (size_t)f->size) < 0) {
3338
0
            hts_log_error("Memory allocation failure at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3339
0
            v->errcode |= BCF_ERR_LIMITS;
3340
0
            return -1;
3341
0
        }
3342
62.2k
        mem->l += v->n_sample * f->size;
3343
62.2k
    }
3344
3345
13.8k
    {
3346
13.8k
        int j;
3347
76.1k
        for (j = 0; j < v->n_fmt; ++j)
3348
62.2k
            fmt[j].buf = (uint8_t*)mem->s + fmt[j].offset;
3349
13.8k
    }
3350
3351
    // check for duplicate tags
3352
13.8k
    int i;
3353
62.2k
    for (i=1; i<v->n_fmt; i++)
3354
48.4k
    {
3355
48.4k
        fmt_aux_t *ifmt = &fmt[i];
3356
48.4k
        if ( ifmt->size==-1 ) continue; // already marked for removal
3357
238k
        for (j=0; j<i; j++)
3358
222k
        {
3359
222k
            fmt_aux_t *jfmt = &fmt[j];
3360
222k
            if ( jfmt->size==-1 ) continue; // already marked for removal
3361
122k
            if ( ifmt->key!=jfmt->key ) continue;
3362
32.0k
            static int warned = 0;
3363
32.0k
            if ( !warned ) hts_log_warning("Duplicate FORMAT tag %s at %s:%"PRIhts_pos, bcf_hdr_int2id(h,BCF_DT_ID,ifmt->key), bcf_seqname_safe(h,v), v->pos+1);
3364
32.0k
            warned = 1;
3365
32.0k
            v->errcode |= BCF_ERR_TAG_INVALID;
3366
32.0k
            ifmt->size = -1;
3367
32.0k
            ifmt->offset = 0;
3368
32.0k
            break;
3369
122k
        }
3370
48.4k
    }
3371
13.8k
    return 0;
3372
13.8k
}
3373
3374
// Fill the sample fields
3375
static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3376
13.8k
                                  const char *p, const char *q, fmt_aux_t *fmt) {
3377
13.8k
    static int extreme_val_warned = 0;
3378
13.8k
    int n_sample_ori = -1;
3379
    // At beginning of the loop t points to the first char of a format
3380
13.8k
    const char *t = q + 1;
3381
13.8k
    int m = 0;   // m: sample id
3382
13.8k
    const int nsamples = bcf_hdr_nsamples(h);
3383
13.8k
    const char *end = s->s + s->l;
3384
3385
13.8k
    int ver = bcf_get_version(h, NULL);
3386
3387
43.5k
    while ( t<end )
3388
42.2k
    {
3389
        // can we skip some samples?
3390
42.2k
        if ( h->keep_samples )
3391
0
        {
3392
0
            n_sample_ori++;
3393
0
            if ( !bit_array_test(h->keep_samples,n_sample_ori) )
3394
0
            {
3395
0
                while ( *t && t<end ) t++;
3396
0
                t++;
3397
0
                continue;
3398
0
            }
3399
0
        }
3400
42.2k
        if ( m == nsamples ) break;
3401
3402
29.7k
        int j = 0; // j-th format field, m-th sample
3403
43.6k
        while ( t < end )
3404
43.3k
        {
3405
43.3k
            fmt_aux_t *z = &fmt[j++];
3406
43.3k
            const int htype = z->y>>4&0xf;
3407
43.3k
            if (!z->buf) {
3408
2
                hts_log_error("Memory allocation failure for FORMAT field type %d at %s:%"PRIhts_pos,
3409
2
                              z->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1);
3410
2
                v->errcode |= BCF_ERR_LIMITS;
3411
2
                return -1;
3412
2
            }
3413
3414
43.3k
            if ( z->size==-1 )
3415
4.99k
            {
3416
                // this field is to be ignored, it's either too big or a duplicate
3417
61.5k
                while ( *t != ':' && *t ) t++;
3418
4.99k
            }
3419
38.3k
            else if (htype == BCF_HT_STR) {
3420
38.3k
                int l;
3421
38.3k
                if (z->is_gt) {
3422
                    // Genotypes.
3423
                    //([/|])?<val>)([|/]<val>)+... where <val> is [0-9]+ or ".".
3424
5.41k
                    int32_t is_phased = 0;
3425
5.41k
                    uint32_t *x = (uint32_t*)(z->buf + z->size * (size_t)m);
3426
5.41k
                    uint32_t unreadable = 0;
3427
5.41k
                    uint32_t max = 0;
3428
5.41k
                    int overflow = 0, ploidy = 0, anyunphased = 0, \
3429
5.41k
                        phasingprfx = 0, unknown1 = 0;
3430
3431
                    /* with prefixed phasing, it is explicitly given for 1st one
3432
                    with non-prefixed, set based on ploidy and phasing of other
3433
                    alleles. */
3434
5.41k
                    if (ver >= VCF44 && (*t == '|' || *t == '/')) {
3435
                        // cache prefix and phasing status
3436
665
                        is_phased = *t++ == '|';
3437
665
                        phasingprfx = 1;
3438
665
                    }
3439
3440
10.7k
                    for (l = 0;; ++t) {
3441
10.7k
                        ploidy++;
3442
10.7k
                        if (*t == '.') {
3443
645
                            ++t, x[l++] = is_phased;
3444
645
                            if (l==1) {   //for 1st allele only
3445
488
                                unknown1 = 1;
3446
488
                            }
3447
10.1k
                        } else {
3448
10.1k
                            const char *tt = t;
3449
10.1k
                            uint32_t val;
3450
                            // Or "v->n_allele < 10", but it doesn't
3451
                            // seem to be any faster and this feels safer.
3452
10.1k
                            if (*t >= '0' && *t <= '9' &&
3453
10.1k
                                !(t[1] >= '0' && t[1] <= '9')) {
3454
4.42k
                                val = *t++ - '0';
3455
5.69k
                            } else {
3456
5.69k
                                val = hts_str2uint(t, (char **)&t,
3457
5.69k
                                                   sizeof(val) * CHAR_MAX - 2,
3458
5.69k
                                                   &overflow);
3459
5.69k
                                unreadable |= tt == t;
3460
5.69k
                            }
3461
10.1k
                            if (max < val) max = val;
3462
10.1k
                            x[l++] = (val + 1) << 1 | is_phased;
3463
10.1k
                        }
3464
10.7k
                        anyunphased |= (ploidy != 1) && !is_phased;
3465
10.7k
                        is_phased = (*t == '|');
3466
10.7k
                        if (*t != '|' && *t != '/') break;
3467
10.7k
                    }
3468
5.41k
                    if (!phasingprfx) { //get GT in v44 way when no prefixed phasing
3469
                        /* no explicit phasing for 1st allele, set based on
3470
                         other alleles and ploidy */
3471
4.74k
                        if (ploidy == 1) {  //implicitly phased
3472
1.34k
                            if (!unknown1) {
3473
1.01k
                                x[0] |= 1;
3474
1.01k
                            }
3475
3.40k
                        } else {            //set by other unphased alleles
3476
3.40k
                            x[0] |= (anyunphased)? 0 : 1;
3477
3.40k
                        }
3478
4.74k
                    }
3479
                    // Possibly check max against v->n_allele instead?
3480
5.41k
                    if (overflow || max > (INT32_MAX >> 1) - 1) {
3481
31
                        hts_log_error("Couldn't read GT data: value too large at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3482
31
                        return -1;
3483
31
                    }
3484
5.38k
                    if (unreadable) {
3485
6
                        hts_log_error("Couldn't read GT data: value not a number or '.' at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3486
6
                        return -1;
3487
6
                    }
3488
5.37k
                    if ( !l ) x[l++] = 0;   // An empty field, insert missing value
3489
6.31k
                    for (; l < z->size>>2; ++l)
3490
935
                        x[l] = bcf_int32_vector_end;
3491
3492
32.9k
                } else {
3493
                    // Otherwise arbitrary strings
3494
32.9k
                    char *x = (char*)z->buf + z->size * (size_t)m;
3495
5.14M
                    for (l = 0; *t != ':' && *t; ++t)
3496
5.11M
                        x[l++] = *t;
3497
32.9k
                    if (z->size > l)
3498
17.3k
                        memset(&x[l], 0, (z->size-l) * sizeof(*x));
3499
32.9k
                }
3500
3501
38.3k
            } else if (htype == BCF_HT_INT) {
3502
                // One or more integers in an array
3503
0
                int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
3504
0
                int l;
3505
0
                for (l = 0;; ++t) {
3506
0
                    if (*t == '.') {
3507
0
                        x[l++] = bcf_int32_missing, ++t; // ++t to skip "."
3508
0
                    } else {
3509
0
                        int overflow = 0;
3510
0
                        char *te;
3511
0
                        long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
3512
0
                        if ( te==t || overflow || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 )
3513
0
                        {
3514
0
                            if ( !extreme_val_warned )
3515
0
                            {
3516
0
                                hts_log_warning("Extreme FORMAT/%s value encountered and set to missing at %s:%"PRIhts_pos,
3517
0
                                                h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname_safe(h,v), v->pos+1);
3518
0
                                extreme_val_warned = 1;
3519
0
                            }
3520
0
                            tmp_val = bcf_int32_missing;
3521
0
                        }
3522
0
                        x[l++] = tmp_val;
3523
0
                        t = te;
3524
0
                    }
3525
0
                    if (*t != ',') break;
3526
0
                }
3527
0
                if ( !l )
3528
0
                    x[l++] = bcf_int32_missing;
3529
0
                for (; l < z->size>>2; ++l)
3530
0
                    x[l] = bcf_int32_vector_end;
3531
3532
0
            } else if (htype == BCF_HT_REAL) {
3533
                // One of more floating point values in an array
3534
0
                float *x = (float*)(z->buf + z->size * (size_t)m);
3535
0
                int l;
3536
0
                for (l = 0;; ++t) {
3537
0
                    if (*t == '.' && !isdigit_c(t[1])) {
3538
0
                        bcf_float_set_missing(x[l++]), ++t; // ++t to skip "."
3539
0
                    } else {
3540
0
                        int overflow = 0;
3541
0
                        char *te;
3542
0
                        float tmp_val = hts_str2dbl(t, &te, &overflow);
3543
0
                        if ( (te==t || overflow) && !extreme_val_warned )
3544
0
                        {
3545
0
                            hts_log_warning("Extreme FORMAT/%s value encountered at %s:%"PRIhts_pos, h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname(h,v), v->pos+1);
3546
0
                            extreme_val_warned = 1;
3547
0
                        }
3548
0
                        x[l++] = tmp_val;
3549
0
                        t = te;
3550
0
                    }
3551
0
                    if (*t != ',') break;
3552
0
                }
3553
0
                if ( !l )
3554
                    // An empty field, insert missing value
3555
0
                    bcf_float_set_missing(x[l++]);
3556
0
                for (; l < z->size>>2; ++l)
3557
0
                    bcf_float_set_vector_end(x[l]);
3558
0
            } else {
3559
0
                hts_log_error("Unknown FORMAT field type %d at %s:%"PRIhts_pos, htype, bcf_seqname_safe(h,v), v->pos+1);
3560
0
                v->errcode |= BCF_ERR_TAG_INVALID;
3561
0
                return -1;
3562
0
            }
3563
3564
43.2k
            if (*t == '\0') {
3565
29.3k
                break;
3566
29.3k
            }
3567
13.9k
            else if (*t == ':') {
3568
13.9k
                t++;
3569
13.9k
            }
3570
5
            else {
3571
5
                char buffer[8];
3572
5
                hts_log_error("Invalid character %s in '%s' FORMAT field at %s:%"PRIhts_pos"",
3573
5
                    hts_strprint(buffer, sizeof buffer, '\'', t, 1),
3574
5
                    h->id[BCF_DT_ID][z->key].key, bcf_seqname_safe(h,v), v->pos+1);
3575
5
                v->errcode |= BCF_ERR_CHAR;
3576
5
                return -1;
3577
5
            }
3578
43.2k
        }
3579
3580
        // fill end-of-vector values
3581
468k
        for (; j < v->n_fmt; ++j) {
3582
438k
            fmt_aux_t *z = &fmt[j];
3583
438k
            const int htype = z->y>>4&0xf;
3584
438k
            int l;
3585
3586
438k
            if (z->size == -1) // this field is to be ignored
3587
369k
                continue;
3588
3589
69.7k
            if (htype == BCF_HT_STR) {
3590
69.7k
                if (z->is_gt) {
3591
10.9k
                    int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
3592
10.9k
                    if (z->size) x[0] = bcf_int32_missing;
3593
19.2k
                    for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
3594
58.7k
                } else {
3595
58.7k
                    char *x = (char*)z->buf + z->size * (size_t)m;
3596
58.7k
                    if ( z->size ) {
3597
9.68k
                        x[0] = '.';
3598
9.68k
                        memset(&x[1], 0, (z->size-1) * sizeof(*x));
3599
9.68k
                    }
3600
58.7k
                }
3601
69.7k
            } else if (htype == BCF_HT_INT) {
3602
0
                int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
3603
0
                x[0] = bcf_int32_missing;
3604
0
                for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
3605
0
            } else if (htype == BCF_HT_REAL) {
3606
0
                float *x = (float*)(z->buf + z->size * (size_t)m);
3607
0
                bcf_float_set_missing(x[0]);
3608
0
                for (l = 1; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]);
3609
0
            }
3610
69.7k
        }
3611
3612
29.6k
        m++; t++;
3613
29.6k
    }
3614
3615
13.7k
    return 0;
3616
13.8k
}
3617
3618
// write individual genotype information
3619
static int vcf_parse_format_gt6(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3620
13.7k
                                const char *p, const char *q, fmt_aux_t *fmt) {
3621
13.7k
    kstring_t *str = &v->indiv;
3622
13.7k
    int i, need_downsize = 0;
3623
13.7k
    if (v->n_sample > 0) {
3624
75.8k
        for (i = 0; i < v->n_fmt; ++i) {
3625
62.1k
            fmt_aux_t *z = &fmt[i];
3626
62.1k
            if ( z->size==-1 ) {
3627
31.9k
                need_downsize = 1;
3628
31.9k
                continue;
3629
31.9k
            }
3630
30.1k
            bcf_enc_int1(str, z->key);
3631
30.1k
            if ((z->y>>4&0xf) == BCF_HT_STR && !z->is_gt) {
3632
23.9k
                bcf_enc_size(str, z->size, BCF_BT_CHAR);
3633
23.9k
                kputsn((char*)z->buf, z->size * (size_t)v->n_sample, str);
3634
23.9k
            } else if ((z->y>>4&0xf) == BCF_HT_INT || z->is_gt) {
3635
6.24k
                bcf_enc_vint(str, (z->size>>2) * v->n_sample, (int32_t*)z->buf, z->size>>2);
3636
6.24k
            } else {
3637
0
                bcf_enc_size(str, z->size>>2, BCF_BT_FLOAT);
3638
0
                if (serialize_float_array(str, (z->size>>2) * (size_t)v->n_sample,
3639
0
                                          (float *) z->buf) != 0) {
3640
0
                    v->errcode |= BCF_ERR_LIMITS;
3641
0
                    hts_log_error("Out of memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3642
0
                    return -1;
3643
0
                }
3644
0
            }
3645
30.1k
        }
3646
3647
13.7k
    }
3648
13.7k
    if ( need_downsize ) {
3649
6.58k
        i = 0;
3650
56.8k
        while ( i < v->n_fmt ) {
3651
50.2k
            if ( fmt[i].size==-1 )
3652
31.9k
            {
3653
31.9k
                v->n_fmt--;
3654
31.9k
                if ( i < v->n_fmt ) memmove(&fmt[i],&fmt[i+1],sizeof(*fmt)*(v->n_fmt-i));
3655
31.9k
            }
3656
18.3k
            else
3657
18.3k
                i++;
3658
50.2k
        }
3659
6.58k
    }
3660
13.7k
    return 0;
3661
13.7k
}
3662
3663
// validity checking
3664
13.7k
static int vcf_parse_format_check7(const bcf_hdr_t *h, bcf1_t *v) {
3665
13.7k
    if ( v->n_sample!=bcf_hdr_nsamples(h) )
3666
33
    {
3667
33
        hts_log_error("Number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)",
3668
33
            bcf_seqname_safe(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h));
3669
33
        v->errcode |= BCF_ERR_NCOLS;
3670
33
        return -1;
3671
33
    }
3672
13.7k
    if ( v->indiv.l > 0xffffffff )
3673
0
    {
3674
0
        hts_log_error("The FORMAT at %s:%"PRIhts_pos" is too long", bcf_seqname_safe(h,v), v->pos+1);
3675
0
        v->errcode |= BCF_ERR_LIMITS;
3676
3677
        // Error recovery: return -1 if this is a critical error or 0 if we want to ignore the FORMAT and proceed
3678
0
        v->n_fmt = 0;
3679
0
        return -1;
3680
0
    }
3681
3682
13.7k
    return 0;
3683
13.7k
}
3684
3685
// p,q is the start and the end of the FORMAT field
3686
static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3687
                            char *p, char *q)
3688
48.3k
{
3689
48.3k
    if ( !bcf_hdr_nsamples(h) ) return 0;
3690
14.0k
    kstring_t *mem = (kstring_t*)&h->mem;
3691
14.0k
    mem->l = 0;
3692
3693
14.0k
    fmt_aux_t fmt[MAX_N_FMT];
3694
3695
    // detect FORMAT "."
3696
14.0k
    int ret; // +ve = ok, -ve = err
3697
14.0k
    if ((ret = vcf_parse_format_empty1(s, h, v, p, q)))
3698
125
        return ret ? 0 : -1;
3699
3700
    // get format information from the dictionary
3701
13.8k
    if (vcf_parse_format_dict2(s, h, v, p, q, fmt) < 0)
3702
18
        return -1;
3703
3704
    // FORMAT data is per-sample A:B:C A:B:C A:B:C ... but in memory it is
3705
    // stored as per-type arrays AAA... BBB... CCC...  This is basically
3706
    // a data rotation or pivot.
3707
3708
    // The size of elements in the array grow to their maximum needed,
3709
    // permitting fast random access.  This means however we have to first
3710
    // scan the whole FORMAT line to find the maximum of each type, and
3711
    // then scan it again to find the store the data.
3712
    // We break this down into compute-max, allocate, fill-out-buffers
3713
3714
    // TODO: ?
3715
    // The alternative would be to pivot on the first pass, with fixed
3716
    // size entries for numerics and concatenated strings otherwise, also
3717
    // tracking maximum sizes.  Then on a second pass we reallocate and
3718
    // copy the data again to a uniformly sized array.  Two passes through
3719
    // memory, but without doubling string parsing.
3720
3721
    // compute max
3722
13.8k
    if (vcf_parse_format_max3(s, h, v, p, q, fmt) < 0)
3723
28
        return -1;
3724
3725
    // allocate memory for arrays
3726
13.8k
    if (vcf_parse_format_alloc4(s, h, v, p, q, fmt) < 0)
3727
0
        return -1;
3728
3729
    // fill the sample fields; at beginning of the loop
3730
13.8k
    if (vcf_parse_format_fill5(s, h, v, p, q, fmt) < 0)
3731
44
        return -1;
3732
3733
    // write individual genotype information
3734
13.7k
    if (vcf_parse_format_gt6(s, h, v, p, q, fmt) < 0)
3735
0
        return -1;
3736
3737
    // validity checking
3738
13.7k
    if (vcf_parse_format_check7(h, v) < 0)
3739
33
        return -1;
3740
3741
13.7k
    return 0;
3742
13.7k
}
3743
3744
3.11k
static khint_t fix_chromosome(const bcf_hdr_t *h, vdict_t *d, const char *p) {
3745
    // Simple error recovery for chromosomes not defined in the header. It will not help when VCF header has
3746
    // been already printed, but will enable tools like vcfcheck to proceed.
3747
3748
3.11k
    kstring_t tmp = {0,0,0};
3749
3.11k
    khint_t k;
3750
3.11k
    int l;
3751
3.11k
    if (ksprintf(&tmp, "##contig=<ID=%s>", p) < 0)
3752
0
        return kh_end(d);
3753
3.11k
    bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
3754
3.11k
    free(tmp.s);
3755
3.11k
    int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
3756
3.11k
    if (res < 0) bcf_hrec_destroy(hrec);
3757
3.11k
    if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
3758
3.11k
    k = kh_get(vdict, d, p);
3759
3760
3.11k
    return k;
3761
3.11k
}
3762
3763
50.8k
static int vcf_parse_filter(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) {
3764
50.8k
    int i, n_flt = 1, max_n_flt = 0;
3765
50.8k
    char *r, *t;
3766
50.8k
    int32_t *a_flt = NULL;
3767
50.8k
    ks_tokaux_t aux1;
3768
50.8k
    khint_t k;
3769
50.8k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
3770
    // count the number of filters
3771
50.8k
    if (*(q-1) == ';') *(q-1) = 0;
3772
437M
    for (r = p; *r; ++r)
3773
437M
        if (*r == ';') ++n_flt;
3774
50.8k
    if (n_flt > max_n_flt) {
3775
50.8k
        a_flt = malloc(n_flt * sizeof(*a_flt));
3776
50.8k
        if (!a_flt) {
3777
0
            hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3778
0
            v->errcode |= BCF_ERR_LIMITS; // No appropriate code?
3779
0
            return -1;
3780
0
        }
3781
50.8k
        max_n_flt = n_flt;
3782
50.8k
    }
3783
    // add filters
3784
2.10M
    for (t = kstrtok(p, ";", &aux1), i = 0; t; t = kstrtok(0, 0, &aux1)) {
3785
2.05M
        *(char*)aux1.p = 0;
3786
2.05M
        k = kh_get(vdict, d, t);
3787
2.05M
        if (k == kh_end(d))
3788
45.9k
        {
3789
            // Simple error recovery for FILTERs not defined in the header. It will not help when VCF header has
3790
            // been already printed, but will enable tools like vcfcheck to proceed.
3791
45.9k
            hts_log_warning("FILTER '%s' is not defined in the header", t);
3792
45.9k
            kstring_t tmp = {0,0,0};
3793
45.9k
            int l;
3794
45.9k
            ksprintf(&tmp, "##FILTER=<ID=%s,Description=\"Dummy\">", t);
3795
45.9k
            bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
3796
45.9k
            free(tmp.s);
3797
45.9k
            int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
3798
45.9k
            if (res < 0) bcf_hrec_destroy(hrec);
3799
45.9k
            if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
3800
45.9k
            k = kh_get(vdict, d, t);
3801
45.9k
            v->errcode |= BCF_ERR_TAG_UNDEF;
3802
45.9k
            if (res || k == kh_end(d)) {
3803
46
                hts_log_error("Could not add dummy header for FILTER '%s' at %s:%"PRIhts_pos, t, bcf_seqname_safe(h,v), v->pos+1);
3804
46
                v->errcode |= BCF_ERR_TAG_INVALID;
3805
46
                free(a_flt);
3806
46
                return -1;
3807
46
            }
3808
45.9k
        }
3809
2.05M
        a_flt[i++] = kh_val(d, k).id;
3810
2.05M
    }
3811
3812
50.8k
    bcf_enc_vint(str, n_flt, a_flt, -1);
3813
50.8k
    free(a_flt);
3814
3815
50.8k
    return 0;
3816
50.8k
}
3817
3818
54.6k
static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) {
3819
54.6k
    static int extreme_int_warned = 0, negative_rlen_warned = 0;
3820
54.6k
    int max_n_val = 0, overflow = 0;
3821
54.6k
    char *r, *key;
3822
54.6k
    khint_t k;
3823
54.6k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
3824
54.6k
    int32_t *a_val = NULL;
3825
3826
54.6k
    v->n_info = 0;
3827
54.6k
    if (*(q-1) == ';') *(q-1) = 0;
3828
4.13M
    for (r = key = p;; ++r) {
3829
4.13M
        int c;
3830
4.13M
        char *val, *end;
3831
375M
        while (*r > '=' || (*r != ';' && *r != '=' && *r != 0)) r++;
3832
4.13M
        if (v->n_info == UINT16_MAX) {
3833
4
            hts_log_error("Too many INFO entries at %s:%"PRIhts_pos,
3834
4
                          bcf_seqname_safe(h,v), v->pos+1);
3835
4
            v->errcode |= BCF_ERR_LIMITS;
3836
4
            goto fail;
3837
4
        }
3838
4.13M
        val = end = NULL;
3839
4.13M
        c = *r; *r = 0;
3840
4.13M
        if (c == '=') {
3841
1.40M
            val = r + 1;
3842
3843
342M
            for (end = val; *end != ';' && *end != 0; ++end);
3844
1.40M
            c = *end; *end = 0;
3845
2.72M
        } else end = r;
3846
4.13M
        if ( !*key ) { if (c==0) break; r = end; key = r + 1; continue; }  // faulty VCF, ";;" in the INFO
3847
4.06M
        k = kh_get(vdict, d, key);
3848
4.06M
        if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_INFO] == 15)
3849
44.0k
        {
3850
44.0k
            hts_log_warning("INFO '%s' is not defined in the header, assuming Type=String", key);
3851
44.0k
            kstring_t tmp = {0,0,0};
3852
44.0k
            int l;
3853
44.0k
            ksprintf(&tmp, "##INFO=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", key);
3854
44.0k
            bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
3855
44.0k
            free(tmp.s);
3856
44.0k
            int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
3857
44.0k
            if (res < 0) bcf_hrec_destroy(hrec);
3858
44.0k
            if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
3859
44.0k
            k = kh_get(vdict, d, key);
3860
44.0k
            v->errcode |= BCF_ERR_TAG_UNDEF;
3861
44.0k
            if (res || k == kh_end(d)) {
3862
75
                hts_log_error("Could not add dummy header for INFO '%s' at %s:%"PRIhts_pos, key, bcf_seqname_safe(h,v), v->pos+1);
3863
75
                v->errcode |= BCF_ERR_TAG_INVALID;
3864
75
                goto fail;
3865
75
            }
3866
44.0k
        }
3867
4.06M
        uint32_t y = kh_val(d, k).info[BCF_HL_INFO];
3868
4.06M
        ++v->n_info;
3869
4.06M
        bcf_enc_int1(str, kh_val(d, k).id);
3870
4.06M
        if (val == 0) {
3871
2.66M
            bcf_enc_size(str, 0, BCF_BT_NULL);
3872
2.66M
        } else if ((y>>4&0xf) == BCF_HT_FLAG || (y>>4&0xf) == BCF_HT_STR) { // if Flag has a value, treat it as a string
3873
118k
            bcf_enc_vchar(str, end - val, val);
3874
1.28M
        } else { // int/float value/array
3875
1.28M
            int i, n_val;
3876
1.28M
            char *t, *te;
3877
300M
            for (t = val, n_val = 1; *t; ++t) // count the number of values
3878
299M
                if (*t == ',') ++n_val;
3879
            // Check both int and float size in one step for simplicity
3880
1.28M
            if (n_val > max_n_val) {
3881
7.37k
                int32_t *a_tmp = (int32_t *)realloc(a_val, n_val * sizeof(*a_val));
3882
7.37k
                if (!a_tmp) {
3883
0
                    hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3884
0
                    v->errcode |= BCF_ERR_LIMITS; // No appropriate code?
3885
0
                    goto fail;
3886
0
                }
3887
7.37k
                a_val = a_tmp;
3888
7.37k
                max_n_val = n_val;
3889
7.37k
            }
3890
1.28M
            if ((y>>4&0xf) == BCF_HT_INT) {
3891
756k
                i = 0, t = val;
3892
756k
                int64_t val1;
3893
756k
                int is_int64 = 0;
3894
#ifdef VCF_ALLOW_INT64
3895
                if ( n_val==1 )
3896
                {
3897
                    overflow = 0;
3898
                    long long int tmp_val = hts_str2int(val, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
3899
                    if ( te==val ) tmp_val = bcf_int32_missing;
3900
                    else if ( overflow || tmp_val<BCF_MIN_BT_INT64 || tmp_val>BCF_MAX_BT_INT64 )
3901
                    {
3902
                        if ( !extreme_int_warned )
3903
                        {
3904
                            hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1);
3905
                            extreme_int_warned = 1;
3906
                        }
3907
                        tmp_val = bcf_int32_missing;
3908
                    }
3909
                    else
3910
                        is_int64 = 1;
3911
                    val1 = tmp_val;
3912
                    t = te;
3913
                    i = 1;  // this is just to avoid adding another nested block...
3914
                }
3915
#endif
3916
110M
                for (; i < n_val; ++i, ++t)
3917
110M
                {
3918
110M
                    overflow = 0;
3919
110M
                    long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
3920
110M
                    if ( te==t ) tmp_val = bcf_int32_missing;
3921
1.42M
                    else if ( overflow || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 )
3922
186k
                    {
3923
186k
                        if ( !extreme_int_warned )
3924
1
                        {
3925
1
                            hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1);
3926
1
                            extreme_int_warned = 1;
3927
1
                        }
3928
186k
                        tmp_val = bcf_int32_missing;
3929
186k
                    }
3930
110M
                    a_val[i] = tmp_val;
3931
150M
                    for (t = te; *t && *t != ','; t++);
3932
110M
                }
3933
756k
                if (n_val == 1) {
3934
#ifdef VCF_ALLOW_INT64
3935
                    if ( is_int64 )
3936
                    {
3937
                        v->unpacked |= BCF_IS_64BIT;
3938
                        bcf_enc_long1(str, val1);
3939
                    }
3940
                    else
3941
                        bcf_enc_int1(str, (int32_t)val1);
3942
#else
3943
606k
                    val1 = a_val[0];
3944
606k
                    bcf_enc_int1(str, (int32_t)val1);
3945
606k
#endif
3946
606k
                } else {
3947
150k
                    bcf_enc_vint(str, n_val, a_val, -1);
3948
150k
                }
3949
756k
                if (n_val==1 && (val1!=bcf_int32_missing || is_int64)
3950
497k
                    && memcmp(key, "END", 4) == 0)
3951
0
                {
3952
0
                    if ( val1 <= v->pos )
3953
0
                    {
3954
0
                        if ( !negative_rlen_warned )
3955
0
                        {
3956
0
                            hts_log_warning("INFO/END=%"PRIhts_pos" is smaller than POS at %s:%"PRIhts_pos,val1,bcf_seqname_safe(h,v),v->pos+1);
3957
0
                            negative_rlen_warned = 1;
3958
0
                        }
3959
0
                    }
3960
0
                }
3961
756k
            } else if ((y>>4&0xf) == BCF_HT_REAL) {
3962
530k
                float *val_f = (float *)a_val;
3963
103M
                for (i = 0, t = val; i < n_val; ++i, ++t)
3964
103M
                {
3965
103M
                    overflow = 0;
3966
103M
                    val_f[i] = hts_str2dbl(t, &te, &overflow);
3967
103M
                    if ( te==t || overflow ) // conversion failed
3968
101M
                        bcf_float_set_missing(val_f[i]);
3969
136M
                    for (t = te; *t && *t != ','; t++);
3970
103M
                }
3971
530k
                bcf_enc_vfloat(str, n_val, val_f);
3972
530k
            }
3973
1.28M
        }
3974
4.06M
        if (c == 0) break;
3975
4.03M
        r = end;
3976
4.03M
        key = r + 1;
3977
4.03M
    }
3978
3979
54.5k
    free(a_val);
3980
54.5k
    return 0;
3981
3982
79
 fail:
3983
79
    free(a_val);
3984
79
    return -1;
3985
54.6k
}
3986
3987
int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
3988
55.6k
{
3989
55.6k
    int ret = -2, overflow = 0;
3990
55.6k
    char *p, *q, *r, *t;
3991
55.6k
    kstring_t *str;
3992
55.6k
    khint_t k;
3993
55.6k
    ks_tokaux_t aux;
3994
3995
//#define NOT_DOT(p) strcmp((p), ".")
3996
//#define NOT_DOT(p) (!(*p == '.' && !p[1]))
3997
//#define NOT_DOT(p) ((*p) != '.' || (p)[1])
3998
//#define NOT_DOT(p) (q-p != 1 || memcmp(p, ".\0", 2))
3999
275k
#define NOT_DOT(p) (memcmp(p, ".\0", 2))
4000
4001
55.6k
    if (!s || !h || !v || !(s->s))
4002
0
        return ret;
4003
4004
    // Assumed in lots of places, but we may as well spot this early
4005
55.6k
    assert(sizeof(float) == sizeof(int32_t));
4006
4007
    // Ensure string we parse has space to permit some over-flow when during
4008
    // parsing.  Eg to do memcmp(key, "END", 4) in vcf_parse_info over
4009
    // the more straight forward looking strcmp, giving a speed advantage.
4010
55.6k
    if (ks_resize(s, s->l+4) < 0)
4011
0
        return -2;
4012
4013
    // Force our memory to be initialised so we avoid the technicality of
4014
    // undefined behaviour in using a 4-byte memcmp.  (The reality is this
4015
    // almost certainly is never detected by the compiler so has no impact,
4016
    // but equally so this code has minimal (often beneficial) impact on
4017
    // performance too.)
4018
55.6k
    s->s[s->l+0] = 0;
4019
55.6k
    s->s[s->l+1] = 0;
4020
55.6k
    s->s[s->l+2] = 0;
4021
55.6k
    s->s[s->l+3] = 0;
4022
4023
55.6k
    bcf_clear1(v);
4024
55.6k
    str = &v->shared;
4025
55.6k
    memset(&aux, 0, sizeof(ks_tokaux_t));
4026
4027
    // CHROM
4028
55.6k
    if (!(p = kstrtok(s->s, "\t", &aux)))
4029
0
        goto err;
4030
55.6k
    *(q = (char*)aux.p) = 0;
4031
4032
55.6k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
4033
55.6k
    k = kh_get(vdict, d, p);
4034
55.6k
    if (k == kh_end(d)) {
4035
3.11k
        hts_log_warning("Contig '%s' is not defined in the header. (Quick workaround: index the file with tabix.)", p);
4036
3.11k
        v->errcode = BCF_ERR_CTG_UNDEF;
4037
3.11k
        if ((k = fix_chromosome(h, d, p)) == kh_end(d)) {
4038
46
            hts_log_error("Could not add dummy header for contig '%s'", p);
4039
46
            v->errcode |= BCF_ERR_CTG_INVALID;
4040
46
            goto err;
4041
46
        }
4042
3.11k
    }
4043
55.5k
    v->rid = kh_val(d, k).id;
4044
4045
    // POS
4046
55.5k
    if (!(p = kstrtok(0, 0, &aux)))
4047
303
        goto err;
4048
55.2k
    *(q = (char*)aux.p) = 0;
4049
4050
55.2k
    overflow = 0;
4051
55.2k
    char *tmp = p;
4052
55.2k
    v->pos = hts_str2uint(p, &p, 62, &overflow);
4053
55.2k
    if (overflow) {
4054
2
        hts_log_error("Position value '%s' is too large", tmp);
4055
2
        goto err;
4056
55.2k
    } else if ( *p ) {
4057
54
        hts_log_error("Could not parse the position '%s'", tmp);
4058
54
        goto err;
4059
55.2k
    } else {
4060
55.2k
        v->pos -= 1;
4061
55.2k
    }
4062
55.2k
    if (v->pos >= INT32_MAX)
4063
1.31k
        v->unpacked |= BCF_IS_64BIT;
4064
4065
    // ID
4066
55.2k
    if (!(p = kstrtok(0, 0, &aux)))
4067
6
        goto err;
4068
55.2k
    *(q = (char*)aux.p) = 0;
4069
4070
55.2k
    if (NOT_DOT(p)) bcf_enc_vchar(str, q - p, p);
4071
574
    else bcf_enc_size(str, 0, BCF_BT_CHAR);
4072
4073
    // REF
4074
55.2k
    if (!(p = kstrtok(0, 0, &aux)))
4075
24
        goto err;
4076
55.1k
    *(q = (char*)aux.p) = 0;
4077
4078
55.1k
    bcf_enc_vchar(str, q - p, p);
4079
55.1k
    v->n_allele = 1, v->rlen = q - p;
4080
4081
    // ALT
4082
55.1k
    if (!(p = kstrtok(0, 0, &aux)))
4083
4
        goto err;
4084
55.1k
    *(q = (char*)aux.p) = 0;
4085
4086
55.1k
    if (NOT_DOT(p)) {
4087
64.8M
        for (r = t = p;; ++r) {
4088
64.8M
            if (*r == ',' || *r == 0) {
4089
3.91M
                if (v->n_allele == UINT16_MAX) {
4090
3
                    hts_log_error("Too many ALT alleles at %s:%"PRIhts_pos,
4091
3
                                  bcf_seqname_safe(h,v), v->pos+1);
4092
3
                    v->errcode |= BCF_ERR_LIMITS;
4093
3
                    goto err;
4094
3
                }
4095
3.91M
                bcf_enc_vchar(str, r - t, t);
4096
3.91M
                t = r + 1;
4097
3.91M
                ++v->n_allele;
4098
3.91M
            }
4099
64.8M
            if (r == q) break;
4100
64.8M
        }
4101
52.3k
    }
4102
4103
    // QUAL
4104
55.1k
    if (!(p = kstrtok(0, 0, &aux)))
4105
34
        goto err;
4106
55.1k
    *(q = (char*)aux.p) = 0;
4107
4108
55.1k
    if (NOT_DOT(p)) v->qual = atof(p);
4109
2.46k
    else bcf_float_set_missing(v->qual);
4110
55.1k
    if ( v->max_unpack && !(v->max_unpack>>1) ) goto end; // BCF_UN_STR
4111
4112
    // FILTER
4113
55.1k
    if (!(p = kstrtok(0, 0, &aux)))
4114
36
        goto err;
4115
55.1k
    *(q = (char*)aux.p) = 0;
4116
4117
55.1k
    if (NOT_DOT(p)) {
4118
50.8k
        if (vcf_parse_filter(str, h, v, p, q)) {
4119
46
            goto err;
4120
46
        }
4121
50.8k
    } else bcf_enc_vint(str, 0, 0, -1);
4122
55.0k
    if ( v->max_unpack && !(v->max_unpack>>2) ) goto end; // BCF_UN_FLT
4123
4124
    // INFO
4125
55.0k
    if (!(p = kstrtok(0, 0, &aux)))
4126
55
        goto err;
4127
55.0k
    *(q = (char*)aux.p) = 0;
4128
4129
55.0k
    if (NOT_DOT(p)) {
4130
54.6k
        if (vcf_parse_info(str, h, v, p, q)) {
4131
79
            goto err;
4132
79
        }
4133
54.6k
    }
4134
54.9k
    if ( v->max_unpack && !(v->max_unpack>>3) ) goto end;
4135
4136
    // FORMAT; optional
4137
54.9k
    p = kstrtok(0, 0, &aux);
4138
54.9k
    if (p) {
4139
48.3k
        *(q = (char*)aux.p) = 0;
4140
4141
48.3k
        if (vcf_parse_format(s, h, v, p, q)) {
4142
123
            goto err;
4143
123
        }
4144
48.3k
    }
4145
4146
54.8k
 end:
4147
54.8k
    v->rlen = get_rlen(h, v);    //set rlen based on version
4148
54.8k
    ret = 0;
4149
4150
55.6k
 err:
4151
55.6k
    return ret;
4152
54.8k
}
4153
4154
int vcf_open_mode(char *mode, const char *fn, const char *format)
4155
0
{
4156
0
    if (format == NULL) {
4157
        // Try to pick a format based on the filename extension
4158
0
        char extension[HTS_MAX_EXT_LEN];
4159
0
        if (find_file_extension(fn, extension) < 0) return -1;
4160
0
        return vcf_open_mode(mode, fn, extension);
4161
0
    }
4162
0
    else if (strcasecmp(format, "bcf") == 0) strcpy(mode, "b");
4163
0
    else if (strcasecmp(format, "vcf") == 0) strcpy(mode, "");
4164
0
    else if (strcasecmp(format, "vcf.gz") == 0 || strcasecmp(format, "vcf.bgz") == 0) strcpy(mode, "z");
4165
0
    else return -1;
4166
4167
0
    return 0;
4168
0
}
4169
4170
int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
4171
56.0k
{
4172
56.0k
    int ret;
4173
56.0k
    ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
4174
56.0k
    if (ret < 0) return ret;
4175
55.6k
    return vcf_parse1(&fp->line, h, v);
4176
56.0k
}
4177
4178
static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt)
4179
0
{
4180
0
    uint8_t *ptr_start = ptr;
4181
0
    fmt->id = bcf_dec_typed_int1(ptr, &ptr);
4182
0
    fmt->n = bcf_dec_size(ptr, &ptr, &fmt->type);
4183
0
    fmt->size = fmt->n << bcf_type_shift[fmt->type];
4184
0
    fmt->p = ptr;
4185
0
    fmt->p_off  = ptr - ptr_start;
4186
0
    fmt->p_free = 0;
4187
0
    ptr += n_sample * fmt->size;
4188
0
    fmt->p_len = ptr - fmt->p;
4189
0
    return ptr;
4190
0
}
4191
4192
static inline uint8_t *bcf_unpack_info_core1(uint8_t *ptr, bcf_info_t *info)
4193
2.43k
{
4194
2.43k
    uint8_t *ptr_start = ptr;
4195
2.43k
    int64_t len = 0;
4196
2.43k
    info->key = bcf_dec_typed_int1(ptr, &ptr);
4197
2.43k
    len = info->len = bcf_dec_size(ptr, &ptr, &info->type);
4198
2.43k
    info->vptr = ptr;
4199
2.43k
    info->vptr_off  = ptr - ptr_start;
4200
2.43k
    info->vptr_free = 0;
4201
2.43k
    info->v1.i = 0;
4202
2.43k
    if (info->len == 1) {
4203
73
        switch(info->type) {
4204
0
        case BCF_BT_INT8:
4205
73
        case BCF_BT_CHAR:
4206
73
            info->v1.i = *(int8_t*)ptr;
4207
73
            break;
4208
0
        case BCF_BT_INT16:
4209
0
            info->v1.i = le_to_i16(ptr);
4210
0
            len <<= 1;
4211
0
            break;
4212
0
        case BCF_BT_INT32:
4213
0
            info->v1.i = le_to_i32(ptr);
4214
0
            len <<= 2;
4215
0
            break;
4216
0
        case BCF_BT_FLOAT:
4217
0
            info->v1.f = le_to_float(ptr);
4218
0
            len <<= 2;
4219
0
            break;
4220
0
        case BCF_BT_INT64:
4221
0
            info->v1.i = le_to_i64(ptr);
4222
0
            len <<= 3;
4223
0
            break;
4224
73
        }
4225
2.36k
    } else {
4226
2.36k
        len <<= bcf_type_shift[info->type];
4227
2.36k
    }
4228
2.43k
    ptr += len;
4229
4230
2.43k
    info->vptr_len = ptr - info->vptr;
4231
2.43k
    return ptr;
4232
2.43k
}
4233
4234
int bcf_unpack(bcf1_t *b, int which)
4235
53.7k
{
4236
53.7k
    if ( !b->shared.l ) return 0; // Building a new BCF record from scratch
4237
53.7k
    uint8_t *ptr = (uint8_t*)b->shared.s, *ptr_ori;
4238
53.7k
    int i;
4239
53.7k
    bcf_dec_t *d = &b->d;
4240
53.7k
    if (which & BCF_UN_FLT) which |= BCF_UN_STR;
4241
53.7k
    if (which & BCF_UN_INFO) which |= BCF_UN_SHR;
4242
53.7k
    if ((which&BCF_UN_STR) && !(b->unpacked&BCF_UN_STR))
4243
53.7k
    {
4244
53.7k
        kstring_t tmp;
4245
4246
        // ID
4247
53.7k
        tmp.l = 0; tmp.s = d->id; tmp.m = d->m_id;
4248
53.7k
        ptr_ori = ptr;
4249
53.7k
        ptr = bcf_fmt_sized_array(&tmp, ptr);
4250
53.7k
        b->unpack_size[0] = ptr - ptr_ori;
4251
53.7k
        kputc_('\0', &tmp);
4252
53.7k
        d->id = tmp.s; d->m_id = tmp.m;
4253
4254
        // REF and ALT are in a single block (d->als) and d->alleles are pointers into this block
4255
53.7k
        hts_expand(char*, b->n_allele, d->m_allele, d->allele); // NM: hts_expand() is a macro
4256
53.7k
        tmp.l = 0; tmp.s = d->als; tmp.m = d->m_als;
4257
53.7k
        ptr_ori = ptr;
4258
2.32M
        for (i = 0; i < b->n_allele; ++i) {
4259
            // Use offset within tmp.s as realloc may change pointer
4260
2.26M
            d->allele[i] = (char *)(intptr_t)tmp.l;
4261
2.26M
            ptr = bcf_fmt_sized_array(&tmp, ptr);
4262
2.26M
            kputc_('\0', &tmp);
4263
2.26M
        }
4264
53.7k
        b->unpack_size[1] = ptr - ptr_ori;
4265
53.7k
        d->als = tmp.s; d->m_als = tmp.m;
4266
4267
        // Convert our offsets within tmp.s back to pointers again
4268
2.32M
        for (i = 0; i < b->n_allele; ++i)
4269
2.26M
            d->allele[i] = d->als + (ptrdiff_t)d->allele[i];
4270
53.7k
        b->unpacked |= BCF_UN_STR;
4271
53.7k
    }
4272
53.7k
    if ((which&BCF_UN_FLT) && !(b->unpacked&BCF_UN_FLT)) { // FILTER
4273
53.7k
        ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1];
4274
53.7k
        ptr_ori = ptr;
4275
53.7k
        if (*ptr>>4) {
4276
49.5k
            int type;
4277
49.5k
            d->n_flt = bcf_dec_size(ptr, &ptr, &type);
4278
49.5k
            hts_expand(int, d->n_flt, d->m_flt, d->flt);
4279
786k
            for (i = 0; i < d->n_flt; ++i)
4280
736k
                d->flt[i] = bcf_dec_int1(ptr, type, &ptr);
4281
49.5k
        } else ++ptr, d->n_flt = 0;
4282
53.7k
        b->unpack_size[2] = ptr - ptr_ori;
4283
53.7k
        b->unpacked |= BCF_UN_FLT;
4284
53.7k
    }
4285
53.7k
    if ((which&BCF_UN_INFO) && !(b->unpacked&BCF_UN_INFO)) { // INFO
4286
0
        ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1] + b->unpack_size[2];
4287
0
        hts_expand(bcf_info_t, b->n_info, d->m_info, d->info);
4288
0
        for (i = 0; i < d->m_info; ++i) d->info[i].vptr_free = 0;
4289
0
        for (i = 0; i < b->n_info; ++i)
4290
0
            ptr = bcf_unpack_info_core1(ptr, &d->info[i]);
4291
0
        b->unpacked |= BCF_UN_INFO;
4292
0
    }
4293
53.7k
    if ((which&BCF_UN_FMT) && b->n_sample && !(b->unpacked&BCF_UN_FMT)) { // FORMAT
4294
0
        ptr = (uint8_t*)b->indiv.s;
4295
0
        hts_expand(bcf_fmt_t, b->n_fmt, d->m_fmt, d->fmt);
4296
0
        for (i = 0; i < d->m_fmt; ++i) d->fmt[i].p_free = 0;
4297
0
        for (i = 0; i < b->n_fmt; ++i)
4298
0
            ptr = bcf_unpack_fmt_core1(ptr, b->n_sample, &d->fmt[i]);
4299
0
        b->unpacked |= BCF_UN_FMT;
4300
0
    }
4301
53.7k
    return 0;
4302
53.7k
}
4303
4304
int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
4305
53.7k
{
4306
53.7k
    int i;
4307
53.7k
    int32_t max_dt_id = h->n[BCF_DT_ID];
4308
53.7k
    const char *chrom = bcf_seqname(h, v);
4309
53.7k
    if (!chrom) {
4310
0
        hts_log_error("Invalid BCF, CONTIG id=%d not present in the header",
4311
0
                      v->rid);
4312
0
        errno = EINVAL;
4313
0
        return -1;
4314
0
    }
4315
4316
53.7k
    bcf_unpack((bcf1_t*)v, BCF_UN_ALL & ~(BCF_UN_INFO|BCF_UN_FMT));
4317
4318
    // Cache of key lengths so we don't keep repeatedly using them.
4319
    // This assumes we're not modifying the header between successive calls
4320
    // to vcf_format, but that would lead to many other forms of breakage
4321
    // so it feels like a valid assumption to make.
4322
    //
4323
    // We cannot just do this in bcf_hdr_sync as some code (eg bcftools
4324
    // annotate) manipulates the headers directly without calling sync to
4325
    // refresh the data structures.  So we must do just-in-time length
4326
    // calculation during writes instead.
4327
53.7k
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
4328
53.7k
    if (!aux->key_len) {
4329
4.53k
        if (!(aux->key_len = calloc(h->n[BCF_DT_ID]+1, sizeof(*aux->key_len))))
4330
0
            return -1;
4331
4.53k
    }
4332
53.7k
    size_t *key_len = aux->key_len;
4333
4334
53.7k
    kputs(chrom, s); // CHROM
4335
53.7k
    kputc_('\t', s); kputll(v->pos + 1, s); // POS
4336
53.7k
    kputc_('\t', s); kputs(v->d.id ? v->d.id : ".", s); // ID
4337
53.7k
    kputc_('\t', s); // REF
4338
53.7k
    if (v->n_allele > 0) kputs(v->d.allele[0], s);
4339
0
    else kputc_('.', s);
4340
53.7k
    kputc_('\t', s); // ALT
4341
53.7k
    if (v->n_allele > 1) {
4342
2.26M
        for (i = 1; i < v->n_allele; ++i) {
4343
2.21M
            if (i > 1) kputc_(',', s);
4344
2.21M
            kputs(v->d.allele[i], s);
4345
2.21M
        }
4346
50.9k
    } else kputc_('.', s);
4347
53.7k
    kputc_('\t', s); // QUAL
4348
53.7k
    if ( bcf_float_is_missing(v->qual) ) kputc_('.', s); // QUAL
4349
51.3k
    else kputd(v->qual, s);
4350
53.7k
    kputc_('\t', s); // FILTER
4351
53.7k
    if (v->d.n_flt) {
4352
786k
        for (i = 0; i < v->d.n_flt; ++i) {
4353
736k
            int32_t idx = v->d.flt[i];
4354
736k
            if (idx < 0 || idx >= max_dt_id
4355
736k
                || h->id[BCF_DT_ID][idx].key == NULL) {
4356
0
                hts_log_error("Invalid BCF, the FILTER tag id=%d at %s:%"PRIhts_pos" not present in the header",
4357
0
                              idx, bcf_seqname_safe(h, v), v->pos + 1);
4358
0
                errno = EINVAL;
4359
0
                return -1;
4360
0
            }
4361
736k
            if (i) kputc_(';', s);
4362
736k
            if (!key_len[idx])
4363
127k
                key_len[idx] = strlen(h->id[BCF_DT_ID][idx].key);
4364
736k
            kputsn(h->id[BCF_DT_ID][idx].key, key_len[idx], s);
4365
736k
        }
4366
49.5k
    } else kputc_('.', s);
4367
4368
53.7k
    kputc_('\t', s); // INFO
4369
53.7k
    if (v->n_info) {
4370
27.8k
        uint8_t *ptr = v->shared.s
4371
27.8k
            ? (uint8_t *)v->shared.s + v->unpack_size[0] +
4372
27.8k
               v->unpack_size[1] + v->unpack_size[2]
4373
27.8k
            : NULL;
4374
27.8k
        int first = 1;
4375
27.8k
        bcf_info_t *info = v->d.info;
4376
4377
        // Note if we duplicate this code into custom packed and unpacked
4378
        // implementations then we gain a bit more speed, particularly with
4379
        // clang 13 (up to 5%).  Not sure why this is, but code duplication
4380
        // isn't pleasant and it's still faster adding packed support than
4381
        // not so it's a win, just not as good as it should be.
4382
27.8k
        const int info_packed = !(v->unpacked & BCF_UN_INFO) && v->shared.l;
4383
2.02M
        for (i = 0; i < v->n_info; ++i) {
4384
1.99M
            bcf_info_t in, *z;
4385
1.99M
            if (info_packed) {
4386
                // Use a local bcf_info_t when data is packed
4387
1.99M
                z = &in;
4388
1.99M
                z->key  = bcf_dec_typed_int1(ptr, &ptr);
4389
1.99M
                z->len  = bcf_dec_size(ptr, &ptr, &z->type);
4390
1.99M
                z->vptr = ptr;
4391
1.99M
                ptr += z->len << bcf_type_shift[z->type];
4392
1.99M
            } else {
4393
                // Else previously unpacked INFO struct
4394
0
                z = &info[i];
4395
4396
                // Also potentially since deleted
4397
0
                if ( !z->vptr ) continue;
4398
0
            }
4399
4400
1.99M
            bcf_idpair_t *id = z->key >= 0 && z->key < max_dt_id
4401
1.99M
                ? &h->id[BCF_DT_ID][z->key]
4402
1.99M
                : NULL;
4403
4404
1.99M
            if (!id || !id->key) {
4405
0
                hts_log_error("Invalid BCF, the INFO tag id=%d is %s at %s:%"PRIhts_pos,
4406
0
                              z->key,
4407
0
                              z->key < 0 ? "negative"
4408
0
                              : (z->key >= max_dt_id ? "too large" : "not present in the header"),
4409
0
                              bcf_seqname_safe(h, v), v->pos+1);
4410
0
                errno = EINVAL;
4411
0
                return -1;
4412
0
            }
4413
4414
            // KEY
4415
1.99M
            if (!key_len[z->key])
4416
32.2k
                key_len[z->key] = strlen(id->key);
4417
1.99M
            size_t id_len = key_len[z->key];
4418
1.99M
            if (ks_resize(s, s->l + 3 + id_len) < 0)
4419
0
                return -1;
4420
1.99M
            char *sptr = s->s + s->l;
4421
1.99M
            if ( !first ) {
4422
1.97M
                *sptr++ = ';';
4423
1.97M
                s->l++;
4424
1.97M
            }
4425
1.99M
            first = 0;
4426
1.99M
            memcpy(sptr, id->key, id_len);
4427
1.99M
            s->l += id_len;
4428
4429
            // VALUE
4430
1.99M
            if (z->len <= 0) continue;
4431
714k
            sptr[id_len] = '=';
4432
714k
            s->l++;
4433
4434
714k
            if (z->len != 1 || info_packed) {
4435
714k
                bcf_fmt_array(s, z->len, z->type, z->vptr);
4436
714k
            } else {
4437
                // Single length vectors are unpacked into their
4438
                // own info.v1 union and handled separately.
4439
0
                if (z->type == BCF_BT_FLOAT) {
4440
0
                    if ( bcf_float_is_missing(z->v1.f) )
4441
0
                        kputc_('.', s);
4442
0
                    else
4443
0
                        kputd(z->v1.f, s);
4444
0
                } else if (z->type == BCF_BT_CHAR) {
4445
0
                    kputc_(z->v1.i, s);
4446
0
                } else if (z->type < BCF_BT_INT64) {
4447
0
                    int64_t missing[] = {
4448
0
                        0, // BCF_BT_NULL
4449
0
                        bcf_int8_missing,
4450
0
                        bcf_int16_missing,
4451
0
                        bcf_int32_missing,
4452
0
                    };
4453
0
                    if (z->v1.i == missing[z->type])
4454
0
                        kputc_('.', s);
4455
0
                    else
4456
0
                        kputw(z->v1.i, s);
4457
0
                } else if (z->type == BCF_BT_INT64) {
4458
0
                    if (z->v1.i == bcf_int64_missing)
4459
0
                        kputc_('.', s);
4460
0
                    else
4461
0
                        kputll(z->v1.i, s);
4462
0
                } else {
4463
0
                    hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, z->type, bcf_seqname_safe(h, v), v->pos+1);
4464
0
                    errno = EINVAL;
4465
0
                    return -1;
4466
0
                }
4467
0
            }
4468
714k
        }
4469
27.8k
        if ( first ) kputc_('.', s);
4470
27.8k
    } else kputc_('.', s);
4471
4472
    // FORMAT and individual information
4473
53.7k
    if (v->n_sample) {
4474
13.5k
        int i,j;
4475
13.5k
        if ( v->n_fmt) {
4476
13.4k
            uint8_t *ptr = (uint8_t *)v->indiv.s;
4477
13.4k
            int gt_i = -1;
4478
13.4k
            bcf_fmt_t *fmt = v->d.fmt;
4479
13.4k
            int first = 1, ret = 0;
4480
13.4k
            int fmt_packed = !(v->unpacked & BCF_UN_FMT);
4481
4482
13.4k
            if (fmt_packed) {
4483
                // Local fmt as we have an array of num FORMAT keys,
4484
                // each of which points to N.Sample values.
4485
4486
                // No real gain to be had in handling unpacked data here,
4487
                // but it doesn't cost us much in complexity either and
4488
                // it gives us flexibility.
4489
13.4k
                fmt = malloc(v->n_fmt * sizeof(*fmt));
4490
13.4k
                if (!fmt)
4491
0
                    return -1;
4492
13.4k
            }
4493
4494
            // KEYS
4495
42.3k
            for (i = 0; i < (int)v->n_fmt; ++i) {
4496
28.8k
                bcf_fmt_t *z;
4497
28.8k
                z = &fmt[i];
4498
28.8k
                if (fmt_packed) {
4499
28.8k
                    z->id   = bcf_dec_typed_int1(ptr, &ptr);
4500
28.8k
                    z->n    = bcf_dec_size(ptr, &ptr, &z->type);
4501
28.8k
                    z->p    = ptr;
4502
28.8k
                    z->size = z->n << bcf_type_shift[z->type];
4503
28.8k
                    ptr += v->n_sample * z->size;
4504
28.8k
                }
4505
28.8k
                if ( !z->p ) continue;
4506
28.8k
                kputc_(!first ? ':' : '\t', s); first = 0;
4507
4508
28.8k
                bcf_idpair_t *id = z->id >= 0 && z->id < max_dt_id
4509
28.8k
                    ? &h->id[BCF_DT_ID][z->id]
4510
28.8k
                    : NULL;
4511
4512
28.8k
                if (!id || !id->key) {
4513
0
                    hts_log_error("Invalid BCF, the FORMAT tag id=%d at %s:%"PRIhts_pos" not present in the header", z->id, bcf_seqname_safe(h, v), v->pos+1);
4514
0
                    errno = EINVAL;
4515
0
                    if (fmt_packed)
4516
0
                        free(fmt);
4517
0
                    return -1;
4518
0
                }
4519
4520
28.8k
                if (!key_len[z->id])
4521
15.1k
                    key_len[z->id] = strlen(id->key);
4522
28.8k
                size_t id_len = key_len[z->id];
4523
28.8k
                kputsn(id->key, id_len, s);
4524
28.8k
                if (id_len == 2 && id->key[0] == 'G' && id->key[1] == 'T')
4525
6.09k
                    gt_i = i;
4526
28.8k
            }
4527
13.4k
            if ( first ) kputsn("\t.", 2, s);
4528
4529
            // VALUES per sample
4530
40.5k
            for (j = 0; j < v->n_sample; ++j) {
4531
27.0k
                kputc_('\t', s);
4532
27.0k
                first = 1;
4533
27.0k
                bcf_fmt_t *f = fmt;
4534
84.8k
                for (i = 0; i < (int)v->n_fmt; i++, f++) {
4535
73.1k
                    if ( !f->p ) continue;
4536
73.1k
                    if (!first) kputc_(':', s);
4537
73.1k
                    first = 0;
4538
73.1k
                    if (gt_i == i) {
4539
15.3k
                        if ((ret = bcf_format_gt_v2(h, f,j,s)) < 0) {
4540
0
                            hts_log_error("Failed to format GT value for sample %d, returned %d", i, ret);
4541
0
                            errno = EINVAL;
4542
0
                            if (fmt_packed)
4543
0
                                free(fmt);
4544
0
                            return -1;
4545
0
                        }
4546
15.3k
                        break;
4547
15.3k
                    }
4548
57.8k
                    else if (f->n == 1)
4549
5.95k
                        bcf_fmt_array1(s, f->type, f->p + j * (size_t)f->size);
4550
51.8k
                    else
4551
51.8k
                        bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size);
4552
73.1k
                }
4553
4554
                // Simpler loop post GT and at least 1 iteration
4555
41.9k
                for (i++, f++; i < (int)v->n_fmt; i++, f++) {
4556
14.9k
                    if ( !f->p ) continue;
4557
14.9k
                    kputc_(':', s);
4558
14.9k
                    if (f->n == 1)
4559
1.10k
                        bcf_fmt_array1(s, f->type, f->p + j * (size_t)f->size);
4560
13.8k
                    else
4561
13.8k
                        bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size);
4562
14.9k
                }
4563
27.0k
                if ( first ) kputc_('.', s);
4564
27.0k
            }
4565
13.4k
            if (fmt_packed)
4566
13.4k
                free(fmt);
4567
13.4k
        }
4568
111
        else
4569
851
            for (j=0; j<=v->n_sample; j++)
4570
740
                kputsn("\t.", 2, s);
4571
13.5k
    }
4572
53.7k
    kputc('\n', s);
4573
53.7k
    return 0;
4574
53.7k
}
4575
4576
int vcf_write_line(htsFile *fp, kstring_t *line)
4577
0
{
4578
0
    int ret;
4579
0
    if ( line->s[line->l-1]!='\n' ) kputc('\n',line);
4580
0
    if ( fp->format.compression!=no_compression )
4581
0
        ret = bgzf_write(fp->fp.bgzf, line->s, line->l);
4582
0
    else
4583
0
        ret = hwrite(fp->fp.hfile, line->s, line->l);
4584
0
    return ret==line->l ? 0 : -1;
4585
0
}
4586
4587
int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
4588
53.7k
{
4589
53.7k
    ssize_t ret;
4590
53.7k
    fp->line.l = 0;
4591
53.7k
    if (vcf_format1(h, v, &fp->line) != 0)
4592
0
        return -1;
4593
53.7k
    if ( fp->format.compression!=no_compression ) {
4594
0
        if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
4595
0
            return -1;
4596
0
        if (fp->idx && !fp->fp.bgzf->mt)
4597
0
            hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf));
4598
0
        ret = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
4599
53.7k
    } else {
4600
53.7k
        ret = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
4601
53.7k
    }
4602
4603
53.7k
    if (fp->idx && fp->format.compression == bgzf) {
4604
0
        int tid;
4605
0
        if ((tid = hts_idx_tbi_name(fp->idx, v->rid, bcf_seqname_safe(h, v))) < 0)
4606
0
            return -1;
4607
4608
0
        if (bgzf_idx_push(fp->fp.bgzf, fp->idx,
4609
0
                          tid, v->pos, v->pos + v->rlen,
4610
0
                          bgzf_tell(fp->fp.bgzf), 1) < 0)
4611
0
            return -1;
4612
0
    }
4613
4614
53.7k
    return ret==fp->line.l ? 0 : -1;
4615
53.7k
}
4616
4617
/************************
4618
 * Data access routines *
4619
 ************************/
4620
4621
int bcf_hdr_id2int(const bcf_hdr_t *h, int which, const char *id)
4622
180k
{
4623
180k
    khint_t k;
4624
180k
    vdict_t *d = (vdict_t*)h->dict[which];
4625
180k
    k = kh_get(vdict, d, id);
4626
180k
    return k == kh_end(d)? -1 : kh_val(d, k).id;
4627
180k
}
4628
4629
4630
/********************
4631
 *** BCF indexing ***
4632
 ********************/
4633
4634
// Calculate number of index levels given min_shift and the header contig
4635
// list.  Also returns number of contigs in *nids_out.
4636
static int idx_calc_n_lvls_ids(const bcf_hdr_t *h, int *min_shift_in_out,
4637
                               int starting_n_lvls, int *nids_out)
4638
0
{
4639
0
    int n_lvls = starting_n_lvls, i, nids = 0;
4640
0
    int64_t max_len = 0;
4641
4642
0
    for (i = 0; i < h->n[BCF_DT_CTG]; ++i)
4643
0
    {
4644
0
        if ( !h->id[BCF_DT_CTG][i].val ) continue;
4645
0
        if ( max_len < h->id[BCF_DT_CTG][i].val->info[0] )
4646
0
            max_len = h->id[BCF_DT_CTG][i].val->info[0];
4647
0
        nids++;
4648
0
    }
4649
0
    if ( !max_len ) max_len = (1LL<<31) - 1;  // In case contig line is broken.
4650
4651
0
    hts_adjust_csi_settings(max_len, min_shift_in_out, &n_lvls);
4652
4653
0
    if (nids_out) *nids_out = nids;
4654
0
    return n_lvls;
4655
0
}
4656
4657
hts_idx_t *bcf_index(htsFile *fp, int min_shift)
4658
0
{
4659
0
    int n_lvls;
4660
0
    bcf1_t *b = NULL;
4661
0
    hts_idx_t *idx = NULL;
4662
0
    bcf_hdr_t *h;
4663
0
    int r;
4664
0
    h = bcf_hdr_read(fp);
4665
0
    if ( !h ) return NULL;
4666
0
    int nids = 0;
4667
0
    n_lvls = idx_calc_n_lvls_ids(h, &min_shift, 0, &nids);
4668
0
    idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
4669
0
    if (!idx) goto fail;
4670
0
    b = bcf_init1();
4671
0
    if (!b) goto fail;
4672
0
    while ((r = bcf_read1(fp,h, b)) >= 0) {
4673
0
        int ret;
4674
0
        ret = hts_idx_push(idx, b->rid, b->pos, b->pos + b->rlen, bgzf_tell(fp->fp.bgzf), 1);
4675
0
        if (ret < 0) goto fail;
4676
0
    }
4677
0
    if (r < -1) goto fail;
4678
0
    hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf));
4679
0
    bcf_destroy1(b);
4680
0
    bcf_hdr_destroy(h);
4681
0
    return idx;
4682
4683
0
 fail:
4684
0
    hts_idx_destroy(idx);
4685
0
    bcf_destroy1(b);
4686
0
    bcf_hdr_destroy(h);
4687
0
    return NULL;
4688
0
}
4689
4690
hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx)
4691
0
{
4692
0
    return fnidx? hts_idx_load2(fn, fnidx) : bcf_index_load(fn);
4693
0
}
4694
4695
hts_idx_t *bcf_index_load3(const char *fn, const char *fnidx, int flags)
4696
0
{
4697
0
    return hts_idx_load3(fn, fnidx, HTS_FMT_CSI, flags);
4698
0
}
4699
4700
int bcf_index_build3(const char *fn, const char *fnidx, int min_shift, int n_threads)
4701
0
{
4702
0
    htsFile *fp;
4703
0
    hts_idx_t *idx;
4704
0
    tbx_t *tbx;
4705
0
    int ret;
4706
0
    if ((fp = hts_open(fn, "rb")) == 0) return -2;
4707
0
    if (n_threads)
4708
0
        hts_set_threads(fp, n_threads);
4709
0
    if ( fp->format.compression!=bgzf ) { hts_close(fp); return -3; }
4710
0
    switch (fp->format.format) {
4711
0
        case bcf:
4712
0
            if (!min_shift) {
4713
0
                hts_log_error("TBI indices for BCF files are not supported");
4714
0
                ret = -1;
4715
0
            } else {
4716
0
                idx = bcf_index(fp, min_shift);
4717
0
                if (idx) {
4718
0
                    ret = hts_idx_save_as(idx, fn, fnidx, HTS_FMT_CSI);
4719
0
                    if (ret < 0) ret = -4;
4720
0
                    hts_idx_destroy(idx);
4721
0
                }
4722
0
                else ret = -1;
4723
0
            }
4724
0
            break;
4725
4726
0
        case vcf:
4727
0
            tbx = tbx_index(hts_get_bgzfp(fp), min_shift, &tbx_conf_vcf);
4728
0
            if (tbx) {
4729
0
                ret = hts_idx_save_as(tbx->idx, fn, fnidx, min_shift > 0 ? HTS_FMT_CSI : HTS_FMT_TBI);
4730
0
                if (ret < 0) ret = -4;
4731
0
                tbx_destroy(tbx);
4732
0
            }
4733
0
            else ret = -1;
4734
0
            break;
4735
4736
0
        default:
4737
0
            ret = -3;
4738
0
            break;
4739
0
    }
4740
0
    hts_close(fp);
4741
0
    return ret;
4742
0
}
4743
4744
int bcf_index_build2(const char *fn, const char *fnidx, int min_shift)
4745
0
{
4746
0
    return bcf_index_build3(fn, fnidx, min_shift, 0);
4747
0
}
4748
4749
int bcf_index_build(const char *fn, int min_shift)
4750
0
{
4751
0
    return bcf_index_build3(fn, NULL, min_shift, 0);
4752
0
}
4753
4754
// Initialise fp->idx for the current format type.
4755
// This must be called after the header has been written but no other data.
4756
0
static int vcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) {
4757
0
    int n_lvls, fmt;
4758
4759
0
    if (min_shift == 0) {
4760
0
        min_shift = 14;
4761
0
        n_lvls = 5;
4762
0
        fmt = HTS_FMT_TBI;
4763
0
    } else {
4764
        // Set initial n_lvls to match tbx_index()
4765
0
        int starting_n_lvls = (TBX_MAX_SHIFT - min_shift + 2) / 3;
4766
        // Increase if necessary
4767
0
        n_lvls = idx_calc_n_lvls_ids(h, &min_shift, starting_n_lvls, NULL);
4768
0
        fmt = HTS_FMT_CSI;
4769
0
    }
4770
4771
0
    fp->idx = hts_idx_init(0, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
4772
0
    if (!fp->idx) return -1;
4773
4774
    // Tabix meta data, added even in CSI for VCF
4775
0
    uint8_t conf[4*7];
4776
0
    u32_to_le(TBX_VCF, conf+0);  // fmt
4777
0
    u32_to_le(1,       conf+4);  // name col
4778
0
    u32_to_le(2,       conf+8);  // beg col
4779
0
    u32_to_le(0,       conf+12); // end col
4780
0
    u32_to_le('#',     conf+16); // comment
4781
0
    u32_to_le(0,       conf+20); // n.skip
4782
0
    u32_to_le(0,       conf+24); // ref name len
4783
0
    if (hts_idx_set_meta(fp->idx, sizeof(conf)*sizeof(*conf), (uint8_t *)conf, 1) < 0) {
4784
0
        hts_idx_destroy(fp->idx);
4785
0
        fp->idx = NULL;
4786
0
        return -1;
4787
0
    }
4788
0
    fp->fnidx = fnidx;
4789
4790
0
    return 0;
4791
0
}
4792
4793
// Initialise fp->idx for the current format type.
4794
// This must be called after the header has been written but no other data.
4795
0
int bcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) {
4796
0
    int n_lvls, nids = 0;
4797
4798
0
    if (fp->format.compression != bgzf) {
4799
0
        hts_log_error("Indexing is only supported on BGZF-compressed files");
4800
0
        return -3; // Matches no-compression return for bcf_index_build3()
4801
0
    }
4802
4803
0
    if (fp->format.format == vcf)
4804
0
        return vcf_idx_init(fp, h, min_shift, fnidx);
4805
4806
0
    if (!min_shift)
4807
0
        min_shift = 14;
4808
4809
0
    n_lvls = idx_calc_n_lvls_ids(h, &min_shift, 0, &nids);
4810
4811
0
    fp->idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
4812
0
    if (!fp->idx) return -1;
4813
0
    fp->fnidx = fnidx;
4814
4815
0
    return 0;
4816
0
}
4817
4818
// Finishes an index. Call after the last record has been written.
4819
// Returns 0 on success, <0 on failure.
4820
//
4821
// NB: same format as SAM/BAM as it uses bgzf.
4822
0
int bcf_idx_save(htsFile *fp) {
4823
0
    return sam_idx_save(fp);
4824
0
}
4825
4826
/*****************
4827
 *** Utilities ***
4828
 *****************/
4829
4830
int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src)
4831
0
{
4832
0
    int i, ndst_ori = dst->nhrec, need_sync = 0, ret = 0, res;
4833
0
    for (i=0; i<src->nhrec; i++)
4834
0
    {
4835
0
        if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value )
4836
0
        {
4837
0
            int j;
4838
0
            for (j=0; j<ndst_ori; j++)
4839
0
            {
4840
0
                if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
4841
4842
                // Checking only the key part of generic lines, otherwise
4843
                // the VCFs are too verbose. Should we perhaps add a flag
4844
                // to bcf_hdr_combine() and make this optional?
4845
0
                if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break;
4846
0
            }
4847
0
            if ( j>=ndst_ori ) {
4848
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4849
0
                if (res < 0) return -1;
4850
0
                need_sync += res;
4851
0
            }
4852
0
        }
4853
0
        else if ( src->hrec[i]->type==BCF_HL_STR )
4854
0
        {
4855
            // NB: we are ignoring fields without ID
4856
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4857
0
            if ( j>=0 )
4858
0
            {
4859
0
                bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key);
4860
0
                if ( !rec ) {
4861
0
                    res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4862
0
                    if (res < 0) return -1;
4863
0
                    need_sync += res;
4864
0
                }
4865
0
            }
4866
0
        }
4867
0
        else
4868
0
        {
4869
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4870
0
            assert( j>=0 ); // this should always be true for valid VCFs
4871
4872
0
            bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL);
4873
0
            if ( !rec ) {
4874
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4875
0
                if (res < 0) return -1;
4876
0
                need_sync += res;
4877
0
            } else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT )
4878
0
            {
4879
                // Check that both records are of the same type. The bcf_hdr_id2length
4880
                // macro cannot be used here because dst header is not synced yet.
4881
0
                vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID];
4882
0
                vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID];
4883
0
                khint_t k_src  = kh_get(vdict, d_src, src->hrec[i]->vals[0]);
4884
0
                khint_t k_dst  = kh_get(vdict, d_dst, src->hrec[i]->vals[0]);
4885
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) )
4886
0
                {
4887
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths",
4888
0
                        src->hrec[i]->vals[0]);
4889
0
                    ret |= 1;
4890
0
                }
4891
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) )
4892
0
                {
4893
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different types",
4894
0
                        src->hrec[i]->vals[0]);
4895
0
                    ret |= 1;
4896
0
                }
4897
0
            }
4898
0
        }
4899
0
    }
4900
0
    if ( need_sync ) {
4901
0
        if (bcf_hdr_sync(dst) < 0) return -1;
4902
0
    }
4903
0
    return ret;
4904
0
}
4905
4906
bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src)
4907
0
{
4908
0
    if ( !dst )
4909
0
    {
4910
        // this will effectively strip existing IDX attributes from src to become dst
4911
0
        dst = bcf_hdr_init("r");
4912
0
        kstring_t htxt = {0,0,0};
4913
0
        if (bcf_hdr_format(src, 0, &htxt) < 0) {
4914
0
            free(htxt.s);
4915
0
            return NULL;
4916
0
        }
4917
0
        if ( bcf_hdr_parse(dst, htxt.s) < 0 ) {
4918
0
            bcf_hdr_destroy(dst);
4919
0
            dst = NULL;
4920
0
        }
4921
0
        free(htxt.s);
4922
0
        return dst;
4923
0
    }
4924
4925
0
    int i, ndst_ori = dst->nhrec, need_sync = 0, res;
4926
0
    for (i=0; i<src->nhrec; i++)
4927
0
    {
4928
0
        if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value )
4929
0
        {
4930
0
            int j;
4931
0
            for (j=0; j<ndst_ori; j++)
4932
0
            {
4933
0
                if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
4934
4935
                // Checking only the key part of generic lines, otherwise
4936
                // the VCFs are too verbose. Should we perhaps add a flag
4937
                // to bcf_hdr_combine() and make this optional?
4938
0
                if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break;
4939
0
            }
4940
0
            if ( j>=ndst_ori ) {
4941
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4942
0
                if (res < 0) return NULL;
4943
0
                need_sync += res;
4944
0
            }
4945
0
            else if ( !strcmp(src->hrec[i]->key,"fileformat") )
4946
0
            {
4947
0
                int ver_src = bcf_get_version(src,src->hrec[i]->value);
4948
0
                int ver_dst = bcf_get_version(dst,dst->hrec[j]->value);
4949
0
                if ( ver_src > ver_dst )
4950
0
                {
4951
0
                    if (bcf_hdr_set_version(dst,src->hrec[i]->value) < 0)
4952
0
                        return NULL;
4953
0
                    need_sync = 1;
4954
0
                }
4955
0
            }
4956
0
        }
4957
0
        else if ( src->hrec[i]->type==BCF_HL_STR )
4958
0
        {
4959
            // NB: we are ignoring fields without ID
4960
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4961
0
            if ( j>=0 )
4962
0
            {
4963
0
                bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key);
4964
0
                if ( !rec ) {
4965
0
                    res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4966
0
                    if (res < 0) return NULL;
4967
0
                    need_sync += res;
4968
0
                }
4969
0
            }
4970
0
        }
4971
0
        else
4972
0
        {
4973
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4974
0
            assert( j>=0 ); // this should always be true for valid VCFs
4975
4976
0
            bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL);
4977
0
            if ( !rec ) {
4978
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4979
0
                if (res < 0) return NULL;
4980
0
                need_sync += res;
4981
0
            } else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT )
4982
0
            {
4983
                // Check that both records are of the same type. The bcf_hdr_id2length
4984
                // macro cannot be used here because dst header is not synced yet.
4985
0
                vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID];
4986
0
                vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID];
4987
0
                khint_t k_src  = kh_get(vdict, d_src, src->hrec[i]->vals[0]);
4988
0
                khint_t k_dst  = kh_get(vdict, d_dst, src->hrec[i]->vals[0]);
4989
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) )
4990
0
                {
4991
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths",
4992
0
                        src->hrec[i]->vals[0]);
4993
0
                }
4994
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) )
4995
0
                {
4996
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different types",
4997
0
                        src->hrec[i]->vals[0]);
4998
0
                }
4999
0
            }
5000
0
        }
5001
0
    }
5002
0
    if ( need_sync ) {
5003
0
        if (bcf_hdr_sync(dst) < 0) return NULL;
5004
0
    }
5005
0
    return dst;
5006
0
}
5007
5008
int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *line)
5009
0
{
5010
0
    int i;
5011
0
    if ( line->errcode )
5012
0
    {
5013
0
        char errordescription[1024] = "";
5014
0
        hts_log_error("Unchecked error (%d %s) at %s:%"PRIhts_pos", exiting", line->errcode, bcf_strerror(line->errcode, errordescription, sizeof(errordescription)),  bcf_seqname_safe(src_hdr,line), line->pos+1);
5015
0
        exit(1);
5016
0
    }
5017
0
    if ( src_hdr->ntransl==-1 ) return 0;    // no need to translate, all tags have the same id
5018
0
    if ( !src_hdr->ntransl )  // called for the first time, see what needs translating
5019
0
    {
5020
0
        int dict;
5021
0
        for (dict=0; dict<2; dict++)    // BCF_DT_ID and BCF_DT_CTG
5022
0
        {
5023
0
            src_hdr->transl[dict] = (int*) malloc(src_hdr->n[dict]*sizeof(int));
5024
0
            for (i=0; i<src_hdr->n[dict]; i++)
5025
0
            {
5026
0
                if ( !src_hdr->id[dict][i].key ) // gap left after removed BCF header lines
5027
0
                {
5028
0
                    src_hdr->transl[dict][i] = -1;
5029
0
                    continue;
5030
0
                }
5031
0
                src_hdr->transl[dict][i] = bcf_hdr_id2int(dst_hdr,dict,src_hdr->id[dict][i].key);
5032
0
                if ( src_hdr->transl[dict][i]!=-1 && i!=src_hdr->transl[dict][i] ) src_hdr->ntransl++;
5033
0
            }
5034
0
        }
5035
0
        if ( !src_hdr->ntransl )
5036
0
        {
5037
0
            free(src_hdr->transl[0]); src_hdr->transl[0] = NULL;
5038
0
            free(src_hdr->transl[1]); src_hdr->transl[1] = NULL;
5039
0
            src_hdr->ntransl = -1;
5040
0
        }
5041
0
        if ( src_hdr->ntransl==-1 ) return 0;
5042
0
    }
5043
0
    bcf_unpack(line,BCF_UN_ALL);
5044
5045
    // CHROM
5046
0
    if ( src_hdr->transl[BCF_DT_CTG][line->rid] >=0 ) line->rid = src_hdr->transl[BCF_DT_CTG][line->rid];
5047
5048
    // FILTER
5049
0
    for (i=0; i<line->d.n_flt; i++)
5050
0
    {
5051
0
        int src_id = line->d.flt[i];
5052
0
        if ( src_hdr->transl[BCF_DT_ID][src_id] >=0 )
5053
0
            line->d.flt[i] = src_hdr->transl[BCF_DT_ID][src_id];
5054
0
        line->d.shared_dirty |= BCF1_DIRTY_FLT;
5055
0
    }
5056
5057
    // INFO
5058
0
    for (i=0; i<line->n_info; i++)
5059
0
    {
5060
0
        int src_id = line->d.info[i].key;
5061
0
        int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
5062
0
        if ( dst_id<0 ) continue;
5063
0
        line->d.info[i].key = dst_id;
5064
0
        if ( !line->d.info[i].vptr ) continue;  // skip deleted
5065
0
        int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
5066
0
        int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
5067
0
        if ( src_size==dst_size )   // can overwrite
5068
0
        {
5069
0
            uint8_t *vptr = line->d.info[i].vptr - line->d.info[i].vptr_off;
5070
0
            if ( dst_size==BCF_BT_INT8 ) { vptr[1] = (uint8_t)dst_id; }
5071
0
            else if ( dst_size==BCF_BT_INT16 ) { *(uint16_t*)vptr = (uint16_t)dst_id; }
5072
0
            else { *(uint32_t*)vptr = (uint32_t)dst_id; }
5073
0
        }
5074
0
        else    // must realloc
5075
0
        {
5076
0
            bcf_info_t *info = &line->d.info[i];
5077
0
            kstring_t str = {0,0,0};
5078
0
            bcf_enc_int1(&str, dst_id);
5079
0
            bcf_enc_size(&str, info->len,info->type);
5080
0
            uint32_t vptr_off = str.l;
5081
0
            kputsn((char*)info->vptr, info->vptr_len, &str);
5082
0
            if( info->vptr_free ) free(info->vptr - info->vptr_off);
5083
0
            info->vptr_off = vptr_off;
5084
0
            info->vptr = (uint8_t*)str.s + info->vptr_off;
5085
0
            info->vptr_free = 1;
5086
0
            line->d.shared_dirty |= BCF1_DIRTY_INF;
5087
0
        }
5088
0
    }
5089
5090
    // FORMAT
5091
0
    for (i=0; i<line->n_fmt; i++)
5092
0
    {
5093
0
        int src_id = line->d.fmt[i].id;
5094
0
        int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
5095
0
        if ( dst_id<0 ) continue;
5096
0
        line->d.fmt[i].id = dst_id;
5097
0
        if( !line->d.fmt[i].p ) continue;  // skip deleted
5098
0
        int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
5099
0
        int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
5100
0
        if ( src_size==dst_size )   // can overwrite
5101
0
        {
5102
0
            uint8_t *p = line->d.fmt[i].p - line->d.fmt[i].p_off;    // pointer to the vector size (4bits) and BT type (4bits)
5103
0
            if ( dst_size==BCF_BT_INT8 ) { p[1] = dst_id; }
5104
0
            else if ( dst_size==BCF_BT_INT16 ) { i16_to_le(dst_id, p + 1); }
5105
0
            else { i32_to_le(dst_id, p + 1); }
5106
0
        }
5107
0
        else    // must realloc
5108
0
        {
5109
0
            bcf_fmt_t *fmt = &line->d.fmt[i];
5110
0
            kstring_t str = {0,0,0};
5111
0
            bcf_enc_int1(&str, dst_id);
5112
0
            bcf_enc_size(&str, fmt->n, fmt->type);
5113
0
            uint32_t p_off = str.l;
5114
0
            kputsn((char*)fmt->p, fmt->p_len, &str);
5115
0
            if( fmt->p_free ) free(fmt->p - fmt->p_off);
5116
0
            fmt->p_off = p_off;
5117
0
            fmt->p = (uint8_t*)str.s + fmt->p_off;
5118
0
            fmt->p_free = 1;
5119
0
            line->d.indiv_dirty = 1;
5120
0
        }
5121
0
    }
5122
0
    return 0;
5123
0
}
5124
5125
bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr)
5126
0
{
5127
0
    bcf_hdr_t *hout = bcf_hdr_init("r");
5128
0
    if (!hout) {
5129
0
        hts_log_error("Failed to allocate bcf header");
5130
0
        return NULL;
5131
0
    }
5132
0
    kstring_t htxt = {0,0,0};
5133
0
    if (bcf_hdr_format(hdr, 1, &htxt) < 0) {
5134
0
        free(htxt.s);
5135
0
        return NULL;
5136
0
    }
5137
0
    if ( bcf_hdr_parse(hout, htxt.s) < 0 ) {
5138
0
        bcf_hdr_destroy(hout);
5139
0
        hout = NULL;
5140
0
    }
5141
0
    free(htxt.s);
5142
0
    return hout;
5143
0
}
5144
5145
bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap)
5146
0
{
5147
0
    void *names_hash = khash_str2int_init();
5148
0
    kstring_t htxt = {0,0,0};
5149
0
    kstring_t str = {0,0,0};
5150
0
    bcf_hdr_t *h = bcf_hdr_init("w");
5151
0
    int r = 0;
5152
0
    if (!h || !names_hash) {
5153
0
        hts_log_error("Failed to allocate bcf header");
5154
0
        goto err;
5155
0
    }
5156
0
    if (bcf_hdr_format(h0, 1, &htxt) < 0) {
5157
0
        hts_log_error("Failed to get header text");
5158
0
        goto err;
5159
0
    }
5160
0
    bcf_hdr_set_version(h,bcf_hdr_get_version(h0));
5161
0
    int j;
5162
0
    for (j=0; j<n; j++) imap[j] = -1;
5163
0
    if ( bcf_hdr_nsamples(h0) > 0) {
5164
0
        char *p = find_chrom_header_line(htxt.s);
5165
0
        int i = 0, end = n? 8 : 7;
5166
0
        while ((p = strchr(p, '\t')) != 0 && i < end) ++i, ++p;
5167
0
        if (i != end) {
5168
0
            hts_log_error("Wrong number of columns in header #CHROM line");
5169
0
            goto err;
5170
0
        }
5171
0
        r |= kputsn(htxt.s, p - htxt.s, &str) < 0;
5172
0
        for (i = 0; i < n; ++i) {
5173
0
            if ( khash_str2int_has_key(names_hash,samples[i]) )
5174
0
            {
5175
0
                hts_log_error("Duplicate sample name \"%s\"", samples[i]);
5176
0
                goto err;
5177
0
            }
5178
0
            imap[i] = bcf_hdr_id2int(h0, BCF_DT_SAMPLE, samples[i]);
5179
0
            if (imap[i] < 0) continue;
5180
0
            r |= kputc('\t', &str) < 0;
5181
0
            r |= kputs(samples[i], &str) < 0;
5182
0
            r |= khash_str2int_inc(names_hash,samples[i]) < 0;
5183
0
        }
5184
0
    } else r |= kputsn(htxt.s, htxt.l, &str) < 0;
5185
0
    while (str.l && (!str.s[str.l-1] || str.s[str.l-1]=='\n') ) str.l--; // kill trailing zeros and newlines
5186
0
    r |= kputc('\n',&str) < 0;
5187
0
    if (r) {
5188
0
        hts_log_error("%s", strerror(errno));
5189
0
        goto err;
5190
0
    }
5191
0
    if ( bcf_hdr_parse(h, str.s) < 0 ) {
5192
0
        bcf_hdr_destroy(h);
5193
0
        h = NULL;
5194
0
    }
5195
0
    free(str.s);
5196
0
    free(htxt.s);
5197
0
    khash_str2int_destroy(names_hash);
5198
0
    return h;
5199
5200
0
 err:
5201
0
    ks_free(&str);
5202
0
    ks_free(&htxt);
5203
0
    khash_str2int_destroy(names_hash);
5204
0
    bcf_hdr_destroy(h);
5205
0
    return NULL;
5206
0
}
5207
5208
int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file)
5209
0
{
5210
0
    if ( samples && !strcmp("-",samples) ) return 0;            // keep all samples
5211
5212
0
    int i, narr = bit_array_size(bcf_hdr_nsamples(hdr));
5213
0
    hdr->keep_samples = (uint8_t*) calloc(narr,1);
5214
0
    if (!hdr->keep_samples) return -1;
5215
5216
0
    hdr->nsamples_ori = bcf_hdr_nsamples(hdr);
5217
0
    if ( !samples )
5218
0
    {
5219
        // exclude all samples
5220
0
        khint_t k;
5221
0
        vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE], *new_dict;
5222
0
        new_dict = kh_init(vdict);
5223
0
        if (!new_dict) return -1;
5224
5225
0
        bcf_hdr_nsamples(hdr) = 0;
5226
5227
0
        for (k = kh_begin(d); k != kh_end(d); ++k)
5228
0
            if (kh_exist(d, k)) free((char*)kh_key(d, k));
5229
0
        kh_destroy(vdict, d);
5230
0
        hdr->dict[BCF_DT_SAMPLE] = new_dict;
5231
0
        if (bcf_hdr_sync(hdr) < 0) return -1;
5232
5233
0
        return 0;
5234
0
    }
5235
5236
0
    if ( samples[0]=='^' )
5237
0
        for (i=0; i<bcf_hdr_nsamples(hdr); i++) bit_array_set(hdr->keep_samples,i);
5238
5239
0
    int idx, n, ret = 0;
5240
0
    char **smpls = hts_readlist(samples[0]=='^'?samples+1:samples, is_file, &n);
5241
0
    if ( !smpls ) return -1;
5242
0
    for (i=0; i<n; i++)
5243
0
    {
5244
0
        idx = bcf_hdr_id2int(hdr,BCF_DT_SAMPLE,smpls[i]);
5245
0
        if ( idx<0 )
5246
0
        {
5247
0
            if ( !ret ) ret = i+1;
5248
0
            continue;
5249
0
        }
5250
0
        assert( idx<bcf_hdr_nsamples(hdr) );
5251
0
        if (  samples[0]=='^' )
5252
0
            bit_array_clear(hdr->keep_samples, idx);
5253
0
        else
5254
0
            bit_array_set(hdr->keep_samples, idx);
5255
0
    }
5256
0
    for (i=0; i<n; i++) free(smpls[i]);
5257
0
    free(smpls);
5258
5259
0
    bcf_hdr_nsamples(hdr) = 0;
5260
0
    for (i=0; i<hdr->nsamples_ori; i++)
5261
0
        if ( bit_array_test(hdr->keep_samples,i) ) bcf_hdr_nsamples(hdr)++;
5262
5263
0
    if ( !bcf_hdr_nsamples(hdr) ) { free(hdr->keep_samples); hdr->keep_samples=NULL; }
5264
0
    else
5265
0
    {
5266
        // Make new list and dictionary with desired samples
5267
0
        char **samples = (char**) malloc(sizeof(char*)*bcf_hdr_nsamples(hdr));
5268
0
        vdict_t *new_dict, *d;
5269
0
        int k, res;
5270
0
        if (!samples) return -1;
5271
5272
0
        new_dict = kh_init(vdict);
5273
0
        if (!new_dict) {
5274
0
            free(samples);
5275
0
            return -1;
5276
0
        }
5277
0
        idx = 0;
5278
0
        for (i=0; i<hdr->nsamples_ori; i++) {
5279
0
            if ( bit_array_test(hdr->keep_samples,i) ) {
5280
0
                samples[idx] = hdr->samples[i];
5281
0
                k = kh_put(vdict, new_dict, hdr->samples[i], &res);
5282
0
                if (res < 0) {
5283
0
                    free(samples);
5284
0
                    kh_destroy(vdict, new_dict);
5285
0
                    return -1;
5286
0
                }
5287
0
                kh_val(new_dict, k) = bcf_idinfo_def;
5288
0
                kh_val(new_dict, k).id = idx;
5289
0
                idx++;
5290
0
            }
5291
0
        }
5292
5293
        // Delete desired samples from old dictionary, so we don't free them
5294
0
        d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE];
5295
0
        for (i=0; i < idx; i++) {
5296
0
            int k = kh_get(vdict, d, samples[i]);
5297
0
            if (k < kh_end(d)) kh_del(vdict, d, k);
5298
0
        }
5299
5300
        // Free everything else
5301
0
        for (k = kh_begin(d); k != kh_end(d); ++k)
5302
0
            if (kh_exist(d, k)) free((char*)kh_key(d, k));
5303
0
        kh_destroy(vdict, d);
5304
0
        hdr->dict[BCF_DT_SAMPLE] = new_dict;
5305
5306
0
        free(hdr->samples);
5307
0
        hdr->samples = samples;
5308
5309
0
        if (bcf_hdr_sync(hdr) < 0)
5310
0
            return -1;
5311
0
    }
5312
5313
0
    return ret;
5314
0
}
5315
5316
int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap)
5317
0
{
5318
0
    kstring_t ind;
5319
0
    ind.s = 0; ind.l = ind.m = 0;
5320
0
    if (n) {
5321
0
        bcf_fmt_t fmt[MAX_N_FMT];
5322
0
        int i, j;
5323
0
        uint8_t *ptr = (uint8_t*)v->indiv.s;
5324
0
        for (i = 0; i < v->n_fmt; ++i)
5325
0
            ptr = bcf_unpack_fmt_core1(ptr, v->n_sample, &fmt[i]);
5326
0
        for (i = 0; i < (int)v->n_fmt; ++i) {
5327
0
            bcf_fmt_t *f = &fmt[i];
5328
0
            bcf_enc_int1(&ind, f->id);
5329
0
            bcf_enc_size(&ind, f->n, f->type);
5330
0
            for (j = 0; j < n; ++j)
5331
0
                if (imap[j] >= 0) kputsn((char*)(f->p + imap[j] * f->size), f->size, &ind);
5332
0
        }
5333
0
        for (i = j = 0; j < n; ++j) if (imap[j] >= 0) ++i;
5334
0
        v->n_sample = i;
5335
0
    } else v->n_sample = 0;
5336
0
    if ( !v->n_sample ) v->n_fmt = 0;
5337
0
    free(v->indiv.s);
5338
0
    v->indiv = ind;
5339
0
    v->unpacked &= ~BCF_UN_FMT;    // only BCF is ready for output, VCF will need to unpack again
5340
0
    return 0;
5341
0
}
5342
5343
int bcf_is_snp(bcf1_t *v)
5344
0
{
5345
0
    int i;
5346
0
    bcf_unpack(v, BCF_UN_STR);
5347
0
    for (i = 0; i < v->n_allele; ++i)
5348
0
    {
5349
0
        if ( v->d.allele[i][1]==0 && v->d.allele[i][0]!='*' ) continue;
5350
5351
        // mpileup's <X> allele, see also below. This is not completely satisfactory,
5352
        // a general library is here narrowly tailored to fit samtools.
5353
0
        if ( v->d.allele[i][0]=='<' && v->d.allele[i][1]=='X' && v->d.allele[i][2]=='>' ) continue;
5354
0
        if ( v->d.allele[i][0]=='<' && v->d.allele[i][1]=='*' && v->d.allele[i][2]=='>' ) continue;
5355
5356
0
        break;
5357
0
    }
5358
0
    return i == v->n_allele;
5359
0
}
5360
5361
static void bcf_set_variant_type(const char *ref, const char *alt, bcf_variant_t *var)
5362
0
{
5363
0
    if ( *alt == '*' && !alt[1] ) { var->n = 0; var->type = VCF_OVERLAP; return; }  // overlapping variant
5364
5365
    // The most frequent case
5366
0
    if ( !ref[1] && !alt[1] )
5367
0
    {
5368
0
        if ( *alt == '.' || *ref==*alt ) { var->n = 0; var->type = VCF_REF; return; }
5369
0
        if ( *alt == 'X' ) { var->n = 0; var->type = VCF_REF; return; }  // mpileup's X allele shouldn't be treated as variant
5370
0
        var->n = 1; var->type = VCF_SNP; return;
5371
0
    }
5372
0
    if ( alt[0]=='<' )
5373
0
    {
5374
0
        if ( alt[1]=='X' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; }  // mpileup's X allele shouldn't be treated as variant
5375
0
        if ( alt[1]=='*' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; }
5376
0
        if ( !strcmp("NON_REF>",alt+1) ) { var->n = 0; var->type = VCF_REF; return; }
5377
0
        var->type = VCF_OTHER;
5378
0
        return;
5379
0
    }
5380
5381
    // Catch "joined before" breakend case
5382
0
    if ( alt[0]==']' || alt[0] == '[' )
5383
0
    {
5384
0
        var->type = VCF_BND; return;
5385
0
    }
5386
5387
    // Iterate through alt characters that match the reference
5388
0
    const char *r = ref, *a = alt;
5389
0
    while (*r && *a && toupper_c(*r)==toupper_c(*a) ) { r++; a++; }     // unfortunately, matching REF,ALT case is not guaranteed
5390
5391
0
    if ( *a && !*r )
5392
0
    {
5393
0
        while ( *a ) a++;
5394
0
        if ( *(a-1)==']' || *(a-1)=='[' ) { var->type = VCF_BND; return; } // "joined after" breakend
5395
0
        var->n = (a-alt)-(r-ref); var->type = VCF_INDEL | VCF_INS; return;
5396
0
    }
5397
0
    else if ( *r && !*a )
5398
0
    {
5399
0
        while ( *r ) r++;
5400
0
        var->n = (a-alt)-(r-ref); var->type = VCF_INDEL | VCF_DEL; return;
5401
0
    }
5402
0
    else if ( !*r && !*a )
5403
0
    {
5404
0
        var->n = 0; var->type = VCF_REF; return;
5405
0
    }
5406
5407
0
    const char *re = r, *ae = a;
5408
0
    while ( re[1] ) re++;
5409
0
    while ( ae[1] ) ae++;
5410
0
    if ( ae[0]==']' || ae[0]=='[' ) { var->type = VCF_BND; return; }    // "joined after" breakend
5411
0
    while ( re>r && ae>a && toupper_c(*re)==toupper_c(*ae) ) { re--; ae--; }
5412
0
    if ( ae==a )
5413
0
    {
5414
0
        if ( re==r ) { var->n = 1; var->type = VCF_SNP; return; }
5415
0
        var->n = -(re-r);
5416
0
        if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL | VCF_DEL; return; }
5417
0
        var->type = VCF_OTHER; return;
5418
0
    }
5419
0
    else if ( re==r )
5420
0
    {
5421
0
        var->n = ae-a;
5422
0
        if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL | VCF_INS; return; }
5423
0
        var->type = VCF_OTHER; return;
5424
0
    }
5425
5426
0
    var->type = ( re-r == ae-a ) ? VCF_MNP : VCF_OTHER;
5427
0
    var->n = ( re-r > ae-a ) ? -(re-r+1) : ae-a+1;
5428
5429
    // should do also complex events, SVs, etc...
5430
0
}
5431
5432
static int bcf_set_variant_types(bcf1_t *b)
5433
0
{
5434
0
    if ( !(b->unpacked & BCF_UN_STR) ) bcf_unpack(b, BCF_UN_STR);
5435
0
    bcf_dec_t *d = &b->d;
5436
0
    if ( d->n_var < b->n_allele )
5437
0
    {
5438
0
        bcf_variant_t *new_var = realloc(d->var, sizeof(bcf_variant_t)*b->n_allele);
5439
0
        if (!new_var)
5440
0
            return -1;
5441
0
        d->var = new_var;
5442
0
        d->n_var = b->n_allele;
5443
0
    }
5444
0
    int i;
5445
0
    b->d.var_type = 0;
5446
0
    d->var[0].type = VCF_REF;
5447
0
    d->var[0].n    = 0;
5448
0
    for (i=1; i<b->n_allele; i++)
5449
0
    {
5450
0
        bcf_set_variant_type(d->allele[0],d->allele[i], &d->var[i]);
5451
0
        b->d.var_type |= d->var[i].type;
5452
        //fprintf(stderr,"[set_variant_type] %d   %s %s -> %d %d .. %d\n", b->pos+1,d->allele[0],d->allele[i],d->var[i].type,d->var[i].n, b->d.var_type);
5453
0
    }
5454
0
    return 0;
5455
0
}
5456
5457
// bcf_get_variant_type/bcf_get_variant_types should only return the following,
5458
// to be compatible with callers that are not expecting newer values
5459
// like VCF_INS, VCF_DEL.  The full set is available from the newer
5460
// vcf_has_variant_type* interfaces.
5461
0
#define ORIG_VAR_TYPES (VCF_SNP|VCF_MNP|VCF_INDEL|VCF_OTHER|VCF_BND|VCF_OVERLAP)
5462
int bcf_get_variant_types(bcf1_t *rec)
5463
0
{
5464
0
    if ( rec->d.var_type==-1 ) {
5465
0
        if (bcf_set_variant_types(rec) != 0) {
5466
0
            hts_log_error("Couldn't get variant types: %s", strerror(errno));
5467
0
            exit(1); // Due to legacy API having no way to report failures
5468
0
        }
5469
0
    }
5470
0
    return rec->d.var_type & ORIG_VAR_TYPES;
5471
0
}
5472
5473
int bcf_get_variant_type(bcf1_t *rec, int ith_allele)
5474
0
{
5475
0
    if ( rec->d.var_type==-1 ) {
5476
0
        if (bcf_set_variant_types(rec) != 0) {
5477
0
            hts_log_error("Couldn't get variant types: %s", strerror(errno));
5478
0
            exit(1); // Due to legacy API having no way to report failures
5479
0
        }
5480
0
    }
5481
0
    if (ith_allele < 0 || ith_allele >= rec->n_allele) {
5482
0
        hts_log_error("Requested allele outside valid range");
5483
0
        exit(1);
5484
0
    }
5485
0
    return rec->d.var[ith_allele].type & ORIG_VAR_TYPES;
5486
0
}
5487
#undef ORIG_VAR_TYPES
5488
5489
int bcf_has_variant_type(bcf1_t *rec, int ith_allele, uint32_t bitmask)
5490
0
{
5491
0
    if ( rec->d.var_type==-1 ) {
5492
0
        if (bcf_set_variant_types(rec) != 0) return -1;
5493
0
    }
5494
0
    if (ith_allele < 0 || ith_allele >= rec->n_allele) return -1;
5495
0
    if (bitmask == VCF_REF) {  // VCF_REF is 0, so handled as a special case
5496
0
        return rec->d.var[ith_allele].type == VCF_REF;
5497
0
    }
5498
0
    return bitmask & rec->d.var[ith_allele].type;
5499
0
}
5500
5501
int bcf_variant_length(bcf1_t *rec, int ith_allele)
5502
0
{
5503
0
    if ( rec->d.var_type==-1 ) {
5504
0
        if (bcf_set_variant_types(rec) != 0) return bcf_int32_missing;
5505
0
    }
5506
0
    if (ith_allele < 0 || ith_allele >= rec->n_allele) return bcf_int32_missing;
5507
0
    return rec->d.var[ith_allele].n;
5508
0
}
5509
5510
int bcf_has_variant_types(bcf1_t *rec, uint32_t bitmask,
5511
                          enum bcf_variant_match mode)
5512
0
{
5513
0
    if ( rec->d.var_type==-1 ) {
5514
0
        if (bcf_set_variant_types(rec) != 0) return -1;
5515
0
    }
5516
0
    uint32_t type = rec->d.var_type;
5517
0
    if ( mode==bcf_match_overlap ) return bitmask & type;
5518
5519
    // VCF_INDEL is always set with VCF_INS and VCF_DEL by bcf_set_variant_type[s], but the bitmask may
5520
    // ask for say `VCF_INS` or `VCF_INDEL` only
5521
0
    if ( bitmask&(VCF_INS|VCF_DEL) && !(bitmask&VCF_INDEL) ) type &= ~VCF_INDEL;
5522
0
    else if ( bitmask&VCF_INDEL && !(bitmask&(VCF_INS|VCF_DEL)) ) type &= ~(VCF_INS|VCF_DEL);
5523
5524
0
    if ( mode==bcf_match_subset )
5525
0
    {
5526
0
        if ( ~bitmask & type ) return 0;
5527
0
        else return bitmask & type;
5528
0
    }
5529
    // mode == bcf_match_exact
5530
0
    if ( bitmask==VCF_REF ) return type==bitmask ? 1 : 0;
5531
0
    return type==bitmask ? type : 0;
5532
0
}
5533
5534
int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
5535
0
{
5536
0
    static int negative_rlen_warned = 0;
5537
0
    int is_end_tag, is_svlen_tag = 0;
5538
5539
    // Is the field already present?
5540
0
    int i, inf_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
5541
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,inf_id) ) return -1;    // No such INFO field in the header
5542
0
    if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
5543
5544
0
    is_end_tag = strcmp(key, "END") == 0;
5545
0
    is_svlen_tag = strcmp(key, "SVLEN") == 0;
5546
5547
0
    for (i=0; i<line->n_info; i++)
5548
0
        if ( inf_id==line->d.info[i].key ) break;
5549
0
    bcf_info_t *inf = i==line->n_info ? NULL : &line->d.info[i];
5550
5551
0
    if ( !n || (type==BCF_HT_STR && !values) )
5552
0
    {
5553
0
        if ( inf )
5554
0
        {
5555
            // Mark the tag for removal, free existing memory if necessary
5556
0
            if ( inf->vptr_free )
5557
0
            {
5558
0
                free(inf->vptr - inf->vptr_off);
5559
0
                inf->vptr_free = 0;
5560
0
            }
5561
0
            line->d.shared_dirty |= BCF1_DIRTY_INF;
5562
0
            inf->vptr = NULL;
5563
0
            inf->vptr_off = inf->vptr_len = 0;
5564
0
        }
5565
0
        if ( n==0 && (is_end_tag || is_svlen_tag) ) {
5566
0
            line->rlen = get_rlen(hdr, line);
5567
0
        }
5568
0
        return 0;
5569
0
    }
5570
5571
0
    if (is_end_tag)
5572
0
    {
5573
0
        if (n != 1)
5574
0
        {
5575
0
            hts_log_error("END info tag should only have one value at %s:%"PRIhts_pos, bcf_seqname_safe(hdr,line), line->pos+1);
5576
0
            line->errcode |= BCF_ERR_TAG_INVALID;
5577
0
            return -1;
5578
0
        }
5579
0
        if (type != BCF_HT_INT && type != BCF_HT_LONG)
5580
0
        {
5581
0
            hts_log_error("Wrong type (%d) for END info tag at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
5582
0
            line->errcode |= BCF_ERR_TAG_INVALID;
5583
0
            return -1;
5584
0
        }
5585
0
    }
5586
5587
    // Encode the values and determine the size required to accommodate the values
5588
0
    kstring_t str = {0,0,0};
5589
0
    bcf_enc_int1(&str, inf_id);
5590
0
    if ( type==BCF_HT_INT )
5591
0
        bcf_enc_vint(&str, n, (int32_t*)values, -1);
5592
0
    else if ( type==BCF_HT_REAL )
5593
0
        bcf_enc_vfloat(&str, n, (float*)values);
5594
0
    else if ( type==BCF_HT_FLAG || type==BCF_HT_STR )
5595
0
    {
5596
0
        if ( values==NULL )
5597
0
            bcf_enc_size(&str, 0, BCF_BT_NULL);
5598
0
        else
5599
0
            bcf_enc_vchar(&str, strlen((char*)values), (char*)values);
5600
0
    }
5601
#ifdef VCF_ALLOW_INT64
5602
    else if ( type==BCF_HT_LONG )
5603
    {
5604
        if (n != 1) {
5605
            hts_log_error("Only storing a single BCF_HT_LONG value is supported at %s:%"PRIhts_pos, bcf_seqname_safe(hdr,line), line->pos+1);
5606
            abort();
5607
        }
5608
        bcf_enc_long1(&str, *(int64_t *) values);
5609
    }
5610
#endif
5611
0
    else
5612
0
    {
5613
0
        hts_log_error("The type %d not implemented yet at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
5614
0
        abort();
5615
0
    }
5616
5617
    // Is the INFO tag already present
5618
0
    if ( inf )
5619
0
    {
5620
        // Is it big enough to accommodate new block?
5621
0
        if ( inf->vptr && str.l <= inf->vptr_len + inf->vptr_off )
5622
0
        {
5623
0
            if ( str.l != inf->vptr_len + inf->vptr_off ) line->d.shared_dirty |= BCF1_DIRTY_INF;
5624
0
            uint8_t *ptr = inf->vptr - inf->vptr_off;
5625
0
            memcpy(ptr, str.s, str.l);
5626
0
            free(str.s);
5627
0
            int vptr_free = inf->vptr_free;
5628
0
            bcf_unpack_info_core1(ptr, inf);
5629
0
            inf->vptr_free = vptr_free;
5630
0
        }
5631
0
        else
5632
0
        {
5633
0
            if ( inf->vptr_free )
5634
0
                free(inf->vptr - inf->vptr_off);
5635
0
            bcf_unpack_info_core1((uint8_t*)str.s, inf);
5636
0
            inf->vptr_free = 1;
5637
0
            line->d.shared_dirty |= BCF1_DIRTY_INF;
5638
0
        }
5639
0
    }
5640
0
    else
5641
0
    {
5642
        // The tag is not present, create new one
5643
0
        line->n_info++;
5644
0
        hts_expand0(bcf_info_t, line->n_info, line->d.m_info , line->d.info);
5645
0
        inf = &line->d.info[line->n_info-1];
5646
0
        bcf_unpack_info_core1((uint8_t*)str.s, inf);
5647
0
        inf->vptr_free = 1;
5648
0
        line->d.shared_dirty |= BCF1_DIRTY_INF;
5649
0
    }
5650
0
    line->unpacked |= BCF_UN_INFO;
5651
5652
0
   if ( n==1 && is_end_tag) {
5653
0
        hts_pos_t end = type == BCF_HT_INT ? *(int32_t *) values : *(int64_t *) values;
5654
0
        if ( (type == BCF_HT_INT && end!=bcf_int32_missing) || (type == BCF_HT_LONG && end!=bcf_int64_missing) )
5655
0
        {
5656
0
            if ( end <= line->pos )
5657
0
            {
5658
0
                if ( !negative_rlen_warned )
5659
0
                {
5660
0
                    hts_log_warning("INFO/END=%"PRIhts_pos" is smaller than POS at %s:%"PRIhts_pos,end,bcf_seqname_safe(hdr,line),line->pos+1);
5661
0
                    negative_rlen_warned = 1;
5662
0
                }
5663
0
            }
5664
0
        }
5665
0
    }
5666
0
    if (is_svlen_tag || is_end_tag) {
5667
0
        line->rlen = get_rlen(hdr, line);
5668
0
    }
5669
0
    return 0;
5670
0
}
5671
5672
int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n)
5673
0
{
5674
0
    if ( !n )
5675
0
        return bcf_update_format(hdr,line,key,NULL,0,BCF_HT_STR);
5676
5677
0
    int i, max_len = 0;
5678
0
    for (i=0; i<n; i++)
5679
0
    {
5680
0
        int len = strlen(values[i]);
5681
0
        if ( len > max_len ) max_len = len;
5682
0
    }
5683
0
    char *out = (char*) malloc(max_len*n);
5684
0
    if ( !out ) return -2;
5685
0
    for (i=0; i<n; i++)
5686
0
    {
5687
0
        char *dst = out+i*max_len;
5688
0
        const char *src = values[i];
5689
0
        int j = 0;
5690
0
        while ( src[j] ) { dst[j] = src[j]; j++; }
5691
0
        for (; j<max_len; j++) dst[j] = 0;
5692
0
    }
5693
0
    int ret = bcf_update_format(hdr,line,key,out,max_len*n,BCF_HT_STR);
5694
0
    free(out);
5695
0
    return ret;
5696
0
}
5697
5698
int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
5699
0
{
5700
    // Is the field already present?
5701
0
    int i, fmt_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
5702
0
    int is_len = 0;
5703
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,fmt_id) )
5704
0
    {
5705
0
        if ( !n ) return 0;
5706
0
        return -1;  // the key not present in the header
5707
0
    }
5708
5709
0
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
5710
5711
0
    for (i=0; i<line->n_fmt; i++)
5712
0
        if ( line->d.fmt[i].id==fmt_id ) break;
5713
0
    bcf_fmt_t *fmt = i==line->n_fmt ? NULL : &line->d.fmt[i];
5714
5715
0
    is_len = strcmp(key, "LEN") == 0;
5716
0
    if ( !n )
5717
0
    {
5718
0
        if ( fmt )
5719
0
        {
5720
            // Mark the tag for removal, free existing memory if necessary
5721
0
            if ( fmt->p_free )
5722
0
            {
5723
0
                free(fmt->p - fmt->p_off);
5724
0
                fmt->p_free = 0;
5725
0
            }
5726
0
            line->d.indiv_dirty = 1;
5727
0
            fmt->p = NULL;
5728
0
        }
5729
0
        if (is_len) {
5730
0
            line->rlen = get_rlen(hdr, line);
5731
0
        }
5732
0
        return 0;
5733
0
    }
5734
5735
0
    line->n_sample = bcf_hdr_nsamples(hdr);
5736
0
    int nps = n / line->n_sample;  // number of values per sample
5737
0
    assert( nps && nps*line->n_sample==n );     // must be divisible by n_sample
5738
5739
    // Encode the values and determine the size required to accommodate the values
5740
0
    kstring_t str = {0,0,0};
5741
0
    bcf_enc_int1(&str, fmt_id);
5742
0
    assert(values != NULL);
5743
0
    if ( type==BCF_HT_INT )
5744
0
        bcf_enc_vint(&str, n, (int32_t*)values, nps);
5745
0
    else if ( type==BCF_HT_REAL )
5746
0
    {
5747
0
        bcf_enc_size(&str, nps, BCF_BT_FLOAT);
5748
0
        serialize_float_array(&str, nps*line->n_sample, (float *) values);
5749
0
    }
5750
0
    else if ( type==BCF_HT_STR )
5751
0
    {
5752
0
        bcf_enc_size(&str, nps, BCF_BT_CHAR);
5753
0
        kputsn((char*)values, nps*line->n_sample, &str);
5754
0
    }
5755
0
    else
5756
0
    {
5757
0
        hts_log_error("The type %d not implemented yet at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
5758
0
        abort();
5759
0
    }
5760
5761
0
    if ( !fmt )
5762
0
    {
5763
        // Not present, new format field
5764
0
        line->n_fmt++;
5765
0
        hts_expand0(bcf_fmt_t, line->n_fmt, line->d.m_fmt, line->d.fmt);
5766
5767
        // Special case: VCF specification requires that GT is always first
5768
0
        if ( line->n_fmt > 1 && key[0]=='G' && key[1]=='T' && !key[2] )
5769
0
        {
5770
0
            for (i=line->n_fmt-1; i>0; i--)
5771
0
                line->d.fmt[i] = line->d.fmt[i-1];
5772
0
            fmt = &line->d.fmt[0];
5773
0
        }
5774
0
        else
5775
0
            fmt = &line->d.fmt[line->n_fmt-1];
5776
0
        bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
5777
0
        line->d.indiv_dirty = 1;
5778
0
        fmt->p_free = 1;
5779
0
    }
5780
0
    else
5781
0
    {
5782
        // The tag is already present, check if it is big enough to accommodate the new block
5783
0
        if ( fmt->p && str.l <= fmt->p_len + fmt->p_off )
5784
0
        {
5785
            // good, the block is big enough
5786
0
            if ( str.l != fmt->p_len + fmt->p_off ) line->d.indiv_dirty = 1;
5787
0
            uint8_t *ptr = fmt->p - fmt->p_off;
5788
0
            memcpy(ptr, str.s, str.l);
5789
0
            free(str.s);
5790
0
            int p_free = fmt->p_free;
5791
0
            bcf_unpack_fmt_core1(ptr, line->n_sample, fmt);
5792
0
            fmt->p_free = p_free;
5793
0
        }
5794
0
        else
5795
0
        {
5796
0
            if ( fmt->p_free )
5797
0
                free(fmt->p - fmt->p_off);
5798
0
            bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
5799
0
            fmt->p_free = 1;
5800
0
            line->d.indiv_dirty = 1;
5801
0
        }
5802
0
    }
5803
0
    line->unpacked |= BCF_UN_FMT;
5804
5805
0
    if (is_len) {
5806
0
        line->rlen = get_rlen(hdr, line);
5807
0
    }
5808
0
    return 0;
5809
0
}
5810
5811
5812
int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n)
5813
0
{
5814
0
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
5815
0
    line->d.shared_dirty |= BCF1_DIRTY_FLT;
5816
0
    line->d.n_flt = n;
5817
0
    if ( !n ) return 0;
5818
0
    hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);
5819
0
    int i;
5820
0
    for (i=0; i<n; i++)
5821
0
        line->d.flt[i] = flt_ids[i];
5822
0
    return 0;
5823
0
}
5824
5825
int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id)
5826
0
{
5827
0
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
5828
0
    int i;
5829
0
    for (i=0; i<line->d.n_flt; i++)
5830
0
        if ( flt_id==line->d.flt[i] ) break;
5831
0
    if ( i<line->d.n_flt ) return 0;    // this filter is already set
5832
0
    line->d.shared_dirty |= BCF1_DIRTY_FLT;
5833
0
    if ( flt_id==0 )    // set to PASS
5834
0
        line->d.n_flt = 1;
5835
0
    else if ( line->d.n_flt==1 && line->d.flt[0]==0 )
5836
0
        line->d.n_flt = 1;
5837
0
    else
5838
0
        line->d.n_flt++;
5839
0
    hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);
5840
0
    line->d.flt[line->d.n_flt-1] = flt_id;
5841
0
    return 1;
5842
0
}
5843
int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int pass)
5844
0
{
5845
0
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
5846
0
    int i;
5847
0
    for (i=0; i<line->d.n_flt; i++)
5848
0
        if ( flt_id==line->d.flt[i] ) break;
5849
0
    if ( i==line->d.n_flt ) return 0;   // the filter is not present
5850
0
    line->d.shared_dirty |= BCF1_DIRTY_FLT;
5851
0
    if ( i!=line->d.n_flt-1 ) memmove(line->d.flt+i,line->d.flt+i+1,(line->d.n_flt-i-1)*sizeof(*line->d.flt));
5852
0
    line->d.n_flt--;
5853
0
    if ( !line->d.n_flt && pass ) bcf_add_filter(hdr,line,0);
5854
0
    return 0;
5855
0
}
5856
5857
int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter)
5858
0
{
5859
0
    if ( filter[0]=='.' && !filter[1] ) filter = "PASS";
5860
0
    int id = bcf_hdr_id2int(hdr, BCF_DT_ID, filter);
5861
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FLT,id) ) return -1;  // not defined in the header
5862
5863
0
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
5864
0
    if ( id==0 && !line->d.n_flt) return 1; // PASS
5865
5866
0
    int i;
5867
0
    for (i=0; i<line->d.n_flt; i++)
5868
0
        if ( line->d.flt[i]==id ) return 1;
5869
0
    return 0;
5870
0
}
5871
5872
static inline int _bcf1_sync_alleles(const bcf_hdr_t *hdr, bcf1_t *line, int nals)
5873
0
{
5874
0
    line->d.shared_dirty |= BCF1_DIRTY_ALS;
5875
0
    line->d.var_type = -1;
5876
5877
0
    line->n_allele = nals;
5878
0
    hts_expand(char*, line->n_allele, line->d.m_allele, line->d.allele);
5879
5880
0
    char *als = line->d.als;
5881
0
    int n = 0;
5882
0
    while (n<nals)
5883
0
    {
5884
0
        line->d.allele[n] = als;
5885
0
        while ( *als ) als++;
5886
0
        als++;
5887
0
        n++;
5888
0
    }
5889
    // Update REF length. Note that END is 1-based while line->pos 0-based
5890
0
    line->rlen = get_rlen(hdr, line);
5891
5892
0
    return 0;
5893
0
}
5894
int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals)
5895
0
{
5896
0
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
5897
0
    char *free_old = NULL;
5898
0
    char buffer[256];
5899
0
    size_t used = 0;
5900
5901
    // The pointers in alleles may point into the existing line->d.als memory,
5902
    // so care needs to be taken not to clobber them while updating.  Usually
5903
    // they will be short so we can copy through an intermediate buffer.
5904
    // If they're longer, or won't fit in the existing allocation we
5905
    // can allocate a new buffer to write into.  Note that in either case
5906
    // pointers to line->d.als memory in alleles may not be valid when we've
5907
    // finished.
5908
0
    int i;
5909
0
    size_t avail = line->d.m_als < sizeof(buffer) ? line->d.m_als : sizeof(buffer);
5910
0
    for (i=0; i<nals; i++) {
5911
0
        size_t sz = strlen(alleles[i]) + 1;
5912
0
        if (avail - used < sz)
5913
0
            break;
5914
0
        memcpy(buffer + used, alleles[i], sz);
5915
0
        used += sz;
5916
0
    }
5917
5918
    // Did we miss anything?
5919
0
    if (i < nals) {
5920
0
        int j;
5921
0
        size_t needed = used;
5922
0
        char *new_als;
5923
0
        for (j = i; j < nals; j++)
5924
0
            needed += strlen(alleles[j]) + 1;
5925
0
        if (needed < line->d.m_als) // Don't shrink the buffer
5926
0
            needed = line->d.m_als;
5927
0
        if (needed > INT_MAX) {
5928
0
            hts_log_error("REF + alleles too long to fit in a BCF record");
5929
0
            return -1;
5930
0
        }
5931
0
        new_als = malloc(needed);
5932
0
        if (!new_als)
5933
0
            return -1;
5934
0
        free_old = line->d.als;
5935
0
        line->d.als = new_als;
5936
0
        line->d.m_als = needed;
5937
0
    }
5938
5939
    // Copy from the temp buffer to the destination
5940
0
    if (used) {
5941
0
        assert(used <= line->d.m_als);
5942
0
        memcpy(line->d.als, buffer, used);
5943
0
    }
5944
5945
    // Add in any remaining entries - if this happens we will always be
5946
    // writing to a newly-allocated buffer.
5947
0
    for (; i < nals; i++) {
5948
0
        size_t sz = strlen(alleles[i]) + 1;
5949
0
        memcpy(line->d.als + used, alleles[i], sz);
5950
0
        used += sz;
5951
0
    }
5952
5953
0
    if (free_old)
5954
0
        free(free_old);
5955
0
    return _bcf1_sync_alleles(hdr,line,nals);
5956
0
}
5957
5958
int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string)
5959
0
{
5960
0
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
5961
0
    kstring_t tmp;
5962
0
    tmp.l = 0; tmp.s = line->d.als; tmp.m = line->d.m_als;
5963
0
    kputs(alleles_string, &tmp);
5964
0
    line->d.als = tmp.s; line->d.m_als = tmp.m;
5965
5966
0
    int nals = 1;
5967
0
    char *t = line->d.als;
5968
0
    while (*t)
5969
0
    {
5970
0
        if ( *t==',' ) { *t = 0; nals++; }
5971
0
        t++;
5972
0
    }
5973
0
    return _bcf1_sync_alleles(hdr, line, nals);
5974
0
}
5975
5976
int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
5977
0
{
5978
0
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
5979
0
    kstring_t tmp;
5980
0
    tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id;
5981
0
    if ( id )
5982
0
        kputs(id, &tmp);
5983
0
    else
5984
0
        kputs(".", &tmp);
5985
0
    line->d.id = tmp.s; line->d.m_id = tmp.m;
5986
0
    line->d.shared_dirty |= BCF1_DIRTY_ID;
5987
0
    return 0;
5988
0
}
5989
5990
int bcf_add_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
5991
0
{
5992
0
    if ( !id ) return 0;
5993
0
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
5994
5995
0
    kstring_t tmp;
5996
0
    tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id;
5997
5998
0
    int len = strlen(id);
5999
0
    char *dst = line->d.id;
6000
0
    while ( *dst && (dst=strstr(dst,id)) )
6001
0
    {
6002
0
        if ( dst[len]!=0 && dst[len]!=';' ) dst++;              // a prefix, not a match
6003
0
        else if ( dst==line->d.id || dst[-1]==';' ) return 0;   // already present
6004
0
        dst++;  // a suffix, not a match
6005
0
    }
6006
0
    if ( line->d.id && (line->d.id[0]!='.' || line->d.id[1]) )
6007
0
    {
6008
0
        tmp.l = strlen(line->d.id);
6009
0
        kputc(';',&tmp);
6010
0
    }
6011
0
    kputs(id,&tmp);
6012
6013
0
    line->d.id = tmp.s; line->d.m_id = tmp.m;
6014
0
    line->d.shared_dirty |= BCF1_DIRTY_ID;
6015
0
    return 0;
6016
6017
0
}
6018
6019
bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
6020
0
{
6021
0
    int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
6022
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) ) return NULL;   // no such FMT field in the header
6023
0
    return bcf_get_fmt_id(line, id);
6024
0
}
6025
6026
bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
6027
0
{
6028
0
    int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
6029
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,id) ) return NULL;   // no such INFO field in the header
6030
0
    return bcf_get_info_id(line, id);
6031
0
}
6032
6033
bcf_fmt_t *bcf_get_fmt_id(bcf1_t *line, const int id)
6034
0
{
6035
0
    int i;
6036
0
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
6037
0
    for (i=0; i<line->n_fmt; i++)
6038
0
    {
6039
0
        if ( line->d.fmt[i].id==id ) return &line->d.fmt[i];
6040
0
    }
6041
0
    return NULL;
6042
0
}
6043
6044
bcf_info_t *bcf_get_info_id(bcf1_t *line, const int id)
6045
0
{
6046
0
    int i;
6047
0
    if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
6048
0
    for (i=0; i<line->n_info; i++)
6049
0
    {
6050
0
        if ( line->d.info[i].key==id ) return &line->d.info[i];
6051
0
    }
6052
0
    return NULL;
6053
0
}
6054
6055
6056
int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
6057
0
{
6058
0
    int i, ret = -4, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
6059
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,tag_id) ) return -1;    // no such INFO field in the header
6060
0
    if ( bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id)!=(type & 0xff) ) return -2;     // expected different type
6061
6062
0
    if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
6063
6064
0
    for (i=0; i<line->n_info; i++)
6065
0
        if ( line->d.info[i].key==tag_id ) break;
6066
0
    if ( i==line->n_info ) return ( type==BCF_HT_FLAG ) ? 0 : -3;       // the tag is not present in this record
6067
0
    if ( type==BCF_HT_FLAG ) return 1;
6068
6069
0
    bcf_info_t *info = &line->d.info[i];
6070
0
    if ( !info->vptr ) return -3;           // the tag was marked for removal
6071
0
    if ( type==BCF_HT_STR )
6072
0
    {
6073
0
        if ( *ndst < info->len+1 )
6074
0
        {
6075
0
            *ndst = info->len + 1;
6076
0
            *dst  = realloc(*dst, *ndst);
6077
0
        }
6078
0
        memcpy(*dst,info->vptr,info->len);
6079
0
        ((uint8_t*)*dst)[info->len] = 0;
6080
0
        return info->len;
6081
0
    }
6082
6083
    // Make sure the buffer is big enough
6084
0
    int size1;
6085
0
    switch (type) {
6086
0
        case BCF_HT_INT:  size1 = sizeof(int32_t); break;
6087
0
        case BCF_HT_LONG: size1 = sizeof(int64_t); break;
6088
0
        case BCF_HT_REAL: size1 = sizeof(float); break;
6089
0
        default:
6090
0
            hts_log_error("Unexpected output type %d at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
6091
0
            return -2;
6092
0
    }
6093
0
    if ( *ndst < info->len )
6094
0
    {
6095
0
        *ndst = info->len;
6096
0
        *dst  = realloc(*dst, *ndst * size1);
6097
0
    }
6098
6099
0
    #define BRANCH(type_t, convert, is_missing, is_vector_end, set_missing, set_regular, out_type_t) do { \
6100
0
        out_type_t *tmp = (out_type_t *) *dst; \
6101
0
        int j; \
6102
0
        for (j=0; j<info->len; j++) \
6103
0
        { \
6104
0
            type_t p = convert(info->vptr + j * sizeof(type_t)); \
6105
0
            if ( is_vector_end ) break; \
6106
0
            if ( is_missing ) set_missing; \
6107
0
            else set_regular; \
6108
0
            tmp++; \
6109
0
        } \
6110
0
        ret = j; \
6111
0
    } while (0)
6112
0
    switch (info->type) {
6113
0
        case BCF_BT_INT8:
6114
0
            if (type == BCF_HT_LONG) {
6115
0
                BRANCH(int8_t,  le_to_i8,  p==bcf_int8_missing,  p==bcf_int8_vector_end,  *tmp=bcf_int64_missing, *tmp=p, int64_t);
6116
0
            } else {
6117
0
                BRANCH(int8_t,  le_to_i8,  p==bcf_int8_missing,  p==bcf_int8_vector_end,  *tmp=bcf_int32_missing, *tmp=p, int32_t);
6118
0
            }
6119
0
            break;
6120
0
        case BCF_BT_INT16:
6121
0
            if (type == BCF_HT_LONG) {
6122
0
                BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t);
6123
0
            } else {
6124
0
                BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t);
6125
0
            }
6126
0
            break;
6127
0
        case BCF_BT_INT32:
6128
0
            if (type == BCF_HT_LONG) {
6129
0
                BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t); break;
6130
0
            } else {
6131
0
                BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); break;
6132
0
            }
6133
0
        case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set(tmp, p), float); break;
6134
0
        default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, info->type, bcf_seqname_safe(hdr,line), line->pos+1); return -2;
6135
0
    }
6136
0
    #undef BRANCH
6137
0
    return ret;  // set by BRANCH
6138
0
}
6139
6140
int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst)
6141
0
{
6142
0
    int i,tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
6143
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1;    // no such FORMAT field in the header
6144
0
    if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2;     // expected different type
6145
6146
0
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
6147
6148
0
    for (i=0; i<line->n_fmt; i++)
6149
0
        if ( line->d.fmt[i].id==tag_id ) break;
6150
0
    if ( i==line->n_fmt ) return -3;                               // the tag is not present in this record
6151
0
    bcf_fmt_t *fmt = &line->d.fmt[i];
6152
0
    if ( !fmt->p ) return -3;                                      // the tag was marked for removal
6153
6154
0
    int nsmpl = bcf_hdr_nsamples(hdr);
6155
0
    if ( !*dst )
6156
0
    {
6157
0
        *dst = (char**) malloc(sizeof(char*)*nsmpl);
6158
0
        if ( !*dst ) return -4;     // could not alloc
6159
0
        (*dst)[0] = NULL;
6160
0
    }
6161
0
    int n = (fmt->n+1)*nsmpl;
6162
0
    if ( *ndst < n )
6163
0
    {
6164
0
        (*dst)[0] = realloc((*dst)[0], n);
6165
0
        if ( !(*dst)[0] ) return -4;    // could not alloc
6166
0
        *ndst = n;
6167
0
    }
6168
0
    for (i=0; i<nsmpl; i++)
6169
0
    {
6170
0
        uint8_t *src = fmt->p + i*fmt->n;
6171
0
        uint8_t *tmp = (uint8_t*)(*dst)[0] + i*(fmt->n+1);
6172
0
        memcpy(tmp,src,fmt->n);
6173
0
        tmp[fmt->n] = 0;
6174
0
        (*dst)[i] = (char*) tmp;
6175
0
    }
6176
0
    return n;
6177
0
}
6178
6179
int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
6180
0
{
6181
0
    int i,j, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
6182
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1;    // no such FORMAT field in the header
6183
0
    if ( tag[0]=='G' && tag[1]=='T' && tag[2]==0 )
6184
0
    {
6185
        // Ugly: GT field is considered to be a string by the VCF header but BCF represents it as INT.
6186
0
        if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2;
6187
0
    }
6188
0
    else if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=type ) return -2;     // expected different type
6189
6190
0
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
6191
6192
0
    for (i=0; i<line->n_fmt; i++)
6193
0
        if ( line->d.fmt[i].id==tag_id ) break;
6194
0
    if ( i==line->n_fmt ) return -3;                               // the tag is not present in this record
6195
0
    bcf_fmt_t *fmt = &line->d.fmt[i];
6196
0
    if ( !fmt->p ) return -3;                                      // the tag was marked for removal
6197
6198
0
    if ( type==BCF_HT_STR )
6199
0
    {
6200
0
        int n = fmt->n*bcf_hdr_nsamples(hdr);
6201
0
        if ( *ndst < n )
6202
0
        {
6203
0
            *dst  = realloc(*dst, n);
6204
0
            if ( !*dst ) return -4;     // could not alloc
6205
0
            *ndst = n;
6206
0
        }
6207
0
        memcpy(*dst,fmt->p,n);
6208
0
        return n;
6209
0
    }
6210
6211
    // Make sure the buffer is big enough
6212
0
    int nsmpl = bcf_hdr_nsamples(hdr);
6213
0
    int size1 = type==BCF_HT_INT ? sizeof(int32_t) : sizeof(float);
6214
0
    if ( *ndst < fmt->n*nsmpl )
6215
0
    {
6216
0
        *ndst = fmt->n*nsmpl;
6217
0
        *dst  = realloc(*dst, *ndst*size1);
6218
0
        if ( !*dst ) return -4;     // could not alloc
6219
0
    }
6220
6221
0
    #define BRANCH(type_t, convert, is_missing, is_vector_end, set_missing, set_vector_end, set_regular, out_type_t) { \
6222
0
        out_type_t *tmp = (out_type_t *) *dst; \
6223
0
        uint8_t *fmt_p = fmt->p; \
6224
0
        for (i=0; i<nsmpl; i++) \
6225
0
        { \
6226
0
            for (j=0; j<fmt->n; j++) \
6227
0
            { \
6228
0
                type_t p = convert(fmt_p + j * sizeof(type_t)); \
6229
0
                if ( is_missing ) set_missing; \
6230
0
                else if ( is_vector_end ) { set_vector_end; break; } \
6231
0
                else set_regular; \
6232
0
                tmp++; \
6233
0
            } \
6234
0
            for (; j<fmt->n; j++) { set_vector_end; tmp++; } \
6235
0
            fmt_p += fmt->size; \
6236
0
        } \
6237
0
    }
6238
0
    switch (fmt->type) {
6239
0
        case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8, p==bcf_int8_missing,  p==bcf_int8_vector_end,  *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
6240
0
        case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
6241
0
        case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
6242
0
        case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set_vector_end(*tmp), bcf_float_set(tmp, p), float); break;
6243
0
        default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, fmt->type, bcf_seqname_safe(hdr,line), line->pos+1); exit(1);
6244
0
    }
6245
0
    #undef BRANCH
6246
6247
0
    return nsmpl*fmt->n;
6248
0
}
6249
6250
//error description structure definition
6251
typedef struct err_desc {
6252
    int  errorcode;
6253
    const char *description;
6254
}err_desc;
6255
6256
// error descriptions
6257
static const err_desc errdesc_bcf[] = {
6258
    { BCF_ERR_CTG_UNDEF, "Contig not defined in header"},
6259
    { BCF_ERR_TAG_UNDEF, "Tag not defined in header" },
6260
    { BCF_ERR_NCOLS, "Incorrect number of columns" },
6261
    { BCF_ERR_LIMITS, "Limits reached" },
6262
    { BCF_ERR_CHAR, "Invalid character" },
6263
    { BCF_ERR_CTG_INVALID, "Invalid contig" },
6264
    { BCF_ERR_TAG_INVALID, "Invalid tag" },
6265
};
6266
6267
/// append given description to buffer based on available size and add ... when not enough space
6268
    /** @param buffer       buffer to which description to be appended
6269
        @param offset       offset at which to be appended
6270
        @param maxbuffer    maximum size of the buffer
6271
        @param description  the description to be appended
6272
on failure returns -1 - when buffer is not big enough; returns -1 on invalid params and on too small buffer which are improbable due to validation at caller site
6273
on success returns 0
6274
    */
6275
2.26k
static int add_desc_to_buffer(char *buffer, size_t *offset, size_t maxbuffer, const char *description) {
6276
6277
2.26k
    if (!description || !buffer || !offset || (maxbuffer < 4))
6278
0
        return -1;
6279
6280
2.26k
    size_t rembuffer = maxbuffer - *offset;
6281
2.26k
    if (rembuffer > (strlen(description) + (rembuffer == maxbuffer ? 0 : 1))) {    //add description with optionally required ','
6282
2.26k
        *offset += snprintf(buffer + *offset, rembuffer, "%s%s", (rembuffer == maxbuffer)? "": ",", description);
6283
2.26k
    } else {    //not enough space for description, put ...
6284
0
        size_t tmppos = (rembuffer <= 4) ? maxbuffer - 4 : *offset;
6285
0
        snprintf(buffer + tmppos, 4, "...");    //ignore offset update
6286
0
        return -1;
6287
0
    }
6288
2.26k
    return 0;
6289
2.26k
}
6290
6291
//get description for given error code. return NULL on error
6292
1.08k
const char *bcf_strerror(int errorcode, char *buffer, size_t maxbuffer) {
6293
1.08k
    size_t usedup = 0;
6294
1.08k
    int ret = 0;
6295
1.08k
    int idx;
6296
6297
1.08k
    if (!buffer || maxbuffer < 4)
6298
0
        return NULL;           //invalid / insufficient buffer
6299
6300
1.08k
    if (!errorcode) {
6301
0
        buffer[0] = '\0';      //no error, set null
6302
0
        return buffer;
6303
0
    }
6304
6305
8.71k
    for (idx = 0; idx < sizeof(errdesc_bcf) / sizeof(err_desc); ++idx) {
6306
7.62k
        if (errorcode & errdesc_bcf[idx].errorcode) {    //error is set, add description
6307
2.26k
            ret = add_desc_to_buffer(buffer, &usedup, maxbuffer, errdesc_bcf[idx].description);
6308
2.26k
            if (ret < 0)
6309
0
                break;         //not enough space, ... added, no need to continue
6310
6311
2.26k
            errorcode &= ~errdesc_bcf[idx].errorcode;    //reset the error
6312
2.26k
        }
6313
7.62k
    }
6314
6315
1.08k
    if (errorcode && (ret >= 0))  {     //undescribed error is present in error code and had enough buffer, try to add unkonwn error as well§
6316
0
        add_desc_to_buffer(buffer, &usedup, maxbuffer, "Unknown error");
6317
0
    }
6318
1.08k
    return buffer;
6319
1.08k
}
6320
6321
/**
6322
 *  bcf_format_gt_v2 - formats GT information on a string
6323
 *  @param hdr - bcf header, to get version
6324
 *  @param fmt - pointer to bcf format data
6325
 *  @param isample - position of interested sample in data
6326
 *  @param str - pointer to output string
6327
 *  Returns 0 on success and -1 on failure
6328
 *  This method is preferred over bcf_format_gt as this supports vcf4.4 and
6329
 *  prefixed phasing. Explicit / prefixed phasing for 1st allele is used only
6330
 *  when it is a must to correctly express phasing.
6331
 * correctly express phasing.
6332
 */
6333
int bcf_format_gt_v2(const bcf_hdr_t *hdr, bcf_fmt_t *fmt, int isample, kstring_t *str)
6334
15.3k
{
6335
15.3k
    uint32_t e = 0;
6336
15.3k
    int ploidy = 1, anyunphased = 0;
6337
15.3k
    int32_t val0 = 0;
6338
15.3k
    size_t pos = str ? str->l : 0;
6339
6340
15.3k
    #define BRANCH(type_t, convert, missing, vector_end) { \
6341
12.8k
        uint8_t *ptr = fmt->p + isample*fmt->size; \
6342
12.8k
        int i; \
6343
29.7k
        for (i=0; i<fmt->n; i++, ptr += sizeof(type_t)) \
6344
23.5k
        { \
6345
23.5k
            type_t val = convert(ptr); \
6346
23.5k
            if ( val == vector_end ) break; \
6347
23.5k
            if (!i) { val0 = val; } \
6348
16.9k
            if (i) { \
6349
4.06k
                e |= kputc("/|"[val & 1], str) < 0; \
6350
4.06k
                anyunphased |= !(val & 1); \
6351
4.06k
            } \
6352
16.9k
            if (!(val >> 1)) e |= kputc('.', str) < 0; \
6353
16.9k
            else e |= kputw((val >> 1) - 1, str) < 0; \
6354
16.9k
        } \
6355
12.8k
        if (i == 0) e |= kputc('.', str) < 0; \
6356
12.8k
        ploidy = i; \
6357
12.8k
    }
6358
15.3k
    switch (fmt->type) {
6359
5.76k
        case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8,  bcf_int8_missing,
6360
5.76k
            bcf_int8_vector_end); break;
6361
2.43k
        case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, bcf_int16_missing,
6362
2.43k
            bcf_int16_vector_end); break;
6363
4.65k
        case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, bcf_int32_missing,
6364
4.65k
            bcf_int32_vector_end); break;
6365
2.49k
        case BCF_BT_NULL:  e |= kputc('.', str) < 0; break;
6366
0
        default: hts_log_error("Unexpected type %d", fmt->type); return -2;
6367
15.3k
    }
6368
15.3k
    #undef BRANCH
6369
6370
15.3k
    if (hdr && get_hdr_aux(hdr)->version >= VCF44) {
6371
        //output which supports prefixed phasing
6372
6373
        /* update 1st allele's phasing if required and append rest to it.
6374
        use prefixed phasing only when it is a must. i.e. without which the
6375
        inferred value will be incorrect */
6376
6.71k
        if (val0 & 1) {
6377
            /* 1st one is phased, if ploidy is > 1 and an unphased allele exists
6378
             need to specify explicitly */
6379
1.21k
            e |= (ploidy > 1 && anyunphased) ?
6380
529
                    (kinsert_char('|', pos, str) < 0) :
6381
1.21k
                        (ploidy <= 1 && !((val0 >> 1)) ? //|. needs explicit o/p
6382
0
                            (kinsert_char('|', pos, str) < 0) :
6383
690
                            0);
6384
5.49k
        } else {
6385
            /* 1st allele is unphased, if ploidy is = 1 or allele is '.' or
6386
             ploidy > 1 and no other unphased allele exist, need to specify
6387
             explicitly */
6388
5.49k
            e |= ((ploidy <= 1 && val0 != 0) || (ploidy > 1 && !anyunphased)) ?
6389
3.29k
                    (kinsert_char('/', pos, str) < 0) :
6390
5.49k
                    0;
6391
5.49k
        }
6392
6.71k
    }
6393
15.3k
    return e == 0 ? 0 : -1;
6394
15.3k
}
6395
6396
/**
6397
 *  get_rlen - calculates and returns rlen value
6398
 *  @param h - bcf header
6399
 *  @param v - bcf data
6400
 *  Returns rlen calculated on success and -1 on failure.
6401
 *  rlen calculation is dependent on vcf version and a few other field data.
6402
 *  When bcf decoded data is available, refers it. When not available, retrieves
6403
 *  required field data by seeking on the data stream.
6404
 *  Ideally pos & version be set appropriately before any info/format field
6405
 *  update to have proper rlen calculation.
6406
 *  As version is not kept properly updated in practice, it is ignored in calcs.
6407
 */
6408
static int64_t get_rlen(const bcf_hdr_t *h, bcf1_t *v)
6409
54.8k
{
6410
54.8k
    uint8_t *f = (uint8_t*)v->shared.s, *t = NULL,
6411
54.8k
        *e = (uint8_t*)v->shared.s + v->shared.l;
6412
54.8k
    int size, type, id, lenid, endid, svlenid, i, bad, gvcf = 0, use_svlen = 0;
6413
54.8k
    bcf_info_t *endinfo = NULL, *svleninfo = NULL, end_lcl, svlen_lcl;
6414
54.8k
    bcf_fmt_t *lenfmt = NULL, len_lcl;
6415
6416
    //holds SVLEN allele status for the max no of alleles
6417
54.8k
    uint8_t svlenals[8192];
6418
    //pos from info END, fmt LEN, info SVLEN
6419
54.8k
    hts_pos_t end = 0, end_fmtlen = 0, end_svlen = 0, hpos;
6420
54.8k
    int64_t len_ref = 0, len = 0, tmp;
6421
54.8k
    endid = bcf_hdr_id2int(h, BCF_DT_ID, "END");
6422
6423
    //initialise bytes which are to be used
6424
54.8k
    memset(svlenals, 0, 1 + v->n_allele / 8);
6425
6426
    //use decoded data where ever available and where not, get from stream
6427
54.8k
    if (v->unpacked & BCF_UN_STR || v->d.shared_dirty & BCF1_DIRTY_ALS) {
6428
0
        for (i = 1; i < v->n_allele; ++i) {
6429
            // check only symbolic alt alleles
6430
0
            if (v->d.allele[i][0] != '<')
6431
0
                continue;
6432
0
            if (svlen_on_ref_for_vcf_alt(v->d.allele[i], -1)) {
6433
                // del, dup or cnv allele, note to check corresponding svlen val
6434
0
                svlenals[i >> 3] |= 1 << (i & 7);
6435
0
                use_svlen = 1;
6436
0
            } else if (!strcmp(v->d.allele[i], "<*>") ||
6437
0
                         !strcmp(v->d.allele[i], "<NON_REF>")) {
6438
0
                gvcf = 1;   //gvcf present, have to check for LEN field
6439
0
            }
6440
0
        }
6441
0
        f += v->unpack_size[0] + v->unpack_size[1];
6442
0
        len_ref = v->n_allele ? strlen(v->d.allele[0]) : 0;
6443
54.8k
    } else if (f < e) {
6444
        //skip ID
6445
54.8k
        size = bcf_dec_size(f, &f, &type);
6446
54.8k
        f += size << bcf_type_shift[type];
6447
        // REF, ALT
6448
3.71M
        for (i = 0; i < v->n_allele; ++i) {
6449
            //check all alleles, w/o NUL
6450
3.66M
            size = bcf_dec_size(f, &f, &type);
6451
3.66M
            if (!i) {   //REF length
6452
54.8k
                len_ref = size;
6453
3.60M
            } else if (size > 0 && *f == '<') {
6454
11.4k
                if (svlen_on_ref_for_vcf_alt((char *) f, size)) {
6455
                    // del, dup or cnv allele, note to check corresponding svlen val
6456
0
                    svlenals[i >> 3] |= 1 << (i & 7);
6457
0
                    use_svlen = 1;
6458
11.4k
                } else if ((size == 3 && !strncmp((char*)f, "<*>", size)) ||
6459
7.66k
                    (size == 9 && !strncmp((char*)f, "<NON_REF>", size))) {
6460
4.00k
                    gvcf = 1;   //gvcf present, have to check for LEN field
6461
4.00k
                }
6462
11.4k
            }
6463
3.66M
            f += size << bcf_type_shift[type];
6464
3.66M
        }
6465
54.8k
    }
6466
    // FILTER
6467
54.8k
    if (v->unpacked & BCF_UN_FLT) {
6468
0
        f += v->unpack_size[2];
6469
54.8k
    } else if (f < e) {
6470
54.8k
        size = bcf_dec_size(f, &f, &type);
6471
54.8k
        f += size << bcf_type_shift[type];
6472
54.8k
    }
6473
6474
    // Only do SVLEN lookup if there are suitable symbolic alleles
6475
54.8k
    svlenid = use_svlen ? bcf_hdr_id2int(h, BCF_DT_ID, "SVLEN") : -1;
6476
6477
    // INFO
6478
54.8k
    if (svlenid >= 0 || endid >= 0 ) {  //only if end/svlen present
6479
25.3k
        if (v->unpacked & BCF_UN_INFO || v->d.shared_dirty & BCF1_DIRTY_INF) {
6480
0
            endinfo = bcf_get_info(h, v, "END");
6481
0
            svleninfo = bcf_get_info(h, v, "SVLEN");
6482
25.3k
        } else if (f < e) {
6483
35.7k
            for (i = 0; i < v->n_info; ++i) {
6484
25.9k
                id = bcf_dec_typed_int1(f, &t);
6485
25.9k
                if (id == endid) {  //END
6486
2.43k
                    t = bcf_unpack_info_core1(f, &end_lcl);
6487
2.43k
                    endinfo = &end_lcl;
6488
2.43k
                    if (svleninfo || svlenid < 0) {
6489
2.43k
                        break;  //already got svlen or no need to search further
6490
2.43k
                    }
6491
23.5k
                } else if (id == svlenid) { //SVLEN
6492
0
                    t = bcf_unpack_info_core1(f, &svlen_lcl);
6493
0
                    svleninfo = &svlen_lcl;
6494
0
                    if (endinfo || endid < 0 ) {
6495
0
                        break;  //already got end or no need to search further
6496
0
                    }
6497
23.5k
                } else {
6498
23.5k
                    f = t;
6499
23.5k
                    size = bcf_dec_size(f, &t, &type);
6500
23.5k
                    t += size << bcf_type_shift[type];
6501
23.5k
                }
6502
23.5k
                f = t;
6503
23.5k
            }
6504
12.1k
        }
6505
25.3k
    }
6506
6507
    // Only do LEN lookup if a <*> allele was found
6508
54.8k
    lenid = gvcf ? bcf_hdr_id2int(h, BCF_DT_ID, "LEN") : -1;
6509
6510
    // FORMAT
6511
54.8k
    if (lenid >= 0) {
6512
        //with LEN and has gvcf allele
6513
0
        f = (uint8_t*)v->indiv.s; t = NULL; e = (uint8_t*)v->indiv.s + v->indiv.l;
6514
0
        if (v->unpacked & BCF_UN_FMT || v->d.indiv_dirty) {
6515
0
            lenfmt = bcf_get_fmt(h, v, "LEN");
6516
0
        } else if (f < e) {
6517
0
            for (i = 0; i < v->n_fmt; ++i) {
6518
0
                id = bcf_dec_typed_int1(f, &t);
6519
0
                if (id == lenid) {
6520
0
                        t = bcf_unpack_fmt_core1(f, v->n_sample, &len_lcl);
6521
0
                    lenfmt = &len_lcl;
6522
0
                    break;  //that's all needed
6523
0
                } else {
6524
0
                    f = t;
6525
0
                    size = bcf_dec_size(f, &t, &type);
6526
0
                    t += size * v->n_sample << bcf_type_shift[type];
6527
0
                }
6528
0
                f = t;
6529
0
            }
6530
0
        }
6531
0
    }
6532
    //got required data, find end and rlen
6533
54.8k
    if (endinfo && endinfo->vptr) { //end position given by info END
6534
        //end info exists, not being deleted
6535
2.43k
        end = endinfo->v1.i;
6536
2.43k
        switch(endinfo->type) {
6537
0
            case BCF_BT_INT8:  end = end == bcf_int8_missing ? 0 : end;  break;
6538
0
            case BCF_BT_INT16: end = end == bcf_int16_missing ? 0 : end; break;
6539
0
            case BCF_BT_INT32: end = end == bcf_int32_missing ? 0 : end; break;
6540
0
            case BCF_BT_INT64: end = end == bcf_int64_missing ? 0 : end; break;
6541
2.43k
            default: end = 0; break; //invalid
6542
2.43k
        }
6543
2.43k
    }
6544
6545
54.8k
    if (svleninfo && svleninfo->vptr) {
6546
        //svlen info exists, not being deleted
6547
0
        bad = 0;
6548
        //get largest svlen corresponding to a <DEL> symbolic allele
6549
0
        for (i = 0; i < svleninfo->len && i + 1 < v->n_allele; ++i) {
6550
0
            if (!(svlenals[i >> 3] & (1 << ((i + 1) & 7))))
6551
0
                continue;
6552
6553
0
            switch(svleninfo->type) {
6554
0
                case BCF_BT_INT8:
6555
0
                    tmp = le_to_i8(&svleninfo->vptr[i]);
6556
0
                    tmp = tmp == bcf_int8_missing ? 0 : tmp;
6557
0
                break;
6558
0
                case BCF_BT_INT16:
6559
0
                    tmp = le_to_i16(&svleninfo->vptr[i * 2]);
6560
0
                    tmp = tmp == bcf_int16_missing ? 0 : tmp;
6561
0
                break;
6562
0
                case BCF_BT_INT32:
6563
0
                    tmp = le_to_i32(&svleninfo->vptr[i * 4]);
6564
0
                    tmp = tmp == bcf_int32_missing ? 0 : tmp;
6565
0
                break;
6566
0
                case BCF_BT_INT64:
6567
0
                    tmp = le_to_i64(&svleninfo->vptr[i * 8]);
6568
0
                    tmp = tmp == bcf_int64_missing ? 0 : tmp;
6569
0
                break;
6570
0
                default: //invalid
6571
0
                    tmp = 0;
6572
0
                    bad = 1;
6573
0
                break;
6574
0
            }
6575
0
            if (bad) {  //stop svlen check
6576
0
                len = 0;
6577
0
                break;
6578
0
            }
6579
6580
0
            tmp = tmp < 0 ? llabs(tmp) : tmp;
6581
0
            if (len < tmp) len = tmp;
6582
0
        }
6583
0
    }
6584
54.8k
    if ((!svleninfo || !len) && end) { //no svlen, infer from end
6585
0
        len = end > v->pos ? end - v->pos - 1 : 0;
6586
0
    }
6587
54.8k
    end_svlen = v->pos + len + 1;   //end position found from SVLEN
6588
6589
54.8k
    len = 0;
6590
54.8k
    if (lenfmt && lenfmt->p) {
6591
        //fmt len exists, not being deleted, has gvcf and version >= 4.5
6592
0
        int j = 0;
6593
0
        int64_t offset = 0;
6594
0
        bad = 0;
6595
0
        for (i = 0; i < v->n_sample; ++i) {
6596
0
            for (j = 0; j < lenfmt->n; ++j) {
6597
0
                switch(lenfmt->type) {
6598
0
                case BCF_BT_INT8:
6599
0
                    tmp = le_to_i8(lenfmt->p + offset + j);
6600
0
                    tmp = tmp == bcf_int8_missing ? 0 : tmp;
6601
0
                break;
6602
0
                case BCF_BT_INT16:
6603
0
                    tmp = le_to_i16(lenfmt->p + offset + j * 2);
6604
0
                    tmp = tmp == bcf_int16_missing ? 0 : tmp;
6605
0
                break;
6606
0
                case BCF_BT_INT32:
6607
0
                    tmp = le_to_i32(lenfmt->p + offset + j * 4);
6608
0
                    tmp = tmp == bcf_int32_missing ? 0 : tmp;
6609
0
                break;
6610
0
                case BCF_BT_INT64:
6611
0
                    tmp = le_to_i64(lenfmt->p + offset + j * 8);
6612
0
                    tmp = tmp == bcf_int64_missing ? 0 : tmp;
6613
0
                break;
6614
0
                default: //invalid
6615
0
                    bad = 1;
6616
0
                break;
6617
0
                }
6618
0
                if (bad) {  //stop LEN check
6619
0
                    len = 0;
6620
0
                    break;
6621
0
                }
6622
                //assumes only gvcf have valid LEN
6623
0
                if (len < tmp) len = tmp;
6624
0
            }
6625
0
            offset += j << bcf_type_shift[lenfmt->type];
6626
0
        }
6627
0
    }
6628
54.8k
    if ((!lenfmt || !len) && end) { //no fmt len, infer from end
6629
0
        len = end > v->pos ? end - v->pos : 0;
6630
0
    }
6631
54.8k
    end_fmtlen = v->pos + len;  //end position found from LEN
6632
6633
    //get largest pos, based on END, SVLEN, fmt LEN and length using it
6634
54.8k
    hpos = end < end_svlen ?
6635
14.2k
            end_svlen < end_fmtlen ? end_fmtlen : end_svlen :
6636
54.8k
            end < end_fmtlen ? end_fmtlen : end;
6637
54.8k
    len = hpos - v->pos;
6638
6639
    //NOTE: 'end' calculation be in sync with tbx.c:tbx_parse1
6640
6641
    /* rlen to be calculated based on version, END, SVLEN, fmt LEN, ref len.
6642
    Relevance of these fields vary across different vcf versions.
6643
    Many times, these info/fmt fields are used without version updates;
6644
    hence these fields are used for calculation disregarding vcf version */
6645
54.8k
    return len < len_ref ? len_ref : len;
6646
54.8k
}