Coverage Report

Created: 2025-10-10 07:28

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/vcf.c
Line
Count
Source
1
/*  vcf.c -- VCF/BCF API functions.
2
3
    Copyright (C) 2012, 2013 Broad Institute.
4
    Copyright (C) 2012-2025 Genome Research Ltd.
5
    Portions copyright (C) 2014 Intel Corporation.
6
7
    Author: Heng Li <lh3@sanger.ac.uk>
8
9
Permission is hereby granted, free of charge, to any person obtaining a copy
10
of this software and associated documentation files (the "Software"), to deal
11
in the Software without restriction, including without limitation the rights
12
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13
copies of the Software, and to permit persons to whom the Software is
14
furnished to do so, subject to the following conditions:
15
16
The above copyright notice and this permission notice shall be included in
17
all copies or substantial portions of the Software.
18
19
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25
DEALINGS IN THE SOFTWARE.  */
26
27
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
28
#include <config.h>
29
30
#include <stdio.h>
31
#include <assert.h>
32
#include <string.h>
33
#include <strings.h>
34
#include <stdlib.h>
35
#include <limits.h>
36
#include <stdint.h>
37
#include <inttypes.h>
38
#include <errno.h>
39
40
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
41
#include "fuzz_settings.h"
42
#endif
43
44
#include "htslib/vcf.h"
45
#include "htslib/bgzf.h"
46
#include "htslib/tbx.h"
47
#include "htslib/hfile.h"
48
#include "hts_internal.h"
49
#include "htslib/hts_endian.h"
50
#include "htslib/khash_str2int.h"
51
#include "htslib/kstring.h"
52
#include "htslib/sam.h"
53
#include "htslib/khash.h"
54
55
#if 0
56
// This helps on Intel a bit, often 6-7% faster VCF parsing.
57
// Conversely sometimes harms AMD Zen4 as ~9% slower.
58
// Possibly related to IPC differences.  However for now it's just a
59
// curiosity we ignore and stick with the simpler code.
60
//
61
// Left here as a hint for future explorers.
62
static inline int xstreq(const char *a, const char *b) {
63
    while (*a && *a == *b)
64
        a++, b++;
65
    return *a == *b;
66
}
67
68
#define KHASH_MAP_INIT_XSTR(name, khval_t) \
69
  KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, xstreq)
70
71
KHASH_MAP_INIT_XSTR(vdict, bcf_idinfo_t)
72
#else
73
KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t)
74
#endif
75
76
typedef khash_t(vdict) vdict_t;
77
78
KHASH_MAP_INIT_STR(hdict, bcf_hrec_t*)
79
typedef khash_t(hdict) hdict_t;
80
81
82
#include "htslib/kseq.h"
83
HTSLIB_EXPORT
84
uint32_t bcf_float_missing    = 0x7F800001;
85
86
HTSLIB_EXPORT
87
uint32_t bcf_float_vector_end = 0x7F800002;
88
89
HTSLIB_EXPORT
90
uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
91
92
static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, NULL, NULL}, .id = -1 };
93
94
/*
95
    Partial support for 64-bit POS and Number=1 INFO tags.
96
    Notes:
97
     - the support for 64-bit values is motivated by POS and INFO/END for large genomes
98
     - the use of 64-bit values does not conform to the specification
99
     - cannot output 64-bit BCF and if it does, it is not compatible with anything
100
     - experimental, use at your own risk
101
*/
102
#ifdef VCF_ALLOW_INT64
103
    #define BCF_MAX_BT_INT64 (0x7fffffffffffffff)       /* INT64_MAX, for internal use only */
104
    #define BCF_MIN_BT_INT64 -9223372036854775800LL     /* INT64_MIN + 8, for internal use only */
105
#endif
106
107
602
#define BCF_IS_64BIT (1<<30)
108
109
110
// Opaque structure with auxiliary data which allows bcf_hdr_t to be extended without breaking the ABI.
111
// Note that preserving the API and ABI requires that the first element is a vdict_t struct
112
// rather than a pointer, as user programs may (and in some cases do) access the dictionary
113
// directly as (vdict_t*)hdr->dict.
114
typedef struct
115
{
116
    vdict_t dict;   // bcf_hdr_t.dict[0] vdict_t dictionary which keeps bcf_idinfo_t for BCF_HL_FLT,BCF_HL_INFO,BCF_HL_FMT
117
    hdict_t *gen;   // hdict_t dictionary which keeps bcf_hrec_t* pointers for generic and structured fields
118
    size_t *key_len;// lengths of the h->id[BCF_DT_ID] strings; freed and reset to NULL by bcf_hdr_sync()
119
    int version;    // cached version in bcf_get_version() encoding; 0 is treated there as "not cached"
120
}
121
bcf_hdr_aux_t;
122
123
static inline bcf_hdr_aux_t *get_hdr_aux(const bcf_hdr_t *hdr)
124
309k
{
125
309k
    return (bcf_hdr_aux_t *)hdr->dict[0];
126
309k
}
127
128
//version macros
129
4.87k
#define VCF_DEF 4002000
130
41.9k
#define VCF44   4004000
131
#define VCF45   4005000
132
133
#define VCF_MAJOR_VER(x) ( (x) / 10000 / 100 )
134
#define VCF_MINOR_VER(x) ( ((x) % 1000000) / 1000 )
135
136
/**
137
 *  bcf_get_version - get the version as int
138
 *  @param hdr   - bcf header, to get version
139
 *  @param verstr- version string, which is already available
140
 *  Returns version on success and default version on failure
141
 *  version = major * 100 * 10000 + minor * 1000
142
 */
143
static int bcf_get_version(const bcf_hdr_t *hdr, const char *verstr)
144
17.5k
{
145
17.5k
    const char *version = NULL, vcf[] = "VCFv";
146
17.5k
    char *major = NULL, *minor = NULL;
147
17.5k
    int ver = -1;
148
17.5k
    long tmp = 0;
149
17.5k
    bcf_hdr_aux_t *aux = NULL;
150
151
17.5k
    if (!hdr && !verstr) {  //invalid input
152
0
        goto fail;
153
0
    }
154
155
17.5k
    if (hdr) {
156
12.3k
        if ((aux = get_hdr_aux(hdr)) && aux->version != 0) {    //use cached version
157
12.3k
            return aux->version;
158
12.3k
        }
159
        //get from header
160
56
        version = bcf_hdr_get_version(hdr);
161
5.21k
    } else {
162
        //get from version string
163
5.21k
        version = verstr;
164
5.21k
    }
165
5.27k
    if (!(major = strstr(version, vcf))) {  //bad format
166
4.43k
        goto fail;
167
4.43k
    }
168
838
    major += sizeof(vcf) - 1;
169
838
    if (!(minor = strchr(major, '.'))) {    //bad format
170
194
        goto fail;
171
194
    }
172
644
    tmp = strtol(major, NULL, 10);
173
644
    if ((!tmp && errno == EINVAL) ||
174
586
        ((tmp == LONG_MIN || tmp == LONG_MAX) && errno == ERANGE)) {    //failed
175
90
        goto fail;
176
90
    }
177
554
    ver = tmp * 100 * 10000;
178
554
    tmp = strtol(++minor, NULL, 10);
179
554
    if ((!tmp && errno == EINVAL) ||
180
515
        ((tmp == LONG_MIN || tmp == LONG_MAX) && errno == ERANGE)) {    //failed
181
159
        goto fail;
182
159
    }
183
395
    ver += tmp * 1000;
184
395
    return ver;
185
186
4.87k
fail:
187
4.87k
    hts_log_warning("Couldn't get VCF version, considering as %d.%d",
188
4.87k
        VCF_MAJOR_VER(VCF_DEF), VCF_MINOR_VER(VCF_DEF));
189
4.87k
    return VCF_DEF;
190
554
}
191
192
// Returns a pointer to the start of the "#CHROM\t..." line within s: either
// s itself when the text begins with it, or the character following the
// first newline that is directly followed by "#CHROM\t".  Returns NULL when
// there is no such line.
static char *find_chrom_header_line(char *s)
{
    static const char nl_chrom[] = "\n#CHROM\t";
    const char *chrom = nl_chrom + 1;               // same text without the newline
    const size_t chrom_len = sizeof(nl_chrom) - 2;  // without newline and NUL

    if (strncmp(s, chrom, chrom_len) == 0)
        return s;

    char *hit = strstr(s, nl_chrom);
    return hit ? hit + 1 : NULL;
}
199
200
static int64_t get_rlen(const bcf_hdr_t *h, bcf1_t *v);
201
202
/*************************
203
 *** VCF header parser ***
204
 *************************/
205
206
// Adds one sample, named by the first len bytes of s, to the header.
//
// The name is rejected when it is empty or consists of whitespace only
// within those len bytes, or when a sample of that name already exists.
// On success a private NUL-terminated copy of the name is stored both as the
// BCF_DT_SAMPLE dictionary key and in h->samples[], the new dictionary entry
// gets the next free id, and the header is marked dirty.
//
// Returns 0 on success, -1 on error (bad name, duplicate, out of memory).
static int bcf_hdr_add_sample_len(bcf_hdr_t *h, const char *s, size_t len)
{
    // Count leading whitespace; the bound is tested before each byte is
    // read so nothing beyond s[len-1] is touched.
    size_t nspace = 0;
    while ( nspace < len && s[nspace] && isspace_c(s[nspace]) ) nspace++;
    if ( nspace == len || !s[nspace] )
    {
        hts_log_error("Empty sample name: trailing spaces/tabs in the header line?");
        return -1;
    }

    vdict_t *d = (vdict_t*)h->dict[BCF_DT_SAMPLE];
    int ret;
    char *sdup = malloc(len + 1);
    if (!sdup) return -1;
    memcpy(sdup, s, len);
    sdup[len] = 0;

    // Ensure space is available in h->samples
    size_t n = kh_size(d);
    char **new_samples = realloc(h->samples, sizeof(char*) * (n + 1));
    if (!new_samples) {
        free(sdup);
        return -1;
    }
    h->samples = new_samples;

    // khint_t is the iterator type used with kh_val() below
    khint_t k = kh_put(vdict, d, sdup, &ret);
    if (ret < 0) {
        free(sdup);
        return -1;
    }
    if (!ret) { // already present
        hts_log_error("Duplicated sample name '%s'", sdup);
        free(sdup);
        return -1;
    }

    // New key: sdup is now referenced by the dictionary and by h->samples
    kh_val(d, k) = bcf_idinfo_def;
    kh_val(d, k).id = n;
    h->samples[n] = sdup;
    h->dirty = 1;
    return 0;
}
249
250
// Adds the NUL-terminated sample name s to the header; returns the result of
// bcf_hdr_add_sample_len().
//
// s == NULL is allowed for backwards-compatibility (such a call used to
// trigger bcf_hdr_sync(h)); it does nothing and returns 0.
int bcf_hdr_add_sample(bcf_hdr_t *h, const char *s)
{
    return s ? bcf_hdr_add_sample_len(h, s, strlen(s)) : 0;
}
259
260
// Parses the "#CHROM ..." header line in str and registers every sample
// name found after the FORMAT column.
//
// The line must start with the eight mandatory tab-separated column names.
// It may end there (at NUL or newline), in which case there are no samples;
// otherwise "\tFORMAT\t" has to follow.  Sample names are separated by tabs
// and the list ends at a newline or at the end of the string.
//
// Returns 0 on success and -1 on a malformed line or when a sample could not
// be added (parsing stops at the first such sample).
int HTS_RESULT_USED bcf_hdr_parse_sample_line(bcf_hdr_t *hdr, const char *str)
{
    static const char mandatory[] = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO";
    static const char format_col[] = "\tFORMAT\t";
    const size_t mandatory_len = sizeof(mandatory) - 1;
    const size_t format_len = sizeof(format_col) - 1;

    if ( strncmp(str, mandatory, mandatory_len) != 0 )
    {
        hts_log_error("Could not parse the \"#CHROM..\" line, either the fields are incorrect or spaces are present instead of tabs:\n\t%s",str);
        return -1;
    }

    const char *name = str + mandatory_len;
    if ( *name == '\0' || *name == '\n' ) return 0;     // sites-only header

    if ( strncmp(name, format_col, format_len) != 0 )
    {
        hts_log_error("Could not parse the \"#CHROM..\" line, either FORMAT is missing or spaces are present instead of tabs:\n\t%s",str);
        return -1;
    }
    name += format_len;

    while ( *name )
    {
        // The name runs up to the next tab, newline or end of string
        size_t name_len = strcspn(name, "\t\n");
        if ( bcf_hdr_add_sample_len(hdr, name, name_len) < 0 ) return -1;
        if ( name[name_len] != '\t' ) break;            // that was the last sample
        name += name_len + 1;
    }
    return 0;
}
289
290
// Brings the h->id[] arrays in line with the three header dictionaries.
//
// For each dictionary, h->id[] is enlarged when the dictionary holds more
// entries than h->n[] says, then every existing dictionary entry is written
// into the slot given by its id: key points at the dictionary key and val at
// the dictionary value.  The cached key lengths in the auxiliary data are
// dropped and the dirty flag is cleared.
//
// Returns 0 on success, -1 if the array could not be enlarged.
int bcf_hdr_sync(bcf_hdr_t *h)
{
    int type;
    for (type = 0; type < 3; type++)
    {
        vdict_t *dict = (vdict_t*)h->dict[type];

        if ( h->n[type] < kh_size(dict) )
        {
            // this should be true only for type=2, BCF_DT_SAMPLE
            bcf_idpair_t *grown = (bcf_idpair_t*) realloc(h->id[type], kh_size(dict)*sizeof(bcf_idpair_t));
            if (!grown) return -1;
            h->n[type] = kh_size(dict);
            h->id[type] = grown;
        }

        khint_t it;
        for (it = kh_begin(dict); it < kh_end(dict); it++)
        {
            if ( kh_exist(dict,it) )
            {
                bcf_idpair_t *pair = &h->id[type][kh_val(dict,it).id];
                pair->key = kh_key(dict,it);
                pair->val = &kh_val(dict,it);
            }
        }
    }

    // Invalidate key length cache
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
    if (aux) {
        free(aux->key_len);
        aux->key_len = NULL;
    }

    h->dirty = 0;
    return 0;
}
324
325
// Frees a header record together with everything it points at: key, value,
// each of the nkeys key/value strings and the two arrays holding them.
// Passing NULL is allowed and does nothing.
void bcf_hrec_destroy(bcf_hrec_t *hrec)
{
    if (hrec == NULL) return;

    free(hrec->key);
    free(hrec->value);

    int idx = 0;
    while (idx < hrec->nkeys)
    {
        free(hrec->keys[idx]);
        free(hrec->vals[idx]);
        idx++;
    }
    free(hrec->keys);
    free(hrec->vals);
    free(hrec);
}
340
341
// Copies all fields except IDX.
342
bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec)
343
0
{
344
0
    int save_errno;
345
0
    bcf_hrec_t *out = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
346
0
    if (!out) return NULL;
347
348
0
    out->type = hrec->type;
349
0
    if ( hrec->key ) {
350
0
        out->key = strdup(hrec->key);
351
0
        if (!out->key) goto fail;
352
0
    }
353
0
    if ( hrec->value ) {
354
0
        out->value = strdup(hrec->value);
355
0
        if (!out->value) goto fail;
356
0
    }
357
0
    out->nkeys = hrec->nkeys;
358
0
    out->keys = (char**) malloc(sizeof(char*)*hrec->nkeys);
359
0
    if (!out->keys) goto fail;
360
0
    out->vals = (char**) malloc(sizeof(char*)*hrec->nkeys);
361
0
    if (!out->vals) goto fail;
362
0
    int i, j = 0;
363
0
    for (i=0; i<hrec->nkeys; i++)
364
0
    {
365
0
        if ( hrec->keys[i] && !strcmp("IDX",hrec->keys[i]) ) continue;
366
0
        if ( hrec->keys[i] ) {
367
0
            out->keys[j] = strdup(hrec->keys[i]);
368
0
            if (!out->keys[j]) goto fail;
369
0
        }
370
0
        if ( hrec->vals[i] ) {
371
0
            out->vals[j] = strdup(hrec->vals[i]);
372
0
            if (!out->vals[j]) goto fail;
373
0
        }
374
0
        j++;
375
0
    }
376
0
    if ( i!=j ) out->nkeys -= i-j;   // IDX was omitted
377
0
    return out;
378
379
0
 fail:
380
0
    save_errno = errno;
381
0
    hts_log_error("%s", strerror(errno));
382
0
    bcf_hrec_destroy(out);
383
0
    errno = save_errno;
384
0
    return NULL;
385
0
}
386
387
// Writes a one-line dump of hrec to fp: "key=[..] value=[..]" followed by a
// tab-separated "[key]=[value]" for each of the nkeys pairs.
//
// Any string that is NULL is printed as an empty string.  A NULL value
// pointer is a normal state here (bcf_hrec_add_key() creates the pair with
// vals[i] == NULL), and passing NULL to "%s" is undefined.
void bcf_hrec_debug(FILE *fp, bcf_hrec_t *hrec)
{
    fprintf(fp, "key=[%s] value=[%s]",
            hrec->key ? hrec->key : "", hrec->value ? hrec->value : "");
    int i;
    for (i=0; i<hrec->nkeys; i++)
        fprintf(fp, "\t[%s]=[%s]",
                hrec->keys[i] ? hrec->keys[i] : "",
                hrec->vals[i] ? hrec->vals[i] : "");
    fprintf(fp, "\n");
}
395
396
// Prints all header records of hdr to stderr in "##key=value" form for
// records that have a value and "##key=<k1=v1,k2=v2,...>" form otherwise.
//
// A record with neither a value nor any key/value pairs is printed as
// "##key=<>" rather than reading keys[0]/vals[0], which do not exist then.
// NULL strings are printed as empty strings.
void bcf_header_debug(bcf_hdr_t *hdr)
{
    int i, j;
    for (i=0; i<hdr->nhrec; i++)
    {
        const bcf_hrec_t *hrec = hdr->hrec[i];
        const char *key = hrec->key ? hrec->key : "";

        if ( hrec->value )
        {
            fprintf(stderr,"##%s=%s\n", key, hrec->value);
            continue;
        }

        fprintf(stderr, "##%s=<", key);
        for (j=0; j<hrec->nkeys; j++)
            fprintf(stderr,"%s%s=%s", j ? "," : "",
                    hrec->keys[j] ? hrec->keys[j] : "",
                    hrec->vals[j] ? hrec->vals[j] : "");
        fprintf(stderr,">\n");
    }
}
413
414
// Appends a new key, copied from the first len bytes of str, to hrec.  The
// matching value is initialised to NULL; use bcf_hrec_set_val() to set it.
// len must be greater than zero (asserted).
//
// Returns 0 on success, -1 if memory could not be allocated; nkeys is only
// increased on success.
int bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, size_t len)
{
    const int slot = hrec->nkeys;
    const size_t wanted = hrec->nkeys + 1;
    char **grown;

    assert(len > 0 && len < SIZE_MAX);

    // Make room for one more entry in both arrays
    grown = realloc(hrec->keys, sizeof(char*)*wanted);
    if (!grown) return -1;
    hrec->keys = grown;

    grown = realloc(hrec->vals, sizeof(char*)*wanted);
    if (!grown) return -1;
    hrec->vals = grown;

    char *copy = (char*) malloc((len+1)*sizeof(char));
    hrec->keys[slot] = copy;
    if (!copy) return -1;

    memcpy(copy, str, len);
    copy[len] = 0;
    hrec->vals[slot] = NULL;
    hrec->nkeys = wanted;
    return 0;
}
434
435
// Replaces the i-th value of hrec with a copy of the first len bytes of str.
// With is_quoted set, the copy is wrapped in double quotes.  A NULL str just
// clears the value.  Any previous value is freed first.
//
// Returns 0 on success; -1 if len is too large (errno set to ENOMEM) or the
// allocation fails, in which case the value is left NULL.
int bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, size_t len, int is_quoted)
{
    char **slot = &hrec->vals[i];

    free(*slot);
    *slot = NULL;
    if ( str == NULL ) return 0;

    // Extra bytes: terminating NUL, plus the two quotes if requested
    const size_t extra = is_quoted ? 3 : 1;
    const size_t too_long = is_quoted ? SIZE_MAX - 3 : SIZE_MAX;
    if (len >= too_long) {
        errno = ENOMEM;
        return -1;
    }

    char *copy = (char*) malloc((len+extra)*sizeof(char));
    *slot = copy;
    if (!copy) return -1;

    char *out = copy;
    if ( is_quoted ) *out++ = '"';
    memcpy(out, str, len);
    out += len;
    if ( is_quoted ) *out++ = '"';
    *out = 0;
    return 0;
}
468
469
// Appends the pair IDX=<idx> to hrec, with idx written as a decimal number.
// Returns 0 on success, -1 if memory could not be allocated; nkeys is only
// increased on success.
int hrec_add_idx(bcf_hrec_t *hrec, int idx)
{
    const int slot = hrec->nkeys;
    const int wanted = slot + 1;
    char **grown;

    grown = (char**) realloc(hrec->keys, sizeof(char*)*wanted);
    if (!grown) return -1;
    hrec->keys = grown;

    grown = (char**) realloc(hrec->vals, sizeof(char*)*wanted);
    if (!grown) return -1;
    hrec->vals = grown;

    hrec->keys[slot] = strdup("IDX");
    if (!hrec->keys[slot]) return -1;

    // Format the number; the resulting buffer becomes the stored value
    kstring_t num = {0,0,0};
    if (kputw(idx, &num) < 0) {
        free(hrec->keys[slot]);
        return -1;
    }
    hrec->vals[slot] = num.s;
    hrec->nkeys = wanted;
    return 0;
}
492
493
// Looks up key among the keys of hrec, ignoring case.
// Returns the index of the first match, or -1 if there is none.
int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key)
{
    int pos = 0;
    while (pos < hrec->nkeys)
    {
        if ( strcasecmp(key,hrec->keys[pos]) == 0 ) return pos;
        pos++;
    }
    return -1;
}
500
501
// Sets hrec->type from the record's key: the four known keys (compared
// case-sensitively) map to their own type, anything else is a structured
// line if it has key/value pairs and a generic line otherwise.
static void bcf_hrec_set_type(bcf_hrec_t *hrec)
{
    static const struct { const char *key; int type; } known[] = {
        { "contig", BCF_HL_CTG  },
        { "INFO",   BCF_HL_INFO },
        { "FILTER", BCF_HL_FLT  },
        { "FORMAT", BCF_HL_FMT  },
    };
    size_t i;

    for (i = 0; i < sizeof(known)/sizeof(known[0]); i++)
    {
        if ( strcmp(hrec->key, known[i].key) == 0 )
        {
            hrec->type = known[i].type;
            return;
        }
    }
    hrec->type = hrec->nkeys>0 ? BCF_HL_STR : BCF_HL_GEN;
}
510
511
512
/**
513
    The arrays were generated with
514
515
    valid_ctg:
516
        perl -le '@v = (split(//,q[!#$%&*+./:;=?@^_|~-]),"a"..."z","A"..."Z","0"..."9"); @a = (0) x 256; foreach $c (@v) { $a[ord($c)] = 1; } print join(", ",@a)' | fold -w 48
517
518
    valid_tag:
519
        perl -le '@v = (split(//,q[_.]),"a"..."z","A"..."Z","0"..."9"); @a = (0) x 256; foreach $c (@v) { $a[ord($c)] = 1; } print join(", ",@a)' | fold -w 48
520
*/
521
static const uint8_t valid_ctg[256] =
522
{
523
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
524
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
525
    0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1,
526
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
527
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
528
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
529
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
530
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,
531
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
532
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
533
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
534
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
535
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
536
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
537
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
538
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
539
};
540
static const uint8_t valid_tag[256] =
541
{
542
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
543
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
544
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
545
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
546
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
547
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
548
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
549
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
550
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
551
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
552
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
553
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
554
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
555
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
556
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
557
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
558
};
559
560
/**
561
    bcf_hrec_check() - check the validity of structured header lines
562
563
    Returns 0 on success or negative value on error.
564
565
    Currently the return status is not checked by the caller
566
    and only a warning is printed on stderr. This should be improved
567
    to propagate the error all the way up to the caller and let it
568
    decide what to do: throw an error or proceed anyway.
569
 */
570
static int bcf_hrec_check(bcf_hrec_t *hrec)
571
150k
{
572
150k
    int i;
573
150k
    bcf_hrec_set_type(hrec);
574
575
150k
    if ( hrec->type==BCF_HL_CTG )
576
9.17k
    {
577
9.17k
        i = bcf_hrec_find_key(hrec,"ID");
578
9.17k
        if ( i<0 ) goto err_missing_id;
579
5.94k
        char *val = hrec->vals[i];
580
5.94k
        if ( val[0]=='*' || val[0]=='=' || !valid_ctg[(uint8_t)val[0]] ) goto err_invalid_ctg;
581
65.1k
        while ( *(++val) )
582
64.8k
            if ( !valid_ctg[(uint8_t)*val] ) goto err_invalid_ctg;
583
340
        return 0;
584
952
    }
585
140k
    if ( hrec->type==BCF_HL_INFO )
586
47.9k
    {
587
47.9k
        i = bcf_hrec_find_key(hrec,"ID");
588
47.9k
        if ( i<0 ) goto err_missing_id;
589
35.5k
        char *val = hrec->vals[i];
590
35.5k
        if ( !strcmp(val,"1000G") ) return 0;
591
35.5k
        if ( val[0]=='.' || (val[0]>='0' && val[0]<='9') || !valid_tag[(uint8_t)val[0]] ) goto err_invalid_tag;
592
8.72k
        while ( *(++val) )
593
7.23k
            if ( !valid_tag[(uint8_t)*val] ) goto err_invalid_tag;
594
1.49k
        return 0;
595
2.99k
    }
596
92.9k
    if ( hrec->type==BCF_HL_FMT )
597
16.5k
    {
598
16.5k
        i = bcf_hrec_find_key(hrec,"ID");
599
16.5k
        if ( i<0 ) goto err_missing_id;
600
12.4k
        char *val = hrec->vals[i];
601
12.4k
        if ( val[0]=='.' || (val[0]>='0' && val[0]<='9') || !valid_tag[(uint8_t)val[0]] ) goto err_invalid_tag;
602
19.1k
        while ( *(++val) )
603
16.0k
            if ( !valid_tag[(uint8_t)*val] ) goto err_invalid_tag;
604
3.04k
        return 0;
605
8.17k
    }
606
76.4k
    return 0;
607
608
19.7k
  err_missing_id:
609
19.7k
    hts_log_warning("Missing ID attribute in one or more header lines");
610
19.7k
    return -1;
611
612
5.60k
  err_invalid_ctg:
613
5.60k
    hts_log_warning("Invalid contig name: \"%s\"", hrec->vals[i]);
614
5.60k
    return -1;
615
616
43.4k
  err_invalid_tag:
617
43.4k
    hts_log_warning("Invalid tag name: \"%s\"", hrec->vals[i]);
618
43.4k
    return -1;
619
92.9k
}
620
621
// Tells whether the character at str is escaped, i.e. preceded by an odd
// number of consecutive backslashes.  Only characters at or after min are
// looked at.  Returns 1 if escaped, 0 if not.
//
// The scan compares before stepping back, so no pointer below min is ever
// formed (that would be undefined when min is the start of the buffer).
static inline int is_escaped(const char *min, const char *str)
{
    int n = 0;
    while ( str > min && str[-1]=='\\' ) { str--; n++; }
    return n%2;
}
627
628
bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len)
629
167k
{
630
167k
    bcf_hrec_t *hrec = NULL;
631
167k
    const char *p = line;
632
167k
    if (p[0] != '#' || p[1] != '#') { *len = 0; return NULL; }
633
161k
    p += 2;
634
635
161k
    const char *q = p;
636
1.26M
    while ( *q && *q!='=' && *q != '\n' ) q++;
637
161k
    ptrdiff_t n = q-p;
638
161k
    if ( *q!='=' || !n ) // wrong format
639
5.22k
        goto malformed_line;
640
641
156k
    hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
642
156k
    if (!hrec) { *len = -1; return NULL; }
643
156k
    hrec->key = (char*) malloc(sizeof(char)*(n+1));
644
156k
    if (!hrec->key) goto fail;
645
156k
    memcpy(hrec->key,p,n);
646
156k
    hrec->key[n] = 0;
647
156k
    hrec->type = -1;
648
649
156k
    p = ++q;
650
156k
    if ( *p!='<' ) // generic field, e.g. ##samtoolsVersion=0.1.18-r579
651
32.2k
    {
652
7.42M
        while ( *q && *q!='\n' ) q++;
653
32.2k
        hrec->value = (char*) malloc((q-p+1)*sizeof(char));
654
32.2k
        if (!hrec->value) goto fail;
655
32.2k
        memcpy(hrec->value, p, q-p);
656
32.2k
        hrec->value[q-p] = 0;
657
32.2k
        *len = q - line + (*q ? 1 : 0); // Skip \n but not \0
658
32.2k
        return hrec;
659
32.2k
    }
660
661
    // structured line, e.g.
662
    // ##INFO=<ID=PV1,Number=1,Type=Float,Description="P-value for baseQ bias">
663
    // ##PEDIGREE=<Name_0=G0-ID,Name_1=G1-ID,Name_3=GN-ID>
664
123k
    int nopen = 1;
665
414k
    while ( *q && *q!='\n' && nopen>0 )
666
296k
    {
667
296k
        p = ++q;
668
297k
        while ( *q && *q==' ' ) { p++; q++; }
669
        // ^[A-Za-z_][0-9A-Za-z_.]*$
670
296k
        if (p==q && *q && (isalpha_c(*q) || *q=='_'))
671
295k
        {
672
295k
            q++;
673
1.62M
            while ( *q && (isalnum_c(*q) || *q=='_' || *q=='.') ) q++;
674
295k
        }
675
296k
        n = q-p;
676
296k
        int m = 0;
677
297k
        while ( *q && *q==' ' ) { q++; m++; }
678
296k
        if ( *q!='=' || !n )
679
6.02k
            goto malformed_line;
680
681
290k
        if (bcf_hrec_add_key(hrec, p, q-p-m) < 0) goto fail;
682
290k
        p = ++q;
683
293k
        while ( *q && *q==' ' ) { p++; q++; }
684
685
290k
        int quoted = 0;
686
290k
        char ending = '\0';
687
290k
        switch (*p) {
688
82.2k
        case '"':
689
82.2k
            quoted = 1;
690
82.2k
            ending = '"';
691
82.2k
            p++;
692
82.2k
            break;
693
49
        case '[':
694
49
            quoted = 1;
695
49
            ending = ']';
696
49
            break;
697
290k
        }
698
290k
        if ( quoted ) q++;
699
184M
        while ( *q && *q != '\n' )
700
184M
        {
701
184M
            if ( quoted ) { if ( *q==ending && !is_escaped(p,q) ) break; }
702
182M
            else
703
182M
            {
704
182M
                if ( *q=='<' ) nopen++;
705
182M
                if ( *q=='>' ) nopen--;
706
182M
                if ( !nopen ) break;
707
182M
                if ( *q==',' && nopen==1 ) break;
708
182M
            }
709
183M
            q++;
710
183M
        }
711
290k
        const char *r = q;
712
290k
        if (quoted && ending == ']') {
713
49
            if (*q == ending) {
714
33
                r++;
715
33
                q++;
716
33
                quoted = 0;
717
33
            } else {
718
16
                char buffer[320];
719
16
                hts_log_error("Missing ']' in header line %s",
720
16
                              hts_strprint(buffer, sizeof(buffer), '"',
721
16
                                           line, q-line));
722
16
                goto fail;
723
16
            }
724
49
        }
725
291k
        while ( r > p && r[-1] == ' ' ) r--;
726
290k
        if (bcf_hrec_set_val(hrec, hrec->nkeys-1, p, r-p, quoted) < 0)
727
0
            goto fail;
728
290k
        if ( quoted && *q==ending ) q++;
729
290k
        if ( *q=='>' )
730
87.0k
        {
731
87.0k
            if (nopen) nopen--;     // this can happen with nested angle brackets <>
732
87.0k
            q++;
733
87.0k
        }
734
290k
    }
735
117k
    if ( nopen )
736
30.9k
        hts_log_warning("Incomplete header line, trying to proceed anyway:\n\t[%s]\n\t[%d]",line,q[0]);
737
738
    // Skip to end of line
739
117k
    int nonspace = 0;
740
117k
    p = q;
741
6.76M
    while ( *q && *q!='\n' ) { nonspace |= !isspace_c(*q); q++; }
742
117k
    if (nonspace) {
743
600
        char buffer[320];
744
600
        hts_log_warning("Dropped trailing junk from header line '%s'",
745
600
                        hts_strprint(buffer, sizeof(buffer),
746
600
                                     '"', line, q - line));
747
600
    }
748
749
117k
    *len = q - line + (*q ? 1 : 0);
750
117k
    return hrec;
751
752
16
 fail:
753
16
    *len = -1;
754
16
    bcf_hrec_destroy(hrec);
755
16
    return NULL;
756
757
11.2k
 malformed_line:
758
11.2k
    {
759
11.2k
        char buffer[320];
760
287k
        while ( *q && *q!='\n' ) q++;  // Ensure *len includes full line
761
11.2k
        hts_log_error("Could not parse the header line: %s",
762
11.2k
                      hts_strprint(buffer, sizeof(buffer),
763
11.2k
                                   '"', line, q - line));
764
11.2k
        *len = q - line + (*q ? 1 : 0);
765
11.2k
        bcf_hrec_destroy(hrec);
766
11.2k
        return NULL;
767
123k
    }
768
123k
}
769
770
// Reserves slot idinfo->id of hdr->id[dict_type] for tag.
//
// An id of -1 means "no IDX given" and is replaced by the next free index,
// hdr->n[dict_type].  An explicit id is kept, unless that slot already has a
// key, which is reported as a conflict.  The array is enlarged (new slots
// cleared) as needed and hdr->n[dict_type] updated.
//
// Returns 0 on success, -1 on error; errno is set to EINVAL for a conflict.
static int bcf_hdr_set_idx(bcf_hdr_t *hdr, int dict_type, const char *tag, bcf_idinfo_t *idinfo)
{
    const int used = hdr->n[dict_type];

    // If available, preserve existing IDX
    if ( idinfo->id==-1 )
    {
        idinfo->id = used;
    }
    else if ( idinfo->id < used && hdr->id[dict_type][idinfo->id].key )
    {
        hts_log_error("Conflicting IDX=%d lines in the header dictionary, the new tag is %s",
            idinfo->id, tag);
        errno = EINVAL;
        return -1;
    }

    // Number of slots needed so that index idinfo->id exists
    size_t new_n = idinfo->id >= used ? idinfo->id+1 : used;
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
    // hts_resize() can attempt to allocate up to 2 * requested items
    if (new_n > FUZZ_ALLOC_LIMIT/(2 * sizeof(bcf_idpair_t)))
        return -1;
#endif
    if (hts_resize(bcf_idpair_t, new_n, &hdr->m[dict_type],
                   &hdr->id[dict_type], HTS_RESIZE_CLEAR)) {
        return -1;
    }
    hdr->n[dict_type] = new_n;

    // NB: the next kh_put call can invalidate the idinfo pointer, therefore
    // we leave it unassigned here. It must be set explicitly in bcf_hdr_sync.
    hdr->id[dict_type][idinfo->id].key = tag;

    return 0;
}
803
804
// returns: 1 when hdr needs to be synced, -1 on error, 0 otherwise
805
static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
806
150k
{
807
    // contig
808
150k
    int i, ret, replacing = 0;
809
150k
    khint_t k;
810
150k
    char *str = NULL;
811
812
150k
    bcf_hrec_set_type(hrec);
813
814
150k
    if ( hrec->type==BCF_HL_CTG )
815
9.17k
    {
816
9.17k
        hts_pos_t len = 0;
817
818
        // Get the contig ID ($str) and length ($j)
819
9.17k
        i = bcf_hrec_find_key(hrec,"length");
820
9.17k
        if ( i<0 ) len = 0;
821
1.70k
        else {
822
1.70k
            char *end = hrec->vals[i];
823
1.70k
            len = strtoll(hrec->vals[i], &end, 10);
824
1.70k
            if (end == hrec->vals[i] || len < 0) return 0;
825
1.70k
        }
826
827
8.14k
        i = bcf_hrec_find_key(hrec,"ID");
828
8.14k
        if ( i<0 ) return 0;
829
5.94k
        str = strdup(hrec->vals[i]);
830
5.94k
        if (!str) return -1;
831
832
        // Register in the dictionary
833
5.94k
        vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_CTG];
834
5.94k
        khint_t k = kh_get(vdict, d, str);
835
5.94k
        if ( k != kh_end(d) ) { // already present
836
757
            free(str); str=NULL;
837
757
            if (kh_val(d, k).hrec[0] != NULL) // and not removed
838
757
                return 0;
839
0
            replacing = 1;
840
5.19k
        } else {
841
5.19k
            k = kh_put(vdict, d, str, &ret);
842
5.19k
            if (ret < 0) { free(str); return -1; }
843
5.19k
        }
844
845
5.19k
        int idx = bcf_hrec_find_key(hrec,"IDX");
846
5.19k
        if ( idx!=-1 )
847
1.03k
        {
848
1.03k
            char *tmp = hrec->vals[idx];
849
1.03k
            idx = strtol(hrec->vals[idx], &tmp, 10);
850
1.03k
            if ( *tmp || idx < 0 || idx >= INT_MAX - 1)
851
1.01k
            {
852
1.01k
                if (!replacing) {
853
1.01k
                    kh_del(vdict, d, k);
854
1.01k
                    free(str);
855
1.01k
                }
856
1.01k
                hts_log_warning("Error parsing the IDX tag, skipping");
857
1.01k
                return 0;
858
1.01k
            }
859
1.03k
        }
860
861
4.18k
        kh_val(d, k) = bcf_idinfo_def;
862
4.18k
        kh_val(d, k).id = idx;
863
4.18k
        kh_val(d, k).info[0] = len;
864
4.18k
        kh_val(d, k).hrec[0] = hrec;
865
4.18k
        if (bcf_hdr_set_idx(hdr, BCF_DT_CTG, kh_key(d,k), &kh_val(d,k)) < 0) {
866
6
            if (!replacing) {
867
6
                kh_del(vdict, d, k);
868
6
                free(str);
869
6
            }
870
6
            return -1;
871
6
        }
872
4.17k
        if ( idx==-1 ) {
873
4.15k
            if (hrec_add_idx(hrec, kh_val(d,k).id) < 0) {
874
0
               return -1;
875
0
            }
876
4.15k
        }
877
878
4.17k
        return 1;
879
4.17k
    }
880
881
140k
    if ( hrec->type==BCF_HL_STR ) return 1;
882
134k
    if ( hrec->type!=BCF_HL_INFO && hrec->type!=BCF_HL_FLT && hrec->type!=BCF_HL_FMT ) return 0;
883
884
    // INFO/FILTER/FORMAT
885
107k
    char *id = NULL;
886
107k
    uint32_t type = UINT32_MAX, var = UINT32_MAX;
887
107k
    int num = -1, idx = -1;
888
374k
    for (i=0; i<hrec->nkeys; i++)
889
267k
    {
890
267k
        if ( !strcmp(hrec->keys[i], "ID") ) id = hrec->vals[i];
891
176k
        else if ( !strcmp(hrec->keys[i], "IDX") )
892
872
        {
893
872
            char *tmp = hrec->vals[i];
894
872
            idx = strtol(hrec->vals[i], &tmp, 10);
895
872
            if ( *tmp || idx < 0 || idx >= INT_MAX - 1)
896
150
            {
897
150
                hts_log_warning("Error parsing the IDX tag, skipping");
898
150
                return 0;
899
150
            }
900
872
        }
901
175k
        else if ( !strcmp(hrec->keys[i], "Type") )
902
47.7k
        {
903
47.7k
            if ( !strcmp(hrec->vals[i], "Integer") ) type = BCF_HT_INT;
904
44.6k
            else if ( !strcmp(hrec->vals[i], "Float") ) type = BCF_HT_REAL;
905
43.8k
            else if ( !strcmp(hrec->vals[i], "String") ) type = BCF_HT_STR;
906
6.61k
            else if ( !strcmp(hrec->vals[i], "Character") ) type = BCF_HT_STR;
907
6.46k
            else if ( !strcmp(hrec->vals[i], "Flag") ) type = BCF_HT_FLAG;
908
4.74k
            else
909
4.74k
            {
910
4.74k
                hts_log_warning("The type \"%s\" is not supported, assuming \"String\"", hrec->vals[i]);
911
4.74k
                type = BCF_HT_STR;
912
4.74k
            }
913
47.7k
        }
914
128k
        else if ( !strcmp(hrec->keys[i], "Number") )
915
40.7k
        {
916
40.7k
            if ( !strcmp(hrec->vals[i],"A") ) var = BCF_VL_A;
917
39.6k
            else if ( !strcmp(hrec->vals[i],"R") ) var = BCF_VL_R;
918
39.4k
            else if ( !strcmp(hrec->vals[i],"G") ) var = BCF_VL_G;
919
38.7k
            else if ( !strcmp(hrec->vals[i],".") ) var = BCF_VL_VAR;
920
38.7k
            else
921
38.7k
            {
922
38.7k
                sscanf(hrec->vals[i],"%d",&num);
923
38.7k
                var = BCF_VL_FIXED;
924
38.7k
            }
925
40.7k
            if (var != BCF_VL_FIXED) num = 0xfffff;
926
40.7k
        }
927
267k
    }
928
107k
    if (hrec->type == BCF_HL_INFO || hrec->type == BCF_HL_FMT) {
929
64.3k
        if (type == -1) {
930
18.0k
            hts_log_warning("%s %s field has no Type defined. Assuming String",
931
18.0k
                *hrec->key == 'I' ? "An" : "A", hrec->key);
932
18.0k
            type = BCF_HT_STR;
933
18.0k
        }
934
64.3k
        if (var == -1) {
935
23.6k
            hts_log_warning("%s %s field has no Number defined. Assuming '.'",
936
23.6k
                *hrec->key == 'I' ? "An" : "A", hrec->key);
937
23.6k
            var = BCF_VL_VAR;
938
23.6k
        }
939
64.3k
        if ( type==BCF_HT_FLAG && (var!=BCF_VL_FIXED || num!=0) )
940
1.25k
        {
941
1.25k
            hts_log_warning("The definition of Flag \"%s/%s\" is invalid, forcing Number=0", hrec->key,id);
942
1.25k
            var = BCF_VL_FIXED;
943
1.25k
            num = 0;
944
1.25k
        }
945
64.3k
    }
946
107k
    uint32_t info = ((((uint32_t)num) & 0xfffff)<<12 |
947
107k
                     (var & 0xf) << 8 |
948
107k
                     (type & 0xf) << 4 |
949
107k
                     (((uint32_t) hrec->type) & 0xf));
950
951
107k
    if ( !id ) return 0;
952
90.7k
    str = strdup(id);
953
90.7k
    if (!str) return -1;
954
955
90.7k
    vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_ID];
956
90.7k
    k = kh_get(vdict, d, str);
957
90.7k
    if ( k != kh_end(d) )
958
8.80k
    {
959
        // already present
960
8.80k
        free(str);
961
8.80k
        if ( kh_val(d, k).hrec[info&0xf] ) return 0;
962
1.61k
        kh_val(d, k).info[info&0xf] = info;
963
1.61k
        kh_val(d, k).hrec[info&0xf] = hrec;
964
1.61k
        if ( idx==-1 ) {
965
1.60k
            if (hrec_add_idx(hrec, kh_val(d, k).id) < 0) {
966
0
                return -1;
967
0
            }
968
1.60k
        }
969
1.61k
        return 1;
970
1.61k
    }
971
81.9k
    k = kh_put(vdict, d, str, &ret);
972
81.9k
    if (ret < 0) {
973
0
        free(str);
974
0
        return -1;
975
0
    }
976
81.9k
    kh_val(d, k) = bcf_idinfo_def;
977
81.9k
    kh_val(d, k).info[info&0xf] = info;
978
81.9k
    kh_val(d, k).hrec[info&0xf] = hrec;
979
81.9k
    kh_val(d, k).id = idx;
980
81.9k
    if (bcf_hdr_set_idx(hdr, BCF_DT_ID, kh_key(d,k), &kh_val(d,k)) < 0) {
981
2
        kh_del(vdict, d, k);
982
2
        free(str);
983
2
        return -1;
984
2
    }
985
81.9k
    if ( idx==-1 ) {
986
81.5k
        if (hrec_add_idx(hrec, kh_val(d,k).id) < 0) {
987
0
            return -1;
988
0
        }
989
81.5k
    }
990
991
81.9k
    return 1;
992
81.9k
}
993
994
// Clear the dictionary's pointer to a FILTER/INFO/FORMAT/contig record.
// The dictionary entry itself (key and index) stays in place; only the
// hrec slot for this record's type is set to NULL.  Records of other types,
// records without an ID value, and IDs not present in the dictionary are
// left untouched.
static void bcf_hdr_unregister_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
{
    int is_contig;

    switch (hrec->type) {
    case BCF_HL_CTG:
        is_contig = 1;
        break;
    case BCF_HL_FLT:
    case BCF_HL_INFO:
    case BCF_HL_FMT:
        is_contig = 0;
        break;
    default:
        return;     // nothing to do for any other record type
    }

    int id_key = bcf_hrec_find_key(hrec, "ID");
    if (id_key < 0)
        return;
    if (hrec->vals[id_key] == NULL)
        return;

    vdict_t *dict;
    if (is_contig)
        dict = (vdict_t*)hdr->dict[BCF_DT_CTG];
    else
        dict = (vdict_t*)hdr->dict[BCF_DT_ID];

    khint_t slot = kh_get(vdict, dict, hrec->vals[id_key]);
    if (slot == kh_end(dict))
        return;

    // Contigs use slot 0; the other types are indexed by their line type
    kh_val(dict, slot).hrec[is_contig ? 0 : hrec->type] = NULL;
}
1011
1012
// Remove a generic (BCF_HL_GEN) or structured (BCF_HL_STR) record from the
// aux->gen hash.  The hash key is rebuilt from the record ("##key=value" or
// "##key=<ID=value>"); if that is not possible the table is scanned for an
// entry whose value is this record.  The entry is only removed when it
// really refers to hrec.  Other record types are ignored.
static void bcf_hdr_remove_from_hdict(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
{
    kstring_t hkey = KS_INITIALIZE;
    bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
    khint_t slot;

    if (hrec->type == BCF_HL_GEN) {
        if (ksprintf(&hkey, "##%s=%s", hrec->key,hrec->value) < 0)
            hkey.l = 0;
    } else if (hrec->type == BCF_HL_STR) {
        int id_key = bcf_hrec_find_key(hrec, "ID");
        if (id_key < 0)
            return;     // structured lines without ID are not hashed here
        if (hrec->vals[id_key] == NULL ||
            ksprintf(&hkey, "##%s=<ID=%s>", hrec->key, hrec->vals[id_key]) < 0)
            hkey.l = 0;
    } else {
        return;
    }

    if (hkey.l != 0) {
        slot = kh_get(hdict, aux->gen, hkey.s);
    } else {
        // Couldn't get a string for some reason, so try the hard way...
        slot = kh_begin(aux->gen);
        while (slot < kh_end(aux->gen)
               && !(kh_exist(aux->gen, slot) && kh_val(aux->gen, slot) == hrec))
            slot++;
    }

    if (slot != kh_end(aux->gen) && kh_val(aux->gen, slot) == hrec) {
        // The table owns its key strings, so release the key with the entry
        kh_val(aux->gen, slot) = NULL;
        free((char *) kh_key(aux->gen, slot));
        kh_key(aux->gen, slot) = NULL;
        kh_del(hdict, aux->gen, slot);
    }
    free(hkey.s);
}
1052
1053
// Replace the value of a generic header record and re-key it in the
// aux->gen hash.
//
//   hdr   header owning the record
//   hrec  the BCF_HL_GEN record to update (must already be in aux->gen)
//   tmp   record supplying the new key/value text; it is only read
//
// The new hash key and the copy of the new value are allocated before the
// header is touched, so an allocation failure at that stage leaves both the
// hash and hrec unchanged.  If hrec->key is "fileformat" the cached version
// is refreshed.
//
// Returns 0 on success, -1 on failure.
int bcf_hdr_update_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec, const bcf_hrec_t *tmp)
{
    assert( hrec->type==BCF_HL_GEN );
    int ret;
    khint_t k;
    bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
    kstring_t str = {0,0,0};
    char *new_value;

    // Find the hash entry that currently refers to this record
    for (k=kh_begin(aux->gen); k<kh_end(aux->gen); k++)
    {
        if ( !kh_exist(aux->gen,k) ) continue;
        if ( hrec!=(bcf_hrec_t*)kh_val(aux->gen,k) ) continue;
        break;
    }
    assert( k<kh_end(aux->gen) );   // something went wrong, should never happen
    if ( k>=kh_end(aux->gen) ) return -1;   // same check for NDEBUG builds

    // Do everything that can fail on allocation before modifying the header
    if ( ksprintf(&str, "##%s=%s", tmp->key,tmp->value) < 0 )
    {
        free(str.s);
        return -1;
    }
    new_value = strdup(tmp->value);
    if ( !new_value )
    {
        free(str.s);
        return -1;
    }

    free((char*)kh_key(aux->gen,k));
    kh_del(hdict,aux->gen,k);

    k = kh_put(hdict, aux->gen, str.s, &ret);
    if ( ret<0 )
    {
        free(str.s);
        free(new_value);
        return -1;
    }
    // ret==0: an equal key was already in the table, which keeps its own
    // copy of the string; ours was not stored and must be released.
    if ( ret==0 ) free(str.s);

    free(hrec->value);
    hrec->value = new_value;
    kh_val(aux->gen,k) = hrec;

    if (!strcmp(hrec->key,"fileformat")) {
        //update version
        get_hdr_aux(hdr)->version = bcf_get_version(NULL, hrec->value);
    }
    return 0;
}
1091
1092
// Add a parsed header record to the header.
//
// On success the header takes ownership of hrec: it is either appended to
// hdr->hrec, or destroyed when an equivalent record is already present.
// A NULL hrec is accepted and ignored.
//
// Returns -1 on failure (hrec is not destroyed in that case), 0 when the
// record was a duplicate, NULL, or a generic line, and 1 when a
// non-generic record was added.
int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
{
    kstring_t hkey = {0,0,0};       // key for the aux->gen hash, if any
    bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
    khint_t slot;
    int status, id_key, n_new;
    bcf_hrec_t **grown;

    if ( hrec == NULL ) return 0;

    bcf_hrec_check(hrec);   // todo: check return status and propagate errors up

    status = bcf_hdr_register_hrec(hdr,hrec);
    if ( status < 0 ) return -1;

    if ( status == 0 )
    {
        // If one of the hashed field, then it is already present
        if ( hrec->type != BCF_HL_GEN ) goto duplicate;

        // Is one of the generic fields and already present?
        if ( ksprintf(&hkey, "##%s=%s", hrec->key,hrec->value) < 0 ) goto fail;
        slot = kh_get(hdict, aux->gen, hkey.s);
        if ( slot != kh_end(aux->gen) ) goto duplicate;

        if ( strcmp(hrec->key, "fileformat") == 0 )
            aux->version = bcf_get_version(NULL, hrec->value);
    }

    if ( hrec->type == BCF_HL_STR )
    {
        id_key = bcf_hrec_find_key(hrec,"ID");
        if ( id_key >= 0 )
        {
            if ( ksprintf(&hkey, "##%s=<ID=%s>", hrec->key,hrec->vals[id_key]) < 0 ) goto fail;
            slot = kh_get(hdict, aux->gen, hkey.s);
            if ( slot != kh_end(aux->gen) ) goto duplicate;
        }
    }

    // New record, needs to be added
    n_new = hdr->nhrec + 1;
    grown = realloc(hdr->hrec, n_new*sizeof(bcf_hrec_t*));
    if ( grown == NULL )
    {
        free(hkey.s);
        bcf_hdr_unregister_hrec(hdr, hrec);
        return -1;
    }
    hdr->hrec = grown;

    if ( hkey.s != NULL )
    {
        // The hash takes over the key string on success
        slot = kh_put(hdict, aux->gen, hkey.s, &status);
        if ( status < 0 ) goto fail;
        kh_val(aux->gen,slot) = hrec;
    }

    hdr->hrec[hdr->nhrec] = hrec;
    hdr->dirty = 1;
    hdr->nhrec = n_new;

    if ( hrec->type == BCF_HL_GEN ) return 0;
    return 1;

 duplicate:
    // An equivalent record already exists: drop this one
    bcf_hrec_destroy(hrec);
    free(hkey.s);
    return 0;

 fail:
    free(hkey.s);
    return -1;
}
1176
1177
bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class)
1178
56
{
1179
56
    int i;
1180
56
    if ( type==BCF_HL_GEN )
1181
56
    {
1182
        // e.g. ##fileformat=VCFv4.2
1183
        //      ##source=GenomicsDBImport
1184
        //      ##bcftools_viewVersion=1.16-80-gdfdb0923+htslib-1.16-34-g215d364
1185
56
        if ( value )
1186
0
        {
1187
0
            kstring_t str = {0,0,0};
1188
0
            ksprintf(&str, "##%s=%s", key,value);
1189
0
            bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1190
0
            khint_t k = kh_get(hdict, aux->gen, str.s);
1191
0
            free(str.s);
1192
0
            if ( k == kh_end(aux->gen) ) return NULL;
1193
0
            return kh_val(aux->gen, k);
1194
0
        }
1195
56
        for (i=0; i<hdr->nhrec; i++)
1196
56
        {
1197
56
            if ( hdr->hrec[i]->type!=type ) continue;
1198
56
            if ( strcmp(hdr->hrec[i]->key,key) ) continue;
1199
56
            return hdr->hrec[i];
1200
56
        }
1201
0
        return NULL;
1202
56
    }
1203
0
    else if ( type==BCF_HL_STR )
1204
0
    {
1205
        // e.g. ##GATKCommandLine=<ID=GenomicsDBImport,CommandLine="GenomicsDBImport....">
1206
        //      ##ALT=<ID=NON_REF,Description="Represents any possible alternative allele not already represented at this location by REF and ALT">
1207
0
        if (!str_class) return NULL;
1208
0
        if ( !strcmp("ID",key) )
1209
0
        {
1210
0
            kstring_t str = {0,0,0};
1211
0
            ksprintf(&str, "##%s=<%s=%s>",str_class,key,value);
1212
0
            bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1213
0
            khint_t k = kh_get(hdict, aux->gen, str.s);
1214
0
            free(str.s);
1215
0
            if ( k == kh_end(aux->gen) ) return NULL;
1216
0
            return kh_val(aux->gen, k);
1217
0
        }
1218
0
        for (i=0; i<hdr->nhrec; i++)
1219
0
        {
1220
0
            if ( hdr->hrec[i]->type!=type ) continue;
1221
0
            if ( strcmp(hdr->hrec[i]->key,str_class) ) continue;
1222
0
            int j = bcf_hrec_find_key(hdr->hrec[i],key);
1223
0
            if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],value) ) return hdr->hrec[i];
1224
0
        }
1225
0
        return NULL;
1226
0
    }
1227
0
    vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
1228
0
    khint_t k = kh_get(vdict, d, value);
1229
0
    if ( k == kh_end(d) ) return NULL;
1230
0
    return kh_val(d, k).hrec[type==BCF_HL_CTG?0:type];
1231
0
}
1232
1233
// Warn when the FORMAT tags PL or GL are defined with a length other than
// Number=G.  Each warning is printed at most once per process, which is
// tracked with function-local static flags.
void bcf_hdr_check_sanity(bcf_hdr_t *hdr)
{
    static int warned_pl = 0, warned_gl = 0;
    int tag_id;

    if ( warned_pl == 0 )
    {
        tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, "PL");
        if ( bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) )
        {
            if ( bcf_hdr_id2length(hdr,BCF_HL_FMT,tag_id) != BCF_VL_G )
            {
                hts_log_warning("PL should be declared as Number=G");
                warned_pl = 1;
            }
        }
    }

    if ( warned_gl == 0 )
    {
        tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, "GL");
        if ( bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) )
        {
            if ( bcf_hdr_id2length(hdr,BCF_HL_FMT,tag_id) != BCF_VL_G )
            {
                hts_log_warning("GL should be declared as Number=G");
                warned_gl = 1;
            }
        }
    }
}
1256
1257
// Parse a complete, NUL-terminated header text into hdr.
//
// Steps: (1) parse the first line and warn unless it is ##fileformat;
// (2) add the PASS filter so that it comes first in the dictionary;
// (3) parse all "##" lines, skipping malformed ones; (4) parse the #CHROM
// sample line; (5) sync the header and run the sanity checks.
//
// Returns 0 on success, -1 on failure (a record could not be added, a line
// could not be parsed because of an error such as out-of-memory, the sample
// line is missing or invalid, or the sync failed).
int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt)
{
    int len, done = 0;
    char *cur = htxt;
    bcf_hrec_t *rec;

    // Check sanity: "fileformat" string must come as first
    rec = bcf_hdr_parse_line(hdr,cur,&len);
    if ( !rec || !rec->key || strcasecmp(rec->key,"fileformat") )
        hts_log_warning("The first line should be ##fileformat; is the VCF/BCF header broken?");
    if ( bcf_hdr_add_hrec(hdr, rec) < 0 )
    {
        bcf_hrec_destroy(rec);
        return -1;
    }

    // The filter PASS must appear first in the dictionary
    rec = bcf_hdr_parse_line(hdr,"##FILTER=<ID=PASS,Description=\"All filters passed\">",&len);
    if ( !rec || bcf_hdr_add_hrec(hdr, rec) < 0 )
    {
        bcf_hrec_destroy(rec);
        return -1;
    }

    // Parse the whole header.  Note that cur was not advanced above, so the
    // first line is seen again here.
    while ( !done )
    {
        // Consume as many well-formed header lines as possible
        for (;;)
        {
            rec = bcf_hdr_parse_line(hdr, cur, &len);
            if ( rec == NULL ) break;
            if ( bcf_hdr_add_hrec(hdr, rec) < 0 )
            {
                bcf_hrec_destroy(rec);
                return -1;
            }
            cur += len;
        }

        if ( len < 0 )
        {
            // len < 0 indicates out-of-memory, or similar error
            hts_log_error("Could not parse header line: %s", strerror(errno));
            return -1;
        }
        if ( len > 0 )
        {
            // Bad header line.  bcf_hdr_parse_line() will have logged it.
            // Skip and try again on the next line (cur + len will be the
            // start of the next one).
            cur += len;
            continue;
        }

        // Next should be the sample line.
        if ( strncmp("#CHROM\t",cur,7) == 0 || strncmp("#CHROM ",cur,7) == 0 )
        {
            done = 1; // Sample line found
            continue;
        }

        // Not the sample line, so it was a malformed header: print a
        // warning and skip (many VCF operations do not really care about a
        // few malformed lines).  In the future we may want to add a strict
        // mode that errors in this case.
        char *eol = strchr(cur, '\n');
        if ( *cur != '\0' )
        {
            char buffer[320];
            hts_log_warning("Could not parse header line: %s",
                            hts_strprint(buffer, sizeof(buffer),
                                           '"', cur,
                                           eol ? (eol - cur) : SIZE_MAX));
        }
        if ( eol )
            cur = eol + 1; // Try from the next line.
        else
            done = -1; // No more lines left, give up.
    }

    if ( done < 0 )
    {
        // No sample line is fatal.
        hts_log_error("Could not parse the header, sample line not found");
        return -1;
    }

    if ( bcf_hdr_parse_sample_line(hdr,cur) < 0 ) return -1;
    if ( bcf_hdr_sync(hdr) < 0 ) return -1;
    bcf_hdr_check_sanity(hdr);
    return 0;
}
1337
1338
// Parse a single header line and add it to the header.
//
//   hdr   header to modify
//   line  text of one header line; it is passed to bcf_hdr_parse_line()
//
// Returns 0 on success (including the case where the line duplicates an
// existing record) and -1 if the line cannot be parsed or cannot be added.
int bcf_hdr_append(bcf_hdr_t *hdr, const char *line)
{
    int len;
    bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr, (char*) line, &len);
    if ( !hrec ) return -1;
    if (bcf_hdr_add_hrec(hdr, hrec) < 0)
    {
        // The header did not take ownership of the record, so release it
        // here, as bcf_hdr_parse() does on the same failure.
        bcf_hrec_destroy(hrec);
        return -1;
    }
    return 0;
}
1347
1348
// Remove header records of the given type.
//
//   type  one of the BCF_HL_* line types
//   key   NULL to remove every record of that type; otherwise the ID of
//         the record(s) to remove (for BCF_HL_GEN, the line's key)
//
// Removed records are destroyed and the header is marked dirty.  For
// FILTER/INFO/FORMAT/contig records the dictionary entry is kept and only
// its record pointer is cleared.
void bcf_hdr_remove(bcf_hdr_t *hdr, int type, const char *key)
{
    bcf_hrec_t *victim;
    int pos;

    if ( key == NULL )
    {
        // no key, remove all entries of this type
        for (pos = 0; pos < hdr->nhrec; )
        {
            if ( hdr->hrec[pos]->type != type )
            {
                pos++;
                continue;
            }
            victim = hdr->hrec[pos];
            bcf_hdr_unregister_hrec(hdr, victim);
            bcf_hdr_remove_from_hdict(hdr, victim);
            hdr->dirty = 1;
            hdr->nhrec--;
            // Close the gap; pos now refers to the following record
            if ( pos < hdr->nhrec )
                memmove(&hdr->hrec[pos],&hdr->hrec[pos+1],(hdr->nhrec-pos)*sizeof(bcf_hrec_t*));
            bcf_hrec_destroy(victim);
        }
        return;
    }

    // Keep removing until no matching record is left
    for (;;)
    {
        if ( type==BCF_HL_FLT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG )
        {
            // Dictionary-backed types: find the record through its ID
            victim = bcf_hdr_get_hrec(hdr, type, "ID", key, NULL);
            if ( victim == NULL ) return;

            pos = 0;
            while ( pos < hdr->nhrec && hdr->hrec[pos] != victim ) pos++;
            assert( pos<hdr->nhrec );

            vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
            khint_t k = kh_get(vdict, d, key);
            kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL;
        }
        else
        {
            for (pos = 0; pos < hdr->nhrec; pos++)
            {
                if ( hdr->hrec[pos]->type != type ) continue;
                if ( type == BCF_HL_GEN )
                {
                    if ( strcmp(hdr->hrec[pos]->key,key) == 0 ) break;
                }
                else
                {
                    // not all structured lines have ID, we could be more sophisticated as in bcf_hdr_get_hrec()
                    int j = bcf_hrec_find_key(hdr->hrec[pos], "ID");
                    if ( j>=0 && strcmp(hdr->hrec[pos]->vals[j],key) == 0 ) break;
                }
            }
            if ( pos == hdr->nhrec ) return;
            victim = hdr->hrec[pos];
            bcf_hdr_remove_from_hdict(hdr, victim);
        }

        hdr->nhrec--;
        if ( pos < hdr->nhrec )
            memmove(&hdr->hrec[pos],&hdr->hrec[pos+1],(hdr->nhrec-pos)*sizeof(bcf_hrec_t*));
        bcf_hrec_destroy(victim);
        hdr->dirty = 1;
    }
}
1412
1413
// Format a header line printf-style and append it to the header.
//
// Lines that fit into a 256-byte stack buffer need no allocation; longer
// ones are formatted into a temporary heap buffer.
//
// Returns the result of bcf_hdr_append() (0 on success, -1 on failure), or
// -1 if formatting fails or memory cannot be allocated.
int bcf_hdr_printf(bcf_hdr_t *hdr, const char *fmt, ...)
{
    char tmp[256], *line = tmp;
    va_list ap;
    va_start(ap, fmt);
    int n = vsnprintf(line, sizeof(tmp), fmt, ap);
    va_end(ap);

    // vsnprintf reports an encoding/output error with a negative value;
    // tmp then holds nothing usable.
    if (n < 0)
        return -1;

    if ((size_t) n >= sizeof(tmp)) {
        // Output was truncated: n is the length needed, without the NUL
        size_t need = (size_t) n + 1;
        line = (char*)malloc(need);
        if (!line)
            return -1;

        va_start(ap, fmt);
        n = vsnprintf(line, need, fmt, ap);
        va_end(ap);
        if (n < 0) {
            free(line);
            return -1;
        }
    }

    int ret = bcf_hdr_append(hdr, line);

    if (line != tmp) free(line);
    return ret;
}
1437
1438
1439
/**********************
1440
 *** BCF header I/O ***
1441
 **********************/
1442
1443
const char *bcf_hdr_get_version(const bcf_hdr_t *hdr)
1444
56
{
1445
56
    bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
1446
56
    if ( !hrec )
1447
0
    {
1448
0
        hts_log_warning("No version string found, assuming VCFv4.2");
1449
0
        return "VCFv4.2";
1450
0
    }
1451
56
    return hrec->value;
1452
56
}
1453
1454
// Set the file format version of the header.
//
//   hdr      header to modify
//   version  new version text, e.g. "VCFv4.2"
//
// If the header has a "fileformat" record its value is replaced through
// bcf_hdr_update_hrec().  If it has none, only the cached version number is
// updated.
//
// Returns 0 on success and -1 on failure.
int bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version)
{
    bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
    if ( !hrec )
    {
        int len;
        kstring_t str = {0,0,0};
        if ( ksprintf(&str,"##fileformat=%s", version) < 0 )
        {
            free(str.s);
            return -1;
        }
        hrec = bcf_hdr_parse_line(hdr, str.s, &len);
        free(str.s);
        if ( !hrec ) return -1;     // could not parse, or out of memory

        get_hdr_aux(hdr)->version = bcf_get_version(NULL, hrec->value);

        // NOTE(review): the parsed record is only used to derive the cached
        // version and has never been added to the header; it is released
        // here so that it no longer leaks.  Confirm whether a fileformat
        // line should be inserted into the header instead.
        bcf_hrec_destroy(hrec);
    }
    else
    {
        bcf_hrec_t *tmp = bcf_hrec_dup(hrec);
        if ( !tmp ) return -1;
        free(tmp->value);
        tmp->value = strdup(version);
        if ( !tmp->value )
        {
            bcf_hrec_destroy(tmp);
            return -1;
        }
        int ret = bcf_hdr_update_hrec(hdr, hrec, tmp);
        bcf_hrec_destroy(tmp);
        if ( ret < 0 ) return -1;
    }
    hdr->dirty = 1;
    //TODO rlen may change, deal with it
    return 0;
}
1481
1482
bcf_hdr_t *bcf_hdr_init(const char *mode)
1483
4.67k
{
1484
4.67k
    int i;
1485
4.67k
    bcf_hdr_t *h;
1486
4.67k
    h = (bcf_hdr_t*)calloc(1, sizeof(bcf_hdr_t));
1487
4.67k
    if (!h) return NULL;
1488
18.7k
    for (i = 0; i < 3; ++i) {
1489
14.0k
        if ((h->dict[i] = kh_init(vdict)) == NULL) goto fail;
1490
        // Supersize the hash to make collisions very unlikely
1491
14.0k
        static int dsize[3] = {16384,16384,2048}; // info, contig, format
1492
14.0k
        if (kh_resize(vdict, h->dict[i], dsize[i]) < 0) goto fail;
1493
14.0k
    }
1494
1495
4.67k
    bcf_hdr_aux_t *aux = (bcf_hdr_aux_t*)calloc(1,sizeof(bcf_hdr_aux_t));
1496
4.67k
    if ( !aux ) goto fail;
1497
4.67k
    if ( (aux->gen = kh_init(hdict))==NULL ) { free(aux); goto fail; }
1498
4.67k
    aux->key_len = NULL;
1499
4.67k
    aux->dict = *((vdict_t*)h->dict[0]);
1500
4.67k
    aux->version = 0;
1501
4.67k
    free(h->dict[0]);
1502
4.67k
    h->dict[0] = aux;
1503
1504
4.67k
    if ( strchr(mode,'w') )
1505
0
    {
1506
0
        bcf_hdr_append(h, "##fileformat=VCFv4.2");
1507
        // The filter PASS must appear first in the dictionary
1508
0
        bcf_hdr_append(h, "##FILTER=<ID=PASS,Description=\"All filters passed\">");
1509
0
        aux->version = VCF_DEF;
1510
0
    }
1511
4.67k
    return h;
1512
1513
0
 fail:
1514
0
    for (i = 0; i < 3; ++i)
1515
0
        kh_destroy(vdict, h->dict[i]);
1516
0
    free(h);
1517
0
    return NULL;
1518
4.67k
}
1519
1520
// Free a header and everything it owns: the keys of the three
// dictionaries, the generic-line hash and its keys, the id arrays, all
// header records and the sample, translation and text buffers.
// A NULL header is accepted and ignored.
void bcf_hdr_destroy(bcf_hdr_t *h)
{
    int i;
    khint_t k;
    if (!h) return;
    for (i = 0; i < 3; ++i) {
        vdict_t *d = (vdict_t*)h->dict[i];
        if (d == 0) continue;
        for (k = kh_begin(d); k != kh_end(d); ++k)
            if (kh_exist(d, k)) free((char*)kh_key(d, k));
        if ( i==0 )
        {
            bcf_hdr_aux_t *aux = get_hdr_aux(h);
            for (k=kh_begin(aux->gen); k<kh_end(aux->gen); k++)
                if ( kh_exist(aux->gen,k) ) free((char*)kh_key(aux->gen,k));
            kh_destroy(hdict, aux->gen);
            free(aux->key_len); // may exist for dict[0] only
        }
        kh_destroy(vdict, d);
        free(h->id[i]);
    }
    for (i=0; i<h->nhrec; i++)
        bcf_hrec_destroy(h->hrec[i]);
    // Free the record array unconditionally: nhrec can have dropped back to
    // zero after records were removed while the array is still allocated.
    free(h->hrec);
    free(h->samples);
    free(h->keep_samples);
    free(h->transl[0]); free(h->transl[1]);
    free(h->mem.s);
    free(h);
}
1550
1551
// Read a header from an open file.
//
// VCF input is delegated to vcf_hdr_read().  For BCF the 5-byte magic
// string is checked, the 4-byte little-endian header length is read, and
// the header text of that length is read and parsed.
//
// Returns a new header, or NULL on failure (an error is logged).
bcf_hdr_t *bcf_hdr_read(htsFile *hfp)
{
    switch (hfp->format.format) {
    case vcf:
        return vcf_hdr_read(hfp);
    case bcf:
        break;
    default:
        hts_log_error("Input is not detected as bcf or vcf format");
        return NULL;
    }

    assert(hfp->is_bgzf);

    BGZF *fp = hfp->fp.bgzf;
    uint8_t magic[5];
    uint8_t lenbuf[4];
    size_t hlen;
    char *htxt = NULL;

    bcf_hdr_t *h = bcf_hdr_init("r");
    if (h == NULL) {
        hts_log_error("Failed to allocate bcf header");
        return NULL;
    }

    if (bgzf_read(fp, magic, 5) != 5) {
        hts_log_error("Failed to read the header (reading BCF in text mode?)");
        bcf_hdr_destroy(h);
        return NULL;
    }

    if (strncmp((char*)magic, "BCF\2\2", 5) != 0) {
        // Tell an unsupported BCF version apart from a non-BCF file
        if (strncmp((char*)magic, "BCF", 3) == 0)
            hts_log_error("Invalid BCF2 magic string: only BCFv2.2 is supported");
        else
            hts_log_error("Invalid BCF2 magic string");
        bcf_hdr_destroy(h);
        return NULL;
    }

    if (bgzf_read(fp, lenbuf, 4) != 4) goto fail;

    // Assemble the little-endian 32-bit header length
    hlen = (size_t) lenbuf[3] << 24;
    hlen |= lenbuf[2] << 16;
    hlen |= lenbuf[1] << 8;
    hlen |= lenbuf[0];

    // hlen + 1 bytes are allocated below, so hlen must stay below SIZE_MAX
    if (hlen >= SIZE_MAX) { errno = ENOMEM; goto fail; }
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
    if (hlen > FUZZ_ALLOC_LIMIT/2) { errno = ENOMEM; goto fail; }
#endif

    htxt = (char*)malloc(hlen + 1);
    if (htxt == NULL) goto fail;
    if (bgzf_read(fp, htxt, hlen) != hlen) goto fail;
    htxt[hlen] = '\0'; // Ensure htxt is terminated

    if (bcf_hdr_parse(h, htxt) < 0) goto fail;

    free(htxt);
    return h;

 fail:
    hts_log_error("Failed to read BCF header");
    free(htxt);
    bcf_hdr_destroy(h);
    return NULL;
}
1607
1608
// Write a header to an open file.
//
// A dirty header is synced first.  For VCF/text output the work is
// delegated to vcf_hdr_write().  For BCF the magic string "BCF\2\2" is
// written, followed by the 4-byte little-endian length of the header text
// (including its terminating NUL), the text itself, and a flush.
//
// Returns 0 on success and -1 on failure (errno is set to EINVAL when h is
// NULL).
int bcf_hdr_write(htsFile *hfp, bcf_hdr_t *h)
{
    kstring_t htxt = {0,0,0};
    uint8_t hlen[4];
    BGZF *fp;
    int ret = -1;

    if (!h) {
        errno = EINVAL;
        return -1;
    }
    if ( h->dirty ) {
        if (bcf_hdr_sync(h) < 0) return -1;
    }
    hfp->format.category = variant_data;
    if (hfp->format.format == vcf || hfp->format.format == text_format) {
        hfp->format.format = vcf;
        return vcf_hdr_write(hfp, h);
    }

    if (hfp->format.format == binary_format)
        hfp->format.format = bcf;

    // From here on every exit goes through `out` so the text is released
    if (bcf_hdr_format(h, 1, &htxt) < 0) goto out;
    if (kputc('\0', &htxt) < 0) goto out; // include the \0 byte

    fp = hfp->fp.bgzf;
    if ( bgzf_write(fp, "BCF\2\2", 5) !=5 ) goto out;
    u32_to_le(htxt.l, hlen);
    if ( bgzf_write(fp, hlen, 4) !=4 ) goto out;
    if ( bgzf_write(fp, htxt.s, htxt.l) != htxt.l ) goto out;
    if ( bgzf_flush(fp) < 0) goto out;

    ret = 0;

 out:
    free(htxt.s);
    return ret;
}
1644
1645
/********************
1646
 *** BCF site I/O ***
1647
 ********************/
1648
1649
bcf1_t *bcf_init(void)
1650
3.68k
{
1651
3.68k
    bcf1_t *v;
1652
3.68k
    v = (bcf1_t*)calloc(1, sizeof(bcf1_t));
1653
3.68k
    return v;
1654
3.68k
}
1655
1656
/*
 * Reset record `v` so that it can be reused.
 *
 * INFO and FORMAT entries flagged as owning a separately allocated value
 * buffer have that buffer freed.  All counts, lengths, dirty flags and the
 * error code are zeroed, QUAL is set to missing and the cached ID/allele
 * strings are truncated.  The record's own buffers stay allocated.
 */
void bcf_clear(bcf1_t *v)
{
    int k;

    for (k = 0; k < v->d.m_info; k++)
    {
        bcf_info_t *inf = &v->d.info[k];
        if ( !inf->vptr_free ) continue;
        // vptr points past a vptr_off-byte prefix; the allocation starts there
        free(inf->vptr - inf->vptr_off);
        inf->vptr_free = 0;
    }
    for (k = 0; k < v->d.m_fmt; k++)
    {
        bcf_fmt_t *fmt = &v->d.fmt[k];
        if ( !fmt->p_free ) continue;
        free(fmt->p - fmt->p_off);
        fmt->p_free = 0;
    }

    v->unpacked = 0;
    v->rlen = 0;
    v->pos = 0;
    v->rid = 0;
    bcf_float_set_missing(v->qual);

    v->n_sample = 0;
    v->n_fmt = 0;
    v->n_allele = 0;
    v->n_info = 0;

    v->indiv.l = 0;
    v->shared.l = 0;

    v->d.var_type = -1;
    v->d.shared_dirty = 0;
    v->d.indiv_dirty  = 0;
    v->d.n_flt = 0;
    v->errcode = 0;

    if (v->d.m_als) v->d.als[0] = 0;
    if (v->d.m_id) v->d.id[0] = 0;
}
1687
1688
/*
 * Release every buffer owned by record `v` (but not `v` itself) and zero
 * the decoded-data structure and both packed blocks.
 */
void bcf_empty(bcf1_t *v)
{
    bcf_clear1(v);

    // Decoded-field storage
    free(v->d.id);
    free(v->d.als);
    free(v->d.allele);
    free(v->d.flt);
    free(v->d.info);
    free(v->d.fmt);
    free(v->d.var);     // free(NULL) is a no-op, so no guard is needed

    // Packed BCF blocks
    free(v->shared.s);
    free(v->indiv.s);

    memset(&v->d, 0, sizeof(v->d));
    memset(&v->shared, 0, sizeof(v->shared));
    memset(&v->indiv, 0, sizeof(v->indiv));
}
1700
1701
/* Free record `v` together with everything it owns; NULL is accepted. */
void bcf_destroy(bcf1_t *v)
{
    if (v) {
        bcf_empty1(v);
        free(v);
    }
}
1707
1708
/*
 * Read one raw BCF record from `fp` into `v` without validating its content.
 *
 * The fixed 32-byte prefix is read first: two 32-bit block lengths followed
 * by the six fixed fields.  The shared and individual blocks are then read
 * into v->shared and v->indiv.
 *
 * Returns 0 on success, -1 when no bytes at all could be read for the
 * prefix, and -2 on a short read, an impossible shared length or an
 * allocation failure.
 */
static inline int bcf_read1_core(BGZF *fp, bcf1_t *v)
{
    uint8_t head[32];
    uint32_t n_shared, n_indiv;
    ssize_t got = bgzf_read(fp, head, 32);

    if (got != 32)
        return got == 0 ? -1 : -2;

    bcf_clear1(v);

    n_shared = le_to_u32(head);
    if (n_shared < 24) return -2;
    n_shared -= 24; // to exclude six 32-bit integers
    n_indiv = le_to_u32(head + 4);
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
    // ks_resize() normally allocates 1.5 * requested size to allow for growth
    if ((uint64_t) n_shared + n_indiv > FUZZ_ALLOC_LIMIT / 3 * 2) return -2;
#endif
    // Always request at least one byte so the buffers are never NULL
    if (ks_resize(&v->shared, n_shared ? n_shared : 1) != 0) return -2;
    if (ks_resize(&v->indiv, n_indiv ? n_indiv : 1) != 0) return -2;

    v->rid  = le_to_i32(head + 8);
    v->pos  = le_to_u32(head + 12);
    if ( v->pos==UINT32_MAX ) v->pos = -1;  // this is for telomere coordinate, e.g. MT:0
    v->rlen = le_to_i32(head + 16);
    v->qual = le_to_float(head + 20);
    v->n_info = le_to_u16(head + 24);
    v->n_allele = le_to_u16(head + 26);
    v->n_sample = le_to_u32(head + 28) & 0xffffff;  // low 24 bits
    v->n_fmt = head[31];                            // top byte of the same word
    v->shared.l = n_shared;
    v->indiv.l = n_indiv;

    // silent fix of broken BCFs produced by earlier versions of bcf_subset, prior to and including bd6ed8b4
    if ( v->n_fmt && (!v->indiv.l || !v->n_sample) ) v->n_fmt = 0;

    if (bgzf_read(fp, v->shared.s, v->shared.l) != v->shared.l) return -2;
    if (bgzf_read(fp, v->indiv.s, v->indiv.l) != v->indiv.l) return -2;
    return 0;
}
1746
1747
0
/* Minimal bit-set helpers operating on a byte array, eight flags per byte. */
#define bit_array_size(nbits) ((nbits)/8+1)                                /* bytes to allocate */
#define bit_array_set(bits,idx)   ((bits)[(idx)/8] |=   1 << ((idx)%8))    /* turn flag on */
#define bit_array_clear(bits,idx) ((bits)[(idx)/8] &= ~(1 << ((idx)%8)))   /* turn flag off */
#define bit_array_test(bits,idx)  ((bits)[(idx)/8] &   (1 << ((idx)%8)))   /* non-zero if on */
1751
1752
static int bcf_dec_typed_int1_safe(uint8_t *p, uint8_t *end, uint8_t **q,
1753
3.39k
                                   int32_t *val) {
1754
3.39k
    uint32_t t;
1755
3.39k
    if (end - p < 2) return -1;
1756
3.38k
    t = *p++ & 0xf;
1757
    /* Use if .. else if ... else instead of switch to force order.  Assumption
1758
       is that small integers are more frequent than big ones. */
1759
3.38k
    if (t == BCF_BT_INT8) {
1760
1.68k
        *val = *(int8_t *) p++;
1761
1.70k
    } else {
1762
1.70k
        if (end - p < (1<<bcf_type_shift[t])) return -1;
1763
1.69k
        if (t == BCF_BT_INT16) {
1764
900
            *val = le_to_i16(p);
1765
900
            p += 2;
1766
900
        } else if (t == BCF_BT_INT32) {
1767
678
            *val = le_to_i32(p);
1768
678
            p += 4;
1769
#ifdef VCF_ALLOW_INT64
1770
        } else if (t == BCF_BT_INT64) {
1771
            // This case should never happen because there should be no
1772
            // 64-bit BCFs at all, definitely not coming from htslib
1773
            *val = le_to_i64(p);
1774
            p += 8;
1775
#endif
1776
678
        } else {
1777
118
            return -1;
1778
118
        }
1779
1.69k
    }
1780
3.26k
    *q = p;
1781
3.26k
    return 0;
1782
3.38k
}
1783
1784
/*
 * Bounds-checked decode of a typed size descriptor at `p`.
 *
 * The low four bits of the byte give the element type (*type) and the high
 * four bits the element count (*num).  A count of 15 means the real count
 * follows as a typed integer.  *q receives the position after the
 * descriptor.  Returns 0 on success, or a negative value if the data runs
 * past `end`, the trailing integer is malformed, or the count is negative.
 */
static int bcf_dec_size_safe(uint8_t *p, uint8_t *end, uint8_t **q,
                             int *num, int *type) {
    int inline_count, rc;

    if (p >= end) return -1;

    *type = *p & 0xf;
    inline_count = *p >> 4;
    if (inline_count != 15) {
        *q = p + 1;
        *num = inline_count;
        return 0;
    }

    rc = bcf_dec_typed_int1_safe(p + 1, end, q, num);
    if (rc != 0) return rc;
    return *num < 0 ? -1 : 0;
}
1798
1799
406
/*
 * Map a numeric type code to a short description for log messages.
 * Codes without a description, including negative or out-of-range values,
 * yield "unknown".  (The codes presumably match the BCF_BT_* constants.)
 */
static const char *get_type_name(int type) {
    switch (type) {
    case 0:  return "null";
    case 1:  return "int (8-bit)";
    case 2:  return "int (16 bit)";
    case 3:  return "int (32 bit)";
    case 5:  return "float";
    case 7:  return "char";
    default: return "unknown";
    }
}
1807
1808
/*
 * Report an invalid FORMAT field and count it in *reports.
 *
 * Only the first problem is logged unless the verbosity is at least
 * HTS_LOG_DEBUG.  `type` names the offending property and `i` is the
 * offending value.
 */
static void bcf_record_check_err(const bcf_hdr_t *hdr, bcf1_t *rec,
                                 char *type, uint32_t *reports, int i) {
    int is_first = (*reports == 0);

    if (is_first || hts_verbose >= HTS_LOG_DEBUG) {
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos
                        ": Invalid FORMAT %s %d",
                        bcf_seqname_safe(hdr,rec), rec->pos+1, type, i);
    }
    *reports += 1;
}
1816
1817
286
/*
 * Validate the packed content of a freshly read BCF record.
 *
 * Walks the shared block (ID, REF/ALT, FILTER, INFO) and the individual
 * block (FORMAT) checking that every typed descriptor and its payload lie
 * inside the block, that types are acceptable for each field, and - when
 * `hdr` is non-NULL - that contig and dictionary ids exist in the header.
 * Problems that still allow the walk to continue are accumulated as
 * BCF_ERR_* bits in rec->errcode.  A negative rlen on an otherwise valid
 * record is only warned about (once per process) and replaced using
 * get_rlen().
 *
 * Payload sizes are computed in 64 bits so that a large element count
 * cannot wrap where size_t is 32 bits wide and slip past the bounds checks.
 *
 * Returns 0 if the record is usable, -2 otherwise.
 */
static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) {
    uint8_t *ptr, *end;
    uint64_t bytes;
    uint32_t err = 0;
    int type = 0;
    int num  = 0;
    uint32_t i, reports;
    const uint32_t is_integer = ((1 << BCF_BT_INT8)  |
                                 (1 << BCF_BT_INT16) |
#ifdef VCF_ALLOW_INT64
                                 (1 << BCF_BT_INT64) |
#endif
                                 (1 << BCF_BT_INT32));
    const uint32_t is_valid_type = (is_integer          |
                                    (1 << BCF_BT_NULL)  |
                                    (1 << BCF_BT_FLOAT) |
                                    (1 << BCF_BT_CHAR));
    int32_t max_id = hdr ? hdr->n[BCF_DT_ID] : 0;

    // Check for valid contig ID
    if (rec->rid < 0
        || (hdr && (rec->rid >= hdr->n[BCF_DT_CTG]
                    || hdr->id[BCF_DT_CTG][rec->rid].key == NULL))) {
        hts_log_warning("Bad BCF record at %"PRIhts_pos": Invalid %s id %d", rec->pos+1, "CONTIG", rec->rid);
        err |= BCF_ERR_CTG_INVALID;
    }

    // Check ID
    ptr = (uint8_t *) rec->shared.s;
    end = ptr + rec->shared.l;
    if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
    if (type != BCF_BT_CHAR) {
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "ID", type, get_type_name(type));
        err |= BCF_ERR_TAG_INVALID;
    }
    bytes = (uint64_t) num << bcf_type_shift[type];
    if ((uint64_t) (end - ptr) < bytes) goto bad_shared;
    ptr += bytes;

    // Check REF and ALT
    if (rec->n_allele < 1) {
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": No REF allele",
                        bcf_seqname_safe(hdr,rec), rec->pos+1);
        err |= BCF_ERR_TAG_UNDEF;
    }

    reports = 0;
    for (i = 0; i < rec->n_allele; i++) {
        if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
        if (type != BCF_BT_CHAR) {
            if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "REF/ALT", type, get_type_name(type));
            err |= BCF_ERR_CHAR;
        }
        bytes = (uint64_t) num << bcf_type_shift[type];
        if ((uint64_t) (end - ptr) < bytes) goto bad_shared;
        ptr += bytes;
    }

    // Check FILTER
    reports = 0;
    if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
    if (num > 0) {
        bytes = (uint64_t) num << bcf_type_shift[type];
        if (((1 << type) & is_integer) == 0) {
            hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "FILTER", type, get_type_name(type));
            err |= BCF_ERR_TAG_INVALID;
            if ((uint64_t) (end - ptr) < bytes) goto bad_shared;
            ptr += bytes;
        } else {
            // The whole vector is known to be in bounds before decoding it
            if ((uint64_t) (end - ptr) < bytes) goto bad_shared;
            for (i = 0; i < num; i++) {
                int32_t key = bcf_dec_int1(ptr, type, &ptr);
                if (key < 0
                    || (hdr && (key >= max_id
                                || hdr->id[BCF_DT_ID][key].key == NULL))) {
                    if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
                        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s id %d", bcf_seqname_safe(hdr,rec), rec->pos+1, "FILTER", key);
                    err |= BCF_ERR_TAG_UNDEF;
                }
            }
        }
    }

    // Check INFO
    reports = 0;
    bcf_idpair_t *id_tmp = hdr ? hdr->id[BCF_DT_ID] : NULL;
    for (i = 0; i < rec->n_info; i++) {
        int32_t key = -1;
        if (bcf_dec_typed_int1_safe(ptr, end, &ptr, &key) != 0) goto bad_shared;
        if (key < 0 || (hdr && (key >= max_id
                                || id_tmp[key].key == NULL))) {
            if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s id %d", bcf_seqname_safe(hdr,rec), rec->pos+1, "INFO", key);
            err |= BCF_ERR_TAG_UNDEF;
        }
        if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
        if (((1 << type) & is_valid_type) == 0
            || (type == BCF_BT_NULL && num > 0)) {
            if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "INFO", type, get_type_name(type));
            err |= BCF_ERR_TAG_INVALID;
        }
        bytes = (uint64_t) num << bcf_type_shift[type];
        if ((uint64_t) (end - ptr) < bytes) goto bad_shared;
        ptr += bytes;
    }

    // Check FORMAT and individual information
    ptr = (uint8_t *) rec->indiv.s;
    end = ptr + rec->indiv.l;
    reports = 0;
    for (i = 0; i < rec->n_fmt; i++) {
        int32_t key = -1;
        if (bcf_dec_typed_int1_safe(ptr, end, &ptr, &key) != 0) goto bad_indiv;
        if (key < 0
            || (hdr && (key >= max_id
                        || id_tmp[key].key == NULL))) {
            bcf_record_check_err(hdr, rec, "id", &reports, key);
            err |= BCF_ERR_TAG_UNDEF;
        }
        if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_indiv;
        if (((1 << type) & is_valid_type) == 0
            || (type == BCF_BT_NULL && num > 0)) {
            bcf_record_check_err(hdr, rec, "type", &reports, type);
            err |= BCF_ERR_TAG_INVALID;
        }
        // One vector of `num` elements per sample
        bytes = ((uint64_t) num << bcf_type_shift[type]) * rec->n_sample;
        if ((uint64_t) (end - ptr) < bytes) goto bad_indiv;
        ptr += bytes;
    }

    if (!err && rec->rlen < 0) {
        // Treat bad rlen as a warning instead of an error, and try to
        // fix up by using the length of the stored REF allele.
        static int warned = 0;
        if (!warned) {
            hts_log_warning("BCF record at %s:%"PRIhts_pos" has invalid RLEN (%"PRIhts_pos"). "
                            "Only one invalid RLEN will be reported.",
                            bcf_seqname_safe(hdr,rec), rec->pos+1, rec->rlen);
            warned = 1;
        }
        //find rlen considering reflen, END, SVLEN, fmt LEN
        hts_pos_t len = get_rlen(hdr, rec);
        rec->rlen = len >= 0 ? len : 0;
    }

    rec->errcode |= err;

    return err ? -2 : 0; // Return -2 so bcf_read() reports an error

 bad_shared:
    hts_log_error("Bad BCF record at %s:%"PRIhts_pos" - shared section malformed or too short", bcf_seqname_safe(hdr,rec), rec->pos+1);
    return -2;

 bad_indiv:
    hts_log_error("Bad BCF record at %s:%"PRIhts_pos" - individuals section malformed or too short", bcf_seqname_safe(hdr,rec), rec->pos+1);
    return -2;
}
1976
1977
static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt);
1978
int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec)
1979
0
{
1980
0
    if ( !hdr->keep_samples ) return 0;
1981
0
    if ( !bcf_hdr_nsamples(hdr) )
1982
0
    {
1983
0
        rec->indiv.l = rec->n_sample = 0;
1984
0
        return 0;
1985
0
    }
1986
1987
0
    int i, j;
1988
0
    uint8_t *ptr = (uint8_t*)rec->indiv.s, *dst = NULL, *src;
1989
0
    bcf_dec_t *dec = &rec->d;
1990
0
    hts_expand(bcf_fmt_t, rec->n_fmt, dec->m_fmt, dec->fmt);
1991
0
    for (i=0; i<dec->m_fmt; ++i) dec->fmt[i].p_free = 0;
1992
1993
0
    for (i=0; i<rec->n_fmt; i++)
1994
0
    {
1995
0
        ptr = bcf_unpack_fmt_core1(ptr, rec->n_sample, &dec->fmt[i]);
1996
0
        src = dec->fmt[i].p - dec->fmt[i].size;
1997
0
        if ( dst )
1998
0
        {
1999
0
            memmove(dec->fmt[i-1].p + dec->fmt[i-1].p_len, dec->fmt[i].p - dec->fmt[i].p_off, dec->fmt[i].p_off);
2000
0
            dec->fmt[i].p = dec->fmt[i-1].p + dec->fmt[i-1].p_len + dec->fmt[i].p_off;
2001
0
        }
2002
0
        dst = dec->fmt[i].p;
2003
0
        for (j=0; j<hdr->nsamples_ori; j++)
2004
0
        {
2005
0
            src += dec->fmt[i].size;
2006
0
            if ( !bit_array_test(hdr->keep_samples,j) ) continue;
2007
0
            memmove(dst, src, dec->fmt[i].size);
2008
0
            dst += dec->fmt[i].size;
2009
0
        }
2010
0
        rec->indiv.l -= dec->fmt[i].p_len - (dst - dec->fmt[i].p);
2011
0
        dec->fmt[i].p_len = dst - dec->fmt[i].p;
2012
0
    }
2013
0
    rec->unpacked |= BCF_UN_FMT;
2014
2015
0
    rec->n_sample = bcf_hdr_nsamples(hdr);
2016
0
    return 0;
2017
0
}
2018
2019
/*
 * Read the next record from `fp` into `v`.
 *
 * VCF files are handled by vcf_read().  Otherwise the raw record is read,
 * validated against header `h` and, when the header carries a sample
 * subset, reduced to the kept samples.  Returns the status of the first
 * step that did not return 0, or the result of the last step.
 */
int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
{
    int status;

    if (fp->format.format == vcf) return vcf_read(fp,h,v);

    status = bcf_read1_core(fp->fp.bgzf, v);
    if (status == 0) status = bcf_record_check(h, v);
    if (status != 0) return status;

    return h->keep_samples ? bcf_subset_format(h,v) : 0;
}
2027
2028
/*
 * Read one BCF record from `fp` into the bcf1_t passed as `vv` and report
 * its coordinates through *tid, *beg and *end.
 *
 * The record is validated without a header.  `null` is not used.  The
 * coordinates are written only when the returned status is non-negative.
 */
int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, hts_pos_t *beg, hts_pos_t *end)
{
    bcf1_t *rec = (bcf1_t *) vv;
    int status = bcf_read1_core(fp, rec);

    if (status == 0) status = bcf_record_check(NULL, rec);
    if (status >= 0) {
        *tid = rec->rid;
        *beg = rec->pos;
        *end = rec->pos + rec->rlen;
    }
    return status;
}
2037
2038
/*
 * Append the record ID to `str` as a single typed string.  A missing ID
 * (NULL or ".") is written as an empty character vector.  Returns the
 * result of the encoder that was called.
 */
static inline int bcf1_sync_id(bcf1_t *line, kstring_t *str)
{
    const char *id = line->d.id;

    if ( id == NULL || strcmp(id, ".") == 0 )
        return bcf_enc_size(str, 0, BCF_BT_CHAR);

    return bcf_enc_vchar(str, strlen(id), id);
}
2047
/*
 * Append every allele of `line` to `str` as a list of typed strings.
 * If rlen is still zero and at least one allele exists, rlen is set to the
 * length of the first allele.  Returns 0 on success, -1 if encoding fails.
 */
static inline int bcf1_sync_alleles(bcf1_t *line, kstring_t *str)
{
    int a;

    for (a = 0; a < line->n_allele; a++) {
        const char *allele = line->d.allele[a];
        if (bcf_enc_vchar(str, strlen(allele), allele) < 0)
            return -1;
    }

    if ( line->rlen == 0 && line->n_allele != 0 )
        line->rlen = strlen(line->d.allele[0]);

    return 0;
}
2058
/*
 * Append the FILTER ids of `line` to `str` as a typed integer vector; an
 * empty vector is written when no filters are set.  Returns the encoder's
 * result.
 */
static inline int bcf1_sync_filter(bcf1_t *line, kstring_t *str)
{
    return line->d.n_flt
        ? bcf_enc_vint(str, line->d.n_flt, line->d.flt, -1)
        : bcf_enc_vint(str, 0, 0, -1);
}
2067
2068
/*
 * Append all live INFO entries (key/value typed-vector pairs) of `line` to
 * `str`.
 *
 * Entries whose vptr is NULL are marked for removal: they are skipped, the
 * live entries are swapped down over them so the array stays dense, and
 * n_info is reduced to the number of live entries.
 * Returns 0 on success, -1 if any append failed.
 */
static inline int bcf1_sync_info(bcf1_t *line, kstring_t *str)
{
    int i, gap = -1, failed = 0;

    for (i = 0; i < line->n_info; i++)
    {
        bcf_info_t *cur = &line->d.info[i];

        if ( cur->vptr == NULL )
        {
            // marked for removal; remember the first free slot
            if ( gap < 0 ) gap = i;
            continue;
        }

        // Copy the entry including the vptr_off bytes that precede the value
        if ( kputsn_(cur->vptr - cur->vptr_off, cur->vptr_len + cur->vptr_off, str) < 0 )
            failed = 1;

        if ( gap >= 0 )
        {
            bcf_info_t held = line->d.info[gap];
            line->d.info[gap] = *cur;
            *cur = held;
            // Advance to the next removed slot
            while ( gap<=i && line->d.info[gap].vptr ) gap++;
        }
    }

    if ( gap >= 0 ) line->n_info = gap;
    return failed ? -1 : 0;
}
2091
2092
/*
 * Bring the packed BCF blocks of `line` up to date with its decoded fields.
 *
 * - If the shared block is empty (record built through the API), it is
 *   encoded from scratch: ID, alleles, FILTER, INFO.
 * - If the shared block exists but is flagged dirty, a new block is
 *   assembled, re-encoding the parts flagged dirty and copying the
 *   remaining bytes from the old block.
 * - If the shared buffer moved and INFO was unpacked, the info[].vptr
 *   pointers are re-based onto the new buffer.
 * - If FORMAT data is missing or dirty, the individual block is rebuilt
 *   and the fmt[].p pointers are re-based.
 *
 * Encoding and append failures are accumulated rather than ignored: the
 * function still runs to completion and updates the record exactly as
 * before, but returns -1 if any step failed and 0 otherwise.
 */
static int bcf1_sync(bcf1_t *line)
{
    char *shared_ori = line->shared.s;
    size_t prev_len;
    int e = 0;  // non-zero once any encoder/append call has failed

    kstring_t tmp = {0,0,0};
    if ( !line->shared.l )
    {
        // New line created via API, BCF data blocks do not exist. Get it ready for BCF output
        tmp = line->shared;
        e |= bcf1_sync_id(line, &tmp) < 0;
        line->unpack_size[0] = tmp.l; prev_len = tmp.l;

        e |= bcf1_sync_alleles(line, &tmp) < 0;
        line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;

        e |= bcf1_sync_filter(line, &tmp) < 0;
        line->unpack_size[2] = tmp.l - prev_len;

        e |= bcf1_sync_info(line, &tmp) < 0;
        line->shared = tmp;
    }
    else if ( line->d.shared_dirty )
    {
        // The line was edited, update the BCF data block.

        if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line,BCF_UN_STR);

        // ptr_ori points to the original unchanged BCF data.
        uint8_t *ptr_ori = (uint8_t *) line->shared.s;

        // ID: single typed string
        if ( line->d.shared_dirty & BCF1_DIRTY_ID )
            e |= bcf1_sync_id(line, &tmp) < 0;
        else
            e |= kputsn_(ptr_ori, line->unpack_size[0], &tmp) < 0;
        ptr_ori += line->unpack_size[0];
        line->unpack_size[0] = tmp.l; prev_len = tmp.l;

        // REF+ALT: list of typed strings
        if ( line->d.shared_dirty & BCF1_DIRTY_ALS )
            e |= bcf1_sync_alleles(line, &tmp) < 0;
        else
        {
            e |= kputsn_(ptr_ori, line->unpack_size[1], &tmp) < 0;
            if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
        }
        ptr_ori += line->unpack_size[1];
        line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;

        if ( line->unpacked & BCF_UN_FLT )
        {
            // FILTER: typed vector of integers
            if ( line->d.shared_dirty & BCF1_DIRTY_FLT )
                e |= bcf1_sync_filter(line, &tmp) < 0;
            else if ( line->d.n_flt )
                e |= kputsn_(ptr_ori, line->unpack_size[2], &tmp) < 0;
            else
                e |= bcf_enc_vint(&tmp, 0, 0, -1) < 0;
            ptr_ori += line->unpack_size[2];
            line->unpack_size[2] = tmp.l - prev_len;

            if ( line->unpacked & BCF_UN_INFO )
            {
                // INFO: pairs of typed vectors
                if ( line->d.shared_dirty & BCF1_DIRTY_INF )
                {
                    e |= bcf1_sync_info(line, &tmp) < 0;
                    // Everything was re-encoded; nothing is left to copy
                    ptr_ori = (uint8_t*)line->shared.s + line->shared.l;
                }
            }
        }

        // Copy whatever part of the old block was not re-encoded above
        int size = line->shared.l - (size_t)ptr_ori + (size_t)line->shared.s;
        if ( size ) e |= kputsn_(ptr_ori, size, &tmp) < 0;

        free(line->shared.s);
        line->shared = tmp;
    }
    if ( line->shared.s != shared_ori && line->unpacked & BCF_UN_INFO )
    {
        // Reallocated line->shared.s block invalidated line->d.info[].vptr pointers
        size_t off_new = line->unpack_size[0] + line->unpack_size[1] + line->unpack_size[2];
        int i;
        for (i=0; i<line->n_info; i++)
        {
            uint8_t *vptr_free = line->d.info[i].vptr_free ? line->d.info[i].vptr - line->d.info[i].vptr_off : NULL;
            line->d.info[i].vptr = (uint8_t*) line->shared.s + off_new + line->d.info[i].vptr_off;
            off_new += line->d.info[i].vptr_len + line->d.info[i].vptr_off;
            if ( vptr_free )
            {
                free(vptr_free);
                line->d.info[i].vptr_free = 0;
            }
        }
    }

    if ( line->n_sample && line->n_fmt && (!line->indiv.l || line->d.indiv_dirty) )
    {
        // The genotype fields changed or are not present
        tmp.l = tmp.m = 0; tmp.s = NULL;
        int i, irm = -1;
        for (i=0; i<line->n_fmt; i++)
        {
            bcf_fmt_t *fmt = &line->d.fmt[i];
            if ( !fmt->p )
            {
                // marked for removal
                if ( irm < 0 ) irm = i;
                continue;
            }
            e |= kputsn_(fmt->p - fmt->p_off, fmt->p_len + fmt->p_off, &tmp) < 0;
            if ( irm >=0 )
            {
                bcf_fmt_t tfmt = line->d.fmt[irm]; line->d.fmt[irm] = line->d.fmt[i]; line->d.fmt[i] = tfmt;
                while ( irm<=i && line->d.fmt[irm].p ) irm++;
            }

        }
        if ( irm>=0 ) line->n_fmt = irm;
        free(line->indiv.s);
        line->indiv = tmp;

        // Reallocated line->indiv.s block invalidated line->d.fmt[].p pointers
        size_t off_new = 0;
        for (i=0; i<line->n_fmt; i++)
        {
            uint8_t *p_free = line->d.fmt[i].p_free ? line->d.fmt[i].p - line->d.fmt[i].p_off : NULL;
            line->d.fmt[i].p = (uint8_t*) line->indiv.s + off_new + line->d.fmt[i].p_off;
            off_new += line->d.fmt[i].p_len + line->d.fmt[i].p_off;
            if ( p_free )
            {
                free(p_free);
                line->d.fmt[i].p_free = 0;
            }
        }
    }
    if ( !line->n_sample ) line->n_fmt = 0;
    line->d.shared_dirty = line->d.indiv_dirty = 0;
    return e ? -1 : 0;
}
2233
2234
bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src)
2235
0
{
2236
0
    bcf1_sync(src);
2237
2238
0
    bcf_clear(dst);
2239
0
    dst->rid  = src->rid;
2240
0
    dst->pos  = src->pos;
2241
0
    dst->rlen = src->rlen;
2242
0
    dst->qual = src->qual;
2243
0
    dst->n_info = src->n_info; dst->n_allele = src->n_allele;
2244
0
    dst->n_fmt = src->n_fmt; dst->n_sample = src->n_sample;
2245
2246
0
    if ( dst->shared.m < src->shared.l )
2247
0
    {
2248
0
        dst->shared.s = (char*) realloc(dst->shared.s, src->shared.l);
2249
0
        dst->shared.m = src->shared.l;
2250
0
    }
2251
0
    dst->shared.l = src->shared.l;
2252
0
    memcpy(dst->shared.s,src->shared.s,dst->shared.l);
2253
2254
0
    if ( dst->indiv.m < src->indiv.l )
2255
0
    {
2256
0
        dst->indiv.s = (char*) realloc(dst->indiv.s, src->indiv.l);
2257
0
        dst->indiv.m = src->indiv.l;
2258
0
    }
2259
0
    dst->indiv.l = src->indiv.l;
2260
0
    memcpy(dst->indiv.s,src->indiv.s,dst->indiv.l);
2261
2262
0
    return dst;
2263
0
}
2264
/*
 * Create a newly allocated copy of `src`.
 *
 * Returns the new record, to be released with bcf_destroy(), or NULL if the
 * record could not be allocated or the copy failed.
 */
bcf1_t *bcf_dup(bcf1_t *src)
{
    bcf1_t *out = bcf_init1();
    if ( !out ) return NULL;   // bcf_copy() would dereference a NULL dst

    if ( !bcf_copy(out, src) )
    {
        bcf_destroy(out);
        return NULL;
    }
    return out;
}
2269
2270
/*
 * Write record `v` to `hfp` using header `h`.
 *
 * A dirty header is synced first.  The record must carry as many samples as
 * the header declares.  VCF and plain-text output is delegated to
 * vcf_write().  For BCF, records with unchecked error bits (other than
 * BCF_ERR_LIMITS) or flagged as containing 64-bit values are rejected; the
 * packed blocks are then synced and written after a 32-byte fixed prefix.
 * When an index is being built the record is also pushed to it.
 *
 * Returns 0 on success, -1 on failure.
 */
int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v)
{
    if ( h->dirty ) {
        if (bcf_hdr_sync(h) < 0) return -1;
    }
    if ( bcf_hdr_nsamples(h)!=v->n_sample )
    {
        hts_log_error("Broken VCF record, the number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)",
            bcf_seqname_safe(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h));
        return -1;
    }

    if ( hfp->format.format == vcf || hfp->format.format == text_format )
        return vcf_write(hfp,h,v);

    if ( v->errcode & ~BCF_ERR_LIMITS ) // todo: unsure about the other BCF_ERR_LIMITS branches in vcf_parse_format_alloc4()
    {
        // vcf_parse1() encountered a new contig or tag, undeclared in the
        // header.  At this point, the header must have been printed,
        // proceeding would lead to a broken BCF file. Errors must be checked
        // and cleared by the caller before we can proceed.
        char errdescription[1024] = "";
        hts_log_error("Unchecked error (%d %s) at %s:%"PRIhts_pos, v->errcode, bcf_strerror(v->errcode, errdescription, sizeof(errdescription)), bcf_seqname_safe(h,v), v->pos+1);
        return -1;
    }

    // check if the BCF record was modified
    if ( bcf1_sync(v) < 0 )
    {
        hts_log_error("Failed to update the BCF data at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
        return -1;
    }

    if ( v->unpacked & BCF_IS_64BIT )
    {
        hts_log_error("Data at %s:%"PRIhts_pos" contains 64-bit values not representable in BCF. Please use VCF instead", bcf_seqname_safe(h,v), v->pos+1);
        return -1;
    }

    // Both block lengths are stored in 32-bit fields; refuse to truncate them
    if ( v->shared.l > UINT32_MAX - 24 || v->indiv.l > UINT32_MAX )
    {
        hts_log_error("Record at %s:%"PRIhts_pos" is too large to be stored in BCF", bcf_seqname_safe(h,v), v->pos+1);
        return -1;
    }

    BGZF *fp = hfp->fp.bgzf;
    uint8_t x[32];
    u32_to_le(v->shared.l + 24, x); // to include six 32-bit integers
    u32_to_le(v->indiv.l, x + 4);
    i32_to_le(v->rid, x + 8);
    u32_to_le(v->pos, x + 12);
    u32_to_le(v->rlen, x + 16);
    float_to_le(v->qual, x + 20);
    u16_to_le(v->n_info, x + 24);
    u16_to_le(v->n_allele, x + 26);
    // n_fmt occupies the top byte, n_sample the low 24 bits
    u32_to_le((uint32_t)v->n_fmt<<24 | (v->n_sample & 0xffffff), x + 28);
    if ( bgzf_write(fp, x, 32) != 32 ) return -1;
    if ( bgzf_write(fp, v->shared.s, v->shared.l) != v->shared.l ) return -1;
    if ( bgzf_write(fp, v->indiv.s, v->indiv.l) != v->indiv.l ) return -1;

    if (hfp->idx) {
        if (bgzf_idx_push(fp, hfp->idx, v->rid, v->pos, v->pos + v->rlen,
                          bgzf_tell(fp), 1) < 0)
            return -1;
    }

    return 0;
}
2326
2327
/**********************
2328
 *** VCF header I/O ***
2329
 **********************/
2330
2331
0
/*
 * Add a "##contig=<ID=name>" header record to `h`.
 *
 * Returns 0 on success.  On failure the error text for errno is logged, the
 * partially built record is destroyed, errno is restored to its value at the
 * point of failure and -1 is returned.
 */
static int add_missing_contig_hrec(bcf_hdr_t *h, const char *name) {
    int saved;
    bcf_hrec_t *rec = calloc(1, sizeof(bcf_hrec_t));

    // Each step runs only if all previous ones succeeded
    if (rec != NULL
        && (rec->key = strdup("contig")) != NULL
        && bcf_hrec_add_key(rec, "ID", strlen("ID")) >= 0
        && bcf_hrec_set_val(rec, rec->nkeys-1, name, strlen(name), 0) >= 0
        && bcf_hdr_add_hrec(h, rec) >= 0)
        return 0;

    saved = errno;
    hts_log_error("%s", strerror(errno));
    if (rec) bcf_hrec_destroy(rec);
    errno = saved;
    return -1;
}
2353
2354
bcf_hdr_t *vcf_hdr_read(htsFile *fp)
2355
4.36k
{
2356
4.36k
    kstring_t txt, *s = &fp->line;
2357
4.36k
    int ret;
2358
4.36k
    bcf_hdr_t *h;
2359
4.36k
    tbx_t *idx = NULL;
2360
4.36k
    const char **names = NULL;
2361
4.36k
    h = bcf_hdr_init("r");
2362
4.36k
    if (!h) {
2363
0
        hts_log_error("Failed to allocate bcf header");
2364
0
        return NULL;
2365
0
    }
2366
4.36k
    txt.l = txt.m = 0; txt.s = 0;
2367
95.0k
    while ((ret = hts_getline(fp, KS_SEP_LINE, s)) >= 0) {
2368
94.4k
        int e = 0;
2369
94.4k
        if (s->l == 0) continue;
2370
82.0k
        if (s->s[0] != '#') {
2371
18
            hts_log_error("No sample line");
2372
18
            goto error;
2373
18
        }
2374
82.0k
        if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here
2375
0
            kstring_t tmp = { 0, 0, NULL };
2376
0
            hFILE *f = hopen(fp->fn_aux, "r");
2377
0
            if (f == NULL) {
2378
0
                hts_log_error("Couldn't open \"%s\"", fp->fn_aux);
2379
0
                goto error;
2380
0
            }
2381
0
            while (tmp.l = 0, kgetline(&tmp, (kgets_func *) hgets, f) >= 0) {
2382
0
                char *tab = strchr(tmp.s, '\t');
2383
0
                if (tab == NULL) continue;
2384
0
                e |= (kputs("##contig=<ID=", &txt) < 0);
2385
0
                e |= (kputsn(tmp.s, tab - tmp.s, &txt) < 0);
2386
0
                e |= (kputs(",length=", &txt) < 0);
2387
0
                e |= (kputl(atol(tab), &txt) < 0);
2388
0
                e |= (kputsn(">\n", 2, &txt) < 0);
2389
0
            }
2390
0
            free(tmp.s);
2391
0
            if (hclose(f) != 0) {
2392
0
                hts_log_error("Error on closing %s", fp->fn_aux);
2393
0
                goto error;
2394
0
            }
2395
0
            if (e) goto error;
2396
0
        }
2397
82.0k
        if (kputsn(s->s, s->l, &txt) < 0) goto error;
2398
82.0k
        if (kputc('\n', &txt) < 0) goto error;
2399
82.0k
        if (s->s[1] != '#') break;
2400
82.0k
    }
2401
4.34k
    if ( ret < -1 ) goto error;
2402
4.33k
    if ( !txt.s )
2403
0
    {
2404
0
        hts_log_error("Could not read the header");
2405
0
        goto error;
2406
0
    }
2407
4.33k
    if ( bcf_hdr_parse(h, txt.s) < 0 ) goto error;
2408
2409
    // check tabix index, are all contigs listed in the header? add the missing ones
2410
3.40k
    idx = tbx_index_load3(fp->fn, NULL, HTS_IDX_SILENT_FAIL);
2411
3.40k
    if ( idx )
2412
0
    {
2413
0
        int i, n, need_sync = 0;
2414
0
        names = tbx_seqnames(idx, &n);
2415
0
        if (!names) goto error;
2416
0
        for (i=0; i<n; i++)
2417
0
        {
2418
0
            bcf_hrec_t *hrec = bcf_hdr_get_hrec(h, BCF_HL_CTG, "ID", (char*) names[i], NULL);
2419
0
            if ( hrec ) continue;
2420
0
            if (add_missing_contig_hrec(h, names[i]) < 0) goto error;
2421
0
            need_sync = 1;
2422
0
        }
2423
0
        if ( need_sync ) {
2424
0
            if (bcf_hdr_sync(h) < 0) goto error;
2425
0
        }
2426
0
        free(names);
2427
0
        tbx_destroy(idx);
2428
0
    }
2429
3.40k
    free(txt.s);
2430
3.40k
    return h;
2431
2432
960
 error:
2433
960
    if (idx) tbx_destroy(idx);
2434
960
    free(names);
2435
960
    free(txt.s);
2436
960
    if (h) bcf_hdr_destroy(h);
2437
960
    return NULL;
2438
3.40k
}
2439
2440
// Populate hdr from the lines returned by hts_readlines(fname).
//
// Every line except the last is parsed with bcf_hdr_parse_line() and added
// to hdr as a header record; the last line is handed to
// bcf_hdr_parse_sample_line(); finally the header is re-synced.
//
// Returns 0 on success and 1 on failure.  On the failure path errno is
// saved before the cleanup and restored afterwards.
int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname)
{
    int i = 0, n = 0, save_errno;
    char **lines = hts_readlines(fname, &n);
    if ( !lines ) return 1;

    // With no lines at all there is no sample line, and lines[n-1] below
    // would index before the start of the array.
    if ( n < 1 ) goto fail;

    for (i=0; i<n-1; i++)
    {
        int k;
        bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,lines[i],&k);
        if (!hrec) goto fail;
        if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
            bcf_hrec_destroy(hrec);
            goto fail;
        }
        // Release each line as soon as it has been consumed so that the
        // failure path only has to free lines[i..n-1].
        free(lines[i]);
        lines[i] = NULL;
    }
    if (bcf_hdr_parse_sample_line(hdr, lines[n-1]) < 0) goto fail;
    if (bcf_hdr_sync(hdr) < 0) goto fail;
    free(lines[n-1]);
    free(lines);
    return 0;

 fail:
    save_errno = errno;
    for (; i < n; i++)
        free(lines[i]);
    free(lines);
    errno = save_errno;
    return 1;
}
2471
2472
// Append the text form of one header record to str.
//
// A record with a value is written as "##key=value\n".  A record without
// one is written as "##key=<k1=v1,k2=v2,...>\n"; when is_bcf is zero the
// "IDX" key is left out of that list.
//
// Returns 0 on success, -1 if any append to str failed.
static int _bcf_hrec_format(const bcf_hrec_t *hrec, int is_bcf, kstring_t *str)
{
    uint32_t failed = 0;

    if ( hrec->value )
    {
        failed |= ksprintf(str,"##%s=%s\n", hrec->key,hrec->value) < 0;
        return failed == 0 ? 0 : -1;
    }

    int idx;
    int n_written = 0;   // key=value pairs emitted so far, drives the commas

    failed |= ksprintf(str, "##%s=<", hrec->key) < 0;
    for (idx = 0; idx < hrec->nkeys; idx++)
    {
        // do not output IDX if output is VCF
        if ( !is_bcf && strcmp("IDX", hrec->keys[idx]) == 0 )
            continue;

        if ( n_written > 0 )
            failed |= kputc(',',str) < 0;
        failed |= ksprintf(str,"%s=%s", hrec->keys[idx], hrec->vals[idx]) < 0;
        n_written++;
    }
    failed |= ksprintf(str,">\n") < 0;

    return failed == 0 ? 0 : -1;
}
2494
2495
// Public wrapper: format one header record into str with is_bcf = 0
// (so the "IDX" key is not written).  Returns what _bcf_hrec_format returns.
int bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str)
{
    const int is_bcf = 0;
    return _bcf_hrec_format(hrec, is_bcf, str);
}
2499
2500
// Append the complete header text to str: every header record in
// hdr->hrec[], followed by the "#CHROM ..." column line.  When the header
// has samples, "FORMAT" and one column per sample name are added to that
// line.
//
// Returns 0 on success, -1 if any append failed.
int bcf_hdr_format(const bcf_hdr_t *hdr, int is_bcf, kstring_t *str)
{
    int failed = 0;
    int rec, smpl;

    for (rec = 0; rec < hdr->nhrec; rec++) {
        if (_bcf_hrec_format(hdr->hrec[rec], is_bcf, str) < 0)
            failed = 1;
    }

    if (ksprintf(str, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO") < 0)
        failed = 1;

    if ( bcf_hdr_nsamples(hdr) )
    {
        if (ksprintf(str, "\tFORMAT") < 0)
            failed = 1;
        for (smpl = 0; smpl < bcf_hdr_nsamples(hdr); smpl++) {
            if (ksprintf(str, "\t%s", hdr->samples[smpl]) < 0)
                failed = 1;
        }
    }

    if (ksprintf(str, "\n") < 0)
        failed = 1;

    return failed ? -1 : 0;
}
2517
2518
char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len)
2519
0
{
2520
0
    kstring_t txt = {0,0,0};
2521
0
    if (bcf_hdr_format(hdr, is_bcf, &txt) < 0)
2522
0
        return NULL;
2523
0
    if ( len ) *len = txt.l;
2524
0
    return txt.s;
2525
0
}
2526
2527
const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *n)
2528
0
{
2529
0
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
2530
0
    int i, tid, m = kh_size(d);
2531
0
    const char **names = (const char**) calloc(m,sizeof(const char*));
2532
0
    if ( !names )
2533
0
    {
2534
0
        hts_log_error("Failed to allocate memory");
2535
0
        *n = 0;
2536
0
        return NULL;
2537
0
    }
2538
0
    khint_t k;
2539
0
    for (k=kh_begin(d); k<kh_end(d); k++)
2540
0
    {
2541
0
        if ( !kh_exist(d,k) ) continue;
2542
0
        if ( !kh_val(d, k).hrec[0] ) continue;  // removed via bcf_hdr_remove
2543
0
        tid = kh_val(d,k).id;
2544
0
        if ( tid >= m )
2545
0
        {
2546
            // This can happen after a contig has been removed from BCF header via bcf_hdr_remove()
2547
0
            if ( hts_resize(const char*, tid + 1, &m, &names, HTS_RESIZE_CLEAR)<0 )
2548
0
            {
2549
0
                hts_log_error("Failed to allocate memory");
2550
0
                *n = 0;
2551
0
                free(names);
2552
0
                return NULL;
2553
0
            }
2554
0
            m = tid + 1;
2555
0
        }
2556
0
        names[tid] = kh_key(d,k);
2557
0
    }
2558
    // ensure there are no gaps
2559
0
    for (i=0,tid=0; tid<m; i++,tid++)
2560
0
    {
2561
0
        while ( tid<m && !names[tid] ) tid++;
2562
0
        if ( tid==m ) break;
2563
0
        if ( i==tid ) continue;
2564
0
        names[i] = names[tid];
2565
0
        names[tid] = 0;
2566
0
    }
2567
0
    *n = i;
2568
0
    return names;
2569
0
}
2570
2571
// Write the header h as VCF text to fp.
//
// The header is formatted with is_bcf = 0, trailing NUL bytes are dropped,
// and the text is written through bgzf_write() followed by bgzf_flush() when
// fp's compression is anything other than no_compression, or through
// hwrite() otherwise.
//
// Returns 0 on success, -1 on any formatting, write or flush failure.
int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h)
{
    kstring_t htxt = {0,0,0};
    int ret;

    if (bcf_hdr_format(h, 0, &htxt) < 0) {
        free(htxt.s);
        return -1;
    }
    while (htxt.l && htxt.s[htxt.l-1] == '\0') --htxt.l; // kill trailing zeros

    if ( fp->format.compression!=no_compression ) {
        ret = bgzf_write(fp->fp.bgzf, htxt.s, htxt.l);
        // A failed flush is reported through ret so that the shared cleanup
        // below still frees the header text.
        if (bgzf_flush(fp->fp.bgzf) != 0) ret = -1;
    } else {
        ret = hwrite(fp->fp.hfile, htxt.s, htxt.l);
    }
    free(htxt.s);
    return ret<0 ? -1 : 0;
}
2589
2590
/***********************
2591
 *** Typed value I/O ***
2592
 ***********************/
2593
2594
// Encode an array of n 32-bit integers onto s using the narrowest of the
// INT8 / INT16 / INT32 encodings that holds every ordinary value.
//
// s      output buffer
// n      number of values in a; n <= 0 writes a zero-length BCF_BT_NULL size
//        and n == 1 is delegated to bcf_enc_int1()
// a      values; bcf_int32_missing and bcf_int32_vector_end are mapped to
//        the sentinels of the chosen width in the INT8 and INT16 encodings
// wsize  value written into the size descriptor; n is used when wsize <= 0
//
// Returns 0 on success, negative on failure.
int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
{
    int i;

    if (n <= 0)
        return bcf_enc_size(s, 0, BCF_BT_NULL);
    if (n == 1)
        return bcf_enc_int1(s, a[0]);

    if (wsize <= 0) wsize = n;

    // Range scan, four independent lanes at a time.  Equivalent to:
    // for (i = 0; i < n; ++i) {
    //     if (a[i] == bcf_int32_missing || a[i] == bcf_int32_vector_end )
    //         continue;
    //     if (max < a[i]) max = a[i];
    //     if (min > a[i]) min = a[i];
    // }
    // bcf_int32_missing == INT32_MIN and bcf_int32_vector_end == INT32_MIN+1,
    // so they can never raise the maximum and only the minimum needs an
    // explicit test to skip them.
    int hi[4] = {INT32_MIN, INT32_MIN, INT32_MIN, INT32_MIN};
    int lo[4] = {INT32_MAX, INT32_MAX, INT32_MAX, INT32_MAX};
    const int n4 = n & ~3;

    for (i = 0; i < n4; i += 4) {
        const int32_t *q = a + i;
        if (hi[0] < q[0]) hi[0] = q[0];
        if (hi[1] < q[1]) hi[1] = q[1];
        if (hi[2] < q[2]) hi[2] = q[2];
        if (hi[3] < q[3]) hi[3] = q[3];
        if (lo[0] > q[0] && q[0] > INT32_MIN+1) lo[0] = q[0];
        if (lo[1] > q[1] && q[1] > INT32_MIN+1) lo[1] = q[1];
        if (lo[2] > q[2] && q[2] > INT32_MIN+1) lo[2] = q[2];
        if (lo[3] > q[3] && q[3] > INT32_MIN+1) lo[3] = q[3];
    }

    // Fold the four lanes together, then handle the 0-3 leftover values.
    int32_t vmin = lo[0], vmax = hi[0];
    for (i = 1; i < 4; i++) {
        if (vmin > lo[i]) vmin = lo[i];
        if (vmax < hi[i]) vmax = hi[i];
    }
    for (i = n4; i < n; ++i) {
        if (vmax < a[i]) vmax = a[i];
        if (vmin > a[i] && a[i] > INT32_MIN+1) vmin = a[i];
    }

    if (vmax <= BCF_MAX_BT_INT8 && vmin >= BCF_MIN_BT_INT8) {
        if (bcf_enc_size(s, wsize, BCF_BT_INT8) < 0 ||
            ks_resize(s, s->l + n) < 0)
            return -1;

        uint8_t *out = (uint8_t *) s->s + s->l;
        for (i = 0; i < n; ++i) {
            if ( a[i]==bcf_int32_vector_end )   out[i] = bcf_int8_vector_end;
            else if ( a[i]==bcf_int32_missing ) out[i] = bcf_int8_missing;
            else out[i] = a[i];
        }
        s->l += n;
        return 0;
    }

    if (vmax <= BCF_MAX_BT_INT16 && vmin >= BCF_MIN_BT_INT16) {
        if (bcf_enc_size(s, wsize, BCF_BT_INT16) < 0 ||
            ks_resize(s, s->l + n * sizeof(int16_t)) < 0)
            return -1;

        uint8_t *out = (uint8_t *) s->s + s->l;
        for (i = 0; i < n; ++i, out += sizeof(int16_t)) {
            int16_t x;
            if ( a[i]==bcf_int32_vector_end ) x = bcf_int16_vector_end;
            else if ( a[i]==bcf_int32_missing ) x = bcf_int16_missing;
            else x = a[i];
            i16_to_le(x, out);
        }
        s->l += n * sizeof(int16_t);
        return 0;
    }

    // Full width: values, including the two sentinels, are stored as they are.
    if (bcf_enc_size(s, wsize, BCF_BT_INT32) < 0 ||
        ks_resize(s, s->l + n * sizeof(int32_t)) < 0)
        return -1;

    uint8_t *out = (uint8_t *) s->s + s->l;
    for (i = 0; i < n; ++i, out += sizeof(int32_t))
        i32_to_le(a[i], out);
    s->l += n * sizeof(int32_t);

    return 0;
}
2683
2684
#ifdef VCF_ALLOW_INT64
// Encode a single 64-bit integer onto s.
//
// Values within the BCF_BT_INT32 range are delegated to bcf_enc_int1().
// The 64-bit vector-end and missing sentinels are written as their 8-bit
// counterparts.  Anything else is written as an 8-byte BCF_BT_INT64 value.
//
// Returns 0 on success, -1 on failure.
static int bcf_enc_long1(kstring_t *s, int64_t x) {
    uint32_t failed = 0;

    if (x >= BCF_MIN_BT_INT32 && x <= BCF_MAX_BT_INT32)
        return bcf_enc_int1(s, x);

    if (x == bcf_int64_vector_end || x == bcf_int64_missing) {
        const int sentinel = (x == bcf_int64_vector_end)
            ? bcf_int8_vector_end
            : bcf_int8_missing;
        failed |= bcf_enc_size(s, 1, BCF_BT_INT8);
        failed |= kputc(sentinel, s) < 0;
        return failed == 0 ? 0 : -1;
    }

    failed |= bcf_enc_size(s, 1, BCF_BT_INT64);
    failed |= ks_expand(s, 8);
    if (failed == 0) {
        u64_to_le(x, (uint8_t *) s->s + s->l);
        s->l += 8;
    }
    return failed == 0 ? 0 : -1;
}
#endif
2703
2704
321k
// Append n floats from a to s, each converted with float_to_le().
// Returns 0 on success, -1 if the byte count overflows size_t or the buffer
// cannot be grown.
static inline int serialize_float_array(kstring_t *s, size_t n, const float *a) {
    const size_t nbytes = n * sizeof(float);

    // Dividing back detects wrap-around in the multiplication above.
    if (nbytes / sizeof(float) != n) return -1;
    if (ks_resize(s, s->l + nbytes) < 0) return -1;

    uint8_t *out = (uint8_t *) s->s + s->l;
    const float *src = a;
    size_t left;
    for (left = n; left > 0; left--, src++, out += sizeof(float))
        float_to_le(*src, out);

    s->l += nbytes;
    return 0;
}
2721
2722
// Encode an array of n floats onto s: a BCF_BT_FLOAT size descriptor
// followed by the serialized values.
//
// n must be non-negative (asserted).
// Returns 0 on success, -1 if either step fails.  Nothing further is
// appended once a step has failed.
int bcf_enc_vfloat(kstring_t *s, int n, float *a)
{
    assert(n >= 0);
    if (bcf_enc_size(s, n, BCF_BT_FLOAT) < 0) return -1;
    if (serialize_float_array(s, n, a) < 0) return -1;
    return 0;
}
2729
2730
// Encode l bytes of character data from a onto s: a BCF_BT_CHAR size
// descriptor followed by the bytes themselves.
//
// Returns 0 on success, -1 if either append fails.  Nothing further is
// appended once a step has failed.
int bcf_enc_vchar(kstring_t *s, int l, const char *a)
{
    if (bcf_enc_size(s, l, BCF_BT_CHAR) < 0) return -1;
    if (kputsn(a, l, s) < 0) return -1;
    return 0;
}
2736
2737
// Special case of n==1 as it also occurs quite often in FORMAT data.
2738
// This version is also small enough to get inlined.
2739
6.27k
// Format a single typed value at data onto s.
//
// A missing value is written as '.', a vector-end value writes nothing, and
// anything else is written as a character, integer or floating-point number
// according to type.
//
// Returns 0 on success, -1 on an append failure or an unexpected type.
static inline int bcf_fmt_array1(kstring_t *s, int type, void *data) {
    uint8_t *raw = (uint8_t *)data;
    int32_t val;
    int rc = 0;     // result of the append; stays 0 when nothing is written

    // helps gcc more than clang here. In billions of cycles:
    //          bcf_fmt_array1  bcf_fmt_array
    // gcc7:    23.2            24.3
    // gcc13:   21.6            23.0
    // clang13: 27.1            27.8
    switch (type) {
    case BCF_BT_CHAR:
        rc = kputc_(*raw == bcf_str_missing ? '.' : *raw, s);
        break;

    case BCF_BT_INT8:
        val = *(int8_t *)raw;
        if (val != bcf_int8_vector_end)
            rc = (val == bcf_int8_missing) ? kputc_('.', s) : kputw(val, s);
        break;

    case BCF_BT_INT16:
        val = le_to_i16(raw);
        if (val != bcf_int16_vector_end)
            rc = (val == bcf_int16_missing) ? kputc_('.', s) : kputw(val, s);
        break;

    case BCF_BT_INT32:
        val = le_to_i32(raw);
        if (val != bcf_int32_vector_end)
            rc = (val == bcf_int32_missing) ? kputc_('.', s) : kputw(val, s);
        break;

    case BCF_BT_FLOAT:
        // The sentinels are compared on the raw 32-bit pattern.
        val = le_to_u32(raw);
        if (val != bcf_float_vector_end)
            rc = (val == bcf_float_missing)
                ? kputc_('.', s)
                : kputd(le_to_float(raw), s);
        break;

    default:
        hts_log_error("Unexpected type %d", type);
        return -1;
    }

    return rc < 0 ? -1 : 0;
}
2795
2796
// Format an array of n typed values at data onto s.
//
// n == 0 writes a single '.'.  BCF_BT_CHAR data is written as text up to
// the first NUL byte or n characters.  Numeric data is written as a
// comma-separated list that stops at the first vector-end value, with
// missing values written as '.'.
//
// Returns 0 on success, -1 on an append failure or an unexpected type.
// An unexpected type is reported to the caller (as bcf_fmt_array1 does)
// rather than terminating the process from inside the library.
int bcf_fmt_array(kstring_t *s, int n, int type, void *data)
{
    int j = 0;
    uint32_t e = 0;
    if (n == 0) {
        return kputc_('.', s) >= 0 ? 0 : -1;
    }

    if (type == BCF_BT_CHAR)
    {
        char *p = (char *)data;

        // Note bcf_str_missing is already accounted for in n==0 above.
        if (n >= 8) {
            // Longer strings: find the terminator once and append in bulk
            char *p_end = memchr(p, 0, n);
            e |= kputsn(p, p_end ? p_end-p : n, s) < 0;
        } else {
            for (j = 0; j < n && *p; ++j, ++p)
               e |= kputc(*p, s) < 0;
        }
    }
    else
    {
        // One instantiation per numeric type: decode each element, stop at
        // vector-end, separate elements with ',' and print '.' for missing.
        #define BRANCH(type_t, convert, is_missing, is_vector_end, kprint) { \
            uint8_t *p = (uint8_t *) data; \
            for (j=0; j<n; j++, p += sizeof(type_t))    \
            { \
                type_t v = convert(p); \
                if ( is_vector_end ) break; \
                if ( j ) e |= kputc_(',', s) < 0; \
                e |= (is_missing ? kputc('.', s) : kprint) < 0; \
            } \
        }
        switch (type) {
            case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8, v==bcf_int8_missing,  v==bcf_int8_vector_end,  kputw(v, s)); break;
            case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, v==bcf_int16_missing, v==bcf_int16_vector_end, kputw(v, s)); break;
            case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, v==bcf_int32_missing, v==bcf_int32_vector_end, kputw(v, s)); break;
            case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, v==bcf_float_missing, v==bcf_float_vector_end, kputd(le_to_float(p), s)); break;
            default:
                hts_log_error("Unexpected type %d", type);
                return -1;
        }
        #undef BRANCH
    }
    return e == 0 ? 0 : -1;
}
2840
2841
// Decode the size/type descriptor at ptr, format the values that follow it
// onto s, and return the address just past those values.  The result of
// bcf_fmt_array() is not checked here.
uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr)
{
    int type;
    int count = bcf_dec_size(ptr, &ptr, &type);   // also advances ptr past the descriptor

    bcf_fmt_array(s, count, type, ptr);

    // Each element occupies 1 << bcf_type_shift[type] bytes.
    return ptr + (count << bcf_type_shift[type]);
}
2848
2849
/********************
2850
 *** VCF site I/O ***
2851
 ********************/
2852
2853
// Per-FORMAT-key scratch record used while parsing one VCF line.
typedef struct {
    int key;             // index into h->id[BCF_DT_ID] for this FORMAT key
    int max_m;           // largest number of elements in the field array (ie commas)
    int size;            // bytes per sample (max_l, or max_g*4 if is_gt)
    int offset;          // where buf starts within h->mem
    uint32_t is_gt:1;    // set when the key is the genotype field
    uint32_t max_g:31;   // largest number of genotypes seen
    uint32_t max_l;      // longest field text seen
    uint32_t y;          // h->id[0][fmt[j].key].val->info[BCF_HL_FMT]
    uint8_t *buf;        // points into h->mem
} fmt_aux_t;
2864
2865
// fmt_aux_t field notes:
2866
// max_* are biggest sizes of the various FORMAT fields across all samples.
2867
// We use these after pivoting the data to ensure easy random access
2868
// of a specific sample.
2869
//
2870
// max_m is only used for type BCF_HT_REAL or BCF_HT_INT
2871
// max_g is only used for is_gt == 1 (will be BCF_HT_STR)
2872
// max_l is only used for is_gt == 0 (will be BCF_HT_STR)
2873
//
2874
// These are computed in vcf_parse_format_max3 and used in
2875
// vcf_parse_format_alloc4 to get the size.
2876
//
2877
// size is computed from max_g, max_l, max_m and is_gt.  Once computed
2878
// the max values are never accessed again.
2879
//
2880
// In theory all 4 vars could be coalesced into a single variable, but this
2881
// significantly harms speed (even if done via a union).  It's about 25-30%
2882
// slower.
2883
2884
// Pad s with zero bytes until its length is a multiple of 8.
// Returns 0 on success (including when no padding is needed), -1 if the
// append fails.
static inline int align_mem(kstring_t *s)
{
    const size_t over = s->l & 7;     // bytes past the last 8-byte boundary
    if (over == 0)
        return 0;

    uint64_t zero = 0;                // source of up to 7 zero bytes
    if (kputsn((char*)&zero, 8 - over, s) < 0)
        return -1;
    return 0;
}
2893
2894
48.3k
#define MAX_N_FMT 255   /* Limited by size of bcf1_t n_fmt field */
2895
2896
// detect FORMAT "."
2897
// FORMAT parsing, step 1: validate that something follows the FORMAT column
// and detect an empty FORMAT (".").
//
// p points at the FORMAT text and q is compared against the end of the line
// buffer s.  v->n_fmt is reset to 0.
//
// Returns -1 (with BCF_ERR_NCOLS set) if q is at or past the end of the
// line, 1 if FORMAT is exactly "." (v->n_sample is then set to the header's
// sample count), and 0 otherwise.
static int vcf_parse_format_empty1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
                                   const char *p, const char *q) {
    const char *line_end = s->s + s->l;

    if ( q >= line_end )
    {
        hts_log_error("FORMAT column with no sample columns starting at %s:%"PRIhts_pos"", bcf_seqname_safe(h,v), v->pos+1);
        v->errcode |= BCF_ERR_NCOLS;
        return -1;
    }

    v->n_fmt = 0;

    const int format_is_dot = (p[0]=='.' && p[1]==0);
    if ( !format_is_dot )
        return 0;

    // FORMAT field is empty "."
    v->n_sample = bcf_hdr_nsamples(h);
    return 1;
}
2916
2917
// get format information from the dictionary
2918
// FORMAT parsing, step 2: split the FORMAT column at p on ':' and look each
// key up in the header's BCF_DT_ID dictionary, filling one fmt[] entry per
// key and counting them in v->n_fmt.
//
// A key that is not found (or whose info[BCF_HL_FMT] is 15) gets a dummy
// "Type=String" FORMAT header line added to h, with a warning and
// BCF_ERR_TAG_UNDEF recorded in v->errcode.
//
// Returns 0 on success and -1 on error: too many keys (BCF_ERR_LIMITS), a
// key named "." or a dummy header that could not be added
// (BCF_ERR_TAG_INVALID).
static int vcf_parse_format_dict2(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
                                  const char *p, const char *q, fmt_aux_t *fmt) {
    const vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
    char *t;
    int j;
    ks_tokaux_t aux1;

    for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) {
        if (j >= MAX_N_FMT) {
            v->errcode |= BCF_ERR_LIMITS;
            hts_log_error("FORMAT column at %s:%"PRIhts_pos" lists more identifiers than htslib can handle",
                bcf_seqname_safe(h,v), v->pos+1);
            return -1;
        }

        // NUL-terminate the current token in place so it can be used as a key
        *(char*)aux1.p = 0;
        khint_t k = kh_get(vdict, d, t);
        if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_FMT] == 15) {
            if ( t[0]=='.' && t[1]==0 )
            {
                hts_log_error("Invalid FORMAT tag name '.' at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
                v->errcode |= BCF_ERR_TAG_INVALID;
                return -1;
            }
            hts_log_warning("FORMAT '%s' at %s:%"PRIhts_pos" is not defined in the header, assuming Type=String", t, bcf_seqname_safe(h,v), v->pos+1);
            kstring_t tmp = {0,0,0};
            int l;
            bcf_hrec_t *hrec = NULL;
            // Only parse the dummy line if it was actually built; after a
            // failed ksprintf() tmp.s may be NULL or incomplete.
            if (ksprintf(&tmp, "##FORMAT=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", t) >= 0)
                hrec = bcf_hdr_parse_line(h,tmp.s,&l);
            free(tmp.s);
            int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
            if (res < 0) bcf_hrec_destroy(hrec);
            if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);

            // Look the key up again now that the header may contain it
            k = kh_get(vdict, d, t);
            v->errcode |= BCF_ERR_TAG_UNDEF;
            if (res || k == kh_end(d)) {
                hts_log_error("Could not add dummy header for FORMAT '%s' at %s:%"PRIhts_pos, t, bcf_seqname_safe(h,v), v->pos+1);
                v->errcode |= BCF_ERR_TAG_INVALID;
                return -1;
            }
        }
        fmt[j].max_l = fmt[j].max_m = fmt[j].max_g = 0;
        fmt[j].key = kh_val(d, k).id;
        fmt[j].is_gt = (t[0] == 'G' && t[1] == 'T' && !t[2]);
        fmt[j].y = h->id[0][fmt[j].key].val->info[BCF_HL_FMT];
        v->n_fmt++;
    }
    return 0;
}
2968
2969
// compute max
2970
// FORMAT parsing, step 3: scan the sample columns starting at q+1 and record,
// per FORMAT key, the largest element count (max_m), text length (max_l)
// and, for the genotype key, allele count (max_g) seen in any sample.
//
// Tabs between sample columns are overwritten with NUL as they are passed,
// and v->n_sample is set to the number of samples scanned.  Samples not
// selected in h->keep_samples are skipped without being counted.
//
// Returns 0 on success, or -1 with BCF_ERR_NCOLS set if a sample has more
// ':'-separated fields than there are FORMAT keys.
static int vcf_parse_format_max3(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
                                 char *p, char *q, fmt_aux_t *fmt) {
    int n_sample_ori = -1;
    char *r = q + 1;  // r: position in the format string
    int l = 0, m = 1, g = 1, j;
    v->n_sample = 0;  // m: max vector size, l: max field len, g: max number of alleles
    const char *end = s->s + s->l;

    while ( r<end )
    {
        // can we skip some samples?
        if ( h->keep_samples )
        {
            n_sample_ori++;
            if ( !bit_array_test(h->keep_samples,n_sample_ori) )
            {
                // Check the bound before looking at the character
                while ( r<end && *r!='\t' ) r++;
                if ( r<end && *r=='\t' ) { *r = 0; r++; }
                continue;
            }
        }

        // collect fmt stats: max vector size, length, number of alleles
        j = 0;  // j-th format field
        fmt_aux_t *f = fmt;
        // Characters that need attention while scanning a sample column
        static const char meta[256] = {
            ['\0'] = 1, ['\t'] = 1, [','] = 1, ['/'] = 1, [':'] = 1, ['|'] = 1
        };

        char *r_start = r;
        for (;;) {
            // Quickly skip ahead to an appropriate meta-character
            while (!meta[(unsigned char)*r]) r++;

            switch (*r) {
            case ',':
                m++;
                break;

            case '|':
            case '/':
                if (f->is_gt) g++;
                break;

            case '\t':
                *r = 0; // fall through

            default: // valid due to while loop above.
            case '\0':
            case ':':
                // End of a field: fold its statistics into the running maxima
                l = r - r_start; r_start = r;
                if (f->max_m < m) f->max_m = m;
                if (f->max_l < l) f->max_l = l;
                if (f->is_gt && f->max_g < g) f->max_g = g;
                l = 0, m = g = 1;
                if ( *r==':' ) {
                    j++; f++;
                    if ( j>=v->n_fmt ) {
                        // Same safe contig-name lookup as the sibling
                        // vcf_parse_format_* error messages
                        hts_log_error("Incorrect number of FORMAT fields at %s:%"PRIhts_pos"",
                                      bcf_seqname_safe(h,v), v->pos+1);
                        v->errcode |= BCF_ERR_NCOLS;
                        return -1;
                    }
                } else goto end_for;
                break;
            }
            if ( r>=end ) break;
            r++;
        }
    end_for:
        v->n_sample++;
        if ( v->n_sample == bcf_hdr_nsamples(h) ) break;
        r++;
    }

    return 0;
}
3055
3056
// allocate memory for arrays
3057
static int vcf_parse_format_alloc4(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3058
                                   const char *p, const char *q,
3059
12.3k
                                   fmt_aux_t *fmt) {
3060
12.3k
    kstring_t *mem = (kstring_t*)&h->mem;
3061
3062
12.3k
    int j;
3063
59.8k
    for (j = 0; j < v->n_fmt; ++j) {
3064
47.4k
        fmt_aux_t *f = &fmt[j];
3065
47.4k
        if ( !f->max_m ) f->max_m = 1;  // omitted trailing format field
3066
3067
47.4k
        if ((f->y>>4&0xf) == BCF_HT_STR) {
3068
47.4k
            f->size = f->is_gt? f->max_g << 2 : f->max_l;
3069
47.4k
        } else if ((f->y>>4&0xf) == BCF_HT_REAL || (f->y>>4&0xf) == BCF_HT_INT) {
3070
0
            f->size = f->max_m << 2;
3071
0
        } else {
3072
0
            hts_log_error("The format type %d at %s:%"PRIhts_pos" is currently not supported", f->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1);
3073
0
            v->errcode |= BCF_ERR_TAG_INVALID;
3074
0
            return -1;
3075
0
        }
3076
3077
47.4k
        if (align_mem(mem) < 0) {
3078
0
            hts_log_error("Memory allocation failure at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3079
0
            v->errcode |= BCF_ERR_LIMITS;
3080
0
            return -1;
3081
0
        }
3082
3083
        // Limit the total memory to ~2Gb per VCF row.  This should mean
3084
        // malformed VCF data is less likely to take excessive memory and/or
3085
        // time.
3086
47.4k
        if ((uint64_t) mem->l + v->n_sample * (uint64_t)f->size > INT_MAX) {
3087
0
            static int warned = 0;
3088
0
            if ( !warned ) hts_log_warning("Excessive memory required by FORMAT fields at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3089
0
            warned = 1;
3090
0
            v->errcode |= BCF_ERR_LIMITS;
3091
0
            f->size = -1;
3092
0
            f->offset = 0;
3093
0
            continue;
3094
0
        }
3095
3096
47.4k
        f->offset = mem->l;
3097
47.4k
        if (ks_resize(mem, mem->l + v->n_sample * (size_t)f->size) < 0) {
3098
0
            hts_log_error("Memory allocation failure at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3099
0
            v->errcode |= BCF_ERR_LIMITS;
3100
0
            return -1;
3101
0
        }
3102
47.4k
        mem->l += v->n_sample * f->size;
3103
47.4k
    }
3104
3105
12.3k
    {
3106
12.3k
        int j;
3107
59.8k
        for (j = 0; j < v->n_fmt; ++j)
3108
47.4k
            fmt[j].buf = (uint8_t*)mem->s + fmt[j].offset;
3109
12.3k
    }
3110
3111
    // check for duplicate tags
3112
12.3k
    int i;
3113
47.4k
    for (i=1; i<v->n_fmt; i++)
3114
35.1k
    {
3115
35.1k
        fmt_aux_t *ifmt = &fmt[i];
3116
35.1k
        if ( ifmt->size==-1 ) continue; // already marked for removal
3117
154k
        for (j=0; j<i; j++)
3118
140k
        {
3119
140k
            fmt_aux_t *jfmt = &fmt[j];
3120
140k
            if ( jfmt->size==-1 ) continue; // already marked for removal
3121
74.7k
            if ( ifmt->key!=jfmt->key ) continue;
3122
20.7k
            static int warned = 0;
3123
20.7k
            if ( !warned ) hts_log_warning("Duplicate FORMAT tag %s at %s:%"PRIhts_pos, bcf_hdr_int2id(h,BCF_DT_ID,ifmt->key), bcf_seqname_safe(h,v), v->pos+1);
3124
20.7k
            warned = 1;
3125
20.7k
            v->errcode |= BCF_ERR_TAG_INVALID;
3126
20.7k
            ifmt->size = -1;
3127
20.7k
            ifmt->offset = 0;
3128
20.7k
            break;
3129
74.7k
        }
3130
35.1k
    }
3131
12.3k
    return 0;
3132
12.3k
}
3133
3134
// Fill the sample fields
3135
static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3136
12.3k
                                  const char *p, const char *q, fmt_aux_t *fmt) {
3137
12.3k
    static int extreme_val_warned = 0;
3138
12.3k
    int n_sample_ori = -1;
3139
    // At beginning of the loop t points to the first char of a format
3140
12.3k
    const char *t = q + 1;
3141
12.3k
    int m = 0;   // m: sample id
3142
12.3k
    const int nsamples = bcf_hdr_nsamples(h);
3143
12.3k
    const char *end = s->s + s->l;
3144
3145
12.3k
    int ver = bcf_get_version(h, NULL);
3146
3147
46.5k
    while ( t<end )
3148
45.3k
    {
3149
        // can we skip some samples?
3150
45.3k
        if ( h->keep_samples )
3151
0
        {
3152
0
            n_sample_ori++;
3153
0
            if ( !bit_array_test(h->keep_samples,n_sample_ori) )
3154
0
            {
3155
0
                while ( *t && t<end ) t++;
3156
0
                t++;
3157
0
                continue;
3158
0
            }
3159
0
        }
3160
45.3k
        if ( m == nsamples ) break;
3161
3162
34.3k
        int j = 0; // j-th format field, m-th sample
3163
47.8k
        while ( t < end )
3164
47.1k
        {
3165
47.1k
            fmt_aux_t *z = &fmt[j++];
3166
47.1k
            const int htype = z->y>>4&0xf;
3167
47.1k
            if (!z->buf) {
3168
7
                hts_log_error("Memory allocation failure for FORMAT field type %d at %s:%"PRIhts_pos,
3169
7
                              z->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1);
3170
7
                v->errcode |= BCF_ERR_LIMITS;
3171
7
                return -1;
3172
7
            }
3173
3174
47.1k
            if ( z->size==-1 )
3175
4.80k
            {
3176
                // this field is to be ignored, it's either too big or a duplicate
3177
144k
                while ( *t != ':' && *t ) t++;
3178
4.80k
            }
3179
42.3k
            else if (htype == BCF_HT_STR) {
3180
42.3k
                int l;
3181
42.3k
                if (z->is_gt) {
3182
                    // Genotypes.
3183
                    //([/|])?<val>)([|/]<val>)+... where <val> is [0-9]+ or ".".
3184
5.67k
                    int32_t is_phased = 0;
3185
5.67k
                    uint32_t *x = (uint32_t*)(z->buf + z->size * (size_t)m);
3186
5.67k
                    uint32_t unreadable = 0;
3187
5.67k
                    uint32_t max = 0;
3188
5.67k
                    int overflow = 0, ploidy = 0, anyunphased = 0, \
3189
5.67k
                        phasingprfx = 0, unknown1 = 0;
3190
3191
                    /* with prefixed phasing, it is explicitly given for 1st one
3192
                    with non-prefixed, set based on ploidy and phasing of other
3193
                    alleles. */
3194
5.67k
                    if (ver >= VCF44 && (*t == '|' || *t == '/')) {
3195
                        // cache prefix and phasing status
3196
4
                        is_phased = *t++ == '|';
3197
4
                        phasingprfx = 1;
3198
4
                    }
3199
3200
20.1k
                    for (l = 0;; ++t) {
3201
20.1k
                        ploidy++;
3202
20.1k
                        if (*t == '.') {
3203
5.96k
                            ++t, x[l++] = is_phased;
3204
5.96k
                            if (l==1) {   //for 1st allele only
3205
678
                                unknown1 = 1;
3206
678
                            }
3207
14.2k
                        } else {
3208
14.2k
                            const char *tt = t;
3209
14.2k
                            uint32_t val;
3210
                            // Or "v->n_allele < 10", but it doesn't
3211
                            // seem to be any faster and this feels safer.
3212
14.2k
                            if (*t >= '0' && *t <= '9' &&
3213
14.1k
                                !(t[1] >= '0' && t[1] <= '9')) {
3214
7.61k
                                val = *t++ - '0';
3215
7.61k
                            } else {
3216
6.62k
                                val = hts_str2uint(t, (char **)&t,
3217
6.62k
                                                   sizeof(val) * CHAR_MAX - 2,
3218
6.62k
                                                   &overflow);
3219
6.62k
                                unreadable |= tt == t;
3220
6.62k
                            }
3221
14.2k
                            if (max < val) max = val;
3222
14.2k
                            x[l++] = (val + 1) << 1 | is_phased;
3223
14.2k
                        }
3224
20.1k
                        anyunphased |= (ploidy != 1) && !is_phased;
3225
20.1k
                        is_phased = (*t == '|');
3226
20.1k
                        if (*t != '|' && *t != '/') break;
3227
20.1k
                    }
3228
5.67k
                    if (ver >= VCF44 && !phasingprfx) {
3229
                        /* no explicit phasing for 1st allele, set based on
3230
                         other alleles and ploidy */
3231
4.84k
                        if (ploidy == 1) {  //implicitly phased
3232
1.13k
                            if (!unknown1) {
3233
1.06k
                                x[0] |= 1;
3234
1.06k
                            }
3235
3.70k
                        } else {            //set by other unphased alleles
3236
3.70k
                            x[0] |= (anyunphased)? 0 : 1;
3237
3.70k
                        }
3238
4.84k
                    }
3239
                    // Possibly check max against v->n_allele instead?
3240
5.67k
                    if (overflow || max > (INT32_MAX >> 1) - 1) {
3241
50
                        hts_log_error("Couldn't read GT data: value too large at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3242
50
                        return -1;
3243
50
                    }
3244
5.62k
                    if (unreadable) {
3245
33
                        hts_log_error("Couldn't read GT data: value not a number or '.' at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3246
33
                        return -1;
3247
33
                    }
3248
5.58k
                    if ( !l ) x[l++] = 0;   // An empty field, insert missing value
3249
6.51k
                    for (; l < z->size>>2; ++l)
3250
929
                        x[l] = bcf_int32_vector_end;
3251
3252
36.6k
                } else {
3253
                    // Otherwise arbitrary strings
3254
36.6k
                    char *x = (char*)z->buf + z->size * (size_t)m;
3255
4.97M
                    for (l = 0; *t != ':' && *t; ++t)
3256
4.94M
                        x[l++] = *t;
3257
36.6k
                    if (z->size > l)
3258
21.6k
                        memset(&x[l], 0, (z->size-l) * sizeof(*x));
3259
36.6k
                }
3260
3261
42.3k
            } else if (htype == BCF_HT_INT) {
3262
                // One or more integers in an array
3263
0
                int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
3264
0
                int l;
3265
0
                for (l = 0;; ++t) {
3266
0
                    if (*t == '.') {
3267
0
                        x[l++] = bcf_int32_missing, ++t; // ++t to skip "."
3268
0
                    } else {
3269
0
                        int overflow = 0;
3270
0
                        char *te;
3271
0
                        long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
3272
0
                        if ( te==t || overflow || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 )
3273
0
                        {
3274
0
                            if ( !extreme_val_warned )
3275
0
                            {
3276
0
                                hts_log_warning("Extreme FORMAT/%s value encountered and set to missing at %s:%"PRIhts_pos,
3277
0
                                                h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname_safe(h,v), v->pos+1);
3278
0
                                extreme_val_warned = 1;
3279
0
                            }
3280
0
                            tmp_val = bcf_int32_missing;
3281
0
                        }
3282
0
                        x[l++] = tmp_val;
3283
0
                        t = te;
3284
0
                    }
3285
0
                    if (*t != ',') break;
3286
0
                }
3287
0
                if ( !l )
3288
0
                    x[l++] = bcf_int32_missing;
3289
0
                for (; l < z->size>>2; ++l)
3290
0
                    x[l] = bcf_int32_vector_end;
3291
3292
0
            } else if (htype == BCF_HT_REAL) {
3293
                // One of more floating point values in an array
3294
0
                float *x = (float*)(z->buf + z->size * (size_t)m);
3295
0
                int l;
3296
0
                for (l = 0;; ++t) {
3297
0
                    if (*t == '.' && !isdigit_c(t[1])) {
3298
0
                        bcf_float_set_missing(x[l++]), ++t; // ++t to skip "."
3299
0
                    } else {
3300
0
                        int overflow = 0;
3301
0
                        char *te;
3302
0
                        float tmp_val = hts_str2dbl(t, &te, &overflow);
3303
0
                        if ( (te==t || overflow) && !extreme_val_warned )
3304
0
                        {
3305
0
                            hts_log_warning("Extreme FORMAT/%s value encountered at %s:%"PRIhts_pos, h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname(h,v), v->pos+1);
3306
0
                            extreme_val_warned = 1;
3307
0
                        }
3308
0
                        x[l++] = tmp_val;
3309
0
                        t = te;
3310
0
                    }
3311
0
                    if (*t != ',') break;
3312
0
                }
3313
0
                if ( !l )
3314
                    // An empty field, insert missing value
3315
0
                    bcf_float_set_missing(x[l++]);
3316
0
                for (; l < z->size>>2; ++l)
3317
0
                    bcf_float_set_vector_end(x[l]);
3318
0
            } else {
3319
0
                hts_log_error("Unknown FORMAT field type %d at %s:%"PRIhts_pos, htype, bcf_seqname_safe(h,v), v->pos+1);
3320
0
                v->errcode |= BCF_ERR_TAG_INVALID;
3321
0
                return -1;
3322
0
            }
3323
3324
47.0k
            if (*t == '\0') {
3325
33.4k
                break;
3326
33.4k
            }
3327
13.5k
            else if (*t == ':') {
3328
13.5k
                t++;
3329
13.5k
            }
3330
39
            else {
3331
39
                char buffer[8];
3332
39
                hts_log_error("Invalid character %s in '%s' FORMAT field at %s:%"PRIhts_pos"",
3333
39
                    hts_strprint(buffer, sizeof buffer, '\'', t, 1),
3334
39
                    h->id[BCF_DT_ID][z->key].key, bcf_seqname_safe(h,v), v->pos+1);
3335
39
                v->errcode |= BCF_ERR_CHAR;
3336
39
                return -1;
3337
39
            }
3338
47.0k
        }
3339
3340
        // fill end-of-vector values
3341
474k
        for (; j < v->n_fmt; ++j) {
3342
439k
            fmt_aux_t *z = &fmt[j];
3343
439k
            const int htype = z->y>>4&0xf;
3344
439k
            int l;
3345
3346
439k
            if (z->size == -1) // this field is to be ignored
3347
353k
                continue;
3348
3349
86.2k
            if (htype == BCF_HT_STR) {
3350
86.2k
                if (z->is_gt) {
3351
14.8k
                    int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
3352
14.8k
                    if (z->size) x[0] = bcf_int32_missing;
3353
37.3k
                    for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
3354
71.3k
                } else {
3355
71.3k
                    char *x = (char*)z->buf + z->size * (size_t)m;
3356
71.3k
                    if ( z->size ) {
3357
12.8k
                        x[0] = '.';
3358
12.8k
                        memset(&x[1], 0, (z->size-1) * sizeof(*x));
3359
12.8k
                    }
3360
71.3k
                }
3361
86.2k
            } else if (htype == BCF_HT_INT) {
3362
0
                int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
3363
0
                x[0] = bcf_int32_missing;
3364
0
                for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
3365
0
            } else if (htype == BCF_HT_REAL) {
3366
0
                float *x = (float*)(z->buf + z->size * (size_t)m);
3367
0
                bcf_float_set_missing(x[0]);
3368
0
                for (l = 1; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]);
3369
0
            }
3370
86.2k
        }
3371
3372
34.2k
        m++; t++;
3373
34.2k
    }
3374
3375
12.2k
    return 0;
3376
12.3k
}
3377
3378
// write individual genotype information
3379
static int vcf_parse_format_gt6(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3380
12.2k
                                const char *p, const char *q, fmt_aux_t *fmt) {
3381
12.2k
    kstring_t *str = &v->indiv;
3382
12.2k
    int i, need_downsize = 0;
3383
12.2k
    if (v->n_sample > 0) {
3384
59.2k
        for (i = 0; i < v->n_fmt; ++i) {
3385
47.0k
            fmt_aux_t *z = &fmt[i];
3386
47.0k
            if ( z->size==-1 ) {
3387
20.6k
                need_downsize = 1;
3388
20.6k
                continue;
3389
20.6k
            }
3390
26.3k
            bcf_enc_int1(str, z->key);
3391
26.3k
            if ((z->y>>4&0xf) == BCF_HT_STR && !z->is_gt) {
3392
20.1k
                bcf_enc_size(str, z->size, BCF_BT_CHAR);
3393
20.1k
                kputsn((char*)z->buf, z->size * (size_t)v->n_sample, str);
3394
20.1k
            } else if ((z->y>>4&0xf) == BCF_HT_INT || z->is_gt) {
3395
6.19k
                bcf_enc_vint(str, (z->size>>2) * v->n_sample, (int32_t*)z->buf, z->size>>2);
3396
6.19k
            } else {
3397
0
                bcf_enc_size(str, z->size>>2, BCF_BT_FLOAT);
3398
0
                if (serialize_float_array(str, (z->size>>2) * (size_t)v->n_sample,
3399
0
                                          (float *) z->buf) != 0) {
3400
0
                    v->errcode |= BCF_ERR_LIMITS;
3401
0
                    hts_log_error("Out of memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3402
0
                    return -1;
3403
0
                }
3404
0
            }
3405
26.3k
        }
3406
3407
12.2k
    }
3408
12.2k
    if ( need_downsize ) {
3409
5.31k
        i = 0;
3410
39.6k
        while ( i < v->n_fmt ) {
3411
34.3k
            if ( fmt[i].size==-1 )
3412
20.6k
            {
3413
20.6k
                v->n_fmt--;
3414
20.6k
                if ( i < v->n_fmt ) memmove(&fmt[i],&fmt[i+1],sizeof(*fmt)*(v->n_fmt-i));
3415
20.6k
            }
3416
13.6k
            else
3417
13.6k
                i++;
3418
34.3k
        }
3419
5.31k
    }
3420
12.2k
    return 0;
3421
12.2k
}
3422
3423
// validity checking
3424
12.2k
static int vcf_parse_format_check7(const bcf_hdr_t *h, bcf1_t *v) {
3425
12.2k
    if ( v->n_sample!=bcf_hdr_nsamples(h) )
3426
60
    {
3427
60
        hts_log_error("Number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)",
3428
60
            bcf_seqname_safe(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h));
3429
60
        v->errcode |= BCF_ERR_NCOLS;
3430
60
        return -1;
3431
60
    }
3432
12.1k
    if ( v->indiv.l > 0xffffffff )
3433
0
    {
3434
0
        hts_log_error("The FORMAT at %s:%"PRIhts_pos" is too long", bcf_seqname_safe(h,v), v->pos+1);
3435
0
        v->errcode |= BCF_ERR_LIMITS;
3436
3437
        // Error recovery: return -1 if this is a critical error or 0 if we want to ignore the FORMAT and proceed
3438
0
        v->n_fmt = 0;
3439
0
        return -1;
3440
0
    }
3441
3442
12.1k
    return 0;
3443
12.1k
}
3444
3445
// p,q is the start and the end of the FORMAT field
3446
static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3447
                            char *p, char *q)
3448
37.2k
{
3449
37.2k
    if ( !bcf_hdr_nsamples(h) ) return 0;
3450
12.4k
    kstring_t *mem = (kstring_t*)&h->mem;
3451
12.4k
    mem->l = 0;
3452
3453
12.4k
    fmt_aux_t fmt[MAX_N_FMT];
3454
3455
    // detect FORMAT "."
3456
12.4k
    int ret; // +ve = ok, -ve = err
3457
12.4k
    if ((ret = vcf_parse_format_empty1(s, h, v, p, q)))
3458
61
        return ret ? 0 : -1;
3459
3460
    // get format information from the dictionary
3461
12.3k
    if (vcf_parse_format_dict2(s, h, v, p, q, fmt) < 0)
3462
21
        return -1;
3463
3464
    // FORMAT data is per-sample A:B:C A:B:C A:B:C ... but in memory it is
3465
    // stored as per-type arrays AAA... BBB... CCC...  This is basically
3466
    // a data rotation or pivot.
3467
3468
    // The size of elements in the array grow to their maximum needed,
3469
    // permitting fast random access.  This means however we have to first
3470
    // scan the whole FORMAT line to find the maximum of each type, and
3471
    // then scan it again to find the store the data.
3472
    // We break this down into compute-max, allocate, fill-out-buffers
3473
3474
    // TODO: ?
3475
    // The alternative would be to pivot on the first pass, with fixed
3476
    // size entries for numerics and concatenated strings otherwise, also
3477
    // tracking maximum sizes.  Then on a second pass we reallocate and
3478
    // copy the data again to a uniformly sized array.  Two passes through
3479
    // memory, but without doubling string parsing.
3480
3481
    // compute max
3482
12.3k
    if (vcf_parse_format_max3(s, h, v, p, q, fmt) < 0)
3483
5
        return -1;
3484
3485
    // allocate memory for arrays
3486
12.3k
    if (vcf_parse_format_alloc4(s, h, v, p, q, fmt) < 0)
3487
0
        return -1;
3488
3489
    // fill the sample fields; at beginning of the loop
3490
12.3k
    if (vcf_parse_format_fill5(s, h, v, p, q, fmt) < 0)
3491
129
        return -1;
3492
3493
    // write individual genotype information
3494
12.2k
    if (vcf_parse_format_gt6(s, h, v, p, q, fmt) < 0)
3495
0
        return -1;
3496
3497
    // validity checking
3498
12.2k
    if (vcf_parse_format_check7(h, v) < 0)
3499
60
        return -1;
3500
3501
12.1k
    return 0;
3502
12.2k
}
3503
3504
4.05k
// Try to register contig name p in header h by synthesising a
// "##contig=<ID=...>" line, then look it up again in dictionary d.
// Returns the dictionary iterator for p, or kh_end(d) if the header
// could not be extended.
static khint_t fix_chromosome(const bcf_hdr_t *h, vdict_t *d, const char *p) {
    // Simple error recovery for chromosomes not defined in the header. It will not help when VCF header has
    // been already printed, but will enable tools like vcfcheck to proceed.

    kstring_t tmp = {0,0,0};
    khint_t k;
    int l;
    if (ksprintf(&tmp, "##contig=<ID=%s>", p) < 0) {
        free(tmp.s);
        return kh_end(d);
    }
    bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
    free(tmp.s);
    int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
    if (res < 0) bcf_hrec_destroy(hrec);
    if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
    k = kh_get(vdict, d, p);

    // Treat a failed add or sync as failure, matching the
    // "res || k == kh_end(d)" test used by the FILTER and INFO recovery.
    if (res || k == kh_end(d))
        return kh_end(d);

    return k;
}
3522
3523
38.0k
// Parse the FILTER column held in [p,q) and append it to str as a BCF
// integer vector of filter dictionary ids.  The text is modified in
// place: a trailing ';' is dropped and each separator is overwritten
// with NUL.  Filters missing from the header get a dummy ##FILTER line
// added (setting BCF_ERR_TAG_UNDEF in v->errcode).
// Returns 0 on success, -1 on failure.
static int vcf_parse_filter(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) {
    int i, n_flt = 1;
    char *r, *t;
    int32_t *a_flt = NULL;
    ks_tokaux_t aux1;
    khint_t k;
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
    // count the number of filters
    if (*(q-1) == ';') *(q-1) = 0;
    for (r = p; *r; ++r)
        if (*r == ';') ++n_flt;

    // n_flt is at least 1, so an array is always needed
    a_flt = malloc(n_flt * sizeof(*a_flt));
    if (!a_flt) {
        hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
        v->errcode |= BCF_ERR_LIMITS; // No appropriate code?
        return -1;
    }

    // add filters
    for (t = kstrtok(p, ";", &aux1), i = 0; t; t = kstrtok(0, 0, &aux1)) {
        *(char*)aux1.p = 0;
        k = kh_get(vdict, d, t);
        if (k == kh_end(d))
        {
            // Simple error recovery for FILTERs not defined in the header. It will not help when VCF header has
            // been already printed, but will enable tools like vcfcheck to proceed.
            hts_log_warning("FILTER '%s' is not defined in the header", t);
            kstring_t tmp = {0,0,0};
            int l, res = -1;
            // Only parse the line if it was actually formatted; on
            // failure tmp.s may be NULL and res stays -1.
            if (ksprintf(&tmp, "##FILTER=<ID=%s,Description=\"Dummy\">", t) >= 0) {
                bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
                res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
                if (res < 0) bcf_hrec_destroy(hrec);
                if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
            }
            free(tmp.s);
            k = kh_get(vdict, d, t);
            v->errcode |= BCF_ERR_TAG_UNDEF;
            if (res || k == kh_end(d)) {
                hts_log_error("Could not add dummy header for FILTER '%s' at %s:%"PRIhts_pos, t, bcf_seqname_safe(h,v), v->pos+1);
                v->errcode |= BCF_ERR_TAG_INVALID;
                free(a_flt);
                return -1;
            }
        }
        a_flt[i++] = kh_val(d, k).id;
    }

    bcf_enc_vint(str, n_flt, a_flt, -1);
    free(a_flt);

    return 0;
}
3577
3578
41.1k
// Parse the INFO column held in [p,q) and append each key[=value] entry
// to str in BCF encoding, counting entries in v->n_info.  The text is
// modified in place (separators are overwritten with NUL).  Keys missing
// from the header get a dummy "Type=String" ##INFO line added (setting
// BCF_ERR_TAG_UNDEF in v->errcode).
// Returns 0 on success, -1 on failure.
static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) {
    static int extreme_int_warned = 0, negative_rlen_warned = 0;
    int max_n_val = 0, overflow = 0;
    char *r, *key;
    khint_t k;
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
    int32_t *a_val = NULL;  // scratch array, grown on demand; also reused for floats

    v->n_info = 0;
    if (*(q-1) == ';') *(q-1) = 0;
    for (r = key = p;; ++r) {
        int c;
        char *val, *end;
        // Advance to the end of the key: '=', ';' or NUL
        while (*r > '=' || (*r != ';' && *r != '=' && *r != 0)) r++;
        if (v->n_info == UINT16_MAX) {
            hts_log_error("Too many INFO entries at %s:%"PRIhts_pos,
                          bcf_seqname_safe(h,v), v->pos+1);
            v->errcode |= BCF_ERR_LIMITS;
            goto fail;
        }
        val = end = NULL;
        c = *r; *r = 0;
        if (c == '=') {
            val = r + 1;

            for (end = val; *end != ';' && *end != 0; ++end);
            c = *end; *end = 0;
        } else end = r;
        if ( !*key ) { if (c==0) break; r = end; key = r + 1; continue; }  // faulty VCF, ";;" in the INFO
        k = kh_get(vdict, d, key);
        if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_INFO] == 15)
        {
            hts_log_warning("INFO '%s' is not defined in the header, assuming Type=String", key);
            kstring_t tmp = {0,0,0};
            int l, res = -1;
            // Only parse the line if it was actually formatted; on
            // failure tmp.s may be NULL and res stays -1.
            if (ksprintf(&tmp, "##INFO=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", key) >= 0) {
                bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
                res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
                if (res < 0) bcf_hrec_destroy(hrec);
                if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
            }
            free(tmp.s);
            k = kh_get(vdict, d, key);
            v->errcode |= BCF_ERR_TAG_UNDEF;
            if (res || k == kh_end(d)) {
                hts_log_error("Could not add dummy header for INFO '%s' at %s:%"PRIhts_pos, key, bcf_seqname_safe(h,v), v->pos+1);
                v->errcode |= BCF_ERR_TAG_INVALID;
                goto fail;
            }
        }
        uint32_t y = kh_val(d, k).info[BCF_HL_INFO];
        ++v->n_info;
        bcf_enc_int1(str, kh_val(d, k).id);
        if (val == 0) {
            bcf_enc_size(str, 0, BCF_BT_NULL);
        } else if ((y>>4&0xf) == BCF_HT_FLAG || (y>>4&0xf) == BCF_HT_STR) { // if Flag has a value, treat it as a string
            bcf_enc_vchar(str, end - val, val);
        } else { // int/float value/array
            int i, n_val;
            char *t, *te;
            for (t = val, n_val = 1; *t; ++t) // count the number of values
                if (*t == ',') ++n_val;
            // Check both int and float size in one step for simplicity
            if (n_val > max_n_val) {
                int32_t *a_tmp = (int32_t *)realloc(a_val, n_val * sizeof(*a_val));
                if (!a_tmp) {
                    hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
                    v->errcode |= BCF_ERR_LIMITS; // No appropriate code?
                    goto fail;
                }
                a_val = a_tmp;
                max_n_val = n_val;
            }
            if ((y>>4&0xf) == BCF_HT_INT) {
                i = 0, t = val;
                int64_t val1;
                int is_int64 = 0;
#ifdef VCF_ALLOW_INT64
                if ( n_val==1 )
                {
                    overflow = 0;
                    long long int tmp_val = hts_str2int(val, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
                    if ( te==val ) tmp_val = bcf_int32_missing;
                    else if ( overflow || tmp_val<BCF_MIN_BT_INT64 || tmp_val>BCF_MAX_BT_INT64 )
                    {
                        if ( !extreme_int_warned )
                        {
                            hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1);
                            extreme_int_warned = 1;
                        }
                        tmp_val = bcf_int32_missing;
                    }
                    else
                        is_int64 = 1;
                    val1 = tmp_val;
                    t = te;
                    i = 1;  // this is just to avoid adding another nested block...
                }
#endif
                for (; i < n_val; ++i, ++t)
                {
                    overflow = 0;
                    long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
                    if ( te==t ) tmp_val = bcf_int32_missing;
                    else if ( overflow || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 )
                    {
                        if ( !extreme_int_warned )
                        {
                            hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1);
                            extreme_int_warned = 1;
                        }
                        tmp_val = bcf_int32_missing;
                    }
                    a_val[i] = tmp_val;
                    // Skip any unparsed remainder up to the next ',' or NUL
                    for (t = te; *t && *t != ','; t++);
                }
                if (n_val == 1) {
#ifdef VCF_ALLOW_INT64
                    if ( is_int64 )
                    {
                        v->unpacked |= BCF_IS_64BIT;
                        bcf_enc_long1(str, val1);
                    }
                    else
                        bcf_enc_int1(str, (int32_t)val1);
#else
                    val1 = a_val[0];
                    bcf_enc_int1(str, (int32_t)val1);
#endif
                } else {
                    bcf_enc_vint(str, n_val, a_val, -1);
                }
                // 4-byte compare includes the NUL, so only an exact "END"
                // key matches
                if (n_val==1 && (val1!=bcf_int32_missing || is_int64)
                    && memcmp(key, "END", 4) == 0)
                {
                    if ( val1 <= v->pos )
                    {
                        if ( !negative_rlen_warned )
                        {
                            hts_log_warning("INFO/END=%"PRIhts_pos" is smaller than POS at %s:%"PRIhts_pos,val1,bcf_seqname_safe(h,v),v->pos+1);
                            negative_rlen_warned = 1;
                        }
                    }
                }
            } else if ((y>>4&0xf) == BCF_HT_REAL) {
                float *val_f = (float *)a_val;
                for (i = 0, t = val; i < n_val; ++i, ++t)
                {
                    overflow = 0;
                    val_f[i] = hts_str2dbl(t, &te, &overflow);
                    if ( te==t || overflow ) // conversion failed
                        bcf_float_set_missing(val_f[i]);
                    for (t = te; *t && *t != ','; t++);
                }
                bcf_enc_vfloat(str, n_val, val_f);
            }
        }
        if (c == 0) break;
        r = end;
        key = r + 1;
    }

    free(a_val);
    return 0;

 fail:
    free(a_val);
    return -1;
}
3746
3747
// Parse one line of VCF text held in s into the record v, using header h
// to resolve contig names (and, via the helper parsers, FILTER/INFO/FORMAT).
//
// The text in s is modified in place: each field is NUL-terminated as it is
// tokenised, and s is grown by four zeroed bytes so that short fixed-length
// memcmp() calls near the end of the line stay inside initialised memory.
//
// Parsing stops early (successfully) after QUAL, FILTER or INFO depending
// on v->max_unpack.
//
// Returns 0 on success, -2 on any failure (bad arguments, allocation
// failure, missing or malformed fields).  Some failures also set bits in
// v->errcode.
int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
{
    int ret = -2, overflow = 0;
    char *p, *q, *r, *t;
    kstring_t *str;
    khint_t k;
    ks_tokaux_t aux;

    // Non-zero when the NUL-terminated field at p is anything other than
    // a lone ".".  Compares two bytes, so the terminator is included.
#define NOT_DOT(p) (memcmp(p, ".\0", 2))

    if (s == NULL || h == NULL || v == NULL || s->s == NULL)
        return ret;

    // Assumed in lots of places, but we may as well spot this early
    assert(sizeof(float) == sizeof(int32_t));

    // Make room for a small over-read past the end of the line, e.g. the
    // memcmp(key, "END", 4) in vcf_parse_info.
    if (ks_resize(s, s->l+4) < 0)
        return -2;

    // Zero the padding so those over-reads never see uninitialised memory.
    memset(s->s + s->l, 0, 4);

    bcf_clear1(v);
    str = &v->shared;
    memset(&aux, 0, sizeof(ks_tokaux_t));

    // CHROM
    p = kstrtok(s->s, "\t", &aux);
    if (p == NULL)
        goto err;
    q = (char *)aux.p;
    *q = '\0';

    vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
    k = kh_get(vdict, d, p);
    if (k == kh_end(d)) {
        // Unknown contig: warn, flag the record, and try to add a dummy
        // header entry so parsing can continue.
        hts_log_warning("Contig '%s' is not defined in the header. (Quick workaround: index the file with tabix.)", p);
        v->errcode = BCF_ERR_CTG_UNDEF;
        k = fix_chromosome(h, d, p);
        if (k == kh_end(d)) {
            hts_log_error("Could not add dummy header for contig '%s'", p);
            v->errcode |= BCF_ERR_CTG_INVALID;
            goto err;
        }
    }
    v->rid = kh_val(d, k).id;

    // POS
    p = kstrtok(0, 0, &aux);
    if (p == NULL)
        goto err;
    q = (char *)aux.p;
    *q = '\0';

    char *tmp = p;   // kept for error messages; p is advanced by the parser
    overflow = 0;
    v->pos = hts_str2uint(p, &p, 62, &overflow);
    if (overflow) {
        hts_log_error("Position value '%s' is too large", tmp);
        goto err;
    }
    if (*p != '\0') {
        // Trailing characters after the number
        hts_log_error("Could not parse the position '%s'", tmp);
        goto err;
    }
    v->pos -= 1;     // stored as 0-based
    if (v->pos >= INT32_MAX)
        v->unpacked |= BCF_IS_64BIT;

    // ID
    p = kstrtok(0, 0, &aux);
    if (p == NULL)
        goto err;
    q = (char *)aux.p;
    *q = '\0';

    if (NOT_DOT(p))
        bcf_enc_vchar(str, q - p, p);
    else
        bcf_enc_size(str, 0, BCF_BT_CHAR);

    // REF
    p = kstrtok(0, 0, &aux);
    if (p == NULL)
        goto err;
    q = (char *)aux.p;
    *q = '\0';

    bcf_enc_vchar(str, q - p, p);
    v->n_allele = 1;
    v->rlen = q - p;

    // ALT
    p = kstrtok(0, 0, &aux);
    if (p == NULL)
        goto err;
    q = (char *)aux.p;
    *q = '\0';

    if (NOT_DOT(p)) {
        // Walk the field up to and including its terminator at q, emitting
        // one allele each time a ',' or NUL is met.
        r = t = p;
        for (;;) {
            if (*r == ',' || *r == '\0') {
                if (v->n_allele == UINT16_MAX) {
                    hts_log_error("Too many ALT alleles at %s:%"PRIhts_pos,
                                  bcf_seqname_safe(h,v), v->pos+1);
                    v->errcode |= BCF_ERR_LIMITS;
                    goto err;
                }
                bcf_enc_vchar(str, r - t, t);
                t = r + 1;
                ++v->n_allele;
            }
            if (r == q)
                break;
            ++r;
        }
    }

    // QUAL
    p = kstrtok(0, 0, &aux);
    if (p == NULL)
        goto err;
    q = (char *)aux.p;
    *q = '\0';

    if (NOT_DOT(p))
        v->qual = atof(p);
    else
        bcf_float_set_missing(v->qual);
    if ( v->max_unpack && !(v->max_unpack>>1) ) goto end; // BCF_UN_STR

    // FILTER
    p = kstrtok(0, 0, &aux);
    if (p == NULL)
        goto err;
    q = (char *)aux.p;
    *q = '\0';

    if (NOT_DOT(p)) {
        if (vcf_parse_filter(str, h, v, p, q))
            goto err;
    } else {
        bcf_enc_vint(str, 0, 0, -1);
    }
    if ( v->max_unpack && !(v->max_unpack>>2) ) goto end; // BCF_UN_FLT

    // INFO
    p = kstrtok(0, 0, &aux);
    if (p == NULL)
        goto err;
    q = (char *)aux.p;
    *q = '\0';

    if (NOT_DOT(p)) {
        if (vcf_parse_info(str, h, v, p, q))
            goto err;
    }
    if ( v->max_unpack && !(v->max_unpack>>3) ) goto end;

    // FORMAT; optional
    p = kstrtok(0, 0, &aux);
    if (p != NULL) {
        q = (char *)aux.p;
        *q = '\0';

        if (vcf_parse_format(s, h, v, p, q))
            goto err;
    }

 end:
    v->rlen = get_rlen(h, v);    //set rlen based on version
    ret = 0;

 err:
    return ret;
}
3913
3914
int vcf_open_mode(char *mode, const char *fn, const char *format)
3915
0
{
3916
0
    if (format == NULL) {
3917
        // Try to pick a format based on the filename extension
3918
0
        char extension[HTS_MAX_EXT_LEN];
3919
0
        if (find_file_extension(fn, extension) < 0) return -1;
3920
0
        return vcf_open_mode(mode, fn, extension);
3921
0
    }
3922
0
    else if (strcasecmp(format, "bcf") == 0) strcpy(mode, "b");
3923
0
    else if (strcasecmp(format, "vcf") == 0) strcpy(mode, "");
3924
0
    else if (strcasecmp(format, "vcf.gz") == 0 || strcasecmp(format, "vcf.bgz") == 0) strcpy(mode, "z");
3925
0
    else return -1;
3926
3927
0
    return 0;
3928
0
}
3929
3930
// Read the next text line from fp into fp->line and parse it into v.
// Returns the (negative) hts_getline() result if no line could be read,
// otherwise the result of vcf_parse1().
int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
{
    int status = hts_getline(fp, KS_SEP_LINE, &fp->line);
    if (status < 0)
        return status;

    return vcf_parse1(&fp->line, h, v);
}
3937
3938
// Decode the header of one FORMAT field starting at ptr and fill in fmt:
// key id, per-sample element count and type, per-sample byte size, a
// pointer to the values, and the offset/length of the value block.
// Returns the address just past the values for all n_sample samples.
static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt)
{
    uint8_t *const field_start = ptr;

    fmt->id   = bcf_dec_typed_int1(ptr, &ptr);
    fmt->n    = bcf_dec_size(ptr, &ptr, &fmt->type);
    fmt->size = fmt->n << bcf_type_shift[fmt->type];

    // ptr now addresses the first sample's values
    fmt->p      = ptr;
    fmt->p_off  = ptr - field_start;
    fmt->p_free = 0;

    uint8_t *const field_end = ptr + n_sample * fmt->size;
    fmt->p_len = field_end - fmt->p;
    return field_end;
}
3951
3952
// Decode one INFO field starting at ptr and fill in info: key, value count
// and type, pointer/offset/length of the value bytes.  When the field holds
// exactly one value it is also decoded into info->v1.
// Returns the address just past the field's values.
static inline uint8_t *bcf_unpack_info_core1(uint8_t *ptr, bcf_info_t *info)
{
    uint8_t *const field_start = ptr;
    int64_t n_bytes = 0;

    info->key = bcf_dec_typed_int1(ptr, &ptr);
    info->len = bcf_dec_size(ptr, &ptr, &info->type);
    n_bytes = info->len;

    info->vptr = ptr;
    info->vptr_off  = ptr - field_start;
    info->vptr_free = 0;
    info->v1.i = 0;

    if (info->len != 1) {
        // Vector (or empty) value: size comes from the type's shift
        n_bytes <<= bcf_type_shift[info->type];
    } else {
        // Scalar: cache the value in v1 and use the type's fixed width.
        // Types not listed leave v1 zeroed and the size at one byte.
        switch (info->type) {
        case BCF_BT_INT8:
        case BCF_BT_CHAR:
            info->v1.i = *(int8_t*)ptr;
            break;
        case BCF_BT_INT16:
            info->v1.i = le_to_i16(ptr);
            n_bytes = 2;
            break;
        case BCF_BT_INT32:
            info->v1.i = le_to_i32(ptr);
            n_bytes = 4;
            break;
        case BCF_BT_FLOAT:
            info->v1.f = le_to_float(ptr);
            n_bytes = 4;
            break;
        case BCF_BT_INT64:
            info->v1.i = le_to_i64(ptr);
            n_bytes = 8;
            break;
        }
    }

    uint8_t *const field_end = ptr + n_bytes;
    info->vptr_len = field_end - info->vptr;
    return field_end;
}
3993
3994
// Decode the requested parts of a packed BCF record into b->d.
//
// 'which' is a mask of BCF_UN_* flags.  Requesting FILTER implies the
// ID/REF/ALT strings, and requesting INFO implies BCF_UN_SHR.  Sections
// already marked in b->unpacked are not decoded again.  A record with an
// empty shared buffer is left untouched.  Always returns 0.
int bcf_unpack(bcf1_t *b, int which)
{
    if (b->shared.l == 0)
        return 0; // Building a new BCF record from scratch

    bcf_dec_t *dec = &b->d;
    uint8_t *cur = (uint8_t*)b->shared.s;
    uint8_t *section_start;
    int i;

    if (which & BCF_UN_FLT)  which |= BCF_UN_STR;
    if (which & BCF_UN_INFO) which |= BCF_UN_SHR;

    if ((which & BCF_UN_STR) && !(b->unpacked & BCF_UN_STR)) {
        // ID: format into the buffer owned by dec->id, then hand it back
        kstring_t buf = { .l = 0, .m = dec->m_id, .s = dec->id };
        section_start = cur;
        cur = bcf_fmt_sized_array(&buf, cur);
        b->unpack_size[0] = cur - section_start;
        kputc_('\0', &buf);
        dec->id = buf.s;
        dec->m_id = buf.m;

        // REF and ALT share one block (dec->als); dec->allele[] points into it
        hts_expand(char*, b->n_allele, dec->m_allele, dec->allele); // NM: hts_expand() is a macro
        buf.l = 0;
        buf.s = dec->als;
        buf.m = dec->m_als;
        section_start = cur;
        for (i = 0; i < b->n_allele; ++i) {
            // Store offsets for now, as the buffer may move when it grows
            dec->allele[i] = (char *)(intptr_t)buf.l;
            cur = bcf_fmt_sized_array(&buf, cur);
            kputc_('\0', &buf);
        }
        b->unpack_size[1] = cur - section_start;
        dec->als = buf.s;
        dec->m_als = buf.m;

        // The buffer is final now, so turn the offsets into pointers
        for (i = 0; i < b->n_allele; ++i)
            dec->allele[i] = dec->als + (ptrdiff_t)dec->allele[i];

        b->unpacked |= BCF_UN_STR;
    }

    if ((which & BCF_UN_FLT) && !(b->unpacked & BCF_UN_FLT)) { // FILTER
        cur = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1];
        section_start = cur;
        if (*cur >> 4) {
            int type;
            dec->n_flt = bcf_dec_size(cur, &cur, &type);
            hts_expand(int, dec->n_flt, dec->m_flt, dec->flt);
            for (i = 0; i < dec->n_flt; ++i)
                dec->flt[i] = bcf_dec_int1(cur, type, &cur);
        } else {
            // Zero-length vector: skip its single descriptor byte
            ++cur;
            dec->n_flt = 0;
        }
        b->unpack_size[2] = cur - section_start;
        b->unpacked |= BCF_UN_FLT;
    }

    if ((which & BCF_UN_INFO) && !(b->unpacked & BCF_UN_INFO)) { // INFO
        cur = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1] + b->unpack_size[2];
        hts_expand(bcf_info_t, b->n_info, dec->m_info, dec->info);
        for (i = 0; i < dec->m_info; ++i)
            dec->info[i].vptr_free = 0;
        for (i = 0; i < b->n_info; ++i)
            cur = bcf_unpack_info_core1(cur, &dec->info[i]);
        b->unpacked |= BCF_UN_INFO;
    }

    if ((which & BCF_UN_FMT) && b->n_sample && !(b->unpacked & BCF_UN_FMT)) { // FORMAT
        cur = (uint8_t*)b->indiv.s;
        hts_expand(bcf_fmt_t, b->n_fmt, dec->m_fmt, dec->fmt);
        for (i = 0; i < dec->m_fmt; ++i)
            dec->fmt[i].p_free = 0;
        for (i = 0; i < b->n_fmt; ++i)
            cur = bcf_unpack_fmt_core1(cur, b->n_sample, &dec->fmt[i]);
        b->unpacked |= BCF_UN_FMT;
    }

    return 0;
}
4063
4064
// Append the VCF text form of record v (including the trailing newline)
// to s, using header h to translate contig, FILTER, INFO and FORMAT ids
// into names.
//
// The ID/REF/ALT and FILTER sections of v are unpacked as a side effect
// (hence the const cast).  INFO and FORMAT are formatted straight from
// the packed buffers unless the caller has already unpacked them.
//
// Returns 0 on success, -1 on failure.  errno is set to EINVAL when the
// record refers to ids missing from the header or holds an unexpected
// type; allocation failures return -1 without setting errno here.
int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
{
    int i;
    int32_t max_dt_id = h->n[BCF_DT_ID];
    const char *chrom = bcf_seqname(h, v);
    if (!chrom) {
        hts_log_error("Invalid BCF, CONTIG id=%d not present in the header",
                      v->rid);
        errno = EINVAL;
        return -1;
    }

    bcf_unpack((bcf1_t*)v, BCF_UN_ALL & ~(BCF_UN_INFO|BCF_UN_FMT));

    // Cache of key lengths so we don't keep repeatedly using them.
    // This assumes we're not modifying the header between successive calls
    // to vcf_format, but that would lead to many other forms of breakage
    // so it feels like a valid assumption to make.
    //
    // We cannot just do this in bcf_hdr_sync as some code (eg bcftools
    // annotate) manipulates the headers directly without calling sync to
    // refresh the data structures.  So we must do just-in-time length
    // calculation during writes instead.
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
    if (!aux->key_len) {
        if (!(aux->key_len = calloc(h->n[BCF_DT_ID]+1, sizeof(*aux->key_len))))
            return -1;
    }
    size_t *key_len = aux->key_len;

    kputs(chrom, s); // CHROM
    kputc_('\t', s); kputll(v->pos + 1, s); // POS
    kputc_('\t', s); kputs(v->d.id ? v->d.id : ".", s); // ID
    kputc_('\t', s); // REF
    if (v->n_allele > 0) kputs(v->d.allele[0], s);
    else kputc_('.', s);
    kputc_('\t', s); // ALT
    if (v->n_allele > 1) {
        for (i = 1; i < v->n_allele; ++i) {
            if (i > 1) kputc_(',', s);
            kputs(v->d.allele[i], s);
        }
    } else kputc_('.', s);
    kputc_('\t', s); // QUAL
    if ( bcf_float_is_missing(v->qual) ) kputc_('.', s); // QUAL
    else kputd(v->qual, s);
    kputc_('\t', s); // FILTER
    if (v->d.n_flt) {
        for (i = 0; i < v->d.n_flt; ++i) {
            int32_t idx = v->d.flt[i];
            if (idx < 0 || idx >= max_dt_id
                || h->id[BCF_DT_ID][idx].key == NULL) {
                hts_log_error("Invalid BCF, the FILTER tag id=%d at %s:%"PRIhts_pos" not present in the header",
                              idx, bcf_seqname_safe(h, v), v->pos + 1);
                errno = EINVAL;
                return -1;
            }
            if (i) kputc_(';', s);
            if (!key_len[idx])
                key_len[idx] = strlen(h->id[BCF_DT_ID][idx].key);
            kputsn(h->id[BCF_DT_ID][idx].key, key_len[idx], s);
        }
    } else kputc_('.', s);

    kputc_('\t', s); // INFO
    if (v->n_info) {
        // Start of the packed INFO data: just past ID, REF/ALT and FILTER
        uint8_t *ptr = v->shared.s
            ? (uint8_t *)v->shared.s + v->unpack_size[0] +
               v->unpack_size[1] + v->unpack_size[2]
            : NULL;
        int first = 1;
        bcf_info_t *info = v->d.info;

        // Note if we duplicate this code into custom packed and unpacked
        // implementations then we gain a bit more speed, particularly with
        // clang 13 (up to 5%).  Not sure why this is, but code duplication
        // isn't pleasant and it's still faster adding packed support than
        // not so it's a win, just not as good as it should be.
        const int info_packed = !(v->unpacked & BCF_UN_INFO) && v->shared.l;
        for (i = 0; i < v->n_info; ++i) {
            bcf_info_t in, *z;
            if (info_packed) {
                // Use a local bcf_info_t when data is packed
                z = &in;
                z->key  = bcf_dec_typed_int1(ptr, &ptr);
                z->len  = bcf_dec_size(ptr, &ptr, &z->type);
                z->vptr = ptr;
                ptr += z->len << bcf_type_shift[z->type];
            } else {
                // Else previously unpacked INFO struct
                z = &info[i];

                // Also potentially since deleted
                if ( !z->vptr ) continue;
            }

            bcf_idpair_t *id = z->key >= 0 && z->key < max_dt_id
                ? &h->id[BCF_DT_ID][z->key]
                : NULL;

            if (!id || !id->key) {
                hts_log_error("Invalid BCF, the INFO tag id=%d is %s at %s:%"PRIhts_pos,
                              z->key,
                              z->key < 0 ? "negative"
                              : (z->key >= max_dt_id ? "too large" : "not present in the header"),
                              bcf_seqname_safe(h, v), v->pos+1);
                errno = EINVAL;
                return -1;
            }

            // KEY
            if (!key_len[z->key])
                key_len[z->key] = strlen(id->key);
            size_t id_len = key_len[z->key];
            // Room for an optional ';', the key, '=' and one spare byte
            if (ks_resize(s, s->l + 3 + id_len) < 0)
                return -1;
            char *sptr = s->s + s->l;
            if ( !first ) {
                *sptr++ = ';';
                s->l++;
            }
            first = 0;
            memcpy(sptr, id->key, id_len);
            s->l += id_len;

            // VALUE
            if (z->len <= 0) continue;   // flag-style key with no value
            sptr[id_len] = '=';
            s->l++;

            if (z->len != 1 || info_packed) {
                bcf_fmt_array(s, z->len, z->type, z->vptr);
            } else {
                // Single length vectors are unpacked into their
                // own info.v1 union and handled separately.
                if (z->type == BCF_BT_FLOAT) {
                    if ( bcf_float_is_missing(z->v1.f) )
                        kputc_('.', s);
                    else
                        kputd(z->v1.f, s);
                } else if (z->type == BCF_BT_CHAR) {
                    kputc_(z->v1.i, s);
                } else if (z->type < BCF_BT_INT64) {
                    int64_t missing[] = {
                        0, // BCF_BT_NULL
                        bcf_int8_missing,
                        bcf_int16_missing,
                        bcf_int32_missing,
                    };
                    if (z->v1.i == missing[z->type])
                        kputc_('.', s);
                    else
                        kputw(z->v1.i, s);
                } else if (z->type == BCF_BT_INT64) {
                    if (z->v1.i == bcf_int64_missing)
                        kputc_('.', s);
                    else
                        kputll(z->v1.i, s);
                } else {
                    hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, z->type, bcf_seqname_safe(h, v), v->pos+1);
                    errno = EINVAL;
                    return -1;
                }
            }
        }
        // Every INFO entry had been deleted
        if ( first ) kputc_('.', s);
    } else kputc_('.', s);

    // FORMAT and individual information
    if (v->n_sample) {
        int i,j;
        if ( v->n_fmt) {
            uint8_t *ptr = (uint8_t *)v->indiv.s;
            int gt_i = -1;   // index of the GT key within fmt[], if present
            bcf_fmt_t *fmt = v->d.fmt;
            int first = 1, ret = 0;
            int fmt_packed = !(v->unpacked & BCF_UN_FMT);

            if (fmt_packed) {
                // Local fmt as we have an array of num FORMAT keys,
                // each of which points to N.Sample values.

                // No real gain to be had in handling unpacked data here,
                // but it doesn't cost us much in complexity either and
                // it gives us flexibility.
                fmt = malloc(v->n_fmt * sizeof(*fmt));
                if (!fmt)
                    return -1;
            }

            // KEYS
            for (i = 0; i < (int)v->n_fmt; ++i) {
                bcf_fmt_t *z;
                z = &fmt[i];
                if (fmt_packed) {
                    z->id   = bcf_dec_typed_int1(ptr, &ptr);
                    z->n    = bcf_dec_size(ptr, &ptr, &z->type);
                    z->p    = ptr;
                    z->size = z->n << bcf_type_shift[z->type];
                    ptr += v->n_sample * z->size;
                }
                if ( !z->p ) continue;
                kputc_(!first ? ':' : '\t', s); first = 0;

                bcf_idpair_t *id = z->id >= 0 && z->id < max_dt_id
                    ? &h->id[BCF_DT_ID][z->id]
                    : NULL;

                if (!id || !id->key) {
                    hts_log_error("Invalid BCF, the FORMAT tag id=%d at %s:%"PRIhts_pos" not present in the header", z->id, bcf_seqname_safe(h, v), v->pos+1);
                    errno = EINVAL;
                    if (fmt_packed)
                        free(fmt);
                    return -1;
                }

                if (!key_len[z->id])
                    key_len[z->id] = strlen(id->key);
                size_t id_len = key_len[z->id];
                kputsn(id->key, id_len, s);
                if (id_len == 2 && id->key[0] == 'G' && id->key[1] == 'T')
                    gt_i = i;
            }
            if ( first ) kputsn("\t.", 2, s);

            // VALUES per sample
            for (j = 0; j < v->n_sample; ++j) {
                kputc_('\t', s);
                first = 1;
                bcf_fmt_t *f = fmt;
                for (i = 0; i < (int)v->n_fmt; i++, f++) {
                    if ( !f->p ) continue;
                    if (!first) kputc_(':', s);
                    first = 0;
                    if (gt_i == i) {
                        if ((ret = bcf_format_gt_v2(h, f,j,s)) < 0) {
                            // Report the sample index (j), not the FORMAT
                            // field index (i).
                            hts_log_error("Failed to format GT value for sample %d, returned %d", j, ret);
                            errno = EINVAL;
                            if (fmt_packed)
                                free(fmt);
                            return -1;
                        }
                        break;
                    }
                    else if (f->n == 1)
                        bcf_fmt_array1(s, f->type, f->p + j * (size_t)f->size);
                    else
                        bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size);
                }

                // Simpler loop post GT and at least 1 iteration
                for (i++, f++; i < (int)v->n_fmt; i++, f++) {
                    if ( !f->p ) continue;
                    kputc_(':', s);
                    if (f->n == 1)
                        bcf_fmt_array1(s, f->type, f->p + j * (size_t)f->size);
                    else
                        bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size);
                }
                if ( first ) kputc_('.', s);
            }
            if (fmt_packed)
                free(fmt);
        }
        else
            // No FORMAT keys: "." for the FORMAT column and for each sample
            for (j=0; j<=v->n_sample; j++)
                kputsn("\t.", 2, s);
    }
    kputc('\n', s);
    return 0;
}
4335
4336
// Write a pre-formatted VCF text line to fp, appending a newline to
// 'line' first if it does not already end with one (an empty line
// becomes a single newline).
//
// Uses bgzf_write() when fp is compressed, hwrite() otherwise.
// Returns 0 if the whole line was written, -1 on a short write, a write
// error, or failure to append the newline.
int vcf_write_line(htsFile *fp, kstring_t *line)
{
    ssize_t ret;

    // Check the length before looking at the last byte: with l == 0 the
    // index l-1 would wrap around and read outside the buffer.
    if ( line->l == 0 || line->s[line->l-1]!='\n' ) {
        if (kputc('\n',line) < 0)
            return -1;
    }

    if ( fp->format.compression!=no_compression )
        ret = bgzf_write(fp->fp.bgzf, line->s, line->l);
    else
        ret = hwrite(fp->fp.hfile, line->s, line->l);

    // Keep the full ssize_t result so long lines are not truncated to int
    return (ret >= 0 && (size_t)ret == line->l) ? 0 : -1;
}
4346
4347
// Format record v as VCF text into fp->line and write it to fp.
//
// For compressed output the BGZF stream is given the chance to flush
// before the write and, when an index is attached and the stream is not
// multi-threaded, the last index entry is amended to the current offset.
// When an index is attached to a BGZF file the record is also pushed to it.
//
// Returns 0 on success, -1 on formatting, flushing, indexing or write
// failure (including a short write).
int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
{
    ssize_t n_written;

    fp->line.l = 0;
    if (vcf_format1(h, v, &fp->line) != 0)
        return -1;

    if (fp->format.compression == no_compression) {
        n_written = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
    } else {
        if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
            return -1;
        if (fp->idx && !fp->fp.bgzf->mt)
            hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf));
        n_written = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
    }

    if (fp->idx && fp->format.compression == bgzf) {
        int tid = hts_idx_tbi_name(fp->idx, v->rid, bcf_seqname_safe(h, v));
        if (tid < 0)
            return -1;

        if (bgzf_idx_push(fp->fp.bgzf, fp->idx,
                          tid, v->pos, v->pos + v->rlen,
                          bgzf_tell(fp->fp.bgzf), 1) < 0)
            return -1;
    }

    if (n_written == fp->line.l)
        return 0;
    return -1;
}
4376
4377
/************************
4378
 * Data access routines *
4379
 ************************/
4380
4381
int bcf_hdr_id2int(const bcf_hdr_t *h, int which, const char *id)
4382
48.6k
{
4383
48.6k
    khint_t k;
4384
48.6k
    vdict_t *d = (vdict_t*)h->dict[which];
4385
48.6k
    k = kh_get(vdict, d, id);
4386
48.6k
    return k == kh_end(d)? -1 : kh_val(d, k).id;
4387
48.6k
}
4388
4389
4390
/********************
4391
 *** BCF indexing ***
4392
 ********************/
4393
4394
// Calculate number of index levels given min_shift and the header contig
4395
// list.  Also returns number of contigs in *nids_out.
4396
static int idx_calc_n_lvls_ids(const bcf_hdr_t *h, int min_shift,
4397
                               int starting_n_lvls, int *nids_out)
4398
0
{
4399
0
    int n_lvls, i, nids = 0;
4400
0
    int64_t max_len = 0, s;
4401
4402
0
    for (i = 0; i < h->n[BCF_DT_CTG]; ++i)
4403
0
    {
4404
0
        if ( !h->id[BCF_DT_CTG][i].val ) continue;
4405
0
        if ( max_len < h->id[BCF_DT_CTG][i].val->info[0] )
4406
0
            max_len = h->id[BCF_DT_CTG][i].val->info[0];
4407
0
        nids++;
4408
0
    }
4409
0
    if ( !max_len ) max_len = (1LL<<31) - 1;  // In case contig line is broken.
4410
0
    max_len += 256;
4411
0
    s = hts_bin_maxpos(min_shift, starting_n_lvls);
4412
0
    for (n_lvls = starting_n_lvls; max_len > s; ++n_lvls, s <<= 3);
4413
4414
0
    if (nids_out) *nids_out = nids;
4415
0
    return n_lvls;
4416
0
}
4417
4418
// Build a CSI index for the BCF file open on fp by reading the header and
// then every record, pushing each record's span and file offset.
// Returns the new index, or NULL on any failure (header read, allocation,
// index push, or a read error other than end of file).
hts_idx_t *bcf_index(htsFile *fp, int min_shift)
{
    bcf_hdr_t *hdr = bcf_hdr_read(fp);
    if (hdr == NULL)
        return NULL;

    bcf1_t *rec = NULL;
    hts_idx_t *idx = NULL;
    int nids = 0;
    int ok = 0;
    int r;

    int n_lvls = idx_calc_n_lvls_ids(hdr, min_shift, 0, &nids);
    idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
    if (idx == NULL)
        goto done;

    rec = bcf_init1();
    if (rec == NULL)
        goto done;

    while ((r = bcf_read1(fp, hdr, rec)) >= 0) {
        int pushed = hts_idx_push(idx, rec->rid, rec->pos, rec->pos + rec->rlen,
                                  bgzf_tell(fp->fp.bgzf), 1);
        if (pushed < 0)
            goto done;
    }
    // -1 is the normal end-of-file result; anything lower is an error
    if (r < -1)
        goto done;

    hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf));
    ok = 1;

 done:
    if (!ok) {
        hts_idx_destroy(idx);
        idx = NULL;
    }
    bcf_destroy1(rec);
    bcf_hdr_destroy(hdr);
    return idx;
}
4450
4451
// Load the index for fn, using the explicit index file name fnidx when
// one is given and bcf_index_load() otherwise.
hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx)
{
    if (fnidx)
        return hts_idx_load2(fn, fnidx);

    return bcf_index_load(fn);
}
4455
4456
hts_idx_t *bcf_index_load3(const char *fn, const char *fnidx, int flags)
4457
0
{
4458
0
    return hts_idx_load3(fn, fnidx, HTS_FMT_CSI, flags);
4459
0
}
4460
4461
int bcf_index_build3(const char *fn, const char *fnidx, int min_shift, int n_threads)
4462
0
{
4463
0
    htsFile *fp;
4464
0
    hts_idx_t *idx;
4465
0
    tbx_t *tbx;
4466
0
    int ret;
4467
0
    if ((fp = hts_open(fn, "rb")) == 0) return -2;
4468
0
    if (n_threads)
4469
0
        hts_set_threads(fp, n_threads);
4470
0
    if ( fp->format.compression!=bgzf ) { hts_close(fp); return -3; }
4471
0
    switch (fp->format.format) {
4472
0
        case bcf:
4473
0
            if (!min_shift) {
4474
0
                hts_log_error("TBI indices for BCF files are not supported");
4475
0
                ret = -1;
4476
0
            } else {
4477
0
                idx = bcf_index(fp, min_shift);
4478
0
                if (idx) {
4479
0
                    ret = hts_idx_save_as(idx, fn, fnidx, HTS_FMT_CSI);
4480
0
                    if (ret < 0) ret = -4;
4481
0
                    hts_idx_destroy(idx);
4482
0
                }
4483
0
                else ret = -1;
4484
0
            }
4485
0
            break;
4486
4487
0
        case vcf:
4488
0
            tbx = tbx_index(hts_get_bgzfp(fp), min_shift, &tbx_conf_vcf);
4489
0
            if (tbx) {
4490
0
                ret = hts_idx_save_as(tbx->idx, fn, fnidx, min_shift > 0 ? HTS_FMT_CSI : HTS_FMT_TBI);
4491
0
                if (ret < 0) ret = -4;
4492
0
                tbx_destroy(tbx);
4493
0
            }
4494
0
            else ret = -1;
4495
0
            break;
4496
4497
0
        default:
4498
0
            ret = -3;
4499
0
            break;
4500
0
    }
4501
0
    hts_close(fp);
4502
0
    return ret;
4503
0
}
4504
4505
// Same as bcf_index_build3() with no extra threads requested.
int bcf_index_build2(const char *fn, const char *fnidx, int min_shift)
{
    const int n_threads = 0;
    return bcf_index_build3(fn, fnidx, min_shift, n_threads);
}
4509
4510
// Same as bcf_index_build3() with no explicit index file name and no
// extra threads requested.
int bcf_index_build(const char *fn, int min_shift)
{
    const char *fnidx = NULL;
    const int n_threads = 0;
    return bcf_index_build3(fn, fnidx, min_shift, n_threads);
}
4514
4515
// Initialise fp->idx for the current format type.
4516
// This must be called after the header has been written but no other data.
4517
0
static int vcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) {
4518
0
    int n_lvls, fmt;
4519
4520
0
    if (min_shift == 0) {
4521
0
        min_shift = 14;
4522
0
        n_lvls = 5;
4523
0
        fmt = HTS_FMT_TBI;
4524
0
    } else {
4525
        // Set initial n_lvls to match tbx_index()
4526
0
        int starting_n_lvls = (TBX_MAX_SHIFT - min_shift + 2) / 3;
4527
        // Increase if necessary
4528
0
        n_lvls = idx_calc_n_lvls_ids(h, min_shift, starting_n_lvls, NULL);
4529
0
        fmt = HTS_FMT_CSI;
4530
0
    }
4531
4532
0
    fp->idx = hts_idx_init(0, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
4533
0
    if (!fp->idx) return -1;
4534
4535
    // Tabix meta data, added even in CSI for VCF
4536
0
    uint8_t conf[4*7];
4537
0
    u32_to_le(TBX_VCF, conf+0);  // fmt
4538
0
    u32_to_le(1,       conf+4);  // name col
4539
0
    u32_to_le(2,       conf+8);  // beg col
4540
0
    u32_to_le(0,       conf+12); // end col
4541
0
    u32_to_le('#',     conf+16); // comment
4542
0
    u32_to_le(0,       conf+20); // n.skip
4543
0
    u32_to_le(0,       conf+24); // ref name len
4544
0
    if (hts_idx_set_meta(fp->idx, sizeof(conf)*sizeof(*conf), (uint8_t *)conf, 1) < 0) {
4545
0
        hts_idx_destroy(fp->idx);
4546
0
        fp->idx = NULL;
4547
0
        return -1;
4548
0
    }
4549
0
    fp->fnidx = fnidx;
4550
4551
0
    return 0;
4552
0
}
4553
4554
// Initialise fp->idx for the current format type.
4555
// This must be called after the header has been written but no other data.
4556
0
int bcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) {
4557
0
    int n_lvls, nids = 0;
4558
4559
0
    if (fp->format.compression != bgzf) {
4560
0
        hts_log_error("Indexing is only supported on BGZF-compressed files");
4561
0
        return -3; // Matches no-compression return for bcf_index_build3()
4562
0
    }
4563
4564
0
    if (fp->format.format == vcf)
4565
0
        return vcf_idx_init(fp, h, min_shift, fnidx);
4566
4567
0
    if (!min_shift)
4568
0
        min_shift = 14;
4569
4570
0
    n_lvls = idx_calc_n_lvls_ids(h, min_shift, 0, &nids);
4571
4572
0
    fp->idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
4573
0
    if (!fp->idx) return -1;
4574
0
    fp->fnidx = fnidx;
4575
4576
0
    return 0;
4577
0
}
4578
4579
// Finishes an index. Call after the last record has been written.
4580
// Returns 0 on success, <0 on failure.
4581
//
4582
// NB: same format as SAM/BAM as it uses bgzf.
4583
0
int bcf_idx_save(htsFile *fp) {
4584
0
    return sam_idx_save(fp);
4585
0
}
4586
4587
/*****************
4588
 *** Utilities ***
4589
 *****************/
4590
4591
int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src)
4592
0
{
4593
0
    int i, ndst_ori = dst->nhrec, need_sync = 0, ret = 0, res;
4594
0
    for (i=0; i<src->nhrec; i++)
4595
0
    {
4596
0
        if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value )
4597
0
        {
4598
0
            int j;
4599
0
            for (j=0; j<ndst_ori; j++)
4600
0
            {
4601
0
                if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
4602
4603
                // Checking only the key part of generic lines, otherwise
4604
                // the VCFs are too verbose. Should we perhaps add a flag
4605
                // to bcf_hdr_combine() and make this optional?
4606
0
                if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break;
4607
0
            }
4608
0
            if ( j>=ndst_ori ) {
4609
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4610
0
                if (res < 0) return -1;
4611
0
                need_sync += res;
4612
0
            }
4613
0
        }
4614
0
        else if ( src->hrec[i]->type==BCF_HL_STR )
4615
0
        {
4616
            // NB: we are ignoring fields without ID
4617
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4618
0
            if ( j>=0 )
4619
0
            {
4620
0
                bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key);
4621
0
                if ( !rec ) {
4622
0
                    res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4623
0
                    if (res < 0) return -1;
4624
0
                    need_sync += res;
4625
0
                }
4626
0
            }
4627
0
        }
4628
0
        else
4629
0
        {
4630
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4631
0
            assert( j>=0 ); // this should always be true for valid VCFs
4632
4633
0
            bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL);
4634
0
            if ( !rec ) {
4635
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4636
0
                if (res < 0) return -1;
4637
0
                need_sync += res;
4638
0
            } else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT )
4639
0
            {
4640
                // Check that both records are of the same type. The bcf_hdr_id2length
4641
                // macro cannot be used here because dst header is not synced yet.
4642
0
                vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID];
4643
0
                vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID];
4644
0
                khint_t k_src  = kh_get(vdict, d_src, src->hrec[i]->vals[0]);
4645
0
                khint_t k_dst  = kh_get(vdict, d_dst, src->hrec[i]->vals[0]);
4646
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) )
4647
0
                {
4648
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths",
4649
0
                        src->hrec[i]->vals[0]);
4650
0
                    ret |= 1;
4651
0
                }
4652
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) )
4653
0
                {
4654
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different types",
4655
0
                        src->hrec[i]->vals[0]);
4656
0
                    ret |= 1;
4657
0
                }
4658
0
            }
4659
0
        }
4660
0
    }
4661
0
    if ( need_sync ) {
4662
0
        if (bcf_hdr_sync(dst) < 0) return -1;
4663
0
    }
4664
0
    return ret;
4665
0
}
4666
4667
bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src)
4668
0
{
4669
0
    if ( !dst )
4670
0
    {
4671
        // this will effectively strip existing IDX attributes from src to become dst
4672
0
        dst = bcf_hdr_init("r");
4673
0
        kstring_t htxt = {0,0,0};
4674
0
        if (bcf_hdr_format(src, 0, &htxt) < 0) {
4675
0
            free(htxt.s);
4676
0
            return NULL;
4677
0
        }
4678
0
        if ( bcf_hdr_parse(dst, htxt.s) < 0 ) {
4679
0
            bcf_hdr_destroy(dst);
4680
0
            dst = NULL;
4681
0
        }
4682
0
        free(htxt.s);
4683
0
        return dst;
4684
0
    }
4685
4686
0
    int i, ndst_ori = dst->nhrec, need_sync = 0, res;
4687
0
    for (i=0; i<src->nhrec; i++)
4688
0
    {
4689
0
        if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value )
4690
0
        {
4691
0
            int j;
4692
0
            for (j=0; j<ndst_ori; j++)
4693
0
            {
4694
0
                if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
4695
4696
                // Checking only the key part of generic lines, otherwise
4697
                // the VCFs are too verbose. Should we perhaps add a flag
4698
                // to bcf_hdr_combine() and make this optional?
4699
0
                if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break;
4700
0
            }
4701
0
            if ( j>=ndst_ori ) {
4702
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4703
0
                if (res < 0) return NULL;
4704
0
                need_sync += res;
4705
0
            }
4706
0
            else if ( !strcmp(src->hrec[i]->key,"fileformat") )
4707
0
            {
4708
0
                int ver_src = bcf_get_version(src,src->hrec[i]->value);
4709
0
                int ver_dst = bcf_get_version(dst,dst->hrec[j]->value);
4710
0
                if ( ver_src > ver_dst )
4711
0
                {
4712
0
                    if (bcf_hdr_set_version(dst,src->hrec[i]->value) < 0)
4713
0
                        return NULL;
4714
0
                    need_sync = 1;
4715
0
                }
4716
0
            }
4717
0
        }
4718
0
        else if ( src->hrec[i]->type==BCF_HL_STR )
4719
0
        {
4720
            // NB: we are ignoring fields without ID
4721
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4722
0
            if ( j>=0 )
4723
0
            {
4724
0
                bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key);
4725
0
                if ( !rec ) {
4726
0
                    res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4727
0
                    if (res < 0) return NULL;
4728
0
                    need_sync += res;
4729
0
                }
4730
0
            }
4731
0
        }
4732
0
        else
4733
0
        {
4734
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4735
0
            assert( j>=0 ); // this should always be true for valid VCFs
4736
4737
0
            bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL);
4738
0
            if ( !rec ) {
4739
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4740
0
                if (res < 0) return NULL;
4741
0
                need_sync += res;
4742
0
            } else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT )
4743
0
            {
4744
                // Check that both records are of the same type. The bcf_hdr_id2length
4745
                // macro cannot be used here because dst header is not synced yet.
4746
0
                vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID];
4747
0
                vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID];
4748
0
                khint_t k_src  = kh_get(vdict, d_src, src->hrec[i]->vals[0]);
4749
0
                khint_t k_dst  = kh_get(vdict, d_dst, src->hrec[i]->vals[0]);
4750
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) )
4751
0
                {
4752
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths",
4753
0
                        src->hrec[i]->vals[0]);
4754
0
                }
4755
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) )
4756
0
                {
4757
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different types",
4758
0
                        src->hrec[i]->vals[0]);
4759
0
                }
4760
0
            }
4761
0
        }
4762
0
    }
4763
0
    if ( need_sync ) {
4764
0
        if (bcf_hdr_sync(dst) < 0) return NULL;
4765
0
    }
4766
0
    return dst;
4767
0
}
4768
4769
/*
 * Rewrite the contig, FILTER, INFO and FORMAT ids of `line` (read with
 * src_hdr) so that they refer to the same names in dst_hdr.
 *
 * On the first call for a given src_hdr the translation tables
 * src_hdr->transl[] are built.  src_hdr->ntransl is set to -1 when no id
 * differs between the headers, in which case this and later calls return
 * without touching the record.
 *
 * Ids which have no counterpart in dst_hdr (translation < 0) are left
 * unchanged.  A key is rewritten in place when its encoded width stays the
 * same; otherwise the field is copied to a new buffer and the record is
 * marked dirty.
 *
 * Returns 0 on success and -1 if memory could not be allocated; in the
 * latter case the record may have been partly translated.  Exits the
 * program if line->errcode is set.
 */
int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *line)
{
    int i;
    if ( line->errcode )
    {
        char errordescription[1024] = "";
        hts_log_error("Unchecked error (%d %s) at %s:%"PRIhts_pos", exiting", line->errcode, bcf_strerror(line->errcode, errordescription, sizeof(errordescription)),  bcf_seqname_safe(src_hdr,line), line->pos+1);
        exit(1);
    }
    if ( src_hdr->ntransl==-1 ) return 0;    // no need to translate, all tags have the same id
    if ( !src_hdr->ntransl )  // called for the first time, see what needs translating
    {
        int dict;
        for (dict=0; dict<2; dict++)    // BCF_DT_ID and BCF_DT_CTG
        {
            // Ask for at least one element so that a NULL result always
            // means failure, even for an empty dictionary
            size_t n_ids = src_hdr->n[dict] > 0 ? (size_t) src_hdr->n[dict] : 1;
            src_hdr->transl[dict] = (int*) malloc(n_ids*sizeof(int));
            if ( !src_hdr->transl[dict] )
            {
                // Undo the partial set-up so a later call starts afresh
                if ( dict ) { free(src_hdr->transl[0]); src_hdr->transl[0] = NULL; }
                src_hdr->ntransl = 0;
                return -1;
            }
            for (i=0; i<src_hdr->n[dict]; i++)
            {
                if ( !src_hdr->id[dict][i].key ) // gap left after removed BCF header lines
                {
                    src_hdr->transl[dict][i] = -1;
                    continue;
                }
                src_hdr->transl[dict][i] = bcf_hdr_id2int(dst_hdr,dict,src_hdr->id[dict][i].key);
                if ( src_hdr->transl[dict][i]!=-1 && i!=src_hdr->transl[dict][i] ) src_hdr->ntransl++;
            }
        }
        if ( !src_hdr->ntransl )
        {
            free(src_hdr->transl[0]); src_hdr->transl[0] = NULL;
            free(src_hdr->transl[1]); src_hdr->transl[1] = NULL;
            src_hdr->ntransl = -1;
        }
        if ( src_hdr->ntransl==-1 ) return 0;
    }
    bcf_unpack(line,BCF_UN_ALL);

    // CHROM
    if ( src_hdr->transl[BCF_DT_CTG][line->rid] >=0 ) line->rid = src_hdr->transl[BCF_DT_CTG][line->rid];

    // FILTER
    for (i=0; i<line->d.n_flt; i++)
    {
        int src_id = line->d.flt[i];
        if ( src_hdr->transl[BCF_DT_ID][src_id] >=0 )
            line->d.flt[i] = src_hdr->transl[BCF_DT_ID][src_id];
        line->d.shared_dirty |= BCF1_DIRTY_FLT;
    }

    // INFO
    for (i=0; i<line->n_info; i++)
    {
        int src_id = line->d.info[i].key;
        int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
        if ( dst_id<0 ) continue;
        line->d.info[i].key = dst_id;
        if ( !line->d.info[i].vptr ) continue;  // skip deleted
        int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
        int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
        if ( src_size==dst_size )   // can overwrite
        {
            // vptr points at the descriptor byte of the encoded key; the key
            // value follows it.  Written the same way as for FORMAT below,
            // so the descriptor byte is preserved and the store does not
            // depend on alignment or host byte order.
            uint8_t *vptr = line->d.info[i].vptr - line->d.info[i].vptr_off;
            if ( dst_size==BCF_BT_INT8 ) { vptr[1] = (uint8_t)dst_id; }
            else if ( dst_size==BCF_BT_INT16 ) { i16_to_le(dst_id, vptr + 1); }
            else { i32_to_le(dst_id, vptr + 1); }
        }
        else    // must realloc
        {
            bcf_info_t *info = &line->d.info[i];
            kstring_t str = {0,0,0};
            int failed = 0;
            failed |= bcf_enc_int1(&str, dst_id) < 0;
            failed |= bcf_enc_size(&str, info->len,info->type) < 0;
            uint32_t vptr_off = str.l;
            failed |= kputsn((char*)info->vptr, info->vptr_len, &str) < 0;
            if ( failed ) { free(str.s); return -1; }
            if( info->vptr_free ) free(info->vptr - info->vptr_off);
            info->vptr_off = vptr_off;
            info->vptr = (uint8_t*)str.s + info->vptr_off;
            info->vptr_free = 1;
            line->d.shared_dirty |= BCF1_DIRTY_INF;
        }
    }

    // FORMAT
    for (i=0; i<line->n_fmt; i++)
    {
        int src_id = line->d.fmt[i].id;
        int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
        if ( dst_id<0 ) continue;
        line->d.fmt[i].id = dst_id;
        if( !line->d.fmt[i].p ) continue;  // skip deleted
        int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
        int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
        if ( src_size==dst_size )   // can overwrite
        {
            uint8_t *p = line->d.fmt[i].p - line->d.fmt[i].p_off;    // pointer to the vector size (4bits) and BT type (4bits)
            if ( dst_size==BCF_BT_INT8 ) { p[1] = dst_id; }
            else if ( dst_size==BCF_BT_INT16 ) { i16_to_le(dst_id, p + 1); }
            else { i32_to_le(dst_id, p + 1); }
        }
        else    // must realloc
        {
            bcf_fmt_t *fmt = &line->d.fmt[i];
            kstring_t str = {0,0,0};
            int failed = 0;
            failed |= bcf_enc_int1(&str, dst_id) < 0;
            failed |= bcf_enc_size(&str, fmt->n, fmt->type) < 0;
            uint32_t p_off = str.l;
            failed |= kputsn((char*)fmt->p, fmt->p_len, &str) < 0;
            if ( failed ) { free(str.s); return -1; }
            if( fmt->p_free ) free(fmt->p - fmt->p_off);
            fmt->p_off = p_off;
            fmt->p = (uint8_t*)str.s + fmt->p_off;
            fmt->p_free = 1;
            line->d.indiv_dirty = 1;
        }
    }
    return 0;
}
4885
4886
bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr)
4887
0
{
4888
0
    bcf_hdr_t *hout = bcf_hdr_init("r");
4889
0
    if (!hout) {
4890
0
        hts_log_error("Failed to allocate bcf header");
4891
0
        return NULL;
4892
0
    }
4893
0
    kstring_t htxt = {0,0,0};
4894
0
    if (bcf_hdr_format(hdr, 1, &htxt) < 0) {
4895
0
        free(htxt.s);
4896
0
        return NULL;
4897
0
    }
4898
0
    if ( bcf_hdr_parse(hout, htxt.s) < 0 ) {
4899
0
        bcf_hdr_destroy(hout);
4900
0
        hout = NULL;
4901
0
    }
4902
0
    free(htxt.s);
4903
0
    return hout;
4904
0
}
4905
4906
bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap)
4907
0
{
4908
0
    void *names_hash = khash_str2int_init();
4909
0
    kstring_t htxt = {0,0,0};
4910
0
    kstring_t str = {0,0,0};
4911
0
    bcf_hdr_t *h = bcf_hdr_init("w");
4912
0
    int r = 0;
4913
0
    if (!h || !names_hash) {
4914
0
        hts_log_error("Failed to allocate bcf header");
4915
0
        goto err;
4916
0
    }
4917
0
    if (bcf_hdr_format(h0, 1, &htxt) < 0) {
4918
0
        hts_log_error("Failed to get header text");
4919
0
        goto err;
4920
0
    }
4921
0
    bcf_hdr_set_version(h,bcf_hdr_get_version(h0));
4922
0
    int j;
4923
0
    for (j=0; j<n; j++) imap[j] = -1;
4924
0
    if ( bcf_hdr_nsamples(h0) > 0) {
4925
0
        char *p = find_chrom_header_line(htxt.s);
4926
0
        int i = 0, end = n? 8 : 7;
4927
0
        while ((p = strchr(p, '\t')) != 0 && i < end) ++i, ++p;
4928
0
        if (i != end) {
4929
0
            hts_log_error("Wrong number of columns in header #CHROM line");
4930
0
            goto err;
4931
0
        }
4932
0
        r |= kputsn(htxt.s, p - htxt.s, &str) < 0;
4933
0
        for (i = 0; i < n; ++i) {
4934
0
            if ( khash_str2int_has_key(names_hash,samples[i]) )
4935
0
            {
4936
0
                hts_log_error("Duplicate sample name \"%s\"", samples[i]);
4937
0
                goto err;
4938
0
            }
4939
0
            imap[i] = bcf_hdr_id2int(h0, BCF_DT_SAMPLE, samples[i]);
4940
0
            if (imap[i] < 0) continue;
4941
0
            r |= kputc('\t', &str) < 0;
4942
0
            r |= kputs(samples[i], &str) < 0;
4943
0
            r |= khash_str2int_inc(names_hash,samples[i]) < 0;
4944
0
        }
4945
0
    } else r |= kputsn(htxt.s, htxt.l, &str) < 0;
4946
0
    while (str.l && (!str.s[str.l-1] || str.s[str.l-1]=='\n') ) str.l--; // kill trailing zeros and newlines
4947
0
    r |= kputc('\n',&str) < 0;
4948
0
    if (r) {
4949
0
        hts_log_error("%s", strerror(errno));
4950
0
        goto err;
4951
0
    }
4952
0
    if ( bcf_hdr_parse(h, str.s) < 0 ) {
4953
0
        bcf_hdr_destroy(h);
4954
0
        h = NULL;
4955
0
    }
4956
0
    free(str.s);
4957
0
    free(htxt.s);
4958
0
    khash_str2int_destroy(names_hash);
4959
0
    return h;
4960
4961
0
 err:
4962
0
    ks_free(&str);
4963
0
    ks_free(&htxt);
4964
0
    khash_str2int_destroy(names_hash);
4965
0
    bcf_hdr_destroy(h);
4966
0
    return NULL;
4967
0
}
4968
4969
/*
 * Restrict the header to a subset of its samples.
 *
 *   samples  "-" keeps every sample (nothing is changed); NULL removes
 *            them all; otherwise a list for hts_readlist().  A leading '^'
 *            turns the list into samples to drop instead of samples to keep.
 *   is_file  passed on to hts_readlist()
 *
 * hdr->keep_samples gets one bit per original sample and hdr->nsamples_ori
 * the original sample count.  The sample dictionary and hdr->samples are
 * rebuilt to hold the retained samples only, and the header is synced.  If
 * nothing is retained from a list, keep_samples is freed again and the
 * dictionary is left alone.
 *
 * Returns 0 on success, -1 on failure, or the 1-based position of the first
 * listed sample that the header does not have.
 */
int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file)
{
    if ( samples && strcmp("-",samples)==0 ) return 0;          // keep all samples

    int i, narr = bit_array_size(bcf_hdr_nsamples(hdr));
    hdr->keep_samples = (uint8_t*) calloc(narr,1);
    if (!hdr->keep_samples) return -1;

    hdr->nsamples_ori = bcf_hdr_nsamples(hdr);
    if ( !samples )
    {
        // exclude all samples
        vdict_t *old_dict = (vdict_t*)hdr->dict[BCF_DT_SAMPLE];
        vdict_t *empty_dict = kh_init(vdict);
        khint_t it;
        if (!empty_dict) return -1;

        bcf_hdr_nsamples(hdr) = 0;

        for (it = kh_begin(old_dict); it != kh_end(old_dict); ++it) {
            if (kh_exist(old_dict, it)) free((char*)kh_key(old_dict, it));
        }
        kh_destroy(vdict, old_dict);
        hdr->dict[BCF_DT_SAMPLE] = empty_dict;

        return bcf_hdr_sync(hdr) < 0 ? -1 : 0;
    }

    const int exclude = (samples[0]=='^');
    if ( exclude ) {
        // Start with everything kept; the listed samples get cleared below
        for (i=0; i<bcf_hdr_nsamples(hdr); i++) bit_array_set(hdr->keep_samples,i);
    }

    int idx, n, ret = 0;
    char **smpls = hts_readlist(exclude ? samples+1 : samples, is_file, &n);
    if ( !smpls ) return -1;
    for (i=0; i<n; i++)
    {
        idx = bcf_hdr_id2int(hdr,BCF_DT_SAMPLE,smpls[i]);
        if ( idx<0 )
        {
            // Remember the first unknown name only
            if ( ret==0 ) ret = i+1;
            continue;
        }
        assert( idx<bcf_hdr_nsamples(hdr) );
        if ( exclude ) {
            bit_array_clear(hdr->keep_samples, idx);
        } else {
            bit_array_set(hdr->keep_samples, idx);
        }
    }
    for (i=0; i<n; i++) free(smpls[i]);
    free(smpls);

    // The sample count becomes the number of bits set
    int n_kept = 0;
    for (i=0; i<hdr->nsamples_ori; i++) {
        if ( bit_array_test(hdr->keep_samples,i) ) n_kept++;
    }
    bcf_hdr_nsamples(hdr) = n_kept;

    if ( !bcf_hdr_nsamples(hdr) ) {
        free(hdr->keep_samples);
        hdr->keep_samples = NULL;
        return ret;
    }

    // Make new list and dictionary with desired samples
    char **kept = (char**) malloc(sizeof(char*)*bcf_hdr_nsamples(hdr));
    if (!kept) return -1;

    vdict_t *new_dict = kh_init(vdict);
    if (!new_dict) {
        free(kept);
        return -1;
    }

    idx = 0;
    for (i=0; i<hdr->nsamples_ori; i++) {
        int res;
        khint_t slot;
        if ( !bit_array_test(hdr->keep_samples,i) ) continue;

        kept[idx] = hdr->samples[i];
        slot = kh_put(vdict, new_dict, hdr->samples[i], &res);
        if (res < 0) {
            free(kept);
            kh_destroy(vdict, new_dict);
            return -1;
        }
        kh_val(new_dict, slot) = bcf_idinfo_def;
        kh_val(new_dict, slot).id = idx;
        idx++;
    }

    // Delete desired samples from old dictionary, so we don't free them
    vdict_t *old_dict = (vdict_t*)hdr->dict[BCF_DT_SAMPLE];
    for (i=0; i < idx; i++) {
        khint_t found = kh_get(vdict, old_dict, kept[i]);
        if (found < kh_end(old_dict)) kh_del(vdict, old_dict, found);
    }

    // Free everything else
    khint_t it;
    for (it = kh_begin(old_dict); it != kh_end(old_dict); ++it) {
        if (kh_exist(old_dict, it)) free((char*)kh_key(old_dict, it));
    }
    kh_destroy(vdict, old_dict);
    hdr->dict[BCF_DT_SAMPLE] = new_dict;

    free(hdr->samples);
    hdr->samples = kept;

    if (bcf_hdr_sync(hdr) < 0)
        return -1;

    return ret;
}
5076
5077
/*
 * Rewrite the per-sample (FORMAT) data of a record so that it holds only
 * selected samples.
 *
 *   h     not used
 *   v     record to modify; v->indiv is replaced by the new data
 *   n     number of entries in imap; 0 removes all samples
 *   imap  for each output sample the index of the sample to copy from the
 *         record; entries < 0 are skipped
 *
 * v->n_sample becomes the number of entries of imap that are >= 0, and
 * v->n_fmt is set to 0 if no samples remain.  BCF_UN_FMT is cleared from
 * v->unpacked.
 *
 * Returns 0 on success.  Returns -1 if the new data could not be built, in
 * which case the record is left unchanged.
 */
int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap)
{
    kstring_t ind = {0,0,0};
    int n_kept = 0;
    if (n) {
        bcf_fmt_t fmt[MAX_N_FMT];
        int i, j, failed = 0;
        uint8_t *ptr = (uint8_t*)v->indiv.s;
        for (i = 0; i < v->n_fmt; ++i)
            ptr = bcf_unpack_fmt_core1(ptr, v->n_sample, &fmt[i]);
        for (i = 0; i < (int)v->n_fmt; ++i) {
            bcf_fmt_t *f = &fmt[i];
            failed |= bcf_enc_int1(&ind, f->id) < 0;
            failed |= bcf_enc_size(&ind, f->n, f->type) < 0;
            for (j = 0; j < n; ++j)
                if (imap[j] >= 0)
                    failed |= kputsn((char*)(f->p + imap[j] * f->size), f->size, &ind) < 0;
        }
        if (failed) {
            // Do not replace the record's data with a truncated buffer
            free(ind.s);
            return -1;
        }
        for (j = 0; j < n; ++j) if (imap[j] >= 0) ++n_kept;
    }
    v->n_sample = n_kept;
    if ( !v->n_sample ) v->n_fmt = 0;
    free(v->indiv.s);
    v->indiv = ind;
    v->unpacked &= ~BCF_UN_FMT;    // only BCF is ready for output, VCF will need to unpack again
    return 0;
}
5103
5104
/*
 * Test whether every allele of the record is a single character other than
 * '*', or one of the symbolic alleles <X> and <*>.
 *
 * The record is unpacked up to BCF_UN_STR first.
 * Returns 1 if all alleles pass (also when there are no alleles), 0 otherwise.
 */
int bcf_is_snp(bcf1_t *v)
{
    int idx;
    bcf_unpack(v, BCF_UN_STR);
    for (idx = 0; idx < v->n_allele; ++idx)
    {
        const char *al = v->d.allele[idx];

        // A single base (the overlapping-deletion allele '*' does not count)
        if ( al[1]==0 && al[0]!='*' ) continue;

        // mpileup's <X> allele, and <*>.  This is not completely
        // satisfactory, a general library is here narrowly tailored to fit
        // samtools.
        if ( al[0]=='<' && (al[1]=='X' || al[1]=='*') && al[2]=='>' ) continue;

        return 0;
    }
    return 1;
}
5121
5122
/*
 * Classify one ALT allele against the REF allele.
 *
 *   ref  the REF allele
 *   alt  the ALT allele
 *   var  receives the result:
 *          type  VCF_REF, VCF_SNP, VCF_MNP, VCF_OTHER, VCF_BND, VCF_OVERLAP,
 *                VCF_INDEL|VCF_INS or VCF_INDEL|VCF_DEL
 *          n     1 for SNPs; for indels and MNPs a length which is positive
 *                for insertions and negative for deletions; 0 in all other
 *                cases
 *
 * Bases are compared without regard to case.
 */
static void bcf_set_variant_type(const char *ref, const char *alt, bcf_variant_t *var)
{
    // var may point into freshly (re)allocated memory, so give n a defined
    // value for the paths below which only set the type.
    var->n = 0;

    if ( *alt == '*' && !alt[1] ) { var->type = VCF_OVERLAP; return; }  // overlapping variant

    // The most frequent case
    if ( !ref[1] && !alt[1] )
    {
        if ( *alt == '.' || *ref==*alt ) { var->type = VCF_REF; return; }
        if ( *alt == 'X' ) { var->type = VCF_REF; return; }  // mpileup's X allele shouldn't be treated as variant
        var->n = 1; var->type = VCF_SNP; return;
    }
    if ( alt[0]=='<' )
    {
        if ( alt[1]=='X' && alt[2]=='>' ) { var->type = VCF_REF; return; }  // mpileup's X allele shouldn't be treated as variant
        if ( alt[1]=='*' && alt[2]=='>' ) { var->type = VCF_REF; return; }
        if ( !strcmp("NON_REF>",alt+1) ) { var->type = VCF_REF; return; }
        var->type = VCF_OTHER;
        return;
    }

    // Catch "joined before" breakend case
    if ( alt[0]==']' || alt[0] == '[' )
    {
        var->type = VCF_BND; return;
    }

    // Iterate through alt characters that match the reference
    const char *r = ref, *a = alt;
    while (*r && *a && toupper_c(*r)==toupper_c(*a) ) { r++; a++; }     // unfortunately, matching REF,ALT case is not guaranteed

    if ( *a && !*r )
    {
        // REF is a prefix of ALT
        while ( *a ) a++;
        if ( *(a-1)==']' || *(a-1)=='[' ) { var->type = VCF_BND; return; } // "joined after" breakend
        var->n = (a-alt)-(r-ref); var->type = VCF_INDEL | VCF_INS; return;
    }
    else if ( *r && !*a )
    {
        // ALT is a prefix of REF
        while ( *r ) r++;
        var->n = (a-alt)-(r-ref); var->type = VCF_INDEL | VCF_DEL; return;
    }
    else if ( !*r && !*a )
    {
        // Same sequence
        var->type = VCF_REF; return;
    }

    // First mismatch is at r / a; now trim the matching suffix
    const char *re = r, *ae = a;
    while ( re[1] ) re++;
    while ( ae[1] ) ae++;
    if ( ae[0]==']' || ae[0]=='[' ) { var->type = VCF_BND; return; }    // "joined after" breakend
    while ( re>r && ae>a && toupper_c(*re)==toupper_c(*ae) ) { re--; ae--; }
    if ( ae==a )
    {
        if ( re==r ) { var->n = 1; var->type = VCF_SNP; return; }
        var->n = -(re-r);
        if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL | VCF_DEL; return; }
        var->type = VCF_OTHER; return;
    }
    else if ( re==r )
    {
        var->n = ae-a;
        if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL | VCF_INS; return; }
        var->type = VCF_OTHER; return;
    }

    var->type = ( re-r == ae-a ) ? VCF_MNP : VCF_OTHER;
    var->n = ( re-r > ae-a ) ? -(re-r+1) : ae-a+1;

    // should do also complex events, SVs, etc...
}
5192
5193
/*
 * Work out the variant type of every allele of a record.
 *
 * The record is unpacked up to BCF_UN_STR if necessary.  b->d.var is grown
 * to hold one entry per allele; entry 0 (REF) is always VCF_REF with length
 * 0, and the others are filled in by bcf_set_variant_type().  b->d.var_type
 * is set to the bitwise OR of the types of all ALT alleles.
 *
 * Returns 0 on success, -1 if memory could not be allocated.
 */
static int bcf_set_variant_types(bcf1_t *b)
{
    if ( !(b->unpacked & BCF_UN_STR) ) bcf_unpack(b, BCF_UN_STR);
    bcf_dec_t *d = &b->d;

    // Entry 0 is written unconditionally below, so there has to be room for
    // at least one entry even if the record has no alleles.
    int n_needed = b->n_allele > 0 ? b->n_allele : 1;
    if ( d->n_var < n_needed )
    {
        bcf_variant_t *new_var = realloc(d->var, sizeof(bcf_variant_t)*n_needed);
        if (!new_var)
            return -1;
        d->var = new_var;
        d->n_var = n_needed;
    }
    int i;
    b->d.var_type = 0;
    d->var[0].type = VCF_REF;
    d->var[0].n    = 0;
    for (i=1; i<b->n_allele; i++)
    {
        bcf_set_variant_type(d->allele[0],d->allele[i], &d->var[i]);
        b->d.var_type |= d->var[i].type;
    }
    return 0;
}
5217
5218
// bcf_get_variant_type/bcf_get_variant_types should only return the following,
5219
// to be compatible with callers that are not expecting newer values
5220
// like VCF_INS, VCF_DEL.  The full set is available from the newer
5221
// vcf_has_variant_type* interfaces.
5222
0
#define ORIG_VAR_TYPES (VCF_SNP|VCF_MNP|VCF_INDEL|VCF_OTHER|VCF_BND|VCF_OVERLAP)
5223
int bcf_get_variant_types(bcf1_t *rec)
5224
0
{
5225
0
    if ( rec->d.var_type==-1 ) {
5226
0
        if (bcf_set_variant_types(rec) != 0) {
5227
0
            hts_log_error("Couldn't get variant types: %s", strerror(errno));
5228
0
            exit(1); // Due to legacy API having no way to report failures
5229
0
        }
5230
0
    }
5231
0
    return rec->d.var_type & ORIG_VAR_TYPES;
5232
0
}
5233
5234
int bcf_get_variant_type(bcf1_t *rec, int ith_allele)
5235
0
{
5236
0
    if ( rec->d.var_type==-1 ) {
5237
0
        if (bcf_set_variant_types(rec) != 0) {
5238
0
            hts_log_error("Couldn't get variant types: %s", strerror(errno));
5239
0
            exit(1); // Due to legacy API having no way to report failures
5240
0
        }
5241
0
    }
5242
0
    if (ith_allele < 0 || ith_allele >= rec->n_allele) {
5243
0
        hts_log_error("Requested allele outside valid range");
5244
0
        exit(1);
5245
0
    }
5246
0
    return rec->d.var[ith_allele].type & ORIG_VAR_TYPES;
5247
0
}
5248
#undef ORIG_VAR_TYPES
5249
5250
/*
 * Test the variant type of one allele against a bitmask.
 *
 * The types are computed first if rec->d.var_type is -1.
 *
 * Returns -1 if the types could not be computed or ith_allele is out of
 * range.  For bitmask == VCF_REF returns 1 if the allele's type is VCF_REF
 * and 0 otherwise.  For any other bitmask returns the bits of the allele's
 * type which are in the bitmask.
 */
int bcf_has_variant_type(bcf1_t *rec, int ith_allele, uint32_t bitmask)
{
    if ( rec->d.var_type == -1 && bcf_set_variant_types(rec) != 0 )
        return -1;

    if ( !(ith_allele >= 0 && ith_allele < rec->n_allele) )
        return -1;

    const bcf_variant_t *var = &rec->d.var[ith_allele];

    // VCF_REF is 0, so handled as a special case
    if ( bitmask == VCF_REF )
        return var->type == VCF_REF;

    return bitmask & var->type;
}
5261
5262
/*
 * Return the length stored for allele number ith_allele (the `n` field of
 * its bcf_variant_t entry).
 *
 * The types are computed first if rec->d.var_type is -1.  Returns
 * bcf_int32_missing if that fails or if ith_allele is out of range.
 */
int bcf_variant_length(bcf1_t *rec, int ith_allele)
{
    if ( rec->d.var_type == -1 && bcf_set_variant_types(rec) != 0 )
        return bcf_int32_missing;

    if ( !(ith_allele >= 0 && ith_allele < rec->n_allele) )
        return bcf_int32_missing;

    return rec->d.var[ith_allele].n;
}
5270
5271
/*
 * Compare the record-level variant types against `bitmask` using `mode`.
 *
 *   bcf_match_overlap  returns the bits common to both
 *   bcf_match_subset   returns 0 if the record has a type not in bitmask,
 *                      otherwise the common bits
 *   otherwise (exact)  returns the record types if they equal bitmask, else
 *                      0; for bitmask == VCF_REF returns 1 or 0 instead
 *
 * Returns -1 if the variant types cannot be computed.
 */
int bcf_has_variant_types(bcf1_t *rec, uint32_t bitmask,
                          enum bcf_variant_match mode)
{
    if (rec->d.var_type == -1 && bcf_set_variant_types(rec) != 0)
        return -1;

    uint32_t rec_types = rec->d.var_type;
    if (mode == bcf_match_overlap)
        return bitmask & rec_types;

    // VCF_INDEL is always set with VCF_INS and VCF_DEL by
    // bcf_set_variant_type[s], but the caller may ask for only one side of
    // that pair (say `VCF_INS`, or `VCF_INDEL`).  Drop the side that was not
    // requested so that it cannot spoil the subset / exact comparison.
    const int asks_ins_del = (bitmask & (VCF_INS|VCF_DEL)) != 0;
    const int asks_indel   = (bitmask & VCF_INDEL) != 0;
    if (asks_ins_del && !asks_indel)
        rec_types &= ~VCF_INDEL;
    else if (asks_indel && !asks_ins_del)
        rec_types &= ~(VCF_INS|VCF_DEL);

    if (mode == bcf_match_subset)
        return (~bitmask & rec_types) ? 0 : (bitmask & rec_types);

    // mode == bcf_match_exact
    if (rec_types != bitmask)
        return 0;
    return bitmask == VCF_REF ? 1 : rec_types;
}
5294
5295
/*
 * Add, replace or remove an INFO tag of a record.
 *
 *   hdr     header; `key` must be defined in it as an INFO field
 *   line    record to modify (its INFO block is unpacked on demand)
 *   key     INFO tag name
 *   values  new values, read as int32_t, float, char or int64_t data
 *           depending on `type`; may be NULL
 *   n       number of values; 0 removes the tag (as does a NULL `values`
 *           with BCF_HT_STR)
 *   type    one of the BCF_HT_* constants
 *
 * Returns 0 on success, -1 on error: the tag is not defined in the header,
 * END was given a wrong value count or type, or the new values could not be
 * encoded.  An unsupported `type` aborts the process.
 *
 * Updates of END and SVLEN also recompute line->rlen.
 */
int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
{
    static int negative_rlen_warned = 0;
    int is_end_tag, is_svlen_tag = 0;

    // Is the field already present?
    int i, inf_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,inf_id) ) return -1;    // No such INFO field in the header
    if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);

    is_end_tag = strcmp(key, "END") == 0;
    is_svlen_tag = strcmp(key, "SVLEN") == 0;

    for (i=0; i<line->n_info; i++)
        if ( inf_id==line->d.info[i].key ) break;
    bcf_info_t *inf = i==line->n_info ? NULL : &line->d.info[i];

    if ( !n || (type==BCF_HT_STR && !values) )
    {
        if ( inf )
        {
            // Mark the tag for removal, free existing memory if necessary
            if ( inf->vptr_free )
            {
                free(inf->vptr - inf->vptr_off);
                inf->vptr_free = 0;
            }
            line->d.shared_dirty |= BCF1_DIRTY_INF;
            inf->vptr = NULL;
            inf->vptr_off = inf->vptr_len = 0;
        }
        if ( n==0 && (is_end_tag || is_svlen_tag) ) {
            line->rlen = get_rlen(hdr, line);
        }
        return 0;
    }

    if (is_end_tag)
    {
        if (n != 1)
        {
            hts_log_error("END info tag should only have one value at %s:%"PRIhts_pos, bcf_seqname_safe(hdr,line), line->pos+1);
            line->errcode |= BCF_ERR_TAG_INVALID;
            return -1;
        }
        if (type != BCF_HT_INT && type != BCF_HT_LONG)
        {
            hts_log_error("Wrong type (%d) for END info tag at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
            line->errcode |= BCF_ERR_TAG_INVALID;
            return -1;
        }
    }

    // Encode the values and determine the size required to accommodate the values
    kstring_t str = {0,0,0};
    int enc_ret = bcf_enc_int1(&str, inf_id);
    if ( enc_ret < 0 )
    {
        free(str.s);
        return -1;
    }
    if ( type==BCF_HT_INT )
        enc_ret = bcf_enc_vint(&str, n, (int32_t*)values, -1);
    else if ( type==BCF_HT_REAL )
        enc_ret = bcf_enc_vfloat(&str, n, (float*)values);
    else if ( type==BCF_HT_FLAG || type==BCF_HT_STR )
    {
        if ( values==NULL )
            enc_ret = bcf_enc_size(&str, 0, BCF_BT_NULL);
        else
            enc_ret = bcf_enc_vchar(&str, strlen((char*)values), (char*)values);
    }
#ifdef VCF_ALLOW_INT64
    else if ( type==BCF_HT_LONG )
    {
        if (n != 1) {
            hts_log_error("Only storing a single BCF_HT_LONG value is supported at %s:%"PRIhts_pos, bcf_seqname_safe(hdr,line), line->pos+1);
            abort();
        }
        enc_ret = bcf_enc_long1(&str, *(int64_t *) values);
    }
#endif
    else
    {
        hts_log_error("The type %d not implemented yet at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
        abort();
    }

    // The encoded block is parsed below, so it must be complete.  Nothing in
    // the record has been modified yet, which makes bailing out here safe.
    if ( enc_ret < 0 )
    {
        free(str.s);
        return -1;
    }

    // Is the INFO tag already present
    if ( inf )
    {
        // Is it big enough to accommodate new block?
        if ( inf->vptr && str.l <= inf->vptr_len + inf->vptr_off )
        {
            if ( str.l != inf->vptr_len + inf->vptr_off ) line->d.shared_dirty |= BCF1_DIRTY_INF;
            uint8_t *ptr = inf->vptr - inf->vptr_off;
            memcpy(ptr, str.s, str.l);
            free(str.s);
            // Re-parsing must not change who owns the reused block
            int vptr_free = inf->vptr_free;
            bcf_unpack_info_core1(ptr, inf);
            inf->vptr_free = vptr_free;
        }
        else
        {
            if ( inf->vptr_free )
                free(inf->vptr - inf->vptr_off);
            bcf_unpack_info_core1((uint8_t*)str.s, inf);
            inf->vptr_free = 1;
            line->d.shared_dirty |= BCF1_DIRTY_INF;
        }
    }
    else
    {
        // The tag is not present, create new one
        line->n_info++;
        hts_expand0(bcf_info_t, line->n_info, line->d.m_info , line->d.info);
        inf = &line->d.info[line->n_info-1];
        bcf_unpack_info_core1((uint8_t*)str.s, inf);
        inf->vptr_free = 1;
        line->d.shared_dirty |= BCF1_DIRTY_INF;
    }
    line->unpacked |= BCF_UN_INFO;

    if ( n==1 && is_end_tag) {
        hts_pos_t end = type == BCF_HT_INT ? *(int32_t *) values : *(int64_t *) values;
        if ( (type == BCF_HT_INT && end!=bcf_int32_missing) || (type == BCF_HT_LONG && end!=bcf_int64_missing) )
        {
            if ( end <= line->pos )
            {
                // Warn only once per process
                if ( !negative_rlen_warned )
                {
                    hts_log_warning("INFO/END=%"PRIhts_pos" is smaller than POS at %s:%"PRIhts_pos,end,bcf_seqname_safe(hdr,line),line->pos+1);
                    negative_rlen_warned = 1;
                }
            }
        }
    }
    if (is_svlen_tag || is_end_tag) {
        line->rlen = get_rlen(hdr, line);
    }
    return 0;
}
5432
5433
/*
 * Store one string per entry of `values` in FORMAT tag `key`.
 *
 * The strings are packed into a single buffer of `n` fixed-width fields,
 * each as wide as the longest string and padded with NUL bytes, which is
 * then handed to bcf_update_format() as BCF_HT_STR data.  n == 0 is
 * forwarded to bcf_update_format() with no data.
 *
 * Returns -2 if the packed buffer cannot be allocated or its size does not
 * fit in an int; otherwise the result of bcf_update_format().
 */
int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n)
{
    if ( !n )
        return bcf_update_format(hdr,line,key,NULL,0,BCF_HT_STR);

    int i;
    size_t max_len = 0;
    for (i=0; i<n; i++)
    {
        size_t len = strlen(values[i]);
        if ( len > max_len ) max_len = len;
    }

    // bcf_update_format() takes the total size as an int, so make sure the
    // product is representable before computing it.
    size_t total = 0;
    if ( n > 0 )
    {
        if ( max_len > (size_t) INT_MAX / (size_t) n )
        {
            hts_log_error("FORMAT/%s strings too long to fit in a BCF record", key);
            return -2;
        }
        total = max_len * (size_t) n;
    }

    // malloc(0) may legitimately return NULL; always request at least one
    // byte so that a NULL result really means out of memory.
    char *out = (char*) malloc(total ? total : 1);
    if ( !out ) return -2;
    for (i=0; i<n; i++)
    {
        size_t len = strlen(values[i]);
        char *dst = out + (size_t) i * max_len;
        memcpy(dst, values[i], len);
        memset(dst + len, 0, max_len - len);
    }
    int ret = bcf_update_format(hdr,line,key,out,(int) total,BCF_HT_STR);
    free(out);
    return ret;
}
5458
5459
/*
 * Add, replace or remove a FORMAT tag of a record.
 *
 *   hdr     header; `key` must be defined in it as a FORMAT field
 *   line    record to modify (its FORMAT block is unpacked on demand)
 *   key     FORMAT tag name
 *   values  new values for all samples, read as int32_t, float or char data
 *           depending on `type`; must not be NULL when n != 0 (asserted)
 *   n       total number of values; 0 removes the tag, otherwise it must be
 *           a non-zero multiple of the header's sample count (asserted)
 *   type    BCF_HT_INT, BCF_HT_REAL or BCF_HT_STR
 *
 * Returns 0 on success (including removal of a tag unknown to the header)
 * and -1 on error: the tag is not defined in the header, the header has no
 * samples, or the new values could not be encoded.  An unsupported `type`
 * aborts the process.
 *
 * Updates of the LEN tag also recompute line->rlen.
 */
int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
{
    // Is the field already present?
    int i, fmt_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
    int is_len = 0;
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,fmt_id) )
    {
        if ( !n ) return 0;
        return -1;  // the key not present in the header
    }

    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);

    for (i=0; i<line->n_fmt; i++)
        if ( line->d.fmt[i].id==fmt_id ) break;
    bcf_fmt_t *fmt = i==line->n_fmt ? NULL : &line->d.fmt[i];

    is_len = strcmp(key, "LEN") == 0;
    if ( !n )
    {
        if ( fmt )
        {
            // Mark the tag for removal, free existing memory if necessary
            if ( fmt->p_free )
            {
                free(fmt->p - fmt->p_off);
                fmt->p_free = 0;
            }
            line->d.indiv_dirty = 1;
            fmt->p = NULL;
        }
        if (is_len) {
            line->rlen = get_rlen(hdr, line);
        }
        return 0;
    }

    line->n_sample = bcf_hdr_nsamples(hdr);
    if ( !line->n_sample )
    {
        // Without samples the per-sample count below would divide by zero
        hts_log_error("Cannot set FORMAT/%s, the header has no samples at %s:%"PRIhts_pos, key, bcf_seqname_safe(hdr,line), line->pos+1);
        return -1;
    }
    int nps = n / line->n_sample;  // number of values per sample
    assert( nps && nps*line->n_sample==n );     // must be divisible by n_sample

    // Encode the values and determine the size required to accommodate the values
    kstring_t str = {0,0,0};
    int enc_ret = bcf_enc_int1(&str, fmt_id);
    if ( enc_ret < 0 )
    {
        free(str.s);
        return -1;
    }
    assert(values != NULL);
    if ( type==BCF_HT_INT )
        enc_ret = bcf_enc_vint(&str, n, (int32_t*)values, nps);
    else if ( type==BCF_HT_REAL )
    {
        enc_ret = bcf_enc_size(&str, nps, BCF_BT_FLOAT);
        if ( enc_ret >= 0 )
            enc_ret = serialize_float_array(&str, nps*line->n_sample, (float *) values);
    }
    else if ( type==BCF_HT_STR )
    {
        enc_ret = bcf_enc_size(&str, nps, BCF_BT_CHAR);
        if ( enc_ret >= 0 && kputsn((char*)values, nps*line->n_sample, &str) < 0 )
            enc_ret = -1;
    }
    else
    {
        hts_log_error("The type %d not implemented yet at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
        abort();
    }

    // The encoded block is parsed below, so it must be complete.  Only
    // line->n_sample has been touched so far.
    if ( enc_ret < 0 )
    {
        free(str.s);
        return -1;
    }

    if ( !fmt )
    {
        // Not present, new format field
        line->n_fmt++;
        hts_expand0(bcf_fmt_t, line->n_fmt, line->d.m_fmt, line->d.fmt);

        // Special case: VCF specification requires that GT is always first
        if ( line->n_fmt > 1 && key[0]=='G' && key[1]=='T' && !key[2] )
        {
            for (i=line->n_fmt-1; i>0; i--)
                line->d.fmt[i] = line->d.fmt[i-1];
            fmt = &line->d.fmt[0];
        }
        else
            fmt = &line->d.fmt[line->n_fmt-1];
        bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
        line->d.indiv_dirty = 1;
        fmt->p_free = 1;
    }
    else
    {
        // The tag is already present, check if it is big enough to accommodate the new block
        if ( fmt->p && str.l <= fmt->p_len + fmt->p_off )
        {
            // good, the block is big enough
            if ( str.l != fmt->p_len + fmt->p_off ) line->d.indiv_dirty = 1;
            uint8_t *ptr = fmt->p - fmt->p_off;
            memcpy(ptr, str.s, str.l);
            free(str.s);
            // Re-parsing must not change who owns the reused block
            int p_free = fmt->p_free;
            bcf_unpack_fmt_core1(ptr, line->n_sample, fmt);
            fmt->p_free = p_free;
        }
        else
        {
            if ( fmt->p_free )
                free(fmt->p - fmt->p_off);
            bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
            fmt->p_free = 1;
            line->d.indiv_dirty = 1;
        }
    }
    line->unpacked |= BCF_UN_FMT;

    if (is_len) {
        line->rlen = get_rlen(hdr, line);
    }
    return 0;
}
5571
5572
5573
/*
 * Replace the record's FILTER list with the `n` ids in `flt_ids`.
 * n == 0 leaves the record with an empty filter list.  Always returns 0.
 */
int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n)
{
    if ((line->unpacked & BCF_UN_FLT) == 0)
        bcf_unpack(line, BCF_UN_FLT);

    line->d.shared_dirty |= BCF1_DIRTY_FLT;
    line->d.n_flt = n;
    if (n == 0)
        return 0;

    hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);

    int k = 0;
    while (k < n) {
        line->d.flt[k] = flt_ids[k];
        k++;
    }
    return 0;
}
5585
5586
/*
 * Add filter `flt_id` to the record.
 *
 * Id 0 is PASS: adding it replaces the whole list, and adding any other
 * filter to a list that holds only PASS replaces that PASS entry.
 *
 * Returns 0 if the filter was already set, 1 if the list was changed.
 */
int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id)
{
    int k;

    if ((line->unpacked & BCF_UN_FLT) == 0)
        bcf_unpack(line, BCF_UN_FLT);

    for (k = 0; k < line->d.n_flt; k++) {
        if (line->d.flt[k] == flt_id)
            return 0;    // this filter is already set
    }

    line->d.shared_dirty |= BCF1_DIRTY_FLT;

    // Setting PASS, or overriding a lone PASS, leaves exactly one entry;
    // anything else appends to the list.
    const int single_entry = (flt_id == 0)
                          || (line->d.n_flt == 1 && line->d.flt[0] == 0);
    if (single_entry)
        line->d.n_flt = 1;
    else
        line->d.n_flt++;

    hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);
    line->d.flt[line->d.n_flt - 1] = flt_id;
    return 1;
}
5604
/*
 * Remove filter `flt_id` from the record, if present.  When the list ends
 * up empty and `pass` is non-zero, filter id 0 is added in its place via
 * bcf_add_filter().  Always returns 0.
 */
int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int pass)
{
    if ((line->unpacked & BCF_UN_FLT) == 0)
        bcf_unpack(line, BCF_UN_FLT);

    int pos = 0;
    while (pos < line->d.n_flt && line->d.flt[pos] != flt_id)
        pos++;
    if (pos == line->d.n_flt)
        return 0;   // the filter is not present

    line->d.shared_dirty |= BCF1_DIRTY_FLT;

    // Close the gap unless the removed entry was the last one
    const int n_after = line->d.n_flt - pos - 1;
    if (n_after != 0)
        memmove(line->d.flt + pos, line->d.flt + pos + 1, n_after * sizeof(*line->d.flt));
    line->d.n_flt--;

    if (line->d.n_flt == 0 && pass)
        bcf_add_filter(hdr, line, 0);
    return 0;
}
5617
5618
/*
 * Test whether the record carries the named filter.  "." is treated as
 * "PASS", and a record with an empty filter list matches filter id 0.
 *
 * Returns 1 if the filter is set, 0 if it is not, and -1 if the filter is
 * not defined in the header.
 */
int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter)
{
    if (strcmp(filter, ".") == 0)
        filter = "PASS";

    const int flt_id = bcf_hdr_id2int(hdr, BCF_DT_ID, filter);
    if (!bcf_hdr_idinfo_exists(hdr, BCF_HL_FLT, flt_id))
        return -1;  // not defined in the header

    if ((line->unpacked & BCF_UN_FLT) == 0)
        bcf_unpack(line, BCF_UN_FLT);

    if (flt_id == 0 && line->d.n_flt == 0)
        return 1; // PASS

    int k = 0;
    while (k < line->d.n_flt) {
        if (line->d.flt[k] == flt_id)
            return 1;
        k++;
    }
    return 0;
}
5632
5633
/*
 * Rebuild line->d.allele[] so that it indexes the `nals` consecutive
 * NUL-terminated strings stored in line->d.als, then refresh the fields
 * that depend on the alleles: n_allele, the cached variant type (reset to
 * -1) and rlen.  Marks the allele block dirty.  Always returns 0.
 */
static inline int _bcf1_sync_alleles(const bcf_hdr_t *hdr, bcf1_t *line, int nals)
{
    line->d.shared_dirty |= BCF1_DIRTY_ALS;
    line->d.var_type = -1;

    line->n_allele = nals;
    hts_expand(char*, line->n_allele, line->d.m_allele, line->d.allele);

    char *cursor = line->d.als;
    int k;
    for (k = 0; k < nals; k++) {
        line->d.allele[k] = cursor;
        cursor += strlen(cursor) + 1;   // step over the string and its NUL
    }

    // Update REF length. Note that END is 1-based while line->pos 0-based
    line->rlen = get_rlen(hdr, line);

    return 0;
}
5655
/*
 * Replace the record's alleles with the `nals` strings in `alleles`.
 *
 * Returns -1 if the combined alleles exceed INT_MAX bytes or memory cannot
 * be allocated; otherwise the result of _bcf1_sync_alleles().
 */
int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals)
{
    if ((line->unpacked & BCF_UN_STR) == 0)
        bcf_unpack(line, BCF_UN_STR);

    // The pointers in alleles may point into the existing line->d.als
    // memory, so they must not be clobbered while updating.  Short allele
    // lists are staged in a stack buffer first; whatever does not fit there
    // (or in the existing allocation) goes into a newly allocated buffer.
    // Either way, pointers into line->d.als held in alleles may no longer
    // be valid once this function returns.
    char staging[256];
    char *old_als = NULL;
    size_t staged = 0;
    size_t limit = sizeof(staging);
    int idx;

    if (line->d.m_als < limit)
        limit = line->d.m_als;

    for (idx = 0; idx < nals; idx++) {
        size_t len = strlen(alleles[idx]) + 1;
        if (limit - staged < len)
            break;
        memcpy(staging + staged, alleles[idx], len);
        staged += len;
    }

    // Anything left over means a fresh destination buffer is required
    if (idx < nals) {
        size_t total = staged;
        int rest;
        char *fresh;

        for (rest = idx; rest < nals; rest++)
            total += strlen(alleles[rest]) + 1;
        if (total < line->d.m_als) // Don't shrink the buffer
            total = line->d.m_als;
        if (total > INT_MAX) {
            hts_log_error("REF + alleles too long to fit in a BCF record");
            return -1;
        }
        fresh = malloc(total);
        if (!fresh)
            return -1;
        // Keep the old buffer alive until the remaining alleles, which may
        // still point into it, have been copied.
        old_als = line->d.als;
        line->d.als = fresh;
        line->d.m_als = total;
    }

    // Move the staged alleles to their destination
    if (staged) {
        assert(staged <= line->d.m_als);
        memcpy(line->d.als, staging, staged);
    }

    // Append the alleles that were not staged; this only happens when
    // writing to the newly allocated buffer.
    while (idx < nals) {
        size_t len = strlen(alleles[idx]) + 1;
        memcpy(line->d.als + staged, alleles[idx], len);
        staged += len;
        idx++;
    }

    free(old_als);
    return _bcf1_sync_alleles(hdr,line,nals);
}
5718
5719
/*
 * Replace the record's alleles with those listed in the comma-separated
 * string `alleles_string`.
 *
 * Returns -1 if the string cannot be copied into the record (the existing
 * alleles are then left untouched); otherwise the result of
 * _bcf1_sync_alleles().
 */
int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string)
{
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
    kstring_t tmp;
    tmp.l = 0; tmp.s = line->d.als; tmp.m = line->d.m_als;
    int ret = kputs(alleles_string, &tmp);
    // Store the buffer back first: it stays valid whether or not kputs
    // managed to grow it.
    line->d.als = tmp.s; line->d.m_als = tmp.m;
    if ( ret < 0 ) return -1;

    // Split in place: every ',' becomes a terminator
    int nals = 1;
    char *t = line->d.als;
    while (*t)
    {
        if ( *t==',' ) { *t = 0; nals++; }
        t++;
    }
    return _bcf1_sync_alleles(hdr, line, nals);
}
5736
5737
/*
 * Set the record's ID to `id`, or to "." when `id` is NULL.
 *
 * Returns 0 on success, -1 if the new ID could not be stored (the previous
 * ID is then left untouched).
 */
int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
{
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
    kstring_t tmp;
    tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id;
    int ret = kputs(id ? id : ".", &tmp);
    // Store the buffer back first: it stays valid whether or not kputs
    // managed to grow it.
    line->d.id = tmp.s; line->d.m_id = tmp.m;
    if ( ret < 0 ) return -1;
    line->d.shared_dirty |= BCF1_DIRTY_ID;
    return 0;
}
5750
5751
/*
 * Append `id` to the record's semicolon-separated ID list unless it is
 * already one of its entries.  An ID of "." is replaced rather than
 * appended to.  A NULL `id` is a no-op.
 *
 * Returns 0 on success (including when nothing had to be done), -1 if the
 * ID could not be stored; the previous ID is then kept.
 */
int bcf_add_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
{
    if ( !id ) return 0;
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);

    kstring_t tmp;
    tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id;

    // Look for `id` as a complete entry, i.e. delimited by the string
    // boundaries or ';' on both sides.  Advance one character per failed
    // candidate so the scan can never step over the terminating NUL.
    size_t len = strlen(id);
    char *dst = line->d.id;
    while ( dst && *dst && (dst=strstr(dst,id)) )
    {
        int ends_entry   = dst[len]==0 || dst[len]==';';
        int starts_entry = dst==line->d.id || dst[-1]==';';
        if ( ends_entry && starts_entry ) return 0;   // already present
        dst++;  // only a prefix or a suffix of another entry
    }

    int ret = 0;
    size_t old_len = 0;
    if ( line->d.id && (line->d.id[0]!='.' || line->d.id[1]) )
    {
        old_len = strlen(line->d.id);
        tmp.l = old_len;
        ret = kputc(';',&tmp);
    }
    if ( ret >= 0 ) ret = kputs(id,&tmp);

    // The buffer may have moved even if a later append failed
    line->d.id = tmp.s; line->d.m_id = tmp.m;
    if ( ret < 0 )
    {
        // Drop a separator that was appended before the failure
        if ( tmp.s && tmp.l > old_len ) tmp.s[old_len] = 0;
        return -1;
    }
    line->d.shared_dirty |= BCF1_DIRTY_ID;
    return 0;
}
5779
5780
/*
 * Look up FORMAT tag `key` in the record.  Returns NULL when the header
 * does not define such a FORMAT field; otherwise the result of
 * bcf_get_fmt_id().
 */
bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
{
    const int fmt_id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);

    if (bcf_hdr_idinfo_exists(hdr, BCF_HL_FMT, fmt_id))
        return bcf_get_fmt_id(line, fmt_id);

    return NULL;   // no such FMT field in the header
}
5786
5787
/*
 * Look up INFO tag `key` in the record.  Returns NULL when the header does
 * not define such an INFO field; otherwise the result of
 * bcf_get_info_id().
 */
bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
{
    const int info_id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);

    if (bcf_hdr_idinfo_exists(hdr, BCF_HL_INFO, info_id))
        return bcf_get_info_id(line, info_id);

    return NULL;   // no such INFO field in the header
}
5793
5794
/*
 * Return the record's FORMAT entry whose id equals `id`, or NULL if the
 * record has none.  The FORMAT block is unpacked on demand.
 */
bcf_fmt_t *bcf_get_fmt_id(bcf1_t *line, const int id)
{
    if ((line->unpacked & BCF_UN_FMT) == 0)
        bcf_unpack(line, BCF_UN_FMT);

    int k = 0;
    while (k < line->n_fmt) {
        bcf_fmt_t *entry = &line->d.fmt[k];
        if (entry->id == id)
            return entry;
        k++;
    }
    return NULL;
}
5804
5805
/*
 * Return the record's INFO entry whose key equals `id`, or NULL if the
 * record has none.  The INFO block is unpacked on demand.
 */
bcf_info_t *bcf_get_info_id(bcf1_t *line, const int id)
{
    if ((line->unpacked & BCF_UN_INFO) == 0)
        bcf_unpack(line, BCF_UN_INFO);

    int k = 0;
    while (k < line->n_info) {
        bcf_info_t *entry = &line->d.info[k];
        if (entry->key == id)
            return entry;
        k++;
    }
    return NULL;
}
5815
5816
5817
/*
 * Copy the values of INFO tag `tag` into the caller's buffer.
 *
 *   dst, ndst  growable output buffer and its capacity, counted in elements
 *              (bytes for BCF_HT_STR).  The buffer is enlarged with
 *              realloc() when too small; if that fails, *dst and *ndst are
 *              left unchanged.
 *   type       BCF_HT_FLAG, BCF_HT_STR, BCF_HT_INT, BCF_HT_LONG or
 *              BCF_HT_REAL
 *
 * Returns, for BCF_HT_FLAG, 1 if the flag is set and 0 if not; for
 * BCF_HT_STR the string length (the copy is NUL-terminated); otherwise the
 * number of values written, stopping at the first vector-end marker.
 * Negative values are errors:
 *   -1  tag not defined as INFO in the header
 *   -2  requested type differs from the header's, or unexpected type
 *   -3  tag not present in this record (or marked for removal)
 *   -4  out of memory
 */
int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
{
    int i, ret = -4, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,tag_id) ) return -1;    // no such INFO field in the header
    if ( bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id)!=(type & 0xff) ) return -2;     // expected different type

    if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);

    for (i=0; i<line->n_info; i++)
        if ( line->d.info[i].key==tag_id ) break;
    if ( i==line->n_info ) return ( type==BCF_HT_FLAG ) ? 0 : -3;       // the tag is not present in this record
    if ( type==BCF_HT_FLAG ) return 1;

    bcf_info_t *info = &line->d.info[i];
    if ( !info->vptr ) return -3;           // the tag was marked for removal
    if ( type==BCF_HT_STR )
    {
        if ( *ndst < info->len+1 )
        {
            // Grow through a temporary so the caller's buffer survives a
            // failed realloc
            void *grown = realloc(*dst, (size_t) info->len + 1);
            if ( !grown ) return -4;    // could not alloc
            *dst  = grown;
            *ndst = info->len + 1;
        }
        memcpy(*dst,info->vptr,info->len);
        ((uint8_t*)*dst)[info->len] = 0;
        return info->len;
    }

    // Make sure the buffer is big enough
    int size1;
    switch (type) {
        case BCF_HT_INT:  size1 = sizeof(int32_t); break;
        case BCF_HT_LONG: size1 = sizeof(int64_t); break;
        case BCF_HT_REAL: size1 = sizeof(float); break;
        default:
            hts_log_error("Unexpected output type %d at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
            return -2;
    }
    if ( *ndst < info->len )
    {
        // Compute the byte count in size_t so it cannot overflow an int
        void *grown = realloc(*dst, (size_t) info->len * (size_t) size1);
        if ( !grown ) return -4;    // could not alloc
        *dst  = grown;
        *ndst = info->len;
    }

    // Convert each stored value to the output type, mapping the stored
    // "missing" marker to the output one and stopping at vector end.
    #define BRANCH(type_t, convert, is_missing, is_vector_end, set_missing, set_regular, out_type_t) do { \
        out_type_t *tmp = (out_type_t *) *dst; \
        int j; \
        for (j=0; j<info->len; j++) \
        { \
            type_t p = convert(info->vptr + j * sizeof(type_t)); \
            if ( is_vector_end ) break; \
            if ( is_missing ) set_missing; \
            else set_regular; \
            tmp++; \
        } \
        ret = j; \
    } while (0)
    switch (info->type) {
        case BCF_BT_INT8:
            if (type == BCF_HT_LONG) {
                BRANCH(int8_t,  le_to_i8,  p==bcf_int8_missing,  p==bcf_int8_vector_end,  *tmp=bcf_int64_missing, *tmp=p, int64_t);
            } else {
                BRANCH(int8_t,  le_to_i8,  p==bcf_int8_missing,  p==bcf_int8_vector_end,  *tmp=bcf_int32_missing, *tmp=p, int32_t);
            }
            break;
        case BCF_BT_INT16:
            if (type == BCF_HT_LONG) {
                BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t);
            } else {
                BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t);
            }
            break;
        case BCF_BT_INT32:
            if (type == BCF_HT_LONG) {
                BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t);
            } else {
                BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t);
            }
            break;
        case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set(tmp, p), float); break;
        default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, info->type, bcf_seqname_safe(hdr,line), line->pos+1); return -2;
    }
    #undef BRANCH
    return ret;  // set by BRANCH
}
5900
5901
/*
 * Copy the per-sample strings of FORMAT tag `tag` into caller-owned
 * storage.
 *
 *   dst   address of an array of per-sample string pointers; allocated here
 *         when *dst is NULL.  (*dst)[0] owns one buffer that holds all the
 *         strings, and every (*dst)[i] points into that buffer.
 *   ndst  capacity in bytes of the (*dst)[0] buffer; updated when the
 *         buffer is enlarged
 *
 * Each string is copied with fmt->n bytes and NUL-terminated.
 *
 * Returns the number of bytes used in the string buffer, or a negative
 * value on error:
 *   -1  tag not defined as FORMAT in the header
 *   -2  the header does not declare the tag as a string
 *   -3  tag not present in this record (or marked for removal)
 *   -4  out of memory; previously allocated buffers are left intact
 */
int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst)
{
    int i,tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1;    // no such FORMAT field in the header
    if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2;     // expected different type

    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);

    for (i=0; i<line->n_fmt; i++)
        if ( line->d.fmt[i].id==tag_id ) break;
    if ( i==line->n_fmt ) return -3;                               // the tag is not present in this record
    bcf_fmt_t *fmt = &line->d.fmt[i];
    if ( !fmt->p ) return -3;                                      // the tag was marked for removal

    int nsmpl = bcf_hdr_nsamples(hdr);
    if ( !*dst )
    {
        // Slot 0 is written below, so allocate at least one pointer even
        // when the header has no samples.
        *dst = (char**) malloc(sizeof(char*) * (nsmpl > 0 ? nsmpl : 1));
        if ( !*dst ) return -4;     // could not alloc
        (*dst)[0] = NULL;
        *ndst = 0;                  // a new pointer array has no string buffer yet
    }
    int n = (fmt->n+1)*nsmpl;
    if ( *ndst < n )
    {
        // Grow through a temporary so the old buffer survives a failed
        // realloc
        char *grown = realloc((*dst)[0], n);
        if ( !grown ) return -4;    // could not alloc
        (*dst)[0] = grown;
        *ndst = n;
    }
    uint8_t *base = (uint8_t*)(*dst)[0];
    for (i=0; i<nsmpl; i++)
    {
        uint8_t *src = fmt->p + i*fmt->n;
        uint8_t *tmp = base + i*(fmt->n+1);
        memcpy(tmp,src,fmt->n);
        tmp[fmt->n] = 0;
        (*dst)[i] = (char*) tmp;
    }
    return n;
}
5939
5940
/*
 * Copy the values of FORMAT tag `tag` for all samples into the caller's
 * buffer.
 *
 *   dst, ndst  growable output buffer and its capacity, counted in elements
 *              (bytes for BCF_HT_STR).  The buffer is enlarged with
 *              realloc() when too small; if that fails, *dst and *ndst are
 *              left unchanged.
 *   type       BCF_HT_INT, BCF_HT_REAL or BCF_HT_STR
 *
 * Numeric output holds fmt->n values per sample; values after a vector-end
 * marker are filled with the output type's vector-end marker.
 *
 * Returns the number of elements written (nsamples * fmt->n), or a negative
 * value on error:
 *   -1  tag not defined as FORMAT in the header
 *   -2  requested type differs from the header's, or unexpected stored type
 *   -3  tag not present in this record (or marked for removal)
 *   -4  out of memory
 */
int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
{
    int i,j, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1;    // no such FORMAT field in the header
    if ( tag[0]=='G' && tag[1]=='T' && tag[2]==0 )
    {
        // Ugly: GT field is considered to be a string by the VCF header but BCF represents it as INT.
        if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2;
    }
    else if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=type ) return -2;     // expected different type

    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);

    for (i=0; i<line->n_fmt; i++)
        if ( line->d.fmt[i].id==tag_id ) break;
    if ( i==line->n_fmt ) return -3;                               // the tag is not present in this record
    bcf_fmt_t *fmt = &line->d.fmt[i];
    if ( !fmt->p ) return -3;                                      // the tag was marked for removal

    if ( type==BCF_HT_STR )
    {
        int n = fmt->n*bcf_hdr_nsamples(hdr);
        if ( *ndst < n )
        {
            // Grow through a temporary so the caller's buffer survives a
            // failed realloc
            void *grown = realloc(*dst, n);
            if ( !grown ) return -4;     // could not alloc
            *dst  = grown;
            *ndst = n;
        }
        memcpy(*dst,fmt->p,n);
        return n;
    }

    // Make sure the buffer is big enough
    int nsmpl = bcf_hdr_nsamples(hdr);
    int size1 = type==BCF_HT_INT ? sizeof(int32_t) : sizeof(float);
    if ( *ndst < fmt->n*nsmpl )
    {
        // Only publish the new capacity once the allocation has succeeded
        void *grown = realloc(*dst, (size_t) fmt->n * (size_t) nsmpl * (size_t) size1);
        if ( !grown ) return -4;     // could not alloc
        *dst  = grown;
        *ndst = fmt->n*nsmpl;
    }

    // Convert sample by sample; once a vector-end marker is met the rest of
    // that sample's slots are padded with the output vector-end marker.
    #define BRANCH(type_t, convert, is_missing, is_vector_end, set_missing, set_vector_end, set_regular, out_type_t) { \
        out_type_t *tmp = (out_type_t *) *dst; \
        uint8_t *fmt_p = fmt->p; \
        for (i=0; i<nsmpl; i++) \
        { \
            for (j=0; j<fmt->n; j++) \
            { \
                type_t p = convert(fmt_p + j * sizeof(type_t)); \
                if ( is_missing ) set_missing; \
                else if ( is_vector_end ) { set_vector_end; break; } \
                else set_regular; \
                tmp++; \
            } \
            for (; j<fmt->n; j++) { set_vector_end; tmp++; } \
            fmt_p += fmt->size; \
        } \
    }
    switch (fmt->type) {
        case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8, p==bcf_int8_missing,  p==bcf_int8_vector_end,  *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
        case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
        case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
        case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set_vector_end(*tmp), bcf_float_set(tmp, p), float); break;
        // Report the problem to the caller instead of terminating the process
        default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, fmt->type, bcf_seqname_safe(hdr,line), line->pos+1); return -2;
    }
    #undef BRANCH
    return nsmpl*fmt->n;
}
6009
6010
//error description structure definition
6011
// One entry of an error-description table: an error code and its text
typedef struct err_desc {
    int errorcode;              // error code this entry describes
    const char *description;    // text associated with errorcode
} err_desc;
6015
6016
// error descriptions
6017
static const err_desc errdesc_bcf[] = {
6018
    { BCF_ERR_CTG_UNDEF, "Contig not defined in header"},
6019
    { BCF_ERR_TAG_UNDEF, "Tag not defined in header" },
6020
    { BCF_ERR_NCOLS, "Incorrect number of columns" },
6021
    { BCF_ERR_LIMITS, "Limits reached" },
6022
    { BCF_ERR_CHAR, "Invalid character" },
6023
    { BCF_ERR_CTG_INVALID, "Invalid contig" },
6024
    { BCF_ERR_TAG_INVALID, "Invalid tag" },
6025
};
6026
6027
/// Append a description to a message buffer, writing "..." when it does not fit
    /** @param buffer       buffer to which the description is appended
        @param offset       position in buffer at which to append; advanced
                            past the appended text on success
        @param maxbuffer    total size of the buffer
        @param description  the description to be appended
        @return 0 on success; -1 on invalid parameters (NULL pointer or
                maxbuffer < 4) or when the description does not fit.  In the
                latter case "..." is written instead and *offset is not
                updated.

        A ',' separator precedes the description unless *offset is 0.
    */
static int add_desc_to_buffer(char *buffer, size_t *offset, size_t maxbuffer, const char *description) {

    if (!buffer || !offset || !description || maxbuffer < 4)
        return -1;

    const size_t room = maxbuffer - *offset;
    const int is_first = (room == maxbuffer);   // nothing written so far
    const size_t needed = strlen(description) + (is_first ? 0 : 1);    // text plus optional ','

    if (room <= needed) {
        // Not enough space for the description and its terminator: put "..."
        // at the current position, or over the last 4 bytes of the buffer
        // when even that would not fit.
        size_t ellipsis_at = (room <= 4) ? maxbuffer - 4 : *offset;
        snprintf(buffer + ellipsis_at, 4, "...");    //ignore offset update
        return -1;
    }

    *offset += snprintf(buffer + *offset, room, "%s%s", is_first ? "" : ",", description);
    return 0;
}
6050
6051
//get description for given error code. return NULL on error
6052
1.54k
const char *bcf_strerror(int errorcode, char *buffer, size_t maxbuffer) {
6053
1.54k
    size_t usedup = 0;
6054
1.54k
    int ret = 0;
6055
1.54k
    int idx;
6056
6057
1.54k
    if (!buffer || maxbuffer < 4)
6058
0
        return NULL;           //invalid / insufficient buffer
6059
6060
1.54k
    if (!errorcode) {
6061
0
        buffer[0] = '\0';      //no error, set null
6062
0
        return buffer;
6063
0
    }
6064
6065
12.3k
    for (idx = 0; idx < sizeof(errdesc_bcf) / sizeof(err_desc); ++idx) {
6066
10.8k
        if (errorcode & errdesc_bcf[idx].errorcode) {    //error is set, add description
6067
3.22k
            ret = add_desc_to_buffer(buffer, &usedup, maxbuffer, errdesc_bcf[idx].description);
6068
3.22k
            if (ret < 0)
6069
0
                break;         //not enough space, ... added, no need to continue
6070
6071
3.22k
            errorcode &= ~errdesc_bcf[idx].errorcode;    //reset the error
6072
3.22k
        }
6073
10.8k
    }
6074
6075
1.54k
    if (errorcode && (ret >= 0))  {     //undescribed error is present in error code and had enough buffer, try to add unkonwn error as well§
6076
0
        add_desc_to_buffer(buffer, &usedup, maxbuffer, "Unknown error");
6077
0
    }
6078
1.54k
    return buffer;
6079
1.54k
}
6080
6081
/**
6082
 *  bcf_format_gt_v2 - formats GT information on a string
6083
 *  @param hdr - bcf header, to get version
6084
 *  @param fmt - pointer to bcf format data
6085
 *  @param isample - position of interested sample in data
6086
 *  @param str - pointer to output string
6087
 *  Returns 0 on success and -1 on failure
6088
 *  This method is preferred over bcf_format_gt as this supports vcf4.4 and
6089
 *  prefixed phasing. Explicit / prefixed phasing for 1st allele is used only
6090
 *  when it is a must to correctly express phasing.
6091
 * correctly express phasing.
6092
 */
6093
int bcf_format_gt_v2(const bcf_hdr_t *hdr, bcf_fmt_t *fmt, int isample, kstring_t *str)
6094
19.2k
{
6095
19.2k
    uint32_t e = 0;
6096
19.2k
    int ploidy = 1, anyunphased = 0;
6097
19.2k
    int32_t val0 = 0;
6098
19.2k
    size_t pos = str ? str->l : 0;
6099
6100
19.2k
    #define BRANCH(type_t, convert, missing, vector_end) { \
6101
17.6k
        uint8_t *ptr = fmt->p + isample*fmt->size; \
6102
17.6k
        int i; \
6103
44.1k
        for (i=0; i<fmt->n; i++, ptr += sizeof(type_t)) \
6104
37.1k
        { \
6105
37.1k
            type_t val = convert(ptr); \
6106
37.1k
            if ( val == vector_end ) break; \
6107
37.1k
            if (!i) { val0 = val; } \
6108
26.4k
            if (i) { \
6109
8.70k
                e |= kputc("/|"[val & 1], str) < 0; \
6110
8.70k
                anyunphased |= !(val & 1); \
6111
8.70k
            } \
6112
26.4k
            if (!(val >> 1)) e |= kputc('.', str) < 0; \
6113
26.4k
            else e |= kputw((val >> 1) - 1, str) < 0; \
6114
26.4k
        } \
6115
17.6k
        if (i == 0) e |= kputc('.', str) < 0; \
6116
17.6k
        ploidy = i; \
6117
17.6k
    }
6118
19.2k
    switch (fmt->type) {
6119
9.87k
        case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8,  bcf_int8_missing,
6120
9.87k
            bcf_int8_vector_end); break;
6121
2.54k
        case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, bcf_int16_missing,
6122
2.54k
            bcf_int16_vector_end); break;
6123
5.27k
        case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, bcf_int32_missing,
6124
5.27k
            bcf_int32_vector_end); break;
6125
1.54k
        case BCF_BT_NULL:  e |= kputc('.', str) < 0; break;
6126
0
        default: hts_log_error("Unexpected type %d", fmt->type); return -2;
6127
19.2k
    }
6128
19.2k
    #undef BRANCH
6129
6130
19.2k
    if (hdr && get_hdr_aux(hdr)->version >= VCF44) {
6131
        //output which supports prefixed phasing
6132
6133
        /* update 1st allele's phasing if required and append rest to it.
6134
        use prefixed phasing only when it is a must. i.e. without which the
6135
        inferred value will be incorrect */
6136
10.1k
        if (val0 & 1) {
6137
            /* 1st one is phased, if ploidy is > 1 and an unphased allele exists
6138
             need to specify explicitly */
6139
1.02k
            e |= (ploidy > 1 && anyunphased) ?
6140
0
                    (kinsert_char('|', pos, str) < 0) :
6141
1.02k
                        (ploidy <= 1 && !((val0 >> 1)) ? //|. needs explicit o/p
6142
0
                            (kinsert_char('|', pos, str) < 0) :
6143
1.02k
                            0);
6144
9.08k
        } else {
6145
            /* 1st allele is unphased, if ploidy is = 1 or allele is '.' or
6146
             ploidy > 1 and no other unphased allele exist, need to specify
6147
             explicitly */
6148
9.08k
            e |= ((ploidy <= 1 && val0 != 0) || (ploidy > 1 && !anyunphased)) ?
6149
4.48k
                    (kinsert_char('/', pos, str) < 0) :
6150
9.08k
                    0;
6151
9.08k
        }
6152
10.1k
    }
6153
19.2k
    return e == 0 ? 0 : -1;
6154
19.2k
}
6155
6156
/**
6157
 *  get_rlen - calculates and returns rlen value
6158
 *  @param h - bcf header
6159
 *  @param v - bcf data
6160
 *  Returns rlen calculated on success and -1 on failure.
6161
 *  rlen calculation is dependent on vcf version and a few other field data.
6162
 *  When bcf decoded data is available, refers it. When not available, retrieves
6163
 *  required field data by seeking on the data stream.
6164
 *  Ideally pos & version be set appropriately before any info/format field
6165
 *  update to have proper rlen calculation.
6166
 *  As version is not kept properly updated in practice, it is ignored in calcs.
6167
 */
6168
static int64_t get_rlen(const bcf_hdr_t *h, bcf1_t *v)
6169
41.1k
{
6170
41.1k
    uint8_t *f = (uint8_t*)v->shared.s, *t = NULL,
6171
41.1k
        *e = (uint8_t*)v->shared.s + v->shared.l;
6172
41.1k
    int size, type, id, lenid, endid, svlenid, i, bad, gvcf = 0, use_svlen = 0;
6173
41.1k
    bcf_info_t *endinfo = NULL, *svleninfo = NULL, end_lcl, svlen_lcl;
6174
41.1k
    bcf_fmt_t *lenfmt = NULL, len_lcl;
6175
6176
    //holds SVLEN allele status for the max no of alleles
6177
41.1k
    uint8_t svlenals[8192];
6178
    //pos from info END, fmt LEN, info SVLEN
6179
41.1k
    hts_pos_t end = 0, end_fmtlen = 0, end_svlen = 0, hpos;
6180
41.1k
    int64_t len_ref = 0, len = 0, tmp;
6181
41.1k
    endid = bcf_hdr_id2int(h, BCF_DT_ID, "END");
6182
6183
    //initialise bytes which are to be used
6184
41.1k
    memset(svlenals, 0, 1 + v->n_allele / 8);
6185
6186
    //use decoded data where ever available and where not, get from stream
6187
41.1k
    if (v->unpacked & BCF_UN_STR || v->d.shared_dirty & BCF1_DIRTY_ALS) {
6188
0
        for (i = 1; i < v->n_allele; ++i) {
6189
            // check only symbolic alt alleles
6190
0
            if (v->d.allele[i][0] != '<')
6191
0
                continue;
6192
0
            if (svlen_on_ref_for_vcf_alt(v->d.allele[i], -1)) {
6193
                // del, dup or cnv allele, note to check corresponding svlen val
6194
0
                svlenals[i >> 3] |= 1 << (i & 7);
6195
0
                use_svlen = 1;
6196
0
            } else if (!strcmp(v->d.allele[i], "<*>") ||
6197
0
                         !strcmp(v->d.allele[i], "<NON_REF>")) {
6198
0
                gvcf = 1;   //gvcf present, have to check for LEN field
6199
0
            }
6200
0
        }
6201
0
        f += v->unpack_size[0] + v->unpack_size[1];
6202
0
        len_ref = v->n_allele ? strlen(v->d.allele[0]) : 0;
6203
41.1k
    } else if (f < e) {
6204
        //skip ID
6205
41.1k
        size = bcf_dec_size(f, &f, &type);
6206
41.1k
        f += size << bcf_type_shift[type];
6207
        // REF, ALT
6208
2.36M
        for (i = 0; i < v->n_allele; ++i) {
6209
            //check all alleles, w/o NUL
6210
2.32M
            size = bcf_dec_size(f, &f, &type);
6211
2.32M
            if (!i) {   //REF length
6212
41.1k
                len_ref = size;
6213
2.28M
            } else if (size > 0 && *f == '<') {
6214
2.93k
                if (svlen_on_ref_for_vcf_alt((char *) f, size)) {
6215
                    // del, dup or cnv allele, note to check corresponding svlen val
6216
24
                    svlenals[i >> 3] |= 1 << (i & 7);
6217
24
                    use_svlen = 1;
6218
2.90k
                } else if ((size == 3 && !strncmp((char*)f, "<*>", size)) ||
6219
2.77k
                    (size == 9 && !strncmp((char*)f, "<NON_REF>", size))) {
6220
176
                    gvcf = 1;   //gvcf present, have to check for LEN field
6221
176
                }
6222
2.93k
            }
6223
2.32M
            f += size << bcf_type_shift[type];
6224
2.32M
        }
6225
41.1k
    }
6226
    // FILTER
6227
41.1k
    if (v->unpacked & BCF_UN_FLT) {
6228
0
        f += v->unpack_size[2];
6229
41.1k
    } else if (f < e) {
6230
41.1k
        size = bcf_dec_size(f, &f, &type);
6231
41.1k
        f += size << bcf_type_shift[type];
6232
41.1k
    }
6233
6234
    // Only do SVLEN lookup if there are suitable symbolic alleles
6235
41.1k
    svlenid = use_svlen ? bcf_hdr_id2int(h, BCF_DT_ID, "SVLEN") : -1;
6236
6237
    // INFO
6238
41.1k
    if (svlenid >= 0 || endid >= 0 ) {  //only if end/svlen present
6239
17.2k
        if (v->unpacked & BCF_UN_INFO || v->d.shared_dirty & BCF1_DIRTY_INF) {
6240
0
            endinfo = bcf_get_info(h, v, "END");
6241
0
            svleninfo = bcf_get_info(h, v, "SVLEN");
6242
17.2k
        } else if (f < e) {
6243
14.7k
            for (i = 0; i < v->n_info; ++i) {
6244
7.84k
                id = bcf_dec_typed_int1(f, &t);
6245
7.84k
                if (id == endid) {  //END
6246
540
                    t = bcf_unpack_info_core1(f, &end_lcl);
6247
540
                    endinfo = &end_lcl;
6248
540
                    if (svleninfo || svlenid < 0) {
6249
540
                        break;  //already got svlen or no need to search further
6250
540
                    }
6251
7.30k
                } else if (id == svlenid) { //SVLEN
6252
0
                    t = bcf_unpack_info_core1(f, &svlen_lcl);
6253
0
                    svleninfo = &svlen_lcl;
6254
0
                    if (endinfo || endid < 0 ) {
6255
0
                        break;  //already got end or no need to search further
6256
0
                    }
6257
7.30k
                } else {
6258
7.30k
                    f = t;
6259
7.30k
                    size = bcf_dec_size(f, &t, &type);
6260
7.30k
                    t += size << bcf_type_shift[type];
6261
7.30k
                }
6262
7.30k
                f = t;
6263
7.30k
            }
6264
7.45k
        }
6265
17.2k
    }
6266
6267
    // Only do LEN lookup if a <*> allele was found
6268
41.1k
    lenid = gvcf ? bcf_hdr_id2int(h, BCF_DT_ID, "LEN") : -1;
6269
6270
    // FORMAT
6271
41.1k
    if (lenid >= 0) {
6272
        //with LEN and has gvcf allele
6273
0
        f = (uint8_t*)v->indiv.s; t = NULL; e = (uint8_t*)v->indiv.s + v->indiv.l;
6274
0
        if (v->unpacked & BCF_UN_FMT || v->d.indiv_dirty) {
6275
0
            lenfmt = bcf_get_fmt(h, v, "LEN");
6276
0
        } else if (f < e) {
6277
0
            for (i = 0; i < v->n_fmt; ++i) {
6278
0
                id = bcf_dec_typed_int1(f, &t);
6279
0
                if (id == lenid) {
6280
0
                        t = bcf_unpack_fmt_core1(f, v->n_sample, &len_lcl);
6281
0
                    lenfmt = &len_lcl;
6282
0
                    break;  //that's all needed
6283
0
                } else {
6284
0
                    f = t;
6285
0
                    size = bcf_dec_size(f, &t, &type);
6286
0
                    t += size * v->n_sample << bcf_type_shift[type];
6287
0
                }
6288
0
                f = t;
6289
0
            }
6290
0
        }
6291
0
    }
6292
    //got required data, find end and rlen
6293
41.1k
    if (endinfo && endinfo->vptr) { //end position given by info END
6294
        //end info exists, not being deleted
6295
540
        end = endinfo->v1.i;
6296
540
        switch(endinfo->type) {
6297
0
            case BCF_BT_INT8:  end = end == bcf_int8_missing ? 0 : end;  break;
6298
0
            case BCF_BT_INT16: end = end == bcf_int16_missing ? 0 : end; break;
6299
0
            case BCF_BT_INT32: end = end == bcf_int32_missing ? 0 : end; break;
6300
0
            case BCF_BT_INT64: end = end == bcf_int64_missing ? 0 : end; break;
6301
540
            default: end = 0; break; //invalid
6302
540
        }
6303
540
    }
6304
6305
41.1k
    if (svleninfo && svleninfo->vptr) {
6306
        //svlen info exists, not being deleted
6307
0
        bad = 0;
6308
        //get largest svlen corresponding to a <DEL> symbolic allele
6309
0
        for (i = 0; i < svleninfo->len && i + 1 < v->n_allele; ++i) {
6310
0
            if (!(svlenals[i >> 3] & (1 << ((i + 1) & 7))))
6311
0
                continue;
6312
6313
0
            switch(svleninfo->type) {
6314
0
                case BCF_BT_INT8:
6315
0
                    tmp = le_to_i8(&svleninfo->vptr[i]);
6316
0
                    tmp = tmp == bcf_int8_missing ? 0 : tmp;
6317
0
                break;
6318
0
                case BCF_BT_INT16:
6319
0
                    tmp = le_to_i16(&svleninfo->vptr[i * 2]);
6320
0
                    tmp = tmp == bcf_int16_missing ? 0 : tmp;
6321
0
                break;
6322
0
                case BCF_BT_INT32:
6323
0
                    tmp = le_to_i32(&svleninfo->vptr[i * 4]);
6324
0
                    tmp = tmp == bcf_int32_missing ? 0 : tmp;
6325
0
                break;
6326
0
                case BCF_BT_INT64:
6327
0
                    tmp = le_to_i64(&svleninfo->vptr[i * 8]);
6328
0
                    tmp = tmp == bcf_int64_missing ? 0 : tmp;
6329
0
                break;
6330
0
                default: //invalid
6331
0
                    tmp = 0;
6332
0
                    bad = 1;
6333
0
                break;
6334
0
            }
6335
0
            if (bad) {  //stop svlen check
6336
0
                len = 0;
6337
0
                break;
6338
0
            }
6339
6340
0
            tmp = tmp < 0 ? llabs(tmp) : tmp;
6341
0
            if (len < tmp) len = tmp;
6342
0
        }
6343
0
    }
6344
41.1k
    if ((!svleninfo || !len) && end) { //no svlen, infer from end
6345
0
        len = end > v->pos ? end - v->pos - 1 : 0;
6346
0
    }
6347
41.1k
    end_svlen = v->pos + len + 1;   //end position found from SVLEN
6348
6349
41.1k
    len = 0;
6350
41.1k
    if (lenfmt && lenfmt->p) {
6351
        //fmt len exists, not being deleted, has gvcf and version >= 4.5
6352
0
        int j = 0;
6353
0
        int64_t offset = 0;
6354
0
        bad = 0;
6355
0
        for (i = 0; i < v->n_sample; ++i) {
6356
0
            for (j = 0; j < lenfmt->n; ++j) {
6357
0
                switch(lenfmt->type) {
6358
0
                case BCF_BT_INT8:
6359
0
                    tmp = le_to_i8(lenfmt->p + offset + j);
6360
0
                    tmp = tmp == bcf_int8_missing ? 0 : tmp;
6361
0
                break;
6362
0
                case BCF_BT_INT16:
6363
0
                    tmp = le_to_i16(lenfmt->p + offset + j * 2);
6364
0
                    tmp = tmp == bcf_int16_missing ? 0 : tmp;
6365
0
                break;
6366
0
                case BCF_BT_INT32:
6367
0
                    tmp = le_to_i32(lenfmt->p + offset + j * 4);
6368
0
                    tmp = tmp == bcf_int32_missing ? 0 : tmp;
6369
0
                break;
6370
0
                case BCF_BT_INT64:
6371
0
                    tmp = le_to_i64(lenfmt->p + offset + j * 8);
6372
0
                    tmp = tmp == bcf_int64_missing ? 0 : tmp;
6373
0
                break;
6374
0
                default: //invalid
6375
0
                    bad = 1;
6376
0
                break;
6377
0
                }
6378
0
                if (bad) {  //stop LEN check
6379
0
                    len = 0;
6380
0
                    break;
6381
0
                }
6382
                //assumes only gvcf have valid LEN
6383
0
                if (len < tmp) len = tmp;
6384
0
            }
6385
0
            offset += j << bcf_type_shift[lenfmt->type];
6386
0
        }
6387
0
    }
6388
41.1k
    if ((!lenfmt || !len) && end) { //no fmt len, infer from end
6389
0
        len = end > v->pos ? end - v->pos : 0;
6390
0
    }
6391
41.1k
    end_fmtlen = v->pos + len;  //end position found from LEN
6392
6393
    //get largest pos, based on END, SVLEN, fmt LEN and length using it
6394
41.1k
    hpos = end < end_svlen ?
6395
10.8k
            end_svlen < end_fmtlen ? end_fmtlen : end_svlen :
6396
41.1k
            end < end_fmtlen ? end_fmtlen : end;
6397
41.1k
    len = hpos - v->pos;
6398
6399
    //NOTE: 'end' calculation be in sync with tbx.c:tbx_parse1
6400
6401
    /* rlen to be calculated based on version, END, SVLEN, fmt LEN, ref len.
6402
    Relevance of these fields vary across different vcf versions.
6403
    Many times, these info/fmt fields are used without version updates;
6404
    hence these fields are used for calculation disregarding vcf version */
6405
41.1k
    return len < len_ref ? len_ref : len;
6406
41.1k
}