Coverage Report

Created: 2025-12-31 06:24

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/vcf.c
Line
Count
Source
1
/*  vcf.c -- VCF/BCF API functions.
2
3
    Copyright (C) 2012, 2013 Broad Institute.
4
    Copyright (C) 2012-2025 Genome Research Ltd.
5
    Portions copyright (C) 2014 Intel Corporation.
6
7
    Author: Heng Li <lh3@sanger.ac.uk>
8
9
Permission is hereby granted, free of charge, to any person obtaining a copy
10
of this software and associated documentation files (the "Software"), to deal
11
in the Software without restriction, including without limitation the rights
12
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13
copies of the Software, and to permit persons to whom the Software is
14
furnished to do so, subject to the following conditions:
15
16
The above copyright notice and this permission notice shall be included in
17
all copies or substantial portions of the Software.
18
19
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25
DEALINGS IN THE SOFTWARE.  */
26
27
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
28
#include <config.h>
29
30
#include <stdio.h>
31
#include <assert.h>
32
#include <string.h>
33
#include <strings.h>
34
#include <stdlib.h>
35
#include <limits.h>
36
#include <stdint.h>
37
#include <inttypes.h>
38
#include <errno.h>
39
40
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
41
#include "fuzz_settings.h"
42
#endif
43
44
#include "htslib/vcf.h"
45
#include "htslib/bgzf.h"
46
#include "htslib/tbx.h"
47
#include "htslib/hfile.h"
48
#include "hts_internal.h"
49
#include "htslib/hts_endian.h"
50
#include "htslib/khash_str2int.h"
51
#include "htslib/kstring.h"
52
#include "htslib/sam.h"
53
#include "htslib/khash.h"
54
#include "bgzf_internal.h"
55
56
#if 0
57
// This helps on Intel a bit, often 6-7% faster VCF parsing.
58
// Conversely sometimes harms AMD Zen4 as ~9% slower.
59
// Possibly related to IPC differences.  However for now it's just a
60
// curiosity we ignore and stick with the simpler code.
61
//
62
// Left here as a hint for future explorers.
63
/* Returns 1 when the NUL-terminated strings a and b are identical, else 0. */
static inline int xstreq(const char *a, const char *b) {
    /* Advance both cursors while they agree and a has not ended. */
    for (; *a != '\0' && *a == *b; a++, b++)
        ;
    /* Equal only if both stopped on the same character (i.e. both NUL). */
    return *a == *b;
}
68
69
#define KHASH_MAP_INIT_XSTR(name, khval_t) \
70
  KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, xstreq)
71
72
KHASH_MAP_INIT_XSTR(vdict, bcf_idinfo_t)
73
#else
74
KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t)
75
#endif
76
77
typedef khash_t(vdict) vdict_t;
78
79
KHASH_MAP_INIT_STR(hdict, bcf_hrec_t*)
80
typedef khash_t(hdict) hdict_t;
81
82
83
#include "htslib/kseq.h"
84
HTSLIB_EXPORT
85
uint32_t bcf_float_missing    = 0x7F800001;
86
87
HTSLIB_EXPORT
88
uint32_t bcf_float_vector_end = 0x7F800002;
89
90
HTSLIB_EXPORT
91
uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
92
93
static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, NULL, NULL}, .id = -1 };
94
95
/*
96
    Partial support for 64-bit POS and Number=1 INFO tags.
97
    Notes:
98
     - the support for 64-bit values is motivated by POS and INFO/END for large genomes
99
     - the use of 64-bit values does not conform to the specification
100
     - cannot output 64-bit BCF and if it does, it is not compatible with anything
101
     - experimental, use at your own risk
102
*/
103
#ifdef VCF_ALLOW_INT64
104
    #define BCF_MAX_BT_INT64 (0x7fffffffffffffff)       /* INT64_MAX, for internal use only */
105
    #define BCF_MIN_BT_INT64 -9223372036854775800LL     /* INT64_MIN + 8, for internal use only */
106
#endif
107
108
2.30k
#define BCF_IS_64BIT (1<<30)
109
110
111
// Opaque structure with auxiliary data which allows bcf_hdr_t to be extended without breaking ABI.
112
// Note that preserving the API and ABI requires that the first element is a vdict_t struct
113
// rather than a pointer, as user programs may (and in some cases do) access the dictionary
114
// directly as (vdict_t*)hdr->dict.
115
typedef struct
116
{
117
    vdict_t dict;   // bcf_hdr_t.dict[0] vdict_t dictionary which keeps bcf_idinfo_t for BCF_HL_FLT,BCF_HL_INFO,BCF_HL_FMT
118
    hdict_t *gen;   // hdict_t dictionary which keeps bcf_hrec_t* pointers for generic and structured fields
119
    size_t *key_len;// length of h->id[BCF_DT_ID] strings; freed and reset to NULL by bcf_hdr_sync()
120
    int version;    // cached version as encoded by bcf_get_version(); 0 is treated as "not cached"
121
    uint32_t ref_count; // reference count, changed in steps of 2; low bit indicates bcf_hdr_destroy() has been called
122
}
123
bcf_hdr_aux_t;
124
125
static inline bcf_hdr_aux_t *get_hdr_aux(const bcf_hdr_t *hdr)
126
424k
{
127
424k
    return (bcf_hdr_aux_t *)hdr->dict[0];
128
424k
}
129
130
//version macros
131
234k
#define VCF_DEF 4002000
132
47.9k
#define VCF44   4004000
133
47.8k
#define VCF45   4005000
134
135
#define VCF_MAJOR_VER(x) ( (x) / 10000 / 100 )
136
#define VCF_MINOR_VER(x) ( ((x) % 1000000) / 1000 )
137
138
/**
139
 *  bcf_get_version - get the version as int
140
 *  @param hdr   - bcf header, to get version
141
 *  @param verstr- version string, which is already available
142
 *  Returns version on success and default version on failure
143
 *  version = major * 100 * 10000 + minor * 1000
144
 */
145
static int bcf_get_version(const bcf_hdr_t *hdr, const char *verstr)
146
31.6k
{
147
31.6k
    const char *version = NULL, vcf[] = "VCFv";
148
31.6k
    char *major = NULL, *minor = NULL;
149
31.6k
    int ver = -1;
150
31.6k
    long tmp = 0;
151
31.6k
    bcf_hdr_aux_t *aux = NULL;
152
153
31.6k
    if (!hdr && !verstr) {  //invalid input
154
0
        goto fail;
155
0
    }
156
157
31.6k
    if (hdr) {
158
23.5k
        if ((aux = get_hdr_aux(hdr)) && aux->version != 0) {    //use cached version
159
22.3k
            return aux->version;
160
22.3k
        }
161
        //get from header
162
1.21k
        version = bcf_hdr_get_version(hdr);
163
8.05k
    } else {
164
        //get from version string
165
8.05k
        version = verstr;
166
8.05k
    }
167
9.26k
    if (!(major = strstr(version, vcf))) {  //bad format
168
6.67k
        goto fail;
169
6.67k
    }
170
2.58k
    major += sizeof(vcf) - 1;
171
2.58k
    if (!(minor = strchr(major, '.'))) {    //bad format
172
442
        goto fail;
173
442
    }
174
2.14k
    tmp = strtol(major, NULL, 10);
175
2.14k
    if ((!tmp && errno == EINVAL) ||
176
1.96k
        ((tmp == LONG_MIN || tmp == LONG_MAX) && errno == ERANGE)) {    //failed
177
189
        goto fail;
178
189
    }
179
1.95k
    ver = tmp * 100 * 10000;
180
1.95k
    tmp = strtol(++minor, NULL, 10);
181
1.95k
    if ((!tmp && errno == EINVAL) ||
182
1.82k
        ((tmp == LONG_MIN || tmp == LONG_MAX) && errno == ERANGE)) {    //failed
183
432
        goto fail;
184
432
    }
185
1.52k
    ver += tmp * 1000;
186
1.52k
    return ver;
187
188
7.73k
fail:
189
7.73k
    hts_log_warning("Couldn't get VCF version, considering as %d.%d",
190
7.73k
        VCF_MAJOR_VER(VCF_DEF), VCF_MINOR_VER(VCF_DEF));
191
7.73k
    return VCF_DEF;
192
1.95k
}
193
194
// Header reference counting
195
196
/*
 * Take one more reference on header h.  The counter moves in steps of 2
 * because its low bit is reserved (see bcf_hdr_aux_t.ref_count).
 */
static void bcf_hdr_incr_ref(bcf_hdr_t *h)
{
    get_hdr_aux(h)->ref_count += 2;
}
201
202
/*
 * Drop one reference on header h (a step of 2, leaving the low bit of
 * the counter untouched) and call bcf_hdr_destroy() once the counter
 * has reached zero.
 */
static void bcf_hdr_decr_ref(bcf_hdr_t *h)
{
    uint32_t *count = &get_hdr_aux(h)->ref_count;

    // Never step below zero: only subtract when a full reference is held
    if (*count > 1)
        *count -= 2;

    if (!*count)
        bcf_hdr_destroy(h);
}
211
212
static void hdr_bgzf_private_data_cleanup(void *data)
213
3.44k
{
214
3.44k
    bcf_hdr_t *h = (bcf_hdr_t *) data;
215
3.44k
    bcf_hdr_decr_ref(h);
216
3.44k
}
217
218
/*
 * Locate the "#CHROM\t" line in the NUL-terminated text s.
 * Returns s itself when the text starts with it, otherwise a pointer to
 * the first "#CHROM\t" that directly follows a newline, or NULL if there
 * is none.
 */
static char *find_chrom_header_line(char *s)
{
    static const char nl_chrom[] = "\n#CHROM\t";
    const char *chrom = nl_chrom + 1;           // same text minus the newline
    const size_t chrom_len = sizeof(nl_chrom) - 2;
    char *hit;

    if (strncmp(s, chrom, chrom_len) == 0)
        return s;

    hit = strstr(s, nl_chrom);
    return hit ? hit + 1 : NULL;
}
225
226
static int64_t get_rlen(const bcf_hdr_t *h, bcf1_t *v);
227
228
/*************************
229
 *** VCF header parser ***
230
 *************************/
231
232
/*
 * Register the sample name s[0..len) in header h.
 *
 * The name is copied, inserted into the BCF_DT_SAMPLE dictionary with
 * the next free id, appended to h->samples, and h->dirty is set.
 *
 * Returns 0 on success; -1 if the name is empty or only whitespace, if
 * it duplicates an existing sample, or if memory allocation fails.
 */
static int bcf_hdr_add_sample_len(bcf_hdr_t *h, const char *s, size_t len)
{
    // Reject names with nothing but whitespace before the end of the field
    size_t skipped = 0;
    while ( s[skipped] && isspace_c(s[skipped]) && skipped < len ) skipped++;
    if ( !s[skipped] || skipped == len )
    {
        hts_log_error("Empty sample name: trailing spaces/tabs in the header line?");
        return -1;
    }

    // Private, NUL-terminated copy of the whole field (leading spaces kept)
    char *name = malloc(len + 1);
    if (!name) return -1;
    memcpy(name, s, len);
    name[len] = 0;

    // Ensure space is available in h->samples
    vdict_t *samples = (vdict_t*)h->dict[BCF_DT_SAMPLE];
    size_t count = kh_size(samples);
    char **grown = realloc(h->samples, sizeof(char*) * (count + 1));
    if (!grown) {
        free(name);
        return -1;
    }
    h->samples = grown;

    int status;
    int k = kh_put(vdict, samples, name, &status);
    if (status < 0) {
        free(name);
        return -1;
    }
    if (status == 0) { // key was already in the dictionary
        hts_log_error("Duplicated sample name '%s'", name);
        free(name);
        return -1;
    }

    kh_val(samples, k) = bcf_idinfo_def;
    kh_val(samples, k).id = count;
    h->samples[count] = name;
    h->dirty = 1;
    return 0;
}
275
276
/*
 * Add the NUL-terminated sample name s to header h.
 * Returns the result of bcf_hdr_add_sample_len(), or 0 when s is NULL.
 */
int bcf_hdr_add_sample(bcf_hdr_t *h, const char *s)
{
    // s == NULL is allowed for backwards-compatibility: such a call
    // used to trigger bcf_hdr_sync(h) and is now simply a no-op.
    return s ? bcf_hdr_add_sample_len(h, s, strlen(s)) : 0;
}
285
286
/*
 * Parse the "#CHROM..." column header line in str and register every
 * sample name that follows the FORMAT column.
 *
 * Returns 0 on success (including a line with no FORMAT/sample columns)
 * and -1 if the mandatory columns or the FORMAT column are not as
 * expected, or if adding a sample fails.
 */
int HTS_RESULT_USED bcf_hdr_parse_sample_line(bcf_hdr_t *hdr, const char *str)
{
    static const char mandatory[] = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO";
    static const char format_col[] = "\tFORMAT\t";
    const size_t mandatory_len = sizeof(mandatory) - 1;
    const size_t format_len = sizeof(format_col) - 1;

    if ( strncmp(str, mandatory, mandatory_len) != 0 )
    {
        hts_log_error("Could not parse the \"#CHROM..\" line, either the fields are incorrect or spaces are present instead of tabs:\n\t%s",str);
        return -1;
    }

    const char *cur = str + mandatory_len;
    if ( *cur=='\0' || *cur=='\n' ) return 0;   // sites-only file, no samples
    if ( strncmp(cur, format_col, format_len) != 0 )
    {
        hts_log_error("Could not parse the \"#CHROM..\" line, either FORMAT is missing or spaces are present instead of tabs:\n\t%s",str);
        return -1;
    }
    cur += format_len;

    // One tab-separated sample name per iteration
    while ( *cur )
    {
        const char *stop = cur;
        while ( *stop && *stop!='\t' && *stop!='\n' ) stop++;
        if ( bcf_hdr_add_sample_len(hdr, cur, stop-cur) < 0 ) return -1;
        if ( *stop!='\t' ) break;               // end of line or of string
        cur = stop + 1;
    }
    return 0;
}
315
316
/*
 * Rebuild the h->id[] lookup arrays from the three header dictionaries
 * so that h->id[type][id] refers to the dictionary entry with that id,
 * drop the cached key lengths held in the auxiliary data, and clear
 * h->dirty.
 *
 * Returns 0 on success, -1 if growing an id array fails.
 */
int bcf_hdr_sync(bcf_hdr_t *h)
{
    int type;
    for (type = 0; type < 3; type++)
    {
        vdict_t *dict = (vdict_t*)h->dict[type];
        khint_t it;

        // this should be true only for type=2, BCF_DT_SAMPLE
        if ( h->n[type] < kh_size(dict) )
        {
            bcf_idpair_t *grown = (bcf_idpair_t*) realloc(h->id[type], kh_size(dict)*sizeof(bcf_idpair_t));
            if (!grown) return -1;
            h->id[type] = grown;
            h->n[type] = kh_size(dict);
        }

        // Point each id slot at its (key, value) pair in the hash table
        for (it = kh_begin(dict); it < kh_end(dict); it++)
        {
            if ( kh_exist(dict,it) )
            {
                bcf_idpair_t *pair = &h->id[type][kh_val(dict,it).id];
                pair->key = kh_key(dict,it);
                pair->val = &kh_val(dict,it);
            }
        }
    }

    // Invalidate key length cache
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
    if (aux && aux->key_len) {
        free(aux->key_len);
        aux->key_len = NULL;
    }

    h->dirty = 0;
    return 0;
}
350
351
/*
 * Free a header record together with its key, value and all of its
 * key/value attribute strings.  Passing NULL is a no-op.
 */
void bcf_hrec_destroy(bcf_hrec_t *hrec)
{
    if (hrec == NULL) return;

    int k;
    for (k = 0; k < hrec->nkeys; k++)
    {
        free(hrec->keys[k]);
        free(hrec->vals[k]);
    }
    free(hrec->keys);
    free(hrec->vals);
    free(hrec->value);  // free(NULL) is harmless
    free(hrec->key);
    free(hrec);
}
366
367
// Copies all fields except IDX.
368
bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec)
369
0
{
370
0
    int save_errno;
371
0
    bcf_hrec_t *out = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
372
0
    if (!out) return NULL;
373
374
0
    out->type = hrec->type;
375
0
    if ( hrec->key ) {
376
0
        out->key = strdup(hrec->key);
377
0
        if (!out->key) goto fail;
378
0
    }
379
0
    if ( hrec->value ) {
380
0
        out->value = strdup(hrec->value);
381
0
        if (!out->value) goto fail;
382
0
    }
383
0
    out->nkeys = hrec->nkeys;
384
0
    out->keys = (char**) malloc(sizeof(char*)*hrec->nkeys);
385
0
    if (!out->keys) goto fail;
386
0
    out->vals = (char**) malloc(sizeof(char*)*hrec->nkeys);
387
0
    if (!out->vals) goto fail;
388
0
    int i, j = 0;
389
0
    for (i=0; i<hrec->nkeys; i++)
390
0
    {
391
0
        if ( hrec->keys[i] && !strcmp("IDX",hrec->keys[i]) ) continue;
392
0
        if ( hrec->keys[i] ) {
393
0
            out->keys[j] = strdup(hrec->keys[i]);
394
0
            if (!out->keys[j]) goto fail;
395
0
        }
396
0
        if ( hrec->vals[i] ) {
397
0
            out->vals[j] = strdup(hrec->vals[i]);
398
0
            if (!out->vals[j]) goto fail;
399
0
        }
400
0
        j++;
401
0
    }
402
0
    if ( i!=j ) out->nkeys -= i-j;   // IDX was omitted
403
0
    return out;
404
405
0
 fail:
406
0
    save_errno = errno;
407
0
    hts_log_error("%s", strerror(errno));
408
0
    bcf_hrec_destroy(out);
409
0
    errno = save_errno;
410
0
    return NULL;
411
0
}
412
413
/*
 * Write a one-line, human-readable dump of header record hrec to fp:
 * its key and value (empty if unset) followed by every attribute as a
 * tab-separated [key]=[value] pair.
 */
void bcf_hrec_debug(FILE *fp, bcf_hrec_t *hrec)
{
    const char *value = hrec->value ? hrec->value : "";
    int k = 0;

    fprintf(fp, "key=[%s] value=[%s]", hrec->key, value);
    while (k < hrec->nkeys)
    {
        fprintf(fp, "\t[%s]=[%s]", hrec->keys[k], hrec->vals[k]);
        k++;
    }
    fputc('\n', fp);
}
421
422
/*
 * Print every header record of hdr to stderr in VCF header syntax:
 * "##key=value" for records that carry a plain value and
 * "##key=<k1=v1,k2=v2,...>" for the others.
 *
 * A record with neither a value nor any attributes is printed as
 * "##key=<>" rather than reading attribute slot 0, which does not exist
 * in that case.
 */
void bcf_header_debug(bcf_hdr_t *hdr)
{
    int i, j;
    for (i=0; i<hdr->nhrec; i++)
    {
        bcf_hrec_t *hrec = hdr->hrec[i];
        if ( hrec->value )
        {
            fprintf(stderr,"##%s=%s\n", hrec->key, hrec->value);
            continue;
        }

        fprintf(stderr, "##%s=<", hrec->key);
        for (j=0; j<hrec->nkeys; j++)   // comma before all but the first pair
            fprintf(stderr,"%s%s=%s", j ? "," : "", hrec->keys[j], hrec->vals[j]);
        fprintf(stderr,">\n");
    }
}
439
440
/*
 * Append a new attribute key, copied from str[0..len), to hrec.  The
 * matching value slot is initialised to NULL.  len must be non-zero.
 *
 * Returns 0 on success, -1 on allocation failure (hrec->nkeys is then
 * left unchanged).
 */
int bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, size_t len)
{
    const int slot = hrec->nkeys;       // index of the new attribute
    const size_t wanted = slot + 1;     // array length after the append
    char **grown;
    char *key;

    assert(len > 0 && len < SIZE_MAX);

    grown = realloc(hrec->keys, sizeof(char*)*wanted);
    if (!grown) return -1;
    hrec->keys = grown;

    grown = realloc(hrec->vals, sizeof(char*)*wanted);
    if (!grown) return -1;
    hrec->vals = grown;

    key = (char*) malloc((len+1)*sizeof(char));
    hrec->keys[slot] = key;
    if (!key) return -1;
    memcpy(key, str, len);
    key[len] = 0;

    hrec->vals[slot] = NULL;
    hrec->nkeys = wanted;
    return 0;
}
460
461
/*
 * Replace the value of attribute i of hrec with a copy of str[0..len).
 * Any previous value is freed first.  With str == NULL the value is just
 * cleared.  When is_quoted is non-zero the copy is wrapped in double
 * quotes.
 *
 * Returns 0 on success, -1 on failure (errno is set to ENOMEM when len
 * is too large for the copy to be allocated).
 */
int bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, size_t len, int is_quoted)
{
    char *copy, *dst;
    int too_long;

    if ( hrec->vals[i] ) {
        free(hrec->vals[i]);
        hrec->vals[i] = NULL;
    }
    if ( !str ) return 0;

    // Room is needed for the NUL and, if quoting, the two quote marks
    too_long = is_quoted ? (len >= SIZE_MAX - 3) : (len == SIZE_MAX);
    if (too_long) {
        errno = ENOMEM;
        return -1;
    }

    copy = (char*) malloc((len + (is_quoted ? 3 : 1))*sizeof(char));
    if (!copy) return -1;

    dst = copy;
    if ( is_quoted ) *dst++ = '"';
    memcpy(dst, str, len);
    dst += len;
    if ( is_quoted ) *dst++ = '"';
    *dst = 0;

    hrec->vals[i] = copy;
    return 0;
}
494
495
/*
 * Append an "IDX" attribute to hrec whose value is the decimal text of
 * idx.  Returns 0 on success, -1 on allocation failure (hrec->nkeys is
 * then left unchanged).
 */
int hrec_add_idx(bcf_hrec_t *hrec, int idx)
{
    const int slot = hrec->nkeys;
    const int wanted = slot + 1;
    char **grown;
    char *key;
    kstring_t num = {0,0,0};

    grown = (char**) realloc(hrec->keys, sizeof(char*)*wanted);
    if (!grown) return -1;
    hrec->keys = grown;

    grown = (char**) realloc(hrec->vals, sizeof(char*)*wanted);
    if (!grown) return -1;
    hrec->vals = grown;

    key = strdup("IDX");
    hrec->keys[slot] = key;
    if (!key) return -1;

    if (kputw(idx, &num) < 0) {
        free(key);
        return -1;
    }

    // The kstring buffer becomes the attribute value
    hrec->vals[slot] = num.s;
    hrec->nkeys = wanted;
    return 0;
}
518
519
/*
 * Return the index of the first attribute of hrec whose key equals key,
 * compared without regard to case, or -1 if there is none.
 */
int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key)
{
    int found = -1;
    int k = 0;

    while (found < 0 && k < hrec->nkeys)
    {
        if ( strcasecmp(key, hrec->keys[k]) == 0 ) found = k;
        k++;
    }
    return found;
}
526
527
/*
 * Set hrec->type from the record's key: "contig", "INFO", "FILTER" and
 * "FORMAT" map to their dedicated types; any other key is BCF_HL_STR if
 * the record has attributes and BCF_HL_GEN otherwise.
 */
static void bcf_hrec_set_type(bcf_hrec_t *hrec)
{
    static const struct { const char *key; int type; } known[] = {
        { "contig", BCF_HL_CTG  },
        { "INFO",   BCF_HL_INFO },
        { "FILTER", BCF_HL_FLT  },
        { "FORMAT", BCF_HL_FMT  },
    };
    size_t k;

    for (k = 0; k < sizeof(known)/sizeof(known[0]); k++)
    {
        if ( strcmp(hrec->key, known[k].key) == 0 )
        {
            hrec->type = known[k].type;
            return;
        }
    }
    hrec->type = hrec->nkeys>0 ? BCF_HL_STR : BCF_HL_GEN;
}
536
537
538
/**
539
    The arrays were generated with
540
541
    valid_ctg:
542
        perl -le '@v = (split(//,q[!#$%&*+./:;=?@^_|~-]),"a"..."z","A"..."Z","0"..."9"); @a = (0) x 256; foreach $c (@v) { $a[ord($c)] = 1; } print join(", ",@a)' | fold -w 48
543
544
    valid_tag:
545
        perl -le '@v = (split(//,q[_.]),"a"..."z","A"..."Z","0"..."9"); @a = (0) x 256; foreach $c (@v) { $a[ord($c)] = 1; } print join(", ",@a)' | fold -w 48
546
*/
547
// Lookup table indexed by unsigned byte value: 1 for characters that
// bcf_hrec_check() accepts in a contig ID, 0 for all others.
static const uint8_t valid_ctg[256] =
548
{
549
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
550
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
551
    0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1,
552
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
553
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
554
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
555
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
556
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,
557
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
558
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
559
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
560
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
561
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
562
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
563
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
564
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
565
};
566
// Lookup table indexed by unsigned byte value: 1 for characters that
// bcf_hrec_check() accepts in an INFO/FORMAT tag ID, 0 for all others.
static const uint8_t valid_tag[256] =
567
{
568
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
569
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
570
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
571
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
572
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
573
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
574
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
575
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
576
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
577
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
578
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
579
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
580
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
581
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
582
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
583
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
584
};
585
586
/**
587
    bcf_hrec_check() - check the validity of structured header lines
588
589
    Returns 0 on success or negative value on error.
590
591
    Currently the return status is not checked by the caller
592
    and only a warning is printed on stderr. This should be improved
593
    to propagate the error all the way up to the caller and let it
594
    decide what to do: throw an error or proceed anyway.
595
 */
596
static int bcf_hrec_check(bcf_hrec_t *hrec)
597
206k
{
598
206k
    int i;
599
206k
    bcf_hrec_set_type(hrec);
600
601
206k
    if ( hrec->type==BCF_HL_CTG )
602
21.2k
    {
603
21.2k
        i = bcf_hrec_find_key(hrec,"ID");
604
21.2k
        if ( i<0 ) goto err_missing_id;
605
10.8k
        char *val = hrec->vals[i];
606
10.8k
        if ( val[0]=='*' || val[0]=='=' || !valid_ctg[(uint8_t)val[0]] ) goto err_invalid_ctg;
607
87.4k
        while ( *(++val) )
608
86.9k
            if ( !valid_ctg[(uint8_t)*val] ) goto err_invalid_ctg;
609
478
        return 0;
610
1.41k
    }
611
185k
    if ( hrec->type==BCF_HL_INFO )
612
69.0k
    {
613
69.0k
        i = bcf_hrec_find_key(hrec,"ID");
614
69.0k
        if ( i<0 ) goto err_missing_id;
615
49.3k
        char *val = hrec->vals[i];
616
49.3k
        if ( !strcmp(val,"1000G") ) return 0;
617
49.3k
        if ( val[0]=='.' || (val[0]>='0' && val[0]<='9') || !valid_tag[(uint8_t)val[0]] ) goto err_invalid_tag;
618
19.0k
        while ( *(++val) )
619
15.8k
            if ( !valid_tag[(uint8_t)*val] ) goto err_invalid_tag;
620
3.20k
        return 0;
621
6.20k
    }
622
116k
    if ( hrec->type==BCF_HL_FMT )
623
15.0k
    {
624
15.0k
        i = bcf_hrec_find_key(hrec,"ID");
625
15.0k
        if ( i<0 ) goto err_missing_id;
626
12.5k
        char *val = hrec->vals[i];
627
12.5k
        if ( val[0]=='.' || (val[0]>='0' && val[0]<='9') || !valid_tag[(uint8_t)val[0]] ) goto err_invalid_tag;
628
14.1k
        while ( *(++val) )
629
10.8k
            if ( !valid_tag[(uint8_t)*val] ) goto err_invalid_tag;
630
3.34k
        return 0;
631
5.76k
    }
632
101k
    return 0;
633
634
32.5k
  err_missing_id:
635
32.5k
    hts_log_warning("Missing ID attribute in one or more header lines");
636
32.5k
    return -1;
637
638
10.3k
  err_invalid_ctg:
639
10.3k
    hts_log_warning("Invalid contig name: \"%s\"", hrec->vals[i]);
640
10.3k
    return -1;
641
642
55.3k
  err_invalid_tag:
643
55.3k
    hts_log_warning("Invalid tag name: \"%s\"", hrec->vals[i]);
644
55.3k
    return -1;
645
116k
}
646
647
/*
 * Returns 1 if the character at str is escaped, i.e. it is directly
 * preceded by an odd number of backslashes, and 0 otherwise.  The scan
 * backwards never looks at characters before min.
 */
static inline int is_escaped(const char *min, const char *str)
{
    int backslashes = 0;
    const char *c;

    for (c = str - 1; c >= min && *c == '\\'; c--)
        backslashes++;

    return backslashes % 2;
}
653
654
bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len)
655
240k
{
656
240k
    bcf_hrec_t *hrec = NULL;
657
240k
    const char *p = line;
658
240k
    if (p[0] != '#' || p[1] != '#') { *len = 0; return NULL; }
659
231k
    p += 2;
660
661
231k
    const char *q = p;
662
1.86M
    while ( *q && *q!='=' && *q != '\n' ) q++;
663
231k
    ptrdiff_t n = q-p;
664
231k
    if ( *q!='=' || !n ) // wrong format
665
11.2k
        goto malformed_line;
666
667
219k
    hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
668
219k
    if (!hrec) { *len = -1; return NULL; }
669
219k
    hrec->key = (char*) malloc(sizeof(char)*(n+1));
670
219k
    if (!hrec->key) goto fail;
671
219k
    memcpy(hrec->key,p,n);
672
219k
    hrec->key[n] = 0;
673
219k
    hrec->type = -1;
674
675
219k
    p = ++q;
676
219k
    if ( *p!='<' ) // generic field, e.g. ##samtoolsVersion=0.1.18-r579
677
51.8k
    {
678
10.6M
        while ( *q && *q!='\n' ) q++;
679
51.8k
        hrec->value = (char*) malloc((q-p+1)*sizeof(char));
680
51.8k
        if (!hrec->value) goto fail;
681
51.8k
        memcpy(hrec->value, p, q-p);
682
51.8k
        hrec->value[q-p] = 0;
683
51.8k
        *len = q - line + (*q ? 1 : 0); // Skip \n but not \0
684
51.8k
        return hrec;
685
51.8k
    }
686
687
    // structured line, e.g.
688
    // ##INFO=<ID=PV1,Number=1,Type=Float,Description="P-value for baseQ bias">
689
    // ##PEDIGREE=<Name_0=G0-ID,Name_1=G1-ID,Name_3=GN-ID>
690
168k
    int nopen = 1;
691
560k
    while ( *q && *q!='\n' && nopen>0 )
692
405k
    {
693
405k
        p = ++q;
694
405k
        while ( *q && *q==' ' ) { p++; q++; }
695
        // ^[A-Za-z_][0-9A-Za-z_.]*$
696
405k
        if (p==q && *q && (isalpha_c(*q) || *q=='_'))
697
401k
        {
698
401k
            q++;
699
2.17M
            while ( *q && (isalnum_c(*q) || *q=='_' || *q=='.') ) q++;
700
401k
        }
701
405k
        n = q-p;
702
405k
        int m = 0;
703
405k
        while ( *q && *q==' ' ) { q++; m++; }
704
405k
        if ( *q!='=' || !n )
705
13.2k
            goto malformed_line;
706
707
392k
        if (bcf_hrec_add_key(hrec, p, q-p-m) < 0) goto fail;
708
392k
        p = ++q;
709
394k
        while ( *q && *q==' ' ) { p++; q++; }
710
711
392k
        int quoted = 0;
712
392k
        char ending = '\0';
713
392k
        switch (*p) {
714
108k
        case '"':
715
108k
            quoted = 1;
716
108k
            ending = '"';
717
108k
            p++;
718
108k
            break;
719
96
        case '[':
720
96
            quoted = 1;
721
96
            ending = ']';
722
96
            break;
723
392k
        }
724
392k
        if ( quoted ) q++;
725
291M
        while ( *q && *q != '\n' )
726
291M
        {
727
291M
            if ( quoted ) { if ( *q==ending && !is_escaped(p,q) ) break; }
728
290M
            else
729
290M
            {
730
290M
                if ( *q=='<' ) nopen++;
731
290M
                if ( *q=='>' ) nopen--;
732
290M
                if ( !nopen ) break;
733
290M
                if ( *q==',' && nopen==1 ) break;
734
290M
            }
735
291M
            q++;
736
291M
        }
737
392k
        const char *r = q;
738
392k
        if (quoted && ending == ']') {
739
96
            if (*q == ending) {
740
69
                r++;
741
69
                q++;
742
69
                quoted = 0;
743
69
            } else {
744
27
                char buffer[320];
745
27
                hts_log_error("Missing ']' in header line %s",
746
27
                              hts_strprint(buffer, sizeof(buffer), '"',
747
27
                                           line, q-line));
748
27
                goto fail;
749
27
            }
750
96
        }
751
392k
        while ( r > p && r[-1] == ' ' ) r--;
752
391k
        if (bcf_hrec_set_val(hrec, hrec->nkeys-1, p, r-p, quoted) < 0)
753
0
            goto fail;
754
391k
        if ( quoted && *q==ending ) q++;
755
391k
        if ( *q=='>' )
756
110k
        {
757
110k
            if (nopen) nopen--;     // this can happen with nested angle brackets <>
758
110k
            q++;
759
110k
        }
760
391k
    }
761
154k
    if ( nopen )
762
44.2k
        hts_log_warning("Incomplete header line, trying to proceed anyway:\n\t[%s]\n\t[%d]",line,q[0]);
763
764
    // Skip to end of line
765
154k
    int nonspace = 0;
766
154k
    p = q;
767
873k
    while ( *q && *q!='\n' ) { nonspace |= !isspace_c(*q); q++; }
768
154k
    if (nonspace) {
769
935
        char buffer[320];
770
935
        hts_log_warning("Dropped trailing junk from header line '%s'",
771
935
                        hts_strprint(buffer, sizeof(buffer),
772
935
                                     '"', line, q - line));
773
935
    }
774
775
154k
    *len = q - line + (*q ? 1 : 0);
776
154k
    return hrec;
777
778
27
 fail:
779
27
    *len = -1;
780
27
    bcf_hrec_destroy(hrec);
781
27
    return NULL;
782
783
24.4k
 malformed_line:
784
24.4k
    {
785
24.4k
        char buffer[320];
786
39.9M
        while ( *q && *q!='\n' ) q++;  // Ensure *len includes full line
787
24.4k
        hts_log_error("Could not parse the header line: %s",
788
24.4k
                      hts_strprint(buffer, sizeof(buffer),
789
24.4k
                                   '"', line, q - line));
790
24.4k
        *len = q - line + (*q ? 1 : 0);
791
24.4k
        bcf_hrec_destroy(hrec);
792
24.4k
        return NULL;
793
168k
    }
794
168k
}
795
796
/*
 * Assign an index in hdr->id[dict_type] to tag.
 *
 * If idinfo->id is -1 the next free index is taken; otherwise the preset
 * index is kept, provided that slot is not already occupied.  The id
 * array is grown as needed and the slot's key is set to tag.
 *
 * Returns 0 on success and -1 on error (errno is set to EINVAL when the
 * requested index is already in use).
 */
static int bcf_hdr_set_idx(bcf_hdr_t *hdr, int dict_type, const char *tag, bcf_idinfo_t *idinfo)
{
    const int in_use = hdr->n[dict_type];
    size_t new_n;

    // If available, preserve existing IDX
    if ( idinfo->id==-1 )
    {
        idinfo->id = in_use;
    }
    else if ( idinfo->id < in_use && hdr->id[dict_type][idinfo->id].key )
    {
        hts_log_error("Conflicting IDX=%d lines in the header dictionary, the new tag is %s",
            idinfo->id, tag);
        errno = EINVAL;
        return -1;
    }

    // The array must reach at least as far as the chosen slot
    if ( idinfo->id >= in_use )
        new_n = idinfo->id+1;
    else
        new_n = in_use;
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
    // hts_resize() can attempt to allocate up to 2 * requested items
    if (new_n > FUZZ_ALLOC_LIMIT/(2 * sizeof(bcf_idpair_t)))
        return -1;
#endif
    if (hts_resize(bcf_idpair_t, new_n, &hdr->m[dict_type],
                   &hdr->id[dict_type], HTS_RESIZE_CLEAR)) {
        return -1;
    }
    hdr->n[dict_type] = new_n;

    // NB: the next kh_put call can invalidate the idinfo pointer, therefore
    // we leave it unassigned here. It must be set explicitly in bcf_hdr_sync.
    hdr->id[dict_type][idinfo->id].key = tag;

    return 0;
}
829
830
// returns: 1 when hdr needs to be synced, -1 on error, 0 otherwise
831
static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
832
206k
{
833
    // contig
834
206k
    int i, ret, replacing = 0;
835
206k
    khint_t k;
836
206k
    char *str = NULL;
837
838
206k
    bcf_hrec_set_type(hrec);
839
840
206k
    if ( hrec->type==BCF_HL_CTG )
841
21.2k
    {
842
21.2k
        hts_pos_t len = 0;
843
844
        // Get the contig ID ($str) and length ($j)
845
21.2k
        i = bcf_hrec_find_key(hrec,"length");
846
21.2k
        if ( i<0 ) len = 0;
847
3.43k
        else {
848
3.43k
            char *end = hrec->vals[i];
849
3.43k
            len = strtoll(hrec->vals[i], &end, 10);
850
3.43k
            if (end == hrec->vals[i] || len < 0) return 0;
851
3.43k
        }
852
853
19.9k
        i = bcf_hrec_find_key(hrec,"ID");
854
19.9k
        if ( i<0 ) return 0;
855
10.8k
        str = strdup(hrec->vals[i]);
856
10.8k
        if (!str) return -1;
857
858
        // Register in the dictionary
859
10.8k
        vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_CTG];
860
10.8k
        khint_t k = kh_get(vdict, d, str);
861
10.8k
        if ( k != kh_end(d) ) { // already present
862
1.48k
            free(str); str=NULL;
863
1.48k
            if (kh_val(d, k).hrec[0] != NULL) // and not removed
864
1.48k
                return 0;
865
0
            replacing = 1;
866
9.38k
        } else {
867
9.38k
            k = kh_put(vdict, d, str, &ret);
868
9.38k
            if (ret < 0) { free(str); return -1; }
869
9.38k
        }
870
871
9.38k
        int idx = bcf_hrec_find_key(hrec,"IDX");
872
9.38k
        if ( idx!=-1 )
873
2.67k
        {
874
2.67k
            char *tmp = hrec->vals[idx];
875
2.67k
            idx = strtol(hrec->vals[idx], &tmp, 10);
876
2.67k
            if ( *tmp || idx < 0 || idx >= INT_MAX - 1)
877
2.63k
            {
878
2.63k
                if (!replacing) {
879
2.63k
                    kh_del(vdict, d, k);
880
2.63k
                    free(str);
881
2.63k
                }
882
2.63k
                hts_log_warning("Error parsing the IDX tag, skipping");
883
2.63k
                return 0;
884
2.63k
            }
885
2.67k
        }
886
887
6.74k
        kh_val(d, k) = bcf_idinfo_def;
888
6.74k
        kh_val(d, k).id = idx;
889
6.74k
        kh_val(d, k).info[0] = len;
890
6.74k
        kh_val(d, k).hrec[0] = hrec;
891
6.74k
        if (bcf_hdr_set_idx(hdr, BCF_DT_CTG, kh_key(d,k), &kh_val(d,k)) < 0) {
892
18
            if (!replacing) {
893
18
                kh_del(vdict, d, k);
894
18
                free(str);
895
18
            }
896
18
            return -1;
897
18
        }
898
6.73k
        if ( idx==-1 ) {
899
6.71k
            if (hrec_add_idx(hrec, kh_val(d,k).id) < 0) {
900
0
               return -1;
901
0
            }
902
6.71k
        }
903
904
6.73k
        return 1;
905
6.73k
    }
906
907
185k
    if ( hrec->type==BCF_HL_STR ) return 1;
908
174k
    if ( hrec->type!=BCF_HL_INFO && hrec->type!=BCF_HL_FLT && hrec->type!=BCF_HL_FMT ) return 0;
909
910
    // INFO/FILTER/FORMAT
911
130k
    char *id = NULL;
912
130k
    uint32_t type = UINT32_MAX, var = UINT32_MAX;
913
130k
    int num = -1, idx = -1;
914
474k
    for (i=0; i<hrec->nkeys; i++)
915
344k
    {
916
344k
        if ( !strcmp(hrec->keys[i], "ID") ) id = hrec->vals[i];
917
235k
        else if ( !strcmp(hrec->keys[i], "IDX") )
918
1.96k
        {
919
1.96k
            char *tmp = hrec->vals[i];
920
1.96k
            idx = strtol(hrec->vals[i], &tmp, 10);
921
1.96k
            if ( *tmp || idx < 0 || idx >= INT_MAX - 1)
922
568
            {
923
568
                hts_log_warning("Error parsing the IDX tag, skipping");
924
568
                return 0;
925
568
            }
926
1.96k
        }
927
233k
        else if ( !strcmp(hrec->keys[i], "Type") )
928
66.5k
        {
929
66.5k
            if ( !strcmp(hrec->vals[i], "Integer") ) type = BCF_HT_INT;
930
63.4k
            else if ( !strcmp(hrec->vals[i], "Float") ) type = BCF_HT_REAL;
931
62.1k
            else if ( !strcmp(hrec->vals[i], "String") ) type = BCF_HT_STR;
932
9.76k
            else if ( !strcmp(hrec->vals[i], "Character") ) type = BCF_HT_STR;
933
9.38k
            else if ( !strcmp(hrec->vals[i], "Flag") ) type = BCF_HT_FLAG;
934
7.01k
            else
935
7.01k
            {
936
7.01k
                hts_log_warning("The type \"%s\" is not supported, assuming \"String\"", hrec->vals[i]);
937
7.01k
                type = BCF_HT_STR;
938
7.01k
            }
939
66.5k
        }
940
167k
        else if ( !strcmp(hrec->keys[i], "Number") )
941
58.6k
        {
942
58.6k
            int is_fmt = hrec->type == BCF_HL_FMT;
943
58.6k
            if ( !strcmp(hrec->vals[i],"A") ) var = BCF_VL_A;
944
56.7k
            else if ( !strcmp(hrec->vals[i],"R") ) var = BCF_VL_R;
945
56.6k
            else if ( !strcmp(hrec->vals[i],"G") ) var = BCF_VL_G;
946
55.4k
            else if ( !strcmp(hrec->vals[i],".") ) var = BCF_VL_VAR;
947
55.3k
            else if ( is_fmt && !strcmp(hrec->vals[i],"P") )  var = BCF_VL_P;
948
55.3k
            else if ( is_fmt && !strcmp(hrec->vals[i],"LA") ) var = BCF_VL_LA;
949
55.3k
            else if ( is_fmt && !strcmp(hrec->vals[i],"LR") ) var = BCF_VL_LR;
950
55.3k
            else if ( is_fmt && !strcmp(hrec->vals[i],"LG") ) var = BCF_VL_LG;
951
55.3k
            else if ( is_fmt && !strcmp(hrec->vals[i],"M") )  var = BCF_VL_M;
952
55.3k
            else
953
55.3k
            {
954
55.3k
                if (sscanf(hrec->vals[i],"%d",&num) == 1)
955
54.6k
                    var = BCF_VL_FIXED;
956
55.3k
            }
957
58.6k
            if (var != BCF_VL_FIXED) num = 0xfffff;
958
58.6k
        }
959
344k
    }
960
130k
    if (hrec->type == BCF_HL_INFO || hrec->type == BCF_HL_FMT) {
961
83.5k
        if (type == -1) {
962
18.4k
            hts_log_warning("%s %s field has no Type defined. Assuming String",
963
18.4k
                *hrec->key == 'I' ? "An" : "A", hrec->key);
964
18.4k
            type = BCF_HT_STR;
965
18.4k
        }
966
83.5k
        if (var == UINT32_MAX) {
967
25.6k
            hts_log_warning("%s %s field has no Number defined. Assuming '.'",
968
25.6k
                *hrec->key == 'I' ? "An" : "A", hrec->key);
969
25.6k
            var = BCF_VL_VAR;
970
25.6k
        }
971
83.5k
        if ( type==BCF_HT_FLAG && (var!=BCF_VL_FIXED || num!=0) )
972
1.81k
        {
973
1.81k
            hts_log_warning("The definition of Flag \"%s/%s\" is invalid, forcing Number=0", hrec->key,id);
974
1.81k
            var = BCF_VL_FIXED;
975
1.81k
            num = 0;
976
1.81k
        }
977
83.5k
    }
978
130k
    uint32_t info = ((((uint32_t)num) & 0xfffff)<<12 |
979
130k
                     (var & 0xf) << 8 |
980
130k
                     (type & 0xf) << 4 |
981
130k
                     (((uint32_t) hrec->type) & 0xf));
982
983
130k
    if ( !id ) return 0;
984
108k
    str = strdup(id);
985
108k
    if (!str) return -1;
986
987
108k
    vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_ID];
988
108k
    k = kh_get(vdict, d, str);
989
108k
    if ( k != kh_end(d) )
990
6.91k
    {
991
        // already present
992
6.91k
        free(str);
993
6.91k
        if ( kh_val(d, k).hrec[info&0xf] ) return 0;
994
2.02k
        kh_val(d, k).info[info&0xf] = info;
995
2.02k
        kh_val(d, k).hrec[info&0xf] = hrec;
996
2.02k
        if ( idx==-1 ) {
997
2.02k
            if (hrec_add_idx(hrec, kh_val(d, k).id) < 0) {
998
0
                return -1;
999
0
            }
1000
2.02k
        }
1001
2.02k
        return 1;
1002
2.02k
    }
1003
101k
    k = kh_put(vdict, d, str, &ret);
1004
101k
    if (ret < 0) {
1005
0
        free(str);
1006
0
        return -1;
1007
0
    }
1008
101k
    kh_val(d, k) = bcf_idinfo_def;
1009
101k
    kh_val(d, k).info[info&0xf] = info;
1010
101k
    kh_val(d, k).hrec[info&0xf] = hrec;
1011
101k
    kh_val(d, k).id = idx;
1012
101k
    if (bcf_hdr_set_idx(hdr, BCF_DT_ID, kh_key(d,k), &kh_val(d,k)) < 0) {
1013
6
        kh_del(vdict, d, k);
1014
6
        free(str);
1015
6
        return -1;
1016
6
    }
1017
101k
    if ( idx==-1 ) {
1018
100k
        if (hrec_add_idx(hrec, kh_val(d,k).id) < 0) {
1019
0
            return -1;
1020
0
        }
1021
100k
    }
1022
1023
101k
    return 1;
1024
101k
}
1025
1026
/*
 * Detach `hrec` from the ID / contig dictionary.
 *
 * Only FILTER, INFO, FORMAT and contig records live in those dictionaries.
 * The dictionary entry itself is kept; just its pointer to the record is
 * cleared, which marks the entry as removed.
 */
static void bcf_hdr_unregister_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
{
    vdict_t *dict;
    int slot, id_pos;
    khint_t k;

    switch (hrec->type) {
    case BCF_HL_FLT:
    case BCF_HL_INFO:
    case BCF_HL_FMT:
        dict = (vdict_t*)hdr->dict[BCF_DT_ID];
        slot = hrec->type;
        break;
    case BCF_HL_CTG:
        dict = (vdict_t*)hdr->dict[BCF_DT_CTG];
        slot = 0;
        break;
    default:
        return;     // not a dictionary-backed record type
    }

    id_pos = bcf_hrec_find_key(hrec, "ID");
    if (id_pos < 0 || !hrec->vals[id_pos])
        return;

    k = kh_get(vdict, dict, hrec->vals[id_pos]);
    if (k == kh_end(dict))
        return;

    kh_val(dict, k).hrec[slot] = NULL;
}
1043
1044
/*
 * Remove the entry for `hrec` from the hash of generic / structured header
 * lines (aux->gen).  Only BCF_HL_GEN records and BCF_HL_STR records that
 * have an ID are handled; anything else is ignored.
 *
 * The hash key is rebuilt from the record.  If that is not possible the
 * table is scanned for an entry whose value is `hrec`.
 */
static void bcf_hdr_remove_from_hdict(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
{
    bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
    kstring_t name = KS_INITIALIZE;
    khint_t slot;

    if (hrec->type == BCF_HL_GEN) {
        if (ksprintf(&name, "##%s=%s", hrec->key,hrec->value) < 0)
            name.l = 0;
    } else if (hrec->type == BCF_HL_STR) {
        int id = bcf_hrec_find_key(hrec, "ID");
        if (id < 0)
            return;
        if (!hrec->vals[id] ||
            ksprintf(&name, "##%s=<ID=%s>", hrec->key, hrec->vals[id]) < 0)
            name.l = 0;
    } else {
        return;
    }

    if (name.l) {
        slot = kh_get(hdict, aux->gen, name.s);
    } else {
        // Couldn't get a string for some reason, so try the hard way...
        slot = kh_begin(aux->gen);
        while (slot < kh_end(aux->gen)
               && !(kh_exist(aux->gen, slot) && kh_val(aux->gen, slot) == hrec))
            slot++;
    }

    // Only delete when the entry really belongs to this record
    if (slot != kh_end(aux->gen) && kh_val(aux->gen, slot) == hrec) {
        kh_val(aux->gen, slot) = NULL;
        free((char *) kh_key(aux->gen, slot));
        kh_key(aux->gen, slot) = NULL;
        kh_del(hdict, aux->gen, slot);
    }

    free(name.s);
}
1084
1085
/*
 * Replace the value of the generic header record `hrec` (which must already
 * belong to `hdr`) with tmp->value, and re-key its entry in the hash of
 * generic lines to "##<tmp->key>=<tmp->value>".
 *
 * Everything that can fail for lack of memory before the hash is touched
 * (building the new key, duplicating the value) is done first, so those
 * failures leave the header unchanged.  If the final kh_put() itself fails
 * the old hash entry has already been dropped.
 *
 * Returns 0 on success, -1 on failure.
 */
int bcf_hdr_update_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec, const bcf_hrec_t *tmp)
{
    assert( hrec->type==BCF_HL_GEN );
    int ret;
    khint_t k;
    bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
    kstring_t str = {0,0,0};
    char *new_value;

    // Prepare the new key and value before modifying anything
    if ( ksprintf(&str, "##%s=%s", tmp->key,tmp->value) < 0 )
    {
        free(str.s);
        return -1;
    }
    new_value = strdup(tmp->value);
    if ( !new_value )
    {
        free(str.s);
        return -1;
    }

    // Locate the hash entry that currently refers to hrec
    for (k=kh_begin(aux->gen); k<kh_end(aux->gen); k++)
    {
        if ( !kh_exist(aux->gen,k) ) continue;
        if ( hrec!=(bcf_hrec_t*)kh_val(aux->gen,k) ) continue;
        break;
    }
    assert( k<kh_end(aux->gen) );   // something went wrong, should never happen
    if ( k>=kh_end(aux->gen) )
    {
        // Reached only when assertions are compiled out
        free(str.s);
        free(new_value);
        return -1;
    }

    free((char*)kh_key(aux->gen,k));
    kh_del(hdict,aux->gen,k);

    k = kh_put(hdict, aux->gen, str.s, &ret);
    if ( ret<0 )
    {
        free(str.s);
        free(new_value);
        return -1;
    }
    // ret==0: an identical key is already in the table, so the table kept
    // its own key string and ours is not referenced by anything.
    if ( ret==0 ) free(str.s);

    free(hrec->value);
    hrec->value = new_value;
    kh_val(aux->gen,k) = hrec;

    if (!strcmp(hrec->key,"fileformat")) {
        //update version
        aux->version = bcf_get_version(NULL, hrec->value);
    }
    return 0;
}
1123
1124
/*
 * Add a header record to the header.
 *
 * Returns 1 if a non-generic record was added, 0 if nothing needs syncing
 * (NULL record, duplicate, or a generic record was added), and -1 on error.
 *
 * Duplicates are destroyed here.  On a -1 return the record has not been
 * stored in hdr->hrec and is not destroyed by this function.
 */
int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
{
    bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
    kstring_t hkey = {0,0,0};   // hash key for generic / structured lines
    bcf_hrec_t **grown;
    khint_t slot;
    int status, id_pos;

    if ( hrec == NULL ) return 0;

    bcf_hrec_check(hrec);   // todo: check return status and propagate errors up

    status = bcf_hdr_register_hrec(hdr,hrec);
    if (status < 0) return -1;

    if ( status == 0 )
    {
        // If one of the hashed field, then it is already present
        if ( hrec->type != BCF_HL_GEN )
            goto duplicate;

        // Is one of the generic fields and already present?
        if ( ksprintf(&hkey, "##%s=%s", hrec->key,hrec->value) < 0 )
            goto fail;
        if ( kh_get(hdict, aux->gen, hkey.s) != kh_end(aux->gen) )
            goto duplicate;

        if ( strcmp(hrec->key, "fileformat") == 0 )
            aux->version = bcf_get_version(NULL, hrec->value);
    }

    if ( hrec->type==BCF_HL_STR )
    {
        id_pos = bcf_hrec_find_key(hrec,"ID");
        if ( id_pos >= 0 )
        {
            if ( ksprintf(&hkey, "##%s=<ID=%s>", hrec->key,hrec->vals[id_pos]) < 0 )
                goto fail;
            if ( kh_get(hdict, aux->gen, hkey.s) != kh_end(aux->gen) )
                goto duplicate;
        }
    }

    // New record, needs to be added
    grown = realloc(hdr->hrec, (hdr->nhrec + 1)*sizeof(bcf_hrec_t*));
    if ( !grown )
    {
        bcf_hdr_unregister_hrec(hdr, hrec);
        goto fail;
    }
    hdr->hrec = grown;

    if ( hkey.s )
    {
        // The table takes over the key string on success
        slot = kh_put(hdict, aux->gen, hkey.s, &status);
        if ( status < 0 )
            goto fail;
        kh_val(aux->gen,slot) = hrec;
    }

    hdr->hrec[hdr->nhrec++] = hrec;
    hdr->dirty = 1;

    return hrec->type==BCF_HL_GEN ? 0 : 1;

 duplicate:
    bcf_hrec_destroy(hrec);
    free(hkey.s);
    return 0;

 fail:
    free(hkey.s);
    return -1;
}
1208
1209
bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class)
1210
1.21k
{
1211
1.21k
    int i;
1212
1.21k
    if ( type==BCF_HL_GEN )
1213
1.21k
    {
1214
        // e.g. ##fileformat=VCFv4.2
1215
        //      ##source=GenomicsDBImport
1216
        //      ##bcftools_viewVersion=1.16-80-gdfdb0923+htslib-1.16-34-g215d364
1217
1.21k
        if ( value )
1218
0
        {
1219
0
            kstring_t str = {0,0,0};
1220
0
            ksprintf(&str, "##%s=%s", key,value);
1221
0
            bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1222
0
            khint_t k = kh_get(hdict, aux->gen, str.s);
1223
0
            free(str.s);
1224
0
            if ( k == kh_end(aux->gen) ) return NULL;
1225
0
            return kh_val(aux->gen, k);
1226
0
        }
1227
2.56k
        for (i=0; i<hdr->nhrec; i++)
1228
1.73k
        {
1229
1.73k
            if ( hdr->hrec[i]->type!=type ) continue;
1230
555
            if ( strcmp(hdr->hrec[i]->key,key) ) continue;
1231
387
            return hdr->hrec[i];
1232
555
        }
1233
826
        return NULL;
1234
1.21k
    }
1235
0
    else if ( type==BCF_HL_STR )
1236
0
    {
1237
        // e.g. ##GATKCommandLine=<ID=GenomicsDBImport,CommandLine="GenomicsDBImport....">
1238
        //      ##ALT=<ID=NON_REF,Description="Represents any possible alternative allele not already represented at this location by REF and ALT">
1239
0
        if (!str_class) return NULL;
1240
0
        if ( !strcmp("ID",key) )
1241
0
        {
1242
0
            kstring_t str = {0,0,0};
1243
0
            ksprintf(&str, "##%s=<%s=%s>",str_class,key,value);
1244
0
            bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1245
0
            khint_t k = kh_get(hdict, aux->gen, str.s);
1246
0
            free(str.s);
1247
0
            if ( k == kh_end(aux->gen) ) return NULL;
1248
0
            return kh_val(aux->gen, k);
1249
0
        }
1250
0
        for (i=0; i<hdr->nhrec; i++)
1251
0
        {
1252
0
            if ( hdr->hrec[i]->type!=type ) continue;
1253
0
            if ( strcmp(hdr->hrec[i]->key,str_class) ) continue;
1254
0
            int j = bcf_hrec_find_key(hdr->hrec[i],key);
1255
0
            if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],value) ) return hdr->hrec[i];
1256
0
        }
1257
0
        return NULL;
1258
0
    }
1259
0
    vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
1260
0
    khint_t k = kh_get(vdict, d, value);
1261
0
    if ( k == kh_end(d) ) return NULL;
1262
0
    return kh_val(d, k).hrec[type==BCF_HL_CTG?0:type];
1263
0
}
1264
1265
// Check the VCF header is correctly formatted as per the specification.
1266
// Note the code that calls this doesn't bother to check return values and
1267
// we have so many broken VCFs in the wild that for now we just report a
1268
// warning and continue anyway.  So currently this is a void function.
1269
void bcf_hdr_check_sanity(bcf_hdr_t *hdr)
{
    // Expected declaration of a well-known tag
    struct tag {
        char name[10];
        char number_str[3];     // Number= as it should appear in the header
        int number;             // matching BCF_VL_* length code
        int version;            // file format version associated with the tag
        int type;               // BCF_HT_* value type
    };

    // Indexed by the BCF_HT_* value stored in struct tag
    const char type_str[][8] = {"Flag", "Integer", "Float", "String"};

    const struct tag info_tags[] = {
        {"AD",        "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
        {"ADF",       "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
        {"ADR",       "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
        {"AC",        "A",  BCF_VL_A,     VCF_DEF, BCF_HT_INT},
        {"AF",        "A",  BCF_VL_A,     VCF_DEF, BCF_HT_REAL},
        {"CIGAR",     "A",  BCF_VL_A,     VCF_DEF, BCF_HT_STR},
        {"AA",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_STR},
        {"AN",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
        {"BQ",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_REAL},
        {"DB",        "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
        {"DP",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
        {"END",       "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
        {"H2",        "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
        {"H3",        "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
        {"MQ",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_REAL},
        {"MQ0",       "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
        {"NS",        "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
        {"SB",        "4",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
        {"SOMATIC",   "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
        {"VALIDATED", "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
        {"1000G",     "0",  BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG},
    };
    const size_t n_info = sizeof(info_tags)/sizeof(info_tags[0]);
    // Set once a tag has been reported, so it is not checked again
    static int info_warned[sizeof(info_tags)/sizeof(*info_tags)] = {0};

    const struct tag fmt_tags[] = {
        {"AD",   "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
        {"ADF",  "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
        {"ADR",  "R",  BCF_VL_R,     VCF_DEF, BCF_HT_INT},
        {"EC",   "A",  BCF_VL_A,     VCF_DEF, BCF_HT_INT},
        {"GL",   "G",  BCF_VL_G,     VCF_DEF, BCF_HT_REAL},
        {"GP",   "G",  BCF_VL_G,     VCF_DEF, BCF_HT_REAL},
        {"PL",   "G",  BCF_VL_G,     VCF_DEF, BCF_HT_INT},
        {"PP",   "G",  BCF_VL_G,     VCF_DEF, BCF_HT_INT},
        {"DP",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
        {"LEN",  "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
        {"FT",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_STR},
        {"GQ",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
        {"GT",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_STR},
        {"HQ",   "2",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
        {"MQ",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
        {"PQ",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
        {"PS",   "1",  BCF_VL_FIXED, VCF_DEF, BCF_HT_INT},
        {"PSL",  "P",  BCF_VL_P,     VCF44,   BCF_HT_STR},
        {"PSO",  "P",  BCF_VL_P,     VCF44,   BCF_HT_INT},
        {"PSQ",  "P",  BCF_VL_P,     VCF44,   BCF_HT_INT},
        {"LGL",  "LG", BCF_VL_LG,    VCF45,   BCF_HT_INT},
        {"LGP",  "LG", BCF_VL_LG,    VCF45,   BCF_HT_INT},
        {"LPL",  "LG", BCF_VL_LG,    VCF45,   BCF_HT_INT},
        {"LPP",  "LG", BCF_VL_LG,    VCF45,   BCF_HT_INT},
        {"LEC",  "LA", BCF_VL_LA,    VCF45,   BCF_HT_INT},
        {"LAD",  "LR", BCF_VL_LR,    VCF45,   BCF_HT_INT},
        {"LADF", "LR", BCF_VL_LR,    VCF45,   BCF_HT_INT},
        {"LADR", "LR", BCF_VL_LR,    VCF45,   BCF_HT_INT},
    };
    const size_t n_fmt = sizeof(fmt_tags)/sizeof(fmt_tags[0]);
    static int fmt_warned[sizeof(fmt_tags)/sizeof(*fmt_tags)] = {0};

    const int version = bcf_get_version(hdr, NULL);
    size_t i;

    // Check INFO tag numbers.  We shouldn't really permit ".", but it's
    // commonly misused so we let it slide unless it's a new tag and the
    // file format claims to be new also.  We also cannot distinguish between
    // Number=1 and Number=2, but we at least report the correct term if we
    // get, say, Number=G in its place.
    // Also check the types.
    for (i = 0; i < n_info; i++) {
        const struct tag *want = &info_tags[i];
        int id, length;

        if (info_warned[i])
            continue;

        id = bcf_hdr_id2int(hdr, BCF_DT_ID, want->name);
        if (!bcf_hdr_idinfo_exists(hdr, BCF_HL_INFO, id))
            continue;

        length = bcf_hdr_id2length(hdr, BCF_HL_INFO, id);
        if (length != want->number && length != BCF_VL_VAR) {
            info_warned[i] = 1;
        } else if (length == BCF_VL_FIXED &&
                   bcf_hdr_id2number(hdr, BCF_HL_INFO, id) != atoi(want->number_str)) {
            info_warned[i] = 1;
        }

        if (info_warned[i]) {
            hts_log_warning("%s should be declared as Number=%s",
                            want->name, want->number_str);
        }

        if (bcf_hdr_id2type(hdr, BCF_HL_INFO, id) != want->type) {
            hts_log_warning("%s should be declared as Type=%s",
                            want->name, type_str[want->type]);
            info_warned[i] = 1;
        }
    }

    // Check FORMAT tag numbers and types.
    for (i = 0; i < n_fmt; i++) {
        const struct tag *want = &fmt_tags[i];
        int id, length;

        if (fmt_warned[i])
            continue;

        id = bcf_hdr_id2int(hdr, BCF_DT_ID, want->name);
        if (!bcf_hdr_idinfo_exists(hdr, BCF_HL_FMT, id))
            continue;

        length = bcf_hdr_id2length(hdr, BCF_HL_FMT, id);
        if (length != want->number) {
            // Permit "Number=." if this tag predates the vcf version it is
            // defined within.  This is a common tactic for callers to use
            // new tags with older formats in order to avoid parsing failures
            // with some software.
            // We don't care for 4.3 and earlier as that's more of a wild-west
            // and it's not abnormal to see incorrect usage of Number=. there.
            if ((version < VCF44 && length != BCF_VL_VAR) ||
                (version >= VCF44 && version >= want->version)) {
                fmt_warned[i] = 1;
            }
        } else if (length == BCF_VL_FIXED &&
                   bcf_hdr_id2number(hdr, BCF_HL_FMT, id) != atoi(want->number_str)) {
            fmt_warned[i] = 1;
        }

        if (fmt_warned[i]) {
            hts_log_warning("%s should be declared as Number=%s",
                            want->name, want->number_str);
        }

        if (bcf_hdr_id2type(hdr, BCF_HL_FMT, id) != want->type) {
            hts_log_warning("%s should be declared as Type=%s",
                            want->name, type_str[want->type]);
            fmt_warned[i] = 1;
        }
    }
}
1409
1410
/*
 * Parse the complete textual header `htxt` into `hdr`.
 *
 * Header lines that cannot be parsed are reported and skipped.  A missing
 * sample ("#CHROM") line, or any failure while adding a record, is fatal.
 *
 * Returns 0 on success, -1 on failure.
 */
int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt)
{
    char *cursor = htxt;
    bcf_hrec_t *hrec;
    int len;

    // Check sanity: "fileformat" string must come as first
    hrec = bcf_hdr_parse_line(hdr,cursor,&len);
    if ( !hrec || !hrec->key || strcasecmp(hrec->key,"fileformat") )
        hts_log_warning("The first line should be ##fileformat; is the VCF/BCF header broken?");
    if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
        bcf_hrec_destroy(hrec);
        return -1;
    }

    // The filter PASS must appear first in the dictionary
    hrec = bcf_hdr_parse_line(hdr,"##FILTER=<ID=PASS,Description=\"All filters passed\">",&len);
    if (!hrec || bcf_hdr_add_hrec(hdr, hrec) < 0) {
        bcf_hrec_destroy(hrec);
        return -1;
    }

    // Parse the whole header, one line per iteration, starting again from
    // the top of the text.
    for (;;) {
        char *eol;

        hrec = bcf_hdr_parse_line(hdr, cursor, &len);
        if (hrec != NULL) {
            if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
                bcf_hrec_destroy(hrec);
                return -1;
            }
            cursor += len;
            continue;
        }

        if (len < 0) {
            // len < 0 indicates out-of-memory, or similar error
            hts_log_error("Could not parse header line: %s", strerror(errno));
            return -1;
        }
        if (len > 0) {
            // Bad header line.  bcf_hdr_parse_line() will have logged it.
            // Skip and try again on the next line (cursor + len will be the
            // start of the next one).
            cursor += len;
            continue;
        }

        // Next should be the sample line.
        if ( !strncmp("#CHROM\t",cursor,7) || !strncmp("#CHROM ",cursor,7) )
            break;

        // If not, it was a malformed header, in which case print a warning
        // and skip (many VCF operations do not really care about a few
        // malformed lines).  In the future we may want to add a strict mode
        // that errors in this case.
        eol = strchr(cursor, '\n');
        if (*cursor != '\0') {
            char buffer[320];
            hts_log_warning("Could not parse header line: %s",
                            hts_strprint(buffer, sizeof(buffer),
                                           '"', cursor,
                                           eol ? (eol - cursor) : SIZE_MAX));
        }
        if (!eol) {
            // No more lines left and no sample line seen: this is fatal.
            hts_log_error("Could not parse the header, sample line not found");
            return -1;
        }
        cursor = eol + 1; // Try from the next line.
    }

    if (bcf_hdr_parse_sample_line(hdr,cursor) < 0)
        return -1;
    if (bcf_hdr_sync(hdr) < 0)
        return -1;
    bcf_hdr_check_sanity(hdr);
    return 0;
}
1490
1491
/*
 * Parse a single header line and add it to the header.
 *
 * Returns 0 on success, -1 if the line cannot be parsed or cannot be added.
 */
int bcf_hdr_append(bcf_hdr_t *hdr, const char *line)
{
    int len;
    bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr, (char*) line, &len);
    if ( !hrec ) return -1;
    if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
        // bcf_hdr_add_hrec() does not take ownership when it fails, so the
        // record has to be released here or it is leaked.
        bcf_hrec_destroy(hrec);
        return -1;
    }
    return 0;
}
1500
1501
void bcf_hdr_remove(bcf_hdr_t *hdr, int type, const char *key)
1502
0
{
1503
0
    int i = 0;
1504
0
    bcf_hrec_t *hrec;
1505
0
    if ( !key )
1506
0
    {
1507
        // no key, remove all entries of this type
1508
0
        while ( i<hdr->nhrec )
1509
0
        {
1510
0
            if ( hdr->hrec[i]->type!=type ) { i++; continue; }
1511
0
            hrec = hdr->hrec[i];
1512
0
            bcf_hdr_unregister_hrec(hdr, hrec);
1513
0
            bcf_hdr_remove_from_hdict(hdr, hrec);
1514
0
            hdr->dirty = 1;
1515
0
            hdr->nhrec--;
1516
0
            if ( i < hdr->nhrec )
1517
0
                memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
1518
0
            bcf_hrec_destroy(hrec);
1519
0
        }
1520
0
        return;
1521
0
    }
1522
0
    while (1)
1523
0
    {
1524
0
        if ( type==BCF_HL_FLT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG )
1525
0
        {
1526
0
            hrec = bcf_hdr_get_hrec(hdr, type, "ID", key, NULL);
1527
0
            if ( !hrec ) return;
1528
1529
0
            for (i=0; i<hdr->nhrec; i++)
1530
0
                if ( hdr->hrec[i]==hrec ) break;
1531
0
            assert( i<hdr->nhrec );
1532
1533
0
            vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
1534
0
            khint_t k = kh_get(vdict, d, key);
1535
0
            kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL;
1536
0
        }
1537
0
        else
1538
0
        {
1539
0
            for (i=0; i<hdr->nhrec; i++)
1540
0
            {
1541
0
                if ( hdr->hrec[i]->type!=type ) continue;
1542
0
                if ( type==BCF_HL_GEN )
1543
0
                {
1544
0
                    if ( !strcmp(hdr->hrec[i]->key,key) ) break;
1545
0
                }
1546
0
                else
1547
0
                {
1548
                    // not all structured lines have ID, we could be more sophisticated as in bcf_hdr_get_hrec()
1549
0
                    int j = bcf_hrec_find_key(hdr->hrec[i], "ID");
1550
0
                    if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],key) ) break;
1551
0
                }
1552
0
            }
1553
0
            if ( i==hdr->nhrec ) return;
1554
0
            hrec = hdr->hrec[i];
1555
0
            bcf_hdr_remove_from_hdict(hdr, hrec);
1556
0
        }
1557
1558
0
        hdr->nhrec--;
1559
0
        if ( i < hdr->nhrec )
1560
0
            memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
1561
0
        bcf_hrec_destroy(hrec);
1562
0
        hdr->dirty = 1;
1563
0
    }
1564
0
}
1565
1566
/**
 *  bcf_hdr_printf - format a header line printf-style and append it
 *  @param hdr - header to append the formatted line to
 *  @param fmt - printf-style format string, followed by its arguments
 *
 *  Returns the value returned by bcf_hdr_append(), or -1 if the line could
 *  not be formatted or a buffer for it could not be allocated.
 */
int bcf_hdr_printf(bcf_hdr_t *hdr, const char *fmt, ...)
{
    char tmp[256], *line = tmp;
    va_list ap;
    va_start(ap, fmt);
    int n = vsnprintf(line, sizeof(tmp), fmt, ap);
    va_end(ap);

    // vsnprintf() signals an output error with a negative value.  It must
    // be caught before the size comparison, which is done unsigned.
    if (n < 0)
        return -1;

    if ((size_t) n >= sizeof(tmp)) {
        // Output did not fit in the stack buffer; format again on the heap
        size_t len = (size_t) n + 1; // For trailing NUL
        line = (char*)malloc(len);
        if (!line)
            return -1;

        va_start(ap, fmt);
        n = vsnprintf(line, len, fmt, ap);
        va_end(ap);
        if (n < 0) {
            free(line);
            return -1;
        }
    }

    int ret = bcf_hdr_append(hdr, line);

    if (line != tmp) free(line);
    return ret;
}
1590
1591
1592
/**********************
1593
 *** BCF header I/O ***
1594
 **********************/
1595
1596
const char *bcf_hdr_get_version(const bcf_hdr_t *hdr)
1597
1.21k
{
1598
1.21k
    bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
1599
1.21k
    if ( !hrec )
1600
826
    {
1601
826
        hts_log_warning("No version string found, assuming VCFv4.2");
1602
826
        return "VCFv4.2";
1603
826
    }
1604
387
    return hrec->value;
1605
1.21k
}
1606
1607
/**
 *  bcf_hdr_set_version - set the "fileformat" version of a header
 *  @param hdr     - header to modify
 *  @param version - new version string
 *
 *  Returns 0 on success, -1 if a string or record could not be built.
 *  The header is marked dirty on success.
 */
int bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version)
{
    bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
    if ( !hrec )
    {
        int len;
        kstring_t str = {0,0,0};
        if ( ksprintf(&str,"##fileformat=%s", version) < 0 )
        {
            free(str.s);
            return -1;
        }
        hrec = bcf_hdr_parse_line(hdr, str.s, &len);
        free(str.s);
        // Parsing can fail; do not dereference a NULL record below
        if ( !hrec ) return -1;

        // NOTE(review): the freshly parsed record is neither added to the
        // header nor destroyed here - confirm whether it was meant to be
        // handed to the header (e.g. via bcf_hdr_add_hrec()).
        get_hdr_aux(hdr)->version = bcf_get_version(NULL, hrec->value);
    }
    else
    {
        // Work on a copy, then let bcf_hdr_update_hrec() apply it
        bcf_hrec_t *tmp = bcf_hrec_dup(hrec);
        if ( !tmp ) return -1;
        free(tmp->value);
        tmp->value = strdup(version);
        if ( !tmp->value )
        {
            bcf_hrec_destroy(tmp); // do not leak the copy on failure
            return -1;
        }
        bcf_hdr_update_hrec(hdr, hrec, tmp);
        bcf_hrec_destroy(tmp);
    }
    hdr->dirty = 1;
    //TODO rlen may change, deal with it
    return 0; // FIXME: check for remaining errs in this function (return < 0 if so)
}
1634
1635
bcf_hdr_t *bcf_hdr_init(const char *mode)
1636
7.35k
{
1637
7.35k
    int i;
1638
7.35k
    bcf_hdr_t *h;
1639
7.35k
    h = (bcf_hdr_t*)calloc(1, sizeof(bcf_hdr_t));
1640
7.35k
    if (!h) return NULL;
1641
29.4k
    for (i = 0; i < 3; ++i) {
1642
22.0k
        if ((h->dict[i] = kh_init(vdict)) == NULL) goto fail;
1643
        // Supersize the hash to make collisions very unlikely
1644
22.0k
        static int dsize[3] = {16384,16384,2048}; // info, contig, format
1645
22.0k
        if (kh_resize(vdict, h->dict[i], dsize[i]) < 0) goto fail;
1646
22.0k
    }
1647
1648
7.35k
    bcf_hdr_aux_t *aux = (bcf_hdr_aux_t*)calloc(1,sizeof(bcf_hdr_aux_t));
1649
7.35k
    if ( !aux ) goto fail;
1650
7.35k
    if ( (aux->gen = kh_init(hdict))==NULL ) { free(aux); goto fail; }
1651
7.35k
    aux->key_len = NULL;
1652
7.35k
    aux->dict = *((vdict_t*)h->dict[0]);
1653
7.35k
    aux->version = 0;
1654
7.35k
    aux->ref_count = 1;
1655
7.35k
    free(h->dict[0]);
1656
7.35k
    h->dict[0] = aux;
1657
1658
7.35k
    if ( strchr(mode,'w') )
1659
0
    {
1660
0
        bcf_hdr_append(h, "##fileformat=VCFv4.2");
1661
        // The filter PASS must appear first in the dictionary
1662
0
        bcf_hdr_append(h, "##FILTER=<ID=PASS,Description=\"All filters passed\">");
1663
0
        aux->version = VCF_DEF;
1664
0
    }
1665
7.35k
    return h;
1666
1667
0
 fail:
1668
0
    for (i = 0; i < 3; ++i)
1669
0
        kh_destroy(vdict, h->dict[i]);
1670
0
    free(h);
1671
0
    return NULL;
1672
7.35k
}
1673
1674
/**
 *  bcf_hdr_destroy - release a header and everything it owns
 *  @param h - header to destroy; NULL is accepted and ignored
 *
 *  If the reference count held in the header's aux data is above 1, only
 *  its lowest bit is cleared and destruction is postponed.
 */
void bcf_hdr_destroy(bcf_hdr_t *h)
{
    int i;
    khint_t k;
    if (!h) return;
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
    if (aux->ref_count > 1) // Refs still held, so delay destruction
    {
        aux->ref_count &= ~1;
        return;
    }
    for (i = 0; i < 3; ++i) {
        vdict_t *d = (vdict_t*)h->dict[i];
        if (d == 0) continue;
        // Keys are owned by the dictionary, free them before the table
        for (k = kh_begin(d); k != kh_end(d); ++k)
            if (kh_exist(d, k)) free((char*)kh_key(d, k));
        if ( i==0 )
        {
            // aux data must be released before kh_destroy() on dict[0]
            for (k=kh_begin(aux->gen); k<kh_end(aux->gen); k++)
                if ( kh_exist(aux->gen,k) ) free((char*)kh_key(aux->gen,k));
            kh_destroy(hdict, aux->gen);
            free(aux->key_len); // may exist for dict[0] only
        }
        kh_destroy(vdict, d);
        free(h->id[i]);
    }
    for (i=0; i<h->nhrec; i++)
        bcf_hrec_destroy(h->hrec[i]);
    // Free the record array even when nhrec is 0: removing every record
    // decrements nhrec without releasing the array itself.
    free(h->hrec);
    free(h->samples);
    free(h->keep_samples);
    free(h->transl[0]); free(h->transl[1]);
    free(h->mem.s);
    free(h);
}
1709
1710
/**
 *  bcf_hdr_read - read the header from an open VCF or BCF file
 *  @param hfp - file handle; VCF input is delegated to vcf_hdr_read()
 *
 *  For BCF the magic string and the 32-bit little-endian text length are
 *  read, then the header text itself, which is parsed into a new header.
 *  On success the header's reference count is incremented and the header
 *  is attached to the BGZF handle as private data.
 *
 *  Returns the header, or NULL on failure.
 */
bcf_hdr_t *bcf_hdr_read(htsFile *hfp)
{
    if (hfp->format.format == vcf)
        return vcf_hdr_read(hfp);

    if (hfp->format.format != bcf) {
        hts_log_error("Input is not detected as bcf or vcf format");
        return NULL;
    }

    assert(hfp->is_bgzf);

    BGZF *bgzf = hfp->fp.bgzf;
    char *text = NULL;
    bcf_hdr_t *hdr = bcf_hdr_init("r");
    if (!hdr) {
        hts_log_error("Failed to allocate bcf header");
        return NULL;
    }

    uint8_t magic[5];
    if (bgzf_read(bgzf, magic, 5) != 5)
    {
        hts_log_error("Failed to read the header (reading BCF in text mode?)");
        bcf_hdr_destroy(hdr);
        return NULL;
    }
    if (strncmp((char*)magic, "BCF\2\2", 5) != 0)
    {
        // Distinguish "some other BCF version" from "not BCF at all"
        if (strncmp((char*)magic, "BCF", 3) == 0)
            hts_log_error("Invalid BCF2 magic string: only BCFv2.2 is supported");
        else
            hts_log_error("Invalid BCF2 magic string");
        bcf_hdr_destroy(hdr);
        return NULL;
    }

    // Header text length, stored as four little-endian bytes
    uint8_t len_le[4];
    if (bgzf_read(bgzf, len_le, 4) != 4) goto fail;
    size_t text_len = len_le[0] | (len_le[1] << 8) | (len_le[2] << 16) | ((size_t) len_le[3] << 24);
    if (text_len >= SIZE_MAX) { errno = ENOMEM; goto fail; }
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
    if (text_len > FUZZ_ALLOC_LIMIT/2) { errno = ENOMEM; goto fail; }
#endif

    text = (char*)malloc(text_len + 1);
    if (!text) goto fail;
    if (bgzf_read(bgzf, text, text_len) != text_len) goto fail;
    text[text_len] = '\0'; // Ensure the text is terminated
    if ( bcf_hdr_parse(hdr, text) < 0 ) goto fail;
    free(text);

    bcf_hdr_incr_ref(hdr);
    bgzf_set_private_data(bgzf, hdr, hdr_bgzf_private_data_cleanup);

    return hdr;

 fail:
    hts_log_error("Failed to read BCF header");
    free(text);
    bcf_hdr_destroy(hdr);
    return NULL;
}
1770
1771
/**
 *  bcf_hdr_write - write a header to an open VCF or BCF file
 *  @param hfp - output file handle
 *  @param h   - header to write; must not be NULL
 *
 *  A dirty header is synced first.  Text output is delegated to
 *  vcf_hdr_write().  For BCF the magic string, the 32-bit little-endian
 *  length and the NUL-terminated header text are written and flushed,
 *  then the header's reference count is incremented and the header is
 *  attached to the BGZF handle as private data.
 *
 *  Returns 0 on success, -1 on failure (errno is EINVAL if h is NULL).
 */
int bcf_hdr_write(htsFile *hfp, bcf_hdr_t *h)
{
    if (!h) {
        errno = EINVAL;
        return -1;
    }
    if ( h->dirty ) {
        if (bcf_hdr_sync(h) < 0) return -1;
    }
    hfp->format.category = variant_data;
    if (hfp->format.format == vcf || hfp->format.format == text_format) {
        hfp->format.format = vcf;
        return vcf_hdr_write(hfp, h);
    }

    if (hfp->format.format == binary_format)
        hfp->format.format = bcf;

    int ret = -1;
    kstring_t htxt = {0,0,0};
    if (bcf_hdr_format(h, 1, &htxt) < 0) goto done;
    if (kputc('\0', &htxt) < 0) goto done; // include the \0 byte

    BGZF *fp = hfp->fp.bgzf;
    uint8_t hlen[4];
    u32_to_le(htxt.l, hlen);
    // Every failure below goes through "done" so the text buffer is freed
    if ( bgzf_write(fp, "BCF\2\2", 5) !=5 ) goto done;
    if ( bgzf_write(fp, hlen, 4) !=4 ) goto done;
    if ( bgzf_write(fp, htxt.s, htxt.l) != htxt.l ) goto done;
    if ( bgzf_flush(fp) < 0) goto done;

    bcf_hdr_incr_ref(h);
    bgzf_set_private_data(fp, h, hdr_bgzf_private_data_cleanup);
    ret = 0;

 done:
    free(htxt.s);
    return ret;
}
1810
1811
/********************
1812
 *** BCF site I/O ***
1813
 ********************/
1814
1815
bcf1_t *bcf_init(void)
1816
5.98k
{
1817
5.98k
    bcf1_t *v;
1818
5.98k
    v = (bcf1_t*)calloc(1, sizeof(bcf1_t));
1819
5.98k
    return v;
1820
5.98k
}
1821
1822
/**
 *  bcf_clear - reset a record so that it can be reused
 *  @param v - record to reset
 *
 *  Frees INFO/FORMAT value buffers flagged as separately allocated, then
 *  resets the fixed fields, counters, lengths and flags.  The record's
 *  own arrays and string buffers are kept.
 */
void bcf_clear(bcf1_t *v)
{
    bcf_dec_t *dec = &v->d;
    int idx;

    // Flagged INFO buffers start vptr_off bytes before vptr
    for (idx = 0; idx < dec->m_info; idx++)
    {
        bcf_info_t *info = &dec->info[idx];
        if ( !info->vptr_free ) continue;
        free(info->vptr - info->vptr_off);
        info->vptr_free = 0;
    }

    // Same for FORMAT buffers, which start p_off bytes before p
    for (idx = 0; idx < dec->m_fmt; idx++)
    {
        bcf_fmt_t *fmt = &dec->fmt[idx];
        if ( !fmt->p_free ) continue;
        free(fmt->p - fmt->p_off);
        fmt->p_free = 0;
    }

    v->unpacked = 0;
    v->rlen = 0;
    v->pos = 0;
    v->rid = 0;
    bcf_float_set_missing(v->qual);

    v->n_sample = 0;
    v->n_fmt = 0;
    v->n_allele = 0;
    v->n_info = 0;

    v->indiv.l = 0;
    v->shared.l = 0;

    dec->var_type = -1;
    dec->shared_dirty = 0;
    dec->indiv_dirty  = 0;
    dec->n_flt = 0;
    v->errcode = 0;

    // Truncate the allele and ID strings if their buffers exist
    if (dec->m_als) dec->als[0] = 0;
    if (dec->m_id) dec->id[0] = 0;
}
1853
1854
/**
 *  bcf_empty - release all memory owned by a record, keeping the record
 *  @param v - record to empty
 *
 *  The record is cleared first, then its decoded arrays and its shared
 *  and indiv buffers are freed and the corresponding structures zeroed.
 */
void bcf_empty(bcf1_t *v)
{
    bcf_clear1(v);

    // Decoded data
    free(v->d.id);
    free(v->d.als);
    free(v->d.allele);
    free(v->d.flt);
    free(v->d.info);
    free(v->d.fmt);
    free(v->d.var);

    // Raw BCF blocks
    free(v->shared.s);
    free(v->indiv.s);

    memset(&v->d, 0, sizeof(v->d));
    memset(&v->shared, 0, sizeof(v->shared));
    memset(&v->indiv, 0, sizeof(v->indiv));
}
1866
1867
/**
 *  bcf_destroy - free a record and everything it owns
 *  @param v - record to free; NULL is accepted and ignored
 */
void bcf_destroy(bcf1_t *v)
{
    if (v) {
        bcf_empty1(v);
        free(v);
    }
}
1873
1874
/**
 *  bcf_read1_core - read one raw BCF record from a BGZF stream
 *  @param fp - stream to read from
 *  @param v  - record to fill; cleared once the fixed part has been read
 *
 *  Reads the 32-byte fixed part, decodes its little-endian fields and
 *  then reads the shared and indiv blocks into the record's buffers.
 *
 *  Returns 0 on success, -1 if nothing could be read for the fixed part,
 *  and -2 on a short read, an invalid length or an allocation failure.
 */
static inline int bcf_read1_core(BGZF *fp, bcf1_t *v)
{
    uint8_t fixed[32];
    ssize_t got = bgzf_read(fp, fixed, 32);
    if (got != 32)
        return got == 0 ? -1 : -2;

    bcf_clear1(v);

    uint32_t shared_len = le_to_u32(fixed);
    if (shared_len < 24) return -2;
    shared_len -= 24; // to exclude six 32-bit integers
    uint32_t indiv_len = le_to_u32(fixed + 4);
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
    // ks_resize() normally allocates 1.5 * requested size to allow for growth
    if ((uint64_t) shared_len + indiv_len > FUZZ_ALLOC_LIMIT / 3 * 2) return -2;
#endif

    // Always reserve at least one byte so the buffers exist
    if (ks_resize(&v->shared, shared_len ? shared_len : 1) != 0) return -2;
    if (ks_resize(&v->indiv, indiv_len ? indiv_len : 1) != 0) return -2;

    v->rid  = le_to_i32(fixed + 8);
    v->pos  = le_to_u32(fixed + 12);
    if ( v->pos==UINT32_MAX ) v->pos = -1;  // this is for telomere coordinate, e.g. MT:0
    v->rlen = le_to_i32(fixed + 16);
    v->qual = le_to_float(fixed + 20);
    v->n_info = le_to_u16(fixed + 24);
    v->n_allele = le_to_u16(fixed + 26);
    v->n_sample = le_to_u32(fixed + 28) & 0xffffff;
    v->n_fmt = fixed[31];
    v->shared.l = shared_len;
    v->indiv.l = indiv_len;

    // silent fix of broken BCFs produced by earlier versions of bcf_subset, prior to and including bd6ed8b4
    if ( v->n_fmt && (!v->indiv.l || !v->n_sample) ) v->n_fmt = 0;

    if (bgzf_read(fp, v->shared.s, v->shared.l) != v->shared.l) return -2;
    if (bgzf_read(fp, v->indiv.s, v->indiv.l) != v->indiv.l) return -2;
    return 0;
}
1912
1913
0
// Helpers for a bit array packed into bytes: bit i is stored in byte i/8
// at bit position i%8.  bit_array_size(n) evaluates to n/8+1 bytes.
// The arguments are expanded more than once in set/clear/test, so they
// should be free of side effects.
#define bit_array_size(n) ((n)/8+1)
1914
0
#define bit_array_set(a,i)   ((a)[(i)/8] |=   1 << ((i)%8))
1915
0
#define bit_array_clear(a,i) ((a)[(i)/8] &= ~(1 << ((i)%8)))
1916
0
#define bit_array_test(a,i)  ((a)[(i)/8] &   (1 << ((i)%8)))
1917
1918
static int bcf_dec_typed_int1_safe(uint8_t *p, uint8_t *end, uint8_t **q,
1919
7.12k
                                   int32_t *val) {
1920
7.12k
    uint32_t t;
1921
7.12k
    if (end - p < 2) return -1;
1922
7.11k
    t = *p++ & 0xf;
1923
    /* Use if .. else if ... else instead of switch to force order.  Assumption
1924
       is that small integers are more frequent than big ones. */
1925
7.11k
    if (t == BCF_BT_INT8) {
1926
3.76k
        *val = *(int8_t *) p++;
1927
3.76k
    } else {
1928
3.35k
        if (end - p < (1<<bcf_type_shift[t])) return -1;
1929
3.33k
        if (t == BCF_BT_INT16) {
1930
2.23k
            *val = le_to_i16(p);
1931
2.23k
            p += 2;
1932
2.23k
        } else if (t == BCF_BT_INT32) {
1933
938
            *val = le_to_i32(p);
1934
938
            p += 4;
1935
#ifdef VCF_ALLOW_INT64
1936
        } else if (t == BCF_BT_INT64) {
1937
            // This case should never happen because there should be no
1938
            // 64-bit BCFs at all, definitely not coming from htslib
1939
            *val = le_to_i64(p);
1940
            p += 8;
1941
#endif
1942
938
        } else {
1943
162
            return -1;
1944
162
        }
1945
3.33k
    }
1946
6.93k
    *q = p;
1947
6.93k
    return 0;
1948
7.11k
}
1949
1950
/**
 *  bcf_dec_size_safe - bounds-checked decode of a type/size descriptor
 *  @param p    - descriptor byte
 *  @param end  - end of the available data
 *  @param q    - set to the byte following the descriptor on success
 *  @param num  - receives the element count
 *  @param type - receives the low four bits of the descriptor byte
 *
 *  A count below 15 is held in the high four bits of the descriptor.
 *  The value 15 means the count follows as a typed integer.
 *
 *  Returns 0 on success, non-zero if the data is too short or invalid,
 *  and -1 if the decoded count is negative.
 */
static int bcf_dec_size_safe(uint8_t *p, uint8_t *end, uint8_t **q,
                             int *num, int *type) {
    if (p >= end) return -1;

    *type = *p & 0xf;

    const int packed = *p >> 4;
    if (packed != 15) {
        // Count stored directly in the descriptor
        *q = p + 1;
        *num = packed;
        return 0;
    }

    // Count stored as a separate typed integer
    int status = bcf_dec_typed_int1_safe(p + 1, end, q, num);
    if (status != 0) return status;
    return *num < 0 ? -1 : 0;
}
1964
1965
530
/**
 *  get_type_name - printable name for a type code
 *  @param type - type code
 *
 *  Returns a constant string naming the type; codes outside 0..7 and
 *  unassigned codes inside that range yield "unknown".
 */
static const char *get_type_name(int type) {
    static const char *const names[8] = {
        "null", "int (8-bit)", "int (16 bit)", "int (32 bit)",
        "unknown", "float", "unknown", "char"
    };

    if (type < 0 || type >= 8)
        return "unknown";
    return names[type];
}
1973
1974
/**
1975
 *  updatephasing - updates 1st phasing based on other phasing status
1976
 *  @param p - pointer to phase value array
1977
 *  @param end - end of array
1978
 *  @param q - pointer to consumed data
1979
 *  @param samples - no. of samples in array
1980
 *  @param ploidy - no. of phasing values per sample
1981
 *  @param type - value type (one of BCF_BT_...)
1982
 *  Returns 0 on success and 1 on failure
1983
 *  Update for haploids made only if it is not unknown (.)
1984
 */
1985
static int updatephasing(uint8_t *p, uint8_t *end, uint8_t **q, int samples, int ploidy, int type)
1986
0
{
1987
0
    int j, k;
1988
0
    unsigned int inc = 1 << bcf_type_shift[type];
1989
0
    ptrdiff_t bytes = samples * ploidy * inc;
1990
1991
0
    if (samples < 0 || ploidy < 0 || end - p < bytes)
1992
0
        return 1;
1993
1994
    /*
1995
     * This works because phasing is stored in the least-significant bit
1996
     * of the GT encoding, and the data is always stored little-endian.
1997
     * Thus it's possible to get the desired result by doing bit operations
1998
     * on the least-significant byte of each value and ignoring the
1999
     * higher bytes (for 16-bit and 32-bit values).
2000
     */
2001
2002
0
    switch (ploidy) {
2003
0
    case 1:
2004
        // Trivial case - haploid data is phased by default
2005
0
        for (j = 0; j < samples; ++j) {
2006
0
            if (*p) *p |= 1;    //only if not unknown (.)
2007
0
            p += inc;
2008
0
        }
2009
0
        break;
2010
0
    case 2:
2011
        // Mostly trivial case - first is phased if second is.
2012
0
        for (j = 0; j < samples; ++j) {
2013
0
            *p |= (p[inc] & 1);
2014
0
            p += 2 * inc;
2015
0
        }
2016
0
        break;
2017
0
    default:
2018
        // Generic case - first is phased if all other alleles are.
2019
0
        for (j = 0; j < samples; ++j) {
2020
0
            uint8_t allphased = 1;
2021
0
            for (k = 1; k < ploidy; ++k)
2022
0
                allphased &= (p[inc * k]);
2023
0
            *p |= allphased;
2024
0
            p += ploidy * inc;
2025
0
        }
2026
0
    }
2027
0
    *q = p;
2028
0
    return 0;
2029
0
}
2030
2031
/**
 *  bcf_record_check_err - report an invalid FORMAT item, rate-limited
 *  @param hdr     - header, used for the sequence name
 *  @param rec     - record being checked
 *  @param type    - description of what is invalid
 *  @param reports - running count of reports, incremented on every call
 *  @param i       - offending value
 *
 *  The warning is logged for the first report only, unless the log level
 *  is at least HTS_LOG_DEBUG.
 */
static void bcf_record_check_err(const bcf_hdr_t *hdr, bcf1_t *rec,
                                 char *type, uint32_t *reports, int i) {
    const int first_report = (*reports == 0);

    if (first_report || hts_verbose >= HTS_LOG_DEBUG) {
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos
                        ": Invalid FORMAT %s %d",
                        bcf_seqname_safe(hdr,rec), rec->pos+1, type, i);
    }
    *reports += 1;
}
2039
2040
376
/**
 *  bcf_record_check - validate the raw contents of a BCF record
 *  @param hdr - header used to validate ids; may be NULL, in which case
 *               ids are only checked for being non-negative
 *  @param rec - record whose shared and indiv blocks are checked
 *
 *  Walks ID, REF/ALT, FILTER and INFO in the shared block, then FORMAT in
 *  the indiv block, checking types, ids and that each item fits in its
 *  block.  Problems found are accumulated into rec->errcode.  A negative
 *  rlen on an otherwise valid record is replaced via get_rlen().
 *
 *  Returns 0 if the record is valid, -2 otherwise.
 */
static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) {
    // Bit sets over the BCF_BT_* type codes
    const uint32_t int_types = ((1 << BCF_BT_INT8)  |
                                (1 << BCF_BT_INT16) |
#ifdef VCF_ALLOW_INT64
                                (1 << BCF_BT_INT64) |
#endif
                                (1 << BCF_BT_INT32));
    const uint32_t known_types = (int_types           |
                                  (1 << BCF_BT_NULL)  |
                                  (1 << BCF_BT_FLOAT) |
                                  (1 << BCF_BT_CHAR));
    const int32_t n_ids = hdr ? hdr->n[BCF_DT_ID] : 0;
    bcf_idpair_t *ids = hdr ? hdr->id[BCF_DT_ID] : NULL;
    uint8_t *cur, *lim;
    size_t nbytes;
    uint32_t errs = 0;
    uint32_t idx, n_warn;
    int vtype = 0;
    int count = 0;

    /* set phasing for 1st allele as in v44 for versions upto v43, to have
    consistent binary values irrespective of version; not run for v >= v44,
    to retain explicit phasing in v44 and higher */
    int gt_key = -1;
    if (hdr && bcf_get_version(hdr, NULL) < VCF44)
        gt_key = bcf_hdr_id2int(hdr, BCF_DT_ID, "GT");

    // Contig id must be non-negative and, with a header, known to it
    int bad_ctg = rec->rid < 0;
    if (!bad_ctg && hdr)
        bad_ctg = rec->rid >= hdr->n[BCF_DT_CTG]
                  || hdr->id[BCF_DT_CTG][rec->rid].key == NULL;
    if (bad_ctg) {
        hts_log_warning("Bad BCF record at %"PRIhts_pos": Invalid %s id %d", rec->pos+1, "CONTIG", rec->rid);
        errs |= BCF_ERR_CTG_INVALID;
    }

    // ID: a single typed string
    cur = (uint8_t *) rec->shared.s;
    lim = cur + rec->shared.l;
    if (bcf_dec_size_safe(cur, lim, &cur, &count, &vtype) != 0) goto bad_shared;
    if (vtype != BCF_BT_CHAR) {
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "ID", vtype, get_type_name(vtype));
        errs |= BCF_ERR_TAG_INVALID;
    }
    nbytes = (size_t) count << bcf_type_shift[vtype];
    if (lim - cur < nbytes) goto bad_shared;
    cur += nbytes;

    // REF and ALT: one typed string per allele, at least REF required
    if (rec->n_allele < 1) {
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": No REF allele",
                        bcf_seqname_safe(hdr,rec), rec->pos+1);
        errs |= BCF_ERR_TAG_UNDEF;
    }

    n_warn = 0;
    for (idx = 0; idx < rec->n_allele; idx++) {
        if (bcf_dec_size_safe(cur, lim, &cur, &count, &vtype) != 0) goto bad_shared;
        if (vtype != BCF_BT_CHAR) {
            if (!n_warn++ || hts_verbose >= HTS_LOG_DEBUG)
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "REF/ALT", vtype, get_type_name(vtype));
            errs |= BCF_ERR_CHAR;
        }
        nbytes = (size_t) count << bcf_type_shift[vtype];
        if (lim - cur < nbytes) goto bad_shared;
        cur += nbytes;
    }

    // FILTER: a typed vector of integer ids
    n_warn = 0;
    if (bcf_dec_size_safe(cur, lim, &cur, &count, &vtype) != 0) goto bad_shared;
    if (count > 0) {
        const int flt_is_int = ((1 << vtype) & int_types) != 0;
        nbytes = (size_t) count << bcf_type_shift[vtype];
        if (!flt_is_int) {
            hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "FILTER", vtype, get_type_name(vtype));
            errs |= BCF_ERR_TAG_INVALID;
        }
        if (lim - cur < nbytes) goto bad_shared;
        if (!flt_is_int) {
            // Values cannot be interpreted, just step over them
            cur += nbytes;
        } else {
            for (idx = 0; idx < count; idx++) {
                int32_t key = bcf_dec_int1(cur, vtype, &cur);
                if (key < 0
                    || (hdr && (key >= n_ids
                                || hdr->id[BCF_DT_ID][key].key == NULL))) {
                    if (!n_warn++ || hts_verbose >= HTS_LOG_DEBUG)
                        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s id %d", bcf_seqname_safe(hdr,rec), rec->pos+1, "FILTER", key);
                    errs |= BCF_ERR_TAG_UNDEF;
                }
            }
        }
    }

    // INFO: n_info pairs of (typed integer key, typed vector)
    n_warn = 0;
    for (idx = 0; idx < rec->n_info; idx++) {
        int32_t key = -1;
        if (bcf_dec_typed_int1_safe(cur, lim, &cur, &key) != 0) goto bad_shared;
        if (key < 0 || (hdr && (key >= n_ids
                                || ids[key].key == NULL))) {
            if (!n_warn++ || hts_verbose >= HTS_LOG_DEBUG)
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s id %d", bcf_seqname_safe(hdr,rec), rec->pos+1, "INFO", key);
            errs |= BCF_ERR_TAG_UNDEF;
        }
        if (bcf_dec_size_safe(cur, lim, &cur, &count, &vtype) != 0) goto bad_shared;
        if (((1 << vtype) & known_types) == 0
            || (vtype == BCF_BT_NULL && count > 0)) {
            if (!n_warn++ || hts_verbose >= HTS_LOG_DEBUG)
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "INFO", vtype, get_type_name(vtype));
            errs |= BCF_ERR_TAG_INVALID;
        }
        nbytes = (size_t) count << bcf_type_shift[vtype];
        if (lim - cur < nbytes) goto bad_shared;
        cur += nbytes;
    }

    // FORMAT: n_fmt keys, each followed by per-sample vectors
    cur = (uint8_t *) rec->indiv.s;
    lim = cur + rec->indiv.l;
    n_warn = 0;
    for (idx = 0; idx < rec->n_fmt; idx++) {
        int32_t key = -1;
        if (bcf_dec_typed_int1_safe(cur, lim, &cur, &key) != 0) goto bad_indiv;
        if (key < 0
            || (hdr && (key >= n_ids
                        || ids[key].key == NULL))) {
            bcf_record_check_err(hdr, rec, "id", &n_warn, key);
            errs |= BCF_ERR_TAG_UNDEF;
        }
        if (bcf_dec_size_safe(cur, lim, &cur, &count, &vtype) != 0) goto bad_indiv;
        if (((1 << vtype) & known_types) == 0
            || (vtype == BCF_BT_NULL && count > 0)) {
            bcf_record_check_err(hdr, rec, "type", &n_warn, vtype);
            errs |= BCF_ERR_TAG_INVALID;
        }
        if (gt_key >= 0 && gt_key == key) {
            // check first GT phasing bit and fix up if necessary
            if (updatephasing(cur, lim, &cur, rec->n_sample, count, vtype)) {
                errs |= BCF_ERR_TAG_INVALID;
            }
        } else {
            nbytes = ((size_t) count << bcf_type_shift[vtype]) * rec->n_sample;
            if (lim - cur < nbytes) goto bad_indiv;
            cur += nbytes;
        }
    }

    if (!errs && rec->rlen < 0) {
        // Treat bad rlen as a warning instead of an error, and try to
        // fix up by using the length of the stored REF allele.
        static int warned = 0;
        if (!warned) {
            hts_log_warning("BCF record at %s:%"PRIhts_pos" has invalid RLEN (%"PRIhts_pos"). "
                            "Only one invalid RLEN will be reported.",
                            bcf_seqname_safe(hdr,rec), rec->pos+1, rec->rlen);
            warned = 1;
        }
        //find rlen considering reflen, END, SVLEN, fmt LEN
        hts_pos_t fixed_len = get_rlen(hdr, rec);
        rec->rlen = fixed_len >= 0 ? fixed_len : 0;
    }

    rec->errcode |= errs;

    return errs ? -2 : 0; // Return -2 so bcf_read() reports an error

 bad_shared:
    hts_log_error("Bad BCF record at %s:%"PRIhts_pos" - shared section malformed or too short", bcf_seqname_safe(hdr,rec), rec->pos+1);
    return -2;

 bad_indiv:
    hts_log_error("Bad BCF record at %s:%"PRIhts_pos" - individuals section malformed or too short", bcf_seqname_safe(hdr,rec), rec->pos+1);
    return -2;
}
2213
2214
static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt);
2215
/**
 *  bcf_subset_format - drop the samples not selected in hdr->keep_samples
 *  @param hdr - header carrying the keep_samples bit array
 *  @param rec - record whose indiv block is compacted in place
 *
 *  Does nothing if the header has no keep_samples array.  If the header
 *  has no samples the indiv block is emptied.  Otherwise each FORMAT
 *  field is unpacked and the values of the kept samples are moved down
 *  so that the fields stay contiguous.
 *
 *  Always returns 0.
 */
int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec)
{
    if ( !hdr->keep_samples ) return 0;
    if ( !bcf_hdr_nsamples(hdr) )
    {
        rec->n_sample = 0;
        rec->indiv.l = 0;
        return 0;
    }

    bcf_dec_t *dec = &rec->d;
    uint8_t *cursor = (uint8_t*)rec->indiv.s;
    uint8_t *out = NULL;    // write position; NULL until the first field
    int f, s;

    hts_expand(bcf_fmt_t, rec->n_fmt, dec->m_fmt, dec->fmt);
    for (f = 0; f < dec->m_fmt; ++f) dec->fmt[f].p_free = 0;

    for (f = 0; f < rec->n_fmt; f++)
    {
        bcf_fmt_t *fmt = &dec->fmt[f];
        cursor = bcf_unpack_fmt_core1(cursor, rec->n_sample, fmt);

        // Values are read from where they were unpacked, even if the
        // field's start is relocated below
        uint8_t *in = fmt->p;

        if ( out )
        {
            // Move the p_off bytes preceding the values so that this
            // field directly follows the compacted previous one
            bcf_fmt_t *prev = &dec->fmt[f-1];
            uint8_t *prev_end = prev->p + prev->p_len;
            memmove(prev_end, fmt->p - fmt->p_off, fmt->p_off);
            fmt->p = prev_end + fmt->p_off;
        }

        out = fmt->p;
        for (s = 0; s < hdr->nsamples_ori; s++, in += fmt->size)
        {
            if ( !bit_array_test(hdr->keep_samples,s) ) continue;
            memmove(out, in, fmt->size);
            out += fmt->size;
        }

        // Account for the values that were dropped
        rec->indiv.l -= fmt->p_len - (out - fmt->p);
        fmt->p_len = out - fmt->p;
    }
    rec->unpacked |= BCF_UN_FMT;

    rec->n_sample = bcf_hdr_nsamples(hdr);
    return 0;
}
2255
2256
/**
 *  bcf_read - read the next VCF or BCF record
 *  @param fp - file to read from
 *  @param h  - header; if NULL for BCF input, the header attached to the
 *              BGZF handle as private data is used
 *  @param v  - record to fill
 *
 *  VCF input is delegated to vcf_read().  BCF records are read, checked
 *  and, if the header selects a subset of samples, reduced to it.
 *
 *  Returns 0 on success, -1 at end of file, other negative values on
 *  error.
 */
int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
{
    if (fp->format.format == vcf) return vcf_read(fp, h, v);
    if (!h)
        h = (const bcf_hdr_t *) bgzf_get_private_data(fp->fp.bgzf);
    int ret = bcf_read1_core(fp->fp.bgzf, v);
    if (ret == 0) ret = bcf_record_check(h, v);
    // h can still be NULL here if no header was attached to the handle;
    // bcf_record_check() accepts that, so do not dereference it blindly.
    if ( ret!=0 || !h || !h->keep_samples ) return ret;
    return bcf_subset_format(h,v);
}
2266
2267
/**
 *  bcf_readrec - read and check one BCF record, reporting its region
 *  @param fp   - BGZF stream; its private data is used as the header
 *  @param null - unused
 *  @param vv   - record to fill (a bcf1_t *)
 *  @param tid  - receives the record's rid
 *  @param beg  - receives the record's pos
 *  @param end  - receives pos + rlen
 *
 *  tid, beg and end are only written when the result is non-negative.
 *  Returns the result of reading and checking the record.
 */
int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, hts_pos_t *beg, hts_pos_t *end)
{
    bcf1_t *rec = (bcf1_t *) vv;
    const bcf_hdr_t *hdr = (const bcf_hdr_t *) bgzf_get_private_data(fp);

    int status = bcf_read1_core(fp, rec);
    if (status == 0)
        status = bcf_record_check(hdr, rec);

    if (status >= 0) {
        *tid = rec->rid;
        *beg = rec->pos;
        *end = rec->pos + rec->rlen;
    }
    return status;
}
2277
2278
/**
 *  bcf1_sync_id - encode the record ID as a single typed string
 *  @param line - record providing d.id
 *  @param str  - buffer to append to
 *
 *  A missing ID or the ID "." is encoded as an empty string.
 *  Returns the result of the encoding call.
 */
static inline int bcf1_sync_id(bcf1_t *line, kstring_t *str)
{
    const int has_id = line->d.id && strcmp(line->d.id, ".");

    if ( !has_id )
        return bcf_enc_size(str, 0, BCF_BT_CHAR);
    return bcf_enc_vchar(str, strlen(line->d.id), line->d.id);
}
2287
/**
 *  bcf1_sync_alleles - encode the alleles as a list of typed strings
 *  @param line - record providing d.allele and n_allele
 *  @param str  - buffer to append to
 *
 *  If rlen is zero and there is at least one allele, rlen is set to the
 *  length of the first allele.
 *  Returns 0 on success, -1 if an allele could not be encoded.
 */
static inline int bcf1_sync_alleles(bcf1_t *line, kstring_t *str)
{
    int a;
    for (a = 0; a < line->n_allele; a++) {
        char *allele = line->d.allele[a];
        if (bcf_enc_vchar(str, strlen(allele), allele) < 0)
            return -1;
    }

    if ( line->n_allele && !line->rlen )
        line->rlen = strlen(line->d.allele[0]);
    return 0;
}
2298
/**
 *  bcf1_sync_filter - encode the filters as a typed vector of integers
 *  @param line - record providing d.flt and d.n_flt
 *  @param str  - buffer to append to
 *
 *  With no filters an empty vector is encoded.
 *  Returns the result of the encoding call.
 */
static inline int bcf1_sync_filter(bcf1_t *line, kstring_t *str)
{
    if ( !line->d.n_flt )
        return bcf_enc_vint(str, 0, 0, -1);

    return bcf_enc_vint(str, line->d.n_flt, line->d.flt, -1);
}
2307
2308
/**
 *  bcf1_sync_info - append the raw INFO fields, dropping removed ones
 *  @param line - record providing d.info and n_info
 *  @param str  - buffer to append to
 *
 *  Fields whose vptr is NULL are marked for removal: they are skipped,
 *  the remaining fields are moved in front of them and n_info is reduced
 *  accordingly.
 *  Returns 0 on success, -1 if any field could not be appended.
 */
static inline int bcf1_sync_info(bcf1_t *line, kstring_t *str)
{
    // pairs of typed vectors
    bcf_info_t *fields = line->d.info;
    int first_gap = -1;     // index of the first removed slot, if any
    int failed = 0;
    int i;

    for (i = 0; i < line->n_info; i++)
    {
        bcf_info_t *info = &fields[i];
        if ( !info->vptr )
        {
            // marked for removal
            if ( first_gap < 0 ) first_gap = i;
            continue;
        }

        if ( kputsn_(info->vptr - info->vptr_off, info->vptr_len + info->vptr_off, str) < 0 )
            failed = 1;

        if ( first_gap >= 0 )
        {
            // Swap this field into the first gap, then find the next gap
            bcf_info_t moved = fields[first_gap];
            fields[first_gap] = fields[i];
            fields[i] = moved;
            while ( first_gap<=i && fields[first_gap].vptr ) first_gap++;
        }
    }

    if ( first_gap >= 0 ) line->n_info = first_gap;
    return failed ? -1 : 0;
}
2331
2332
/*
 * Bring the packed BCF buffers of a record (line->shared, line->indiv)
 * up to date with its unpacked / edited fields:
 *   - shared.l == 0: the packed shared block does not exist yet, so it is
 *     built from ID, alleles, FILTER and INFO;
 *   - shared_dirty set: the shared block is rebuilt, re-encoding the dirty
 *     parts and copying the untouched parts from the old block;
 *   - the FORMAT block is rebuilt when it is absent or flagged dirty.
 * Pointers in d.info[] / d.fmt[] are re-based onto the new buffers and any
 * privately allocated value buffers are released.
 *
 * Always returns 0; return values of the individual encoders are not
 * examined here.
 */
static int bcf1_sync(bcf1_t *line)
{
    char *shared_before = line->shared.s;
    size_t mark;
    kstring_t buf = {0,0,0};

    if ( line->shared.l == 0 )
    {
        // New line created via API, BCF data blocks do not exist. Get it ready for BCF output
        buf = line->shared;

        bcf1_sync_id(line, &buf);
        line->unpack_size[0] = buf.l;
        mark = buf.l;

        bcf1_sync_alleles(line, &buf);
        line->unpack_size[1] = buf.l - mark;
        mark = buf.l;

        bcf1_sync_filter(line, &buf);
        line->unpack_size[2] = buf.l - mark;

        bcf1_sync_info(line, &buf);
        line->shared = buf;
    }
    else if ( line->d.shared_dirty )
    {
        // The line was edited, update the BCF data block.

        if ( (line->unpacked & BCF_UN_STR) == 0 ) bcf_unpack(line, BCF_UN_STR);

        // src walks over the original, unchanged BCF data.
        uint8_t *src = (uint8_t *) line->shared.s;

        // ID: single typed string
        if ( line->d.shared_dirty & BCF1_DIRTY_ID )
            bcf1_sync_id(line, &buf);
        else
            kputsn_(src, line->unpack_size[0], &buf);
        src += line->unpack_size[0];
        line->unpack_size[0] = buf.l;
        mark = buf.l;

        // REF+ALT: list of typed strings
        if ( line->d.shared_dirty & BCF1_DIRTY_ALS )
            bcf1_sync_alleles(line, &buf);
        else
        {
            kputsn_(src, line->unpack_size[1], &buf);
            if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
        }
        src += line->unpack_size[1];
        line->unpack_size[1] = buf.l - mark;
        mark = buf.l;

        if ( line->unpacked & BCF_UN_FLT )
        {
            // FILTER: typed vector of integers
            if ( line->d.shared_dirty & BCF1_DIRTY_FLT )
                bcf1_sync_filter(line, &buf);
            else if ( line->d.n_flt )
                kputsn_(src, line->unpack_size[2], &buf);
            else
                bcf_enc_vint(&buf, 0, 0, -1);
            src += line->unpack_size[2];
            line->unpack_size[2] = buf.l - mark;

            // INFO: pairs of typed vectors
            if ( (line->unpacked & BCF_UN_INFO) && (line->d.shared_dirty & BCF1_DIRTY_INF) )
            {
                bcf1_sync_info(line, &buf);
                // everything in the old block has now been superseded
                src = (uint8_t*)line->shared.s + line->shared.l;
            }
        }

        // Copy across whatever part of the old block was not consumed above
        int size = line->shared.l - (size_t)src + (size_t)line->shared.s;
        if ( size ) kputsn_(src, size, &buf);

        free(line->shared.s);
        line->shared = buf;
    }

    if ( (line->shared.s != shared_before) && (line->unpacked & BCF_UN_INFO) )
    {
        // Reallocated line->shared.s block invalidated line->d.info[].vptr pointers
        size_t offset = line->unpack_size[0] + line->unpack_size[1] + line->unpack_size[2];
        int i;
        for (i = 0; i < line->n_info; i++)
        {
            bcf_info_t *inf = &line->d.info[i];
            uint8_t *private_buf = inf->vptr_free ? inf->vptr - inf->vptr_off : NULL;

            inf->vptr = (uint8_t*) line->shared.s + offset + inf->vptr_off;
            offset += inf->vptr_len + inf->vptr_off;

            if ( private_buf )
            {
                free(private_buf);
                inf->vptr_free = 0;
            }
        }
    }

    if ( line->n_sample && line->n_fmt && (!line->indiv.l || line->d.indiv_dirty) )
    {
        // The genotype fields changed or are not present
        buf.l = buf.m = 0; buf.s = NULL;
        int i, first_gap = -1;
        for (i = 0; i < line->n_fmt; i++)
        {
            bcf_fmt_t *fmt = &line->d.fmt[i];
            if ( fmt->p == NULL )
            {
                // marked for removal
                if ( first_gap < 0 ) first_gap = i;
                continue;
            }
            kputsn_(fmt->p - fmt->p_off, fmt->p_len + fmt->p_off, &buf);
            if ( first_gap >= 0 )
            {
                // move the live entry into the earliest removed slot
                bcf_fmt_t removed = line->d.fmt[first_gap];
                line->d.fmt[first_gap] = line->d.fmt[i];
                line->d.fmt[i] = removed;
                while ( first_gap <= i && line->d.fmt[first_gap].p ) first_gap++;
            }
        }
        if ( first_gap >= 0 ) line->n_fmt = first_gap;
        free(line->indiv.s);
        line->indiv = buf;

        // Reallocated line->indiv.s block invalidated line->d.fmt[].p pointers
        size_t offset = 0;
        for (i = 0; i < line->n_fmt; i++)
        {
            bcf_fmt_t *fmt = &line->d.fmt[i];
            uint8_t *private_buf = fmt->p_free ? fmt->p - fmt->p_off : NULL;

            fmt->p = (uint8_t*) line->indiv.s + offset + fmt->p_off;
            offset += fmt->p_len + fmt->p_off;

            if ( private_buf )
            {
                free(private_buf);
                fmt->p_free = 0;
            }
        }
    }

    if ( !line->n_sample ) line->n_fmt = 0;
    line->d.shared_dirty = line->d.indiv_dirty = 0;
    return 0;
}
2473
2474
/*
 * Copy the packed contents of src into the existing record dst.
 *
 * src is synced first so that its shared/indiv buffers reflect any edits.
 * dst is cleared, its buffers are grown if they are too small, and the
 * fixed fields plus both packed blocks are copied across.
 *
 * Returns dst on success, or NULL if a buffer could not be enlarged.  On
 * failure dst has been cleared and keeps ownership of its existing
 * buffers, so it can still be reused or destroyed by the caller.
 */
bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src)
{
    bcf1_sync(src);

    bcf_clear(dst);

    // Grow the destination buffers before touching any other field, and
    // never overwrite the only pointer to a buffer with a failed realloc.
    if ( dst->shared.m < src->shared.l )
    {
        char *grown = realloc(dst->shared.s, src->shared.l);
        if ( !grown ) return NULL;
        dst->shared.s = grown;
        dst->shared.m = src->shared.l;
    }
    if ( dst->indiv.m < src->indiv.l )
    {
        char *grown = realloc(dst->indiv.s, src->indiv.l);
        if ( !grown ) return NULL;
        dst->indiv.s = grown;
        dst->indiv.m = src->indiv.l;
    }

    dst->rid  = src->rid;
    dst->pos  = src->pos;
    dst->rlen = src->rlen;
    dst->qual = src->qual;
    dst->n_info = src->n_info; dst->n_allele = src->n_allele;
    dst->n_fmt = src->n_fmt; dst->n_sample = src->n_sample;

    // Skip memcpy for empty blocks: the buffers may legitimately be NULL
    // then, and passing NULL to memcpy is undefined even with length 0.
    dst->shared.l = src->shared.l;
    if ( dst->shared.l )
        memcpy(dst->shared.s, src->shared.s, dst->shared.l);

    dst->indiv.l = src->indiv.l;
    if ( dst->indiv.l )
        memcpy(dst->indiv.s, src->indiv.s, dst->indiv.l);

    return dst;
}
2504
/*
 * Create a new record holding a copy of src.
 *
 * Returns the newly allocated record, or NULL if the record could not be
 * allocated or the copy failed.  The caller owns the returned record.
 */
bcf1_t *bcf_dup(bcf1_t *src)
{
    bcf1_t *out = bcf_init1();
    if ( !out ) return NULL;    // bcf_copy() would dereference a NULL dst

    if ( !bcf_copy(out, src) )
    {
        // Do not leak the fresh record if the copy reports failure
        bcf_destroy(out);
        return NULL;
    }
    return out;
}
2509
2510
/*
 * Write one record to hfp.
 *
 * The header is synced first if it is flagged dirty.  Records whose sample
 * count disagrees with the header are rejected.  For VCF / text output the
 * work is delegated to vcf_write(); otherwise the record is synced and
 * emitted as a 32-byte fixed header followed by the shared and individual
 * blocks, and pushed to the index when one is being built.
 *
 * Returns 0 on success, -1 on error (vcf_write()'s result for text output).
 */
int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v)
{
    if ( h->dirty && bcf_hdr_sync(h) < 0 )
        return -1;

    if ( bcf_hdr_nsamples(h)!=v->n_sample )
    {
        hts_log_error("Broken VCF record, the number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)",
            bcf_seqname_safe(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h));
        return -1;
    }

    switch (hfp->format.format) {
    case vcf:
    case text_format:
        return vcf_write(hfp,h,v);
    default:
        break;
    }

    if ( v->errcode & ~BCF_ERR_LIMITS ) // todo: unsure about the other BCF_ERR_LIMITS branches in vcf_parse_format_alloc4()
    {
        // vcf_parse1() encountered a new contig or tag, undeclared in the
        // header.  At this point, the header must have been printed,
        // proceeding would lead to a broken BCF file. Errors must be checked
        // and cleared by the caller before we can proceed.
        char errdescription[1024] = "";
        hts_log_error("Unchecked error (%d %s) at %s:%"PRIhts_pos, v->errcode, bcf_strerror(v->errcode, errdescription, sizeof(errdescription)), bcf_seqname_safe(h,v), v->pos+1);
        return -1;
    }

    bcf1_sync(v);   // check if the BCF record was modified

    if ( v->unpacked & BCF_IS_64BIT )
    {
        hts_log_error("Data at %s:%"PRIhts_pos" contains 64-bit values not representable in BCF. Please use VCF instead", bcf_seqname_safe(h,v), v->pos+1);
        return -1;
    }

    BGZF *out = hfp->fp.bgzf;

    // Fixed-size record header, all fields little-endian
    uint8_t head[32];
    u32_to_le(v->shared.l + 24, head);      // shared length, including the six 32-bit integers below
    u32_to_le(v->indiv.l, head + 4);        // individual block length
    i32_to_le(v->rid, head + 8);
    u32_to_le(v->pos, head + 12);
    u32_to_le(v->rlen, head + 16);
    float_to_le(v->qual, head + 20);
    u16_to_le(v->n_info, head + 24);
    u16_to_le(v->n_allele, head + 26);
    u32_to_le((uint32_t)v->n_fmt<<24 | (v->n_sample & 0xffffff), head + 28);

    if ( bgzf_write(out, head, 32) != 32 ) return -1;
    if ( bgzf_write(out, v->shared.s, v->shared.l) != v->shared.l ) return -1;
    if ( bgzf_write(out, v->indiv.s, v->indiv.l) != v->indiv.l ) return -1;

    if (hfp->idx && bgzf_idx_push(out, hfp->idx, v->rid, v->pos, v->pos + v->rlen,
                                  bgzf_tell(out), 1) < 0)
        return -1;

    return 0;
}
2566
2567
/**********************
2568
 *** VCF header I/O ***
2569
 **********************/
2570
2571
0
/*
 * Add a "##contig=<ID=name>" header record to h.
 * Returns 0 on success.  On failure the error is logged, the partially
 * built record is destroyed, errno is preserved, and -1 is returned.
 */
static int add_missing_contig_hrec(bcf_hdr_t *h, const char *name) {
    int saved;
    bcf_hrec_t *rec = calloc(1, sizeof(bcf_hrec_t));

    // Each step only runs if every earlier one succeeded
    if (rec != NULL
        && (rec->key = strdup("contig")) != NULL
        && bcf_hrec_add_key(rec, "ID", strlen("ID")) >= 0
        && bcf_hrec_set_val(rec, rec->nkeys-1, name, strlen(name), 0) >= 0
        && bcf_hdr_add_hrec(h, rec) >= 0)
        return 0;

    saved = errno;
    hts_log_error("%s", strerror(errno));
    if (rec) bcf_hrec_destroy(rec);
    errno = saved;
    return -1;
}
2593
2594
/*
 * Read and parse the textual VCF header from fp.
 *
 * Header lines are accumulated up to and including the first line that
 * starts with a single '#'.  If fp->fn_aux is set, "##contig" lines built
 * from that tab-separated file (name, then length) are inserted just
 * before that line.  After parsing, a tabix index for fp->fn is looked up
 * and any sequence names it lists that the header lacks are added as
 * contig records.
 *
 * Returns the new header, or NULL on error.
 */
bcf_hdr_t *vcf_hdr_read(htsFile *fp)
{
    kstring_t txt = { 0, 0, NULL };
    kstring_t *line = &fp->line;
    int ret;
    tbx_t *idx = NULL;
    const char **names = NULL;

    bcf_hdr_t *h = bcf_hdr_init("r");
    if (!h) {
        hts_log_error("Failed to allocate bcf header");
        return NULL;
    }

    while ((ret = hts_getline(fp, KS_SEP_LINE, line)) >= 0) {
        int e = 0;
        if (line->l == 0) continue;
        if (line->s[0] != '#') {
            hts_log_error("No sample line");
            goto error;
        }

        // A line starting with exactly one '#' ends the header
        const int is_last = (line->s[1] != '#');

        if (is_last && fp->fn_aux) { // insert contigs here
            kstring_t aux_line = { 0, 0, NULL };
            hFILE *aux = hopen(fp->fn_aux, "r");
            if (aux == NULL) {
                hts_log_error("Couldn't open \"%s\"", fp->fn_aux);
                goto error;
            }
            while (aux_line.l = 0, kgetline(&aux_line, (kgets_func *) hgets, aux) >= 0) {
                char *tab = strchr(aux_line.s, '\t');
                if (tab == NULL) continue;      // no length column, skip the line
                e |= (kputs("##contig=<ID=", &txt) < 0);
                e |= (kputsn(aux_line.s, tab - aux_line.s, &txt) < 0);
                e |= (kputs(",length=", &txt) < 0);
                e |= (kputl(atol(tab), &txt) < 0);
                e |= (kputsn(">\n", 2, &txt) < 0);
            }
            free(aux_line.s);
            if (hclose(aux) != 0) {
                hts_log_error("Error on closing %s", fp->fn_aux);
                goto error;
            }
            if (e) goto error;
        }

        if (kputsn(line->s, line->l, &txt) < 0) goto error;
        if (kputc('\n', &txt) < 0) goto error;
        if (is_last) break;
    }
    if ( ret < -1 ) goto error;
    if ( !txt.s )
    {
        hts_log_error("Could not read the header");
        goto error;
    }
    if ( bcf_hdr_parse(h, txt.s) < 0 ) goto error;

    // check tabix index, are all contigs listed in the header? add the missing ones
    idx = tbx_index_load3(fp->fn, NULL, HTS_IDX_SILENT_FAIL);
    if ( idx )
    {
        int i, n, added = 0;
        names = tbx_seqnames(idx, &n);
        if (!names) goto error;
        for (i = 0; i < n; i++)
        {
            if ( bcf_hdr_get_hrec(h, BCF_HL_CTG, "ID", (char*) names[i], NULL) )
                continue;
            if (add_missing_contig_hrec(h, names[i]) < 0) goto error;
            added = 1;
        }
        if ( added && bcf_hdr_sync(h) < 0 ) goto error;
        free(names);
        tbx_destroy(idx);
    }
    free(txt.s);
    return h;

 error:
    if (idx) tbx_destroy(idx);
    free(names);
    free(txt.s);
    if (h) bcf_hdr_destroy(h);
    return NULL;
}
2679
2680
/*
 * Populate hdr from the header lines stored in the file fname.
 *
 * All lines but the last are parsed as header records and added to hdr;
 * the last line is parsed as the sample line.  The header is then synced.
 *
 * Returns 0 on success and 1 on failure (file unreadable, no lines at
 * all, or a line failing to parse).  errno from the failing step is
 * preserved across the cleanup.
 */
int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname)
{
    int i = 0, n = 0, save_errno;
    char **lines = hts_readlines(fname, &n);
    if ( !lines ) return 1;
    if ( n < 1 )
    {
        // No lines at all: there is no sample line, and lines[n-1] below
        // would index before the start of the array.
        free(lines);
        return 1;
    }
    for (i=0; i<n-1; i++)
    {
        int k;
        bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,lines[i],&k);
        if (!hrec) goto fail;
        if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
            bcf_hrec_destroy(hrec);
            goto fail;
        }
        // Consumed: release now so the fail path only frees what is left
        free(lines[i]);
        lines[i] = NULL;
    }
    if (bcf_hdr_parse_sample_line(hdr, lines[n-1]) < 0) goto fail;
    if (bcf_hdr_sync(hdr) < 0) goto fail;
    free(lines[n-1]);
    free(lines);
    return 0;

 fail:
    save_errno = errno;
    // lines[0..i-1] were already freed above; free the remainder
    for (; i < n; i++)
        free(lines[i]);
    free(lines);
    errno = save_errno;
    return 1;
}
2711
2712
/*
 * Append the textual form of one header record to str.
 *
 * Records with a plain value are written as "##key=value"; structured
 * records as "##key=<k1=v1,k2=v2,...>".  The IDX key is omitted unless
 * is_bcf is non-zero.  Returns 0 on success, -1 if any append failed.
 */
static int _bcf_hrec_format(const bcf_hrec_t *hrec, int is_bcf, kstring_t *str)
{
    uint32_t err = 0;

    if ( hrec->value )
    {
        err |= ksprintf(str,"##%s=%s\n", hrec->key,hrec->value) < 0;
        return err == 0 ? 0 : -1;
    }

    int k, n_written = 0;
    err |= ksprintf(str, "##%s=<", hrec->key) < 0;
    for (k = 0; k < hrec->nkeys; k++)
    {
        // do not output IDX if output is VCF
        if ( !is_bcf && strcmp("IDX", hrec->keys[k]) == 0 ) continue;

        if ( n_written > 0 ) err |= kputc(',',str) < 0;
        err |= ksprintf(str,"%s=%s", hrec->keys[k], hrec->vals[k]) < 0;
        n_written++;
    }
    err |= ksprintf(str,">\n") < 0;

    return err == 0 ? 0 : -1;
}
2734
2735
/*
 * Append the VCF text form of a header record to str (the IDX key is
 * left out).  Returns 0 on success, -1 on failure.
 */
int bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str)
{
    const int is_bcf = 0;   // VCF flavour
    return _bcf_hrec_format(hrec, is_bcf, str);
}
2739
2740
/*
 * Append the complete textual header to str: every header record followed
 * by the "#CHROM ..." column line, which gains FORMAT and one column per
 * sample when the header has samples.
 * Returns 0 on success, -1 if any part could not be appended.
 */
int bcf_hdr_format(const bcf_hdr_t *hdr, int is_bcf, kstring_t *str)
{
    int k, failed = 0;

    for (k = 0; k < hdr->nhrec; k++)
        failed |= _bcf_hrec_format(hdr->hrec[k], is_bcf, str) < 0;

    failed |= ksprintf(str, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO") < 0;

    const int n_samples = bcf_hdr_nsamples(hdr);
    if ( n_samples )
    {
        failed |= ksprintf(str, "\tFORMAT") < 0;
        for (k = 0; k < n_samples; k++)
            failed |= ksprintf(str, "\t%s", hdr->samples[k]) < 0;
    }
    failed |= ksprintf(str, "\n") < 0;

    return failed ? -1 : 0;
}
2757
2758
char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len)
2759
0
{
2760
0
    kstring_t txt = {0,0,0};
2761
0
    if (bcf_hdr_format(hdr, is_bcf, &txt) < 0)
2762
0
        return NULL;
2763
0
    if ( len ) *len = txt.l;
2764
0
    return txt.s;
2765
0
}
2766
2767
const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *n)
2768
0
{
2769
0
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
2770
0
    int i, tid, m = kh_size(d);
2771
0
    const char **names = (const char**) calloc(m,sizeof(const char*));
2772
0
    if ( !names )
2773
0
    {
2774
0
        hts_log_error("Failed to allocate memory");
2775
0
        *n = 0;
2776
0
        return NULL;
2777
0
    }
2778
0
    khint_t k;
2779
0
    for (k=kh_begin(d); k<kh_end(d); k++)
2780
0
    {
2781
0
        if ( !kh_exist(d,k) ) continue;
2782
0
        if ( !kh_val(d, k).hrec[0] ) continue;  // removed via bcf_hdr_remove
2783
0
        tid = kh_val(d,k).id;
2784
0
        if ( tid >= m )
2785
0
        {
2786
            // This can happen after a contig has been removed from BCF header via bcf_hdr_remove()
2787
0
            if ( hts_resize(const char*, tid + 1, &m, &names, HTS_RESIZE_CLEAR)<0 )
2788
0
            {
2789
0
                hts_log_error("Failed to allocate memory");
2790
0
                *n = 0;
2791
0
                free(names);
2792
0
                return NULL;
2793
0
            }
2794
0
            m = tid + 1;
2795
0
        }
2796
0
        names[tid] = kh_key(d,k);
2797
0
    }
2798
    // ensure there are no gaps
2799
0
    for (i=0,tid=0; tid<m; i++,tid++)
2800
0
    {
2801
0
        while ( tid<m && !names[tid] ) tid++;
2802
0
        if ( tid==m ) break;
2803
0
        if ( i==tid ) continue;
2804
0
        names[i] = names[tid];
2805
0
        names[tid] = 0;
2806
0
    }
2807
0
    *n = i;
2808
0
    return names;
2809
0
}
2810
2811
/*
 * Write the header h as VCF text to fp.
 *
 * The text is produced by bcf_hdr_format(), trailing NUL bytes are
 * dropped, and the result is written through BGZF (followed by a flush)
 * when the output is compressed, or directly to the hFILE otherwise.
 *
 * Returns 0 on success, -1 on failure.
 */
int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h)
{
    kstring_t htxt = {0,0,0};
    if (bcf_hdr_format(h, 0, &htxt) < 0) {
        free(htxt.s);
        return -1;
    }
    while (htxt.l && htxt.s[htxt.l-1] == '\0') --htxt.l; // kill trailing zeros
    int ret;
    if ( fp->format.compression!=no_compression ) {
        ret = bgzf_write(fp->fp.bgzf, htxt.s, htxt.l);
        if (bgzf_flush(fp->fp.bgzf) != 0) {
            // Release the header text on this error path as well
            free(htxt.s);
            return -1;
        }
    } else {
        ret = hwrite(fp->fp.hfile, htxt.s, htxt.l);
    }
    free(htxt.s);
    return ret<0 ? -1 : 0;
}
2829
2830
/***********************
2831
 *** Typed value I/O ***
2832
 ***********************/
2833
2834
/*
 * Append a typed vector of integers to s, using the narrowest of the
 * 8/16/32-bit encodings that can hold every regular value in a[].
 *
 * s      output buffer
 * n      number of values in a; n <= 0 encodes an empty (NULL-typed) vector
 * a      the values; bcf_int32_missing / bcf_int32_vector_end are mapped to
 *        the matching markers of the chosen width
 * wsize  element count written in the type descriptor; <= 0 means use n
 *
 * Returns 0 on success, -1 on allocation failure (or the result of the
 * size / single-value encoder for n <= 1).
 */
int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
{
    int i;

    if (n <= 0)
        return bcf_enc_size(s, 0, BCF_BT_NULL);
    if (n == 1)
        return bcf_enc_int1(s, a[0]);

    if (wsize <= 0) wsize = n;

    // Range scan, equivalent to:
    // for (i = 0; i < n; ++i) {
    //     if (a[i] == bcf_int32_missing || a[i] == bcf_int32_vector_end )
    //         continue;
    //     if (max < a[i]) max = a[i];
    //     if (min > a[i]) min = a[i];
    // }
    // done four lanes at a time, with a scalar tail.
    int hi4[4] = {INT32_MIN, INT32_MIN, INT32_MIN, INT32_MIN};
    int lo4[4] = {INT32_MAX, INT32_MAX, INT32_MAX, INT32_MAX};
    for (i = 0; i < (n&~3); i+=4) {
        // bcf_int32_missing    == INT32_MIN and
        // bcf_int32_vector_end == INT32_MIN+1.
        // We skip these, but can mostly avoid explicit checking
        if (hi4[0] < a[i+0]) hi4[0] = a[i+0];
        if (hi4[1] < a[i+1]) hi4[1] = a[i+1];
        if (hi4[2] < a[i+2]) hi4[2] = a[i+2];
        if (hi4[3] < a[i+3]) hi4[3] = a[i+3];
        if (lo4[0] > a[i+0] && a[i+0] > INT32_MIN+1) lo4[0] = a[i+0];
        if (lo4[1] > a[i+1] && a[i+1] > INT32_MIN+1) lo4[1] = a[i+1];
        if (lo4[2] > a[i+2] && a[i+2] > INT32_MIN+1) lo4[2] = a[i+2];
        if (lo4[3] > a[i+3] && a[i+3] > INT32_MIN+1) lo4[3] = a[i+3];
    }

    // Reduce the four lanes
    int32_t min = lo4[0];
    if (min > lo4[1]) min = lo4[1];
    if (min > lo4[2]) min = lo4[2];
    if (min > lo4[3]) min = lo4[3];
    int32_t max = hi4[0];
    if (max < hi4[1]) max = hi4[1];
    if (max < hi4[2]) max = hi4[2];
    if (max < hi4[3]) max = hi4[3];

    // Leftover elements
    for (; i < n; ++i) {
        if (max < a[i]) max = a[i];
        if (min > a[i] && a[i] > INT32_MIN+1) min = a[i];
    }

    if (max <= BCF_MAX_BT_INT8 && min >= BCF_MIN_BT_INT8) {
        if (bcf_enc_size(s, wsize, BCF_BT_INT8) < 0 ||
            ks_resize(s, s->l + n) < 0)
            return -1;
        uint8_t *out = (uint8_t *) s->s + s->l;
        for (i = 0; i < n; ++i, out++) {
            if ( a[i]==bcf_int32_vector_end )   *out = bcf_int8_vector_end;
            else if ( a[i]==bcf_int32_missing ) *out = bcf_int8_missing;
            else *out = a[i];
        }
        s->l += n;
        return 0;
    }

    if (max <= BCF_MAX_BT_INT16 && min >= BCF_MIN_BT_INT16) {
        if (bcf_enc_size(s, wsize, BCF_BT_INT16) < 0 ||
            ks_resize(s, s->l + n * sizeof(int16_t)) < 0)
            return -1;
        uint8_t *out = (uint8_t *) s->s + s->l;
        for (i = 0; i < n; ++i)
        {
            int16_t x;
            if ( a[i]==bcf_int32_vector_end ) x = bcf_int16_vector_end;
            else if ( a[i]==bcf_int32_missing ) x = bcf_int16_missing;
            else x = a[i];
            i16_to_le(x, out);
            out += sizeof(int16_t);
        }
        s->l += n * sizeof(int16_t);
        return 0;
    }

    // 32-bit: values (including the markers) are stored unchanged
    if (bcf_enc_size(s, wsize, BCF_BT_INT32) < 0 ||
        ks_resize(s, s->l + n * sizeof(int32_t)) < 0)
        return -1;
    uint8_t *out = (uint8_t *) s->s + s->l;
    for (i = 0; i < n; ++i) {
        i32_to_le(a[i], out);
        out += sizeof(int32_t);
    }
    s->l += n * sizeof(int32_t);

    return 0;
}
2923
2924
#ifdef VCF_ALLOW_INT64
/*
 * Append a single 64-bit integer as a typed value.  Values within the
 * 32-bit BCF range are handed to bcf_enc_int1(); the 64-bit vector-end and
 * missing markers are written as their 8-bit counterparts; everything
 * else is stored as a little-endian INT64.
 * Returns 0 on success, -1 on failure.
 */
static int bcf_enc_long1(kstring_t *s, int64_t x) {
    uint32_t err = 0;

    if (x >= BCF_MIN_BT_INT32 && x <= BCF_MAX_BT_INT32)
        return bcf_enc_int1(s, x);

    if (x == bcf_int64_vector_end || x == bcf_int64_missing) {
        int marker = (x == bcf_int64_vector_end) ? bcf_int8_vector_end
                                                 : bcf_int8_missing;
        err |= bcf_enc_size(s, 1, BCF_BT_INT8);
        err |= kputc(marker, s) < 0;
    } else {
        err |= bcf_enc_size(s, 1, BCF_BT_INT64);
        err |= ks_expand(s, 8);
        if (err == 0) {
            u64_to_le(x, (uint8_t *) s->s + s->l);
            s->l += 8;
        }
    }

    return err == 0 ? 0 : -1;
}
#endif
2943
2944
492k
/*
 * Append n floats from a to s in little-endian byte order.
 * Returns 0 on success, -1 if the byte count overflows size_t or the
 * buffer cannot be enlarged.
 */
static inline int serialize_float_array(kstring_t *s, size_t n, const float *a) {
    const size_t n_bytes = n * sizeof(float);

    // Reject sizes whose multiplication wrapped around
    if (n_bytes / sizeof(float) != n) return -1;
    if (ks_resize(s, s->l + n_bytes) < 0) return -1;

    uint8_t *out = (uint8_t *) s->s + s->l;
    const float *end = a + n;
    for (const float *cur = a; cur != end; cur++, out += sizeof(float))
        float_to_le(*cur, out);

    s->l += n_bytes;
    return 0;
}
2961
2962
/*
 * Append a typed vector of n floats to s.
 *
 * s  output buffer
 * n  number of values; must not be negative
 * a  the values
 *
 * Returns 0 on success, -1 if either the type descriptor or the values
 * could not be appended.
 */
int bcf_enc_vfloat(kstring_t *s, int n, float *a)
{
    assert(n >= 0);
    if (bcf_enc_size(s, n, BCF_BT_FLOAT) < 0)
        return -1;
    // serialize_float_array() already reports 0 / -1
    return serialize_float_array(s, n, a);
}
2969
2970
/*
 * Append a typed character vector (string) to s.
 *
 * s  output buffer
 * l  number of characters to write
 * a  the characters
 *
 * Returns 0 on success, -1 if either the type descriptor or the
 * characters could not be appended.
 */
int bcf_enc_vchar(kstring_t *s, int l, const char *a)
{
    if (bcf_enc_size(s, l, BCF_BT_CHAR) < 0)
        return -1;
    if (kputsn(a, l, s) < 0)
        return -1;
    return 0;
}
2976
2977
// Special case of n==1 as it also occurs quite often in FORMAT data.
2978
// This version is also small enough to get inlined.
2979
9.29k
/*
 * Format a single typed value at data as text appended to s.
 * Missing values are written as '.', vector-end markers write nothing.
 * Returns 0 on success, -1 on append failure or unexpected type.
 */
static inline int bcf_fmt_array1(kstring_t *s, int type, void *data) {
    uint32_t err = 0;
    uint8_t *bytes = (uint8_t *)data;
    int32_t v;

    // helps gcc more than clang here. In billions of cycles:
    //          bcf_fmt_array1  bcf_fmt_array
    // gcc7:    23.2            24.3
    // gcc13:   21.6            23.0
    // clang13: 27.1            27.8
    switch (type) {
    case BCF_BT_CHAR:
        err |= kputc_(*bytes == bcf_str_missing ? '.' : *bytes, s) < 0;
        break;

    case BCF_BT_INT8:
        v = *(int8_t *)bytes;
        if (v != bcf_int8_vector_end) {
            err |= ((v == bcf_int8_missing)
                    ? kputc_('.', s)
                    : kputw(v, s)) < 0;
        }
        break;

    case BCF_BT_INT16:
        v = le_to_i16(bytes);
        if (v != bcf_int16_vector_end) {
            err |= ((v == bcf_int16_missing)
                    ? kputc_('.', s)
                    : kputw(v, s)) < 0;
        }
        break;

    case BCF_BT_INT32:
        v = le_to_i32(bytes);
        if (v != bcf_int32_vector_end) {
            err |= ((v == bcf_int32_missing)
                    ? kputc_('.', s)
                    : kputw(v, s)) < 0;
        }
        break;

    case BCF_BT_FLOAT:
        // Compare the raw bit pattern against the float marker values
        v = le_to_u32(bytes);
        if (v != bcf_float_vector_end) {
            err |= ((v == bcf_float_missing)
                    ? kputc_('.', s)
                    : kputd(le_to_float(bytes), s)) < 0;
        }
        break;

    default:
        hts_log_error("Unexpected type %d", type);
        return -1;
    }

    return err == 0 ? 0 : -1;
}
3035
3036
/*
 * Format a typed array as text appended to s.
 *
 * s     output buffer
 * n     number of elements; 0 is written as "."
 * type  BCF_BT_* type of the elements
 * data  the packed little-endian values
 *
 * Character data is written up to the first NUL byte or n characters.
 * Numeric data is written comma-separated, stopping at the first
 * vector-end marker, with missing values shown as '.'.
 *
 * Returns 0 on success, -1 on append failure or if type is not one of
 * the supported BCF_BT_* values (the error is logged).
 */
int bcf_fmt_array(kstring_t *s, int n, int type, void *data)
{
    int j = 0;
    uint32_t e = 0;
    if (n == 0) {
        return kputc_('.', s) >= 0 ? 0 : -1;
    }

    if (type == BCF_BT_CHAR)
    {
        char *p = (char *)data;

        // Note bcf_str_missing is already accounted for in n==0 above.
        if (n >= 8) {
            // Longer strings: find the terminator once and append in bulk
            char *p_end = memchr(p, 0, n);
            e |= kputsn(p, p_end ? p_end-p : n, s) < 0;
        } else {
            for (j = 0; j < n && *p; ++j, ++p)
               e |= kputc(*p, s) < 0;
        }
    }
    else
    {
        #define BRANCH(type_t, convert, is_missing, is_vector_end, kprint) { \
            uint8_t *p = (uint8_t *) data; \
            for (j=0; j<n; j++, p += sizeof(type_t))    \
            { \
                type_t v = convert(p); \
                if ( is_vector_end ) break; \
                if ( j ) e |= kputc_(',', s) < 0; \
                e |= (is_missing ? kputc('.', s) : kprint) < 0; \
            } \
        }
        switch (type) {
            case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8, v==bcf_int8_missing,  v==bcf_int8_vector_end,  kputw(v, s)); break;
            case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, v==bcf_int16_missing, v==bcf_int16_vector_end, kputw(v, s)); break;
            case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, v==bcf_int32_missing, v==bcf_int32_vector_end, kputw(v, s)); break;
            case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, v==bcf_float_missing, v==bcf_float_vector_end, kputd(le_to_float(p), s)); break;
            default:
                // Report the problem to the caller instead of terminating
                // the whole process from inside the library.
                hts_log_error("Unexpected type %d", type);
                return -1;
        }
        #undef BRANCH
    }
    return e == 0 ? 0 : -1;
}
3080
3081
/*
 * Decode the size/type descriptor at ptr, format the array that follows
 * it as text appended to s, and return the address just past the array.
 * The result of the formatting call is not examined.
 */
uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr)
{
    int type;
    int n_vals = bcf_dec_size(ptr, &ptr, &type);   // ptr now addresses the values

    bcf_fmt_array(s, n_vals, type, ptr);

    return ptr + (n_vals << bcf_type_shift[type]);
}
3088
3089
/********************
3090
 *** VCF site I/O ***
3091
 ********************/
3092
3093
// Per-FORMAT-field working state used while parsing the sample columns.
typedef struct {
    int key;             // Key for h->id[BCF_DT_ID][key] vdict
    int max_m;           // number of elements in field array (ie commas)
    int size;            // field size (max_l or max_g*4 if is_gt)
    int offset;          // offset of buf into h->mem
    uint32_t is_gt:1;    // is genotype
    uint32_t max_g:31;   // maximum number of genotypes
    uint32_t max_l;      // length of field
    uint32_t y;          // h->id[0][fmt[j].key].val->info[BCF_HL_FMT]
    uint8_t *buf;        // Pointer into h->mem
} fmt_aux_t;
3104
3105
// fmt_aux_t field notes:
3106
// max_* are biggest sizes of the various FORMAT fields across all samples.
3107
// We use these after pivoting the data to ensure easy random access
3108
// of a specific sample.
3109
//
3110
// max_m is only used for type BCF_HT_REAL or BCF_HT_INT
3111
// max_g is only used for is_gt == 1 (will be BCF_HT_STR)
3112
// max_l is only used for is_gt == 0 (will be BCF_HT_STR)
3113
//
3114
// These are computed in vcf_parse_format_max3 and used in
3115
// vcf_parse_format_alloc4 to get the size.
3116
//
3117
// size is computed from max_g, max_l, max_m and is_gt.  Once computed
3118
// the max values are never accessed again.
3119
//
3120
// In theory all 4 vars could be coalesced into a single variable, but this
3121
// significantly harms speed (even if done via a union).  It's about 25-30%
3122
// slower.
3123
3124
/*
 * Pad s with zero bytes until its length is a multiple of 8.
 * Returns 0 on success (including when no padding is needed), -1 if the
 * padding could not be appended.
 */
static inline int align_mem(kstring_t *s)
{
    size_t excess = s->l & 7;
    if (excess == 0)
        return 0;

    uint64_t zero = 0;
    return kputsn((char*)&zero, 8 - excess, s) < 0 ? -1 : 0;
}
3133
3134
84.7k
// Upper bound on the number of FORMAT keys accepted for one record; it also
// sizes the on-stack fmt_aux_t array used by vcf_parse_format().
#define MAX_N_FMT 255   /* Limited by size of bcf1_t n_fmt field */
3135
3136
// detect FORMAT "."
3137
/*
 * Step 1 of FORMAT parsing: reject a FORMAT column that has no sample
 * columns after it, and recognise the "." placeholder.
 *
 * s  the whole VCF line
 * p  start of the FORMAT column text
 * q  end of the FORMAT column text
 *
 * Returns  1 if FORMAT is "." (v->n_fmt is zeroed and v->n_sample is set to
 *            the header's sample count),
 *          0 if FORMAT carries keys and parsing should continue,
 *         -1 if nothing follows the FORMAT column (BCF_ERR_NCOLS is set).
 */
static int vcf_parse_format_empty1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
                                   const char *p, const char *q) {
    const char *line_end = s->s + s->l;

    if (q < line_end) {
        v->n_fmt = 0;

        const int is_placeholder = (p[0] == '.' && p[1] == 0);
        if (!is_placeholder)
            return 0;

        v->n_sample = bcf_hdr_nsamples(h);
        return 1;
    }

    // The FORMAT column runs to the end of the line: no sample data at all.
    hts_log_error("FORMAT column with no sample columns starting at %s:%"PRIhts_pos"", bcf_seqname_safe(h,v), v->pos+1);
    v->errcode |= BCF_ERR_NCOLS;
    return -1;
}
3156
3157
// get format information from the dictionary
3158
/*
 * Step 2 of FORMAT parsing: split the FORMAT column on ':' and look each key
 * up in the header's ID dictionary, filling fmt[0..v->n_fmt).
 *
 * The separators in the FORMAT text are overwritten with NUL as tokens are
 * consumed.  A key that is missing from the dictionary (or whose FORMAT info
 * word is 15 -- presumably the "not declared as FORMAT" marker, confirm
 * against the header code) gets a dummy Type=String header line added and
 * BCF_ERR_TAG_UNDEF is recorded in v->errcode.
 *
 * Returns 0 on success; -1 with v->errcode updated when there are more than
 * MAX_N_FMT keys, the key is ".", or the dummy header could not be added.
 */
static int vcf_parse_format_dict2(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
                                  const char *p, const char *q, fmt_aux_t *fmt) {
    const vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
    char *t;
    int j;
    ks_tokaux_t aux1;

    for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) {
        if (j >= MAX_N_FMT) {
            v->errcode |= BCF_ERR_LIMITS;
            hts_log_error("FORMAT column at %s:%"PRIhts_pos" lists more identifiers than htslib can handle",
                bcf_seqname_safe(h,v), v->pos+1);
            return -1;
        }

        *(char*)aux1.p = 0;   // terminate the current key in place
        khint_t k = kh_get(vdict, d, t);
        if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_FMT] == 15) {
            if ( t[0]=='.' && t[1]==0 )
            {
                hts_log_error("Invalid FORMAT tag name '.' at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
                v->errcode |= BCF_ERR_TAG_INVALID;
                return -1;
            }
            hts_log_warning("FORMAT '%s' at %s:%"PRIhts_pos" is not defined in the header, assuming Type=String", t, bcf_seqname_safe(h,v), v->pos+1);

            kstring_t tmp = {0,0,0};
            int l;
            bcf_hrec_t *hrec = NULL;
            // Only parse the dummy line if it was actually formatted; on
            // allocation failure tmp.s may be NULL or incomplete.
            if (ksprintf(&tmp, "##FORMAT=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", t) >= 0)
                hrec = bcf_hdr_parse_line(h,tmp.s,&l);
            free(tmp.s);

            int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
            if (res < 0) bcf_hrec_destroy(hrec);
            if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);

            k = kh_get(vdict, d, t);
            v->errcode |= BCF_ERR_TAG_UNDEF;
            if (res || k == kh_end(d)) {
                hts_log_error("Could not add dummy header for FORMAT '%s' at %s:%"PRIhts_pos, t, bcf_seqname_safe(h,v), v->pos+1);
                v->errcode |= BCF_ERR_TAG_INVALID;
                return -1;
            }
        }

        fmt[j].max_l = fmt[j].max_m = fmt[j].max_g = 0;
        fmt[j].key = kh_val(d, k).id;
        fmt[j].is_gt = (t[0] == 'G' && t[1] == 'T' && !t[2]);
        fmt[j].y = h->id[0][fmt[j].key].val->info[BCF_HL_FMT];
        v->n_fmt++;
    }
    return 0;
}
3208
3209
// compute max
3210
/*
 * Step 3 of FORMAT parsing: scan every sample column once and record, per
 * FORMAT key, the largest value count (max_m), text length (max_l) and, for
 * GT, allele count (max_g).
 *
 * r walks the sample text starting just after q.  Tabs that end a sample
 * column are overwritten with NUL.  v->n_sample is set to the number of
 * samples scanned; samples excluded by h->keep_samples are skipped and not
 * counted.  Scanning stops once the header's sample count is reached.
 *
 * Returns 0 on success, or -1 (BCF_ERR_NCOLS) if a sample has more
 * ':'-separated fields than there are FORMAT keys.
 */
static int vcf_parse_format_max3(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
                                 char *p, char *q, fmt_aux_t *fmt) {
    // Characters the scanner must stop at: \0 \t , / : |
    static const char meta[256] = {
        ['\0'] = 1, ['\t'] = 1, [','] = 1, ['/'] = 1, [':'] = 1, ['|'] = 1
    };

    int n_sample_ori = -1;
    char *r = q + 1;  // r: position in the format string
    int l = 0, m = 1, g = 1, j;
    v->n_sample = 0;  // m: max vector size, l: max field len, g: max number of alleles
    const char *end = s->s + s->l;

    while ( r<end )
    {
        // can we skip some samples?
        if ( h->keep_samples )
        {
            n_sample_ori++;
            if ( !bit_array_test(h->keep_samples,n_sample_ori) )
            {
                // Test the bound before dereferencing.
                while ( r<end && *r!='\t' ) r++;
                if ( r<end && *r=='\t' ) { *r = 0; r++; }
                continue;
            }
        }

        // collect fmt stats: max vector size, length, number of alleles
        j = 0;  // j-th format field
        fmt_aux_t *f = fmt;

        char *r_start = r;
        for (;;) {
            // Quickly skip ahead to an appropriate meta-character
            while (!meta[(unsigned char)*r]) r++;

            switch (*r) {
            case ',':
                m++;
                break;

            case '|':
            case '/':
                if (f->is_gt) g++;
                break;

            case '\t':
                *r = 0; // fall through

            default: // valid due to while loop above.
            case '\0':
            case ':':
                // End of one field: fold its stats into the running maxima.
                // r_start is left on the separator, so fields after the
                // first measure one character longer than their text.
                l = r - r_start; r_start = r;
                if (f->max_m < m) f->max_m = m;
                if (f->max_l < l) f->max_l = l;
                if (f->is_gt && f->max_g < g) f->max_g = g;
                l = 0, m = g = 1;
                if ( *r==':' ) {
                    j++; f++;
                    if ( j>=v->n_fmt ) {
                        hts_log_error("Incorrect number of FORMAT fields at %s:%"PRIhts_pos"",
                                      bcf_seqname_safe(h,v), v->pos+1);
                        v->errcode |= BCF_ERR_NCOLS;
                        return -1;
                    }
                } else goto end_for;   // NUL (or former tab): sample finished
                break;
            }
            if ( r>=end ) break;
            r++;
        }
    end_for:
        v->n_sample++;
        if ( v->n_sample == bcf_hdr_nsamples(h) ) break;
        r++;
    }

    return 0;
}
3295
3296
// allocate memory for arrays
3297
static int vcf_parse_format_alloc4(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3298
                                   const char *p, const char *q,
3299
17.2k
                                   fmt_aux_t *fmt) {
3300
17.2k
    kstring_t *mem = (kstring_t*)&h->mem;
3301
3302
17.2k
    int j;
3303
100k
    for (j = 0; j < v->n_fmt; ++j) {
3304
83.4k
        fmt_aux_t *f = &fmt[j];
3305
83.4k
        if ( !f->max_m ) f->max_m = 1;  // omitted trailing format field
3306
3307
83.4k
        if ((f->y>>4&0xf) == BCF_HT_STR) {
3308
83.4k
            f->size = f->is_gt? f->max_g << 2 : f->max_l;
3309
83.4k
        } else if ((f->y>>4&0xf) == BCF_HT_REAL || (f->y>>4&0xf) == BCF_HT_INT) {
3310
0
            f->size = f->max_m << 2;
3311
0
        } else {
3312
0
            hts_log_error("The format type %d at %s:%"PRIhts_pos" is currently not supported", f->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1);
3313
0
            v->errcode |= BCF_ERR_TAG_INVALID;
3314
0
            return -1;
3315
0
        }
3316
3317
83.4k
        if (align_mem(mem) < 0) {
3318
0
            hts_log_error("Memory allocation failure at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3319
0
            v->errcode |= BCF_ERR_LIMITS;
3320
0
            return -1;
3321
0
        }
3322
3323
        // Limit the total memory to ~2Gb per VCF row.  This should mean
3324
        // malformed VCF data is less likely to take excessive memory and/or
3325
        // time.
3326
83.4k
        if ((uint64_t) mem->l + v->n_sample * (uint64_t)f->size > INT_MAX) {
3327
0
            static int warned = 0;
3328
0
            if ( !warned ) hts_log_warning("Excessive memory required by FORMAT fields at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3329
0
            warned = 1;
3330
0
            v->errcode |= BCF_ERR_LIMITS;
3331
0
            f->size = -1;
3332
0
            f->offset = 0;
3333
0
            continue;
3334
0
        }
3335
3336
83.4k
        f->offset = mem->l;
3337
83.4k
        if (ks_resize(mem, mem->l + v->n_sample * (size_t)f->size) < 0) {
3338
0
            hts_log_error("Memory allocation failure at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3339
0
            v->errcode |= BCF_ERR_LIMITS;
3340
0
            return -1;
3341
0
        }
3342
83.4k
        mem->l += v->n_sample * f->size;
3343
83.4k
    }
3344
3345
17.2k
    {
3346
17.2k
        int j;
3347
100k
        for (j = 0; j < v->n_fmt; ++j)
3348
83.4k
            fmt[j].buf = (uint8_t*)mem->s + fmt[j].offset;
3349
17.2k
    }
3350
3351
    // check for duplicate tags
3352
17.2k
    int i;
3353
83.4k
    for (i=1; i<v->n_fmt; i++)
3354
66.2k
    {
3355
66.2k
        fmt_aux_t *ifmt = &fmt[i];
3356
66.2k
        if ( ifmt->size==-1 ) continue; // already marked for removal
3357
323k
        for (j=0; j<i; j++)
3358
301k
        {
3359
301k
            fmt_aux_t *jfmt = &fmt[j];
3360
301k
            if ( jfmt->size==-1 ) continue; // already marked for removal
3361
157k
            if ( ifmt->key!=jfmt->key ) continue;
3362
44.1k
            static int warned = 0;
3363
44.1k
            if ( !warned ) hts_log_warning("Duplicate FORMAT tag %s at %s:%"PRIhts_pos, bcf_hdr_int2id(h,BCF_DT_ID,ifmt->key), bcf_seqname_safe(h,v), v->pos+1);
3364
44.1k
            warned = 1;
3365
44.1k
            v->errcode |= BCF_ERR_TAG_INVALID;
3366
44.1k
            ifmt->size = -1;
3367
44.1k
            ifmt->offset = 0;
3368
44.1k
            break;
3369
157k
        }
3370
66.2k
    }
3371
17.2k
    return 0;
3372
17.2k
}
3373
3374
// Fill the sample fields
3375
/*
 * Step 5 of FORMAT parsing: walk the sample columns a second time and store
 * each value into the per-key arrays reserved earlier (fmt[j].buf), one slot
 * of fmt[j].size bytes per kept sample.
 *
 * The sample text is expected to have had its column-separating tabs
 * replaced by NUL already.  Within a sample, fields are separated by ':'.
 * Fields marked size == -1 are skipped.  Keys for which a sample supplies no
 * field are filled with a missing value followed by vector-end padding.
 *
 * Returns 0 on success; -1 when a field buffer is NULL, a GT value is
 * unreadable or too large, the value type is unknown, or a field is followed
 * by an unexpected character.
 */
static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
                                  const char *p, const char *q, fmt_aux_t *fmt) {
    static int extreme_val_warned = 0;
    int n_sample_ori = -1;
    // At beginning of the loop t points to the first char of a format
    const char *t = q + 1;
    int m = 0;   // m: sample id
    const int nsamples = bcf_hdr_nsamples(h);
    const char *end = s->s + s->l;

    int ver = bcf_get_version(h, NULL);

    while ( t<end )
    {
        // can we skip some samples?
        if ( h->keep_samples )
        {
            n_sample_ori++;
            if ( !bit_array_test(h->keep_samples,n_sample_ori) )
            {
                // Test the bound before dereferencing.
                while ( t<end && *t ) t++;
                t++;
                continue;
            }
        }
        if ( m == nsamples ) break;

        int j = 0; // j-th format field, m-th sample
        while ( t < end )
        {
            fmt_aux_t *z = &fmt[j++];
            const int htype = z->y>>4&0xf;
            if (!z->buf) {
                hts_log_error("Memory allocation failure for FORMAT field type %d at %s:%"PRIhts_pos,
                              htype, bcf_seqname_safe(h,v), v->pos+1);
                v->errcode |= BCF_ERR_LIMITS;
                return -1;
            }

            if ( z->size==-1 )
            {
                // this field is to be ignored, it's either too big or a duplicate
                while ( *t != ':' && *t ) t++;
            }
            else if (htype == BCF_HT_STR) {
                int l;
                if (z->is_gt) {
                    // Genotypes.
                    // ([/|])?<val>([|/]<val>)*  where <val> is [0-9]+ or ".".
                    int32_t is_phased = 0;
                    uint32_t *x = (uint32_t*)(z->buf + z->size * (size_t)m);
                    uint32_t unreadable = 0;
                    uint32_t max = 0;
                    int overflow = 0, ploidy = 0, anyunphased = 0,
                        phasingprfx = 0, unknown1 = 0;

                    /* with prefixed phasing, it is explicitly given for 1st one
                    with non-prefixed, set based on ploidy and phasing of other
                    alleles. */
                    if (ver >= VCF44 && (*t == '|' || *t == '/')) {
                        // cache prefix and phasing status
                        is_phased = *t++ == '|';
                        phasingprfx = 1;
                    }

                    for (l = 0;; ++t) {
                        ploidy++;
                        if (*t == '.') {
                            ++t, x[l++] = is_phased;
                            if (l==1) {   //for 1st allele only
                                unknown1 = 1;
                            }
                        } else {
                            const char *tt = t;
                            uint32_t val;
                            // Or "v->n_allele < 10", but it doesn't
                            // seem to be any faster and this feels safer.
                            if (*t >= '0' && *t <= '9' &&
                                !(t[1] >= '0' && t[1] <= '9')) {
                                val = *t++ - '0';
                            } else {
                                // Leave two spare bits so (val + 1) << 1
                                // below still fits in 32 bits.  The width
                                // must be built from CHAR_BIT, as in the
                                // integer path; CHAR_MAX gave a budget of
                                // several hundred bits for a 32-bit value.
                                val = hts_str2uint(t, (char **)&t,
                                                   sizeof(val) * CHAR_BIT - 2,
                                                   &overflow);
                                unreadable |= tt == t;
                            }
                            if (max < val) max = val;
                            x[l++] = (val + 1) << 1 | is_phased;
                        }
                        anyunphased |= (ploidy != 1) && !is_phased;
                        is_phased = (*t == '|');
                        if (*t != '|' && *t != '/') break;
                    }
                    if (!phasingprfx) { //get GT in v44 way when no prefixed phasing
                        /* no explicit phasing for 1st allele, set based on
                         other alleles and ploidy */
                        if (ploidy == 1) {  //implicitly phased
                            if (!unknown1) {
                                x[0] |= 1;
                            }
                        } else {            //set by other unphased alleles
                            x[0] |= (anyunphased)? 0 : 1;
                        }
                    }
                    // Possibly check max against v->n_allele instead?
                    if (overflow || max > (INT32_MAX >> 1) - 1) {
                        hts_log_error("Couldn't read GT data: value too large at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
                        return -1;
                    }
                    if (unreadable) {
                        hts_log_error("Couldn't read GT data: value not a number or '.' at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
                        return -1;
                    }
                    if ( !l ) x[l++] = 0;   // An empty field, insert missing value
                    for (; l < z->size>>2; ++l)
                        x[l] = bcf_int32_vector_end;

                } else {
                    // Otherwise arbitrary strings
                    char *x = (char*)z->buf + z->size * (size_t)m;
                    for (l = 0; *t != ':' && *t; ++t)
                        x[l++] = *t;
                    if (z->size > l)
                        memset(&x[l], 0, (z->size-l) * sizeof(*x));
                }

            } else if (htype == BCF_HT_INT) {
                // One or more integers in an array
                int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
                int l;
                for (l = 0;; ++t) {
                    if (*t == '.') {
                        x[l++] = bcf_int32_missing, ++t; // ++t to skip "."
                    } else {
                        int overflow = 0;
                        char *te;
                        long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
                        if ( te==t || overflow || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 )
                        {
                            if ( !extreme_val_warned )
                            {
                                hts_log_warning("Extreme FORMAT/%s value encountered and set to missing at %s:%"PRIhts_pos,
                                                h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname_safe(h,v), v->pos+1);
                                extreme_val_warned = 1;
                            }
                            tmp_val = bcf_int32_missing;
                        }
                        x[l++] = tmp_val;
                        t = te;
                    }
                    if (*t != ',') break;
                }
                if ( !l )
                    x[l++] = bcf_int32_missing;
                for (; l < z->size>>2; ++l)
                    x[l] = bcf_int32_vector_end;

            } else if (htype == BCF_HT_REAL) {
                // One or more floating point values in an array
                float *x = (float*)(z->buf + z->size * (size_t)m);
                int l;
                for (l = 0;; ++t) {
                    if (*t == '.' && !isdigit_c(t[1])) {
                        bcf_float_set_missing(x[l++]), ++t; // ++t to skip "."
                    } else {
                        int overflow = 0;
                        char *te;
                        float tmp_val = hts_str2dbl(t, &te, &overflow);
                        if ( (te==t || overflow) && !extreme_val_warned )
                        {
                            hts_log_warning("Extreme FORMAT/%s value encountered at %s:%"PRIhts_pos, h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname_safe(h,v), v->pos+1);
                            extreme_val_warned = 1;
                        }
                        x[l++] = tmp_val;
                        t = te;
                    }
                    if (*t != ',') break;
                }
                if ( !l )
                    // An empty field, insert missing value
                    bcf_float_set_missing(x[l++]);
                for (; l < z->size>>2; ++l)
                    bcf_float_set_vector_end(x[l]);
            } else {
                hts_log_error("Unknown FORMAT field type %d at %s:%"PRIhts_pos, htype, bcf_seqname_safe(h,v), v->pos+1);
                v->errcode |= BCF_ERR_TAG_INVALID;
                return -1;
            }

            // After a field only NUL (end of sample) or ':' is acceptable.
            if (*t == '\0') {
                break;
            }
            else if (*t == ':') {
                t++;
            }
            else {
                char buffer[8];
                hts_log_error("Invalid character %s in '%s' FORMAT field at %s:%"PRIhts_pos"",
                    hts_strprint(buffer, sizeof buffer, '\'', t, 1),
                    h->id[BCF_DT_ID][z->key].key, bcf_seqname_safe(h,v), v->pos+1);
                v->errcode |= BCF_ERR_CHAR;
                return -1;
            }
        }

        // fill end-of-vector values
        for (; j < v->n_fmt; ++j) {
            fmt_aux_t *z = &fmt[j];
            const int htype = z->y>>4&0xf;
            int l;

            if (z->size == -1) // this field is to be ignored
                continue;

            if (htype == BCF_HT_STR) {
                if (z->is_gt) {
                    int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
                    if (z->size) x[0] = bcf_int32_missing;
                    for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
                } else {
                    char *x = (char*)z->buf + z->size * (size_t)m;
                    if ( z->size ) {
                        x[0] = '.';
                        memset(&x[1], 0, (z->size-1) * sizeof(*x));
                    }
                }
            } else if (htype == BCF_HT_INT) {
                int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
                x[0] = bcf_int32_missing;
                for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
            } else if (htype == BCF_HT_REAL) {
                float *x = (float*)(z->buf + z->size * (size_t)m);
                bcf_float_set_missing(x[0]);
                for (l = 1; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]);
            }
        }

        m++; t++;
    }

    return 0;
}
3617
3618
// write individual genotype information
3619
/*
 * Step 6 of FORMAT parsing: serialise every kept FORMAT key and its
 * per-sample array into v->indiv, then remove the dropped keys
 * (size == -1) from fmt[] and from v->n_fmt.
 *
 * Nothing is encoded when v->n_sample is 0, and in that case no keys are
 * removed either.
 *
 * Returns 0 on success, or -1 with BCF_ERR_LIMITS set if appending to
 * v->indiv failed.
 */
static int vcf_parse_format_gt6(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
                                const char *p, const char *q, fmt_aux_t *fmt) {
    kstring_t *str = &v->indiv;
    int i, need_downsize = 0;

    if (v->n_sample > 0) {
        for (i = 0; i < v->n_fmt; ++i) {
            fmt_aux_t *z = &fmt[i];
            const int htype = z->y>>4&0xf;
            int e = 0;   // becomes non-zero as soon as any append fails

            if ( z->size==-1 ) {
                need_downsize = 1;
                continue;
            }

            e |= bcf_enc_int1(str, z->key) < 0;
            if (htype == BCF_HT_STR && !z->is_gt) {
                e |= bcf_enc_size(str, z->size, BCF_BT_CHAR) < 0;
                e |= kputsn((char*)z->buf, z->size * (size_t)v->n_sample, str) < 0;
            } else if (htype == BCF_HT_INT || z->is_gt) {
                e |= bcf_enc_vint(str, (z->size>>2) * v->n_sample, (int32_t*)z->buf, z->size>>2) < 0;
            } else {
                e |= bcf_enc_size(str, z->size>>2, BCF_BT_FLOAT) < 0;
                e |= serialize_float_array(str, (z->size>>2) * (size_t)v->n_sample,
                                           (float *) z->buf) != 0;
            }

            if (e) {
                v->errcode |= BCF_ERR_LIMITS;
                hts_log_error("Out of memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
                return -1;
            }
        }
    }

    if ( need_downsize ) {
        // Stable in-place compaction: slide the kept entries down over the
        // dropped ones in a single pass.
        int kept = 0;
        for (i = 0; i < v->n_fmt; ++i) {
            if ( fmt[i].size==-1 ) continue;
            if ( kept != i ) fmt[kept] = fmt[i];
            kept++;
        }
        v->n_fmt = kept;
    }
    return 0;
}
3662
3663
// validity checking
3664
17.0k
/*
 * Step 7 of FORMAT parsing: final sanity checks on the parsed record.
 *
 * Returns 0 when the number of samples parsed equals the header's sample
 * count and the encoded FORMAT data in v->indiv is no longer than
 * 0xffffffff bytes; otherwise logs an error, sets BCF_ERR_NCOLS or
 * BCF_ERR_LIMITS respectively, and returns -1.
 */
static int vcf_parse_format_check7(const bcf_hdr_t *h, bcf1_t *v) {
    const int hdr_samples = bcf_hdr_nsamples(h);

    if ( v->n_sample == hdr_samples ) {
        if ( v->indiv.l <= 0xffffffff )
            return 0;

        hts_log_error("The FORMAT at %s:%"PRIhts_pos" is too long", bcf_seqname_safe(h,v), v->pos+1);
        v->errcode |= BCF_ERR_LIMITS;

        // Error recovery: return -1 if this is a critical error or 0 if we want to ignore the FORMAT and proceed
        v->n_fmt = 0;
        return -1;
    }

    hts_log_error("Number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)",
        bcf_seqname_safe(h,v), v->pos+1, v->n_sample, hdr_samples);
    v->errcode |= BCF_ERR_NCOLS;
    return -1;
}
3684
3685
// p,q is the start and the end of the FORMAT field
3686
/*
 * Parse the FORMAT column and all sample columns of one VCF line into v.
 *
 * s     the whole VCF line
 * p, q  start and end of the FORMAT column text
 *
 * Does nothing and returns 0 when the header declares no samples.  Otherwise
 * h->mem is reset and used as scratch space, and the seven numbered helper
 * stages run in order.
 *
 * Returns 0 on success (including FORMAT "."), -1 on any stage failure.
 */
static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
                            char *p, char *q)
{
    if ( !bcf_hdr_nsamples(h) ) return 0;
    kstring_t *mem = (kstring_t*)&h->mem;
    mem->l = 0;

    fmt_aux_t fmt[MAX_N_FMT];

    // detect FORMAT "."
    // +ve = FORMAT is "." so there is nothing more to do, -ve = error,
    // 0 = carry on.  The sign must be tested here: a plain truth test on a
    // value already known to be non-zero would report errors as success.
    int ret = vcf_parse_format_empty1(s, h, v, p, q);
    if (ret != 0)
        return ret > 0 ? 0 : -1;

    // get format information from the dictionary
    if (vcf_parse_format_dict2(s, h, v, p, q, fmt) < 0)
        return -1;

    // FORMAT data is per-sample A:B:C A:B:C A:B:C ... but in memory it is
    // stored as per-type arrays AAA... BBB... CCC...  This is basically
    // a data rotation or pivot.

    // The size of elements in the array grow to their maximum needed,
    // permitting fast random access.  This means however we have to first
    // scan the whole FORMAT line to find the maximum of each type, and
    // then scan it again to store the data.
    // We break this down into compute-max, allocate, fill-out-buffers

    // TODO: ?
    // The alternative would be to pivot on the first pass, with fixed
    // size entries for numerics and concatenated strings otherwise, also
    // tracking maximum sizes.  Then on a second pass we reallocate and
    // copy the data again to a uniformly sized array.  Two passes through
    // memory, but without doubling string parsing.

    // compute max
    if (vcf_parse_format_max3(s, h, v, p, q, fmt) < 0)
        return -1;

    // allocate memory for arrays
    if (vcf_parse_format_alloc4(s, h, v, p, q, fmt) < 0)
        return -1;

    // fill the sample fields
    if (vcf_parse_format_fill5(s, h, v, p, q, fmt) < 0)
        return -1;

    // write individual genotype information
    if (vcf_parse_format_gt6(s, h, v, p, q, fmt) < 0)
        return -1;

    // validity checking
    if (vcf_parse_format_check7(h, v) < 0)
        return -1;

    return 0;
}
3743
3744
6.47k
/*
 * Simple error recovery for chromosomes not defined in the header: add a
 * "##contig=<ID=p>" line to the header and look the name up again.  It will
 * not help when the VCF header has already been printed, but will enable
 * tools like vcfcheck to proceed.
 *
 * Returns the dictionary slot for p, or kh_end(d) if the header line could
 * not be built, added or synced (or if p is still not in the dictionary).
 */
static khint_t fix_chromosome(const bcf_hdr_t *h, vdict_t *d, const char *p) {
    kstring_t tmp = {0,0,0};
    int l;

    if (ksprintf(&tmp, "##contig=<ID=%s>", p) < 0) {
        free(tmp.s);   // the buffer may have been allocated before the failure
        return kh_end(d);
    }
    bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
    free(tmp.s);

    int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
    if (res < 0) bcf_hrec_destroy(hrec);
    if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);

    // Treat a failed add or sync the way the FORMAT and FILTER recovery
    // paths do: report "not found" rather than hand back a slot for a
    // header that is not in a consistent state.
    if (res != 0)
        return kh_end(d);

    return kh_get(vdict, d, p);
}
3762
3763
53.6k
/*
 * Parse the ';'-separated FILTER column text in [p, q) and append the
 * filters' dictionary ids to str as a BCF integer vector.
 *
 * A trailing ';' is removed, and each ';' separator is overwritten with NUL
 * as tokens are consumed.  A filter missing from the header gets a dummy
 * ##FILTER line added and BCF_ERR_TAG_UNDEF is recorded in v->errcode.
 *
 * Returns 0 on success; -1 with v->errcode updated on allocation failure or
 * when a dummy header line could not be added.
 */
static int vcf_parse_filter(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) {
    int i, n_flt = 1;
    char *r, *t;
    int32_t *a_flt;
    ks_tokaux_t aux1;
    khint_t k;
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];

    // count the number of filters
    if (*(q-1) == ';') *(q-1) = 0;
    for (r = p; *r; ++r)
        if (*r == ';') ++n_flt;

    a_flt = malloc(n_flt * sizeof(*a_flt));
    if (!a_flt) {
        hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
        v->errcode |= BCF_ERR_LIMITS; // No appropriate code?
        return -1;
    }

    // add filters
    for (t = kstrtok(p, ";", &aux1), i = 0; t; t = kstrtok(0, 0, &aux1)) {
        *(char*)aux1.p = 0;   // terminate the current filter name in place
        k = kh_get(vdict, d, t);
        if (k == kh_end(d))
        {
            // Simple error recovery for FILTERs not defined in the header. It will not help when VCF header has
            // been already printed, but will enable tools like vcfcheck to proceed.
            hts_log_warning("FILTER '%s' is not defined in the header", t);

            kstring_t tmp = {0,0,0};
            int l;
            bcf_hrec_t *hrec = NULL;
            // Only parse the dummy line if it was actually formatted; on
            // allocation failure tmp.s may be NULL or incomplete.
            if (ksprintf(&tmp, "##FILTER=<ID=%s,Description=\"Dummy\">", t) >= 0)
                hrec = bcf_hdr_parse_line(h,tmp.s,&l);
            free(tmp.s);

            int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
            if (res < 0) bcf_hrec_destroy(hrec);
            if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);

            k = kh_get(vdict, d, t);
            v->errcode |= BCF_ERR_TAG_UNDEF;
            if (res || k == kh_end(d)) {
                hts_log_error("Could not add dummy header for FILTER '%s' at %s:%"PRIhts_pos, t, bcf_seqname_safe(h,v), v->pos+1);
                v->errcode |= BCF_ERR_TAG_INVALID;
                free(a_flt);
                return -1;
            }
        }
        a_flt[i++] = kh_val(d, k).id;
    }

    if (bcf_enc_vint(str, n_flt, a_flt, -1) < 0) {
        hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
        v->errcode |= BCF_ERR_LIMITS;
        free(a_flt);
        return -1;
    }
    free(a_flt);

    return 0;
}
3817
3818
57.5k
// Parse the INFO column of one VCF line and append its encoding to str.
//
//   str  output buffer receiving the encoded key/value pairs
//   h    header; keys are looked up in h->dict[BCF_DT_ID].  Although declared
//        const, the header is modified (via a cast) when an undefined key
//        forces a dummy ##INFO line to be added.
//   v    record being built; v->n_info counts the entries emitted and
//        v->errcode accumulates BCF_ERR_* flags
//   p,q  start of the INFO text and the position just past its end.  The
//        text is tokenised in place: delimiters are overwritten with NUL.
//
// Returns 0 on success, or -1 on failure (too many entries, a dummy header
// line could not be added, or memory allocation failed).
static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) {
3819
57.5k
    // One-shot flags so each kind of warning is only logged once
    static int extreme_int_warned = 0, negative_rlen_warned = 0;
3820
57.5k
    int max_n_val = 0, overflow = 0;
3821
57.5k
    char *r, *key;
3822
57.5k
    khint_t k;
3823
57.5k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
3824
57.5k
    // Scratch array for numeric values, grown on demand and freed on exit
    int32_t *a_val = NULL;
3825
3826
57.5k
    v->n_info = 0;
3827
57.5k
    // Overwrite a ';' sitting immediately before q with a NUL terminator
    if (*(q-1) == ';') *(q-1) = 0;
3828
3.52M
    for (r = key = p;; ++r) {
3829
3.52M
        int c;
3830
3.52M
        char *val, *end;
3831
318M
        // Advance r to the end of the key: stop at ';', '=' or NUL
        while (*r > '=' || (*r != ';' && *r != '=' && *r != 0)) r++;
3832
3.52M
        if (v->n_info == UINT16_MAX) {
3833
7
            hts_log_error("Too many INFO entries at %s:%"PRIhts_pos,
3834
7
                          bcf_seqname_safe(h,v), v->pos+1);
3835
7
            v->errcode |= BCF_ERR_LIMITS;
3836
7
            goto fail;
3837
7
        }
3838
3.52M
        val = end = NULL;
3839
3.52M
        // Remember the delimiter in c and NUL-terminate the key in place
        c = *r; *r = 0;
3840
3.52M
        if (c == '=') {
3841
1.10M
            val = r + 1;
3842
3843
267M
            // The value runs up to the next ';' or the end of the string
            for (end = val; *end != ';' && *end != 0; ++end);
3844
1.10M
            c = *end; *end = 0;
3845
2.42M
        } else end = r;
3846
3.52M
        if ( !*key ) { if (c==0) break; r = end; key = r + 1; continue; }  // faulty VCF, ";;" in the INFO
3847
3.46M
        k = kh_get(vdict, d, key);
3848
3.46M
        if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_INFO] == 15)
3849
43.7k
        {
3850
43.7k
            // Key has no usable INFO definition: synthesise a dummy
            // Type=String header line, add it to the header and retry
            // the lookup.  The record is flagged BCF_ERR_TAG_UNDEF.
            hts_log_warning("INFO '%s' is not defined in the header, assuming Type=String", key);
3851
43.7k
            kstring_t tmp = {0,0,0};
3852
43.7k
            int l;
3853
43.7k
            ksprintf(&tmp, "##INFO=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", key);
3854
43.7k
            bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
3855
43.7k
            free(tmp.s);
3856
43.7k
            int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
3857
43.7k
            if (res < 0) bcf_hrec_destroy(hrec);
3858
43.7k
            if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
3859
43.7k
            k = kh_get(vdict, d, key);
3860
43.7k
            v->errcode |= BCF_ERR_TAG_UNDEF;
3861
43.7k
            if (res || k == kh_end(d)) {
3862
138
                hts_log_error("Could not add dummy header for INFO '%s' at %s:%"PRIhts_pos, key, bcf_seqname_safe(h,v), v->pos+1);
3863
138
                v->errcode |= BCF_ERR_TAG_INVALID;
3864
138
                goto fail;
3865
138
            }
3866
43.7k
        }
3867
3.46M
        // y is the header's packed INFO descriptor for this key; the field
        // (y>>4&0xf) is compared against the BCF_HT_* type codes below
        uint32_t y = kh_val(d, k).info[BCF_HL_INFO];
3868
3.46M
        ++v->n_info;
3869
3.46M
        bcf_enc_int1(str, kh_val(d, k).id);
3870
3.46M
        if (val == 0) {
3871
2.36M
            // Key without "=value": encode an empty, typeless value
            bcf_enc_size(str, 0, BCF_BT_NULL);
3872
2.36M
        } else if ((y>>4&0xf) == BCF_HT_FLAG || (y>>4&0xf) == BCF_HT_STR) { // if Flag has a value, treat it as a string
3873
32.9k
            bcf_enc_vchar(str, end - val, val);
3874
1.06M
        } else { // int/float value/array
3875
1.06M
            int i, n_val;
3876
1.06M
            char *t, *te;
3877
261M
            for (t = val, n_val = 1; *t; ++t) // count the number of values
3878
260M
                if (*t == ',') ++n_val;
3879
            // Check both int and float size in one step for simplicity
3880
1.06M
            if (n_val > max_n_val) {
3881
6.39k
                int32_t *a_tmp = (int32_t *)realloc(a_val, n_val * sizeof(*a_val));
3882
6.39k
                if (!a_tmp) {
3883
0
                    hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3884
0
                    v->errcode |= BCF_ERR_LIMITS; // No appropriate code?
3885
0
                    goto fail;
3886
0
                }
3887
6.39k
                a_val = a_tmp;
3888
6.39k
                max_n_val = n_val;
3889
6.39k
            }
3890
1.06M
            if ((y>>4&0xf) == BCF_HT_INT) {
3891
575k
                i = 0, t = val;
3892
575k
                int64_t val1;
3893
575k
                int is_int64 = 0;
3894
#ifdef VCF_ALLOW_INT64
3895
                if ( n_val==1 )
3896
                {
3897
                    overflow = 0;
3898
                    long long int tmp_val = hts_str2int(val, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
3899
                    if ( te==val ) tmp_val = bcf_int32_missing;
3900
                    else if ( overflow || tmp_val<BCF_MIN_BT_INT64 || tmp_val>BCF_MAX_BT_INT64 )
3901
                    {
3902
                        if ( !extreme_int_warned )
3903
                        {
3904
                            hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1);
3905
                            extreme_int_warned = 1;
3906
                        }
3907
                        tmp_val = bcf_int32_missing;
3908
                    }
3909
                    else
3910
                        is_int64 = 1;
3911
                    val1 = tmp_val;
3912
                    t = te;
3913
                    i = 1;  // this is just to avoid adding another nested block...
3914
                }
3915
#endif
3916
93.4M
                // Convert each comma-separated element; elements that do not
                // parse, overflow, or fall outside the int32 BCF range are
                // stored as bcf_int32_missing
                for (; i < n_val; ++i, ++t)
3917
92.8M
                {
3918
92.8M
                    overflow = 0;
3919
92.8M
                    long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
3920
92.8M
                    if ( te==t ) tmp_val = bcf_int32_missing;
3921
1.12M
                    else if ( overflow || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 )
3922
140k
                    {
3923
140k
                        if ( !extreme_int_warned )
3924
1
                        {
3925
1
                            hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1);
3926
1
                            extreme_int_warned = 1;
3927
1
                        }
3928
140k
                        tmp_val = bcf_int32_missing;
3929
140k
                    }
3930
92.8M
                    a_val[i] = tmp_val;
3931
131M
                    // Skip any unparsed remainder up to the next ',' or NUL
                    for (t = te; *t && *t != ','; t++);
3932
92.8M
                }
3933
575k
                if (n_val == 1) {
3934
#ifdef VCF_ALLOW_INT64
3935
                    if ( is_int64 )
3936
                    {
3937
                        v->unpacked |= BCF_IS_64BIT;
3938
                        bcf_enc_long1(str, val1);
3939
                    }
3940
                    else
3941
                        bcf_enc_int1(str, (int32_t)val1);
3942
#else
3943
453k
                    val1 = a_val[0];
3944
453k
                    bcf_enc_int1(str, (int32_t)val1);
3945
453k
#endif
3946
453k
                } else {
3947
122k
                    bcf_enc_vint(str, n_val, a_val, -1);
3948
122k
                }
3949
575k
                // val1 is only read when n_val==1, the one case where it has
                // been assigned.  The 4-byte memcmp includes the terminating
                // NUL, so only the exact key "END" matches.
                if (n_val==1 && (val1!=bcf_int32_missing || is_int64)
3950
374k
                    && memcmp(key, "END", 4) == 0)
3951
0
                {
3952
0
                    if ( val1 <= v->pos )
3953
0
                    {
3954
0
                        if ( !negative_rlen_warned )
3955
0
                        {
3956
0
                            hts_log_warning("INFO/END=%"PRIhts_pos" is smaller than POS at %s:%"PRIhts_pos,val1,bcf_seqname_safe(h,v),v->pos+1);
3957
0
                            negative_rlen_warned = 1;
3958
0
                        }
3959
0
                    }
3960
0
                }
3961
575k
            } else if ((y>>4&0xf) == BCF_HT_REAL) {
3962
492k
                // Reuse the int32_t scratch buffer as float storage
                float *val_f = (float *)a_val;
3963
91.4M
                for (i = 0, t = val; i < n_val; ++i, ++t)
3964
90.9M
                {
3965
90.9M
                    overflow = 0;
3966
90.9M
                    val_f[i] = hts_str2dbl(t, &te, &overflow);
3967
90.9M
                    if ( te==t || overflow ) // conversion failed
3968
89.5M
                        bcf_float_set_missing(val_f[i]);
3969
117M
                    for (t = te; *t && *t != ','; t++);
3970
90.9M
                }
3971
492k
                bcf_enc_vfloat(str, n_val, val_f);
3972
492k
            }
3973
1.06M
        }
3974
3.46M
        // c holds the delimiter that ended this entry; NUL means end of INFO
        if (c == 0) break;
3975
3.43M
        r = end;
3976
3.43M
        key = r + 1;
3977
3.43M
    }
3978
3979
57.4k
    free(a_val);
3980
57.4k
    return 0;
3981
3982
145
 fail:
3983
145
    free(a_val);
3984
145
    return -1;
3985
57.5k
}
3986
3987
/*
 * Parse one tab-separated VCF data line held in s into the record v.
 *
 *   s  line to parse.  It is tokenised in place (field separators are
 *      overwritten with NUL) and grown by four zeroed padding bytes.
 *   h  header used to resolve contig, FILTER, INFO and FORMAT names.
 *      Declared const, but helpers may add dummy definitions to it.
 *   v  record to fill; it is cleared first.  v->errcode accumulates
 *      BCF_ERR_* flags for problems that were detected.
 *
 * Fields are consumed in order CHROM, POS, ID, REF, ALT, QUAL, FILTER,
 * INFO and, optionally, FORMAT plus samples.  v->max_unpack can stop the
 * parse early after QUAL, FILTER or INFO.
 *
 * Returns 0 on success and -2 on any failure (NULL arguments, allocation
 * failure, missing or malformed fields, or a failing sub-parser).
 */
int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
{
    int ret = -2, overflow = 0;
    char *p, *q, *r, *t;
    kstring_t *str;
    khint_t k;
    ks_tokaux_t aux;

    // Non-zero unless the NUL-terminated field p is exactly ".".
    // Compares two bytes, so it needs the padding added below.
#define NOT_DOT(p) (memcmp(p, ".\0", 2))

    if (!s || !h || !v || !(s->s))
        return ret;

    // Assumed in lots of places, but we may as well spot this early
    assert(sizeof(float) == sizeof(int32_t));

    // Ensure the string we parse has spare space to permit some overrun
    // during parsing.  Eg to do memcmp(key, "END", 4) in vcf_parse_info
    // rather than the more straightforward looking strcmp, giving a speed
    // advantage.
    if (ks_resize(s, s->l+4) < 0)
        return -2;

    // Force our memory to be initialised so we avoid the technicality of
    // undefined behaviour in using a 4-byte memcmp.  (The reality is this
    // almost certainly is never detected by the compiler so has no impact,
    // but equally so this code has minimal (often beneficial) impact on
    // performance too.)
    s->s[s->l+0] = 0;
    s->s[s->l+1] = 0;
    s->s[s->l+2] = 0;
    s->s[s->l+3] = 0;

    bcf_clear1(v);
    str = &v->shared;
    memset(&aux, 0, sizeof(ks_tokaux_t));

    // CHROM
    if (!(p = kstrtok(s->s, "\t", &aux)))
        goto err;
    *(q = (char*)aux.p) = 0;

    vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
    k = kh_get(vdict, d, p);
    if (k == kh_end(d)) {
        hts_log_warning("Contig '%s' is not defined in the header. (Quick workaround: index the file with tabix.)", p);
        v->errcode = BCF_ERR_CTG_UNDEF;
        if ((k = fix_chromosome(h, d, p)) == kh_end(d)) {
            hts_log_error("Could not add dummy header for contig '%s'", p);
            v->errcode |= BCF_ERR_CTG_INVALID;
            goto err;
        }
    }
    v->rid = kh_val(d, k).id;

    // POS
    if (!(p = kstrtok(0, 0, &aux)))
        goto err;
    *(q = (char*)aux.p) = 0;

    overflow = 0;
    char *tmp = p;
    v->pos = hts_str2uint(p, &p, 62, &overflow);
    if (overflow) {
        hts_log_error("Position value '%s' is too large", tmp);
        goto err;
    } else if ( p == tmp || *p ) {
        // Either the end pointer did not advance (nothing was parsed, e.g.
        // an empty POS field) or unparsed characters follow the number.
        hts_log_error("Could not parse the position '%s'", tmp);
        goto err;
    } else {
        v->pos -= 1;    // stored 0-based
    }
    if (v->pos >= INT32_MAX)
        v->unpacked |= BCF_IS_64BIT;

    // ID
    if (!(p = kstrtok(0, 0, &aux)))
        goto err;
    *(q = (char*)aux.p) = 0;

    if (NOT_DOT(p)) bcf_enc_vchar(str, q - p, p);
    else bcf_enc_size(str, 0, BCF_BT_CHAR);

    // REF
    if (!(p = kstrtok(0, 0, &aux)))
        goto err;
    *(q = (char*)aux.p) = 0;

    bcf_enc_vchar(str, q - p, p);
    v->n_allele = 1, v->rlen = q - p;

    // ALT
    if (!(p = kstrtok(0, 0, &aux)))
        goto err;
    *(q = (char*)aux.p) = 0;

    if (NOT_DOT(p)) {
        // Emit one allele per comma-separated piece; t marks the start of
        // the current piece and the loop ends once r reaches the field end q.
        for (r = t = p;; ++r) {
            if (*r == ',' || *r == 0) {
                if (v->n_allele == UINT16_MAX) {
                    hts_log_error("Too many ALT alleles at %s:%"PRIhts_pos,
                                  bcf_seqname_safe(h,v), v->pos+1);
                    v->errcode |= BCF_ERR_LIMITS;
                    goto err;
                }
                bcf_enc_vchar(str, r - t, t);
                t = r + 1;
                ++v->n_allele;
            }
            if (r == q) break;
        }
    }

    // QUAL
    if (!(p = kstrtok(0, 0, &aux)))
        goto err;
    *(q = (char*)aux.p) = 0;

    if (NOT_DOT(p)) v->qual = atof(p);
    else bcf_float_set_missing(v->qual);
    if ( v->max_unpack && !(v->max_unpack>>1) ) goto end; // BCF_UN_STR

    // FILTER
    if (!(p = kstrtok(0, 0, &aux)))
        goto err;
    *(q = (char*)aux.p) = 0;

    if (NOT_DOT(p)) {
        if (vcf_parse_filter(str, h, v, p, q)) {
            goto err;
        }
    } else bcf_enc_vint(str, 0, 0, -1);
    if ( v->max_unpack && !(v->max_unpack>>2) ) goto end; // BCF_UN_FLT

    // INFO
    if (!(p = kstrtok(0, 0, &aux)))
        goto err;
    *(q = (char*)aux.p) = 0;

    if (NOT_DOT(p)) {
        if (vcf_parse_info(str, h, v, p, q)) {
            goto err;
        }
    }
    if ( v->max_unpack && !(v->max_unpack>>3) ) goto end;

    // FORMAT; optional
    p = kstrtok(0, 0, &aux);
    if (p) {
        *(q = (char*)aux.p) = 0;

        if (vcf_parse_format(s, h, v, p, q)) {
            goto err;
        }
    }

 end:
    v->rlen = get_rlen(h, v);    //set rlen based on version
    ret = 0;

 err:
    return ret;
}
4153
4154
int vcf_open_mode(char *mode, const char *fn, const char *format)
4155
0
{
4156
0
    if (format == NULL) {
4157
        // Try to pick a format based on the filename extension
4158
0
        char extension[HTS_MAX_EXT_LEN];
4159
0
        if (find_file_extension(fn, extension) < 0) return -1;
4160
0
        return vcf_open_mode(mode, fn, extension);
4161
0
    }
4162
0
    else if (strcasecmp(format, "bcf") == 0) strcpy(mode, "b");
4163
0
    else if (strcasecmp(format, "vcf") == 0) strcpy(mode, "");
4164
0
    else if (strcasecmp(format, "vcf.gz") == 0 || strcasecmp(format, "vcf.bgz") == 0) strcpy(mode, "z");
4165
0
    else return -1;
4166
4167
0
    return 0;
4168
0
}
4169
4170
/*
 * Read the next line from fp into fp->line and parse it into v.
 *
 * Returns the negative value from hts_getline() if no line could be read,
 * otherwise the result of vcf_parse1() on that line.
 */
int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
{
    int status = hts_getline(fp, KS_SEP_LINE, &fp->line);
    if (status >= 0)
        status = vcf_parse1(&fp->line, h, v);
    return status;
}
4177
4178
/*
 * Decode the header of one FORMAT field starting at ptr and fill in fmt.
 *
 * Reads the typed key id and the per-sample element count/type, records
 * where the values start (fmt->p, and fmt->p_off relative to the start of
 * the field), and clears fmt->p_free.
 *
 * Returns the address just past the values of all n_sample samples.
 */
static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt)
{
    uint8_t *const field_start = ptr;
    uint8_t *values_end;

    fmt->id   = bcf_dec_typed_int1(ptr, &ptr);
    fmt->n    = bcf_dec_size(ptr, &ptr, &fmt->type);
    fmt->size = fmt->n << bcf_type_shift[fmt->type];   // bytes per sample

    fmt->p      = ptr;
    fmt->p_off  = ptr - field_start;
    fmt->p_free = 0;

    values_end = ptr + n_sample * fmt->size;
    fmt->p_len = values_end - fmt->p;
    return values_end;
}
4191
4192
/*
 * Decode one INFO field starting at ptr and fill in info.
 *
 * Reads the typed key and the value count/type, records where the values
 * start (info->vptr, and info->vptr_off relative to the start of the
 * field) and clears info->vptr_free.  When there is exactly one value of a
 * recognised type it is also decoded into info->v1; otherwise v1.i is 0.
 *
 * Returns the address just past the field's values.
 */
static inline uint8_t *bcf_unpack_info_core1(uint8_t *ptr, bcf_info_t *info)
{
    uint8_t *const field_start = ptr;
    int64_t n_bytes;

    info->key = bcf_dec_typed_int1(ptr, &ptr);
    info->len = bcf_dec_size(ptr, &ptr, &info->type);
    n_bytes = info->len;

    info->vptr      = ptr;
    info->vptr_off  = ptr - field_start;
    info->vptr_free = 0;
    info->v1.i      = 0;

    if (info->len != 1) {
        // Vector (or empty) value: element size comes from the type table
        n_bytes <<= bcf_type_shift[info->type];
    } else {
        // Scalar: decode it now and widen the byte count to match its type.
        // Unrecognised types leave v1 zeroed and the byte count at 1.
        switch (info->type) {
        case BCF_BT_INT8:
        case BCF_BT_CHAR:
            info->v1.i = *(int8_t*)ptr;
            break;
        case BCF_BT_INT16:
            info->v1.i = le_to_i16(ptr);
            n_bytes <<= 1;
            break;
        case BCF_BT_INT32:
            info->v1.i = le_to_i32(ptr);
            n_bytes <<= 2;
            break;
        case BCF_BT_FLOAT:
            info->v1.f = le_to_float(ptr);
            n_bytes <<= 2;
            break;
        case BCF_BT_INT64:
            info->v1.i = le_to_i64(ptr);
            n_bytes <<= 3;
            break;
        }
    }

    ptr += n_bytes;
    info->vptr_len = ptr - info->vptr;
    return ptr;
}
4233
4234
// Decode the packed parts of record b selected by the BCF_UN_* bits in
// "which" into b->d, skipping any part already marked in b->unpacked.
//
// The shared buffer is walked in order: ID, then the alleles, then FILTER,
// then INFO.  The byte length of the first three sections is saved in
// b->unpack_size[0..2] so that a later call can locate the start of FILTER
// or INFO without re-decoding what precedes it.  FORMAT data is decoded
// from b->indiv.
//
// Always returns 0.
int bcf_unpack(bcf1_t *b, int which)
4235
54.7k
{
4236
54.7k
    if ( !b->shared.l ) return 0; // Building a new BCF record from scratch
4237
54.7k
    uint8_t *ptr = (uint8_t*)b->shared.s, *ptr_ori;
4238
54.7k
    int i;
4239
54.7k
    bcf_dec_t *d = &b->d;
4240
54.7k
    // Requesting FILTER also requests the strings; requesting INFO also
    // requests everything in BCF_UN_SHR
    if (which & BCF_UN_FLT) which |= BCF_UN_STR;
4241
54.7k
    if (which & BCF_UN_INFO) which |= BCF_UN_SHR;
4242
54.7k
    if ((which&BCF_UN_STR) && !(b->unpacked&BCF_UN_STR))
4243
54.7k
    {
4244
54.7k
        // tmp temporarily wraps the existing d->id / d->als buffers so the
        // kstring helpers can append to (and possibly reallocate) them
        kstring_t tmp;
4245
4246
        // ID
4247
54.7k
        tmp.l = 0; tmp.s = d->id; tmp.m = d->m_id;
4248
54.7k
        ptr_ori = ptr;
4249
54.7k
        ptr = bcf_fmt_sized_array(&tmp, ptr);
4250
54.7k
        b->unpack_size[0] = ptr - ptr_ori;
4251
54.7k
        kputc_('\0', &tmp);
4252
54.7k
        d->id = tmp.s; d->m_id = tmp.m;
4253
4254
        // REF and ALT are in a single block (d->als) and d->alleles are pointers into this block
4255
54.7k
        hts_expand(char*, b->n_allele, d->m_allele, d->allele); // NM: hts_expand() is a macro
4256
54.7k
        tmp.l = 0; tmp.s = d->als; tmp.m = d->m_als;
4257
54.7k
        ptr_ori = ptr;
4258
2.46M
        for (i = 0; i < b->n_allele; ++i) {
4259
            // Use offset within tmp.s as realloc may change pointer
4260
2.40M
            d->allele[i] = (char *)(intptr_t)tmp.l;
4261
2.40M
            ptr = bcf_fmt_sized_array(&tmp, ptr);
4262
2.40M
            kputc_('\0', &tmp);
4263
2.40M
        }
4264
54.7k
        b->unpack_size[1] = ptr - ptr_ori;
4265
54.7k
        d->als = tmp.s; d->m_als = tmp.m;
4266
4267
        // Convert our offsets within tmp.s back to pointers again
4268
2.46M
        for (i = 0; i < b->n_allele; ++i)
4269
2.40M
            d->allele[i] = d->als + (ptrdiff_t)d->allele[i];
4270
54.7k
        b->unpacked |= BCF_UN_STR;
4271
54.7k
    }
4272
54.7k
    if ((which&BCF_UN_FLT) && !(b->unpacked&BCF_UN_FLT)) { // FILTER
4273
54.7k
        // Skip over the ID and allele sections using the saved sizes
        ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1];
4274
54.7k
        ptr_ori = ptr;
4275
54.7k
        // A zero upper nibble in the descriptor byte is treated as "no
        // filters": the byte is skipped and n_flt set to 0
        if (*ptr>>4) {
4276
50.5k
            int type;
4277
50.5k
            d->n_flt = bcf_dec_size(ptr, &ptr, &type);
4278
50.5k
            hts_expand(int, d->n_flt, d->m_flt, d->flt);
4279
554k
            for (i = 0; i < d->n_flt; ++i)
4280
504k
                d->flt[i] = bcf_dec_int1(ptr, type, &ptr);
4281
50.5k
        } else ++ptr, d->n_flt = 0;
4282
54.7k
        b->unpack_size[2] = ptr - ptr_ori;
4283
54.7k
        b->unpacked |= BCF_UN_FLT;
4284
54.7k
    }
4285
54.7k
    if ((which&BCF_UN_INFO) && !(b->unpacked&BCF_UN_INFO)) { // INFO
4286
0
        ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1] + b->unpack_size[2];
4287
0
        hts_expand(bcf_info_t, b->n_info, d->m_info, d->info);
4288
0
        // Clear vptr_free on every allocated slot, not just the n_info in use
        for (i = 0; i < d->m_info; ++i) d->info[i].vptr_free = 0;
4289
0
        for (i = 0; i < b->n_info; ++i)
4290
0
            ptr = bcf_unpack_info_core1(ptr, &d->info[i]);
4291
0
        b->unpacked |= BCF_UN_INFO;
4292
0
    }
4293
54.7k
    if ((which&BCF_UN_FMT) && b->n_sample && !(b->unpacked&BCF_UN_FMT)) { // FORMAT
4294
0
        ptr = (uint8_t*)b->indiv.s;
4295
0
        hts_expand(bcf_fmt_t, b->n_fmt, d->m_fmt, d->fmt);
4296
0
        // Clear p_free on every allocated slot, not just the n_fmt in use
        for (i = 0; i < d->m_fmt; ++i) d->fmt[i].p_free = 0;
4297
0
        for (i = 0; i < b->n_fmt; ++i)
4298
0
            ptr = bcf_unpack_fmt_core1(ptr, b->n_sample, &d->fmt[i]);
4299
0
        b->unpacked |= BCF_UN_FMT;
4300
0
    }
4301
54.7k
    return 0;
4302
54.7k
}
4303
4304
/*
 * Append the VCF text form of record v, including a trailing newline, to s.
 *
 *   h  header used to turn contig, FILTER, INFO and FORMAT ids into names
 *   v  record to format.  Declared const, but its ID/REF/ALT/FILTER parts
 *      are unpacked in place via bcf_unpack() if necessary.  INFO and
 *      FORMAT are decoded directly from the packed buffers unless they
 *      have already been unpacked.
 *   s  output string; text is appended to its current contents
 *
 * Returns 0 on success.  Returns -1 if memory allocation fails or if the
 * record refers to an id that is not in the header or holds an unexpected
 * value type; the latter cases also set errno to EINVAL.  s may hold a
 * partially written line on failure.
 */
int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
{
    int i;
    int32_t max_dt_id = h->n[BCF_DT_ID];
    const char *chrom = bcf_seqname(h, v);
    if (!chrom) {
        hts_log_error("Invalid BCF, CONTIG id=%d not present in the header",
                      v->rid);
        errno = EINVAL;
        return -1;
    }

    bcf_unpack((bcf1_t*)v, BCF_UN_ALL & ~(BCF_UN_INFO|BCF_UN_FMT));

    // Cache of key lengths so we don't keep repeatedly using them.
    // This assumes we're not modifying the header between successive calls
    // to vcf_format, but that would lead to many other forms of breakage
    // so it feels like a valid assumption to make.
    //
    // We cannot just do this in bcf_hdr_sync as some code (eg bcftools
    // annotate) manipulates the headers directly without calling sync to
    // refresh the data structures.  So we must do just-in-time length
    // calculation during writes instead.
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
    if (!aux->key_len) {
        if (!(aux->key_len = calloc(h->n[BCF_DT_ID]+1, sizeof(*aux->key_len))))
            return -1;
    }
    size_t *key_len = aux->key_len;   // 0 means "not computed yet"

    kputs(chrom, s); // CHROM
    kputc_('\t', s); kputll(v->pos + 1, s); // POS
    kputc_('\t', s); kputs(v->d.id ? v->d.id : ".", s); // ID
    kputc_('\t', s); // REF
    if (v->n_allele > 0) kputs(v->d.allele[0], s);
    else kputc_('.', s);
    kputc_('\t', s); // ALT
    if (v->n_allele > 1) {
        for (i = 1; i < v->n_allele; ++i) {
            if (i > 1) kputc_(',', s);
            kputs(v->d.allele[i], s);
        }
    } else kputc_('.', s);
    kputc_('\t', s); // QUAL
    if ( bcf_float_is_missing(v->qual) ) kputc_('.', s); // QUAL
    else kputd(v->qual, s);
    kputc_('\t', s); // FILTER
    if (v->d.n_flt) {
        for (i = 0; i < v->d.n_flt; ++i) {
            int32_t idx = v->d.flt[i];
            if (idx < 0 || idx >= max_dt_id
                || h->id[BCF_DT_ID][idx].key == NULL) {
                hts_log_error("Invalid BCF, the FILTER tag id=%d at %s:%"PRIhts_pos" not present in the header",
                              idx, bcf_seqname_safe(h, v), v->pos + 1);
                errno = EINVAL;
                return -1;
            }
            if (i) kputc_(';', s);
            if (!key_len[idx])
                key_len[idx] = strlen(h->id[BCF_DT_ID][idx].key);
            kputsn(h->id[BCF_DT_ID][idx].key, key_len[idx], s);
        }
    } else kputc_('.', s);

    kputc_('\t', s); // INFO
    if (v->n_info) {
        // Start of the packed INFO data: just past ID, alleles and FILTER
        uint8_t *ptr = v->shared.s
            ? (uint8_t *)v->shared.s + v->unpack_size[0] +
               v->unpack_size[1] + v->unpack_size[2]
            : NULL;
        int first = 1;
        bcf_info_t *info = v->d.info;

        // Note if we duplicate this code into custom packed and unpacked
        // implementations then we gain a bit more speed, particularly with
        // clang 13 (up to 5%).  Not sure why this is, but code duplication
        // isn't pleasant and it's still faster adding packed support than
        // not so it's a win, just not as good as it should be.
        const int info_packed = !(v->unpacked & BCF_UN_INFO) && v->shared.l;
        for (i = 0; i < v->n_info; ++i) {
            bcf_info_t in, *z;
            if (info_packed) {
                // Use a local bcf_info_t when data is packed
                z = &in;
                z->key  = bcf_dec_typed_int1(ptr, &ptr);
                z->len  = bcf_dec_size(ptr, &ptr, &z->type);
                z->vptr = ptr;
                ptr += z->len << bcf_type_shift[z->type];
            } else {
                // Else previously unpacked INFO struct
                z = &info[i];

                // Also potentially since deleted
                if ( !z->vptr ) continue;
            }

            bcf_idpair_t *id = z->key >= 0 && z->key < max_dt_id
                ? &h->id[BCF_DT_ID][z->key]
                : NULL;

            if (!id || !id->key) {
                hts_log_error("Invalid BCF, the INFO tag id=%d is %s at %s:%"PRIhts_pos,
                              z->key,
                              z->key < 0 ? "negative"
                              : (z->key >= max_dt_id ? "too large" : "not present in the header"),
                              bcf_seqname_safe(h, v), v->pos+1);
                errno = EINVAL;
                return -1;
            }

            // KEY
            if (!key_len[z->key])
                key_len[z->key] = strlen(id->key);
            size_t id_len = key_len[z->key];
            // Reserve room for an optional ';', the key and an optional '='
            // so they can be written directly into the buffer
            if (ks_resize(s, s->l + 3 + id_len) < 0)
                return -1;
            char *sptr = s->s + s->l;
            if ( !first ) {
                *sptr++ = ';';
                s->l++;
            }
            first = 0;
            memcpy(sptr, id->key, id_len);
            s->l += id_len;

            // VALUE
            if (z->len <= 0) continue;
            sptr[id_len] = '=';
            s->l++;

            if (z->len != 1 || info_packed) {
                bcf_fmt_array(s, z->len, z->type, z->vptr);
            } else {
                // Single length vectors are unpacked into their
                // own info.v1 union and handled separately.
                if (z->type == BCF_BT_FLOAT) {
                    if ( bcf_float_is_missing(z->v1.f) )
                        kputc_('.', s);
                    else
                        kputd(z->v1.f, s);
                } else if (z->type == BCF_BT_CHAR) {
                    kputc_(z->v1.i, s);
                } else if (z->type < BCF_BT_INT64) {
                    int64_t missing[] = {
                        0, // BCF_BT_NULL
                        bcf_int8_missing,
                        bcf_int16_missing,
                        bcf_int32_missing,
                    };
                    if (z->v1.i == missing[z->type])
                        kputc_('.', s);
                    else
                        kputw(z->v1.i, s);
                } else if (z->type == BCF_BT_INT64) {
                    if (z->v1.i == bcf_int64_missing)
                        kputc_('.', s);
                    else
                        kputll(z->v1.i, s);
                } else {
                    hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, z->type, bcf_seqname_safe(h, v), v->pos+1);
                    errno = EINVAL;
                    return -1;
                }
            }
        }
        if ( first ) kputc_('.', s);
    } else kputc_('.', s);

    // FORMAT and individual information
    if (v->n_sample) {
        int j;
        if ( v->n_fmt) {
            uint8_t *ptr = (uint8_t *)v->indiv.s;
            int gt_i = -1;   // index of the GT key within fmt[], -1 if absent
            bcf_fmt_t *fmt = v->d.fmt;
            int first = 1, ret = 0;
            int fmt_packed = !(v->unpacked & BCF_UN_FMT);

            if (fmt_packed) {
                // Local fmt as we have an array of num FORMAT keys,
                // each of which points to N.Sample values.

                // No real gain to be had in handling unpacked data here,
                // but it doesn't cost us much in complexity either and
                // it gives us flexibility.
                fmt = malloc(v->n_fmt * sizeof(*fmt));
                if (!fmt)
                    return -1;
            }

            // KEYS
            for (i = 0; i < (int)v->n_fmt; ++i) {
                bcf_fmt_t *z;
                z = &fmt[i];
                if (fmt_packed) {
                    z->id   = bcf_dec_typed_int1(ptr, &ptr);
                    z->n    = bcf_dec_size(ptr, &ptr, &z->type);
                    z->p    = ptr;
                    z->size = z->n << bcf_type_shift[z->type];
                    ptr += v->n_sample * z->size;
                }
                if ( !z->p ) continue;
                kputc_(!first ? ':' : '\t', s); first = 0;

                bcf_idpair_t *id = z->id >= 0 && z->id < max_dt_id
                    ? &h->id[BCF_DT_ID][z->id]
                    : NULL;

                if (!id || !id->key) {
                    hts_log_error("Invalid BCF, the FORMAT tag id=%d at %s:%"PRIhts_pos" not present in the header", z->id, bcf_seqname_safe(h, v), v->pos+1);
                    errno = EINVAL;
                    if (fmt_packed)
                        free(fmt);
                    return -1;
                }

                if (!key_len[z->id])
                    key_len[z->id] = strlen(id->key);
                size_t id_len = key_len[z->id];
                kputsn(id->key, id_len, s);
                if (id_len == 2 && id->key[0] == 'G' && id->key[1] == 'T')
                    gt_i = i;
            }
            if ( first ) kputsn("\t.", 2, s);

            // VALUES per sample
            for (j = 0; j < v->n_sample; ++j) {
                kputc_('\t', s);
                first = 1;
                bcf_fmt_t *f = fmt;
                // Generic loop: runs until GT has been written (then breaks)
                // or, if there is no GT key, until all keys are done
                for (i = 0; i < (int)v->n_fmt; i++, f++) {
                    if ( !f->p ) continue;
                    if (!first) kputc_(':', s);
                    first = 0;
                    if (gt_i == i) {
                        if ((ret = bcf_format_gt_v2(h, f,j,s)) < 0) {
                            // Report the sample index (j), not the key index
                            hts_log_error("Failed to format GT value for sample %d, returned %d", j, ret);
                            errno = EINVAL;
                            if (fmt_packed)
                                free(fmt);
                            return -1;
                        }
                        break;
                    }
                    else if (f->n == 1)
                        bcf_fmt_array1(s, f->type, f->p + j * (size_t)f->size);
                    else
                        bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size);
                }

                // Simpler loop post GT and at least 1 iteration
                for (i++, f++; i < (int)v->n_fmt; i++, f++) {
                    if ( !f->p ) continue;
                    kputc_(':', s);
                    if (f->n == 1)
                        bcf_fmt_array1(s, f->type, f->p + j * (size_t)f->size);
                    else
                        bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size);
                }
                if ( first ) kputc_('.', s);
            }
            if (fmt_packed)
                free(fmt);
        }
        else
            // n_sample+1 placeholders are written; presumably one for the
            // FORMAT column plus one per sample
            for (j=0; j<=v->n_sample; j++)
                kputsn("\t.", 2, s);
    }
    kputc('\n', s);
    return 0;
}
4575
4576
/*
 * Write an already formatted text line to fp.
 *
 * A trailing newline is appended to `line` if it does not already end with
 * one, so the caller's kstring may be modified.  The data goes through
 * bgzf_write() when the file uses any compression and through hwrite()
 * otherwise.
 *
 * Returns 0 if the whole line was written, -1 on a short write, a write
 * error, or failure to append the newline.
 */
int vcf_write_line(htsFile *fp, kstring_t *line)
{
    ssize_t ret;

    // Check for an empty line first: indexing s[l-1] with l == 0 would read
    // before the start of the buffer (or through a NULL pointer).
    if ( line->l == 0 || line->s[line->l-1]!='\n' ) {
        if ( kputc('\n',line) < 0 ) return -1;
    }

    if ( fp->format.compression!=no_compression )
        ret = bgzf_write(fp->fp.bgzf, line->s, line->l);
    else
        ret = hwrite(fp->fp.hfile, line->s, line->l);

    // Negative results are errors; anything other than line->l is a short write.
    return (ret >= 0 && (size_t)ret == line->l) ? 0 : -1;
}
4586
4587
/*
 * Format record v as VCF text and write it to fp.
 *
 * The text is built in fp->line, which is reset on every call.  When an
 * index is attached to a BGZF-compressed file, the record's interval is
 * pushed to it after the write.
 *
 * Returns 0 on success, -1 on a formatting, flush, index or write failure.
 */
int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
{
    ssize_t nbytes;

    // Render the record into the handle's reusable line buffer.
    fp->line.l = 0;
    if (vcf_format1(h, v, &fp->line) != 0)
        return -1;

    if ( fp->format.compression==no_compression ) {
        nbytes = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
    } else {
        if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
            return -1;
        // Only amend the previous index entry when not multi-threaded.
        if (fp->idx && !fp->fp.bgzf->mt)
            hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf));
        nbytes = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
    }

    if (fp->idx && fp->format.compression == bgzf) {
        int tid = hts_idx_tbi_name(fp->idx, v->rid, bcf_seqname_safe(h, v));
        if (tid < 0)
            return -1;

        if (bgzf_idx_push(fp->fp.bgzf, fp->idx,
                          tid, v->pos, v->pos + v->rlen,
                          bgzf_tell(fp->fp.bgzf), 1) < 0)
            return -1;
    }

    return nbytes==fp->line.l ? 0 : -1;
}
4616
4617
/************************
4618
 * Data access routines *
4619
 ************************/
4620
4621
int bcf_hdr_id2int(const bcf_hdr_t *h, int which, const char *id)
4622
348k
{
4623
348k
    khint_t k;
4624
348k
    vdict_t *d = (vdict_t*)h->dict[which];
4625
348k
    k = kh_get(vdict, d, id);
4626
348k
    return k == kh_end(d)? -1 : kh_val(d, k).id;
4627
348k
}
4628
4629
4630
/********************
4631
 *** BCF indexing ***
4632
 ********************/
4633
4634
// Calculate number of index levels given min_shift and the header contig
4635
// list.  Also returns number of contigs in *nids_out.
4636
static int idx_calc_n_lvls_ids(const bcf_hdr_t *h, int *min_shift_in_out,
4637
                               int starting_n_lvls, int *nids_out)
4638
0
{
4639
0
    int n_lvls = starting_n_lvls, i, nids = 0;
4640
0
    int64_t max_len = 0;
4641
4642
0
    for (i = 0; i < h->n[BCF_DT_CTG]; ++i)
4643
0
    {
4644
0
        if ( !h->id[BCF_DT_CTG][i].val ) continue;
4645
0
        if ( max_len < h->id[BCF_DT_CTG][i].val->info[0] )
4646
0
            max_len = h->id[BCF_DT_CTG][i].val->info[0];
4647
0
        nids++;
4648
0
    }
4649
0
    if ( !max_len ) max_len = (1LL<<31) - 1;  // In case contig line is broken.
4650
4651
0
    hts_adjust_csi_settings(max_len, min_shift_in_out, &n_lvls);
4652
4653
0
    if (nids_out) *nids_out = nids;
4654
0
    return n_lvls;
4655
0
}
4656
4657
/*
 * Build an in-memory CSI index for the BCF file open on fp.
 *
 * The header is read from fp, then every record is read in turn and its
 * interval pushed to the index together with the current file offset.
 *
 * Returns the new index (to be released with hts_idx_destroy()), or NULL
 * if reading the header, allocating, reading a record, pushing an entry or
 * finalising the index fails.
 */
hts_idx_t *bcf_index(htsFile *fp, int min_shift)
{
    int n_lvls;
    bcf1_t *b = NULL;
    hts_idx_t *idx = NULL;
    bcf_hdr_t *h;
    int r;

    h = bcf_hdr_read(fp);
    if ( !h ) return NULL;

    int nids = 0;
    n_lvls = idx_calc_n_lvls_ids(h, &min_shift, 0, &nids);
    idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
    if (!idx) goto fail;

    b = bcf_init1();
    if (!b) goto fail;

    while ((r = bcf_read1(fp,h, b)) >= 0) {
        if (hts_idx_push(idx, b->rid, b->pos, b->pos + b->rlen,
                         bgzf_tell(fp->fp.bgzf), 1) < 0)
            goto fail;
    }
    // -1 marks the normal end of the loop; anything lower is treated as an error.
    if (r < -1) goto fail;

    // Do not return an index whose finalisation step reported a failure.
    if (hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf)) != 0) goto fail;

    bcf_destroy1(b);
    bcf_hdr_destroy(h);
    return idx;

 fail:
    hts_idx_destroy(idx);
    bcf_destroy1(b);
    bcf_hdr_destroy(h);
    return NULL;
}
4689
4690
/*
 * Load the index for fn.  When fnidx is given it is passed to
 * hts_idx_load2() as the index name; otherwise bcf_index_load(fn) is used.
 */
hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx)
{
    if (!fnidx)
        return bcf_index_load(fn);

    return hts_idx_load2(fn, fnidx);
}
4694
4695
hts_idx_t *bcf_index_load3(const char *fn, const char *fnidx, int flags)
4696
0
{
4697
0
    return hts_idx_load3(fn, fnidx, HTS_FMT_CSI, flags);
4698
0
}
4699
4700
int bcf_index_build3(const char *fn, const char *fnidx, int min_shift, int n_threads)
4701
0
{
4702
0
    htsFile *fp;
4703
0
    hts_idx_t *idx;
4704
0
    tbx_t *tbx;
4705
0
    int ret;
4706
0
    if ((fp = hts_open(fn, "rb")) == 0) return -2;
4707
0
    if (n_threads)
4708
0
        hts_set_threads(fp, n_threads);
4709
0
    if ( fp->format.compression!=bgzf ) { hts_close(fp); return -3; }
4710
0
    switch (fp->format.format) {
4711
0
        case bcf:
4712
0
            if (!min_shift) {
4713
0
                hts_log_error("TBI indices for BCF files are not supported");
4714
0
                ret = -1;
4715
0
            } else {
4716
0
                idx = bcf_index(fp, min_shift);
4717
0
                if (idx) {
4718
0
                    ret = hts_idx_save_as(idx, fn, fnidx, HTS_FMT_CSI);
4719
0
                    if (ret < 0) ret = -4;
4720
0
                    hts_idx_destroy(idx);
4721
0
                }
4722
0
                else ret = -1;
4723
0
            }
4724
0
            break;
4725
4726
0
        case vcf:
4727
0
            tbx = tbx_index(hts_get_bgzfp(fp), min_shift, &tbx_conf_vcf);
4728
0
            if (tbx) {
4729
0
                ret = hts_idx_save_as(tbx->idx, fn, fnidx, min_shift > 0 ? HTS_FMT_CSI : HTS_FMT_TBI);
4730
0
                if (ret < 0) ret = -4;
4731
0
                tbx_destroy(tbx);
4732
0
            }
4733
0
            else ret = -1;
4734
0
            break;
4735
4736
0
        default:
4737
0
            ret = -3;
4738
0
            break;
4739
0
    }
4740
0
    hts_close(fp);
4741
0
    return ret;
4742
0
}
4743
4744
/*
 * Same as bcf_index_build3() with n_threads set to 0.
 */
int bcf_index_build2(const char *fn, const char *fnidx, int min_shift)
{
    const int n_threads = 0;

    return bcf_index_build3(fn, fnidx, min_shift, n_threads);
}
4748
4749
/*
 * Same as bcf_index_build3() with no explicit index name (fnidx == NULL)
 * and n_threads set to 0.
 */
int bcf_index_build(const char *fn, int min_shift)
{
    const char *fnidx = NULL;
    const int n_threads = 0;

    return bcf_index_build3(fn, fnidx, min_shift, n_threads);
}
4753
4754
// Initialise fp->idx for the current format type.
4755
// This must be called after the header has been written but no other data.
4756
0
static int vcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) {
4757
0
    int n_lvls, fmt;
4758
4759
0
    if (min_shift == 0) {
4760
0
        min_shift = 14;
4761
0
        n_lvls = 5;
4762
0
        fmt = HTS_FMT_TBI;
4763
0
    } else {
4764
        // Set initial n_lvls to match tbx_index()
4765
0
        int starting_n_lvls = (TBX_MAX_SHIFT - min_shift + 2) / 3;
4766
        // Increase if necessary
4767
0
        n_lvls = idx_calc_n_lvls_ids(h, &min_shift, starting_n_lvls, NULL);
4768
0
        fmt = HTS_FMT_CSI;
4769
0
    }
4770
4771
0
    fp->idx = hts_idx_init(0, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
4772
0
    if (!fp->idx) return -1;
4773
4774
    // Tabix meta data, added even in CSI for VCF
4775
0
    uint8_t conf[4*7];
4776
0
    u32_to_le(TBX_VCF, conf+0);  // fmt
4777
0
    u32_to_le(1,       conf+4);  // name col
4778
0
    u32_to_le(2,       conf+8);  // beg col
4779
0
    u32_to_le(0,       conf+12); // end col
4780
0
    u32_to_le('#',     conf+16); // comment
4781
0
    u32_to_le(0,       conf+20); // n.skip
4782
0
    u32_to_le(0,       conf+24); // ref name len
4783
0
    if (hts_idx_set_meta(fp->idx, sizeof(conf)*sizeof(*conf), (uint8_t *)conf, 1) < 0) {
4784
0
        hts_idx_destroy(fp->idx);
4785
0
        fp->idx = NULL;
4786
0
        return -1;
4787
0
    }
4788
0
    fp->fnidx = fnidx;
4789
4790
0
    return 0;
4791
0
}
4792
4793
// Initialise fp->idx for the current format type.
4794
// This must be called after the header has been written but no other data.
4795
0
int bcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) {
4796
0
    int n_lvls, nids = 0;
4797
4798
0
    if (fp->format.compression != bgzf) {
4799
0
        hts_log_error("Indexing is only supported on BGZF-compressed files");
4800
0
        return -3; // Matches no-compression return for bcf_index_build3()
4801
0
    }
4802
4803
0
    if (fp->format.format == vcf)
4804
0
        return vcf_idx_init(fp, h, min_shift, fnidx);
4805
4806
0
    if (!min_shift)
4807
0
        min_shift = 14;
4808
4809
0
    n_lvls = idx_calc_n_lvls_ids(h, &min_shift, 0, &nids);
4810
4811
0
    fp->idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
4812
0
    if (!fp->idx) return -1;
4813
0
    fp->fnidx = fnidx;
4814
4815
0
    return 0;
4816
0
}
4817
4818
// Finishes an index. Call after the last record has been written.
4819
// Returns 0 on success, <0 on failure.
4820
//
4821
// NB: same format as SAM/BAM as it uses bgzf.
4822
0
int bcf_idx_save(htsFile *fp) {
4823
0
    return sam_idx_save(fp);
4824
0
}
4825
4826
/*****************
4827
 *** Utilities ***
4828
 *****************/
4829
4830
int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src)
4831
0
{
4832
0
    int i, ndst_ori = dst->nhrec, need_sync = 0, ret = 0, res;
4833
0
    for (i=0; i<src->nhrec; i++)
4834
0
    {
4835
0
        if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value )
4836
0
        {
4837
0
            int j;
4838
0
            for (j=0; j<ndst_ori; j++)
4839
0
            {
4840
0
                if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
4841
4842
                // Checking only the key part of generic lines, otherwise
4843
                // the VCFs are too verbose. Should we perhaps add a flag
4844
                // to bcf_hdr_combine() and make this optional?
4845
0
                if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break;
4846
0
            }
4847
0
            if ( j>=ndst_ori ) {
4848
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4849
0
                if (res < 0) return -1;
4850
0
                need_sync += res;
4851
0
            }
4852
0
        }
4853
0
        else if ( src->hrec[i]->type==BCF_HL_STR )
4854
0
        {
4855
            // NB: we are ignoring fields without ID
4856
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4857
0
            if ( j>=0 )
4858
0
            {
4859
0
                bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key);
4860
0
                if ( !rec ) {
4861
0
                    res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4862
0
                    if (res < 0) return -1;
4863
0
                    need_sync += res;
4864
0
                }
4865
0
            }
4866
0
        }
4867
0
        else
4868
0
        {
4869
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4870
0
            assert( j>=0 ); // this should always be true for valid VCFs
4871
4872
0
            bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL);
4873
0
            if ( !rec ) {
4874
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4875
0
                if (res < 0) return -1;
4876
0
                need_sync += res;
4877
0
            } else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT )
4878
0
            {
4879
                // Check that both records are of the same type. The bcf_hdr_id2length
4880
                // macro cannot be used here because dst header is not synced yet.
4881
0
                vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID];
4882
0
                vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID];
4883
0
                khint_t k_src  = kh_get(vdict, d_src, src->hrec[i]->vals[0]);
4884
0
                khint_t k_dst  = kh_get(vdict, d_dst, src->hrec[i]->vals[0]);
4885
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) )
4886
0
                {
4887
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths",
4888
0
                        src->hrec[i]->vals[0]);
4889
0
                    ret |= 1;
4890
0
                }
4891
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) )
4892
0
                {
4893
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different types",
4894
0
                        src->hrec[i]->vals[0]);
4895
0
                    ret |= 1;
4896
0
                }
4897
0
            }
4898
0
        }
4899
0
    }
4900
0
    if ( need_sync ) {
4901
0
        if (bcf_hdr_sync(dst) < 0) return -1;
4902
0
    }
4903
0
    return ret;
4904
0
}
4905
4906
bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src)
4907
0
{
4908
0
    if ( !dst )
4909
0
    {
4910
        // this will effectively strip existing IDX attributes from src to become dst
4911
0
        dst = bcf_hdr_init("r");
4912
0
        kstring_t htxt = {0,0,0};
4913
0
        if (bcf_hdr_format(src, 0, &htxt) < 0) {
4914
0
            free(htxt.s);
4915
0
            return NULL;
4916
0
        }
4917
0
        if ( bcf_hdr_parse(dst, htxt.s) < 0 ) {
4918
0
            bcf_hdr_destroy(dst);
4919
0
            dst = NULL;
4920
0
        }
4921
0
        free(htxt.s);
4922
0
        return dst;
4923
0
    }
4924
4925
0
    int i, ndst_ori = dst->nhrec, need_sync = 0, res;
4926
0
    for (i=0; i<src->nhrec; i++)
4927
0
    {
4928
0
        if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value )
4929
0
        {
4930
0
            int j;
4931
0
            for (j=0; j<ndst_ori; j++)
4932
0
            {
4933
0
                if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
4934
4935
                // Checking only the key part of generic lines, otherwise
4936
                // the VCFs are too verbose. Should we perhaps add a flag
4937
                // to bcf_hdr_combine() and make this optional?
4938
0
                if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break;
4939
0
            }
4940
0
            if ( j>=ndst_ori ) {
4941
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4942
0
                if (res < 0) return NULL;
4943
0
                need_sync += res;
4944
0
            }
4945
0
            else if ( !strcmp(src->hrec[i]->key,"fileformat") )
4946
0
            {
4947
0
                int ver_src = bcf_get_version(src,src->hrec[i]->value);
4948
0
                int ver_dst = bcf_get_version(dst,dst->hrec[j]->value);
4949
0
                if ( ver_src > ver_dst )
4950
0
                {
4951
0
                    if (bcf_hdr_set_version(dst,src->hrec[i]->value) < 0)
4952
0
                        return NULL;
4953
0
                    need_sync = 1;
4954
0
                }
4955
0
            }
4956
0
        }
4957
0
        else if ( src->hrec[i]->type==BCF_HL_STR )
4958
0
        {
4959
            // NB: we are ignoring fields without ID
4960
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4961
0
            if ( j>=0 )
4962
0
            {
4963
0
                bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key);
4964
0
                if ( !rec ) {
4965
0
                    res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4966
0
                    if (res < 0) return NULL;
4967
0
                    need_sync += res;
4968
0
                }
4969
0
            }
4970
0
        }
4971
0
        else
4972
0
        {
4973
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4974
0
            assert( j>=0 ); // this should always be true for valid VCFs
4975
4976
0
            bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL);
4977
0
            if ( !rec ) {
4978
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4979
0
                if (res < 0) return NULL;
4980
0
                need_sync += res;
4981
0
            } else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT )
4982
0
            {
4983
                // Check that both records are of the same type. The bcf_hdr_id2length
4984
                // macro cannot be used here because dst header is not synced yet.
4985
0
                vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID];
4986
0
                vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID];
4987
0
                khint_t k_src  = kh_get(vdict, d_src, src->hrec[i]->vals[0]);
4988
0
                khint_t k_dst  = kh_get(vdict, d_dst, src->hrec[i]->vals[0]);
4989
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) )
4990
0
                {
4991
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths",
4992
0
                        src->hrec[i]->vals[0]);
4993
0
                }
4994
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) )
4995
0
                {
4996
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different types",
4997
0
                        src->hrec[i]->vals[0]);
4998
0
                }
4999
0
            }
5000
0
        }
5001
0
    }
5002
0
    if ( need_sync ) {
5003
0
        if (bcf_hdr_sync(dst) < 0) return NULL;
5004
0
    }
5005
0
    return dst;
5006
0
}
5007
5008
/*
 * Rewrite the dictionary ids used by `line` (CHROM, FILTER, INFO keys,
 * FORMAT keys) from the numbering of src_hdr to that of dst_hdr.
 *
 * On the first call for a given src_hdr, per-dictionary translation tables
 * are built and cached in src_hdr->transl; src_hdr->ntransl is set to -1
 * when no id differs, which makes later calls return immediately.
 * Ids with no counterpart in dst_hdr (table entry < 0) are left unchanged.
 *
 * Returns 0 on success, -1 if the translation tables cannot be allocated.
 * NB: the process exits if line->errcode is set.
 */
int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *line)
{
    int i;
    if ( line->errcode )
    {
        char errordescription[1024] = "";
        hts_log_error("Unchecked error (%d %s) at %s:%"PRIhts_pos", exiting", line->errcode, bcf_strerror(line->errcode, errordescription, sizeof(errordescription)),  bcf_seqname_safe(src_hdr,line), line->pos+1);
        exit(1);
    }
    if ( src_hdr->ntransl==-1 ) return 0;    // no need to translate, all tags have the same id
    if ( !src_hdr->ntransl )  // called for the first time, see what needs translating
    {
        int dict;
        for (dict=0; dict<2; dict++)    // BCF_DT_ID and BCF_DT_CTG
        {
            src_hdr->transl[dict] = (int*) malloc(src_hdr->n[dict]*sizeof(int));
            if ( !src_hdr->transl[dict] && src_hdr->n[dict] > 0 )
            {
                // Undo the partial set-up so that a later call starts afresh.
                hts_log_error("Failed to allocate memory for the id translation table");
                while ( dict-- > 0 )
                {
                    free(src_hdr->transl[dict]);
                    src_hdr->transl[dict] = NULL;
                }
                src_hdr->ntransl = 0;
                return -1;
            }
            for (i=0; i<src_hdr->n[dict]; i++)
            {
                if ( !src_hdr->id[dict][i].key ) // gap left after removed BCF header lines
                {
                    src_hdr->transl[dict][i] = -1;
                    continue;
                }
                src_hdr->transl[dict][i] = bcf_hdr_id2int(dst_hdr,dict,src_hdr->id[dict][i].key);
                if ( src_hdr->transl[dict][i]!=-1 && i!=src_hdr->transl[dict][i] ) src_hdr->ntransl++;
            }
        }
        if ( !src_hdr->ntransl )
        {
            free(src_hdr->transl[0]); src_hdr->transl[0] = NULL;
            free(src_hdr->transl[1]); src_hdr->transl[1] = NULL;
            src_hdr->ntransl = -1;
        }
        if ( src_hdr->ntransl==-1 ) return 0;
    }
    bcf_unpack(line,BCF_UN_ALL);

    // CHROM
    if ( src_hdr->transl[BCF_DT_CTG][line->rid] >=0 ) line->rid = src_hdr->transl[BCF_DT_CTG][line->rid];

    // FILTER
    for (i=0; i<line->d.n_flt; i++)
    {
        int src_id = line->d.flt[i];
        if ( src_hdr->transl[BCF_DT_ID][src_id] >=0 )
            line->d.flt[i] = src_hdr->transl[BCF_DT_ID][src_id];
        line->d.shared_dirty |= BCF1_DIRTY_FLT;
    }

    // INFO
    for (i=0; i<line->n_info; i++)
    {
        int src_id = line->d.info[i].key;
        int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
        if ( dst_id<0 ) continue;
        line->d.info[i].key = dst_id;
        if ( !line->d.info[i].vptr ) continue;  // skip deleted
        // Width of the integer needed to store each key.
        int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
        int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
        if ( src_size==dst_size )   // can overwrite
        {
            // vptr[0] is left alone: the key value starts at vptr[1] for
            // every width, exactly as in the FORMAT branch below.
            uint8_t *vptr = line->d.info[i].vptr - line->d.info[i].vptr_off;
            if ( dst_size==BCF_BT_INT8 ) { vptr[1] = (uint8_t)dst_id; }
            else if ( dst_size==BCF_BT_INT16 ) { i16_to_le(dst_id, vptr + 1); }
            else { i32_to_le(dst_id, vptr + 1); }
        }
        else    // must realloc
        {
            bcf_info_t *info = &line->d.info[i];
            kstring_t str = {0,0,0};
            bcf_enc_int1(&str, dst_id);
            bcf_enc_size(&str, info->len,info->type);
            uint32_t vptr_off = str.l;
            kputsn((char*)info->vptr, info->vptr_len, &str);
            if( info->vptr_free ) free(info->vptr - info->vptr_off);
            info->vptr_off = vptr_off;
            info->vptr = (uint8_t*)str.s + info->vptr_off;
            info->vptr_free = 1;
            line->d.shared_dirty |= BCF1_DIRTY_INF;
        }
    }

    // FORMAT
    for (i=0; i<line->n_fmt; i++)
    {
        int src_id = line->d.fmt[i].id;
        int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
        if ( dst_id<0 ) continue;
        line->d.fmt[i].id = dst_id;
        if( !line->d.fmt[i].p ) continue;  // skip deleted
        int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
        int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
        if ( src_size==dst_size )   // can overwrite
        {
            uint8_t *p = line->d.fmt[i].p - line->d.fmt[i].p_off;    // pointer to the vector size (4bits) and BT type (4bits)
            if ( dst_size==BCF_BT_INT8 ) { p[1] = dst_id; }
            else if ( dst_size==BCF_BT_INT16 ) { i16_to_le(dst_id, p + 1); }
            else { i32_to_le(dst_id, p + 1); }
        }
        else    // must realloc
        {
            bcf_fmt_t *fmt = &line->d.fmt[i];
            kstring_t str = {0,0,0};
            bcf_enc_int1(&str, dst_id);
            bcf_enc_size(&str, fmt->n, fmt->type);
            uint32_t p_off = str.l;
            kputsn((char*)fmt->p, fmt->p_len, &str);
            if( fmt->p_free ) free(fmt->p - fmt->p_off);
            fmt->p_off = p_off;
            fmt->p = (uint8_t*)str.s + fmt->p_off;
            fmt->p_free = 1;
            line->d.indiv_dirty = 1;
        }
    }
    return 0;
}
5124
5125
bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr)
5126
0
{
5127
0
    bcf_hdr_t *hout = bcf_hdr_init("r");
5128
0
    if (!hout) {
5129
0
        hts_log_error("Failed to allocate bcf header");
5130
0
        return NULL;
5131
0
    }
5132
0
    kstring_t htxt = {0,0,0};
5133
0
    if (bcf_hdr_format(hdr, 1, &htxt) < 0) {
5134
0
        free(htxt.s);
5135
0
        return NULL;
5136
0
    }
5137
0
    if ( bcf_hdr_parse(hout, htxt.s) < 0 ) {
5138
0
        bcf_hdr_destroy(hout);
5139
0
        hout = NULL;
5140
0
    }
5141
0
    free(htxt.s);
5142
0
    return hout;
5143
0
}
5144
5145
bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap)
5146
0
{
5147
0
    void *names_hash = khash_str2int_init();
5148
0
    kstring_t htxt = {0,0,0};
5149
0
    kstring_t str = {0,0,0};
5150
0
    bcf_hdr_t *h = bcf_hdr_init("w");
5151
0
    int r = 0;
5152
0
    if (!h || !names_hash) {
5153
0
        hts_log_error("Failed to allocate bcf header");
5154
0
        goto err;
5155
0
    }
5156
0
    if (bcf_hdr_format(h0, 1, &htxt) < 0) {
5157
0
        hts_log_error("Failed to get header text");
5158
0
        goto err;
5159
0
    }
5160
0
    bcf_hdr_set_version(h,bcf_hdr_get_version(h0));
5161
0
    int j;
5162
0
    for (j=0; j<n; j++) imap[j] = -1;
5163
0
    if ( bcf_hdr_nsamples(h0) > 0) {
5164
0
        char *p = find_chrom_header_line(htxt.s);
5165
0
        int i = 0, end = n? 8 : 7;
5166
0
        while ((p = strchr(p, '\t')) != 0 && i < end) ++i, ++p;
5167
0
        if (i != end) {
5168
0
            hts_log_error("Wrong number of columns in header #CHROM line");
5169
0
            goto err;
5170
0
        }
5171
0
        r |= kputsn(htxt.s, p - htxt.s, &str) < 0;
5172
0
        for (i = 0; i < n; ++i) {
5173
0
            if ( khash_str2int_has_key(names_hash,samples[i]) )
5174
0
            {
5175
0
                hts_log_error("Duplicate sample name \"%s\"", samples[i]);
5176
0
                goto err;
5177
0
            }
5178
0
            imap[i] = bcf_hdr_id2int(h0, BCF_DT_SAMPLE, samples[i]);
5179
0
            if (imap[i] < 0) continue;
5180
0
            r |= kputc('\t', &str) < 0;
5181
0
            r |= kputs(samples[i], &str) < 0;
5182
0
            r |= khash_str2int_inc(names_hash,samples[i]) < 0;
5183
0
        }
5184
0
    } else r |= kputsn(htxt.s, htxt.l, &str) < 0;
5185
0
    while (str.l && (!str.s[str.l-1] || str.s[str.l-1]=='\n') ) str.l--; // kill trailing zeros and newlines
5186
0
    r |= kputc('\n',&str) < 0;
5187
0
    if (r) {
5188
0
        hts_log_error("%s", strerror(errno));
5189
0
        goto err;
5190
0
    }
5191
0
    if ( bcf_hdr_parse(h, str.s) < 0 ) {
5192
0
        bcf_hdr_destroy(h);
5193
0
        h = NULL;
5194
0
    }
5195
0
    free(str.s);
5196
0
    free(htxt.s);
5197
0
    khash_str2int_destroy(names_hash);
5198
0
    return h;
5199
5200
0
 err:
5201
0
    ks_free(&str);
5202
0
    ks_free(&htxt);
5203
0
    khash_str2int_destroy(names_hash);
5204
0
    bcf_hdr_destroy(h);
5205
0
    return NULL;
5206
0
}
5207
5208
/*
 * Restrict the header to a subset of its samples.
 *
 * samples:  "-" keeps everything; NULL drops every sample; otherwise a list
 *           of names handed to hts_readlist() (is_file is forwarded to it).
 *           A leading '^' turns the list into an exclusion list.
 *
 * hdr->keep_samples becomes a bit array over the original sample order and
 * hdr->nsamples_ori records the original sample count.
 *
 * Returns 0 on success, -1 on error, or the 1-based position of the first
 * listed name that is not a sample in the header.
 */
int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file)
{
    if ( samples && strcmp("-",samples)==0 ) return 0;            // keep all samples

    int i, narr = bit_array_size(bcf_hdr_nsamples(hdr));
    hdr->keep_samples = (uint8_t*) calloc(narr,1);
    if (!hdr->keep_samples) return -1;

    hdr->nsamples_ori = bcf_hdr_nsamples(hdr);

    if ( samples==NULL )
    {
        // exclude all samples: swap in an empty dictionary
        vdict_t *old = (vdict_t*)hdr->dict[BCF_DT_SAMPLE];
        vdict_t *fresh = kh_init(vdict);
        khint_t it;
        if (!fresh) return -1;

        bcf_hdr_nsamples(hdr) = 0;

        for (it = kh_begin(old); it != kh_end(old); ++it) {
            if (kh_exist(old, it)) free((char*)kh_key(old, it));
        }
        kh_destroy(vdict, old);
        hdr->dict[BCF_DT_SAMPLE] = fresh;

        return bcf_hdr_sync(hdr) < 0 ? -1 : 0;
    }

    const int negate = samples[0]=='^';

    // An exclusion list starts from "keep everything".
    if ( negate ) {
        for (i=0; i<bcf_hdr_nsamples(hdr); i++) bit_array_set(hdr->keep_samples,i);
    }

    int n_names, ret = 0;
    char **names = hts_readlist(negate ? samples+1 : samples, is_file, &n_names);
    if ( !names ) return -1;

    for (i=0; i<n_names; i++)
    {
        int sid = bcf_hdr_id2int(hdr,BCF_DT_SAMPLE,names[i]);
        if ( sid<0 )
        {
            // Remember only the first unknown name.
            if ( ret==0 ) ret = i+1;
            continue;
        }
        assert( sid<bcf_hdr_nsamples(hdr) );
        if ( negate )
            bit_array_clear(hdr->keep_samples, sid);
        else
            bit_array_set(hdr->keep_samples, sid);
    }
    for (i=0; i<n_names; i++) free(names[i]);
    free(names);

    // Recount the samples that survive the selection.
    bcf_hdr_nsamples(hdr) = 0;
    for (i=0; i<hdr->nsamples_ori; i++) {
        if ( bit_array_test(hdr->keep_samples,i) ) bcf_hdr_nsamples(hdr)++;
    }

    if ( bcf_hdr_nsamples(hdr)==0 )
    {
        free(hdr->keep_samples);
        hdr->keep_samples=NULL;
        return ret;
    }

    // Make new list and dictionary with desired samples
    char **kept = (char**) malloc(sizeof(char*)*bcf_hdr_nsamples(hdr));
    if (!kept) return -1;

    vdict_t *fresh = kh_init(vdict);
    if (!fresh) {
        free(kept);
        return -1;
    }

    int n_kept = 0;
    for (i=0; i<hdr->nsamples_ori; i++) {
        int res;
        khint_t slot;
        if ( !bit_array_test(hdr->keep_samples,i) ) continue;

        kept[n_kept] = hdr->samples[i];
        slot = kh_put(vdict, fresh, hdr->samples[i], &res);
        if (res < 0) {
            free(kept);
            kh_destroy(vdict, fresh);
            return -1;
        }
        kh_val(fresh, slot) = bcf_idinfo_def;
        kh_val(fresh, slot).id = n_kept;
        n_kept++;
    }

    // Delete desired samples from old dictionary, so we don't free them
    vdict_t *old = (vdict_t*)hdr->dict[BCF_DT_SAMPLE];
    for (i=0; i < n_kept; i++) {
        khint_t slot = kh_get(vdict, old, kept[i]);
        if (slot < kh_end(old)) kh_del(vdict, old, slot);
    }

    // Free everything else
    khint_t it;
    for (it = kh_begin(old); it != kh_end(old); ++it) {
        if (kh_exist(old, it)) free((char*)kh_key(old, it));
    }
    kh_destroy(vdict, old);
    hdr->dict[BCF_DT_SAMPLE] = fresh;

    free(hdr->samples);
    hdr->samples = kept;

    if (bcf_hdr_sync(hdr) < 0)
        return -1;

    return ret;
}
5315
5316
/*
 * Rebuild the per-sample (FORMAT) data of v so that it holds only the
 * samples selected by imap.
 *
 * imap[0..n-1] gives, for each output sample, its index among the samples
 * currently stored in v; negative entries are skipped.  With n == 0 all
 * sample data is dropped.  `h` is not used.
 *
 * On success v->indiv is replaced, v->n_sample is updated, v->n_fmt is set
 * to 0 when no samples remain, and BCF_UN_FMT is cleared from v->unpacked.
 *
 * Returns 0 on success, -1 if the new buffer cannot be grown (v is left
 * untouched in that case).
 */
int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap)
{
    kstring_t ind = {0,0,0};
    int n_kept = 0;

    if (n) {
        bcf_fmt_t fmt[MAX_N_FMT];
        int i, j;
        uint8_t *ptr = (uint8_t*)v->indiv.s;
        for (i = 0; i < v->n_fmt; ++i)
            ptr = bcf_unpack_fmt_core1(ptr, v->n_sample, &fmt[i]);
        for (i = 0; i < (int)v->n_fmt; ++i) {
            bcf_fmt_t *f = &fmt[i];
            bcf_enc_int1(&ind, f->id);
            bcf_enc_size(&ind, f->n, f->type);
            for (j = 0; j < n; ++j) {
                if (imap[j] < 0) continue;
                // Compute the byte offset in size_t so that a large sample
                // index times the per-sample size cannot overflow int.
                if (kputsn((char*)(f->p + imap[j] * (size_t)f->size), f->size, &ind) < 0) {
                    free(ind.s);
                    return -1;
                }
            }
        }
        for (j = 0; j < n; ++j) if (imap[j] >= 0) ++n_kept;
    }

    v->n_sample = n_kept;
    if ( !v->n_sample ) v->n_fmt = 0;
    free(v->indiv.s);
    v->indiv = ind;
    v->unpacked &= ~BCF_UN_FMT;    // only BCF is ready for output, VCF will need to unpack again
    return 0;
}
5342
5343
/*
 * Return 1 if every allele of v is a single character other than '*', or
 * one of the symbolic alleles "<X>" / "<*>"; return 0 otherwise.
 * The record's string fields are unpacked first.
 */
int bcf_is_snp(bcf1_t *v)
{
    int i;
    bcf_unpack(v, BCF_UN_STR);

    for (i = 0; i < v->n_allele; ++i)
    {
        const char *al = v->d.allele[i];

        // Plain single-base allele
        if ( al[1]==0 && al[0]!='*' ) continue;

        // mpileup's <X> allele, and <*>. This is not completely satisfactory,
        // a general library is here narrowly tailored to fit samtools.
        if ( al[0]=='<' && (al[1]=='X' || al[1]=='*') && al[2]=='>' ) continue;

        // Anything else disqualifies the record.
        return 0;
    }
    return 1;
}
5360
5361
/*
 * Classify one ALT allele against REF and store the result in *var.
 *
 * var->type receives one of VCF_REF, VCF_SNP, VCF_MNP, VCF_INDEL (combined
 * with VCF_INS or VCF_DEL), VCF_BND, VCF_OVERLAP or VCF_OTHER.
 * var->n receives the length computed for the allele (negative for
 * deletions); it is 0 for the exits that do not compute a length
 * (symbolic alleles classified as VCF_OTHER, and breakends).
 */
static void bcf_set_variant_type(const char *ref, const char *alt, bcf_variant_t *var)
{
    // Default length.  The VCF_OTHER (symbolic) and VCF_BND exits below do not
    // assign one, and the caller's array is obtained with realloc(), so without
    // this the field could hold a stale or indeterminate value.
    var->n = 0;

    if ( *alt == '*' && !alt[1] ) { var->type = VCF_OVERLAP; return; }  // overlapping variant

    // The most frequent case
    if ( !ref[1] && !alt[1] )
    {
        if ( *alt == '.' || *ref==*alt ) { var->type = VCF_REF; return; }
        if ( *alt == 'X' ) { var->type = VCF_REF; return; }  // mpileup's X allele shouldn't be treated as variant
        var->n = 1; var->type = VCF_SNP; return;
    }
    if ( alt[0]=='<' )
    {
        if ( alt[1]=='X' && alt[2]=='>' ) { var->type = VCF_REF; return; }  // mpileup's X allele shouldn't be treated as variant
        if ( alt[1]=='*' && alt[2]=='>' ) { var->type = VCF_REF; return; }
        if ( !strcmp("NON_REF>",alt+1) ) { var->type = VCF_REF; return; }
        var->type = VCF_OTHER;
        return;
    }

    // Catch "joined before" breakend case
    if ( alt[0]==']' || alt[0] == '[' )
    {
        var->type = VCF_BND; return;
    }

    // Iterate through alt characters that match the reference
    const char *r = ref, *a = alt;
    while (*r && *a && toupper_c(*r)==toupper_c(*a) ) { r++; a++; }     // unfortunately, matching REF,ALT case is not guaranteed

    if ( *a && !*r )
    {
        // REF is a proper prefix of ALT
        while ( *a ) a++;
        if ( *(a-1)==']' || *(a-1)=='[' ) { var->type = VCF_BND; return; } // "joined after" breakend
        var->n = (a-alt)-(r-ref); var->type = VCF_INDEL | VCF_INS; return;
    }
    else if ( *r && !*a )
    {
        // ALT is a proper prefix of REF
        while ( *r ) r++;
        var->n = (a-alt)-(r-ref); var->type = VCF_INDEL | VCF_DEL; return;
    }
    else if ( !*r && !*a )
    {
        // Identical apart from letter case
        var->type = VCF_REF; return;
    }

    // First mismatch found at r/a; now trim the matching suffix
    const char *re = r, *ae = a;
    while ( re[1] ) re++;
    while ( ae[1] ) ae++;
    if ( ae[0]==']' || ae[0]=='[' ) { var->type = VCF_BND; return; }    // "joined after" breakend
    while ( re>r && ae>a && toupper_c(*re)==toupper_c(*ae) ) { re--; ae--; }
    if ( ae==a )
    {
        if ( re==r ) { var->n = 1; var->type = VCF_SNP; return; }
        var->n = -(re-r);
        if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL | VCF_DEL; return; }
        var->type = VCF_OTHER; return;
    }
    else if ( re==r )
    {
        var->n = ae-a;
        if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL | VCF_INS; return; }
        var->type = VCF_OTHER; return;
    }

    var->type = ( re-r == ae-a ) ? VCF_MNP : VCF_OTHER;
    var->n = ( re-r > ae-a ) ? -(re-r+1) : ae-a+1;

    // should do also complex events, SVs, etc...
}
5431
5432
/*
 * Compute the variant type of every allele in record b.
 *
 * Fills b->d.var[] (growing it if necessary) with one entry per allele and
 * sets b->d.var_type to the bitwise OR of the types of all ALT alleles.
 * Entry 0 (the REF allele) is always VCF_REF with length 0.
 *
 * Returns 0 on success, -1 if the var array could not be allocated.
 */
static int bcf_set_variant_types(bcf1_t *b)
{
    if ( !(b->unpacked & BCF_UN_STR) ) bcf_unpack(b, BCF_UN_STR);
    bcf_dec_t *d = &b->d;

    // var[0] is written unconditionally below, so at least one slot is
    // needed even when the record has no alleles (n_allele == 0); otherwise
    // d->var could still be NULL at that point.
    int n_needed = b->n_allele > 0 ? b->n_allele : 1;
    if ( d->n_var < n_needed )
    {
        bcf_variant_t *new_var = realloc(d->var, sizeof(bcf_variant_t)*n_needed);
        if (!new_var)
            return -1;
        d->var = new_var;
        d->n_var = n_needed;
    }

    int i;
    b->d.var_type = 0;
    d->var[0].type = VCF_REF;
    d->var[0].n    = 0;
    for (i=1; i<b->n_allele; i++)
    {
        bcf_set_variant_type(d->allele[0],d->allele[i], &d->var[i]);
        b->d.var_type |= d->var[i].type;
    }
    return 0;
}
5456
5457
// bcf_get_variant_type/bcf_get_variant_types should only return the following,
5458
// to be compatible with callers that are not expecting newer values
5459
// like VCF_INS, VCF_DEL.  The full set is available from the newer
5460
// bcf_has_variant_type* interfaces.
5461
0
#define ORIG_VAR_TYPES (VCF_SNP|VCF_MNP|VCF_INDEL|VCF_OTHER|VCF_BND|VCF_OVERLAP)
5462
/*
 * Returns the combined variant type bits of the record, masked with
 * ORIG_VAR_TYPES.  The types are computed first if rec->d.var_type is
 * still -1.  If that computation fails the process is terminated, as
 * this interface cannot report errors.
 */
int bcf_get_variant_types(bcf1_t *rec)
{
    if ( rec->d.var_type==-1 && bcf_set_variant_types(rec) != 0 ) {
        hts_log_error("Couldn't get variant types: %s", strerror(errno));
        exit(1); // Due to legacy API having no way to report failures
    }
    return rec->d.var_type & ORIG_VAR_TYPES;
}
5472
5473
/*
 * Returns the variant type bits of allele ith_allele, masked with
 * ORIG_VAR_TYPES.  The types are computed first if rec->d.var_type is
 * still -1.  The process is terminated if that computation fails or if
 * ith_allele is outside [0, rec->n_allele).
 */
int bcf_get_variant_type(bcf1_t *rec, int ith_allele)
{
    if ( rec->d.var_type==-1 && bcf_set_variant_types(rec) != 0 ) {
        hts_log_error("Couldn't get variant types: %s", strerror(errno));
        exit(1); // Due to legacy API having no way to report failures
    }

    int in_range = ith_allele >= 0 && ith_allele < rec->n_allele;
    if ( !in_range ) {
        hts_log_error("Requested allele outside valid range");
        exit(1);
    }
    return rec->d.var[ith_allele].type & ORIG_VAR_TYPES;
}
5487
#undef ORIG_VAR_TYPES
5488
5489
/*
 * Test the type of allele ith_allele against bitmask.
 *
 * Returns -1 if the variant types could not be computed or ith_allele is
 * outside [0, rec->n_allele).  When bitmask is VCF_REF, returns 1 if the
 * allele type is exactly VCF_REF and 0 otherwise.  For any other bitmask,
 * returns the bits shared by bitmask and the allele type.
 */
int bcf_has_variant_type(bcf1_t *rec, int ith_allele, uint32_t bitmask)
{
    if ( rec->d.var_type==-1 && bcf_set_variant_types(rec) != 0 )
        return -1;
    if ( ith_allele < 0 || ith_allele >= rec->n_allele )
        return -1;

    // VCF_REF is 0, so it cannot be tested with a bitwise AND
    if ( bitmask == VCF_REF )
        return rec->d.var[ith_allele].type == VCF_REF;

    return bitmask & rec->d.var[ith_allele].type;
}
5500
5501
/*
 * Returns the stored length (var[].n) of allele ith_allele, or
 * bcf_int32_missing if the variant types could not be computed or
 * ith_allele is outside [0, rec->n_allele).
 */
int bcf_variant_length(bcf1_t *rec, int ith_allele)
{
    if ( rec->d.var_type==-1 && bcf_set_variant_types(rec) != 0 )
        return bcf_int32_missing;

    if ( ith_allele >= 0 && ith_allele < rec->n_allele )
        return rec->d.var[ith_allele].n;

    return bcf_int32_missing;
}
5509
5510
/*
 * Test the combined variant types of the record against bitmask.
 *
 *   bcf_match_overlap: returns the bits common to bitmask and the record.
 *   bcf_match_subset:  returns 0 if the record has any type outside bitmask,
 *                      otherwise the bits common to both.
 *   otherwise (exact): returns 0 unless the record's types equal bitmask;
 *                      on equality returns 1 when bitmask is VCF_REF, else
 *                      the type value.
 *
 * Returns -1 if the variant types could not be computed.
 */
int bcf_has_variant_types(bcf1_t *rec, uint32_t bitmask,
                          enum bcf_variant_match mode)
{
    if ( rec->d.var_type==-1 && bcf_set_variant_types(rec) != 0 )
        return -1;

    uint32_t type = rec->d.var_type;
    if ( mode==bcf_match_overlap ) return bitmask & type;

    // VCF_INDEL is always set with VCF_INS and VCF_DEL by bcf_set_variant_type[s], but the bitmask may
    // ask for say `VCF_INS` or `VCF_INDEL` only
    const int wants_ins_del = (bitmask & (VCF_INS|VCF_DEL)) != 0;
    const int wants_indel   = (bitmask & VCF_INDEL) != 0;
    if ( wants_ins_del && !wants_indel ) type &= ~VCF_INDEL;
    else if ( wants_indel && !wants_ins_del ) type &= ~(VCF_INS|VCF_DEL);

    if ( mode==bcf_match_subset )
        return ( ~bitmask & type ) ? 0 : ( bitmask & type );

    // mode == bcf_match_exact
    if ( type != bitmask ) return 0;
    return bitmask==VCF_REF ? 1 : type;
}
5533
5534
/*
 * Set, replace or remove the INFO field named key in record line.
 *
 * values/n/type describe the new content.  With n == 0, or with a NULL
 * values pointer for BCF_HT_STR, an existing field is marked for removal.
 * END must be given as exactly one BCF_HT_INT or BCF_HT_LONG value.
 * Whenever END or SVLEN is changed, line->rlen is recomputed with
 * get_rlen().
 *
 * Returns 0 on success, -1 if key is not defined as INFO in the header or
 * the END value is malformed (BCF_ERR_TAG_INVALID is then added to
 * line->errcode).  An unsupported type calls abort().
 */
int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
{
    static int negative_rlen_warned = 0;

    const int inf_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,inf_id) ) return -1;    // No such INFO field in the header
    if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);

    const int is_end_tag   = !strcmp(key, "END");
    const int is_svlen_tag = !strcmp(key, "SVLEN");

    // Is the field already present?
    bcf_info_t *inf = NULL;
    int i;
    for (i=0; i<line->n_info; i++)
    {
        if ( inf_id==line->d.info[i].key )
        {
            inf = &line->d.info[i];
            break;
        }
    }

    if ( !n || (type==BCF_HT_STR && !values) )
    {
        if ( inf )
        {
            // Mark the tag for removal, free existing memory if necessary
            if ( inf->vptr_free )
            {
                free(inf->vptr - inf->vptr_off);
                inf->vptr_free = 0;
            }
            line->d.shared_dirty |= BCF1_DIRTY_INF;
            inf->vptr = NULL;
            inf->vptr_off = inf->vptr_len = 0;
        }
        if ( n==0 && (is_end_tag || is_svlen_tag) )
            line->rlen = get_rlen(hdr, line);
        return 0;
    }

    if ( is_end_tag )
    {
        if ( n != 1 )
        {
            hts_log_error("END info tag should only have one value at %s:%"PRIhts_pos, bcf_seqname_safe(hdr,line), line->pos+1);
            line->errcode |= BCF_ERR_TAG_INVALID;
            return -1;
        }
        if ( type != BCF_HT_INT && type != BCF_HT_LONG )
        {
            hts_log_error("Wrong type (%d) for END info tag at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
            line->errcode |= BCF_ERR_TAG_INVALID;
            return -1;
        }
    }

    // Encode the values and determine the size required to accommodate the values
    kstring_t str = {0,0,0};
    bcf_enc_int1(&str, inf_id);
    switch (type)
    {
        case BCF_HT_INT:
            bcf_enc_vint(&str, n, (int32_t*)values, -1);
            break;
        case BCF_HT_REAL:
            bcf_enc_vfloat(&str, n, (float*)values);
            break;
        case BCF_HT_FLAG:
        case BCF_HT_STR:
            if ( values==NULL )
                bcf_enc_size(&str, 0, BCF_BT_NULL);
            else
                bcf_enc_vchar(&str, strlen((char*)values), (char*)values);
            break;
#ifdef VCF_ALLOW_INT64
        case BCF_HT_LONG:
            if (n != 1) {
                hts_log_error("Only storing a single BCF_HT_LONG value is supported at %s:%"PRIhts_pos, bcf_seqname_safe(hdr,line), line->pos+1);
                abort();
            }
            bcf_enc_long1(&str, *(int64_t *) values);
            break;
#endif
        default:
            hts_log_error("The type %d not implemented yet at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
            abort();
    }

    if ( !inf )
    {
        // The tag is not present, create new one
        line->n_info++;
        hts_expand0(bcf_info_t, line->n_info, line->d.m_info , line->d.info);
        inf = &line->d.info[line->n_info-1];
        bcf_unpack_info_core1((uint8_t*)str.s, inf);
        inf->vptr_free = 1;
        line->d.shared_dirty |= BCF1_DIRTY_INF;
    }
    else if ( inf->vptr && str.l <= inf->vptr_len + inf->vptr_off )
    {
        // The existing block is big enough: overwrite it in place.  The
        // record only needs re-packing if the encoded size changed.
        if ( str.l != inf->vptr_len + inf->vptr_off ) line->d.shared_dirty |= BCF1_DIRTY_INF;
        uint8_t *block = inf->vptr - inf->vptr_off;
        memcpy(block, str.s, str.l);
        free(str.s);
        int keep_free_flag = inf->vptr_free;   // preserved across the re-parse below
        bcf_unpack_info_core1(block, inf);
        inf->vptr_free = keep_free_flag;
    }
    else
    {
        // Too small (or previously removed): hand over the new buffer
        if ( inf->vptr_free )
            free(inf->vptr - inf->vptr_off);
        bcf_unpack_info_core1((uint8_t*)str.s, inf);
        inf->vptr_free = 1;
        line->d.shared_dirty |= BCF1_DIRTY_INF;
    }
    line->unpacked |= BCF_UN_INFO;

    if ( n==1 && is_end_tag )
    {
        hts_pos_t end = type == BCF_HT_INT ? *(int32_t *) values : *(int64_t *) values;
        int end_is_set = (type == BCF_HT_INT && end!=bcf_int32_missing)
                      || (type == BCF_HT_LONG && end!=bcf_int64_missing);
        // Warn only once per process about END not being after POS
        if ( end_is_set && end <= line->pos && !negative_rlen_warned )
        {
            hts_log_warning("INFO/END=%"PRIhts_pos" is smaller than POS at %s:%"PRIhts_pos,end,bcf_seqname_safe(hdr,line),line->pos+1);
            negative_rlen_warned = 1;
        }
    }
    if ( is_svlen_tag || is_end_tag )
        line->rlen = get_rlen(hdr, line);
    return 0;
}
5671
5672
/*
 * Store n strings (one per sample) as the FORMAT field named key.
 *
 * The strings are copied into one buffer of n fixed-width slots, each as
 * wide as the longest string and padded with NUL bytes, which is then
 * handed to bcf_update_format() as BCF_HT_STR.
 *
 * With n == 0, or when every string is empty, the call is forwarded to
 * bcf_update_format() with zero values (which marks an existing field for
 * removal).
 *
 * Returns the result of bcf_update_format(), or -2 if the packed buffer
 * cannot be allocated or would not fit in an int.
 */
int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n)
{
    if ( !n )
        return bcf_update_format(hdr,line,key,NULL,0,BCF_HT_STR);

    int i;
    size_t max_len = 0;
    for (i=0; i<n; i++)
    {
        size_t len = strlen(values[i]);
        if ( len > max_len ) max_len = len;
    }

    // Nothing to store.  Handled explicitly so the outcome does not depend
    // on whether malloc(0) returns NULL on this platform.
    if ( max_len == 0 )
        return bcf_update_format(hdr,line,key,NULL,0,BCF_HT_STR);

    // max_len > 0 implies n >= 1.  The total is passed on as an int, so it
    // must not exceed INT_MAX.
    if ( max_len > (size_t)INT_MAX / (size_t)n )
    {
        hts_log_error("FORMAT/%s string values too long to fit in a BCF record", key);
        return -2;
    }
    size_t total = max_len * (size_t)n;

    char *out = malloc(total);
    if ( !out ) return -2;
    for (i=0; i<n; i++)
    {
        size_t len = strlen(values[i]);
        char *dst = out + (size_t)i*max_len;
        memcpy(dst, values[i], len);
        memset(dst+len, 0, max_len-len);    // pad the slot to the common width
    }
    int ret = bcf_update_format(hdr,line,key,out,(int)total,BCF_HT_STR);
    free(out);
    return ret;
}
5697
5698
/*
 * Set, replace or remove the FORMAT field named key in record line.
 *
 * values holds n items of the given type (BCF_HT_INT, BCF_HT_REAL or
 * BCF_HT_STR); n must be a non-zero multiple of the number of samples in
 * the header.  With n == 0 an existing field is marked for removal.  A
 * newly added GT field is always placed first.  When the key is "LEN",
 * line->rlen is recomputed with get_rlen().
 *
 * Returns 0 on success, or when n == 0 and the key is not in the header.
 * Returns -1 if the key is not defined as FORMAT in the header, or if
 * values are given but the header has no samples.  An unsupported type
 * calls abort().
 */
int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
{
    // Is the field already present?
    int i, fmt_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
    int is_len = 0;
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,fmt_id) )
    {
        if ( !n ) return 0;
        return -1;  // the key not present in the header
    }

    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);

    for (i=0; i<line->n_fmt; i++)
        if ( line->d.fmt[i].id==fmt_id ) break;
    bcf_fmt_t *fmt = i==line->n_fmt ? NULL : &line->d.fmt[i];

    is_len = strcmp(key, "LEN") == 0;
    if ( !n )
    {
        if ( fmt )
        {
            // Mark the tag for removal, free existing memory if necessary
            if ( fmt->p_free )
            {
                free(fmt->p - fmt->p_off);
                fmt->p_free = 0;
            }
            line->d.indiv_dirty = 1;
            fmt->p = NULL;
        }
        if (is_len) {
            line->rlen = get_rlen(hdr, line);
        }
        return 0;
    }

    // Per-sample values cannot be stored without samples; bail out before
    // the division below rather than dividing by zero.
    int nsmpl = bcf_hdr_nsamples(hdr);
    if ( nsmpl <= 0 )
    {
        hts_log_error("Cannot set FORMAT/%s at %s:%"PRIhts_pos", the header has no samples", key, bcf_seqname_safe(hdr,line), line->pos+1);
        return -1;
    }

    line->n_sample = nsmpl;
    int nps = n / nsmpl;  // number of values per sample
    assert( nps && nps*nsmpl==n );     // must be divisible by n_sample

    // Encode the values and determine the size required to accommodate the values
    kstring_t str = {0,0,0};
    bcf_enc_int1(&str, fmt_id);
    assert(values != NULL);
    if ( type==BCF_HT_INT )
        bcf_enc_vint(&str, n, (int32_t*)values, nps);
    else if ( type==BCF_HT_REAL )
    {
        bcf_enc_size(&str, nps, BCF_BT_FLOAT);
        serialize_float_array(&str, nps*line->n_sample, (float *) values);
    }
    else if ( type==BCF_HT_STR )
    {
        bcf_enc_size(&str, nps, BCF_BT_CHAR);
        kputsn((char*)values, nps*line->n_sample, &str);
    }
    else
    {
        hts_log_error("The type %d not implemented yet at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
        abort();
    }

    if ( !fmt )
    {
        // Not present, new format field
        line->n_fmt++;
        hts_expand0(bcf_fmt_t, line->n_fmt, line->d.m_fmt, line->d.fmt);

        // Special case: VCF specification requires that GT is always first
        if ( line->n_fmt > 1 && key[0]=='G' && key[1]=='T' && !key[2] )
        {
            for (i=line->n_fmt-1; i>0; i--)
                line->d.fmt[i] = line->d.fmt[i-1];
            fmt = &line->d.fmt[0];
        }
        else
            fmt = &line->d.fmt[line->n_fmt-1];
        bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
        line->d.indiv_dirty = 1;
        fmt->p_free = 1;
    }
    else
    {
        // The tag is already present, check if it is big enough to accommodate the new block
        if ( fmt->p && str.l <= fmt->p_len + fmt->p_off )
        {
            // good, the block is big enough
            if ( str.l != fmt->p_len + fmt->p_off ) line->d.indiv_dirty = 1;
            uint8_t *ptr = fmt->p - fmt->p_off;
            memcpy(ptr, str.s, str.l);
            free(str.s);
            int p_free = fmt->p_free;   // preserved across the re-parse below
            bcf_unpack_fmt_core1(ptr, line->n_sample, fmt);
            fmt->p_free = p_free;
        }
        else
        {
            // Too small (or previously removed): hand over the new buffer
            if ( fmt->p_free )
                free(fmt->p - fmt->p_off);
            bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
            fmt->p_free = 1;
            line->d.indiv_dirty = 1;
        }
    }
    line->unpacked |= BCF_UN_FMT;

    if (is_len) {
        line->rlen = get_rlen(hdr, line);
    }
    return 0;
}
5810
5811
5812
/*
 * Replace the FILTER list of the record with the n ids in flt_ids.
 * The record's FILTER data is flagged as modified.  Always returns 0.
 */
int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n)
{
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);

    line->d.shared_dirty |= BCF1_DIRTY_FLT;
    line->d.n_flt = n;
    if ( n == 0 ) return 0;

    hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);

    int k = 0;
    while ( k < n )
    {
        line->d.flt[k] = flt_ids[k];
        k++;
    }
    return 0;
}
5824
5825
/*
 * Add filter flt_id to the record's FILTER list.
 *
 * Id 0 (PASS) replaces the whole list; any other id replaces a list that
 * consists only of id 0, and is appended otherwise.
 *
 * Returns 0 if the filter was already set, 1 if the list was modified.
 */
int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id)
{
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);

    int k;
    for (k=0; k<line->d.n_flt; k++)
    {
        if ( line->d.flt[k]==flt_id ) return 0;    // this filter is already set
    }

    line->d.shared_dirty |= BCF1_DIRTY_FLT;

    // Setting PASS, or adding to a list that is just PASS, leaves one entry
    int single_entry = flt_id==0 || ( line->d.n_flt==1 && line->d.flt[0]==0 );
    if ( single_entry )
        line->d.n_flt = 1;
    else
        line->d.n_flt++;

    hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);
    line->d.flt[line->d.n_flt-1] = flt_id;
    return 1;
}
5843
/*
 * Remove filter flt_id from the record's FILTER list, if present.
 * When the list becomes empty and pass is non-zero, filter id 0 is added
 * in its place.  Always returns 0.
 */
int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int pass)
{
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);

    int at = 0;
    while ( at < line->d.n_flt && line->d.flt[at] != flt_id ) at++;
    if ( at == line->d.n_flt ) return 0;   // the filter is not present

    line->d.shared_dirty |= BCF1_DIRTY_FLT;

    // Close the gap unless the last entry is the one being removed
    int n_after = line->d.n_flt - at - 1;
    if ( at != line->d.n_flt-1 )
        memmove(line->d.flt+at, line->d.flt+at+1, n_after*sizeof(*line->d.flt));
    line->d.n_flt--;

    if ( !line->d.n_flt && pass ) bcf_add_filter(hdr,line,0);
    return 0;
}
5856
5857
/*
 * Check whether the record carries the named filter.  The name "." is
 * looked up as "PASS".
 *
 * Returns -1 if the filter is not defined in the header, 1 if it is set
 * on the record (id 0 also counts as set when the record's FILTER list is
 * empty), and 0 otherwise.
 */
int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter)
{
    const char *name = ( filter[0]=='.' && !filter[1] ) ? "PASS" : filter;
    int id = bcf_hdr_id2int(hdr, BCF_DT_ID, name);
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FLT,id) ) return -1;  // not defined in the header

    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
    if ( id==0 && !line->d.n_flt ) return 1; // PASS

    int k = 0;
    while ( k < line->d.n_flt )
    {
        if ( line->d.flt[k]==id ) return 1;
        k++;
    }
    return 0;
}
5871
5872
/*
 * Rebuild the allele pointer table after line->d.als has been rewritten.
 *
 * line->d.als must hold nals consecutive NUL-terminated strings.  Marks
 * the alleles as modified, invalidates the cached variant types, sets
 * line->n_allele, points line->d.allele[] at each string and recomputes
 * line->rlen with get_rlen().  Always returns 0.
 */
static inline int _bcf1_sync_alleles(const bcf_hdr_t *hdr, bcf1_t *line, int nals)
{
    line->d.shared_dirty |= BCF1_DIRTY_ALS;
    line->d.var_type = -1;

    line->n_allele = nals;
    hts_expand(char*, line->n_allele, line->d.m_allele, line->d.allele);

    char *cursor = line->d.als;
    int k;
    for (k = 0; k < nals; k++)
    {
        line->d.allele[k] = cursor;
        cursor += strlen(cursor) + 1;   // step over the string and its terminator
    }

    // Update REF length. Note that END is 1-based while line->pos 0-based
    line->rlen = get_rlen(hdr, line);

    return 0;
}
5894
/*
 * Replace the record's alleles with the nals strings in alleles
 * (alleles[0] is REF).
 *
 * Returns the result of _bcf1_sync_alleles() on success, -1 if the
 * combined alleles exceed INT_MAX bytes or memory cannot be allocated.
 */
int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals)
{
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);

    // The pointers in alleles may point into the existing line->d.als memory,
    // so care needs to be taken not to clobber them while updating.  Usually
    // they will be short so we can copy through an intermediate buffer.
    // If they're longer, or won't fit in the existing allocation we
    // can allocate a new buffer to write into.  Note that in either case
    // pointers to line->d.als memory in alleles may not be valid when we've
    // finished.
    char staging[256];
    char *old_als = NULL;
    size_t staged = 0;
    size_t capacity = sizeof(staging);
    if (line->d.m_als < capacity)
        capacity = line->d.m_als;

    // Stage as many leading alleles as fit
    int idx = 0;
    while (idx < nals) {
        size_t sz = strlen(alleles[idx]) + 1;
        if (capacity - staged < sz)
            break;
        memcpy(staging + staged, alleles[idx], sz);
        staged += sz;
        idx++;
    }

    // Did we miss anything?
    if (idx < nals) {
        size_t needed = staged;
        int j;
        for (j = idx; j < nals; j++)
            needed += strlen(alleles[j]) + 1;
        if (needed < line->d.m_als) // Don't shrink the buffer
            needed = line->d.m_als;
        if (needed > INT_MAX) {
            hts_log_error("REF + alleles too long to fit in a BCF record");
            return -1;
        }
        char *fresh = malloc(needed);
        if (!fresh)
            return -1;
        // Keep the old buffer alive until the remaining alleles are copied,
        // as they may still point into it
        old_als = line->d.als;
        line->d.als = fresh;
        line->d.m_als = needed;
    }

    // Copy from the temp buffer to the destination
    if (staged) {
        assert(staged <= line->d.m_als);
        memcpy(line->d.als, staging, staged);
    }

    // Add in any remaining entries - if this happens we will always be
    // writing to a newly-allocated buffer.
    while (idx < nals) {
        size_t sz = strlen(alleles[idx]) + 1;
        memcpy(line->d.als + staged, alleles[idx], sz);
        staged += sz;
        idx++;
    }

    free(old_als);
    return _bcf1_sync_alleles(hdr,line,nals);
}
5957
5958
/*
 * Replace the record's alleles from a comma-separated string
 * ("REF,ALT1,ALT2,...").
 *
 * The string is copied into line->d.als, each ',' is turned into a NUL
 * separator, and the allele table is rebuilt with _bcf1_sync_alleles().
 *
 * Returns the result of _bcf1_sync_alleles() on success, -1 if the string
 * could not be copied (e.g. out of memory); in that case the allele
 * buffer is not parsed.
 */
int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string)
{
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
    kstring_t tmp;
    tmp.l = 0; tmp.s = line->d.als; tmp.m = line->d.m_als;
    int copy_failed = kputs(alleles_string, &tmp) < 0;
    line->d.als = tmp.s; line->d.m_als = tmp.m;
    // Do not go on to split a buffer that does not hold the new alleles
    // (it may even be NULL for a record built from scratch).
    if ( copy_failed ) return -1;

    int nals = 1;
    char *t = line->d.als;
    while (*t)
    {
        if ( *t==',' ) { *t = 0; nals++; }
        t++;
    }
    return _bcf1_sync_alleles(hdr, line, nals);
}
5975
5976
/*
 * Replace the record's ID with id, or with "." when id is NULL.
 *
 * Returns 0 on success, -1 if the new ID could not be stored (e.g. out of
 * memory); the record is not flagged as modified in that case.
 */
int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
{
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
    kstring_t tmp;
    tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id;
    int copy_failed = kputs(id ? id : ".", &tmp) < 0;
    line->d.id = tmp.s; line->d.m_id = tmp.m;
    if ( copy_failed ) return -1;
    line->d.shared_dirty |= BCF1_DIRTY_ID;
    return 0;
}
5989
5990
/*
 * Add id to the record's semicolon-separated ID list unless it is already
 * one of the entries.  An ID of "." (or no ID at all) is replaced rather
 * than appended to.
 *
 * Returns 0 on success, including when id is NULL or already present.
 * Returns -1 if the ID could not be stored (e.g. out of memory); the
 * existing ID text is left as it was.
 */
int bcf_add_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
{
    if ( !id ) return 0;
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);

    kstring_t tmp;
    tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id;

    // Look for id as a complete entry.  line->d.id may be NULL (it is
    // checked for NULL again below), so it must not be dereferenced blindly.
    size_t len = strlen(id);
    char *dst = line->d.id;
    while ( dst && *dst && (dst=strstr(dst,id)) )
    {
        if ( dst[len]!=0 && dst[len]!=';' ) dst++;              // a prefix, not a match
        else if ( dst==line->d.id || dst[-1]==';' ) return 0;   // already present
        dst++;  // a suffix, not a match
    }

    int append = line->d.id && (line->d.id[0]!='.' || line->d.id[1]);
    size_t old_len = 0;
    int failed = 0;
    if ( append )
    {
        old_len = strlen(line->d.id);
        tmp.l = old_len;
        if ( kputc(';',&tmp) < 0 ) failed = 1;
    }
    if ( !failed && kputs(id,&tmp) < 0 ) failed = 1;

    // The buffer may have been reallocated even if a later step failed
    line->d.id = tmp.s; line->d.m_id = tmp.m;
    if ( failed )
    {
        if ( append && tmp.s ) tmp.s[old_len] = 0;  // drop a ';' added without its id
        return -1;
    }
    line->d.shared_dirty |= BCF1_DIRTY_ID;
    return 0;

}
6018
6019
/*
 * Look up the FORMAT field named key in the record.
 * Returns NULL if key is not defined as FORMAT in the header, otherwise
 * the result of bcf_get_fmt_id() for the key's id.
 */
bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
{
    const int key_id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
    if ( bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,key_id) )
        return bcf_get_fmt_id(line, key_id);
    return NULL;   // no such FMT field in the header
}
6025
6026
/*
 * Look up the INFO field named key in the record.
 * Returns NULL if key is not defined as INFO in the header, otherwise
 * the result of bcf_get_info_id() for the key's id.
 */
bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
{
    const int key_id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
    if ( bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,key_id) )
        return bcf_get_info_id(line, key_id);
    return NULL;   // no such INFO field in the header
}
6032
6033
/*
 * Returns a pointer to the record's FORMAT entry with the given header
 * id, or NULL if the record has no such entry.  FORMAT data is unpacked
 * first if needed.
 */
bcf_fmt_t *bcf_get_fmt_id(bcf1_t *line, const int id)
{
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);

    int k = 0;
    while ( k < line->n_fmt )
    {
        bcf_fmt_t *entry = &line->d.fmt[k];
        if ( entry->id==id ) return entry;
        k++;
    }
    return NULL;
}
6043
6044
/*
 * Returns a pointer to the record's INFO entry with the given header id,
 * or NULL if the record has no such entry.  INFO data is unpacked first
 * if needed.
 */
bcf_info_t *bcf_get_info_id(bcf1_t *line, const int id)
{
    if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);

    int k = 0;
    while ( k < line->n_info )
    {
        bcf_info_t *entry = &line->d.info[k];
        if ( entry->key==id ) return entry;
        k++;
    }
    return NULL;
}
6054
6055
6056
/*
 * Copy the values of INFO field tag into the caller's buffer.
 *
 * *dst / *ndst describe a buffer that is grown with realloc() when too
 * small; *ndst counts elements of the requested type (bytes for
 * BCF_HT_STR).  type is the expected BCF_HT_* type.  Strings are copied
 * with a terminating NUL added.
 *
 * Returns the number of values written (string length for BCF_HT_STR),
 * 1/0 for a present/absent BCF_HT_FLAG, or a negative value:
 *   -1  tag is not defined as INFO in the header
 *   -2  type mismatch with the header, or an unexpected type
 *   -3  tag is not present in the record (or is marked for removal)
 *   -4  the buffer could not be enlarged; *dst and *ndst are unchanged
 */
int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
{
    int i, ret = -4, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,tag_id) ) return -1;    // no such INFO field in the header
    if ( bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id)!=(type & 0xff) ) return -2;     // expected different type

    if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);

    for (i=0; i<line->n_info; i++)
        if ( line->d.info[i].key==tag_id ) break;
    if ( i==line->n_info ) return ( type==BCF_HT_FLAG ) ? 0 : -3;       // the tag is not present in this record
    if ( type==BCF_HT_FLAG ) return 1;

    bcf_info_t *info = &line->d.info[i];
    if ( !info->vptr ) return -3;           // the tag was marked for removal
    if ( type==BCF_HT_STR )
    {
        if ( *ndst < info->len+1 )
        {
            // Go through a temporary so the caller's buffer is neither
            // leaked nor replaced by NULL if the allocation fails
            void *grown = realloc(*dst, (size_t)info->len + 1);
            if ( !grown ) return -4;
            *dst  = grown;
            *ndst = info->len + 1;
        }
        memcpy(*dst,info->vptr,info->len);
        ((uint8_t*)*dst)[info->len] = 0;
        return info->len;
    }

    // Make sure the buffer is big enough
    int size1;
    switch (type) {
        case BCF_HT_INT:  size1 = sizeof(int32_t); break;
        case BCF_HT_LONG: size1 = sizeof(int64_t); break;
        case BCF_HT_REAL: size1 = sizeof(float); break;
        default:
            hts_log_error("Unexpected output type %d at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
            return -2;
    }
    if ( *ndst < info->len )
    {
        // Size computed in size_t so that len * size1 cannot overflow int
        void *grown = realloc(*dst, (size_t)info->len * size1);
        if ( !grown ) return -4;
        *dst  = grown;
        *ndst = info->len;
    }

    // Decode info->len little-endian values of type_t, stopping early at a
    // vector-end marker, and store them as out_type_t; sets ret to the
    // number of values written.
    #define BRANCH(type_t, convert, is_missing, is_vector_end, set_missing, set_regular, out_type_t) do { \
        out_type_t *tmp = (out_type_t *) *dst; \
        int j; \
        for (j=0; j<info->len; j++) \
        { \
            type_t p = convert(info->vptr + j * sizeof(type_t)); \
            if ( is_vector_end ) break; \
            if ( is_missing ) set_missing; \
            else set_regular; \
            tmp++; \
        } \
        ret = j; \
    } while (0)
    switch (info->type) {
        case BCF_BT_INT8:
            if (type == BCF_HT_LONG) {
                BRANCH(int8_t,  le_to_i8,  p==bcf_int8_missing,  p==bcf_int8_vector_end,  *tmp=bcf_int64_missing, *tmp=p, int64_t);
            } else {
                BRANCH(int8_t,  le_to_i8,  p==bcf_int8_missing,  p==bcf_int8_vector_end,  *tmp=bcf_int32_missing, *tmp=p, int32_t);
            }
            break;
        case BCF_BT_INT16:
            if (type == BCF_HT_LONG) {
                BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t);
            } else {
                BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t);
            }
            break;
        case BCF_BT_INT32:
            if (type == BCF_HT_LONG) {
                BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t);
            } else {
                BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t);
            }
            break;
        case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set(tmp, p), float); break;
        default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, info->type, bcf_seqname_safe(hdr,line), line->pos+1); return -2;
    }
    #undef BRANCH
    return ret;  // set by BRANCH
}
6139
6140
/*
 * Copy a string-typed FORMAT tag into per-sample NUL-terminated strings.
 *
 *  hdr   - header used to resolve the tag and the number of samples
 *  line  - record to read; its FORMAT section is unpacked if not already done
 *  tag   - FORMAT tag name
 *  dst   - in/out: array of per-sample string pointers. If *dst is NULL the
 *          array is allocated here. All strings live in one contiguous
 *          buffer held in (*dst)[0]; (*dst)[i] points into that buffer.
 *          This function never frees either allocation.
 *  ndst  - in/out: size in bytes of the buffer held in (*dst)[0]; it is
 *          grown (and *ndst updated) when too small
 *
 * Returns the number of bytes used in the string buffer,
 * (fmt->n + 1) * nsamples, on success; or
 *   -1  the tag is not defined as a FORMAT field in the header
 *   -2  the tag is not of string type
 *   -3  the tag is absent from this record or marked for removal
 *   -4  memory could not be allocated (the existing buffer is left intact)
 */
int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst)
{
    int i, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1;    // no such FORMAT field in the header
    if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2;     // expected different type

    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);

    for (i=0; i<line->n_fmt; i++)
        if ( line->d.fmt[i].id==tag_id ) break;
    if ( i==line->n_fmt ) return -3;                               // the tag is not present in this record
    bcf_fmt_t *fmt = &line->d.fmt[i];
    if ( !fmt->p ) return -3;                                      // the tag was marked for removal

    int nsmpl = bcf_hdr_nsamples(hdr);

    // Each sample needs fmt->n characters plus a terminating NUL. Compute in
    // 64 bits so that an oversized request is rejected instead of overflowing.
    int64_t need = ((int64_t) fmt->n + 1) * nsmpl;
    if ( need > INT_MAX ) return -4;
    int n = (int) need;

    if ( !*dst )
    {
        // Always leave room for (*dst)[0], even when there are no samples,
        // as that slot owns the shared string buffer.
        char **ptrs = malloc(sizeof(char*) * (nsmpl > 0 ? nsmpl : 1));
        if ( !ptrs ) return -4;     // could not alloc
        ptrs[0] = NULL;
        *dst = ptrs;
    }
    if ( *ndst < n )
    {
        // Grow via a temporary so a failed realloc neither leaks the old
        // buffer nor leaves a NULL pointer paired with a stale *ndst.
        char *grown = realloc((*dst)[0], n);
        if ( !grown ) return -4;    // could not alloc
        (*dst)[0] = grown;
        *ndst = n;
    }
    for (i=0; i<nsmpl; i++)
    {
        uint8_t *src = fmt->p + (size_t) i * fmt->n;
        uint8_t *tmp = (uint8_t*)(*dst)[0] + (size_t) i * (fmt->n + 1);
        memcpy(tmp,src,fmt->n);
        tmp[fmt->n] = 0;
        (*dst)[i] = (char*) tmp;
    }
    return n;
}
6178
6179
/*
 * Copy the values of a FORMAT tag for all samples into a caller-owned buffer.
 *
 *  hdr   - header used to resolve the tag, its type and the number of samples
 *  line  - record to read; its FORMAT section is unpacked if not already done
 *  tag   - FORMAT tag name
 *  dst   - in/out: destination buffer, grown with realloc() when too small
 *  ndst  - in/out: capacity of *dst, in bytes for BCF_HT_STR and in elements
 *          (int32_t or float) otherwise
 *  type  - requested type; must match the header type, except for "GT" whose
 *          header type must be BCF_HT_STR whatever type is requested
 *
 * For numeric types every sample occupies fmt->n output slots: stored missing
 * values become the 32-bit missing value and everything from the first
 * vector-end marker onwards is filled with the vector-end value.
 *
 * Returns the number of bytes (BCF_HT_STR) or values (otherwise) written; or
 *   -1  the tag is not defined as a FORMAT field in the header
 *   -2  type mismatch, or the record stores the tag in an unexpected type
 *   -3  the tag is absent from this record or marked for removal
 *   -4  memory could not be allocated (*dst and *ndst are left unchanged)
 */
int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
{
    int i,j, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1;    // no such FORMAT field in the header
    if ( tag[0]=='G' && tag[1]=='T' && tag[2]==0 )
    {
        // Ugly: GT field is considered to be a string by the VCF header but BCF represents it as INT.
        if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2;
    }
    else if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=type ) return -2;     // expected different type

    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);

    for (i=0; i<line->n_fmt; i++)
        if ( line->d.fmt[i].id==tag_id ) break;
    if ( i==line->n_fmt ) return -3;                               // the tag is not present in this record
    bcf_fmt_t *fmt = &line->d.fmt[i];
    if ( !fmt->p ) return -3;                                      // the tag was marked for removal

    if ( type==BCF_HT_STR )
    {
        // Strings are copied verbatim: fmt->n bytes per sample, no terminator
        int n = fmt->n*bcf_hdr_nsamples(hdr);
        if ( *ndst < n )
        {
            // Use a temporary so the caller's buffer survives a failed realloc
            void *grown = realloc(*dst, n);
            if ( !grown ) return -4;     // could not alloc
            *dst  = grown;
            *ndst = n;
        }
        if ( n > 0 ) memcpy(*dst,fmt->p,n);
        return n;
    }

    // Make sure the buffer is big enough
    int nsmpl = bcf_hdr_nsamples(hdr);
    int size1 = type==BCF_HT_INT ? sizeof(int32_t) : sizeof(float);
    int nvals = fmt->n*nsmpl;
    if ( *ndst < nvals )
    {
        // Only publish the new capacity once the allocation has succeeded,
        // otherwise a later call could skip the realloc and write via NULL.
        void *grown = realloc(*dst, (size_t) nvals * size1);
        if ( !grown ) return -4;     // could not alloc
        *dst  = grown;
        *ndst = nvals;
    }

    // Convert one sample at a time from the stored width to the output type.
    // i and j are the function-level loop counters.
    #define BRANCH(type_t, convert, is_missing, is_vector_end, set_missing, set_vector_end, set_regular, out_type_t) { \
        out_type_t *tmp = (out_type_t *) *dst; \
        uint8_t *fmt_p = fmt->p; \
        for (i=0; i<nsmpl; i++) \
        { \
            for (j=0; j<fmt->n; j++) \
            { \
                type_t p = convert(fmt_p + j * sizeof(type_t)); \
                if ( is_missing ) set_missing; \
                else if ( is_vector_end ) { set_vector_end; break; } \
                else set_regular; \
                tmp++; \
            } \
            /* pad the rest of this sample's slots with vector-end */ \
            for (; j<fmt->n; j++) { set_vector_end; tmp++; } \
            fmt_p += fmt->size; \
        } \
    }
    switch (fmt->type) {
        case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8, p==bcf_int8_missing,  p==bcf_int8_vector_end,  *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
        case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
        case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
        case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set_vector_end(*tmp), bcf_float_set(tmp, p), float); break;
        default:
            // Report the problem to the caller rather than terminating the
            // whole process from inside the library.
            hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, fmt->type, bcf_seqname_safe(hdr,line), line->pos+1);
            return -2;
    }
    #undef BRANCH

    return nsmpl*fmt->n;
}
6249
6250
//error description structure definition
6251
typedef struct err_desc {
6252
    int  errorcode;
6253
    const char *description;
6254
}err_desc;
6255
6256
// error descriptions
6257
static const err_desc errdesc_bcf[] = {
6258
    { BCF_ERR_CTG_UNDEF, "Contig not defined in header"},
6259
    { BCF_ERR_TAG_UNDEF, "Tag not defined in header" },
6260
    { BCF_ERR_NCOLS, "Incorrect number of columns" },
6261
    { BCF_ERR_LIMITS, "Limits reached" },
6262
    { BCF_ERR_CHAR, "Invalid character" },
6263
    { BCF_ERR_CTG_INVALID, "Invalid contig" },
6264
    { BCF_ERR_TAG_INVALID, "Invalid tag" },
6265
};
6266
6267
/// Append a description to a buffer, writing "..." instead when it does not fit
/** @param buffer       buffer to which the description is appended
    @param offset       in/out: number of bytes already used in buffer;
                        advanced past the appended text on success
    @param maxbuffer    total size of the buffer (must be at least 4)
    @param description  the description to append

    Entries after the first are separated from the previous one by ','.
    When the description (plus separator and terminating NUL) does not fit,
    "..." is written instead - over the last 4 bytes of the buffer if 4 or
    fewer bytes remain, otherwise at the current offset - and offset is left
    unchanged.

    Returns 0 on success; -1 on invalid parameters or when the description
    did not fit.
*/
static int add_desc_to_buffer(char *buffer, size_t *offset, size_t maxbuffer, const char *description) {
    if (!description || !buffer || !offset || (maxbuffer < 4))
        return -1;

    const size_t room = maxbuffer - *offset;
    const int is_first = (room == maxbuffer);   // nothing written so far
    const size_t needed = strlen(description) + (is_first ? 0 : 1);

    if (room <= needed) {
        // no space for the text and its NUL: mark truncation, keep offset
        size_t dots_at = *offset;
        if (room <= 4)
            dots_at = maxbuffer - 4;
        snprintf(buffer + dots_at, 4, "...");
        return -1;
    }

    *offset += snprintf(buffer + *offset, room, "%s%s",
                        is_first ? "" : ",", description);
    return 0;
}
6290
6291
//get description for given error code. return NULL on error
6292
2.48k
const char *bcf_strerror(int errorcode, char *buffer, size_t maxbuffer) {
6293
2.48k
    size_t usedup = 0;
6294
2.48k
    int ret = 0;
6295
2.48k
    int idx;
6296
6297
2.48k
    if (!buffer || maxbuffer < 4)
6298
0
        return NULL;           //invalid / insufficient buffer
6299
6300
2.48k
    if (!errorcode) {
6301
0
        buffer[0] = '\0';      //no error, set null
6302
0
        return buffer;
6303
0
    }
6304
6305
19.9k
    for (idx = 0; idx < sizeof(errdesc_bcf) / sizeof(err_desc); ++idx) {
6306
17.4k
        if (errorcode & errdesc_bcf[idx].errorcode) {    //error is set, add description
6307
5.19k
            ret = add_desc_to_buffer(buffer, &usedup, maxbuffer, errdesc_bcf[idx].description);
6308
5.19k
            if (ret < 0)
6309
0
                break;         //not enough space, ... added, no need to continue
6310
6311
5.19k
            errorcode &= ~errdesc_bcf[idx].errorcode;    //reset the error
6312
5.19k
        }
6313
17.4k
    }
6314
6315
2.48k
    if (errorcode && (ret >= 0))  {     //undescribed error is present in error code and had enough buffer, try to add unkonwn error as well§
6316
0
        add_desc_to_buffer(buffer, &usedup, maxbuffer, "Unknown error");
6317
0
    }
6318
2.48k
    return buffer;
6319
2.48k
}
6320
6321
/**
6322
 *  bcf_format_gt_v2 - formats GT information on a string
6323
 *  @param hdr - bcf header, to get version
6324
 *  @param fmt - pointer to bcf format data
6325
 *  @param isample - position of interested sample in data
6326
 *  @param str - pointer to output string
6327
 *  Returns 0 on success and -1 on failure
6328
 *  This method is preferred over bcf_format_gt as this supports vcf4.4 and
6329
 *  prefixed phasing. Explicit / prefixed phasing for 1st allele is used only
6330
 *  when it is a must to correctly express phasing.
6331
 * correctly express phasing.
6332
 */
6333
int bcf_format_gt_v2(const bcf_hdr_t *hdr, bcf_fmt_t *fmt, int isample, kstring_t *str)
6334
17.2k
{
6335
17.2k
    uint32_t e = 0;
6336
17.2k
    int ploidy = 1, anyunphased = 0;
6337
17.2k
    int32_t val0 = 0;
6338
17.2k
    size_t pos = str ? str->l : 0;
6339
6340
17.2k
    #define BRANCH(type_t, convert, missing, vector_end) { \
6341
13.4k
        uint8_t *ptr = fmt->p + isample*fmt->size; \
6342
13.4k
        int i; \
6343
35.0k
        for (i=0; i<fmt->n; i++, ptr += sizeof(type_t)) \
6344
29.1k
        { \
6345
29.1k
            type_t val = convert(ptr); \
6346
29.1k
            if ( val == vector_end ) break; \
6347
29.1k
            if (!i) { val0 = val; } \
6348
21.5k
            if (i) { \
6349
8.06k
                e |= kputc("/|"[val & 1], str) < 0; \
6350
8.06k
                anyunphased |= !(val & 1); \
6351
8.06k
            } \
6352
21.5k
            if (!(val >> 1)) e |= kputc('.', str) < 0; \
6353
21.5k
            else e |= kputw((val >> 1) - 1, str) < 0; \
6354
21.5k
        } \
6355
13.4k
        if (i == 0) e |= kputc('.', str) < 0; \
6356
13.4k
        ploidy = i; \
6357
13.4k
    }
6358
17.2k
    switch (fmt->type) {
6359
6.86k
        case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8,  bcf_int8_missing,
6360
6.86k
            bcf_int8_vector_end); break;
6361
1.77k
        case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, bcf_int16_missing,
6362
1.77k
            bcf_int16_vector_end); break;
6363
4.86k
        case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, bcf_int32_missing,
6364
4.86k
            bcf_int32_vector_end); break;
6365
3.76k
        case BCF_BT_NULL:  e |= kputc('.', str) < 0; break;
6366
0
        default: hts_log_error("Unexpected type %d", fmt->type); return -2;
6367
17.2k
    }
6368
17.2k
    #undef BRANCH
6369
6370
17.2k
    if (hdr && get_hdr_aux(hdr)->version >= VCF44) {
6371
        //output which supports prefixed phasing
6372
6373
        /* update 1st allele's phasing if required and append rest to it.
6374
        use prefixed phasing only when it is a must. i.e. without which the
6375
        inferred value will be incorrect */
6376
8.31k
        if (val0 & 1) {
6377
            /* 1st one is phased, if ploidy is > 1 and an unphased allele exists
6378
             need to specify explicitly */
6379
1.65k
            e |= (ploidy > 1 && anyunphased) ?
6380
757
                    (kinsert_char('|', pos, str) < 0) :
6381
1.65k
                        (ploidy <= 1 && !((val0 >> 1)) ? //|. needs explicit o/p
6382
0
                            (kinsert_char('|', pos, str) < 0) :
6383
894
                            0);
6384
6.66k
        } else {
6385
            /* 1st allele is unphased, if ploidy is = 1 or allele is '.' or
6386
             ploidy > 1 and no other unphased allele exist, need to specify
6387
             explicitly */
6388
6.66k
            e |= ((ploidy <= 1 && val0 != 0) || (ploidy > 1 && !anyunphased)) ?
6389
3.69k
                    (kinsert_char('/', pos, str) < 0) :
6390
6.66k
                    0;
6391
6.66k
        }
6392
8.31k
    }
6393
17.2k
    return e == 0 ? 0 : -1;
6394
17.2k
}
6395
6396
/**
6397
 *  get_rlen - calculates and returns rlen value
6398
 *  @param h - bcf header
6399
 *  @param v - bcf data
6400
 *  Returns rlen calculated on success and -1 on failure.
6401
 *  rlen calculation is dependent on vcf version and a few other field data.
6402
 *  When bcf decoded data is available, refers it. When not available, retrieves
6403
 *  required field data by seeking on the data stream.
6404
 *  Ideally pos & version be set appropriately before any info/format field
6405
 *  update to have proper rlen calculation.
6406
 *  As version is not kept properly updated in practice, it is ignored in calcs.
6407
 */
6408
static int64_t get_rlen(const bcf_hdr_t *h, bcf1_t *v)
6409
57.2k
{
6410
57.2k
    uint8_t *f = (uint8_t*)v->shared.s, *t = NULL,
6411
57.2k
        *e = (uint8_t*)v->shared.s + v->shared.l;
6412
57.2k
    int size, type, id, lenid, endid, svlenid, i, bad, gvcf = 0, use_svlen = 0;
6413
57.2k
    bcf_info_t *endinfo = NULL, *svleninfo = NULL, end_lcl, svlen_lcl;
6414
57.2k
    bcf_fmt_t *lenfmt = NULL, len_lcl;
6415
6416
    //holds SVLEN allele status for the max no of alleles
6417
57.2k
    uint8_t svlenals[8192];
6418
    //pos from info END, fmt LEN, info SVLEN
6419
57.2k
    hts_pos_t end = 0, end_fmtlen = 0, end_svlen = 0, hpos;
6420
57.2k
    int64_t len_ref = 0, len = 0, tmp;
6421
57.2k
    endid = bcf_hdr_id2int(h, BCF_DT_ID, "END");
6422
6423
    //initialise bytes which are to be used
6424
57.2k
    memset(svlenals, 0, 1 + v->n_allele / 8);
6425
6426
    //use decoded data where ever available and where not, get from stream
6427
57.2k
    if (v->unpacked & BCF_UN_STR || v->d.shared_dirty & BCF1_DIRTY_ALS) {
6428
0
        for (i = 1; i < v->n_allele; ++i) {
6429
            // check only symbolic alt alleles
6430
0
            if (v->d.allele[i][0] != '<')
6431
0
                continue;
6432
0
            if (svlen_on_ref_for_vcf_alt(v->d.allele[i], -1)) {
6433
                // del, dup or cnv allele, note to check corresponding svlen val
6434
0
                svlenals[i >> 3] |= 1 << (i & 7);
6435
0
                use_svlen = 1;
6436
0
            } else if (!strcmp(v->d.allele[i], "<*>") ||
6437
0
                         !strcmp(v->d.allele[i], "<NON_REF>")) {
6438
0
                gvcf = 1;   //gvcf present, have to check for LEN field
6439
0
            }
6440
0
        }
6441
0
        f += v->unpack_size[0] + v->unpack_size[1];
6442
0
        len_ref = v->n_allele ? strlen(v->d.allele[0]) : 0;
6443
57.2k
    } else if (f < e) {
6444
        //skip ID
6445
57.2k
        size = bcf_dec_size(f, &f, &type);
6446
57.2k
        f += size << bcf_type_shift[type];
6447
        // REF, ALT
6448
3.67M
        for (i = 0; i < v->n_allele; ++i) {
6449
            //check all alleles, w/o NUL
6450
3.61M
            size = bcf_dec_size(f, &f, &type);
6451
3.61M
            if (!i) {   //REF length
6452
57.2k
                len_ref = size;
6453
3.56M
            } else if (size > 0 && *f == '<') {
6454
10.8k
                if (svlen_on_ref_for_vcf_alt((char *) f, size)) {
6455
                    // del, dup or cnv allele, note to check corresponding svlen val
6456
10
                    svlenals[i >> 3] |= 1 << (i & 7);
6457
10
                    use_svlen = 1;
6458
10.8k
                } else if ((size == 3 && !strncmp((char*)f, "<*>", size)) ||
6459
7.38k
                    (size == 9 && !strncmp((char*)f, "<NON_REF>", size))) {
6460
3.62k
                    gvcf = 1;   //gvcf present, have to check for LEN field
6461
3.62k
                }
6462
10.8k
            }
6463
3.61M
            f += size << bcf_type_shift[type];
6464
3.61M
        }
6465
57.2k
    }
6466
    // FILTER
6467
57.2k
    if (v->unpacked & BCF_UN_FLT) {
6468
0
        f += v->unpack_size[2];
6469
57.2k
    } else if (f < e) {
6470
57.2k
        size = bcf_dec_size(f, &f, &type);
6471
57.2k
        f += size << bcf_type_shift[type];
6472
57.2k
    }
6473
6474
    // Only do SVLEN lookup if there are suitable symbolic alleles
6475
57.2k
    svlenid = use_svlen ? bcf_hdr_id2int(h, BCF_DT_ID, "SVLEN") : -1;
6476
6477
    // INFO
6478
57.2k
    if (svlenid >= 0 || endid >= 0 ) {  //only if end/svlen present
6479
22.0k
        if (v->unpacked & BCF_UN_INFO || v->d.shared_dirty & BCF1_DIRTY_INF) {
6480
0
            endinfo = bcf_get_info(h, v, "END");
6481
0
            svleninfo = bcf_get_info(h, v, "SVLEN");
6482
22.0k
        } else if (f < e) {
6483
26.9k
            for (i = 0; i < v->n_info; ++i) {
6484
18.0k
                id = bcf_dec_typed_int1(f, &t);
6485
18.0k
                if (id == endid) {  //END
6486
2.26k
                    t = bcf_unpack_info_core1(f, &end_lcl);
6487
2.26k
                    endinfo = &end_lcl;
6488
2.26k
                    if (svleninfo || svlenid < 0) {
6489
2.26k
                        break;  //already got svlen or no need to search further
6490
2.26k
                    }
6491
15.7k
                } else if (id == svlenid) { //SVLEN
6492
0
                    t = bcf_unpack_info_core1(f, &svlen_lcl);
6493
0
                    svleninfo = &svlen_lcl;
6494
0
                    if (endinfo || endid < 0 ) {
6495
0
                        break;  //already got end or no need to search further
6496
0
                    }
6497
15.7k
                } else {
6498
15.7k
                    f = t;
6499
15.7k
                    size = bcf_dec_size(f, &t, &type);
6500
15.7k
                    t += size << bcf_type_shift[type];
6501
15.7k
                }
6502
15.7k
                f = t;
6503
15.7k
            }
6504
11.2k
        }
6505
22.0k
    }
6506
6507
    // Only do LEN lookup if a <*> allele was found
6508
57.2k
    lenid = gvcf ? bcf_hdr_id2int(h, BCF_DT_ID, "LEN") : -1;
6509
6510
    // FORMAT
6511
57.2k
    if (lenid >= 0) {
6512
        //with LEN and has gvcf allele
6513
0
        f = (uint8_t*)v->indiv.s; t = NULL; e = (uint8_t*)v->indiv.s + v->indiv.l;
6514
0
        if (v->unpacked & BCF_UN_FMT || v->d.indiv_dirty) {
6515
0
            lenfmt = bcf_get_fmt(h, v, "LEN");
6516
0
        } else if (f < e) {
6517
0
            for (i = 0; i < v->n_fmt; ++i) {
6518
0
                id = bcf_dec_typed_int1(f, &t);
6519
0
                if (id == lenid) {
6520
0
                        t = bcf_unpack_fmt_core1(f, v->n_sample, &len_lcl);
6521
0
                    lenfmt = &len_lcl;
6522
0
                    break;  //that's all needed
6523
0
                } else {
6524
0
                    f = t;
6525
0
                    size = bcf_dec_size(f, &t, &type);
6526
0
                    t += size * v->n_sample << bcf_type_shift[type];
6527
0
                }
6528
0
                f = t;
6529
0
            }
6530
0
        }
6531
0
    }
6532
    //got required data, find end and rlen
6533
57.2k
    if (endinfo && endinfo->vptr) { //end position given by info END
6534
        //end info exists, not being deleted
6535
2.26k
        end = endinfo->v1.i;
6536
2.26k
        switch(endinfo->type) {
6537
0
            case BCF_BT_INT8:  end = end == bcf_int8_missing ? 0 : end;  break;
6538
0
            case BCF_BT_INT16: end = end == bcf_int16_missing ? 0 : end; break;
6539
0
            case BCF_BT_INT32: end = end == bcf_int32_missing ? 0 : end; break;
6540
0
            case BCF_BT_INT64: end = end == bcf_int64_missing ? 0 : end; break;
6541
2.26k
            default: end = 0; break; //invalid
6542
2.26k
        }
6543
2.26k
    }
6544
6545
57.2k
    if (svleninfo && svleninfo->vptr) {
6546
        //svlen info exists, not being deleted
6547
0
        bad = 0;
6548
        //get largest svlen corresponding to a <DEL> symbolic allele
6549
0
        for (i = 0; i < svleninfo->len && i + 1 < v->n_allele; ++i) {
6550
0
            if (!(svlenals[i >> 3] & (1 << ((i + 1) & 7))))
6551
0
                continue;
6552
6553
0
            switch(svleninfo->type) {
6554
0
                case BCF_BT_INT8:
6555
0
                    tmp = le_to_i8(&svleninfo->vptr[i]);
6556
0
                    tmp = tmp == bcf_int8_missing ? 0 : tmp;
6557
0
                break;
6558
0
                case BCF_BT_INT16:
6559
0
                    tmp = le_to_i16(&svleninfo->vptr[i * 2]);
6560
0
                    tmp = tmp == bcf_int16_missing ? 0 : tmp;
6561
0
                break;
6562
0
                case BCF_BT_INT32:
6563
0
                    tmp = le_to_i32(&svleninfo->vptr[i * 4]);
6564
0
                    tmp = tmp == bcf_int32_missing ? 0 : tmp;
6565
0
                break;
6566
0
                case BCF_BT_INT64:
6567
0
                    tmp = le_to_i64(&svleninfo->vptr[i * 8]);
6568
0
                    tmp = tmp == bcf_int64_missing ? 0 : tmp;
6569
0
                break;
6570
0
                default: //invalid
6571
0
                    tmp = 0;
6572
0
                    bad = 1;
6573
0
                break;
6574
0
            }
6575
0
            if (bad) {  //stop svlen check
6576
0
                len = 0;
6577
0
                break;
6578
0
            }
6579
6580
0
            tmp = tmp < 0 ? llabs(tmp) : tmp;
6581
0
            if (len < tmp) len = tmp;
6582
0
        }
6583
0
    }
6584
57.2k
    if ((!svleninfo || !len) && end) { //no svlen, infer from end
6585
0
        len = end > v->pos ? end - v->pos - 1 : 0;
6586
0
    }
6587
57.2k
    end_svlen = v->pos + len + 1;   //end position found from SVLEN
6588
6589
57.2k
    len = 0;
6590
57.2k
    if (lenfmt && lenfmt->p) {
6591
        //fmt len exists, not being deleted, has gvcf and version >= 4.5
6592
0
        int j = 0;
6593
0
        int64_t offset = 0;
6594
0
        bad = 0;
6595
0
        for (i = 0; i < v->n_sample; ++i) {
6596
0
            for (j = 0; j < lenfmt->n; ++j) {
6597
0
                switch(lenfmt->type) {
6598
0
                case BCF_BT_INT8:
6599
0
                    tmp = le_to_i8(lenfmt->p + offset + j);
6600
0
                    tmp = tmp == bcf_int8_missing ? 0 : tmp;
6601
0
                break;
6602
0
                case BCF_BT_INT16:
6603
0
                    tmp = le_to_i16(lenfmt->p + offset + j * 2);
6604
0
                    tmp = tmp == bcf_int16_missing ? 0 : tmp;
6605
0
                break;
6606
0
                case BCF_BT_INT32:
6607
0
                    tmp = le_to_i32(lenfmt->p + offset + j * 4);
6608
0
                    tmp = tmp == bcf_int32_missing ? 0 : tmp;
6609
0
                break;
6610
0
                case BCF_BT_INT64:
6611
0
                    tmp = le_to_i64(lenfmt->p + offset + j * 8);
6612
0
                    tmp = tmp == bcf_int64_missing ? 0 : tmp;
6613
0
                break;
6614
0
                default: //invalid
6615
0
                    bad = 1;
6616
0
                break;
6617
0
                }
6618
0
                if (bad) {  //stop LEN check
6619
0
                    len = 0;
6620
0
                    break;
6621
0
                }
6622
                //assumes only gvcf have valid LEN
6623
0
                if (len < tmp) len = tmp;
6624
0
            }
6625
0
            offset += j << bcf_type_shift[lenfmt->type];
6626
0
        }
6627
0
    }
6628
57.2k
    if ((!lenfmt || !len) && end) { //no fmt len, infer from end
6629
0
        len = end > v->pos ? end - v->pos : 0;
6630
0
    }
6631
57.2k
    end_fmtlen = v->pos + len;  //end position found from LEN
6632
6633
    //get largest pos, based on END, SVLEN, fmt LEN and length using it
6634
57.2k
    hpos = end < end_svlen ?
6635
13.4k
            end_svlen < end_fmtlen ? end_fmtlen : end_svlen :
6636
57.2k
            end < end_fmtlen ? end_fmtlen : end;
6637
57.2k
    len = hpos - v->pos;
6638
6639
    //NOTE: 'end' calculation be in sync with tbx.c:tbx_parse1
6640
6641
    /* rlen to be calculated based on version, END, SVLEN, fmt LEN, ref len.
6642
    Relevance of these fields vary across different vcf versions.
6643
    Many times, these info/fmt fields are used without version updates;
6644
    hence these fields are used for calculation disregarding vcf version */
6645
57.2k
    return len < len_ref ? len_ref : len;
6646
57.2k
}