Coverage Report

Created: 2024-02-11 06:32

/src/htslib/vcf.c
Line
Count
Source (jump to first uncovered line)
1
/*  vcf.c -- VCF/BCF API functions.
2
3
    Copyright (C) 2012, 2013 Broad Institute.
4
    Copyright (C) 2012-2023 Genome Research Ltd.
5
    Portions copyright (C) 2014 Intel Corporation.
6
7
    Author: Heng Li <lh3@sanger.ac.uk>
8
9
Permission is hereby granted, free of charge, to any person obtaining a copy
10
of this software and associated documentation files (the "Software"), to deal
11
in the Software without restriction, including without limitation the rights
12
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13
copies of the Software, and to permit persons to whom the Software is
14
furnished to do so, subject to the following conditions:
15
16
The above copyright notice and this permission notice shall be included in
17
all copies or substantial portions of the Software.
18
19
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25
DEALINGS IN THE SOFTWARE.  */
26
27
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
28
#include <config.h>
29
30
#include <stdio.h>
31
#include <assert.h>
32
#include <string.h>
33
#include <strings.h>
34
#include <stdlib.h>
35
#include <limits.h>
36
#include <stdint.h>
37
#include <inttypes.h>
38
#include <errno.h>
39
40
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
41
#include "fuzz_settings.h"
42
#endif
43
44
#include "htslib/vcf.h"
45
#include "htslib/bgzf.h"
46
#include "htslib/tbx.h"
47
#include "htslib/hfile.h"
48
#include "hts_internal.h"
49
#include "htslib/hts_endian.h"
50
#include "htslib/khash_str2int.h"
51
#include "htslib/kstring.h"
52
#include "htslib/sam.h"
53
#include "htslib/khash.h"
54
55
#if 0
56
// This helps on Intel a bit, often 6-7% faster VCF parsing.
57
// Conversely sometimes harms AMD Zen4 as ~9% slower.
58
// Possibly related to IPC differences.  However for now it's just a
59
// curiousity we ignore and stick with the simpler code.
60
//
61
// Left here as a hint for future explorers.
62
static inline int xstreq(const char *a, const char *b) {
63
    while (*a && *a == *b)
64
        a++, b++;
65
    return *a == *b;
66
}
67
68
#define KHASH_MAP_INIT_XSTR(name, khval_t) \
69
  KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, xstreq)
70
71
KHASH_MAP_INIT_XSTR(vdict, bcf_idinfo_t)
72
#else
73
KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t)
74
#endif
75
76
typedef khash_t(vdict) vdict_t;
77
78
KHASH_MAP_INIT_STR(hdict, bcf_hrec_t*)
79
typedef khash_t(hdict) hdict_t;
80
81
82
#include "htslib/kseq.h"
83
HTSLIB_EXPORT
84
uint32_t bcf_float_missing    = 0x7F800001;
85
86
HTSLIB_EXPORT
87
uint32_t bcf_float_vector_end = 0x7F800002;
88
89
HTSLIB_EXPORT
90
uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
91
92
static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, NULL, NULL}, .id = -1 };
93
94
/*
95
    Partial support for 64-bit POS and Number=1 INFO tags.
96
    Notes:
97
     - the support for 64-bit values is motivated by POS and INFO/END for large genomes
98
     - the use of 64-bit values does not conform to the specification
99
     - cannot output 64-bit BCF and if it does, it is not compatible with anything
100
     - experimental, use at your risk
101
*/
102
#ifdef VCF_ALLOW_INT64
103
    #define BCF_MAX_BT_INT64 (0x7fffffffffffffff)       /* INT64_MAX, for internal use only */
104
    #define BCF_MIN_BT_INT64 -9223372036854775800LL     /* INT64_MIN + 8, for internal use only */
105
#endif
106
107
519
#define BCF_IS_64BIT (1<<30)
108
109
110
// Opaque structure with auxilary data which allows to extend bcf_hdr_t without breaking ABI.
111
// Note that this preserving API and ABI requires that the first element is vdict_t struct
112
// rather than a pointer, as user programs may (and in some cases do) access the dictionary
113
// directly as (vdict_t*)hdr->dict.
114
typedef struct
115
{
116
    vdict_t dict;   // bcf_hdr_t.dict[0] vdict_t dictionary which keeps bcf_idinfo_t for BCF_HL_FLT,BCF_HL_INFO,BCF_HL_FMT
117
    hdict_t *gen;   // hdict_t dictionary which keeps bcf_hrec_t* pointers for generic and structured fields
118
    size_t *key_len;// length of h->id[BCF_DT_ID] strings
119
}
120
bcf_hdr_aux_t;
121
122
static inline bcf_hdr_aux_t *get_hdr_aux(const bcf_hdr_t *hdr)
123
316k
{
124
316k
    return (bcf_hdr_aux_t *)hdr->dict[0];
125
316k
}
126
127
static char *find_chrom_header_line(char *s)
128
0
{
129
0
    char *nl;
130
0
    if (strncmp(s, "#CHROM\t", 7) == 0) return s;
131
0
    else if ((nl = strstr(s, "\n#CHROM\t")) != NULL) return nl+1;
132
0
    else return NULL;
133
0
}
134
135
/*************************
136
 *** VCF header parser ***
137
 *************************/
138
139
static int bcf_hdr_add_sample_len(bcf_hdr_t *h, const char *s, size_t len)
140
11.8k
{
141
11.8k
    const char *ss = s;
142
12.1k
    while ( *ss && isspace_c(*ss) && ss - s < len) ss++;
143
11.8k
    if ( !*ss || ss - s == len)
144
4
    {
145
4
        hts_log_error("Empty sample name: trailing spaces/tabs in the header line?");
146
4
        return -1;
147
4
    }
148
149
11.8k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_SAMPLE];
150
11.8k
    int ret;
151
11.8k
    char *sdup = malloc(len + 1);
152
11.8k
    if (!sdup) return -1;
153
11.8k
    memcpy(sdup, s, len);
154
11.8k
    sdup[len] = 0;
155
156
    // Ensure space is available in h->samples
157
11.8k
    size_t n = kh_size(d);
158
11.8k
    char **new_samples = realloc(h->samples, sizeof(char*) * (n + 1));
159
11.8k
    if (!new_samples) {
160
0
        free(sdup);
161
0
        return -1;
162
0
    }
163
11.8k
    h->samples = new_samples;
164
165
11.8k
    int k = kh_put(vdict, d, sdup, &ret);
166
11.8k
    if (ret < 0) {
167
0
        free(sdup);
168
0
        return -1;
169
0
    }
170
11.8k
    if (ret) { // absent
171
11.8k
        kh_val(d, k) = bcf_idinfo_def;
172
11.8k
        kh_val(d, k).id = n;
173
11.8k
    } else {
174
0
        hts_log_error("Duplicated sample name '%s'", sdup);
175
0
        free(sdup);
176
0
        return -1;
177
0
    }
178
11.8k
    h->samples[n] = sdup;
179
11.8k
    h->dirty = 1;
180
11.8k
    return 0;
181
11.8k
}
182
183
int bcf_hdr_add_sample(bcf_hdr_t *h, const char *s)
184
0
{
185
0
    if (!s) {
186
        // Allowed for backwards-compatibility, calling with s == NULL
187
        // used to trigger bcf_hdr_sync(h);
188
0
        return 0;
189
0
    }
190
0
    return bcf_hdr_add_sample_len(h, s, strlen(s));
191
0
}
192
193
int HTS_RESULT_USED bcf_hdr_parse_sample_line(bcf_hdr_t *hdr, const char *str)
194
4.48k
{
195
4.48k
    const char *mandatory = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO";
196
4.48k
    if ( strncmp(str,mandatory,strlen(mandatory)) )
197
80
    {
198
80
        hts_log_error("Could not parse the \"#CHROM..\" line, either the fields are incorrect or spaces are present instead of tabs:\n\t%s",str);
199
80
        return -1;
200
80
    }
201
202
4.40k
    const char *beg = str + strlen(mandatory), *end;
203
4.40k
    if ( !*beg || *beg=='\n' ) return 0;
204
1.53k
    if ( strncmp(beg,"\tFORMAT\t",8) )
205
24
    {
206
24
        hts_log_error("Could not parse the \"#CHROM..\" line, either FORMAT is missing or spaces are present instead of tabs:\n\t%s",str);
207
24
        return -1;
208
24
    }
209
1.51k
    beg += 8;
210
211
1.51k
    int ret = 0;
212
11.8k
    while ( *beg )
213
11.8k
    {
214
11.8k
        end = beg;
215
869M
        while ( *end && *end!='\t' && *end!='\n' ) end++;
216
11.8k
        if ( bcf_hdr_add_sample_len(hdr, beg, end-beg) < 0 ) ret = -1;
217
11.8k
        if ( !*end || *end=='\n' || ret<0 ) break;
218
10.3k
        beg = end + 1;
219
10.3k
    }
220
1.51k
    return ret;
221
1.53k
}
222
223
int bcf_hdr_sync(bcf_hdr_t *h)
224
110k
{
225
110k
    int i;
226
441k
    for (i = 0; i < 3; i++)
227
331k
    {
228
331k
        vdict_t *d = (vdict_t*)h->dict[i];
229
331k
        khint_t k;
230
331k
        if ( h->n[i] < kh_size(d) )
231
1.47k
        {
232
1.47k
            bcf_idpair_t *new_idpair;
233
            // this should be true only for i=2, BCF_DT_SAMPLE
234
1.47k
            new_idpair = (bcf_idpair_t*) realloc(h->id[i], kh_size(d)*sizeof(bcf_idpair_t));
235
1.47k
            if (!new_idpair) return -1;
236
1.47k
            h->n[i] = kh_size(d);
237
1.47k
            h->id[i] = new_idpair;
238
1.47k
        }
239
3.84G
        for (k=kh_begin(d); k<kh_end(d); k++)
240
3.84G
        {
241
3.84G
            if (!kh_exist(d,k)) continue;
242
27.3M
            h->id[i][kh_val(d,k).id].key = kh_key(d,k);
243
27.3M
            h->id[i][kh_val(d,k).id].val = &kh_val(d,k);
244
27.3M
        }
245
331k
    }
246
247
    // Invalidate key length cache
248
110k
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
249
110k
    if (aux && aux->key_len) {
250
2.19k
        free(aux->key_len);
251
2.19k
        aux->key_len = NULL;
252
2.19k
    }
253
254
110k
    h->dirty = 0;
255
110k
    return 0;
256
110k
}
257
258
void bcf_hrec_destroy(bcf_hrec_t *hrec)
259
200k
{
260
200k
    if (!hrec) return;
261
195k
    free(hrec->key);
262
195k
    if ( hrec->value ) free(hrec->value);
263
195k
    int i;
264
791k
    for (i=0; i<hrec->nkeys; i++)
265
596k
    {
266
596k
        free(hrec->keys[i]);
267
596k
        free(hrec->vals[i]);
268
596k
    }
269
195k
    free(hrec->keys);
270
195k
    free(hrec->vals);
271
195k
    free(hrec);
272
195k
}
273
274
// Copies all fields except IDX.
275
bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec)
276
0
{
277
0
    int save_errno;
278
0
    bcf_hrec_t *out = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
279
0
    if (!out) return NULL;
280
281
0
    out->type = hrec->type;
282
0
    if ( hrec->key ) {
283
0
        out->key = strdup(hrec->key);
284
0
        if (!out->key) goto fail;
285
0
    }
286
0
    if ( hrec->value ) {
287
0
        out->value = strdup(hrec->value);
288
0
        if (!out->value) goto fail;
289
0
    }
290
0
    out->nkeys = hrec->nkeys;
291
0
    out->keys = (char**) malloc(sizeof(char*)*hrec->nkeys);
292
0
    if (!out->keys) goto fail;
293
0
    out->vals = (char**) malloc(sizeof(char*)*hrec->nkeys);
294
0
    if (!out->vals) goto fail;
295
0
    int i, j = 0;
296
0
    for (i=0; i<hrec->nkeys; i++)
297
0
    {
298
0
        if ( hrec->keys[i] && !strcmp("IDX",hrec->keys[i]) ) continue;
299
0
        if ( hrec->keys[i] ) {
300
0
            out->keys[j] = strdup(hrec->keys[i]);
301
0
            if (!out->keys[j]) goto fail;
302
0
        }
303
0
        if ( hrec->vals[i] ) {
304
0
            out->vals[j] = strdup(hrec->vals[i]);
305
0
            if (!out->vals[j]) goto fail;
306
0
        }
307
0
        j++;
308
0
    }
309
0
    if ( i!=j ) out->nkeys -= i-j;   // IDX was omitted
310
0
    return out;
311
312
0
 fail:
313
0
    save_errno = errno;
314
0
    hts_log_error("%s", strerror(errno));
315
0
    bcf_hrec_destroy(out);
316
0
    errno = save_errno;
317
0
    return NULL;
318
0
}
319
320
void bcf_hrec_debug(FILE *fp, bcf_hrec_t *hrec)
321
0
{
322
0
    fprintf(fp, "key=[%s] value=[%s]", hrec->key, hrec->value?hrec->value:"");
323
0
    int i;
324
0
    for (i=0; i<hrec->nkeys; i++)
325
0
        fprintf(fp, "\t[%s]=[%s]", hrec->keys[i],hrec->vals[i]);
326
0
    fprintf(fp, "\n");
327
0
}
328
329
void bcf_header_debug(bcf_hdr_t *hdr)
330
0
{
331
0
    int i, j;
332
0
    for (i=0; i<hdr->nhrec; i++)
333
0
    {
334
0
        if ( !hdr->hrec[i]->value )
335
0
        {
336
0
            fprintf(stderr, "##%s=<", hdr->hrec[i]->key);
337
0
            fprintf(stderr,"%s=%s", hdr->hrec[i]->keys[0], hdr->hrec[i]->vals[0]);
338
0
            for (j=1; j<hdr->hrec[i]->nkeys; j++)
339
0
                fprintf(stderr,",%s=%s", hdr->hrec[i]->keys[j], hdr->hrec[i]->vals[j]);
340
0
            fprintf(stderr,">\n");
341
0
        }
342
0
        else
343
0
            fprintf(stderr,"##%s=%s\n", hdr->hrec[i]->key,hdr->hrec[i]->value);
344
0
    }
345
0
}
346
347
int bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, size_t len)
348
482k
{
349
482k
    char **tmp;
350
482k
    size_t n = hrec->nkeys + 1;
351
482k
    assert(len > 0 && len < SIZE_MAX);
352
482k
    tmp = realloc(hrec->keys, sizeof(char*)*n);
353
482k
    if (!tmp) return -1;
354
482k
    hrec->keys = tmp;
355
482k
    tmp = realloc(hrec->vals, sizeof(char*)*n);
356
482k
    if (!tmp) return -1;
357
482k
    hrec->vals = tmp;
358
359
482k
    hrec->keys[hrec->nkeys] = (char*) malloc((len+1)*sizeof(char));
360
482k
    if (!hrec->keys[hrec->nkeys]) return -1;
361
482k
    memcpy(hrec->keys[hrec->nkeys],str,len);
362
482k
    hrec->keys[hrec->nkeys][len] = 0;
363
482k
    hrec->vals[hrec->nkeys] = NULL;
364
482k
    hrec->nkeys = n;
365
482k
    return 0;
366
482k
}
367
368
int bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, size_t len, int is_quoted)
369
482k
{
370
482k
    if ( hrec->vals[i] ) {
371
0
        free(hrec->vals[i]);
372
0
        hrec->vals[i] = NULL;
373
0
    }
374
482k
    if ( !str ) return 0;
375
482k
    if ( is_quoted )
376
199k
    {
377
199k
        if (len >= SIZE_MAX - 3) {
378
0
            errno = ENOMEM;
379
0
            return -1;
380
0
        }
381
199k
        hrec->vals[i] = (char*) malloc((len+3)*sizeof(char));
382
199k
        if (!hrec->vals[i]) return -1;
383
199k
        hrec->vals[i][0] = '"';
384
199k
        memcpy(&hrec->vals[i][1],str,len);
385
199k
        hrec->vals[i][len+1] = '"';
386
199k
        hrec->vals[i][len+2] = 0;
387
199k
    }
388
282k
    else
389
282k
    {
390
282k
        if (len == SIZE_MAX) {
391
0
            errno = ENOMEM;
392
0
            return -1;
393
0
        }
394
282k
        hrec->vals[i] = (char*) malloc((len+1)*sizeof(char));
395
282k
        if (!hrec->vals[i]) return -1;
396
282k
        memcpy(hrec->vals[i],str,len);
397
282k
        hrec->vals[i][len] = 0;
398
282k
    }
399
482k
    return 0;
400
482k
}
401
402
int hrec_add_idx(bcf_hrec_t *hrec, int idx)
403
114k
{
404
114k
    int n = hrec->nkeys + 1;
405
114k
    char **tmp = (char**) realloc(hrec->keys, sizeof(char*)*n);
406
114k
    if (!tmp) return -1;
407
114k
    hrec->keys = tmp;
408
409
114k
    tmp = (char**) realloc(hrec->vals, sizeof(char*)*n);
410
114k
    if (!tmp) return -1;
411
114k
    hrec->vals = tmp;
412
413
114k
    hrec->keys[hrec->nkeys] = strdup("IDX");
414
114k
    if (!hrec->keys[hrec->nkeys]) return -1;
415
416
114k
    kstring_t str = {0,0,0};
417
114k
    if (kputw(idx, &str) < 0) {
418
0
        free(hrec->keys[hrec->nkeys]);
419
0
        return -1;
420
0
    }
421
114k
    hrec->vals[hrec->nkeys] = str.s;
422
114k
    hrec->nkeys = n;
423
114k
    return 0;
424
114k
}
425
426
int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key)
427
144k
{
428
144k
    int i;
429
289k
    for (i=0; i<hrec->nkeys; i++)
430
235k
        if ( !strcasecmp(key,hrec->keys[i]) ) return i;
431
53.3k
    return -1;
432
144k
}
433
434
static void bcf_hrec_set_type(bcf_hrec_t *hrec)
435
370k
{
436
370k
    if ( !strcmp(hrec->key, "contig") ) hrec->type = BCF_HL_CTG;
437
342k
    else if ( !strcmp(hrec->key, "INFO") ) hrec->type = BCF_HL_INFO;
438
213k
    else if ( !strcmp(hrec->key, "FILTER") ) hrec->type = BCF_HL_FLT;
439
114k
    else if ( !strcmp(hrec->key, "FORMAT") ) hrec->type = BCF_HL_FMT;
440
94.2k
    else if ( hrec->nkeys>0 ) hrec->type = BCF_HL_STR;
441
46.1k
    else hrec->type = BCF_HL_GEN;
442
370k
}
443
444
445
/**
446
    The arrays were generated with
447
448
    valid_ctg:
449
        perl -le '@v = (split(//,q[!#$%&*+./:;=?@^_|~-]),"a"..."z","A"..."Z","0"..."9"); @a = (0) x 256; foreach $c (@v) { $a[ord($c)] = 1; } print join(", ",@a)' | fold -w 48
450
451
    valid_tag:
452
        perl -le '@v = (split(//,q[_.]),"a"..."z","A"..."Z","0"..."9"); @a = (0) x 256; foreach $c (@v) { $a[ord($c)] = 1; } print join(", ",@a)' | fold -w 48
453
*/
454
static const uint8_t valid_ctg[256] =
455
{
456
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
457
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
458
    0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1,
459
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
460
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
461
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
462
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
463
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,
464
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
465
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
466
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
467
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
468
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
469
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
470
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
471
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
472
};
473
static const uint8_t valid_tag[256] =
474
{
475
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
476
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
477
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
478
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
479
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
480
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
481
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
482
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
483
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
484
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
485
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
486
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
487
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
488
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
489
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
490
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
491
};
492
493
/**
494
    bcf_hrec_check() - check the validity of structured header lines
495
496
    Returns 0 on success or negative value on error.
497
498
    Currently the return status is not checked by the caller
499
    and only a warning is printed on stderr. This should be improved
500
    to propagate the error all the way up to the caller and let it
501
    decide what to do: throw an error or proceed anyway.
502
 */
503
static int bcf_hrec_check(bcf_hrec_t *hrec)
504
185k
{
505
185k
    int i;
506
185k
    bcf_hrec_set_type(hrec);
507
508
185k
    if ( hrec->type==BCF_HL_CTG )
509
13.9k
    {
510
13.9k
        i = bcf_hrec_find_key(hrec,"ID");
511
13.9k
        if ( i<0 ) goto err_missing_id;
512
7.16k
        char *val = hrec->vals[i];
513
7.16k
        if ( val[0]=='*' || val[0]=='=' || !valid_ctg[(uint8_t)val[0]] ) goto err_invalid_ctg;
514
68.7k
        while ( *(++val) )
515
67.8k
            if ( !valid_ctg[(uint8_t)*val] ) goto err_invalid_ctg;
516
930
        return 0;
517
1.74k
    }
518
171k
    if ( hrec->type==BCF_HL_INFO )
519
64.3k
    {
520
64.3k
        i = bcf_hrec_find_key(hrec,"ID");
521
64.3k
        if ( i<0 ) goto err_missing_id;
522
57.7k
        char *val = hrec->vals[i];
523
57.7k
        if ( !strcmp(val,"1000G") ) return 0;
524
57.6k
        if ( val[0]=='.' || (val[0]>='0' && val[0]<='9') || !valid_tag[(uint8_t)val[0]] ) goto err_invalid_tag;
525
19.5k
        while ( *(++val) )
526
17.8k
            if ( !valid_tag[(uint8_t)*val] ) goto err_invalid_tag;
527
1.78k
        return 0;
528
7.12k
    }
529
106k
    if ( hrec->type==BCF_HL_FMT )
530
9.92k
    {
531
9.92k
        i = bcf_hrec_find_key(hrec,"ID");
532
9.92k
        if ( i<0 ) goto err_missing_id;
533
9.51k
        char *val = hrec->vals[i];
534
9.51k
        if ( val[0]=='.' || (val[0]>='0' && val[0]<='9') || !valid_tag[(uint8_t)val[0]] ) goto err_invalid_tag;
535
61.8k
        while ( *(++val) )
536
59.8k
            if ( !valid_tag[(uint8_t)*val] ) goto err_invalid_tag;
537
2.03k
        return 0;
538
3.93k
    }
539
96.8k
    return 0;
540
541
13.8k
  err_missing_id:
542
13.8k
    hts_log_warning("Missing ID attribute in one or more header lines");
543
13.8k
    return -1;
544
545
6.23k
  err_invalid_ctg:
546
6.23k
    hts_log_warning("Invalid contig name: \"%s\"", hrec->vals[i]);
547
6.23k
    return -1;
548
549
63.3k
  err_invalid_tag:
550
63.3k
    hts_log_warning("Invalid tag name: \"%s\"", hrec->vals[i]);
551
63.3k
    return -1;
552
106k
}
553
554
static inline int is_escaped(const char *min, const char *str)
555
185k
{
556
185k
    int n = 0;
557
186k
    while ( --str>=min && *str=='\\' ) n++;
558
185k
    return n%2;
559
185k
}
560
561
bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len)
562
207k
{
563
207k
    bcf_hrec_t *hrec = NULL;
564
207k
    const char *p = line;
565
207k
    if (p[0] != '#' || p[1] != '#') { *len = 0; return NULL; }
566
199k
    p += 2;
567
568
199k
    const char *q = p;
569
1.36M
    while ( *q && *q!='=' && *q != '\n' ) q++;
570
199k
    ptrdiff_t n = q-p;
571
199k
    if ( *q!='=' || !n ) // wrong format
572
4.70k
        goto malformed_line;
573
574
195k
    hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
575
195k
    if (!hrec) { *len = -1; return NULL; }
576
195k
    hrec->key = (char*) malloc(sizeof(char)*(n+1));
577
195k
    if (!hrec->key) goto fail;
578
195k
    memcpy(hrec->key,p,n);
579
195k
    hrec->key[n] = 0;
580
195k
    hrec->type = -1;
581
582
195k
    p = ++q;
583
195k
    if ( *p!='<' ) // generic field, e.g. ##samtoolsVersion=0.1.18-r579
584
24.9k
    {
585
16.4M
        while ( *q && *q!='\n' ) q++;
586
24.9k
        hrec->value = (char*) malloc((q-p+1)*sizeof(char));
587
24.9k
        if (!hrec->value) goto fail;
588
24.9k
        memcpy(hrec->value, p, q-p);
589
24.9k
        hrec->value[q-p] = 0;
590
24.9k
        *len = q - line + (*q ? 1 : 0); // Skip \n but not \0
591
24.9k
        return hrec;
592
24.9k
    }
593
594
    // structured line, e.g.
595
    // ##INFO=<ID=PV1,Number=1,Type=Float,Description="P-value for baseQ bias">
596
    // ##PEDIGREE=<Name_0=G0-ID,Name_1=G1-ID,Name_3=GN-ID>
597
170k
    int nopen = 1;
598
652k
    while ( *q && *q!='\n' && nopen>0 )
599
492k
    {
600
492k
        p = ++q;
601
492k
        while ( *q && *q==' ' ) { p++; q++; }
602
        // ^[A-Za-z_][0-9A-Za-z_.]*$
603
492k
        if (p==q && *q && (isalpha_c(*q) || *q=='_'))
604
488k
        {
605
488k
            q++;
606
2.33M
            while ( *q && (isalnum_c(*q) || *q=='_' || *q=='.') ) q++;
607
488k
        }
608
492k
        n = q-p;
609
492k
        int m = 0;
610
492k
        while ( *q && *q==' ' ) { q++; m++; }
611
492k
        if ( *q!='=' || !n )
612
10.1k
            goto malformed_line;
613
614
482k
        if (bcf_hrec_add_key(hrec, p, q-p-m) < 0) goto fail;
615
482k
        p = ++q;
616
483k
        while ( *q && *q==' ' ) { p++; q++; }
617
618
482k
        int quoted = 0;
619
482k
        char ending = '\0';
620
482k
        switch (*p) {
621
199k
        case '"':
622
199k
            quoted = 1;
623
199k
            ending = '"';
624
199k
            p++;
625
199k
            break;
626
12
        case '[':
627
12
            quoted = 1;
628
12
            ending = ']';
629
12
            break;
630
482k
        }
631
482k
        if ( quoted ) q++;
632
403M
        while ( *q && *q != '\n' )
633
403M
        {
634
403M
            if ( quoted ) { if ( *q==ending && !is_escaped(p,q) ) break; }
635
402M
            else
636
402M
            {
637
402M
                if ( *q=='<' ) nopen++;
638
402M
                if ( *q=='>' ) nopen--;
639
402M
                if ( !nopen ) break;
640
402M
                if ( *q==',' && nopen==1 ) break;
641
402M
            }
642
402M
            q++;
643
402M
        }
644
482k
        const char *r = q;
645
482k
        if (quoted && ending == ']') {
646
12
            if (*q == ending) {
647
3
                r++;
648
3
                q++;
649
3
                quoted = 0;
650
9
            } else {
651
9
                char buffer[320];
652
9
                hts_log_error("Missing ']' in header line %s",
653
9
                              hts_strprint(buffer, sizeof(buffer), '"',
654
9
                                           line, q-line));
655
9
                goto fail;
656
9
            }
657
12
        }
658
482k
        while ( r > p && r[-1] == ' ' ) r--;
659
482k
        if (bcf_hrec_set_val(hrec, hrec->nkeys-1, p, r-p, quoted) < 0)
660
0
            goto fail;
661
482k
        if ( quoted && *q==ending ) q++;
662
482k
        if ( *q=='>' )
663
112k
        {
664
112k
            if (nopen) nopen--;     // this can happen with nested angle brackets <>
665
112k
            q++;
666
112k
        }
667
482k
    }
668
160k
    if ( nopen )
669
48.0k
        hts_log_warning("Incomplete header line, trying to proceed anyway:\n\t[%s]\n\t[%d]",line,q[0]);
670
671
    // Skip to end of line
672
160k
    int nonspace = 0;
673
160k
    p = q;
674
11.0M
    while ( *q && *q!='\n' ) { nonspace |= !isspace_c(*q); q++; }
675
160k
    if (nonspace) {
676
882
        char buffer[320];
677
882
        hts_log_warning("Dropped trailing junk from header line '%s'",
678
882
                        hts_strprint(buffer, sizeof(buffer),
679
882
                                     '"', line, q - line));
680
882
    }
681
682
160k
    *len = q - line + (*q ? 1 : 0);
683
160k
    return hrec;
684
685
9
 fail:
686
9
    *len = -1;
687
9
    bcf_hrec_destroy(hrec);
688
9
    return NULL;
689
690
14.8k
 malformed_line:
691
14.8k
    {
692
14.8k
        char buffer[320];
693
16.8M
        while ( *q && *q!='\n' ) q++;  // Ensure *len includes full line
694
14.8k
        hts_log_error("Could not parse the header line: %s",
695
14.8k
                      hts_strprint(buffer, sizeof(buffer),
696
14.8k
                                   '"', line, q - line));
697
14.8k
        *len = q - line + (*q ? 1 : 0);
698
14.8k
        bcf_hrec_destroy(hrec);
699
14.8k
        return NULL;
700
170k
    }
701
170k
}
702
703
static int bcf_hdr_set_idx(bcf_hdr_t *hdr, int dict_type, const char *tag, bcf_idinfo_t *idinfo)
704
112k
{
705
112k
    size_t new_n;
706
707
    // If available, preserve existing IDX
708
112k
    if ( idinfo->id==-1 )
709
112k
        idinfo->id = hdr->n[dict_type];
710
294
    else if ( idinfo->id < hdr->n[dict_type] && hdr->id[dict_type][idinfo->id].key )
711
6
    {
712
6
        hts_log_error("Conflicting IDX=%d lines in the header dictionary, the new tag is %s",
713
6
            idinfo->id, tag);
714
6
        errno = EINVAL;
715
6
        return -1;
716
6
    }
717
718
112k
    new_n = idinfo->id >= hdr->n[dict_type] ? idinfo->id+1 : hdr->n[dict_type];
719
112k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
720
    // hts_resize() can attempt to allocate up to 2 * requested items
721
112k
    if (new_n > FUZZ_ALLOC_LIMIT/(2 * sizeof(bcf_idpair_t)))
722
14
        return -1;
723
112k
#endif
724
112k
    if (hts_resize(bcf_idpair_t, new_n, &hdr->m[dict_type],
725
112k
                   &hdr->id[dict_type], HTS_RESIZE_CLEAR)) {
726
0
        return -1;
727
0
    }
728
112k
    hdr->n[dict_type] = new_n;
729
730
    // NB: the next kh_put call can invalidate the idinfo pointer, therefore
731
    // we leave it unassigned here. It must be set explicitly in bcf_hdr_sync.
732
112k
    hdr->id[dict_type][idinfo->id].key = tag;
733
734
112k
    return 0;
735
112k
}
736
737
// returns: 1 when hdr needs to be synced, -1 on error, 0 otherwise
738
static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
739
185k
{
740
    // contig
741
185k
    int i, ret, replacing = 0;
742
185k
    khint_t k;
743
185k
    char *str = NULL;
744
745
185k
    bcf_hrec_set_type(hrec);
746
747
185k
    if ( hrec->type==BCF_HL_CTG )
748
13.9k
    {
749
13.9k
        hts_pos_t len = 0;
750
751
        // Get the contig ID ($str) and length ($j)
752
13.9k
        i = bcf_hrec_find_key(hrec,"length");
753
13.9k
        if ( i<0 ) len = 0;
754
3.85k
        else {
755
3.85k
            char *end = hrec->vals[i];
756
3.85k
            len = strtoll(hrec->vals[i], &end, 10);
757
3.85k
            if (end == hrec->vals[i] || len < 0) return 0;
758
3.85k
        }
759
760
11.9k
        i = bcf_hrec_find_key(hrec,"ID");
761
11.9k
        if ( i<0 ) return 0;
762
7.16k
        str = strdup(hrec->vals[i]);
763
7.16k
        if (!str) return -1;
764
765
        // Register in the dictionary
766
7.16k
        vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_CTG];
767
7.16k
        khint_t k = kh_get(vdict, d, str);
768
7.16k
        if ( k != kh_end(d) ) { // already present
769
873
            free(str); str=NULL;
770
873
            if (kh_val(d, k).hrec[0] != NULL) // and not removed
771
873
                return 0;
772
0
            replacing = 1;
773
6.29k
        } else {
774
6.29k
            k = kh_put(vdict, d, str, &ret);
775
6.29k
            if (ret < 0) { free(str); return -1; }
776
6.29k
        }
777
778
6.29k
        int idx = bcf_hrec_find_key(hrec,"IDX");
779
6.29k
        if ( idx!=-1 )
780
1.39k
        {
781
1.39k
            char *tmp = hrec->vals[idx];
782
1.39k
            idx = strtol(hrec->vals[idx], &tmp, 10);
783
1.39k
            if ( *tmp || idx < 0 || idx >= INT_MAX - 1)
784
1.33k
            {
785
1.33k
                if (!replacing) {
786
1.33k
                    kh_del(vdict, d, k);
787
1.33k
                    free(str);
788
1.33k
                }
789
1.33k
                hts_log_warning("Error parsing the IDX tag, skipping");
790
1.33k
                return 0;
791
1.33k
            }
792
1.39k
        }
793
794
4.96k
        kh_val(d, k) = bcf_idinfo_def;
795
4.96k
        kh_val(d, k).id = idx;
796
4.96k
        kh_val(d, k).info[0] = len;
797
4.96k
        kh_val(d, k).hrec[0] = hrec;
798
4.96k
        if (bcf_hdr_set_idx(hdr, BCF_DT_CTG, kh_key(d,k), &kh_val(d,k)) < 0) {
799
15
            if (!replacing) {
800
15
                kh_del(vdict, d, k);
801
15
                free(str);
802
15
            }
803
15
            return -1;
804
15
        }
805
4.94k
        if ( idx==-1 ) {
806
4.89k
            if (hrec_add_idx(hrec, kh_val(d,k).id) < 0) {
807
0
               return -1;
808
0
            }
809
4.89k
        }
810
811
4.94k
        return 1;
812
4.94k
    }
813
814
171k
    if ( hrec->type==BCF_HL_STR ) return 1;
815
147k
    if ( hrec->type!=BCF_HL_INFO && hrec->type!=BCF_HL_FLT && hrec->type!=BCF_HL_FMT ) return 0;
816
817
    // INFO/FILTER/FORMAT
818
124k
    char *id = NULL;
819
124k
    uint32_t type = UINT32_MAX, var = UINT32_MAX;
820
124k
    int num = -1, idx = -1;
821
479k
    for (i=0; i<hrec->nkeys; i++)
822
357k
    {
823
357k
        if ( !strcmp(hrec->keys[i], "ID") ) id = hrec->vals[i];
824
240k
        else if ( !strcmp(hrec->keys[i], "IDX") )
825
2.74k
        {
826
2.74k
            char *tmp = hrec->vals[i];
827
2.74k
            idx = strtol(hrec->vals[i], &tmp, 10);
828
2.74k
            if ( *tmp || idx < 0 || idx >= INT_MAX - 1)
829
1.08k
            {
830
1.08k
                hts_log_warning("Error parsing the IDX tag, skipping");
831
1.08k
                return 0;
832
1.08k
            }
833
2.74k
        }
834
238k
        else if ( !strcmp(hrec->keys[i], "Type") )
835
68.6k
        {
836
68.6k
            if ( !strcmp(hrec->vals[i], "Integer") ) type = BCF_HT_INT;
837
66.9k
            else if ( !strcmp(hrec->vals[i], "Float") ) type = BCF_HT_REAL;
838
61.9k
            else if ( !strcmp(hrec->vals[i], "String") ) type = BCF_HT_STR;
839
4.47k
            else if ( !strcmp(hrec->vals[i], "Character") ) type = BCF_HT_STR;
840
4.28k
            else if ( !strcmp(hrec->vals[i], "Flag") ) type = BCF_HT_FLAG;
841
2.44k
            else
842
2.44k
            {
843
2.44k
                hts_log_warning("The type \"%s\" is not supported, assuming \"String\"", hrec->vals[i]);
844
2.44k
                type = BCF_HT_STR;
845
2.44k
            }
846
68.6k
        }
847
169k
        else if ( !strcmp(hrec->keys[i], "Number") )
848
59.2k
        {
849
59.2k
            if ( !strcmp(hrec->vals[i],"A") ) var = BCF_VL_A;
850
59.0k
            else if ( !strcmp(hrec->vals[i],"R") ) var = BCF_VL_R;
851
58.6k
            else if ( !strcmp(hrec->vals[i],"G") ) var = BCF_VL_G;
852
58.6k
            else if ( !strcmp(hrec->vals[i],".") ) var = BCF_VL_VAR;
853
58.5k
            else
854
58.5k
            {
855
58.5k
                sscanf(hrec->vals[i],"%d",&num);
856
58.5k
                var = BCF_VL_FIXED;
857
58.5k
            }
858
59.2k
            if (var != BCF_VL_FIXED) num = 0xfffff;
859
59.2k
        }
860
357k
    }
861
122k
    if (hrec->type == BCF_HL_INFO || hrec->type == BCF_HL_FMT) {
862
73.5k
        if (type == -1) {
863
4.98k
            hts_log_warning("%s %s field has no Type defined. Assuming String",
864
4.98k
                *hrec->key == 'I' ? "An" : "A", hrec->key);
865
4.98k
            type = BCF_HT_STR;
866
4.98k
        }
867
73.5k
        if (var == -1) {
868
14.3k
            hts_log_warning("%s %s field has no Number defined. Assuming '.'",
869
14.3k
                *hrec->key == 'I' ? "An" : "A", hrec->key);
870
14.3k
            var = BCF_VL_VAR;
871
14.3k
        }
872
73.5k
        if ( type==BCF_HT_FLAG && (var!=BCF_VL_FIXED || num!=0) )
873
1.01k
        {
874
1.01k
            hts_log_warning("The definition of Flag \"%s/%s\" is invalid, forcing Number=0", hrec->key,id);
875
1.01k
            var = BCF_VL_FIXED;
876
1.01k
            num = 0;
877
1.01k
        }
878
73.5k
    }
879
122k
    uint32_t info = ((((uint32_t)num) & 0xfffff)<<12 |
880
122k
                     (var & 0xf) << 8 |
881
122k
                     (type & 0xf) << 4 |
882
122k
                     (((uint32_t) hrec->type) & 0xf));
883
884
122k
    if ( !id ) return 0;
885
115k
    str = strdup(id);
886
115k
    if (!str) return -1;
887
888
115k
    vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_ID];
889
115k
    k = kh_get(vdict, d, str);
890
115k
    if ( k != kh_end(d) )
891
8.04k
    {
892
        // already present
893
8.04k
        free(str);
894
8.04k
        if ( kh_val(d, k).hrec[info&0xf] ) return 0;
895
1.88k
        kh_val(d, k).info[info&0xf] = info;
896
1.88k
        kh_val(d, k).hrec[info&0xf] = hrec;
897
1.88k
        if ( idx==-1 ) {
898
1.81k
            if (hrec_add_idx(hrec, kh_val(d, k).id) < 0) {
899
0
                return -1;
900
0
            }
901
1.81k
        }
902
1.88k
        return 1;
903
1.88k
    }
904
107k
    k = kh_put(vdict, d, str, &ret);
905
107k
    if (ret < 0) {
906
0
        free(str);
907
0
        return -1;
908
0
    }
909
107k
    kh_val(d, k) = bcf_idinfo_def;
910
107k
    kh_val(d, k).info[info&0xf] = info;
911
107k
    kh_val(d, k).hrec[info&0xf] = hrec;
912
107k
    kh_val(d, k).id = idx;
913
107k
    if (bcf_hdr_set_idx(hdr, BCF_DT_ID, kh_key(d,k), &kh_val(d,k)) < 0) {
914
5
        kh_del(vdict, d, k);
915
5
        free(str);
916
5
        return -1;
917
5
    }
918
107k
    if ( idx==-1 ) {
919
107k
        if (hrec_add_idx(hrec, kh_val(d,k).id) < 0) {
920
0
            return -1;
921
0
        }
922
107k
    }
923
924
107k
    return 1;
925
107k
}
926
927
static void bcf_hdr_unregister_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
928
0
{
929
0
    if (hrec->type == BCF_HL_FLT ||
930
0
        hrec->type == BCF_HL_INFO ||
931
0
        hrec->type == BCF_HL_FMT ||
932
0
        hrec->type == BCF_HL_CTG) {
933
0
        int id = bcf_hrec_find_key(hrec, "ID");
934
0
        if (id < 0 || !hrec->vals[id])
935
0
            return;
936
0
        vdict_t *dict = (hrec->type == BCF_HL_CTG
937
0
                         ? (vdict_t*)hdr->dict[BCF_DT_CTG]
938
0
                         : (vdict_t*)hdr->dict[BCF_DT_ID]);
939
0
        khint_t k = kh_get(vdict, dict, hrec->vals[id]);
940
0
        if (k != kh_end(dict))
941
0
            kh_val(dict, k).hrec[hrec->type==BCF_HL_CTG ? 0 : hrec->type] = NULL;
942
0
    }
943
0
}
944
945
static void bcf_hdr_remove_from_hdict(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
946
0
{
947
0
    kstring_t str = KS_INITIALIZE;
948
0
    bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
949
0
    khint_t k;
950
0
    int id;
951
952
0
    switch (hrec->type) {
953
0
    case BCF_HL_GEN:
954
0
        if (ksprintf(&str, "##%s=%s", hrec->key,hrec->value) < 0)
955
0
            str.l = 0;
956
0
        break;
957
0
    case BCF_HL_STR:
958
0
        id = bcf_hrec_find_key(hrec, "ID");
959
0
        if (id < 0)
960
0
            return;
961
0
        if (!hrec->vals[id] ||
962
0
            ksprintf(&str, "##%s=<ID=%s>", hrec->key, hrec->vals[id]) < 0)
963
0
            str.l = 0;
964
0
        break;
965
0
    default:
966
0
        return;
967
0
    }
968
0
    if (str.l) {
969
0
        k = kh_get(hdict, aux->gen, str.s);
970
0
    } else {
971
        // Couldn't get a string for some reason, so try the hard way...
972
0
        for (k = kh_begin(aux->gen); k < kh_end(aux->gen); k++) {
973
0
            if (kh_exist(aux->gen, k) && kh_val(aux->gen, k) == hrec)
974
0
                break;
975
0
        }
976
0
    }
977
0
    if (k != kh_end(aux->gen) && kh_val(aux->gen, k) == hrec) {
978
0
        kh_val(aux->gen, k) = NULL;
979
0
        free((char *) kh_key(aux->gen, k));
980
0
        kh_key(aux->gen, k) = NULL;
981
0
        kh_del(hdict, aux->gen, k);
982
0
    }
983
0
    free(str.s);
984
0
}
985
986
int bcf_hdr_update_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec, const bcf_hrec_t *tmp)
987
0
{
988
    // currently only for bcf_hdr_set_version
989
0
    assert( hrec->type==BCF_HL_GEN );
990
0
    int ret;
991
0
    khint_t k;
992
0
    bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
993
0
    for (k=kh_begin(aux->gen); k<kh_end(aux->gen); k++)
994
0
    {
995
0
        if ( !kh_exist(aux->gen,k) ) continue;
996
0
        if ( hrec!=(bcf_hrec_t*)kh_val(aux->gen,k) ) continue;
997
0
        break;
998
0
    }
999
0
    assert( k<kh_end(aux->gen) );   // something went wrong, should never happen
1000
0
    free((char*)kh_key(aux->gen,k));
1001
0
    kh_del(hdict,aux->gen,k);
1002
0
    kstring_t str = {0,0,0};
1003
0
    if ( ksprintf(&str, "##%s=%s", tmp->key,tmp->value) < 0 )
1004
0
    {
1005
0
        free(str.s);
1006
0
        return -1;
1007
0
    }
1008
0
    k = kh_put(hdict, aux->gen, str.s, &ret);
1009
0
    if ( ret<0 )
1010
0
    {
1011
0
        free(str.s);
1012
0
        return -1;
1013
0
    }
1014
0
    free(hrec->value);
1015
0
    hrec->value = strdup(tmp->value);
1016
0
    if ( !hrec->value ) return -1;
1017
0
    return 0;
1018
0
}
1019
1020
int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
1021
185k
{
1022
185k
    kstring_t str = {0,0,0};
1023
185k
    bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1024
1025
185k
    int res;
1026
185k
    if ( !hrec ) return 0;
1027
1028
185k
    bcf_hrec_check(hrec);   // todo: check return status and propagate errors up
1029
1030
185k
    res = bcf_hdr_register_hrec(hdr,hrec);
1031
185k
    if (res < 0) return -1;
1032
185k
    if ( !res )
1033
46.3k
    {
1034
        // If one of the hashed field, then it is already present
1035
46.3k
        if ( hrec->type != BCF_HL_GEN )
1036
23.2k
        {
1037
23.2k
            bcf_hrec_destroy(hrec);
1038
23.2k
            return 0;
1039
23.2k
        }
1040
1041
        // Is one of the generic fields and already present?
1042
23.0k
        if ( ksprintf(&str, "##%s=%s", hrec->key,hrec->value) < 0 )
1043
0
        {
1044
0
            free(str.s);
1045
0
            return -1;
1046
0
        }
1047
23.0k
        khint_t k = kh_get(hdict, aux->gen, str.s);
1048
23.0k
        if ( k != kh_end(aux->gen) )
1049
11.9k
        {
1050
            // duplicate record
1051
11.9k
            bcf_hrec_destroy(hrec);
1052
11.9k
            free(str.s);
1053
11.9k
            return 0;
1054
11.9k
        }
1055
23.0k
    }
1056
1057
149k
    int i;
1058
149k
    if ( hrec->type==BCF_HL_STR && (i=bcf_hrec_find_key(hrec,"ID"))>=0 )
1059
4.32k
    {
1060
4.32k
        if ( ksprintf(&str, "##%s=<ID=%s>", hrec->key,hrec->vals[i]) < 0 )
1061
0
        {
1062
0
            free(str.s);
1063
0
            return -1;
1064
0
        }
1065
4.32k
        khint_t k = kh_get(hdict, aux->gen, str.s);
1066
4.32k
        if ( k != kh_end(aux->gen) )
1067
3.41k
        {
1068
            // duplicate record
1069
3.41k
            bcf_hrec_destroy(hrec);
1070
3.41k
            free(str.s);
1071
3.41k
            return 0;
1072
3.41k
        }
1073
4.32k
    }
1074
1075
    // New record, needs to be added
1076
146k
    int n = hdr->nhrec + 1;
1077
146k
    bcf_hrec_t **new_hrec = realloc(hdr->hrec, n*sizeof(bcf_hrec_t*));
1078
146k
    if (!new_hrec) {
1079
0
        free(str.s);
1080
0
        bcf_hdr_unregister_hrec(hdr, hrec);
1081
0
        return -1;
1082
0
    }
1083
146k
    hdr->hrec = new_hrec;
1084
1085
146k
    if ( str.s )
1086
12.1k
    {
1087
12.1k
        khint_t k = kh_put(hdict, aux->gen, str.s, &res);
1088
12.1k
        if ( res<0 )
1089
0
        {
1090
0
            free(str.s);
1091
0
            return -1;
1092
0
        }
1093
12.1k
        kh_val(aux->gen,k) = hrec;
1094
12.1k
    }
1095
1096
146k
    hdr->hrec[hdr->nhrec] = hrec;
1097
146k
    hdr->dirty = 1;
1098
146k
    hdr->nhrec = n;
1099
1100
146k
    return hrec->type==BCF_HL_GEN ? 0 : 1;
1101
146k
}
1102
1103
bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class)
1104
0
{
1105
0
    int i;
1106
0
    if ( type==BCF_HL_GEN )
1107
0
    {
1108
        // e.g. ##fileformat=VCFv4.2
1109
        //      ##source=GenomicsDBImport
1110
        //      ##bcftools_viewVersion=1.16-80-gdfdb0923+htslib-1.16-34-g215d364
1111
0
        if ( value )
1112
0
        {
1113
0
            kstring_t str = {0,0,0};
1114
0
            ksprintf(&str, "##%s=%s", key,value);
1115
0
            bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1116
0
            khint_t k = kh_get(hdict, aux->gen, str.s);
1117
0
            free(str.s);
1118
0
            if ( k == kh_end(aux->gen) ) return NULL;
1119
0
            return kh_val(aux->gen, k);
1120
0
        }
1121
0
        for (i=0; i<hdr->nhrec; i++)
1122
0
        {
1123
0
            if ( hdr->hrec[i]->type!=type ) continue;
1124
0
            if ( strcmp(hdr->hrec[i]->key,key) ) continue;
1125
0
            return hdr->hrec[i];
1126
0
        }
1127
0
        return NULL;
1128
0
    }
1129
0
    else if ( type==BCF_HL_STR )
1130
0
    {
1131
        // e.g. ##GATKCommandLine=<ID=GenomicsDBImport,CommandLine="GenomicsDBImport....">
1132
        //      ##ALT=<ID=NON_REF,Description="Represents any possible alternative allele not already represented at this location by REF and ALT">
1133
0
        if (!str_class) return NULL;
1134
0
        if ( !strcmp("ID",key) )
1135
0
        {
1136
0
            kstring_t str = {0,0,0};
1137
0
            ksprintf(&str, "##%s=<%s=%s>",str_class,key,value);
1138
0
            bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1139
0
            khint_t k = kh_get(hdict, aux->gen, str.s);
1140
0
            free(str.s);
1141
0
            if ( k == kh_end(aux->gen) ) return NULL;
1142
0
            return kh_val(aux->gen, k);
1143
0
        }
1144
0
        for (i=0; i<hdr->nhrec; i++)
1145
0
        {
1146
0
            if ( hdr->hrec[i]->type!=type ) continue;
1147
0
            if ( strcmp(hdr->hrec[i]->key,str_class) ) continue;
1148
0
            int j = bcf_hrec_find_key(hdr->hrec[i],key);
1149
0
            if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],value) ) return hdr->hrec[i];
1150
0
        }
1151
0
        return NULL;
1152
0
    }
1153
0
    vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
1154
0
    khint_t k = kh_get(vdict, d, value);
1155
0
    if ( k == kh_end(d) ) return NULL;
1156
0
    return kh_val(d, k).hrec[type==BCF_HL_CTG?0:type];
1157
0
}
1158
1159
void bcf_hdr_check_sanity(bcf_hdr_t *hdr)
1160
4.37k
{
1161
4.37k
    static int PL_warned = 0, GL_warned = 0;
1162
1163
4.37k
    if ( !PL_warned )
1164
4.37k
    {
1165
4.37k
        int id = bcf_hdr_id2int(hdr, BCF_DT_ID, "PL");
1166
4.37k
        if ( bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) && bcf_hdr_id2length(hdr,BCF_HL_FMT,id)!=BCF_VL_G )
1167
0
        {
1168
0
            hts_log_warning("PL should be declared as Number=G");
1169
0
            PL_warned = 1;
1170
0
        }
1171
4.37k
    }
1172
4.37k
    if ( !GL_warned )
1173
4.37k
    {
1174
4.37k
        int id = bcf_hdr_id2int(hdr, BCF_DT_ID, "GL");
1175
4.37k
        if ( bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) && bcf_hdr_id2length(hdr,BCF_HL_FMT,id)!=BCF_VL_G )
1176
0
        {
1177
0
            hts_log_warning("GL should be declared as Number=G");
1178
0
            GL_warned = 1;
1179
0
        }
1180
4.37k
    }
1181
4.37k
}
1182
1183
int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt)
1184
5.17k
{
1185
5.17k
    int len, done = 0;
1186
5.17k
    char *p = htxt;
1187
1188
    // Check sanity: "fileformat" string must come as first
1189
5.17k
    bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,p,&len);
1190
5.17k
    if ( !hrec || !hrec->key || strcasecmp(hrec->key,"fileformat") )
1191
444
        hts_log_warning("The first line should be ##fileformat; is the VCF/BCF header broken?");
1192
5.17k
    if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
1193
0
        bcf_hrec_destroy(hrec);
1194
0
        return -1;
1195
0
    }
1196
1197
    // The filter PASS must appear first in the dictionary
1198
5.17k
    hrec = bcf_hdr_parse_line(hdr,"##FILTER=<ID=PASS,Description=\"All filters passed\">",&len);
1199
5.17k
    if (!hrec || bcf_hdr_add_hrec(hdr, hrec) < 0) {
1200
0
        bcf_hrec_destroy(hrec);
1201
0
        return -1;
1202
0
    }
1203
1204
    // Parse the whole header
1205
21.5k
    do {
1206
90.5k
        while (NULL != (hrec = bcf_hdr_parse_line(hdr, p, &len))) {
1207
69.0k
            if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
1208
18
                bcf_hrec_destroy(hrec);
1209
18
                return -1;
1210
18
            }
1211
69.0k
            p += len;
1212
69.0k
        }
1213
21.5k
        assert(hrec == NULL);
1214
21.5k
        if (len < 0) {
1215
            // len < 0 indicates out-of-memory, or similar error
1216
0
            hts_log_error("Could not parse header line: %s", strerror(errno));
1217
0
            return -1;
1218
21.5k
        } else if (len > 0) {
1219
            // Bad header line.  bcf_hdr_parse_line() will have logged it.
1220
            // Skip and try again on the next line (p + len will be the start
1221
            // of the next one).
1222
14.7k
            p += len;
1223
14.7k
            continue;
1224
14.7k
        }
1225
1226
        // Next should be the sample line.  If not, it was a malformed
1227
        // header, in which case print a warning and skip (many VCF
1228
        // operations do not really care about a few malformed lines).
1229
        // In the future we may want to add a strict mode that errors in
1230
        // this case.
1231
6.80k
        if ( strncmp("#CHROM\t",p,7) && strncmp("#CHROM ",p,7) ) {
1232
2.32k
            char *eol = strchr(p, '\n');
1233
2.32k
            if (*p != '\0') {
1234
1.67k
                char buffer[320];
1235
1.67k
                hts_log_warning("Could not parse header line: %s",
1236
1.67k
                                hts_strprint(buffer, sizeof(buffer),
1237
1.67k
                                               '"', p,
1238
1.67k
                                               eol ? (eol - p) : SIZE_MAX));
1239
1.67k
            }
1240
2.32k
            if (eol) {
1241
1.65k
                p = eol + 1; // Try from the next line.
1242
1.65k
            } else {
1243
676
                done = -1; // No more lines left, give up.
1244
676
            }
1245
4.48k
        } else {
1246
4.48k
            done = 1; // Sample line found
1247
4.48k
        }
1248
21.5k
    } while (!done);
1249
1250
5.15k
    if (done < 0) {
1251
        // No sample line is fatal.
1252
676
        hts_log_error("Could not parse the header, sample line not found");
1253
676
        return -1;
1254
676
    }
1255
1256
4.48k
    if (bcf_hdr_parse_sample_line(hdr,p) < 0)
1257
108
        return -1;
1258
4.37k
    if (bcf_hdr_sync(hdr) < 0)
1259
0
        return -1;
1260
4.37k
    bcf_hdr_check_sanity(hdr);
1261
4.37k
    return 0;
1262
4.37k
}
1263
1264
int bcf_hdr_append(bcf_hdr_t *hdr, const char *line)
1265
0
{
1266
0
    int len;
1267
0
    bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr, (char*) line, &len);
1268
0
    if ( !hrec ) return -1;
1269
0
    if (bcf_hdr_add_hrec(hdr, hrec) < 0)
1270
0
        return -1;
1271
0
    return 0;
1272
0
}
1273
1274
void bcf_hdr_remove(bcf_hdr_t *hdr, int type, const char *key)
1275
0
{
1276
0
    int i = 0;
1277
0
    bcf_hrec_t *hrec;
1278
0
    if ( !key )
1279
0
    {
1280
        // no key, remove all entries of this type
1281
0
        while ( i<hdr->nhrec )
1282
0
        {
1283
0
            if ( hdr->hrec[i]->type!=type ) { i++; continue; }
1284
0
            hrec = hdr->hrec[i];
1285
0
            bcf_hdr_unregister_hrec(hdr, hrec);
1286
0
            bcf_hdr_remove_from_hdict(hdr, hrec);
1287
0
            hdr->dirty = 1;
1288
0
            hdr->nhrec--;
1289
0
            if ( i < hdr->nhrec )
1290
0
                memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
1291
0
            bcf_hrec_destroy(hrec);
1292
0
        }
1293
0
        return;
1294
0
    }
1295
0
    while (1)
1296
0
    {
1297
0
        if ( type==BCF_HL_FLT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG )
1298
0
        {
1299
0
            hrec = bcf_hdr_get_hrec(hdr, type, "ID", key, NULL);
1300
0
            if ( !hrec ) return;
1301
1302
0
            for (i=0; i<hdr->nhrec; i++)
1303
0
                if ( hdr->hrec[i]==hrec ) break;
1304
0
            assert( i<hdr->nhrec );
1305
1306
0
            vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
1307
0
            khint_t k = kh_get(vdict, d, key);
1308
0
            kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL;
1309
0
        }
1310
0
        else
1311
0
        {
1312
0
            for (i=0; i<hdr->nhrec; i++)
1313
0
            {
1314
0
                if ( hdr->hrec[i]->type!=type ) continue;
1315
0
                if ( type==BCF_HL_GEN )
1316
0
                {
1317
0
                    if ( !strcmp(hdr->hrec[i]->key,key) ) break;
1318
0
                }
1319
0
                else
1320
0
                {
1321
                    // not all structured lines have ID, we could be more sophisticated as in bcf_hdr_get_hrec()
1322
0
                    int j = bcf_hrec_find_key(hdr->hrec[i], "ID");
1323
0
                    if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],key) ) break;
1324
0
                }
1325
0
            }
1326
0
            if ( i==hdr->nhrec ) return;
1327
0
            hrec = hdr->hrec[i];
1328
0
            bcf_hdr_remove_from_hdict(hdr, hrec);
1329
0
        }
1330
1331
0
        hdr->nhrec--;
1332
0
        if ( i < hdr->nhrec )
1333
0
            memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
1334
0
        bcf_hrec_destroy(hrec);
1335
0
        hdr->dirty = 1;
1336
0
    }
1337
0
}
1338
1339
int bcf_hdr_printf(bcf_hdr_t *hdr, const char *fmt, ...)
1340
0
{
1341
0
    char tmp[256], *line = tmp;
1342
0
    va_list ap;
1343
0
    va_start(ap, fmt);
1344
0
    int n = vsnprintf(line, sizeof(tmp), fmt, ap);
1345
0
    va_end(ap);
1346
1347
0
    if (n >= sizeof(tmp)) {
1348
0
        n++; // For trailing NUL
1349
0
        line = (char*)malloc(n);
1350
0
        if (!line)
1351
0
            return -1;
1352
1353
0
        va_start(ap, fmt);
1354
0
        vsnprintf(line, n, fmt, ap);
1355
0
        va_end(ap);
1356
0
    }
1357
1358
0
    int ret = bcf_hdr_append(hdr, line);
1359
1360
0
    if (line != tmp) free(line);
1361
0
    return ret;
1362
0
}
1363
1364
1365
/**********************
1366
 *** BCF header I/O ***
1367
 **********************/
1368
1369
const char *bcf_hdr_get_version(const bcf_hdr_t *hdr)
1370
0
{
1371
0
    bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
1372
0
    if ( !hrec )
1373
0
    {
1374
0
        hts_log_warning("No version string found, assuming VCFv4.2");
1375
0
        return "VCFv4.2";
1376
0
    }
1377
0
    return hrec->value;
1378
0
}
1379
1380
int bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version)
1381
0
{
1382
0
    bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
1383
0
    if ( !hrec )
1384
0
    {
1385
0
        int len;
1386
0
        kstring_t str = {0,0,0};
1387
0
        if ( ksprintf(&str,"##fileformat=%s", version) < 0 ) return -1;
1388
0
        hrec = bcf_hdr_parse_line(hdr, str.s, &len);
1389
0
        free(str.s);
1390
0
    }
1391
0
    else
1392
0
    {
1393
0
        bcf_hrec_t *tmp = bcf_hrec_dup(hrec);
1394
0
        if ( !tmp ) return -1;
1395
0
        free(tmp->value);
1396
0
        tmp->value = strdup(version);
1397
0
        if ( !tmp->value ) return -1;
1398
0
        bcf_hdr_update_hrec(hdr, hrec, tmp);
1399
0
        bcf_hrec_destroy(tmp);
1400
0
    }
1401
0
    hdr->dirty = 1;
1402
0
    return 0; // FIXME: check for errs in this function (return < 0 if so)
1403
0
}
1404
1405
bcf_hdr_t *bcf_hdr_init(const char *mode)
1406
5.20k
{
1407
5.20k
    int i;
1408
5.20k
    bcf_hdr_t *h;
1409
5.20k
    h = (bcf_hdr_t*)calloc(1, sizeof(bcf_hdr_t));
1410
5.20k
    if (!h) return NULL;
1411
20.8k
    for (i = 0; i < 3; ++i) {
1412
15.6k
        if ((h->dict[i] = kh_init(vdict)) == NULL) goto fail;
1413
        // Supersize the hash to make collisions very unlikely
1414
15.6k
        static int dsize[3] = {16384,16384,2048}; // info, contig, format
1415
15.6k
        if (kh_resize(vdict, h->dict[i], dsize[i]) < 0) goto fail;
1416
15.6k
    }
1417
1418
5.20k
    bcf_hdr_aux_t *aux = (bcf_hdr_aux_t*)calloc(1,sizeof(bcf_hdr_aux_t));
1419
5.20k
    if ( !aux ) goto fail;
1420
5.20k
    if ( (aux->gen = kh_init(hdict))==NULL ) { free(aux); goto fail; }
1421
5.20k
    aux->key_len = NULL;
1422
5.20k
    aux->dict = *((vdict_t*)h->dict[0]);
1423
5.20k
    free(h->dict[0]);
1424
5.20k
    h->dict[0] = aux;
1425
1426
5.20k
    if ( strchr(mode,'w') )
1427
0
    {
1428
0
        bcf_hdr_append(h, "##fileformat=VCFv4.2");
1429
        // The filter PASS must appear first in the dictionary
1430
0
        bcf_hdr_append(h, "##FILTER=<ID=PASS,Description=\"All filters passed\">");
1431
0
    }
1432
5.20k
    return h;
1433
1434
0
 fail:
1435
0
    for (i = 0; i < 3; ++i)
1436
0
        kh_destroy(vdict, h->dict[i]);
1437
0
    free(h);
1438
0
    return NULL;
1439
5.20k
}
1440
1441
void bcf_hdr_destroy(bcf_hdr_t *h)
1442
5.20k
{
1443
5.20k
    int i;
1444
5.20k
    khint_t k;
1445
5.20k
    if (!h) return;
1446
20.8k
    for (i = 0; i < 3; ++i) {
1447
15.6k
        vdict_t *d = (vdict_t*)h->dict[i];
1448
15.6k
        if (d == 0) continue;
1449
181M
        for (k = kh_begin(d); k != kh_end(d); ++k)
1450
181M
            if (kh_exist(d, k)) free((char*)kh_key(d, k));
1451
15.6k
        if ( i==0 )
1452
5.20k
        {
1453
5.20k
            bcf_hdr_aux_t *aux = get_hdr_aux(h);
1454
33.1k
            for (k=kh_begin(aux->gen); k<kh_end(aux->gen); k++)
1455
27.9k
                if ( kh_exist(aux->gen,k) ) free((char*)kh_key(aux->gen,k));
1456
5.20k
            kh_destroy(hdict, aux->gen);
1457
5.20k
            free(aux->key_len); // may exist for dict[0] only
1458
5.20k
        }
1459
15.6k
        kh_destroy(vdict, d);
1460
15.6k
        free(h->id[i]);
1461
15.6k
    }
1462
151k
    for (i=0; i<h->nhrec; i++)
1463
146k
        bcf_hrec_destroy(h->hrec[i]);
1464
5.20k
    if (h->nhrec) free(h->hrec);
1465
5.20k
    if (h->samples) free(h->samples);
1466
5.20k
    free(h->keep_samples);
1467
5.20k
    free(h->transl[0]); free(h->transl[1]);
1468
5.20k
    free(h->mem.s);
1469
5.20k
    free(h);
1470
5.20k
}
1471
1472
bcf_hdr_t *bcf_hdr_read(htsFile *hfp)
1473
5.20k
{
1474
5.20k
    if (hfp->format.format == vcf)
1475
4.76k
        return vcf_hdr_read(hfp);
1476
448
    if (hfp->format.format != bcf) {
1477
0
        hts_log_error("Input is not detected as bcf or vcf format");
1478
0
        return NULL;
1479
0
    }
1480
1481
448
    assert(hfp->is_bgzf);
1482
1483
448
    BGZF *fp = hfp->fp.bgzf;
1484
448
    uint8_t magic[5];
1485
448
    bcf_hdr_t *h;
1486
448
    h = bcf_hdr_init("r");
1487
448
    if (!h) {
1488
0
        hts_log_error("Failed to allocate bcf header");
1489
0
        return NULL;
1490
0
    }
1491
448
    if (bgzf_read(fp, magic, 5) != 5)
1492
0
    {
1493
0
        hts_log_error("Failed to read the header (reading BCF in text mode?)");
1494
0
        bcf_hdr_destroy(h);
1495
0
        return NULL;
1496
0
    }
1497
448
    if (strncmp((char*)magic, "BCF\2\2", 5) != 0)
1498
0
    {
1499
0
        if (!strncmp((char*)magic, "BCF", 3))
1500
0
            hts_log_error("Invalid BCF2 magic string: only BCFv2.2 is supported");
1501
0
        else
1502
0
            hts_log_error("Invalid BCF2 magic string");
1503
0
        bcf_hdr_destroy(h);
1504
0
        return NULL;
1505
0
    }
1506
448
    uint8_t buf[4];
1507
448
    size_t hlen;
1508
448
    char *htxt = NULL;
1509
448
    if (bgzf_read(fp, buf, 4) != 4) goto fail;
1510
448
    hlen = buf[0] | (buf[1] << 8) | (buf[2] << 16) | ((size_t) buf[3] << 24);
1511
448
    if (hlen >= SIZE_MAX) { errno = ENOMEM; goto fail; }
1512
448
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
1513
448
    if (hlen > FUZZ_ALLOC_LIMIT/2) { errno = ENOMEM; goto fail; }
1514
444
#endif
1515
444
    htxt = (char*)malloc(hlen + 1);
1516
444
    if (!htxt) goto fail;
1517
444
    if (bgzf_read(fp, htxt, hlen) != hlen) goto fail;
1518
444
    htxt[hlen] = '\0'; // Ensure htxt is terminated
1519
444
    if ( bcf_hdr_parse(h, htxt) < 0 ) goto fail;
1520
412
    free(htxt);
1521
412
    return h;
1522
36
 fail:
1523
36
    hts_log_error("Failed to read BCF header");
1524
36
    free(htxt);
1525
36
    bcf_hdr_destroy(h);
1526
36
    return NULL;
1527
444
}
1528
1529
int bcf_hdr_write(htsFile *hfp, bcf_hdr_t *h)
1530
4.37k
{
1531
4.37k
    if (!h) {
1532
0
        errno = EINVAL;
1533
0
        return -1;
1534
0
    }
1535
4.37k
    if ( h->dirty ) {
1536
0
        if (bcf_hdr_sync(h) < 0) return -1;
1537
0
    }
1538
4.37k
    hfp->format.category = variant_data;
1539
4.37k
    if (hfp->format.format == vcf || hfp->format.format == text_format) {
1540
2.18k
        hfp->format.format = vcf;
1541
2.18k
        return vcf_hdr_write(hfp, h);
1542
2.18k
    }
1543
1544
2.18k
    if (hfp->format.format == binary_format)
1545
2.18k
        hfp->format.format = bcf;
1546
1547
2.18k
    kstring_t htxt = {0,0,0};
1548
2.18k
    if (bcf_hdr_format(h, 1, &htxt) < 0) {
1549
0
        free(htxt.s);
1550
0
        return -1;
1551
0
    }
1552
2.18k
    kputc('\0', &htxt); // include the \0 byte
1553
1554
2.18k
    BGZF *fp = hfp->fp.bgzf;
1555
2.18k
    if ( bgzf_write(fp, "BCF\2\2", 5) !=5 ) return -1;
1556
2.18k
    uint8_t hlen[4];
1557
2.18k
    u32_to_le(htxt.l, hlen);
1558
2.18k
    if ( bgzf_write(fp, hlen, 4) !=4 ) return -1;
1559
2.18k
    if ( bgzf_write(fp, htxt.s, htxt.l) != htxt.l ) return -1;
1560
1561
2.18k
    free(htxt.s);
1562
2.18k
    return 0;
1563
2.18k
}
1564
1565
/********************
1566
 *** BCF site I/O ***
1567
 ********************/
1568
1569
bcf1_t *bcf_init()
1570
4.37k
{
1571
4.37k
    bcf1_t *v;
1572
4.37k
    v = (bcf1_t*)calloc(1, sizeof(bcf1_t));
1573
4.37k
    return v;
1574
4.37k
}
1575
1576
void bcf_clear(bcf1_t *v)
1577
23.0k
{
1578
23.0k
    int i;
1579
23.0k
    for (i=0; i<v->d.m_info; i++)
1580
0
    {
1581
0
        if ( v->d.info[i].vptr_free )
1582
0
        {
1583
0
            free(v->d.info[i].vptr - v->d.info[i].vptr_off);
1584
0
            v->d.info[i].vptr_free = 0;
1585
0
        }
1586
0
    }
1587
23.0k
    for (i=0; i<v->d.m_fmt; i++)
1588
0
    {
1589
0
        if ( v->d.fmt[i].p_free )
1590
0
        {
1591
0
            free(v->d.fmt[i].p - v->d.fmt[i].p_off);
1592
0
            v->d.fmt[i].p_free = 0;
1593
0
        }
1594
0
    }
1595
23.0k
    v->rid = v->pos = v->rlen = v->unpacked = 0;
1596
23.0k
    bcf_float_set_missing(v->qual);
1597
23.0k
    v->n_info = v->n_allele = v->n_fmt = v->n_sample = 0;
1598
23.0k
    v->shared.l = v->indiv.l = 0;
1599
23.0k
    v->d.var_type = -1;
1600
23.0k
    v->d.shared_dirty = 0;
1601
23.0k
    v->d.indiv_dirty  = 0;
1602
23.0k
    v->d.n_flt = 0;
1603
23.0k
    v->errcode = 0;
1604
23.0k
    if (v->d.m_als) v->d.als[0] = 0;
1605
23.0k
    if (v->d.m_id) v->d.id[0] = 0;
1606
23.0k
}
1607
1608
void bcf_empty(bcf1_t *v)
1609
4.37k
{
1610
4.37k
    bcf_clear1(v);
1611
4.37k
    free(v->d.id);
1612
4.37k
    free(v->d.als);
1613
4.37k
    free(v->d.allele); free(v->d.flt); free(v->d.info); free(v->d.fmt);
1614
4.37k
    if (v->d.var ) free(v->d.var);
1615
4.37k
    free(v->shared.s); free(v->indiv.s);
1616
4.37k
    memset(&v->d,0,sizeof(v->d));
1617
4.37k
    memset(&v->shared,0,sizeof(v->shared));
1618
4.37k
    memset(&v->indiv,0,sizeof(v->indiv));
1619
4.37k
}
1620
1621
void bcf_destroy(bcf1_t *v)
1622
4.37k
{
1623
4.37k
    if (!v) return;
1624
4.37k
    bcf_empty1(v);
1625
4.37k
    free(v);
1626
4.37k
}
1627
1628
static inline int bcf_read1_core(BGZF *fp, bcf1_t *v)
1629
1.10k
{
1630
1.10k
    uint8_t x[32];
1631
1.10k
    ssize_t ret;
1632
1.10k
    uint32_t shared_len, indiv_len;
1633
1.10k
    if ((ret = bgzf_read(fp, x, 32)) != 32) {
1634
14
        if (ret == 0) return -1;
1635
14
        return -2;
1636
14
    }
1637
1.08k
    bcf_clear1(v);
1638
1.08k
    shared_len = le_to_u32(x);
1639
1.08k
    if (shared_len < 24) return -2;
1640
1.08k
    shared_len -= 24; // to exclude six 32-bit integers
1641
1.08k
    indiv_len = le_to_u32(x + 4);
1642
1.08k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
1643
    // ks_resize() normally allocates 1.5 * requested size to allow for growth
1644
1.08k
    if ((uint64_t) shared_len + indiv_len > FUZZ_ALLOC_LIMIT / 3 * 2) return -2;
1645
1.06k
#endif
1646
1.06k
    if (ks_resize(&v->shared, shared_len ? shared_len : 1) != 0) return -2;
1647
1.06k
    if (ks_resize(&v->indiv, indiv_len ? indiv_len : 1) != 0) return -2;
1648
1.06k
    v->rid  = le_to_i32(x + 8);
1649
1.06k
    v->pos  = le_to_u32(x + 12);
1650
1.06k
    if ( v->pos==UINT32_MAX ) v->pos = -1;  // this is for telomere coordinate, e.g. MT:0
1651
1.06k
    v->rlen = le_to_i32(x + 16);
1652
1.06k
    v->qual = le_to_float(x + 20);
1653
1.06k
    v->n_info = le_to_u16(x + 24);
1654
1.06k
    v->n_allele = le_to_u16(x + 26);
1655
1.06k
    v->n_sample = le_to_u32(x + 28) & 0xffffff;
1656
1.06k
    v->n_fmt = x[31];
1657
1.06k
    v->shared.l = shared_len;
1658
1.06k
    v->indiv.l = indiv_len;
1659
    // silent fix of broken BCFs produced by earlier versions of bcf_subset, prior to and including bd6ed8b4
1660
1.06k
    if ( (!v->indiv.l || !v->n_sample) && v->n_fmt ) v->n_fmt = 0;
1661
1662
1.06k
    if (bgzf_read(fp, v->shared.s, v->shared.l) != v->shared.l) return -2;
1663
1.01k
    if (bgzf_read(fp, v->indiv.s, v->indiv.l) != v->indiv.l) return -2;
1664
1.00k
    return 0;
1665
1.01k
}
1666
1667
0
#define bit_array_size(n) ((n)/8+1)
1668
0
#define bit_array_set(a,i)   ((a)[(i)/8] |=   1 << ((i)%8))
1669
0
#define bit_array_clear(a,i) ((a)[(i)/8] &= ~(1 << ((i)%8)))
1670
0
#define bit_array_test(a,i)  ((a)[(i)/8] &   (1 << ((i)%8)))
1671
1672
static int bcf_dec_typed_int1_safe(uint8_t *p, uint8_t *end, uint8_t **q,
1673
4.86k
                                   int32_t *val) {
1674
4.86k
    uint32_t t;
1675
4.86k
    if (end - p < 2) return -1;
1676
4.85k
    t = *p++ & 0xf;
1677
    /* Use if .. else if ... else instead of switch to force order.  Assumption
1678
       is that small integers are more frequent than big ones. */
1679
4.85k
    if (t == BCF_BT_INT8) {
1680
2.52k
        *val = *(int8_t *) p++;
1681
2.52k
    } else {
1682
2.33k
        if (end - p < (1<<bcf_type_shift[t])) return -1;
1683
2.33k
        if (t == BCF_BT_INT16) {
1684
954
            *val = le_to_i16(p);
1685
954
            p += 2;
1686
1.37k
        } else if (t == BCF_BT_INT32) {
1687
1.22k
            *val = le_to_i32(p);
1688
1.22k
            p += 4;
1689
#ifdef VCF_ALLOW_INT64
1690
        } else if (t == BCF_BT_INT64) {
1691
            // This case should never happen because there should be no
1692
            // 64-bit BCFs at all, definitely not coming from htslib
1693
            *val = le_to_i64(p);
1694
            p += 8;
1695
#endif
1696
1.22k
        } else {
1697
154
            return -1;
1698
154
        }
1699
2.33k
    }
1700
4.69k
    *q = p;
1701
4.69k
    return 0;
1702
4.85k
}
1703
1704
static int bcf_dec_size_safe(uint8_t *p, uint8_t *end, uint8_t **q,
1705
12.0k
                             int *num, int *type) {
1706
12.0k
    int r;
1707
12.0k
    if (p >= end) return -1;
1708
12.0k
    *type = *p & 0xf;
1709
12.0k
    if (*p>>4 != 15) {
1710
11.6k
        *q = p + 1;
1711
11.6k
        *num = *p >> 4;
1712
11.6k
        return 0;
1713
11.6k
    }
1714
400
    r = bcf_dec_typed_int1_safe(p + 1, end, q, num);
1715
400
    if (r) return r;
1716
338
    return *num >= 0 ? 0 : -1;
1717
400
}
1718
1719
446
static const char *get_type_name(int type) {
1720
446
    const char *types[9] = {
1721
446
        "null", "int (8-bit)", "int (16 bit)", "int (32 bit)",
1722
446
        "unknown", "float", "unknown", "char", "unknown"
1723
446
    };
1724
446
    int t = (type >= 0 && type < 8) ? type : 8;
1725
446
    return types[t];
1726
446
}
1727
1728
static void bcf_record_check_err(const bcf_hdr_t *hdr, bcf1_t *rec,
1729
898
                                 char *type, uint32_t *reports, int i) {
1730
898
    if (*reports == 0 || hts_verbose >= HTS_LOG_DEBUG)
1731
78
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos
1732
898
                        ": Invalid FORMAT %s %d",
1733
898
                        bcf_seqname_safe(hdr,rec), rec->pos+1, type, i);
1734
898
    (*reports)++;
1735
898
}
1736
1737
1.00k
static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) {
1738
1.00k
    uint8_t *ptr, *end;
1739
1.00k
    size_t bytes;
1740
1.00k
    uint32_t err = 0;
1741
1.00k
    int type = 0;
1742
1.00k
    int num  = 0;
1743
1.00k
    int reflen = 0;
1744
1.00k
    uint32_t i, reports;
1745
1.00k
    const uint32_t is_integer = ((1 << BCF_BT_INT8)  |
1746
1.00k
                                 (1 << BCF_BT_INT16) |
1747
#ifdef VCF_ALLOW_INT64
1748
                                 (1 << BCF_BT_INT64) |
1749
#endif
1750
1.00k
                                 (1 << BCF_BT_INT32));
1751
1.00k
    const uint32_t is_valid_type = (is_integer          |
1752
1.00k
                                    (1 << BCF_BT_NULL)  |
1753
1.00k
                                    (1 << BCF_BT_FLOAT) |
1754
1.00k
                                    (1 << BCF_BT_CHAR));
1755
1.00k
    int32_t max_id = hdr ? hdr->n[BCF_DT_ID] : 0;
1756
1757
    // Check for valid contig ID
1758
1.00k
    if (rec->rid < 0
1759
1.00k
        || (hdr && (rec->rid >= hdr->n[BCF_DT_CTG]
1760
930
                    || hdr->id[BCF_DT_CTG][rec->rid].key == NULL))) {
1761
282
        hts_log_warning("Bad BCF record at %"PRIhts_pos": Invalid %s id %d", rec->pos+1, "CONTIG", rec->rid);
1762
282
        err |= BCF_ERR_CTG_INVALID;
1763
282
    }
1764
1765
    // Check ID
1766
1.00k
    ptr = (uint8_t *) rec->shared.s;
1767
1.00k
    end = ptr + rec->shared.l;
1768
1.00k
    if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
1769
996
    if (type != BCF_BT_CHAR) {
1770
278
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "ID", type, get_type_name(type));
1771
278
        err |= BCF_ERR_TAG_INVALID;
1772
278
    }
1773
996
    bytes = (size_t) num << bcf_type_shift[type];
1774
996
    if (end - ptr < bytes) goto bad_shared;
1775
988
    ptr += bytes;
1776
1777
    // Check REF and ALT
1778
988
    if (rec->n_allele < 1) {
1779
158
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": No REF allele",
1780
158
                        bcf_seqname_safe(hdr,rec), rec->pos+1);
1781
158
        err |= BCF_ERR_TAG_UNDEF;
1782
158
    }
1783
1784
988
    reports = 0;
1785
6.66k
    for (i = 0; i < rec->n_allele; i++) {
1786
5.72k
        if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
1787
5.69k
        if (type != BCF_BT_CHAR) {
1788
4.51k
            if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
1789
112
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "REF/ALT", type, get_type_name(type));
1790
4.51k
            err |= BCF_ERR_CHAR;
1791
4.51k
        }
1792
5.69k
        if (i == 0) reflen = num;
1793
5.69k
        bytes = (size_t) num << bcf_type_shift[type];
1794
5.69k
        if (end - ptr < bytes) goto bad_shared;
1795
5.67k
        ptr += bytes;
1796
5.67k
    }
1797
1798
    // Check FILTER
1799
936
    reports = 0;
1800
936
    if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
1801
934
    if (num > 0) {
1802
134
        bytes = (size_t) num << bcf_type_shift[type];
1803
134
        if (((1 << type) & is_integer) == 0) {
1804
44
            hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "FILTER", type, get_type_name(type));
1805
44
            err |= BCF_ERR_TAG_INVALID;
1806
44
            if (end - ptr < bytes) goto bad_shared;
1807
42
            ptr += bytes;
1808
90
        } else {
1809
90
            if (end - ptr < bytes) goto bad_shared;
1810
5.62k
            for (i = 0; i < num; i++) {
1811
5.53k
                int32_t key = bcf_dec_int1(ptr, type, &ptr);
1812
5.53k
                if (key < 0
1813
5.53k
                    || (hdr && (key >= max_id
1814
4.49k
                                || hdr->id[BCF_DT_ID][key].key == NULL))) {
1815
4.49k
                    if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
1816
88
                        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s id %d", bcf_seqname_safe(hdr,rec), rec->pos+1, "FILTER", key);
1817
4.49k
                    err |= BCF_ERR_TAG_UNDEF;
1818
4.49k
                }
1819
5.53k
            }
1820
88
        }
1821
134
    }
1822
1823
    // Check INFO
1824
930
    reports = 0;
1825
930
    bcf_idpair_t *id_tmp = hdr ? hdr->id[BCF_DT_ID] : NULL;
1826
4.33k
    for (i = 0; i < rec->n_info; i++) {
1827
3.51k
        int32_t key = -1;
1828
3.51k
        if (bcf_dec_typed_int1_safe(ptr, end, &ptr, &key) != 0) goto bad_shared;
1829
3.44k
        if (key < 0 || (hdr && (key >= max_id
1830
3.04k
                                || id_tmp[key].key == NULL))) {
1831
3.04k
            if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
1832
118
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s id %d", bcf_seqname_safe(hdr,rec), rec->pos+1, "INFO", key);
1833
3.04k
            err |= BCF_ERR_TAG_UNDEF;
1834
3.04k
        }
1835
3.44k
        if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
1836
3.42k
        if (((1 << type) & is_valid_type) == 0
1837
3.42k
            || (type == BCF_BT_NULL && num > 0)) {
1838
354
            if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
1839
12
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "INFO", type, get_type_name(type));
1840
354
            err |= BCF_ERR_TAG_INVALID;
1841
354
        }
1842
3.42k
        bytes = (size_t) num << bcf_type_shift[type];
1843
3.42k
        if (end - ptr < bytes) goto bad_shared;
1844
3.40k
        ptr += bytes;
1845
3.40k
    }
1846
1847
    // Check FORMAT and individual information
1848
818
    ptr = (uint8_t *) rec->indiv.s;
1849
818
    end = ptr + rec->indiv.l;
1850
818
    reports = 0;
1851
1.68k
    for (i = 0; i < rec->n_fmt; i++) {
1852
950
        int32_t key = -1;
1853
950
        if (bcf_dec_typed_int1_safe(ptr, end, &ptr, &key) != 0) goto bad_indiv;
1854
916
        if (key < 0
1855
916
            || (hdr && (key >= max_id
1856
804
                        || id_tmp[key].key == NULL))) {
1857
786
            bcf_record_check_err(hdr, rec, "id", &reports, key);
1858
786
            err |= BCF_ERR_TAG_UNDEF;
1859
786
        }
1860
916
        if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_indiv;
1861
900
        if (((1 << type) & is_valid_type) == 0
1862
900
            || (type == BCF_BT_NULL && num > 0)) {
1863
112
            bcf_record_check_err(hdr, rec, "type", &reports, type);
1864
112
            err |= BCF_ERR_TAG_INVALID;
1865
112
        }
1866
900
        bytes = ((size_t) num << bcf_type_shift[type]) * rec->n_sample;
1867
900
        if (end - ptr < bytes) goto bad_indiv;
1868
862
        ptr += bytes;
1869
862
    }
1870
1871
730
    if (!err && rec->rlen < 0) {
1872
        // Treat bad rlen as a warning instead of an error, and try to
1873
        // fix up by using the length of the stored REF allele.
1874
456
        static int warned = 0;
1875
456
        if (!warned) {
1876
1
            hts_log_warning("BCF record at %s:%"PRIhts_pos" has invalid RLEN (%"PRIhts_pos"). "
1877
1
                            "Only one invalid RLEN will be reported.",
1878
1
                            bcf_seqname_safe(hdr,rec), rec->pos+1, rec->rlen);
1879
1
            warned = 1;
1880
1
        }
1881
456
        rec->rlen = reflen >= 0 ? reflen : 0;
1882
456
    }
1883
1884
730
    rec->errcode |= err;
1885
1886
730
    return err ? -2 : 0; // Return -2 so bcf_read() reports an error
1887
1888
190
 bad_shared:
1889
190
    hts_log_error("Bad BCF record at %s:%"PRIhts_pos" - shared section malformed or too short", bcf_seqname_safe(hdr,rec), rec->pos+1);
1890
190
    return -2;
1891
1892
88
 bad_indiv:
1893
88
    hts_log_error("Bad BCF record at %s:%"PRIhts_pos" - individuals section malformed or too short", bcf_seqname_safe(hdr,rec), rec->pos+1);
1894
88
    return -2;
1895
818
}
1896
1897
static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt);
1898
int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec)
1899
0
{
1900
0
    if ( !hdr->keep_samples ) return 0;
1901
0
    if ( !bcf_hdr_nsamples(hdr) )
1902
0
    {
1903
0
        rec->indiv.l = rec->n_sample = 0;
1904
0
        return 0;
1905
0
    }
1906
1907
0
    int i, j;
1908
0
    uint8_t *ptr = (uint8_t*)rec->indiv.s, *dst = NULL, *src;
1909
0
    bcf_dec_t *dec = &rec->d;
1910
0
    hts_expand(bcf_fmt_t, rec->n_fmt, dec->m_fmt, dec->fmt);
1911
0
    for (i=0; i<dec->m_fmt; ++i) dec->fmt[i].p_free = 0;
1912
1913
0
    for (i=0; i<rec->n_fmt; i++)
1914
0
    {
1915
0
        ptr = bcf_unpack_fmt_core1(ptr, rec->n_sample, &dec->fmt[i]);
1916
0
        src = dec->fmt[i].p - dec->fmt[i].size;
1917
0
        if ( dst )
1918
0
        {
1919
0
            memmove(dec->fmt[i-1].p + dec->fmt[i-1].p_len, dec->fmt[i].p - dec->fmt[i].p_off, dec->fmt[i].p_off);
1920
0
            dec->fmt[i].p = dec->fmt[i-1].p + dec->fmt[i-1].p_len + dec->fmt[i].p_off;
1921
0
        }
1922
0
        dst = dec->fmt[i].p;
1923
0
        for (j=0; j<hdr->nsamples_ori; j++)
1924
0
        {
1925
0
            src += dec->fmt[i].size;
1926
0
            if ( !bit_array_test(hdr->keep_samples,j) ) continue;
1927
0
            memmove(dst, src, dec->fmt[i].size);
1928
0
            dst += dec->fmt[i].size;
1929
0
        }
1930
0
        rec->indiv.l -= dec->fmt[i].p_len - (dst - dec->fmt[i].p);
1931
0
        dec->fmt[i].p_len = dst - dec->fmt[i].p;
1932
0
    }
1933
0
    rec->unpacked |= BCF_UN_FMT;
1934
1935
0
    rec->n_sample = bcf_hdr_nsamples(hdr);
1936
0
    return 0;
1937
0
}
1938
1939
int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
1940
19.6k
{
1941
19.6k
    if (fp->format.format == vcf) return vcf_read(fp,h,v);
1942
1.10k
    int ret = bcf_read1_core(fp->fp.bgzf, v);
1943
1.10k
    if (ret == 0) ret = bcf_record_check(h, v);
1944
1.10k
    if ( ret!=0 || !h->keep_samples ) return ret;
1945
0
    return bcf_subset_format(h,v);
1946
1.10k
}
1947
1948
int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1949
0
{
1950
0
    bcf1_t *v = (bcf1_t *) vv;
1951
0
    int ret = bcf_read1_core(fp, v);
1952
0
    if (ret == 0) ret = bcf_record_check(NULL, v);
1953
0
    if (ret  >= 0)
1954
0
        *tid = v->rid, *beg = v->pos, *end = v->pos + v->rlen;
1955
0
    return ret;
1956
0
}
1957
1958
static inline int bcf1_sync_id(bcf1_t *line, kstring_t *str)
1959
0
{
1960
    // single typed string
1961
0
    if ( line->d.id && strcmp(line->d.id, ".") ) {
1962
0
        return bcf_enc_vchar(str, strlen(line->d.id), line->d.id);
1963
0
    } else {
1964
0
        return bcf_enc_size(str, 0, BCF_BT_CHAR);
1965
0
    }
1966
0
}
1967
static inline int bcf1_sync_alleles(bcf1_t *line, kstring_t *str)
1968
0
{
1969
    // list of typed strings
1970
0
    int i;
1971
0
    for (i=0; i<line->n_allele; i++) {
1972
0
        if (bcf_enc_vchar(str, strlen(line->d.allele[i]), line->d.allele[i]) < 0)
1973
0
            return -1;
1974
0
    }
1975
0
    if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
1976
0
    return 0;
1977
0
}
1978
static inline int bcf1_sync_filter(bcf1_t *line, kstring_t *str)
1979
0
{
1980
    // typed vector of integers
1981
0
    if ( line->d.n_flt ) {
1982
0
        return bcf_enc_vint(str, line->d.n_flt, line->d.flt, -1);
1983
0
    } else {
1984
0
        return bcf_enc_vint(str, 0, 0, -1);
1985
0
    }
1986
0
}
1987
1988
static inline int bcf1_sync_info(bcf1_t *line, kstring_t *str)
1989
0
{
1990
    // pairs of typed vectors
1991
0
    int i, irm = -1, e = 0;
1992
0
    for (i=0; i<line->n_info; i++)
1993
0
    {
1994
0
        bcf_info_t *info = &line->d.info[i];
1995
0
        if ( !info->vptr )
1996
0
        {
1997
            // marked for removal
1998
0
            if ( irm < 0 ) irm = i;
1999
0
            continue;
2000
0
        }
2001
0
        e |= kputsn_(info->vptr - info->vptr_off, info->vptr_len + info->vptr_off, str) < 0;
2002
0
        if ( irm >=0 )
2003
0
        {
2004
0
            bcf_info_t tmp = line->d.info[irm]; line->d.info[irm] = line->d.info[i]; line->d.info[i] = tmp;
2005
0
            while ( irm<=i && line->d.info[irm].vptr ) irm++;
2006
0
        }
2007
0
    }
2008
0
    if ( irm>=0 ) line->n_info = irm;
2009
0
    return e == 0 ? 0 : -1;
2010
0
}
2011
2012
static int bcf1_sync(bcf1_t *line)
2013
345
{
2014
345
    char *shared_ori = line->shared.s;
2015
345
    size_t prev_len;
2016
2017
345
    kstring_t tmp = {0,0,0};
2018
345
    if ( !line->shared.l )
2019
0
    {
2020
        // New line created via API, BCF data blocks do not exist. Get it ready for BCF output
2021
0
        tmp = line->shared;
2022
0
        bcf1_sync_id(line, &tmp);
2023
0
        line->unpack_size[0] = tmp.l; prev_len = tmp.l;
2024
2025
0
        bcf1_sync_alleles(line, &tmp);
2026
0
        line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
2027
2028
0
        bcf1_sync_filter(line, &tmp);
2029
0
        line->unpack_size[2] = tmp.l - prev_len;
2030
2031
0
        bcf1_sync_info(line, &tmp);
2032
0
        line->shared = tmp;
2033
0
    }
2034
345
    else if ( line->d.shared_dirty )
2035
0
    {
2036
        // The line was edited, update the BCF data block.
2037
2038
0
        if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line,BCF_UN_STR);
2039
2040
        // ptr_ori points to the original unchanged BCF data.
2041
0
        uint8_t *ptr_ori = (uint8_t *) line->shared.s;
2042
2043
        // ID: single typed string
2044
0
        if ( line->d.shared_dirty & BCF1_DIRTY_ID )
2045
0
            bcf1_sync_id(line, &tmp);
2046
0
        else
2047
0
            kputsn_(ptr_ori, line->unpack_size[0], &tmp);
2048
0
        ptr_ori += line->unpack_size[0];
2049
0
        line->unpack_size[0] = tmp.l; prev_len = tmp.l;
2050
2051
        // REF+ALT: list of typed strings
2052
0
        if ( line->d.shared_dirty & BCF1_DIRTY_ALS )
2053
0
            bcf1_sync_alleles(line, &tmp);
2054
0
        else
2055
0
        {
2056
0
            kputsn_(ptr_ori, line->unpack_size[1], &tmp);
2057
0
            if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
2058
0
        }
2059
0
        ptr_ori += line->unpack_size[1];
2060
0
        line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
2061
2062
0
        if ( line->unpacked & BCF_UN_FLT )
2063
0
        {
2064
            // FILTER: typed vector of integers
2065
0
            if ( line->d.shared_dirty & BCF1_DIRTY_FLT )
2066
0
                bcf1_sync_filter(line, &tmp);
2067
0
            else if ( line->d.n_flt )
2068
0
                kputsn_(ptr_ori, line->unpack_size[2], &tmp);
2069
0
            else
2070
0
                bcf_enc_vint(&tmp, 0, 0, -1);
2071
0
            ptr_ori += line->unpack_size[2];
2072
0
            line->unpack_size[2] = tmp.l - prev_len;
2073
2074
0
            if ( line->unpacked & BCF_UN_INFO )
2075
0
            {
2076
                // INFO: pairs of typed vectors
2077
0
                if ( line->d.shared_dirty & BCF1_DIRTY_INF )
2078
0
                {
2079
0
                    bcf1_sync_info(line, &tmp);
2080
0
                    ptr_ori = (uint8_t*)line->shared.s + line->shared.l;
2081
0
                }
2082
0
            }
2083
0
        }
2084
2085
0
        int size = line->shared.l - (size_t)ptr_ori + (size_t)line->shared.s;
2086
0
        if ( size ) kputsn_(ptr_ori, size, &tmp);
2087
2088
0
        free(line->shared.s);
2089
0
        line->shared = tmp;
2090
0
    }
2091
345
    if ( line->shared.s != shared_ori && line->unpacked & BCF_UN_INFO )
2092
0
    {
2093
        // Reallocated line->shared.s block invalidated line->d.info[].vptr pointers
2094
0
        size_t off_new = line->unpack_size[0] + line->unpack_size[1] + line->unpack_size[2];
2095
0
        int i;
2096
0
        for (i=0; i<line->n_info; i++)
2097
0
        {
2098
0
            uint8_t *vptr_free = line->d.info[i].vptr_free ? line->d.info[i].vptr - line->d.info[i].vptr_off : NULL;
2099
0
            line->d.info[i].vptr = (uint8_t*) line->shared.s + off_new + line->d.info[i].vptr_off;
2100
0
            off_new += line->d.info[i].vptr_len + line->d.info[i].vptr_off;
2101
0
            if ( vptr_free )
2102
0
            {
2103
0
                free(vptr_free);
2104
0
                line->d.info[i].vptr_free = 0;
2105
0
            }
2106
0
        }
2107
0
    }
2108
2109
345
    if ( line->n_sample && line->n_fmt && (!line->indiv.l || line->d.indiv_dirty) )
2110
0
    {
2111
        // The genotype fields changed or are not present
2112
0
        tmp.l = tmp.m = 0; tmp.s = NULL;
2113
0
        int i, irm = -1;
2114
0
        for (i=0; i<line->n_fmt; i++)
2115
0
        {
2116
0
            bcf_fmt_t *fmt = &line->d.fmt[i];
2117
0
            if ( !fmt->p )
2118
0
            {
2119
                // marked for removal
2120
0
                if ( irm < 0 ) irm = i;
2121
0
                continue;
2122
0
            }
2123
0
            kputsn_(fmt->p - fmt->p_off, fmt->p_len + fmt->p_off, &tmp);
2124
0
            if ( irm >=0 )
2125
0
            {
2126
0
                bcf_fmt_t tfmt = line->d.fmt[irm]; line->d.fmt[irm] = line->d.fmt[i]; line->d.fmt[i] = tfmt;
2127
0
                while ( irm<=i && line->d.fmt[irm].p ) irm++;
2128
0
            }
2129
2130
0
        }
2131
0
        if ( irm>=0 ) line->n_fmt = irm;
2132
0
        free(line->indiv.s);
2133
0
        line->indiv = tmp;
2134
2135
        // Reallocated line->indiv.s block invalidated line->d.fmt[].p pointers
2136
0
        size_t off_new = 0;
2137
0
        for (i=0; i<line->n_fmt; i++)
2138
0
        {
2139
0
            uint8_t *p_free = line->d.fmt[i].p_free ? line->d.fmt[i].p - line->d.fmt[i].p_off : NULL;
2140
0
            line->d.fmt[i].p = (uint8_t*) line->indiv.s + off_new + line->d.fmt[i].p_off;
2141
0
            off_new += line->d.fmt[i].p_len + line->d.fmt[i].p_off;
2142
0
            if ( p_free )
2143
0
            {
2144
0
                free(p_free);
2145
0
                line->d.fmt[i].p_free = 0;
2146
0
            }
2147
0
        }
2148
0
    }
2149
345
    if ( !line->n_sample ) line->n_fmt = 0;
2150
345
    line->d.shared_dirty = line->d.indiv_dirty = 0;
2151
345
    return 0;
2152
345
}
2153
2154
bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src)
2155
0
{
2156
0
    bcf1_sync(src);
2157
2158
0
    bcf_clear(dst);
2159
0
    dst->rid  = src->rid;
2160
0
    dst->pos  = src->pos;
2161
0
    dst->rlen = src->rlen;
2162
0
    dst->qual = src->qual;
2163
0
    dst->n_info = src->n_info; dst->n_allele = src->n_allele;
2164
0
    dst->n_fmt = src->n_fmt; dst->n_sample = src->n_sample;
2165
2166
0
    if ( dst->shared.m < src->shared.l )
2167
0
    {
2168
0
        dst->shared.s = (char*) realloc(dst->shared.s, src->shared.l);
2169
0
        dst->shared.m = src->shared.l;
2170
0
    }
2171
0
    dst->shared.l = src->shared.l;
2172
0
    memcpy(dst->shared.s,src->shared.s,dst->shared.l);
2173
2174
0
    if ( dst->indiv.m < src->indiv.l )
2175
0
    {
2176
0
        dst->indiv.s = (char*) realloc(dst->indiv.s, src->indiv.l);
2177
0
        dst->indiv.m = src->indiv.l;
2178
0
    }
2179
0
    dst->indiv.l = src->indiv.l;
2180
0
    memcpy(dst->indiv.s,src->indiv.s,dst->indiv.l);
2181
2182
0
    return dst;
2183
0
}
2184
bcf1_t *bcf_dup(bcf1_t *src)
2185
0
{
2186
0
    bcf1_t *out = bcf_init1();
2187
0
    return bcf_copy(out, src);
2188
0
}
2189
2190
int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v)
2191
17.0k
{
2192
17.0k
    if ( h->dirty ) {
2193
0
        if (bcf_hdr_sync(h) < 0) return -1;
2194
0
    }
2195
17.0k
    if ( bcf_hdr_nsamples(h)!=v->n_sample )
2196
54
    {
2197
54
        hts_log_error("Broken VCF record, the number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)",
2198
54
            bcf_seqname_safe(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h));
2199
54
        return -1;
2200
54
    }
2201
2202
16.9k
    if ( hfp->format.format == vcf || hfp->format.format == text_format )
2203
14.8k
        return vcf_write(hfp,h,v);
2204
2205
2.11k
    if ( v->errcode & ~BCF_ERR_LIMITS ) // todo: unsure about the other BCF_ERR_LIMITS branches in vcf_parse_format_alloc4()
2206
1.76k
    {
2207
        // vcf_parse1() encountered a new contig or tag, undeclared in the
2208
        // header.  At this point, the header must have been printed,
2209
        // proceeding would lead to a broken BCF file. Errors must be checked
2210
        // and cleared by the caller before we can proceed.
2211
1.76k
        char errdescription[1024] = "";
2212
1.76k
        hts_log_error("Unchecked error (%d %s) at %s:%"PRIhts_pos, v->errcode, bcf_strerror(v->errcode, errdescription, sizeof(errdescription)), bcf_seqname_safe(h,v), v->pos+1);
2213
1.76k
        return -1;
2214
1.76k
    }
2215
345
    bcf1_sync(v);   // check if the BCF record was modified
2216
2217
345
    if ( v->unpacked & BCF_IS_64BIT )
2218
0
    {
2219
0
        hts_log_error("Data at %s:%"PRIhts_pos" contains 64-bit values not representable in BCF. Please use VCF instead", bcf_seqname_safe(h,v), v->pos+1);
2220
0
        return -1;
2221
0
    }
2222
2223
345
    BGZF *fp = hfp->fp.bgzf;
2224
345
    uint8_t x[32];
2225
345
    u32_to_le(v->shared.l + 24, x); // to include six 32-bit integers
2226
345
    u32_to_le(v->indiv.l, x + 4);
2227
345
    i32_to_le(v->rid, x + 8);
2228
345
    u32_to_le(v->pos, x + 12);
2229
345
    u32_to_le(v->rlen, x + 16);
2230
345
    float_to_le(v->qual, x + 20);
2231
345
    u16_to_le(v->n_info, x + 24);
2232
345
    u16_to_le(v->n_allele, x + 26);
2233
345
    u32_to_le((uint32_t)v->n_fmt<<24 | (v->n_sample & 0xffffff), x + 28);
2234
345
    if ( bgzf_write(fp, x, 32) != 32 ) return -1;
2235
345
    if ( bgzf_write(fp, v->shared.s, v->shared.l) != v->shared.l ) return -1;
2236
345
    if ( bgzf_write(fp, v->indiv.s, v->indiv.l) != v->indiv.l ) return -1;
2237
2238
345
    if (hfp->idx) {
2239
0
        if (bgzf_idx_push(fp, hfp->idx, v->rid, v->pos, v->pos + v->rlen,
2240
0
                          bgzf_tell(fp), 1) < 0)
2241
0
            return -1;
2242
0
    }
2243
2244
345
    return 0;
2245
345
}
2246
2247
/**********************
2248
 *** VCF header I/O ***
2249
 **********************/
2250
2251
0
static int add_missing_contig_hrec(bcf_hdr_t *h, const char *name) {
2252
0
    bcf_hrec_t *hrec = calloc(1, sizeof(bcf_hrec_t));
2253
0
    int save_errno;
2254
0
    if (!hrec) goto fail;
2255
2256
0
    hrec->key = strdup("contig");
2257
0
    if (!hrec->key) goto fail;
2258
2259
0
    if (bcf_hrec_add_key(hrec, "ID", strlen("ID")) < 0) goto fail;
2260
0
    if (bcf_hrec_set_val(hrec, hrec->nkeys-1, name, strlen(name), 0) < 0)
2261
0
        goto fail;
2262
0
    if (bcf_hdr_add_hrec(h, hrec) < 0)
2263
0
        goto fail;
2264
0
    return 0;
2265
2266
0
 fail:
2267
0
    save_errno = errno;
2268
0
    hts_log_error("%s", strerror(errno));
2269
0
    if (hrec) bcf_hrec_destroy(hrec);
2270
0
    errno = save_errno;
2271
0
    return -1;
2272
0
}
2273
2274
bcf_hdr_t *vcf_hdr_read(htsFile *fp)
2275
4.76k
{
2276
4.76k
    kstring_t txt, *s = &fp->line;
2277
4.76k
    int ret;
2278
4.76k
    bcf_hdr_t *h;
2279
4.76k
    tbx_t *idx = NULL;
2280
4.76k
    const char **names = NULL;
2281
4.76k
    h = bcf_hdr_init("r");
2282
4.76k
    if (!h) {
2283
0
        hts_log_error("Failed to allocate bcf header");
2284
0
        return NULL;
2285
0
    }
2286
4.76k
    txt.l = txt.m = 0; txt.s = 0;
2287
96.3k
    while ((ret = hts_getline(fp, KS_SEP_LINE, s)) >= 0) {
2288
95.8k
        int e = 0;
2289
95.8k
        if (s->l == 0) continue;
2290
94.9k
        if (s->s[0] != '#') {
2291
10
            hts_log_error("No sample line");
2292
10
            goto error;
2293
10
        }
2294
94.9k
        if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here
2295
0
            kstring_t tmp = { 0, 0, NULL };
2296
0
            hFILE *f = hopen(fp->fn_aux, "r");
2297
0
            if (f == NULL) {
2298
0
                hts_log_error("Couldn't open \"%s\"", fp->fn_aux);
2299
0
                goto error;
2300
0
            }
2301
0
            while (tmp.l = 0, kgetline(&tmp, (kgets_func *) hgets, f) >= 0) {
2302
0
                char *tab = strchr(tmp.s, '\t');
2303
0
                if (tab == NULL) continue;
2304
0
                e |= (kputs("##contig=<ID=", &txt) < 0);
2305
0
                e |= (kputsn(tmp.s, tab - tmp.s, &txt) < 0);
2306
0
                e |= (kputs(",length=", &txt) < 0);
2307
0
                e |= (kputl(atol(tab), &txt) < 0);
2308
0
                e |= (kputsn(">\n", 2, &txt) < 0);
2309
0
            }
2310
0
            free(tmp.s);
2311
0
            if (hclose(f) != 0) {
2312
0
                hts_log_error("Error on closing %s", fp->fn_aux);
2313
0
                goto error;
2314
0
            }
2315
0
            if (e) goto error;
2316
0
        }
2317
94.9k
        if (kputsn(s->s, s->l, &txt) < 0) goto error;
2318
94.9k
        if (kputc('\n', &txt) < 0) goto error;
2319
94.9k
        if (s->s[1] != '#') break;
2320
94.9k
    }
2321
4.75k
    if ( ret < -1 ) goto error;
2322
4.73k
    if ( !txt.s )
2323
0
    {
2324
0
        hts_log_error("Could not read the header");
2325
0
        goto error;
2326
0
    }
2327
4.73k
    if ( bcf_hdr_parse(h, txt.s) < 0 ) goto error;
2328
2329
    // check tabix index, are all contigs listed in the header? add the missing ones
2330
3.96k
    idx = tbx_index_load3(fp->fn, NULL, HTS_IDX_SILENT_FAIL);
2331
3.96k
    if ( idx )
2332
0
    {
2333
0
        int i, n, need_sync = 0;
2334
0
        names = tbx_seqnames(idx, &n);
2335
0
        if (!names) goto error;
2336
0
        for (i=0; i<n; i++)
2337
0
        {
2338
0
            bcf_hrec_t *hrec = bcf_hdr_get_hrec(h, BCF_HL_CTG, "ID", (char*) names[i], NULL);
2339
0
            if ( hrec ) continue;
2340
0
            if (add_missing_contig_hrec(h, names[i]) < 0) goto error;
2341
0
            need_sync = 1;
2342
0
        }
2343
0
        if ( need_sync ) {
2344
0
            if (bcf_hdr_sync(h) < 0) goto error;
2345
0
        }
2346
0
        free(names);
2347
0
        tbx_destroy(idx);
2348
0
    }
2349
3.96k
    free(txt.s);
2350
3.96k
    return h;
2351
2352
798
 error:
2353
798
    if (idx) tbx_destroy(idx);
2354
798
    free(names);
2355
798
    free(txt.s);
2356
798
    if (h) bcf_hdr_destroy(h);
2357
798
    return NULL;
2358
3.96k
}
2359
2360
int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname)
2361
0
{
2362
0
    int i = 0, n = 0, save_errno;
2363
0
    char **lines = hts_readlines(fname, &n);
2364
0
    if ( !lines ) return 1;
2365
0
    for (i=0; i<n-1; i++)
2366
0
    {
2367
0
        int k;
2368
0
        bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,lines[i],&k);
2369
0
        if (!hrec) goto fail;
2370
0
        if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
2371
0
            bcf_hrec_destroy(hrec);
2372
0
            goto fail;
2373
0
        }
2374
0
        free(lines[i]);
2375
0
        lines[i] = NULL;
2376
0
    }
2377
0
    if (bcf_hdr_parse_sample_line(hdr, lines[n-1]) < 0) goto fail;
2378
0
    if (bcf_hdr_sync(hdr) < 0) goto fail;
2379
0
    free(lines[n-1]);
2380
0
    free(lines);
2381
0
    return 0;
2382
2383
0
 fail:
2384
0
    save_errno = errno;
2385
0
    for (; i < n; i++)
2386
0
        free(lines[i]);
2387
0
    free(lines);
2388
0
    errno = save_errno;
2389
0
    return 1;
2390
0
}
2391
2392
static int _bcf_hrec_format(const bcf_hrec_t *hrec, int is_bcf, kstring_t *str)
2393
32.5k
{
2394
32.5k
    uint32_t e = 0;
2395
32.5k
    if ( !hrec->value )
2396
24.9k
    {
2397
24.9k
        int j, nout = 0;
2398
24.9k
        e |= ksprintf(str, "##%s=<", hrec->key) < 0;
2399
131k
        for (j=0; j<hrec->nkeys; j++)
2400
106k
        {
2401
            // do not output IDX if output is VCF
2402
106k
            if ( !is_bcf && !strcmp("IDX",hrec->keys[j]) ) continue;
2403
101k
            if ( nout ) e |= kputc(',',str) < 0;
2404
101k
            e |= ksprintf(str,"%s=%s", hrec->keys[j], hrec->vals[j]) < 0;
2405
101k
            nout++;
2406
101k
        }
2407
24.9k
        e |= ksprintf(str,">\n") < 0;
2408
24.9k
    }
2409
7.63k
    else
2410
7.63k
        e |= ksprintf(str,"##%s=%s\n", hrec->key,hrec->value) < 0;
2411
2412
32.5k
    return e == 0 ? 0 : -1;
2413
32.5k
}
2414
2415
int bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str)
2416
0
{
2417
0
    return _bcf_hrec_format(hrec,0,str);
2418
0
}
2419
2420
int bcf_hdr_format(const bcf_hdr_t *hdr, int is_bcf, kstring_t *str)
2421
4.37k
{
2422
4.37k
    int i, r = 0;
2423
36.9k
    for (i=0; i<hdr->nhrec; i++)
2424
32.5k
        r |= _bcf_hrec_format(hdr->hrec[i], is_bcf, str) < 0;
2425
2426
4.37k
    r |= ksprintf(str, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO") < 0;
2427
4.37k
    if ( bcf_hdr_nsamples(hdr) )
2428
1.47k
    {
2429
1.47k
        r |= ksprintf(str, "\tFORMAT") < 0;
2430
13.2k
        for (i=0; i<bcf_hdr_nsamples(hdr); i++)
2431
11.8k
            r |= ksprintf(str, "\t%s", hdr->samples[i]) < 0;
2432
1.47k
    }
2433
4.37k
    r |= ksprintf(str, "\n") < 0;
2434
2435
4.37k
    return r ? -1 : 0;
2436
4.37k
}
2437
2438
char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len)
2439
0
{
2440
0
    kstring_t txt = {0,0,0};
2441
0
    if (bcf_hdr_format(hdr, is_bcf, &txt) < 0)
2442
0
        return NULL;
2443
0
    if ( len ) *len = txt.l;
2444
0
    return txt.s;
2445
0
}
2446
2447
const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *n)
2448
0
{
2449
0
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
2450
0
    int i, tid, m = kh_size(d);
2451
0
    const char **names = (const char**) calloc(m,sizeof(const char*));
2452
0
    if ( !names )
2453
0
    {
2454
0
        hts_log_error("Failed to allocate memory");
2455
0
        *n = 0;
2456
0
        return NULL;
2457
0
    }
2458
0
    khint_t k;
2459
0
    for (k=kh_begin(d); k<kh_end(d); k++)
2460
0
    {
2461
0
        if ( !kh_exist(d,k) ) continue;
2462
0
        if ( !kh_val(d, k).hrec[0] ) continue;  // removed via bcf_hdr_remove
2463
0
        tid = kh_val(d,k).id;
2464
0
        if ( tid >= m )
2465
0
        {
2466
            // This can happen after a contig has been removed from BCF header via bcf_hdr_remove()
2467
0
            if ( hts_resize(const char*, tid + 1, &m, &names, HTS_RESIZE_CLEAR)<0 )
2468
0
            {
2469
0
                hts_log_error("Failed to allocate memory");
2470
0
                *n = 0;
2471
0
                free(names);
2472
0
                return NULL;
2473
0
            }
2474
0
            m = tid + 1;
2475
0
        }
2476
0
        names[tid] = kh_key(d,k);
2477
0
    }
2478
    // ensure there are no gaps
2479
0
    for (i=0,tid=0; tid<m; i++,tid++)
2480
0
    {
2481
0
        while ( tid<m && !names[tid] ) tid++;
2482
0
        if ( tid==m ) break;
2483
0
        if ( i==tid ) continue;
2484
0
        names[i] = names[tid];
2485
0
        names[tid] = 0;
2486
0
    }
2487
0
    *n = i;
2488
0
    return names;
2489
0
}
2490
2491
int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h)
2492
2.18k
{
2493
2.18k
    kstring_t htxt = {0,0,0};
2494
2.18k
    if (bcf_hdr_format(h, 0, &htxt) < 0) {
2495
0
        free(htxt.s);
2496
0
        return -1;
2497
0
    }
2498
2.18k
    while (htxt.l && htxt.s[htxt.l-1] == '\0') --htxt.l; // kill trailing zeros
2499
2.18k
    int ret;
2500
2.18k
    if ( fp->format.compression!=no_compression ) {
2501
0
        ret = bgzf_write(fp->fp.bgzf, htxt.s, htxt.l);
2502
0
        if (bgzf_flush(fp->fp.bgzf) != 0) return -1;
2503
2.18k
    } else {
2504
2.18k
        ret = hwrite(fp->fp.hfile, htxt.s, htxt.l);
2505
2.18k
    }
2506
2.18k
    free(htxt.s);
2507
2.18k
    return ret<0 ? -1 : 0;
2508
2.18k
}
2509
2510
/***********************
2511
 *** Typed value I/O ***
2512
 ***********************/
2513
2514
int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
2515
491k
{
2516
491k
    int32_t max = INT32_MIN, min = INT32_MAX;
2517
491k
    int i;
2518
491k
    if (n <= 0) {
2519
1.54k
        return bcf_enc_size(s, 0, BCF_BT_NULL);
2520
489k
    } else if (n == 1) {
2521
13.4k
        return bcf_enc_int1(s, a[0]);
2522
476k
    } else {
2523
476k
        if (wsize <= 0) wsize = n;
2524
2525
        // Equivalent to:
2526
        // for (i = 0; i < n; ++i) {
2527
        //     if (a[i] == bcf_int32_missing || a[i] == bcf_int32_vector_end )
2528
        //         continue;
2529
        //     if (max < a[i]) max = a[i];
2530
        //     if (min > a[i]) min = a[i];
2531
        // }
2532
476k
        int max4[4] = {INT32_MIN, INT32_MIN, INT32_MIN, INT32_MIN};
2533
476k
        int min4[4] = {INT32_MAX, INT32_MAX, INT32_MAX, INT32_MAX};
2534
149M
        for (i = 0; i < (n&~3); i+=4) {
2535
            // bcf_int32_missing    == INT32_MIN and
2536
            // bcf_int32_vector_end == INT32_MIN+1.
2537
            // We skip these, but can mostly avoid explicit checking
2538
148M
            if (max4[0] < a[i+0]) max4[0] = a[i+0];
2539
148M
            if (max4[1] < a[i+1]) max4[1] = a[i+1];
2540
148M
            if (max4[2] < a[i+2]) max4[2] = a[i+2];
2541
148M
            if (max4[3] < a[i+3]) max4[3] = a[i+3];
2542
148M
            if (min4[0] > a[i+0] && a[i+0] > INT32_MIN+1) min4[0] = a[i+0];
2543
148M
            if (min4[1] > a[i+1] && a[i+1] > INT32_MIN+1) min4[1] = a[i+1];
2544
148M
            if (min4[2] > a[i+2] && a[i+2] > INT32_MIN+1) min4[2] = a[i+2];
2545
148M
            if (min4[3] > a[i+3] && a[i+3] > INT32_MIN+1) min4[3] = a[i+3];
2546
148M
        }
2547
476k
        min = min4[0];
2548
476k
        if (min > min4[1]) min = min4[1];
2549
476k
        if (min > min4[2]) min = min4[2];
2550
476k
        if (min > min4[3]) min = min4[3];
2551
476k
        max = max4[0];
2552
476k
        if (max < max4[1]) max = max4[1];
2553
476k
        if (max < max4[2]) max = max4[2];
2554
476k
        if (max < max4[3]) max = max4[3];
2555
1.11M
        for (; i < n; ++i) {
2556
637k
            if (max < a[i]) max = a[i];
2557
637k
            if (min > a[i] && a[i] > INT32_MIN+1) min = a[i];
2558
637k
        }
2559
2560
476k
        if (max <= BCF_MAX_BT_INT8 && min >= BCF_MIN_BT_INT8) {
2561
41.6k
            if (bcf_enc_size(s, wsize, BCF_BT_INT8) < 0 ||
2562
41.6k
                ks_resize(s, s->l + n) < 0)
2563
0
                return -1;
2564
41.6k
            uint8_t *p = (uint8_t *) s->s + s->l;
2565
18.2M
            for (i = 0; i < n; ++i, p++) {
2566
18.1M
                if ( a[i]==bcf_int32_vector_end )   *p = bcf_int8_vector_end;
2567
17.8M
                else if ( a[i]==bcf_int32_missing ) *p = bcf_int8_missing;
2568
321k
                else *p = a[i];
2569
18.1M
            }
2570
41.6k
            s->l += n;
2571
434k
        } else if (max <= BCF_MAX_BT_INT16 && min >= BCF_MIN_BT_INT16) {
2572
285k
            uint8_t *p;
2573
285k
            if (bcf_enc_size(s, wsize, BCF_BT_INT16) < 0 ||
2574
285k
                ks_resize(s, s->l + n * sizeof(int16_t)) < 0)
2575
0
                return -1;
2576
285k
            p = (uint8_t *) s->s + s->l;
2577
154M
            for (i = 0; i < n; ++i)
2578
154M
            {
2579
154M
                int16_t x;
2580
154M
                if ( a[i]==bcf_int32_vector_end ) x = bcf_int16_vector_end;
2581
152M
                else if ( a[i]==bcf_int32_missing ) x = bcf_int16_missing;
2582
3.07M
                else x = a[i];
2583
154M
                i16_to_le(x, p);
2584
154M
                p += sizeof(int16_t);
2585
154M
            }
2586
285k
            s->l += n * sizeof(int16_t);
2587
285k
        } else {
2588
149k
            uint8_t *p;
2589
149k
            if (bcf_enc_size(s, wsize, BCF_BT_INT32) < 0 ||
2590
149k
                ks_resize(s, s->l + n * sizeof(int32_t)) < 0)
2591
0
                return -1;
2592
149k
            p = (uint8_t *) s->s + s->l;
2593
423M
            for (i = 0; i < n; ++i) {
2594
423M
                i32_to_le(a[i], p);
2595
423M
                p += sizeof(int32_t);
2596
423M
            }
2597
149k
            s->l += n * sizeof(int32_t);
2598
149k
        }
2599
476k
    }
2600
2601
476k
    return 0;
2602
491k
}
2603
2604
#ifdef VCF_ALLOW_INT64
2605
static int bcf_enc_long1(kstring_t *s, int64_t x) {
2606
    uint32_t e = 0;
2607
    if (x <= BCF_MAX_BT_INT32 && x >= BCF_MIN_BT_INT32)
2608
        return bcf_enc_int1(s, x);
2609
    if (x == bcf_int64_vector_end) {
2610
        e |= bcf_enc_size(s, 1, BCF_BT_INT8);
2611
        e |= kputc(bcf_int8_vector_end, s) < 0;
2612
    } else if (x == bcf_int64_missing) {
2613
        e |= bcf_enc_size(s, 1, BCF_BT_INT8);
2614
        e |= kputc(bcf_int8_missing, s) < 0;
2615
    } else {
2616
        e |= bcf_enc_size(s, 1, BCF_BT_INT64);
2617
        e |= ks_expand(s, 8);
2618
        if (e == 0) { u64_to_le(x, (uint8_t *) s->s + s->l); s->l += 8; }
2619
    }
2620
    return e == 0 ? 0 : -1;
2621
}
2622
#endif
2623
2624
669k
static inline int serialize_float_array(kstring_t *s, size_t n, const float *a) {
2625
669k
    uint8_t *p;
2626
669k
    size_t i;
2627
669k
    size_t bytes = n * sizeof(float);
2628
2629
669k
    if (bytes / sizeof(float) != n) return -1;
2630
669k
    if (ks_resize(s, s->l + bytes) < 0) return -1;
2631
2632
669k
    p = (uint8_t *) s->s + s->l;
2633
139M
    for (i = 0; i < n; i++) {
2634
138M
        float_to_le(a[i], p);
2635
138M
        p += sizeof(float);
2636
138M
    }
2637
669k
    s->l += bytes;
2638
2639
669k
    return 0;
2640
669k
}
2641
2642
int bcf_enc_vfloat(kstring_t *s, int n, float *a)
2643
669k
{
2644
669k
    assert(n >= 0);
2645
669k
    bcf_enc_size(s, n, BCF_BT_FLOAT);
2646
669k
    serialize_float_array(s, n, a);
2647
669k
    return 0; // FIXME: check for errs in this function
2648
669k
}
2649
2650
int bcf_enc_vchar(kstring_t *s, int l, const char *a)
2651
4.36M
{
2652
4.36M
    bcf_enc_size(s, l, BCF_BT_CHAR);
2653
4.36M
    kputsn(a, l, s);
2654
4.36M
    return 0; // FIXME: check for errs in this function
2655
4.36M
}
2656
2657
// Special case of n==1 as it also occurs quite often in FORMAT data.
2658
// This version is also small enough to get inlined.
2659
316k
static inline int bcf_fmt_array1(kstring_t *s, int type, void *data) {
2660
316k
    uint32_t e = 0;
2661
316k
    uint8_t *p = (uint8_t *)data;
2662
316k
    int32_t v;
2663
2664
    // helps gcc more than clang here. In billions of cycles:
2665
    //          bcf_fmt_array1  bcf_fmt_array
2666
    // gcc7:    23.2            24.3
2667
    // gcc13:   21.6            23.0
2668
    // clang13: 27.1            27.8
2669
316k
    switch (type) {
2670
314k
    case BCF_BT_CHAR:
2671
314k
        e |= kputc_(*p == bcf_str_missing ? '.' : *p, s) < 0;
2672
314k
        break;
2673
2674
956
    case BCF_BT_INT8:
2675
956
        if (*(int8_t *)p != bcf_int8_vector_end) {
2676
956
            e |= ((*(int8_t *)p == bcf_int8_missing)
2677
956
                  ? kputc_('.', s)
2678
956
                  : kputw(*(int8_t *)p, s)) < 0;
2679
956
        }
2680
956
        break;
2681
410
    case BCF_BT_INT16:
2682
410
        v = le_to_i16(p);
2683
410
        if (v != bcf_int16_vector_end) {
2684
410
            e |= (v == bcf_int16_missing
2685
410
                  ? kputc_('.', s)
2686
410
                  : kputw(v, s)) < 0;
2687
410
        }
2688
410
        break;
2689
2690
540
    case BCF_BT_INT32:
2691
540
        v = le_to_i32(p);
2692
540
        if (v != bcf_int32_vector_end) {
2693
540
            e |= (v == bcf_int32_missing
2694
540
                  ? kputc_('.', s)
2695
540
                  : kputw(v, s)) < 0;
2696
540
        }
2697
540
        break;
2698
2699
0
    case BCF_BT_FLOAT:
2700
0
        v = le_to_u32(p);
2701
0
        if (v != bcf_float_vector_end) {
2702
0
            e |= (v == bcf_float_missing
2703
0
                  ? kputc_('.', s)
2704
0
                  : kputd(le_to_float(p), s)) < 0;
2705
0
        }
2706
0
        break;
2707
2708
0
    default:
2709
0
        hts_log_error("Unexpected type %d", type);
2710
0
        return -1;
2711
316k
    }
2712
2713
316k
    return e == 0 ? 0 : -1;
2714
316k
}
2715
2716
int bcf_fmt_array(kstring_t *s, int n, int type, void *data)
2717
4.88M
{
2718
4.88M
    int j = 0;
2719
4.88M
    uint32_t e = 0;
2720
4.88M
    if (n == 0) {
2721
3.30M
        return kputc_('.', s) >= 0 ? 0 : -1;
2722
3.30M
    }
2723
2724
1.58M
    if (type == BCF_BT_CHAR)
2725
134k
    {
2726
134k
        char *p = (char *)data;
2727
2728
        // Note bcf_str_missing is already accounted for in n==0 above.
2729
134k
        if (n >= 8) {
2730
63.1k
            char *p_end = memchr(p, 0, n);
2731
63.1k
            e |= kputsn(p, p_end ? p_end-p : n, s) < 0;
2732
71.0k
        } else {
2733
203k
            for (j = 0; j < n && *p; ++j, ++p)
2734
132k
               e |= kputc(*p, s) < 0;
2735
71.0k
        }
2736
134k
    }
2737
1.44M
    else
2738
1.44M
    {
2739
1.44M
        #define BRANCH(type_t, convert, is_missing, is_vector_end, kprint) { \
2740
1.44M
            uint8_t *p = (uint8_t *) data; \
2741
363M
            for (j=0; j<n; j++, p += sizeof(type_t))    \
2742
361M
            { \
2743
361M
                type_t v = convert(p); \
2744
361M
                if ( is_vector_end ) break; \
2745
361M
                if ( j ) e |= kputc_(',', s) < 0; \
2746
361M
                e |= (is_missing ? kputc('.', s) : kprint) < 0; \
2747
361M
            } \
2748
1.44M
        }
2749
1.44M
        switch (type) {
2750
480k
            case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8, v==bcf_int8_missing,  v==bcf_int8_vector_end,  kputw(v, s)); break;
2751
274k
            case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, v==bcf_int16_missing, v==bcf_int16_vector_end, kputw(v, s)); break;
2752
356k
            case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, v==bcf_int32_missing, v==bcf_int32_vector_end, kputw(v, s)); break;
2753
335k
            case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, v==bcf_float_missing, v==bcf_float_vector_end, kputd(le_to_float(p), s)); break;
2754
0
            default: hts_log_error("Unexpected type %d", type); exit(1); break;
2755
1.44M
        }
2756
1.44M
        #undef BRANCH
2757
1.44M
    }
2758
1.58M
    return e == 0 ? 0 : -1;
2759
1.58M
}
2760
2761
uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr)
2762
2.49M
{
2763
2.49M
    int x, type;
2764
2.49M
    x = bcf_dec_size(ptr, &ptr, &type);
2765
2.49M
    bcf_fmt_array(s, x, type, ptr);
2766
2.49M
    return ptr + (x << bcf_type_shift[type]);
2767
2.49M
}
2768
2769
/********************
2770
 *** VCF site I/O ***
2771
 ********************/
2772
2773
typedef struct {
2774
    int key;            // Key for h->id[BCF_DT_ID][key] vdict
2775
    int max_m;          // number of elements in field array (ie commas)
2776
    int size;           // field size (max_l or max_g*4 if is_gt)
2777
    int offset;         // offset of buf into h->mem
2778
    uint32_t is_gt:1,   // is genotype
2779
             max_g:31;  // maximum number of genotypes
2780
    uint32_t max_l;     // length of field
2781
    uint32_t y;         // h->id[0][fmt[j].key].val->info[BCF_HL_FMT]
2782
    uint8_t *buf;       // Pointer into h->mem
2783
} fmt_aux_t;
2784
2785
// fmt_aux_t field notes:
2786
// max_* are biggest sizes of the various FORMAT fields across all samples.
2787
// We use these after pivoting the data to ensure easy random access
2788
// of a specific sample.
2789
//
2790
// max_m is only used for type BCF_HT_REAL or BCF_HT_INT
2791
// max_g is only used for is_gt == 1 (will be BCF_HT_STR)
2792
// max_l is only used for is_gt == 0 (will be BCF_HT_STR)
2793
//
2794
// These are computed in vcf_parse_format_max3 and used in
2795
// vcf_parse_format_alloc4 to get the size.
2796
//
2797
// size is computed from max_g, max_l, max_m and is_gt.  Once computed
2798
// the max values are never accessed again.
2799
//
2800
// In theory all 4 vars could be coalesced into a single variable, but this
2801
// significantly harms speed (even if done via a union).  It's about 25-30%
2802
// slower.
2803
2804
static inline int align_mem(kstring_t *s)
2805
135k
{
2806
135k
    int e = 0;
2807
135k
    if (s->l&7) {
2808
20.2k
        uint64_t zero = 0;
2809
20.2k
        e = kputsn((char*)&zero, 8 - (s->l&7), s) < 0;
2810
20.2k
    }
2811
135k
    return e == 0 ? 0 : -1;
2812
135k
}
2813
2814
136k
#define MAX_N_FMT 255   /* Limited by size of bcf1_t n_fmt field */
2815
2816
// detect FORMAT "."
2817
static int vcf_parse_format_empty1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
2818
6.92k
                                   const char *p, const char *q) {
2819
6.92k
    const char *end = s->s + s->l;
2820
6.92k
    if ( q>=end )
2821
34
    {
2822
34
        hts_log_error("FORMAT column with no sample columns starting at %s:%"PRIhts_pos"", bcf_seqname_safe(h,v), v->pos+1);
2823
34
        v->errcode |= BCF_ERR_NCOLS;
2824
34
        return -1;
2825
34
    }
2826
2827
6.89k
    v->n_fmt = 0;
2828
6.89k
    if ( p[0]=='.' && p[1]==0 ) // FORMAT field is empty "."
2829
45
    {
2830
45
        v->n_sample = bcf_hdr_nsamples(h);
2831
45
        return 1;
2832
45
    }
2833
2834
6.84k
    return 0;
2835
6.89k
}
2836
2837
// get format information from the dictionary
2838
static int vcf_parse_format_dict2(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
2839
6.84k
                                  const char *p, const char *q, fmt_aux_t *fmt) {
2840
6.84k
    const vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
2841
6.84k
    char *t;
2842
6.84k
    int j;
2843
6.84k
    ks_tokaux_t aux1;
2844
2845
143k
    for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) {
2846
136k
        if (j >= MAX_N_FMT) {
2847
3
            v->errcode |= BCF_ERR_LIMITS;
2848
3
            hts_log_error("FORMAT column at %s:%"PRIhts_pos" lists more identifiers than htslib can handle",
2849
3
                bcf_seqname_safe(h,v), v->pos+1);
2850
3
            return -1;
2851
3
        }
2852
2853
136k
        *(char*)aux1.p = 0;
2854
136k
        khint_t k = kh_get(vdict, d, t);
2855
136k
        if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_FMT] == 15) {
2856
8.18k
            if ( t[0]=='.' && t[1]==0 )
2857
1
            {
2858
1
                hts_log_error("Invalid FORMAT tag name '.' at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
2859
1
                v->errcode |= BCF_ERR_TAG_INVALID;
2860
1
                return -1;
2861
1
            }
2862
8.17k
            hts_log_warning("FORMAT '%s' at %s:%"PRIhts_pos" is not defined in the header, assuming Type=String", t, bcf_seqname_safe(h,v), v->pos+1);
2863
8.17k
            kstring_t tmp = {0,0,0};
2864
8.17k
            int l;
2865
8.17k
            ksprintf(&tmp, "##FORMAT=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", t);
2866
8.17k
            bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
2867
8.17k
            free(tmp.s);
2868
8.17k
            int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
2869
8.17k
            if (res < 0) bcf_hrec_destroy(hrec);
2870
8.17k
            if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
2871
2872
8.17k
            k = kh_get(vdict, d, t);
2873
8.17k
            v->errcode |= BCF_ERR_TAG_UNDEF;
2874
8.17k
            if (res || k == kh_end(d)) {
2875
16
                hts_log_error("Could not add dummy header for FORMAT '%s' at %s:%"PRIhts_pos, t, bcf_seqname_safe(h,v), v->pos+1);
2876
16
                v->errcode |= BCF_ERR_TAG_INVALID;
2877
16
                return -1;
2878
16
            }
2879
8.17k
        }
2880
136k
        fmt[j].max_l = fmt[j].max_m = fmt[j].max_g = 0;
2881
136k
        fmt[j].key = kh_val(d, k).id;
2882
136k
        fmt[j].is_gt = (t[0] == 'G' && t[1] == 'T' && !t[2]);
2883
136k
        fmt[j].y = h->id[0][fmt[j].key].val->info[BCF_HL_FMT];
2884
136k
        v->n_fmt++;
2885
136k
    }
2886
6.82k
    return 0;
2887
6.84k
}
2888
2889
// compute max
2890
static int vcf_parse_format_max3(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
2891
6.82k
                                 char *p, char *q, fmt_aux_t *fmt) {
2892
6.82k
    int n_sample_ori = -1;
2893
6.82k
    char *r = q + 1;  // r: position in the format string
2894
6.82k
    int l = 0, m = 1, g = 1, j;
2895
6.82k
    v->n_sample = 0;  // m: max vector size, l: max field len, g: max number of alleles
2896
6.82k
    const char *end = s->s + s->l;
2897
2898
31.2k
    while ( r<end )
2899
31.0k
    {
2900
        // can we skip some samples?
2901
31.0k
        if ( h->keep_samples )
2902
0
        {
2903
0
            n_sample_ori++;
2904
0
            if ( !bit_array_test(h->keep_samples,n_sample_ori) )
2905
0
            {
2906
0
                while ( *r!='\t' && r<end ) r++;
2907
0
                if ( *r=='\t' ) { *r = 0; r++; }
2908
0
                continue;
2909
0
            }
2910
0
        }
2911
2912
        // collect fmt stats: max vector size, length, number of alleles
2913
31.0k
        j = 0;  // j-th format field
2914
31.0k
        fmt_aux_t *f = fmt;
2915
31.0k
        static char meta[256] = {
2916
            // \0 \t , / : |
2917
31.0k
            1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2918
31.0k
            0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1, 0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,
2919
31.0k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2920
31.0k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
2921
31.0k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2922
31.0k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2923
31.0k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2924
31.0k
            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2925
31.0k
        };
2926
2927
31.0k
        char *r_start = r;
2928
11.4M
        for (;;) {
2929
            // Quickly skip ahead to an appropriate meta-character
2930
14.5M
            while (!meta[(unsigned char)*r]) r++;
2931
2932
11.4M
            switch (*r) {
2933
10.2M
            case ',':
2934
10.2M
                m++;
2935
10.2M
                break;
2936
2937
11.7k
            case '|':
2938
1.10M
            case '/':
2939
1.10M
                if (f->is_gt) g++;
2940
1.10M
                break;
2941
2942
10.8k
            case '\t':
2943
10.8k
                *r = 0; // fall through
2944
2945
10.8k
            default: // valid due to while loop above.
2946
31.0k
            case '\0':
2947
64.1k
            case ':':
2948
64.1k
                l = r - r_start; r_start = r;
2949
64.1k
                if (f->max_m < m) f->max_m = m;
2950
64.1k
                if (f->max_l < l) f->max_l = l;
2951
64.1k
                if (f->is_gt && f->max_g < g) f->max_g = g;
2952
64.1k
                l = 0, m = g = 1;
2953
64.1k
                if ( *r==':' ) {
2954
33.0k
                    j++; f++;
2955
33.0k
                    if ( j>=v->n_fmt ) {
2956
9
                        hts_log_error("Incorrect number of FORMAT fields at %s:%"PRIhts_pos"",
2957
9
                                      h->id[BCF_DT_CTG][v->rid].key, v->pos+1);
2958
9
                        v->errcode |= BCF_ERR_NCOLS;
2959
9
                        return -1;
2960
9
                    }
2961
33.0k
                } else goto end_for;
2962
33.0k
                break;
2963
11.4M
            }
2964
11.4M
            if ( r>=end ) break;
2965
11.4M
            r++;
2966
11.4M
        }
2967
31.0k
    end_for:
2968
31.0k
        v->n_sample++;
2969
31.0k
        if ( v->n_sample == bcf_hdr_nsamples(h) ) break;
2970
24.3k
        r++;
2971
24.3k
    }
2972
2973
6.82k
    return 0;
2974
6.82k
}
2975
2976
// allocate memory for arrays
2977
static int vcf_parse_format_alloc4(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
2978
                                   const char *p, const char *q,
2979
6.82k
                                   fmt_aux_t *fmt) {
2980
6.82k
    kstring_t *mem = (kstring_t*)&h->mem;
2981
2982
6.82k
    int j;
2983
142k
    for (j = 0; j < v->n_fmt; ++j) {
2984
135k
        fmt_aux_t *f = &fmt[j];
2985
135k
        if ( !f->max_m ) f->max_m = 1;  // omitted trailing format field
2986
2987
135k
        if ((f->y>>4&0xf) == BCF_HT_STR) {
2988
135k
            f->size = f->is_gt? f->max_g << 2 : f->max_l;
2989
135k
        } else if ((f->y>>4&0xf) == BCF_HT_REAL || (f->y>>4&0xf) == BCF_HT_INT) {
2990
0
            f->size = f->max_m << 2;
2991
4
        } else {
2992
4
            hts_log_error("The format type %d at %s:%"PRIhts_pos" is currently not supported", f->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1);
2993
4
            v->errcode |= BCF_ERR_TAG_INVALID;
2994
4
            return -1;
2995
4
        }
2996
2997
135k
        if (align_mem(mem) < 0) {
2998
0
            hts_log_error("Memory allocation failure at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
2999
0
            v->errcode |= BCF_ERR_LIMITS;
3000
0
            return -1;
3001
0
        }
3002
3003
        // Limit the total memory to ~2Gb per VCF row.  This should mean
3004
        // malformed VCF data is less likely to take excessive memory and/or
3005
        // time.
3006
135k
        if ((uint64_t) mem->l + v->n_sample * (uint64_t)f->size > INT_MAX) {
3007
0
            static int warned = 0;
3008
0
            if ( !warned ) hts_log_warning("Excessive memory required by FORMAT fields at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3009
0
            warned = 1;
3010
0
            v->errcode |= BCF_ERR_LIMITS;
3011
0
            f->size = -1;
3012
0
            f->offset = 0;
3013
0
            continue;
3014
0
        }
3015
3016
135k
        f->offset = mem->l;
3017
135k
        if (ks_resize(mem, mem->l + v->n_sample * (size_t)f->size) < 0) {
3018
0
            hts_log_error("Memory allocation failure at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3019
0
            v->errcode |= BCF_ERR_LIMITS;
3020
0
            return -1;
3021
0
        }
3022
135k
        mem->l += v->n_sample * f->size;
3023
135k
    }
3024
3025
6.81k
    {
3026
6.81k
        int j;
3027
142k
        for (j = 0; j < v->n_fmt; ++j)
3028
135k
            fmt[j].buf = (uint8_t*)mem->s + fmt[j].offset;
3029
6.81k
    }
3030
3031
6.81k
    return 0;
3032
6.82k
}
3033
3034
// Fill the sample fields
3035
static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3036
6.81k
                                  const char *p, const char *q, fmt_aux_t *fmt) {
3037
6.81k
    static int extreme_val_warned = 0;
3038
6.81k
    int n_sample_ori = -1;
3039
    // At beginning of the loop t points to the first char of a format
3040
6.81k
    const char *t = q + 1;
3041
6.81k
    int m = 0;   // m: sample id
3042
6.81k
    const int nsamples = bcf_hdr_nsamples(h);
3043
3044
6.81k
    const char *end = s->s + s->l;
3045
37.4k
    while ( t<end )
3046
35.3k
    {
3047
        // can we skip some samples?
3048
35.3k
        if ( h->keep_samples )
3049
0
        {
3050
0
            n_sample_ori++;
3051
0
            if ( !bit_array_test(h->keep_samples,n_sample_ori) )
3052
0
            {
3053
0
                while ( *t && t<end ) t++;
3054
0
                t++;
3055
0
                continue;
3056
0
            }
3057
0
        }
3058
35.3k
        if ( m == nsamples ) break;
3059
3060
30.8k
        int j = 0; // j-th format field, m-th sample
3061
63.6k
        while ( t < end )
3062
63.4k
        {
3063
63.4k
            fmt_aux_t *z = &fmt[j++];
3064
63.4k
            const int htype = z->y>>4&0xf;
3065
63.4k
            if (!z->buf) {
3066
1
                hts_log_error("Memory allocation failure for FORMAT field type %d at %s:%"PRIhts_pos,
3067
1
                              z->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1);
3068
1
                v->errcode |= BCF_ERR_LIMITS;
3069
1
                return -1;
3070
1
            }
3071
3072
63.4k
            if ( z->size==-1 )
3073
0
            {
3074
                // this field is to be ignored, it's too big
3075
0
                while ( *t != ':' && *t ) t++;
3076
0
            }
3077
63.4k
            else if (htype == BCF_HT_STR) {
3078
63.4k
                int l;
3079
63.4k
                if (z->is_gt) {
3080
                    // Genotypes.
3081
                    // <val>([|/]<val>)+... where <val> is [0-9]+ or ".".
3082
3.64k
                    int32_t is_phased = 0;
3083
3.64k
                    uint32_t *x = (uint32_t*)(z->buf + z->size * (size_t)m);
3084
3.64k
                    uint32_t unreadable = 0;
3085
3.64k
                    uint32_t max = 0;
3086
3.64k
                    int overflow = 0;
3087
536k
                    for (l = 0;; ++t) {
3088
536k
                        if (*t == '.') {
3089
305k
                            ++t, x[l++] = is_phased;
3090
305k
                        } else {
3091
230k
                            const char *tt = t;
3092
230k
                            uint32_t val;
3093
                            // Or "v->n_allele < 10", but it doesn't
3094
                            // seem to be any faster and this feels safer.
3095
230k
                            if (*t >= '0' && *t <= '9' &&
3096
230k
                                !(t[1] >= '0' && t[1] <= '9')) {
3097
215k
                                val = *t++ - '0';
3098
215k
                            } else {
3099
14.9k
                                val = hts_str2uint(t, (char **)&t,
3100
14.9k
                                                   sizeof(val) * CHAR_MAX - 2,
3101
14.9k
                                                   &overflow);
3102
14.9k
                                unreadable |= tt == t;
3103
14.9k
                            }
3104
230k
                            if (max < val) max = val;
3105
230k
                            x[l++] = (val + 1) << 1 | is_phased;
3106
230k
                        }
3107
536k
                        is_phased = (*t == '|');
3108
536k
                        if (*t != '|' && *t != '/') break;
3109
536k
                    }
3110
                    // Possibly check max against v->n_allele instead?
3111
3.64k
                    if (overflow || max > (INT32_MAX >> 1) - 1) {
3112
132
                        hts_log_error("Couldn't read GT data: value too large at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3113
132
                        return -1;
3114
132
                    }
3115
3.51k
                    if (unreadable) {
3116
33
                        hts_log_error("Couldn't read GT data: value not a number or '.' at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3117
33
                        return -1;
3118
33
                    }
3119
3.48k
                    if ( !l ) x[l++] = 0;   // An empty field, insert missing value
3120
597k
                    for (; l < z->size>>2; ++l)
3121
594k
                        x[l] = bcf_int32_vector_end;
3122
3123
59.7k
                } else {
3124
                    // Otherwise arbitrary strings
3125
59.7k
                    char *x = (char*)z->buf + z->size * (size_t)m;
3126
12.2M
                    for (l = 0; *t != ':' && *t; ++t)
3127
12.1M
                        x[l++] = *t;
3128
59.7k
                    if (z->size > l)
3129
47.5k
                        memset(&x[l], 0, (z->size-l) * sizeof(*x));
3130
59.7k
                }
3131
3132
63.4k
            } else if (htype == BCF_HT_INT) {
3133
                // One or more integers in an array
3134
0
                int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
3135
0
                int l;
3136
0
                for (l = 0;; ++t) {
3137
0
                    if (*t == '.') {
3138
0
                        x[l++] = bcf_int32_missing, ++t; // ++t to skip "."
3139
0
                    } else {
3140
0
                        int overflow = 0;
3141
0
                        char *te;
3142
0
                        long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
3143
0
                        if ( te==t || overflow || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 )
3144
0
                        {
3145
0
                            if ( !extreme_val_warned )
3146
0
                            {
3147
0
                                hts_log_warning("Extreme FORMAT/%s value encountered and set to missing at %s:%"PRIhts_pos,
3148
0
                                                h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname_safe(h,v), v->pos+1);
3149
0
                                extreme_val_warned = 1;
3150
0
                            }
3151
0
                            tmp_val = bcf_int32_missing;
3152
0
                        }
3153
0
                        x[l++] = tmp_val;
3154
0
                        t = te;
3155
0
                    }
3156
0
                    if (*t != ',') break;
3157
0
                }
3158
0
                if ( !l )
3159
0
                    x[l++] = bcf_int32_missing;
3160
0
                for (; l < z->size>>2; ++l)
3161
0
                    x[l] = bcf_int32_vector_end;
3162
3163
0
            } else if (htype == BCF_HT_REAL) {
3164
                // One of more floating point values in an array
3165
0
                float *x = (float*)(z->buf + z->size * (size_t)m);
3166
0
                int l;
3167
0
                for (l = 0;; ++t) {
3168
0
                    if (*t == '.' && !isdigit_c(t[1])) {
3169
0
                        bcf_float_set_missing(x[l++]), ++t; // ++t to skip "."
3170
0
                    } else {
3171
0
                        int overflow = 0;
3172
0
                        char *te;
3173
0
                        float tmp_val = hts_str2dbl(t, &te, &overflow);
3174
0
                        if ( (te==t || overflow) && !extreme_val_warned )
3175
0
                        {
3176
0
                            hts_log_warning("Extreme FORMAT/%s value encountered at %s:%"PRIhts_pos, h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname(h,v), v->pos+1);
3177
0
                            extreme_val_warned = 1;
3178
0
                        }
3179
0
                        x[l++] = tmp_val;
3180
0
                        t = te;
3181
0
                    }
3182
0
                    if (*t != ',') break;
3183
0
                }
3184
0
                if ( !l )
3185
                    // An empty field, insert missing value
3186
0
                    bcf_float_set_missing(x[l++]);
3187
0
                for (; l < z->size>>2; ++l)
3188
0
                    bcf_float_set_vector_end(x[l]);
3189
0
            } else {
3190
0
                hts_log_error("Unknown FORMAT field type %d at %s:%"PRIhts_pos, htype, bcf_seqname_safe(h,v), v->pos+1);
3191
0
                v->errcode |= BCF_ERR_TAG_INVALID;
3192
0
                return -1;
3193
0
            }
3194
3195
63.2k
            if (*t == '\0') {
3196
30.3k
                break;
3197
30.3k
            }
3198
32.9k
            else if (*t == ':') {
3199
32.8k
                t++;
3200
32.8k
            }
3201
38
            else {
3202
38
                char buffer[8];
3203
38
                hts_log_error("Invalid character %s in '%s' FORMAT field at %s:%"PRIhts_pos"",
3204
38
                    hts_strprint(buffer, sizeof buffer, '\'', t, 1),
3205
38
                    h->id[BCF_DT_ID][z->key].key, bcf_seqname_safe(h,v), v->pos+1);
3206
38
                v->errcode |= BCF_ERR_CHAR;
3207
38
                return -1;
3208
38
            }
3209
63.2k
        }
3210
3211
        // fill end-of-vector values
3212
1.60M
        for (; j < v->n_fmt; ++j) {
3213
1.57M
            fmt_aux_t *z = &fmt[j];
3214
1.57M
            const int htype = z->y>>4&0xf;
3215
1.57M
            int l;
3216
1.57M
            if (htype == BCF_HT_STR) {
3217
1.57M
                if (z->is_gt) {
3218
11.3k
                    int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
3219
11.3k
                    if (z->size) x[0] = bcf_int32_missing;
3220
3.65M
                    for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
3221
1.56M
                } else {
3222
1.56M
                    char *x = (char*)z->buf + z->size * (size_t)m;
3223
1.56M
                    if ( z->size ) {
3224
500k
                        x[0] = '.';
3225
500k
                        memset(&x[1], 0, (z->size-1) * sizeof(*x));
3226
500k
                    }
3227
1.56M
                }
3228
1.57M
            } else if (htype == BCF_HT_INT) {
3229
0
                int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
3230
0
                x[0] = bcf_int32_missing;
3231
0
                for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
3232
0
            } else if (htype == BCF_HT_REAL) {
3233
0
                float *x = (float*)(z->buf + z->size * (size_t)m);
3234
0
                bcf_float_set_missing(x[0]);
3235
0
                for (l = 1; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]);
3236
0
            }
3237
1.57M
        }
3238
3239
30.6k
        m++; t++;
3240
30.6k
    }
3241
3242
6.61k
    return 0;
3243
6.81k
}
3244
3245
// write individual genotype information
3246
static int vcf_parse_format_gt6(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3247
6.61k
                                const char *p, const char *q, fmt_aux_t *fmt) {
3248
6.61k
    kstring_t *str = &v->indiv;
3249
6.61k
    int i, need_downsize = 0;
3250
6.61k
    if (v->n_sample > 0) {
3251
140k
        for (i = 0; i < v->n_fmt; ++i) {
3252
133k
            fmt_aux_t *z = &fmt[i];
3253
133k
            if ( z->size==-1 ) {
3254
0
                need_downsize = 1;
3255
0
                continue;
3256
0
            }
3257
133k
            bcf_enc_int1(str, z->key);
3258
133k
            if ((z->y>>4&0xf) == BCF_HT_STR && !z->is_gt) {
3259
129k
                bcf_enc_size(str, z->size, BCF_BT_CHAR);
3260
129k
                kputsn((char*)z->buf, z->size * (size_t)v->n_sample, str);
3261
129k
            } else if ((z->y>>4&0xf) == BCF_HT_INT || z->is_gt) {
3262
3.65k
                bcf_enc_vint(str, (z->size>>2) * v->n_sample, (int32_t*)z->buf, z->size>>2);
3263
3.65k
            } else {
3264
0
                bcf_enc_size(str, z->size>>2, BCF_BT_FLOAT);
3265
0
                if (serialize_float_array(str, (z->size>>2) * (size_t)v->n_sample,
3266
0
                                          (float *) z->buf) != 0) {
3267
0
                    v->errcode |= BCF_ERR_LIMITS;
3268
0
                    hts_log_error("Out of memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3269
0
                    return -1;
3270
0
                }
3271
0
            }
3272
133k
        }
3273
3274
6.60k
    }
3275
6.61k
    if ( need_downsize ) {
3276
0
        i = 1;
3277
0
        while ( i < v->n_fmt ) {
3278
0
            if ( fmt[i].size==-1 )
3279
0
            {
3280
0
                memmove(&fmt[i-1],&fmt[i],sizeof(*fmt));
3281
0
                v->n_fmt--;
3282
0
            }
3283
0
            else
3284
0
                i++;
3285
0
        }
3286
0
    }
3287
3288
6.61k
    return 0;
3289
6.61k
}
3290
3291
// validity checking
3292
6.61k
static int vcf_parse_format_check7(const bcf_hdr_t *h, bcf1_t *v) {
3293
6.61k
    if ( v->n_sample!=bcf_hdr_nsamples(h) )
3294
100
    {
3295
100
        hts_log_error("Number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)",
3296
100
            bcf_seqname_safe(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h));
3297
100
        v->errcode |= BCF_ERR_NCOLS;
3298
100
        return -1;
3299
100
    }
3300
6.51k
    if ( v->indiv.l > 0xffffffff )
3301
0
    {
3302
0
        hts_log_error("The FORMAT at %s:%"PRIhts_pos" is too long", bcf_seqname_safe(h,v), v->pos+1);
3303
0
        v->errcode |= BCF_ERR_LIMITS;
3304
3305
        // Error recovery: return -1 if this is a critical error or 0 if we want to ignore the FORMAT and proceed
3306
0
        v->n_fmt = 0;
3307
0
        return -1;
3308
0
    }
3309
3310
6.51k
    return 0;
3311
6.51k
}
3312
3313
// p,q is the start and the end of the FORMAT field
3314
static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3315
                            char *p, char *q)
3316
13.1k
{
3317
13.1k
    if ( !bcf_hdr_nsamples(h) ) return 0;
3318
6.92k
    kstring_t *mem = (kstring_t*)&h->mem;
3319
6.92k
    mem->l = 0;
3320
3321
6.92k
    fmt_aux_t fmt[MAX_N_FMT];
3322
3323
    // detect FORMAT "."
3324
6.92k
    int ret; // +ve = ok, -ve = err
3325
6.92k
    if ((ret = vcf_parse_format_empty1(s, h, v, p, q)))
3326
79
        return ret ? 0 : -1;
3327
3328
    // get format information from the dictionary
3329
6.84k
    if (vcf_parse_format_dict2(s, h, v, p, q, fmt) < 0)
3330
20
        return -1;
3331
3332
    // FORMAT data is per-sample A:B:C A:B:C A:B:C ... but in memory it is
3333
    // stored as per-type arrays AAA... BBB... CCC...  This is basically
3334
    // a data rotation or pivot.
3335
3336
    // The size of elements in the array grow to their maximum needed,
3337
    // permitting fast random access.  This means however we have to first
3338
    // scan the whole FORMAT line to find the maximum of each type, and
3339
    // then scan it again to find the store the data.
3340
    // We break this down into compute-max, allocate, fill-out-buffers
3341
3342
    // TODO: ?
3343
    // The alternative would be to pivot on the first pass, with fixed
3344
    // size entries for numerics and concatenated strings otherwise, also
3345
    // tracking maximum sizes.  Then on a second pass we reallocate and
3346
    // copy the data again to a uniformly sized array.  Two passes through
3347
    // memory, but without doubling string parsing.
3348
3349
    // compute max
3350
6.82k
    if (vcf_parse_format_max3(s, h, v, p, q, fmt) < 0)
3351
9
        return -1;
3352
3353
    // allocate memory for arrays
3354
6.82k
    if (vcf_parse_format_alloc4(s, h, v, p, q, fmt) < 0)
3355
4
        return -1;
3356
3357
    // fill the sample fields; at beginning of the loop
3358
6.81k
    if (vcf_parse_format_fill5(s, h, v, p, q, fmt) < 0)
3359
204
        return -1;
3360
3361
    // write individual genotype information
3362
6.61k
    if (vcf_parse_format_gt6(s, h, v, p, q, fmt) < 0)
3363
0
        return -1;
3364
3365
    // validity checking
3366
6.61k
    if (vcf_parse_format_check7(h, v) < 0)
3367
100
        return -1;
3368
3369
6.51k
    return 0;
3370
6.61k
}
3371
3372
4.58k
static khint_t fix_chromosome(const bcf_hdr_t *h, vdict_t *d, const char *p) {
3373
    // Simple error recovery for chromosomes not defined in the header. It will not help when VCF header has
3374
    // been already printed, but will enable tools like vcfcheck to proceed.
3375
3376
4.58k
    kstring_t tmp = {0,0,0};
3377
4.58k
    khint_t k;
3378
4.58k
    int l;
3379
4.58k
    if (ksprintf(&tmp, "##contig=<ID=%s>", p) < 0)
3380
0
        return kh_end(d);
3381
4.58k
    bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
3382
4.58k
    free(tmp.s);
3383
4.58k
    int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
3384
4.58k
    if (res < 0) bcf_hrec_destroy(hrec);
3385
4.58k
    if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
3386
4.58k
    k = kh_get(vdict, d, p);
3387
3388
4.58k
    return k;
3389
4.58k
}
3390
3391
16.6k
static int vcf_parse_filter(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) {
3392
16.6k
    int i, n_flt = 1, max_n_flt = 0;
3393
16.6k
    char *r, *t;
3394
16.6k
    int32_t *a_flt = NULL;
3395
16.6k
    ks_tokaux_t aux1;
3396
16.6k
    khint_t k;
3397
16.6k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
3398
    // count the number of filters
3399
16.6k
    if (*(q-1) == ';') *(q-1) = 0;
3400
490M
    for (r = p; *r; ++r)
3401
490M
        if (*r == ';') ++n_flt;
3402
16.6k
    if (n_flt > max_n_flt) {
3403
16.6k
        a_flt = malloc(n_flt * sizeof(*a_flt));
3404
16.6k
        if (!a_flt) {
3405
0
            hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3406
0
            v->errcode |= BCF_ERR_LIMITS; // No appropriate code?
3407
0
            return -1;
3408
0
        }
3409
16.6k
        max_n_flt = n_flt;
3410
16.6k
    }
3411
    // add filters
3412
2.17M
    for (t = kstrtok(p, ";", &aux1), i = 0; t; t = kstrtok(0, 0, &aux1)) {
3413
2.15M
        *(char*)aux1.p = 0;
3414
2.15M
        k = kh_get(vdict, d, t);
3415
2.15M
        if (k == kh_end(d))
3416
43.9k
        {
3417
            // Simple error recovery for FILTERs not defined in the header. It will not help when VCF header has
3418
            // been already printed, but will enable tools like vcfcheck to proceed.
3419
43.9k
            hts_log_warning("FILTER '%s' is not defined in the header", t);
3420
43.9k
            kstring_t tmp = {0,0,0};
3421
43.9k
            int l;
3422
43.9k
            ksprintf(&tmp, "##FILTER=<ID=%s,Description=\"Dummy\">", t);
3423
43.9k
            bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
3424
43.9k
            free(tmp.s);
3425
43.9k
            int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
3426
43.9k
            if (res < 0) bcf_hrec_destroy(hrec);
3427
43.9k
            if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
3428
43.9k
            k = kh_get(vdict, d, t);
3429
43.9k
            v->errcode |= BCF_ERR_TAG_UNDEF;
3430
43.9k
            if (res || k == kh_end(d)) {
3431
44
                hts_log_error("Could not add dummy header for FILTER '%s' at %s:%"PRIhts_pos, t, bcf_seqname_safe(h,v), v->pos+1);
3432
44
                v->errcode |= BCF_ERR_TAG_INVALID;
3433
44
                free(a_flt);
3434
44
                return -1;
3435
44
            }
3436
43.9k
        }
3437
2.15M
        a_flt[i++] = kh_val(d, k).id;
3438
2.15M
    }
3439
3440
16.5k
    bcf_enc_vint(str, n_flt, a_flt, -1);
3441
16.5k
    free(a_flt);
3442
3443
16.5k
    return 0;
3444
16.6k
}
3445
3446
15.9k
static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) {
3447
15.9k
    static int extreme_int_warned = 0, negative_rlen_warned = 0;
3448
15.9k
    int max_n_val = 0, overflow = 0;
3449
15.9k
    char *r, *key;
3450
15.9k
    khint_t k;
3451
15.9k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
3452
15.9k
    int32_t *a_val = NULL;
3453
3454
15.9k
    v->n_info = 0;
3455
15.9k
    if (*(q-1) == ';') *(q-1) = 0;
3456
5.67M
    for (r = key = p;; ++r) {
3457
5.67M
        int c;
3458
5.67M
        char *val, *end;
3459
551M
        while (*r > '=' || (*r != ';' && *r != '=' && *r != 0)) r++;
3460
5.67M
        if (v->n_info == UINT16_MAX) {
3461
3
            hts_log_error("Too many INFO entries at %s:%"PRIhts_pos,
3462
3
                          bcf_seqname_safe(h,v), v->pos+1);
3463
3
            v->errcode |= BCF_ERR_LIMITS;
3464
3
            goto fail;
3465
3
        }
3466
5.67M
        val = end = NULL;
3467
5.67M
        c = *r; *r = 0;
3468
5.67M
        if (c == '=') {
3469
2.98M
            val = r + 1;
3470
3471
1.00G
            for (end = val; *end != ';' && *end != 0; ++end);
3472
2.98M
            c = *end; *end = 0;
3473
2.98M
        } else end = r;
3474
5.67M
        if ( !*key ) { if (c==0) break; r = end; key = r + 1; continue; }  // faulty VCF, ";;" in the INFO
3475
5.63M
        k = kh_get(vdict, d, key);
3476
5.63M
        if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_INFO] == 15)
3477
49.5k
        {
3478
49.5k
            hts_log_warning("INFO '%s' is not defined in the header, assuming Type=String", key);
3479
49.5k
            kstring_t tmp = {0,0,0};
3480
49.5k
            int l;
3481
49.5k
            ksprintf(&tmp, "##INFO=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", key);
3482
49.5k
            bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
3483
49.5k
            free(tmp.s);
3484
49.5k
            int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
3485
49.5k
            if (res < 0) bcf_hrec_destroy(hrec);
3486
49.5k
            if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
3487
49.5k
            k = kh_get(vdict, d, key);
3488
49.5k
            v->errcode |= BCF_ERR_TAG_UNDEF;
3489
49.5k
            if (res || k == kh_end(d)) {
3490
89
                hts_log_error("Could not add dummy header for INFO '%s' at %s:%"PRIhts_pos, key, bcf_seqname_safe(h,v), v->pos+1);
3491
89
                v->errcode |= BCF_ERR_TAG_INVALID;
3492
89
                goto fail;
3493
89
            }
3494
49.5k
        }
3495
5.63M
        uint32_t y = kh_val(d, k).info[BCF_HL_INFO];
3496
5.63M
        ++v->n_info;
3497
5.63M
        bcf_enc_int1(str, kh_val(d, k).id);
3498
5.63M
        if (val == 0) {
3499
2.64M
            bcf_enc_size(str, 0, BCF_BT_NULL);
3500
2.98M
        } else if ((y>>4&0xf) == BCF_HT_FLAG || (y>>4&0xf) == BCF_HT_STR) { // if Flag has a value, treat it as a string
3501
42.2k
            bcf_enc_vchar(str, end - val, val);
3502
2.93M
        } else { // int/float value/array
3503
2.93M
            int i, n_val;
3504
2.93M
            char *t, *te;
3505
945M
            for (t = val, n_val = 1; *t; ++t) // count the number of values
3506
942M
                if (*t == ',') ++n_val;
3507
            // Check both int and float size in one step for simplicity
3508
2.93M
            if (n_val > max_n_val) {
3509
4.42k
                int32_t *a_tmp = (int32_t *)realloc(a_val, n_val * sizeof(*a_val));
3510
4.42k
                if (!a_tmp) {
3511
0
                    hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3512
0
                    v->errcode |= BCF_ERR_LIMITS; // No appropriate code?
3513
0
                    goto fail;
3514
0
                }
3515
4.42k
                a_val = a_tmp;
3516
4.42k
                max_n_val = n_val;
3517
4.42k
            }
3518
2.93M
            if ((y>>4&0xf) == BCF_HT_INT) {
3519
2.26M
                i = 0, t = val;
3520
2.26M
                int64_t val1;
3521
2.26M
                int is_int64 = 0;
3522
#ifdef VCF_ALLOW_INT64
3523
                if ( n_val==1 )
3524
                {
3525
                    overflow = 0;
3526
                    long long int tmp_val = hts_str2int(val, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
3527
                    if ( te==val ) tmp_val = bcf_int32_missing;
3528
                    else if ( overflow || tmp_val<BCF_MIN_BT_INT64 || tmp_val>BCF_MAX_BT_INT64 )
3529
                    {
3530
                        if ( !extreme_int_warned )
3531
                        {
3532
                            hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1);
3533
                            extreme_int_warned = 1;
3534
                        }
3535
                        tmp_val = bcf_int32_missing;
3536
                    }
3537
                    else
3538
                        is_int64 = 1;
3539
                    val1 = tmp_val;
3540
                    t = te;
3541
                    i = 1;  // this is just to avoid adding another nested block...
3542
                }
3543
#endif
3544
595M
                for (; i < n_val; ++i, ++t)
3545
593M
                {
3546
593M
                    overflow = 0;
3547
593M
                    long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
3548
593M
                    if ( te==t ) tmp_val = bcf_int32_missing;
3549
6.17M
                    else if ( overflow || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 )
3550
1.19M
                    {
3551
1.19M
                        if ( !extreme_int_warned )
3552
1
                        {
3553
1
                            hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1);
3554
1
                            extreme_int_warned = 1;
3555
1
                        }
3556
1.19M
                        tmp_val = bcf_int32_missing;
3557
1.19M
                    }
3558
593M
                    a_val[i] = tmp_val;
3559
735M
                    for (t = te; *t && *t != ','; t++);
3560
593M
                }
3561
2.26M
                if (n_val == 1) {
3562
#ifdef VCF_ALLOW_INT64
3563
                    if ( is_int64 )
3564
                    {
3565
                        v->unpacked |= BCF_IS_64BIT;
3566
                        bcf_enc_long1(str, val1);
3567
                    }
3568
                    else
3569
                        bcf_enc_int1(str, (int32_t)val1);
3570
#else
3571
1.79M
                    val1 = a_val[0];
3572
1.79M
                    bcf_enc_int1(str, (int32_t)val1);
3573
1.79M
#endif
3574
1.79M
                } else {
3575
470k
                    bcf_enc_vint(str, n_val, a_val, -1);
3576
470k
                }
3577
2.26M
                if (n_val==1 && (val1!=bcf_int32_missing || is_int64)
3578
2.26M
                    && memcmp(key, "END", 4) == 0)
3579
0
                {
3580
0
                    if ( val1 <= v->pos )
3581
0
                    {
3582
0
                        if ( !negative_rlen_warned )
3583
0
                        {
3584
0
                            hts_log_warning("INFO/END=%"PRIhts_pos" is smaller than POS at %s:%"PRIhts_pos,val1,bcf_seqname_safe(h,v),v->pos+1);
3585
0
                            negative_rlen_warned = 1;
3586
0
                        }
3587
0
                    }
3588
0
                    else
3589
0
                        v->rlen = val1 - v->pos;
3590
0
                }
3591
2.26M
            } else if ((y>>4&0xf) == BCF_HT_REAL) {
3592
669k
                float *val_f = (float *)a_val;
3593
139M
                for (i = 0, t = val; i < n_val; ++i, ++t)
3594
138M
                {
3595
138M
                    overflow = 0;
3596
138M
                    val_f[i] = hts_str2dbl(t, &te, &overflow);
3597
138M
                    if ( te==t || overflow ) // conversion failed
3598
137M
                        bcf_float_set_missing(val_f[i]);
3599
172M
                    for (t = te; *t && *t != ','; t++);
3600
138M
                }
3601
669k
                bcf_enc_vfloat(str, n_val, val_f);
3602
669k
            }
3603
2.93M
        }
3604
5.63M
        if (c == 0) break;
3605
5.62M
        r = end;
3606
5.62M
        key = r + 1;
3607
5.62M
    }
3608
3609
15.8k
    free(a_val);
3610
15.8k
    return 0;
3611
3612
92
 fail:
3613
92
    free(a_val);
3614
92
    return -1;
3615
15.9k
}
3616
3617
int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
3618
17.5k
{
3619
17.5k
    int ret = -2, overflow = 0;
3620
17.5k
    char *p, *q, *r, *t;
3621
17.5k
    kstring_t *str;
3622
17.5k
    khint_t k;
3623
17.5k
    ks_tokaux_t aux;
3624
3625
//#define NOT_DOT(p) strcmp((p), ".")
3626
//#define NOT_DOT(p) (!(*p == '.' && !p[1]))
3627
//#define NOT_DOT(p) ((*p) != '.' || (p)[1])
3628
//#define NOT_DOT(p) (q-p != 1 || memcmp(p, ".\0", 2))
3629
84.6k
#define NOT_DOT(p) (memcmp(p, ".\0", 2))
3630
3631
17.5k
    if (!s || !h || !v || !(s->s))
3632
0
        return ret;
3633
3634
    // Assumed in lots of places, but we may as well spot this early
3635
17.5k
    assert(sizeof(float) == sizeof(int32_t));
3636
3637
    // Ensure string we parse has space to permit some over-flow when during
3638
    // parsing.  Eg to do memcmp(key, "END", 4) in vcf_parse_info over
3639
    // the more straight forward looking strcmp, giving a speed advantage.
3640
17.5k
    if (ks_resize(s, s->l+4) < 0)
3641
0
        return -1;
3642
3643
    // Force our memory to be initialised so we avoid the technicality of
3644
    // undefined behaviour in using a 4-byte memcmp.  (The reality is this
3645
    // almost certainly is never detected by the compiler so has no impact,
3646
    // but equally so this code has minimal (often beneficial) impact on
3647
    // performance too.)
3648
17.5k
    s->s[s->l+0] = 0;
3649
17.5k
    s->s[s->l+1] = 0;
3650
17.5k
    s->s[s->l+2] = 0;
3651
17.5k
    s->s[s->l+3] = 0;
3652
3653
17.5k
    bcf_clear1(v);
3654
17.5k
    str = &v->shared;
3655
17.5k
    memset(&aux, 0, sizeof(ks_tokaux_t));
3656
3657
    // CHROM
3658
17.5k
    if (!(p = kstrtok(s->s, "\t", &aux)))
3659
0
        goto err;
3660
17.5k
    *(q = (char*)aux.p) = 0;
3661
3662
17.5k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
3663
17.5k
    k = kh_get(vdict, d, p);
3664
17.5k
    if (k == kh_end(d)) {
3665
4.58k
        hts_log_warning("Contig '%s' is not defined in the header. (Quick workaround: index the file with tabix.)", p);
3666
4.58k
        v->errcode = BCF_ERR_CTG_UNDEF;
3667
4.58k
        if ((k = fix_chromosome(h, d, p)) == kh_end(d)) {
3668
84
            hts_log_error("Could not add dummy header for contig '%s'", p);
3669
84
            v->errcode |= BCF_ERR_CTG_INVALID;
3670
84
            goto err;
3671
84
        }
3672
4.58k
    }
3673
17.5k
    v->rid = kh_val(d, k).id;
3674
3675
    // POS
3676
17.5k
    if (!(p = kstrtok(0, 0, &aux)))
3677
361
        goto err;
3678
17.1k
    *(q = (char*)aux.p) = 0;
3679
3680
17.1k
    overflow = 0;
3681
17.1k
    char *tmp = p;
3682
17.1k
    v->pos = hts_str2uint(p, &p, 63, &overflow);
3683
17.1k
    if (overflow) {
3684
11
        hts_log_error("Position value '%s' is too large", tmp);
3685
11
        goto err;
3686
17.1k
    } else if ( *p ) {
3687
55
        hts_log_error("Could not parse the position '%s'", tmp);
3688
55
        goto err;
3689
17.0k
    } else {
3690
17.0k
        v->pos -= 1;
3691
17.0k
    }
3692
17.0k
    if (v->pos >= INT32_MAX)
3693
174
        v->unpacked |= BCF_IS_64BIT;
3694
3695
    // ID
3696
17.0k
    if (!(p = kstrtok(0, 0, &aux)))
3697
21
        goto err;
3698
17.0k
    *(q = (char*)aux.p) = 0;
3699
3700
17.0k
    if (NOT_DOT(p)) bcf_enc_vchar(str, q - p, p);
3701
266
    else bcf_enc_size(str, 0, BCF_BT_CHAR);
3702
3703
    // REF
3704
17.0k
    if (!(p = kstrtok(0, 0, &aux)))
3705
55
        goto err;
3706
17.0k
    *(q = (char*)aux.p) = 0;
3707
3708
17.0k
    bcf_enc_vchar(str, q - p, p);
3709
17.0k
    v->n_allele = 1, v->rlen = q - p;
3710
3711
    // ALT
3712
17.0k
    if (!(p = kstrtok(0, 0, &aux)))
3713
27
        goto err;
3714
16.9k
    *(q = (char*)aux.p) = 0;
3715
3716
16.9k
    if (NOT_DOT(p)) {
3717
143M
        for (r = t = p;; ++r) {
3718
143M
            if (*r == ',' || *r == 0) {
3719
4.28M
                if (v->n_allele == UINT16_MAX) {
3720
4
                    hts_log_error("Too many ALT alleles at %s:%"PRIhts_pos,
3721
4
                                  bcf_seqname_safe(h,v), v->pos+1);
3722
4
                    v->errcode |= BCF_ERR_LIMITS;
3723
4
                    goto err;
3724
4
                }
3725
4.28M
                bcf_enc_vchar(str, r - t, t);
3726
4.28M
                t = r + 1;
3727
4.28M
                ++v->n_allele;
3728
4.28M
            }
3729
143M
            if (r == q) break;
3730
143M
        }
3731
16.8k
    }
3732
3733
    // QUAL
3734
16.9k
    if (!(p = kstrtok(0, 0, &aux)))
3735
32
        goto err;
3736
16.9k
    *(q = (char*)aux.p) = 0;
3737
3738
16.9k
    if (NOT_DOT(p)) v->qual = atof(p);
3739
113
    else bcf_float_set_missing(v->qual);
3740
16.9k
    if ( v->max_unpack && !(v->max_unpack>>1) ) goto end; // BCF_UN_STR
3741
3742
    // FILTER
3743
16.9k
    if (!(p = kstrtok(0, 0, &aux)))
3744
31
        goto err;
3745
16.9k
    *(q = (char*)aux.p) = 0;
3746
3747
16.9k
    if (NOT_DOT(p)) {
3748
16.6k
        if (vcf_parse_filter(str, h, v, p, q)) {
3749
44
            goto err;
3750
44
        }
3751
16.6k
    } else bcf_enc_vint(str, 0, 0, -1);
3752
16.8k
    if ( v->max_unpack && !(v->max_unpack>>2) ) goto end; // BCF_UN_FLT
3753
3754
    // INFO
3755
16.8k
    if (!(p = kstrtok(0, 0, &aux)))
3756
79
        goto err;
3757
16.7k
    *(q = (char*)aux.p) = 0;
3758
3759
16.7k
    if (NOT_DOT(p)) {
3760
15.9k
        if (vcf_parse_info(str, h, v, p, q)) {
3761
92
            goto err;
3762
92
        }
3763
15.9k
    }
3764
16.6k
    if ( v->max_unpack && !(v->max_unpack>>3) ) goto end;
3765
3766
    // FORMAT; optional
3767
16.6k
    p = kstrtok(0, 0, &aux);
3768
16.6k
    if (p) {
3769
13.1k
        *(q = (char*)aux.p) = 0;
3770
3771
13.1k
        return vcf_parse_format(s, h, v, p, q) == 0 ? 0 : -2;
3772
13.1k
    } else {
3773
3.51k
        return 0;
3774
3.51k
    }
3775
3776
0
 end:
3777
0
    ret = 0;
3778
3779
896
 err:
3780
896
    return ret;
3781
0
}
3782
3783
int vcf_open_mode(char *mode, const char *fn, const char *format)
3784
0
{
3785
0
    if (format == NULL) {
3786
        // Try to pick a format based on the filename extension
3787
0
        char extension[HTS_MAX_EXT_LEN];
3788
0
        if (find_file_extension(fn, extension) < 0) return -1;
3789
0
        return vcf_open_mode(mode, fn, extension);
3790
0
    }
3791
0
    else if (strcasecmp(format, "bcf") == 0) strcpy(mode, "b");
3792
0
    else if (strcasecmp(format, "vcf") == 0) strcpy(mode, "");
3793
0
    else if (strcasecmp(format, "vcf.gz") == 0 || strcasecmp(format, "vcf.bgz") == 0) strcpy(mode, "z");
3794
0
    else return -1;
3795
3796
0
    return 0;
3797
0
}
3798
3799
int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
3800
18.4k
{
3801
18.4k
    int ret;
3802
18.4k
    ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
3803
18.4k
    if (ret < 0) return ret;
3804
17.5k
    return vcf_parse1(&fp->line, h, v);
3805
18.4k
}
3806
3807
static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt)
3808
0
{
3809
0
    uint8_t *ptr_start = ptr;
3810
0
    fmt->id = bcf_dec_typed_int1(ptr, &ptr);
3811
0
    fmt->n = bcf_dec_size(ptr, &ptr, &fmt->type);
3812
0
    fmt->size = fmt->n << bcf_type_shift[fmt->type];
3813
0
    fmt->p = ptr;
3814
0
    fmt->p_off  = ptr - ptr_start;
3815
0
    fmt->p_free = 0;
3816
0
    ptr += n_sample * fmt->size;
3817
0
    fmt->p_len = ptr - fmt->p;
3818
0
    return ptr;
3819
0
}
3820
3821
static inline uint8_t *bcf_unpack_info_core1(uint8_t *ptr, bcf_info_t *info)
3822
0
{
3823
0
    uint8_t *ptr_start = ptr;
3824
0
    int64_t len = 0;
3825
0
    info->key = bcf_dec_typed_int1(ptr, &ptr);
3826
0
    len = info->len = bcf_dec_size(ptr, &ptr, &info->type);
3827
0
    info->vptr = ptr;
3828
0
    info->vptr_off  = ptr - ptr_start;
3829
0
    info->vptr_free = 0;
3830
0
    info->v1.i = 0;
3831
0
    if (info->len == 1) {
3832
0
        switch(info->type) {
3833
0
        case BCF_BT_INT8:
3834
0
        case BCF_BT_CHAR:
3835
0
            info->v1.i = *(int8_t*)ptr;
3836
0
            break;
3837
0
        case BCF_BT_INT16:
3838
0
            info->v1.i = le_to_i16(ptr);
3839
0
            len <<= 1;
3840
0
            break;
3841
0
        case BCF_BT_INT32:
3842
0
            info->v1.i = le_to_i32(ptr);
3843
0
            len <<= 2;
3844
0
            break;
3845
0
        case BCF_BT_FLOAT:
3846
0
            info->v1.f = le_to_float(ptr);
3847
0
            len <<= 2;
3848
0
            break;
3849
0
        case BCF_BT_INT64:
3850
0
            info->v1.i = le_to_i64(ptr);
3851
0
            len <<= 3;
3852
0
            break;
3853
0
        }
3854
0
    } else {
3855
0
        len <<= bcf_type_shift[info->type];
3856
0
    }
3857
0
    ptr += len;
3858
3859
0
    info->vptr_len = ptr - info->vptr;
3860
0
    return ptr;
3861
0
}
3862
3863
int bcf_unpack(bcf1_t *b, int which)
3864
14.8k
{
3865
14.8k
    if ( !b->shared.l ) return 0; // Building a new BCF record from scratch
3866
14.8k
    uint8_t *ptr = (uint8_t*)b->shared.s, *ptr_ori;
3867
14.8k
    int i;
3868
14.8k
    bcf_dec_t *d = &b->d;
3869
14.8k
    if (which & BCF_UN_FLT) which |= BCF_UN_STR;
3870
14.8k
    if (which & BCF_UN_INFO) which |= BCF_UN_SHR;
3871
14.8k
    if ((which&BCF_UN_STR) && !(b->unpacked&BCF_UN_STR))
3872
14.8k
    {
3873
14.8k
        kstring_t tmp;
3874
3875
        // ID
3876
14.8k
        tmp.l = 0; tmp.s = d->id; tmp.m = d->m_id;
3877
14.8k
        ptr_ori = ptr;
3878
14.8k
        ptr = bcf_fmt_sized_array(&tmp, ptr);
3879
14.8k
        b->unpack_size[0] = ptr - ptr_ori;
3880
14.8k
        kputc_('\0', &tmp);
3881
14.8k
        d->id = tmp.s; d->m_id = tmp.m;
3882
3883
        // REF and ALT are in a single block (d->als) and d->alleles are pointers into this block
3884
14.8k
        hts_expand(char*, b->n_allele, d->m_allele, d->allele); // NM: hts_expand() is a macro
3885
14.8k
        tmp.l = 0; tmp.s = d->als; tmp.m = d->m_als;
3886
14.8k
        ptr_ori = ptr;
3887
2.49M
        for (i = 0; i < b->n_allele; ++i) {
3888
            // Use offset within tmp.s as realloc may change pointer
3889
2.48M
            d->allele[i] = (char *)(intptr_t)tmp.l;
3890
2.48M
            ptr = bcf_fmt_sized_array(&tmp, ptr);
3891
2.48M
            kputc_('\0', &tmp);
3892
2.48M
        }
3893
14.8k
        b->unpack_size[1] = ptr - ptr_ori;
3894
14.8k
        d->als = tmp.s; d->m_als = tmp.m;
3895
3896
        // Convert our offsets within tmp.s back to pointers again
3897
2.49M
        for (i = 0; i < b->n_allele; ++i)
3898
2.48M
            d->allele[i] = d->als + (ptrdiff_t)d->allele[i];
3899
14.8k
        b->unpacked |= BCF_UN_STR;
3900
14.8k
    }
3901
14.8k
    if ((which&BCF_UN_FLT) && !(b->unpacked&BCF_UN_FLT)) { // FILTER
3902
14.8k
        ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1];
3903
14.8k
        ptr_ori = ptr;
3904
14.8k
        if (*ptr>>4) {
3905
14.2k
            int type;
3906
14.2k
            d->n_flt = bcf_dec_size(ptr, &ptr, &type);
3907
14.2k
            hts_expand(int, d->n_flt, d->m_flt, d->flt);
3908
1.58M
            for (i = 0; i < d->n_flt; ++i)
3909
1.56M
                d->flt[i] = bcf_dec_int1(ptr, type, &ptr);
3910
14.2k
        } else ++ptr, d->n_flt = 0;
3911
14.8k
        b->unpack_size[2] = ptr - ptr_ori;
3912
14.8k
        b->unpacked |= BCF_UN_FLT;
3913
14.8k
    }
3914
14.8k
    if ((which&BCF_UN_INFO) && !(b->unpacked&BCF_UN_INFO)) { // INFO
3915
0
        ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1] + b->unpack_size[2];
3916
0
        hts_expand(bcf_info_t, b->n_info, d->m_info, d->info);
3917
0
        for (i = 0; i < d->m_info; ++i) d->info[i].vptr_free = 0;
3918
0
        for (i = 0; i < b->n_info; ++i)
3919
0
            ptr = bcf_unpack_info_core1(ptr, &d->info[i]);
3920
0
        b->unpacked |= BCF_UN_INFO;
3921
0
    }
3922
14.8k
    if ((which&BCF_UN_FMT) && b->n_sample && !(b->unpacked&BCF_UN_FMT)) { // FORMAT
3923
0
        ptr = (uint8_t*)b->indiv.s;
3924
0
        hts_expand(bcf_fmt_t, b->n_fmt, d->m_fmt, d->fmt);
3925
0
        for (i = 0; i < d->m_fmt; ++i) d->fmt[i].p_free = 0;
3926
0
        for (i = 0; i < b->n_fmt; ++i)
3927
0
            ptr = bcf_unpack_fmt_core1(ptr, b->n_sample, &d->fmt[i]);
3928
0
        b->unpacked |= BCF_UN_FMT;
3929
0
    }
3930
14.8k
    return 0;
3931
14.8k
}
3932
3933
int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
3934
14.8k
{
3935
14.8k
    int i;
3936
14.8k
    int32_t max_dt_id = h->n[BCF_DT_ID];
3937
14.8k
    const char *chrom = bcf_seqname(h, v);
3938
14.8k
    if (!chrom) {
3939
0
        hts_log_error("Invalid BCF, CONTIG id=%d not present in the header",
3940
0
                      v->rid);
3941
0
        errno = EINVAL;
3942
0
        return -1;
3943
0
    }
3944
3945
14.8k
    bcf_unpack((bcf1_t*)v, BCF_UN_ALL & ~(BCF_UN_INFO|BCF_UN_FMT));
3946
3947
    // Cache of key lengths so we don't keep repeatedly using them.
3948
    // This assumes we're not modifying the header between successive calls
3949
    // to vcf_format, but that would lead to many other forms of breakage
3950
    // so it feels like a valid assumption to make.
3951
    //
3952
    // We cannot just do this in bcf_hdr_sync as some code (eg bcftools
3953
    // annotate) manipulates the headers directly without calling sync to
3954
    // refresh the data structures.  So we must do just-in-time length
3955
    // calculation during writes instead.
3956
14.8k
    bcf_hdr_aux_t *aux = get_hdr_aux(h);
3957
14.8k
    if (!aux->key_len) {
3958
3.66k
        if (!(aux->key_len = calloc(h->n[BCF_DT_ID]+1, sizeof(*aux->key_len))))
3959
0
            return -1;
3960
3.66k
    }
3961
14.8k
    size_t *key_len = aux->key_len;
3962
3963
14.8k
    kputs(chrom, s); // CHROM
3964
14.8k
    kputc_('\t', s); kputll(v->pos + 1, s); // POS
3965
14.8k
    kputc_('\t', s); kputs(v->d.id ? v->d.id : ".", s); // ID
3966
14.8k
    kputc_('\t', s); // REF
3967
14.8k
    if (v->n_allele > 0) kputs(v->d.allele[0], s);
3968
0
    else kputc_('.', s);
3969
14.8k
    kputc_('\t', s); // ALT
3970
14.8k
    if (v->n_allele > 1) {
3971
2.48M
        for (i = 1; i < v->n_allele; ++i) {
3972
2.46M
            if (i > 1) kputc_(',', s);
3973
2.46M
            kputs(v->d.allele[i], s);
3974
2.46M
        }
3975
14.4k
    } else kputc_('.', s);
3976
14.8k
    kputc_('\t', s); // QUAL
3977
14.8k
    if ( bcf_float_is_missing(v->qual) ) kputc_('.', s); // QUAL
3978
14.7k
    else kputd(v->qual, s);
3979
14.8k
    kputc_('\t', s); // FILTER
3980
14.8k
    if (v->d.n_flt) {
3981
1.58M
        for (i = 0; i < v->d.n_flt; ++i) {
3982
1.56M
            int32_t idx = v->d.flt[i];
3983
1.56M
            if (idx < 0 || idx >= max_dt_id
3984
1.56M
                || h->id[BCF_DT_ID][idx].key == NULL) {
3985
0
                hts_log_error("Invalid BCF, the FILTER tag id=%d at %s:%"PRIhts_pos" not present in the header",
3986
0
                              idx, bcf_seqname_safe(h, v), v->pos + 1);
3987
0
                errno = EINVAL;
3988
0
                return -1;
3989
0
            }
3990
1.56M
            if (i) kputc_(';', s);
3991
1.56M
            if (!key_len[idx])
3992
89.0k
                key_len[idx] = strlen(h->id[BCF_DT_ID][idx].key);
3993
1.56M
            kputsn(h->id[BCF_DT_ID][idx].key, key_len[idx], s);
3994
1.56M
        }
3995
14.2k
    } else kputc_('.', s);
3996
3997
14.8k
    kputc_('\t', s); // INFO
3998
14.8k
    if (v->n_info) {
3999
5.24k
        uint8_t *ptr = (uint8_t *)v->shared.s + v->unpack_size[0] + v->unpack_size[1] + v->unpack_size[2];
4000
5.24k
        int first = 1;
4001
5.24k
        bcf_info_t *info = v->d.info;
4002
4003
        // Note if we duplicate this code into custom packed and unpacked
4004
        // implementations then we gain a bit more speed, particularly with
4005
        // clang 13 (up to 5%).  Not sure why this is, but code duplication
4006
        // isn't pleasant and it's still faster adding packed support than
4007
        // not so it's a win, just not as good as it should be.
4008
5.24k
        const int info_packed = !(v->unpacked & BCF_UN_INFO) && v->shared.l;
4009
2.90M
        for (i = 0; i < v->n_info; ++i) {
4010
2.89M
            bcf_info_t in, *z;
4011
2.89M
            if (info_packed) {
4012
                // Use a local bcf_info_t when data is packed
4013
2.89M
                z = &in;
4014
2.89M
                z->key  = bcf_dec_typed_int1(ptr, &ptr);
4015
2.89M
                z->len  = bcf_dec_size(ptr, &ptr, &z->type);
4016
2.89M
                z->vptr = ptr;
4017
2.89M
                ptr += z->len << bcf_type_shift[z->type];
4018
2.89M
            } else {
4019
                // Else previously unpacked INFO struct
4020
0
                z = &info[i];
4021
4022
                // Also potentially since deleted
4023
0
                if ( !z->vptr ) continue;
4024
0
            }
4025
4026
2.89M
            bcf_idpair_t *id = z->key >= 0 && z->key < max_dt_id
4027
2.89M
                ? &h->id[BCF_DT_ID][z->key]
4028
2.89M
                : NULL;
4029
4030
2.89M
            if (!id || !id->key) {
4031
0
                hts_log_error("Invalid BCF, the INFO tag id=%d is %s at %s:%"PRIhts_pos,
4032
0
                              z->key,
4033
0
                              z->key < 0 ? "negative"
4034
0
                              : (z->key >= max_dt_id ? "too large" : "not present in the header"),
4035
0
                              bcf_seqname_safe(h, v), v->pos+1);
4036
0
                errno = EINVAL;
4037
0
                return -1;
4038
0
            }
4039
4040
            // KEY
4041
2.89M
            if (!key_len[z->key])
4042
28.6k
                key_len[z->key] = strlen(id->key);
4043
2.89M
            size_t id_len = key_len[z->key];
4044
2.89M
            if (ks_resize(s, s->l + 3 + id_len) < 0)
4045
0
                return -1;
4046
2.89M
            char *sptr = s->s + s->l;
4047
2.89M
            if ( !first ) {
4048
2.89M
                *sptr++ = ';';
4049
2.89M
                s->l++;
4050
2.89M
            }
4051
2.89M
            first = 0;
4052
2.89M
            memcpy(sptr, id->key, id_len);
4053
2.89M
            s->l += id_len;
4054
4055
            // VALUE
4056
2.89M
            if (z->len <= 0) continue;
4057
1.46M
            sptr[id_len] = '=';
4058
1.46M
            s->l++;
4059
4060
1.46M
            if (z->len != 1 || info_packed) {
4061
1.46M
                bcf_fmt_array(s, z->len, z->type, z->vptr);
4062
1.46M
            } else {
4063
                // Single length vectors are unpacked into their
4064
                // own info.v1 union and handled separately.
4065
0
                if (z->type == BCF_BT_FLOAT) {
4066
0
                    if ( bcf_float_is_missing(z->v1.f) )
4067
0
                        kputc_('.', s);
4068
0
                    else
4069
0
                        kputd(z->v1.f, s);
4070
0
                } else if (z->type == BCF_BT_CHAR) {
4071
0
                    kputc_(z->v1.i, s);
4072
0
                } else if (z->type < BCF_BT_INT64) {
4073
0
                    int64_t missing[] = {
4074
0
                        0, // BCF_BT_NULL
4075
0
                        bcf_int8_missing,
4076
0
                        bcf_int16_missing,
4077
0
                        bcf_int32_missing,
4078
0
                    };
4079
0
                    if (z->v1.i == missing[z->type])
4080
0
                        kputc_('.', s);
4081
0
                    else
4082
0
                        kputw(z->v1.i, s);
4083
0
                } else if (z->type == BCF_BT_INT64) {
4084
0
                    if (z->v1.i == bcf_int64_missing)
4085
0
                        kputc_('.', s);
4086
0
                    else
4087
0
                        kputll(z->v1.i, s);
4088
0
                } else {
4089
0
                    hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, z->type, bcf_seqname_safe(h, v), v->pos+1);
4090
0
                    errno = EINVAL;
4091
0
                    return -1;
4092
0
                }
4093
0
            }
4094
1.46M
        }
4095
5.24k
        if ( first ) kputc_('.', s);
4096
9.63k
    } else kputc_('.', s);
4097
4098
    // FORMAT and individual information
4099
14.8k
    if (v->n_sample) {
4100
5.96k
        int i,j;
4101
5.96k
        if ( v->n_fmt) {
4102
5.92k
            uint8_t *ptr = (uint8_t *)v->indiv.s;
4103
5.92k
            int gt_i = -1;
4104
5.92k
            bcf_fmt_t *fmt = v->d.fmt;
4105
5.92k
            int first = 1;
4106
5.92k
            int fmt_packed = !(v->unpacked & BCF_UN_FMT);
4107
4108
5.92k
            if (fmt_packed) {
4109
                // Local fmt as we have an array of num FORMAT keys,
4110
                // each of which points to N.Sample values.
4111
4112
                // No real gain to be had in handling unpacked data here,
4113
                // but it doesn't cost us much in complexity either and
4114
                // it gives us flexibility.
4115
5.92k
                fmt = malloc(v->n_fmt * sizeof(*fmt));
4116
5.92k
                if (!fmt)
4117
0
                    return -1;
4118
5.92k
            }
4119
4120
            // KEYS
4121
117k
            for (i = 0; i < (int)v->n_fmt; ++i) {
4122
111k
                bcf_fmt_t *z;
4123
111k
                z = &fmt[i];
4124
111k
                if (fmt_packed) {
4125
111k
                    z->id   = bcf_dec_typed_int1(ptr, &ptr);
4126
111k
                    z->n    = bcf_dec_size(ptr, &ptr, &z->type);
4127
111k
                    z->p    = ptr;
4128
111k
                    z->size = z->n << bcf_type_shift[z->type];
4129
111k
                    ptr += v->n_sample * z->size;
4130
111k
                }
4131
111k
                if ( !z->p ) continue;
4132
111k
                kputc_(!first ? ':' : '\t', s); first = 0;
4133
4134
111k
                bcf_idpair_t *id = z->id >= 0 && z->id < max_dt_id
4135
111k
                    ? &h->id[BCF_DT_ID][z->id]
4136
111k
                    : NULL;
4137
4138
111k
                if (!id || !id->key) {
4139
0
                    hts_log_error("Invalid BCF, the FORMAT tag id=%d at %s:%"PRIhts_pos" not present in the header", z->id, bcf_seqname_safe(h, v), v->pos+1);
4140
0
                    errno = EINVAL;
4141
0
                    return -1;
4142
0
                }
4143
4144
111k
                if (!key_len[z->id])
4145
101k
                    key_len[z->id] = strlen(id->key);
4146
111k
                size_t id_len = key_len[z->id];
4147
111k
                kputsn(id->key, id_len, s);
4148
111k
                if (id_len == 2 && id->key[0] == 'G' && id->key[1] == 'T')
4149
3.08k
                    gt_i = i;
4150
111k
            }
4151
5.92k
            if ( first ) kputsn("\t.", 2, s);
4152
4153
            // VALUES per sample
4154
30.2k
            for (j = 0; j < v->n_sample; ++j) {
4155
24.3k
                kputc_('\t', s);
4156
24.3k
                first = 1;
4157
24.3k
                bcf_fmt_t *f = fmt;
4158
1.22M
                for (i = 0; i < (int)v->n_fmt; i++, f++) {
4159
1.21M
                    if ( !f->p ) continue;
4160
1.21M
                    if (!first) kputc_(':', s);
4161
1.21M
                    first = 0;
4162
1.21M
                    if (gt_i == i) {
4163
8.17k
                        bcf_format_gt(f,j,s);
4164
8.17k
                        break;
4165
8.17k
                    }
4166
1.20M
                    else if (f->n == 1)
4167
315k
                        bcf_fmt_array1(s, f->type, f->p + j * (size_t)f->size);
4168
887k
                    else
4169
887k
                        bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size);
4170
1.21M
                }
4171
4172
                // Simpler loop post GT and at least 1 iteration
4173
60.7k
                for (i++, f++; i < (int)v->n_fmt; i++, f++) {
4174
36.4k
                    if ( !f->p ) continue;
4175
36.4k
                    kputc_(':', s);
4176
36.4k
                    if (f->n == 1)
4177
1.06k
                        bcf_fmt_array1(s, f->type, f->p + j * (size_t)f->size);
4178
35.3k
                    else
4179
35.3k
                        bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size);
4180
36.4k
                }
4181
24.3k
                if ( first ) kputc_('.', s);
4182
24.3k
            }
4183
5.92k
            if (fmt_packed)
4184
5.92k
                free(fmt);
4185
5.92k
        }
4186
37
        else
4187
207
            for (j=0; j<=v->n_sample; j++)
4188
170
                kputsn("\t.", 2, s);
4189
5.96k
    }
4190
14.8k
    kputc('\n', s);
4191
14.8k
    return 0;
4192
14.8k
}
4193
4194
int vcf_write_line(htsFile *fp, kstring_t *line)
4195
0
{
4196
0
    int ret;
4197
0
    if ( line->s[line->l-1]!='\n' ) kputc('\n',line);
4198
0
    if ( fp->format.compression!=no_compression )
4199
0
        ret = bgzf_write(fp->fp.bgzf, line->s, line->l);
4200
0
    else
4201
0
        ret = hwrite(fp->fp.hfile, line->s, line->l);
4202
0
    return ret==line->l ? 0 : -1;
4203
0
}
4204
4205
int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
4206
14.8k
{
4207
14.8k
    ssize_t ret;
4208
14.8k
    fp->line.l = 0;
4209
14.8k
    if (vcf_format1(h, v, &fp->line) != 0)
4210
0
        return -1;
4211
14.8k
    if ( fp->format.compression!=no_compression ) {
4212
0
        if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
4213
0
            return -1;
4214
0
        ret = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
4215
14.8k
    } else {
4216
14.8k
        ret = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
4217
14.8k
    }
4218
4219
14.8k
    if (fp->idx && fp->format.compression == bgzf) {
4220
0
        int tid;
4221
0
        if ((tid = hts_idx_tbi_name(fp->idx, v->rid, bcf_seqname_safe(h, v))) < 0)
4222
0
            return -1;
4223
4224
0
        if (bgzf_idx_push(fp->fp.bgzf, fp->idx,
4225
0
                          tid, v->pos, v->pos + v->rlen,
4226
0
                          bgzf_tell(fp->fp.bgzf), 1) < 0)
4227
0
            return -1;
4228
0
    }
4229
4230
14.8k
    return ret==fp->line.l ? 0 : -1;
4231
14.8k
}
4232
4233
/************************
4234
 * Data access routines *
4235
 ************************/
4236
4237
int bcf_hdr_id2int(const bcf_hdr_t *h, int which, const char *id)
4238
8.74k
{
4239
8.74k
    khint_t k;
4240
8.74k
    vdict_t *d = (vdict_t*)h->dict[which];
4241
8.74k
    k = kh_get(vdict, d, id);
4242
8.74k
    return k == kh_end(d)? -1 : kh_val(d, k).id;
4243
8.74k
}
4244
4245
4246
/********************
4247
 *** BCF indexing ***
4248
 ********************/
4249
4250
// Calculate number of index levels given min_shift and the header contig
4251
// list.  Also returns number of contigs in *nids_out.
4252
static int idx_calc_n_lvls_ids(const bcf_hdr_t *h, int min_shift,
4253
                               int starting_n_lvls, int *nids_out)
4254
0
{
4255
0
    int n_lvls, i, nids = 0;
4256
0
    int64_t max_len = 0, s;
4257
4258
0
    for (i = 0; i < h->n[BCF_DT_CTG]; ++i)
4259
0
    {
4260
0
        if ( !h->id[BCF_DT_CTG][i].val ) continue;
4261
0
        if ( max_len < h->id[BCF_DT_CTG][i].val->info[0] )
4262
0
            max_len = h->id[BCF_DT_CTG][i].val->info[0];
4263
0
        nids++;
4264
0
    }
4265
0
    if ( !max_len ) max_len = (1LL<<31) - 1;  // In case contig line is broken.
4266
0
    max_len += 256;
4267
0
    s = 1LL << (min_shift + starting_n_lvls * 3);
4268
0
    for (n_lvls = starting_n_lvls; max_len > s; ++n_lvls, s <<= 3);
4269
4270
0
    if (nids_out) *nids_out = nids;
4271
0
    return n_lvls;
4272
0
}
4273
4274
hts_idx_t *bcf_index(htsFile *fp, int min_shift)
4275
0
{
4276
0
    int n_lvls;
4277
0
    bcf1_t *b = NULL;
4278
0
    hts_idx_t *idx = NULL;
4279
0
    bcf_hdr_t *h;
4280
0
    int r;
4281
0
    h = bcf_hdr_read(fp);
4282
0
    if ( !h ) return NULL;
4283
0
    int nids = 0;
4284
0
    n_lvls = idx_calc_n_lvls_ids(h, min_shift, 0, &nids);
4285
0
    idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
4286
0
    if (!idx) goto fail;
4287
0
    b = bcf_init1();
4288
0
    if (!b) goto fail;
4289
0
    while ((r = bcf_read1(fp,h, b)) >= 0) {
4290
0
        int ret;
4291
0
        ret = hts_idx_push(idx, b->rid, b->pos, b->pos + b->rlen, bgzf_tell(fp->fp.bgzf), 1);
4292
0
        if (ret < 0) goto fail;
4293
0
    }
4294
0
    if (r < -1) goto fail;
4295
0
    hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf));
4296
0
    bcf_destroy1(b);
4297
0
    bcf_hdr_destroy(h);
4298
0
    return idx;
4299
4300
0
 fail:
4301
0
    hts_idx_destroy(idx);
4302
0
    bcf_destroy1(b);
4303
0
    bcf_hdr_destroy(h);
4304
0
    return NULL;
4305
0
}
4306
4307
hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx)
4308
0
{
4309
0
    return fnidx? hts_idx_load2(fn, fnidx) : bcf_index_load(fn);
4310
0
}
4311
4312
hts_idx_t *bcf_index_load3(const char *fn, const char *fnidx, int flags)
4313
0
{
4314
0
    return hts_idx_load3(fn, fnidx, HTS_FMT_CSI, flags);
4315
0
}
4316
4317
int bcf_index_build3(const char *fn, const char *fnidx, int min_shift, int n_threads)
4318
0
{
4319
0
    htsFile *fp;
4320
0
    hts_idx_t *idx;
4321
0
    tbx_t *tbx;
4322
0
    int ret;
4323
0
    if ((fp = hts_open(fn, "rb")) == 0) return -2;
4324
0
    if (n_threads)
4325
0
        hts_set_threads(fp, n_threads);
4326
0
    if ( fp->format.compression!=bgzf ) { hts_close(fp); return -3; }
4327
0
    switch (fp->format.format) {
4328
0
        case bcf:
4329
0
            if (!min_shift) {
4330
0
                hts_log_error("TBI indices for BCF files are not supported");
4331
0
                ret = -1;
4332
0
            } else {
4333
0
                idx = bcf_index(fp, min_shift);
4334
0
                if (idx) {
4335
0
                    ret = hts_idx_save_as(idx, fn, fnidx, HTS_FMT_CSI);
4336
0
                    if (ret < 0) ret = -4;
4337
0
                    hts_idx_destroy(idx);
4338
0
                }
4339
0
                else ret = -1;
4340
0
            }
4341
0
            break;
4342
4343
0
        case vcf:
4344
0
            tbx = tbx_index(hts_get_bgzfp(fp), min_shift, &tbx_conf_vcf);
4345
0
            if (tbx) {
4346
0
                ret = hts_idx_save_as(tbx->idx, fn, fnidx, min_shift > 0 ? HTS_FMT_CSI : HTS_FMT_TBI);
4347
0
                if (ret < 0) ret = -4;
4348
0
                tbx_destroy(tbx);
4349
0
            }
4350
0
            else ret = -1;
4351
0
            break;
4352
4353
0
        default:
4354
0
            ret = -3;
4355
0
            break;
4356
0
    }
4357
0
    hts_close(fp);
4358
0
    return ret;
4359
0
}
4360
4361
int bcf_index_build2(const char *fn, const char *fnidx, int min_shift)
4362
0
{
4363
0
    return bcf_index_build3(fn, fnidx, min_shift, 0);
4364
0
}
4365
4366
int bcf_index_build(const char *fn, int min_shift)
4367
0
{
4368
0
    return bcf_index_build3(fn, NULL, min_shift, 0);
4369
0
}
4370
4371
// Initialise fp->idx for the current format type.
4372
// This must be called after the header has been written but no other data.
4373
0
static int vcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) {
4374
0
    int n_lvls, fmt;
4375
4376
0
    if (min_shift == 0) {
4377
0
        min_shift = 14;
4378
0
        n_lvls = 5;
4379
0
        fmt = HTS_FMT_TBI;
4380
0
    } else {
4381
        // Set initial n_lvls to match tbx_index()
4382
0
        int starting_n_lvls = (TBX_MAX_SHIFT - min_shift + 2) / 3;
4383
        // Increase if necessary
4384
0
        n_lvls = idx_calc_n_lvls_ids(h, min_shift, starting_n_lvls, NULL);
4385
0
        fmt = HTS_FMT_CSI;
4386
0
    }
4387
4388
0
    fp->idx = hts_idx_init(0, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
4389
0
    if (!fp->idx) return -1;
4390
4391
    // Tabix meta data, added even in CSI for VCF
4392
0
    uint8_t conf[4*7];
4393
0
    u32_to_le(TBX_VCF, conf+0);  // fmt
4394
0
    u32_to_le(1,       conf+4);  // name col
4395
0
    u32_to_le(2,       conf+8);  // beg col
4396
0
    u32_to_le(0,       conf+12); // end col
4397
0
    u32_to_le('#',     conf+16); // comment
4398
0
    u32_to_le(0,       conf+20); // n.skip
4399
0
    u32_to_le(0,       conf+24); // ref name len
4400
0
    if (hts_idx_set_meta(fp->idx, sizeof(conf)*sizeof(*conf), (uint8_t *)conf, 1) < 0) {
4401
0
        hts_idx_destroy(fp->idx);
4402
0
        fp->idx = NULL;
4403
0
        return -1;
4404
0
    }
4405
0
    fp->fnidx = fnidx;
4406
4407
0
    return 0;
4408
0
}
4409
4410
// Initialise fp->idx for the current format type.
4411
// This must be called after the header has been written but no other data.
4412
0
int bcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) {
4413
0
    int n_lvls, nids = 0;
4414
4415
0
    if (fp->format.compression != bgzf) {
4416
0
        hts_log_error("Indexing is only supported on BGZF-compressed files");
4417
0
        return -3; // Matches no-compression return for bcf_index_build3()
4418
0
    }
4419
4420
0
    if (fp->format.format == vcf)
4421
0
        return vcf_idx_init(fp, h, min_shift, fnidx);
4422
4423
0
    if (!min_shift)
4424
0
        min_shift = 14;
4425
4426
0
    n_lvls = idx_calc_n_lvls_ids(h, min_shift, 0, &nids);
4427
4428
0
    fp->idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
4429
0
    if (!fp->idx) return -1;
4430
0
    fp->fnidx = fnidx;
4431
4432
0
    return 0;
4433
0
}
4434
4435
// Finishes an index. Call after the last record has been written.
4436
// Returns 0 on success, <0 on failure.
4437
//
4438
// NB: same format as SAM/BAM as it uses bgzf.
4439
0
int bcf_idx_save(htsFile *fp) {
4440
0
    return sam_idx_save(fp);
4441
0
}
4442
4443
/*****************
4444
 *** Utilities ***
4445
 *****************/
4446
4447
int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src)
4448
0
{
4449
0
    int i, ndst_ori = dst->nhrec, need_sync = 0, ret = 0, res;
4450
0
    for (i=0; i<src->nhrec; i++)
4451
0
    {
4452
0
        if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value )
4453
0
        {
4454
0
            int j;
4455
0
            for (j=0; j<ndst_ori; j++)
4456
0
            {
4457
0
                if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
4458
4459
                // Checking only the key part of generic lines, otherwise
4460
                // the VCFs are too verbose. Should we perhaps add a flag
4461
                // to bcf_hdr_combine() and make this optional?
4462
0
                if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break;
4463
0
            }
4464
0
            if ( j>=ndst_ori ) {
4465
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4466
0
                if (res < 0) return -1;
4467
0
                need_sync += res;
4468
0
            }
4469
0
        }
4470
0
        else if ( src->hrec[i]->type==BCF_HL_STR )
4471
0
        {
4472
            // NB: we are ignoring fields without ID
4473
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4474
0
            if ( j>=0 )
4475
0
            {
4476
0
                bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key);
4477
0
                if ( !rec ) {
4478
0
                    res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4479
0
                    if (res < 0) return -1;
4480
0
                    need_sync += res;
4481
0
                }
4482
0
            }
4483
0
        }
4484
0
        else
4485
0
        {
4486
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4487
0
            assert( j>=0 ); // this should always be true for valid VCFs
4488
4489
0
            bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL);
4490
0
            if ( !rec ) {
4491
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4492
0
                if (res < 0) return -1;
4493
0
                need_sync += res;
4494
0
            } else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT )
4495
0
            {
4496
                // Check that both records are of the same type. The bcf_hdr_id2length
4497
                // macro cannot be used here because dst header is not synced yet.
4498
0
                vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID];
4499
0
                vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID];
4500
0
                khint_t k_src  = kh_get(vdict, d_src, src->hrec[i]->vals[0]);
4501
0
                khint_t k_dst  = kh_get(vdict, d_dst, src->hrec[i]->vals[0]);
4502
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) )
4503
0
                {
4504
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths",
4505
0
                        src->hrec[i]->vals[0]);
4506
0
                    ret |= 1;
4507
0
                }
4508
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) )
4509
0
                {
4510
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different types",
4511
0
                        src->hrec[i]->vals[0]);
4512
0
                    ret |= 1;
4513
0
                }
4514
0
            }
4515
0
        }
4516
0
    }
4517
0
    if ( need_sync ) {
4518
0
        if (bcf_hdr_sync(dst) < 0) return -1;
4519
0
    }
4520
0
    return ret;
4521
0
}
4522
4523
bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src)
4524
0
{
4525
0
    if ( !dst )
4526
0
    {
4527
        // this will effectively strip existing IDX attributes from src to become dst
4528
0
        dst = bcf_hdr_init("r");
4529
0
        kstring_t htxt = {0,0,0};
4530
0
        if (bcf_hdr_format(src, 0, &htxt) < 0) {
4531
0
            free(htxt.s);
4532
0
            return NULL;
4533
0
        }
4534
0
        if ( bcf_hdr_parse(dst, htxt.s) < 0 ) {
4535
0
            bcf_hdr_destroy(dst);
4536
0
            dst = NULL;
4537
0
        }
4538
0
        free(htxt.s);
4539
0
        return dst;
4540
0
    }
4541
4542
0
    int i, ndst_ori = dst->nhrec, need_sync = 0, res;
4543
0
    for (i=0; i<src->nhrec; i++)
4544
0
    {
4545
0
        if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value )
4546
0
        {
4547
0
            int j;
4548
0
            for (j=0; j<ndst_ori; j++)
4549
0
            {
4550
0
                if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
4551
4552
                // Checking only the key part of generic lines, otherwise
4553
                // the VCFs are too verbose. Should we perhaps add a flag
4554
                // to bcf_hdr_combine() and make this optional?
4555
0
                if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break;
4556
0
            }
4557
0
            if ( j>=ndst_ori ) {
4558
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4559
0
                if (res < 0) return NULL;
4560
0
                need_sync += res;
4561
0
            }
4562
0
        }
4563
0
        else if ( src->hrec[i]->type==BCF_HL_STR )
4564
0
        {
4565
            // NB: we are ignoring fields without ID
4566
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4567
0
            if ( j>=0 )
4568
0
            {
4569
0
                bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key);
4570
0
                if ( !rec ) {
4571
0
                    res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4572
0
                    if (res < 0) return NULL;
4573
0
                    need_sync += res;
4574
0
                }
4575
0
            }
4576
0
        }
4577
0
        else
4578
0
        {
4579
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
4580
0
            assert( j>=0 ); // this should always be true for valid VCFs
4581
4582
0
            bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL);
4583
0
            if ( !rec ) {
4584
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
4585
0
                if (res < 0) return NULL;
4586
0
                need_sync += res;
4587
0
            } else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT )
4588
0
            {
4589
                // Check that both records are of the same type. The bcf_hdr_id2length
4590
                // macro cannot be used here because dst header is not synced yet.
4591
0
                vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID];
4592
0
                vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID];
4593
0
                khint_t k_src  = kh_get(vdict, d_src, src->hrec[i]->vals[0]);
4594
0
                khint_t k_dst  = kh_get(vdict, d_dst, src->hrec[i]->vals[0]);
4595
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) )
4596
0
                {
4597
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths",
4598
0
                        src->hrec[i]->vals[0]);
4599
0
                }
4600
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) )
4601
0
                {
4602
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different types",
4603
0
                        src->hrec[i]->vals[0]);
4604
0
                }
4605
0
            }
4606
0
        }
4607
0
    }
4608
0
    if ( need_sync ) {
4609
0
        if (bcf_hdr_sync(dst) < 0) return NULL;
4610
0
    }
4611
0
    return dst;
4612
0
}
4613
4614
int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *line)
4615
0
{
4616
0
    int i;
4617
0
    if ( line->errcode )
4618
0
    {
4619
0
        char errordescription[1024] = "";
4620
0
        hts_log_error("Unchecked error (%d %s) at %s:%"PRIhts_pos", exiting", line->errcode, bcf_strerror(line->errcode, errordescription, sizeof(errordescription)),  bcf_seqname_safe(src_hdr,line), line->pos+1);
4621
0
        exit(1);
4622
0
    }
4623
0
    if ( src_hdr->ntransl==-1 ) return 0;    // no need to translate, all tags have the same id
4624
0
    if ( !src_hdr->ntransl )  // called for the first time, see what needs translating
4625
0
    {
4626
0
        int dict;
4627
0
        for (dict=0; dict<2; dict++)    // BCF_DT_ID and BCF_DT_CTG
4628
0
        {
4629
0
            src_hdr->transl[dict] = (int*) malloc(src_hdr->n[dict]*sizeof(int));
4630
0
            for (i=0; i<src_hdr->n[dict]; i++)
4631
0
            {
4632
0
                if ( !src_hdr->id[dict][i].key ) // gap left after removed BCF header lines
4633
0
                {
4634
0
                    src_hdr->transl[dict][i] = -1;
4635
0
                    continue;
4636
0
                }
4637
0
                src_hdr->transl[dict][i] = bcf_hdr_id2int(dst_hdr,dict,src_hdr->id[dict][i].key);
4638
0
                if ( src_hdr->transl[dict][i]!=-1 && i!=src_hdr->transl[dict][i] ) src_hdr->ntransl++;
4639
0
            }
4640
0
        }
4641
0
        if ( !src_hdr->ntransl )
4642
0
        {
4643
0
            free(src_hdr->transl[0]); src_hdr->transl[0] = NULL;
4644
0
            free(src_hdr->transl[1]); src_hdr->transl[1] = NULL;
4645
0
            src_hdr->ntransl = -1;
4646
0
        }
4647
0
        if ( src_hdr->ntransl==-1 ) return 0;
4648
0
    }
4649
0
    bcf_unpack(line,BCF_UN_ALL);
4650
4651
    // CHROM
4652
0
    if ( src_hdr->transl[BCF_DT_CTG][line->rid] >=0 ) line->rid = src_hdr->transl[BCF_DT_CTG][line->rid];
4653
4654
    // FILTER
4655
0
    for (i=0; i<line->d.n_flt; i++)
4656
0
    {
4657
0
        int src_id = line->d.flt[i];
4658
0
        if ( src_hdr->transl[BCF_DT_ID][src_id] >=0 )
4659
0
            line->d.flt[i] = src_hdr->transl[BCF_DT_ID][src_id];
4660
0
        line->d.shared_dirty |= BCF1_DIRTY_FLT;
4661
0
    }
4662
4663
    // INFO
4664
0
    for (i=0; i<line->n_info; i++)
4665
0
    {
4666
0
        int src_id = line->d.info[i].key;
4667
0
        int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
4668
0
        if ( dst_id<0 ) continue;
4669
0
        line->d.info[i].key = dst_id;
4670
0
        if ( !line->d.info[i].vptr ) continue;  // skip deleted
4671
0
        int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
4672
0
        int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
4673
0
        if ( src_size==dst_size )   // can overwrite
4674
0
        {
4675
0
            uint8_t *vptr = line->d.info[i].vptr - line->d.info[i].vptr_off;
4676
0
            if ( dst_size==BCF_BT_INT8 ) { vptr[1] = (uint8_t)dst_id; }
4677
0
            else if ( dst_size==BCF_BT_INT16 ) { *(uint16_t*)vptr = (uint16_t)dst_id; }
4678
0
            else { *(uint32_t*)vptr = (uint32_t)dst_id; }
4679
0
        }
4680
0
        else    // must realloc
4681
0
        {
4682
0
            bcf_info_t *info = &line->d.info[i];
4683
0
            kstring_t str = {0,0,0};
4684
0
            bcf_enc_int1(&str, dst_id);
4685
0
            bcf_enc_size(&str, info->len,info->type);
4686
0
            uint32_t vptr_off = str.l;
4687
0
            kputsn((char*)info->vptr, info->vptr_len, &str);
4688
0
            if( info->vptr_free ) free(info->vptr - info->vptr_off);
4689
0
            info->vptr_off = vptr_off;
4690
0
            info->vptr = (uint8_t*)str.s + info->vptr_off;
4691
0
            info->vptr_free = 1;
4692
0
            line->d.shared_dirty |= BCF1_DIRTY_INF;
4693
0
        }
4694
0
    }
4695
4696
    // FORMAT
4697
0
    for (i=0; i<line->n_fmt; i++)
4698
0
    {
4699
0
        int src_id = line->d.fmt[i].id;
4700
0
        int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
4701
0
        if ( dst_id<0 ) continue;
4702
0
        line->d.fmt[i].id = dst_id;
4703
0
        if( !line->d.fmt[i].p ) continue;  // skip deleted
4704
0
        int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
4705
0
        int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
4706
0
        if ( src_size==dst_size )   // can overwrite
4707
0
        {
4708
0
            uint8_t *p = line->d.fmt[i].p - line->d.fmt[i].p_off;    // pointer to the vector size (4bits) and BT type (4bits)
4709
0
            if ( dst_size==BCF_BT_INT8 ) { p[1] = dst_id; }
4710
0
            else if ( dst_size==BCF_BT_INT16 ) { i16_to_le(dst_id, p + 1); }
4711
0
            else { i32_to_le(dst_id, p + 1); }
4712
0
        }
4713
0
        else    // must realloc
4714
0
        {
4715
0
            bcf_fmt_t *fmt = &line->d.fmt[i];
4716
0
            kstring_t str = {0,0,0};
4717
0
            bcf_enc_int1(&str, dst_id);
4718
0
            bcf_enc_size(&str, fmt->n, fmt->type);
4719
0
            uint32_t p_off = str.l;
4720
0
            kputsn((char*)fmt->p, fmt->p_len, &str);
4721
0
            if( fmt->p_free ) free(fmt->p - fmt->p_off);
4722
0
            fmt->p_off = p_off;
4723
0
            fmt->p = (uint8_t*)str.s + fmt->p_off;
4724
0
            fmt->p_free = 1;
4725
0
            line->d.indiv_dirty = 1;
4726
0
        }
4727
0
    }
4728
0
    return 0;
4729
0
}
4730
4731
bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr)
4732
0
{
4733
0
    bcf_hdr_t *hout = bcf_hdr_init("r");
4734
0
    if (!hout) {
4735
0
        hts_log_error("Failed to allocate bcf header");
4736
0
        return NULL;
4737
0
    }
4738
0
    kstring_t htxt = {0,0,0};
4739
0
    if (bcf_hdr_format(hdr, 1, &htxt) < 0) {
4740
0
        free(htxt.s);
4741
0
        return NULL;
4742
0
    }
4743
0
    if ( bcf_hdr_parse(hout, htxt.s) < 0 ) {
4744
0
        bcf_hdr_destroy(hout);
4745
0
        hout = NULL;
4746
0
    }
4747
0
    free(htxt.s);
4748
0
    return hout;
4749
0
}
4750
4751
bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap)
4752
0
{
4753
0
    void *names_hash = khash_str2int_init();
4754
0
    kstring_t htxt = {0,0,0};
4755
0
    kstring_t str = {0,0,0};
4756
0
    bcf_hdr_t *h = bcf_hdr_init("w");
4757
0
    int r = 0;
4758
0
    if (!h || !names_hash) {
4759
0
        hts_log_error("Failed to allocate bcf header");
4760
0
        goto err;
4761
0
    }
4762
0
    if (bcf_hdr_format(h0, 1, &htxt) < 0) {
4763
0
        hts_log_error("Failed to get header text");
4764
0
        goto err;
4765
0
    }
4766
0
    bcf_hdr_set_version(h,bcf_hdr_get_version(h0));
4767
0
    int j;
4768
0
    for (j=0; j<n; j++) imap[j] = -1;
4769
0
    if ( bcf_hdr_nsamples(h0) > 0) {
4770
0
        char *p = find_chrom_header_line(htxt.s);
4771
0
        int i = 0, end = n? 8 : 7;
4772
0
        while ((p = strchr(p, '\t')) != 0 && i < end) ++i, ++p;
4773
0
        if (i != end) {
4774
0
            hts_log_error("Wrong number of columns in header #CHROM line");
4775
0
            goto err;
4776
0
        }
4777
0
        r |= kputsn(htxt.s, p - htxt.s, &str) < 0;
4778
0
        for (i = 0; i < n; ++i) {
4779
0
            if ( khash_str2int_has_key(names_hash,samples[i]) )
4780
0
            {
4781
0
                hts_log_error("Duplicate sample name \"%s\"", samples[i]);
4782
0
                goto err;
4783
0
            }
4784
0
            imap[i] = bcf_hdr_id2int(h0, BCF_DT_SAMPLE, samples[i]);
4785
0
            if (imap[i] < 0) continue;
4786
0
            r |= kputc('\t', &str) < 0;
4787
0
            r |= kputs(samples[i], &str) < 0;
4788
0
            r |= khash_str2int_inc(names_hash,samples[i]) < 0;
4789
0
        }
4790
0
    } else r |= kputsn(htxt.s, htxt.l, &str) < 0;
4791
0
    while (str.l && (!str.s[str.l-1] || str.s[str.l-1]=='\n') ) str.l--; // kill trailing zeros and newlines
4792
0
    r |= kputc('\n',&str) < 0;
4793
0
    if (r) {
4794
0
        hts_log_error("%s", strerror(errno));
4795
0
        goto err;
4796
0
    }
4797
0
    if ( bcf_hdr_parse(h, str.s) < 0 ) {
4798
0
        bcf_hdr_destroy(h);
4799
0
        h = NULL;
4800
0
    }
4801
0
    free(str.s);
4802
0
    free(htxt.s);
4803
0
    khash_str2int_destroy(names_hash);
4804
0
    return h;
4805
4806
0
 err:
4807
0
    ks_free(&str);
4808
0
    ks_free(&htxt);
4809
0
    khash_str2int_destroy(names_hash);
4810
0
    bcf_hdr_destroy(h);
4811
0
    return NULL;
4812
0
}
4813
4814
int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file)
4815
0
{
4816
0
    if ( samples && !strcmp("-",samples) ) return 0;            // keep all samples
4817
4818
0
    int i, narr = bit_array_size(bcf_hdr_nsamples(hdr));
4819
0
    hdr->keep_samples = (uint8_t*) calloc(narr,1);
4820
0
    if (!hdr->keep_samples) return -1;
4821
4822
0
    hdr->nsamples_ori = bcf_hdr_nsamples(hdr);
4823
0
    if ( !samples )
4824
0
    {
4825
        // exclude all samples
4826
0
        khint_t k;
4827
0
        vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE], *new_dict;
4828
0
        new_dict = kh_init(vdict);
4829
0
        if (!new_dict) return -1;
4830
4831
0
        bcf_hdr_nsamples(hdr) = 0;
4832
4833
0
        for (k = kh_begin(d); k != kh_end(d); ++k)
4834
0
            if (kh_exist(d, k)) free((char*)kh_key(d, k));
4835
0
        kh_destroy(vdict, d);
4836
0
        hdr->dict[BCF_DT_SAMPLE] = new_dict;
4837
0
        if (bcf_hdr_sync(hdr) < 0) return -1;
4838
4839
0
        return 0;
4840
0
    }
4841
4842
0
    if ( samples[0]=='^' )
4843
0
        for (i=0; i<bcf_hdr_nsamples(hdr); i++) bit_array_set(hdr->keep_samples,i);
4844
4845
0
    int idx, n, ret = 0;
4846
0
    char **smpls = hts_readlist(samples[0]=='^'?samples+1:samples, is_file, &n);
4847
0
    if ( !smpls ) return -1;
4848
0
    for (i=0; i<n; i++)
4849
0
    {
4850
0
        idx = bcf_hdr_id2int(hdr,BCF_DT_SAMPLE,smpls[i]);
4851
0
        if ( idx<0 )
4852
0
        {
4853
0
            if ( !ret ) ret = i+1;
4854
0
            continue;
4855
0
        }
4856
0
        assert( idx<bcf_hdr_nsamples(hdr) );
4857
0
        if (  samples[0]=='^' )
4858
0
            bit_array_clear(hdr->keep_samples, idx);
4859
0
        else
4860
0
            bit_array_set(hdr->keep_samples, idx);
4861
0
    }
4862
0
    for (i=0; i<n; i++) free(smpls[i]);
4863
0
    free(smpls);
4864
4865
0
    bcf_hdr_nsamples(hdr) = 0;
4866
0
    for (i=0; i<hdr->nsamples_ori; i++)
4867
0
        if ( bit_array_test(hdr->keep_samples,i) ) bcf_hdr_nsamples(hdr)++;
4868
4869
0
    if ( !bcf_hdr_nsamples(hdr) ) { free(hdr->keep_samples); hdr->keep_samples=NULL; }
4870
0
    else
4871
0
    {
4872
        // Make new list and dictionary with desired samples
4873
0
        char **samples = (char**) malloc(sizeof(char*)*bcf_hdr_nsamples(hdr));
4874
0
        vdict_t *new_dict, *d;
4875
0
        int k, res;
4876
0
        if (!samples) return -1;
4877
4878
0
        new_dict = kh_init(vdict);
4879
0
        if (!new_dict) {
4880
0
            free(samples);
4881
0
            return -1;
4882
0
        }
4883
0
        idx = 0;
4884
0
        for (i=0; i<hdr->nsamples_ori; i++) {
4885
0
            if ( bit_array_test(hdr->keep_samples,i) ) {
4886
0
                samples[idx] = hdr->samples[i];
4887
0
                k = kh_put(vdict, new_dict, hdr->samples[i], &res);
4888
0
                if (res < 0) {
4889
0
                    free(samples);
4890
0
                    kh_destroy(vdict, new_dict);
4891
0
                    return -1;
4892
0
                }
4893
0
                kh_val(new_dict, k) = bcf_idinfo_def;
4894
0
                kh_val(new_dict, k).id = idx;
4895
0
                idx++;
4896
0
            }
4897
0
        }
4898
4899
        // Delete desired samples from old dictionary, so we don't free them
4900
0
        d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE];
4901
0
        for (i=0; i < idx; i++) {
4902
0
            int k = kh_get(vdict, d, samples[i]);
4903
0
            if (k < kh_end(d)) kh_del(vdict, d, k);
4904
0
        }
4905
4906
        // Free everything else
4907
0
        for (k = kh_begin(d); k != kh_end(d); ++k)
4908
0
            if (kh_exist(d, k)) free((char*)kh_key(d, k));
4909
0
        kh_destroy(vdict, d);
4910
0
        hdr->dict[BCF_DT_SAMPLE] = new_dict;
4911
4912
0
        free(hdr->samples);
4913
0
        hdr->samples = samples;
4914
4915
0
        if (bcf_hdr_sync(hdr) < 0)
4916
0
            return -1;
4917
0
    }
4918
4919
0
    return ret;
4920
0
}
4921
4922
int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap)
4923
0
{
4924
0
    kstring_t ind;
4925
0
    ind.s = 0; ind.l = ind.m = 0;
4926
0
    if (n) {
4927
0
        bcf_fmt_t fmt[MAX_N_FMT];
4928
0
        int i, j;
4929
0
        uint8_t *ptr = (uint8_t*)v->indiv.s;
4930
0
        for (i = 0; i < v->n_fmt; ++i)
4931
0
            ptr = bcf_unpack_fmt_core1(ptr, v->n_sample, &fmt[i]);
4932
0
        for (i = 0; i < (int)v->n_fmt; ++i) {
4933
0
            bcf_fmt_t *f = &fmt[i];
4934
0
            bcf_enc_int1(&ind, f->id);
4935
0
            bcf_enc_size(&ind, f->n, f->type);
4936
0
            for (j = 0; j < n; ++j)
4937
0
                if (imap[j] >= 0) kputsn((char*)(f->p + imap[j] * f->size), f->size, &ind);
4938
0
        }
4939
0
        for (i = j = 0; j < n; ++j) if (imap[j] >= 0) ++i;
4940
0
        v->n_sample = i;
4941
0
    } else v->n_sample = 0;
4942
0
    if ( !v->n_sample ) v->n_fmt = 0;
4943
0
    free(v->indiv.s);
4944
0
    v->indiv = ind;
4945
0
    v->unpacked &= ~BCF_UN_FMT;    // only BCF is ready for output, VCF will need to unpack again
4946
0
    return 0;
4947
0
}
4948
4949
int bcf_is_snp(bcf1_t *v)
4950
0
{
4951
0
    int i;
4952
0
    bcf_unpack(v, BCF_UN_STR);
4953
0
    for (i = 0; i < v->n_allele; ++i)
4954
0
    {
4955
0
        if ( v->d.allele[i][1]==0 && v->d.allele[i][0]!='*' ) continue;
4956
4957
        // mpileup's <X> allele, see also below. This is not completely satisfactory,
4958
        // a general library is here narrowly tailored to fit samtools.
4959
0
        if ( v->d.allele[i][0]=='<' && v->d.allele[i][1]=='X' && v->d.allele[i][2]=='>' ) continue;
4960
0
        if ( v->d.allele[i][0]=='<' && v->d.allele[i][1]=='*' && v->d.allele[i][2]=='>' ) continue;
4961
4962
0
        break;
4963
0
    }
4964
0
    return i == v->n_allele;
4965
0
}
4966
4967
static void bcf_set_variant_type(const char *ref, const char *alt, bcf_variant_t *var)
4968
0
{
4969
0
    if ( *alt == '*' && !alt[1] ) { var->n = 0; var->type = VCF_OVERLAP; return; }  // overlapping variant
4970
4971
    // The most frequent case
4972
0
    if ( !ref[1] && !alt[1] )
4973
0
    {
4974
0
        if ( *alt == '.' || *ref==*alt ) { var->n = 0; var->type = VCF_REF; return; }
4975
0
        if ( *alt == 'X' ) { var->n = 0; var->type = VCF_REF; return; }  // mpileup's X allele shouldn't be treated as variant
4976
0
        var->n = 1; var->type = VCF_SNP; return;
4977
0
    }
4978
0
    if ( alt[0]=='<' )
4979
0
    {
4980
0
        if ( alt[1]=='X' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; }  // mpileup's X allele shouldn't be treated as variant
4981
0
        if ( alt[1]=='*' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; }
4982
0
        if ( !strcmp("NON_REF>",alt+1) ) { var->n = 0; var->type = VCF_REF; return; }
4983
0
        var->type = VCF_OTHER;
4984
0
        return;
4985
0
    }
4986
4987
    // Catch "joined before" breakend case
4988
0
    if ( alt[0]==']' || alt[0] == '[' )
4989
0
    {
4990
0
        var->type = VCF_BND; return;
4991
0
    }
4992
4993
    // Iterate through alt characters that match the reference
4994
0
    const char *r = ref, *a = alt;
4995
0
    while (*r && *a && toupper_c(*r)==toupper_c(*a) ) { r++; a++; }     // unfortunately, matching REF,ALT case is not guaranteed
4996
4997
0
    if ( *a && !*r )
4998
0
    {
4999
0
        if ( *a==']' || *a=='[' ) { var->type = VCF_BND; return; } // "joined after" breakend
5000
0
        while ( *a ) a++;
5001
0
        var->n = (a-alt)-(r-ref); var->type = VCF_INDEL | VCF_INS; return;
5002
0
    }
5003
0
    else if ( *r && !*a )
5004
0
    {
5005
0
        while ( *r ) r++;
5006
0
        var->n = (a-alt)-(r-ref); var->type = VCF_INDEL | VCF_DEL; return;
5007
0
    }
5008
0
    else if ( !*r && !*a )
5009
0
    {
5010
0
        var->n = 0; var->type = VCF_REF; return;
5011
0
    }
5012
5013
0
    const char *re = r, *ae = a;
5014
0
    while ( re[1] ) re++;
5015
0
    while ( ae[1] ) ae++;
5016
0
    while ( re>r && ae>a && toupper_c(*re)==toupper_c(*ae) ) { re--; ae--; }
5017
0
    if ( ae==a )
5018
0
    {
5019
0
        if ( re==r ) { var->n = 1; var->type = VCF_SNP; return; }
5020
0
        var->n = -(re-r);
5021
0
        if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL | VCF_DEL; return; }
5022
0
        var->type = VCF_OTHER; return;
5023
0
    }
5024
0
    else if ( re==r )
5025
0
    {
5026
0
        var->n = ae-a;
5027
0
        if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL | VCF_INS; return; }
5028
0
        var->type = VCF_OTHER; return;
5029
0
    }
5030
5031
0
    var->type = ( re-r == ae-a ) ? VCF_MNP : VCF_OTHER;
5032
0
    var->n = ( re-r > ae-a ) ? -(re-r+1) : ae-a+1;
5033
5034
    // should do also complex events, SVs, etc...
5035
0
}
5036
5037
static int bcf_set_variant_types(bcf1_t *b)
5038
0
{
5039
0
    if ( !(b->unpacked & BCF_UN_STR) ) bcf_unpack(b, BCF_UN_STR);
5040
0
    bcf_dec_t *d = &b->d;
5041
0
    if ( d->n_var < b->n_allele )
5042
0
    {
5043
0
        bcf_variant_t *new_var = realloc(d->var, sizeof(bcf_variant_t)*b->n_allele);
5044
0
        if (!new_var)
5045
0
            return -1;
5046
0
        d->var = new_var;
5047
0
        d->n_var = b->n_allele;
5048
0
    }
5049
0
    int i;
5050
0
    b->d.var_type = 0;
5051
0
    d->var[0].type = VCF_REF;
5052
0
    d->var[0].n    = 0;
5053
0
    for (i=1; i<b->n_allele; i++)
5054
0
    {
5055
0
        bcf_set_variant_type(d->allele[0],d->allele[i], &d->var[i]);
5056
0
        b->d.var_type |= d->var[i].type;
5057
        //fprintf(stderr,"[set_variant_type] %d   %s %s -> %d %d .. %d\n", b->pos+1,d->allele[0],d->allele[i],d->var[i].type,d->var[i].n, b->d.var_type);
5058
0
    }
5059
0
    return 0;
5060
0
}
5061
5062
// bcf_get_variant_type/bcf_get_variant_types should only return the following,
5063
// to be compatible with callers that are not expecting newer values
5064
// like VCF_INS, VCF_DEL.  The full set is available from the newer
5065
// vcf_has_variant_type* interfaces.
5066
0
#define ORIG_VAR_TYPES (VCF_SNP|VCF_MNP|VCF_INDEL|VCF_OTHER|VCF_BND|VCF_OVERLAP)
5067
int bcf_get_variant_types(bcf1_t *rec)
5068
0
{
5069
0
    if ( rec->d.var_type==-1 ) {
5070
0
        if (bcf_set_variant_types(rec) != 0) {
5071
0
            hts_log_error("Couldn't get variant types: %s", strerror(errno));
5072
0
            exit(1); // Due to legacy API having no way to report failures
5073
0
        }
5074
0
    }
5075
0
    return rec->d.var_type & ORIG_VAR_TYPES;
5076
0
}
5077
5078
int bcf_get_variant_type(bcf1_t *rec, int ith_allele)
5079
0
{
5080
0
    if ( rec->d.var_type==-1 ) {
5081
0
        if (bcf_set_variant_types(rec) != 0) {
5082
0
            hts_log_error("Couldn't get variant types: %s", strerror(errno));
5083
0
            exit(1); // Due to legacy API having no way to report failures
5084
0
        }
5085
0
    }
5086
0
    if (ith_allele < 0 || ith_allele >= rec->n_allele) {
5087
0
        hts_log_error("Requested allele outside valid range");
5088
0
        exit(1);
5089
0
    }
5090
0
    return rec->d.var[ith_allele].type & ORIG_VAR_TYPES;
5091
0
}
5092
#undef ORIG_VAR_TYPES
5093
5094
int bcf_has_variant_type(bcf1_t *rec, int ith_allele, uint32_t bitmask)
5095
0
{
5096
0
    if ( rec->d.var_type==-1 ) {
5097
0
        if (bcf_set_variant_types(rec) != 0) return -1;
5098
0
    }
5099
0
    if (ith_allele < 0 || ith_allele >= rec->n_allele) return -1;
5100
0
    if (bitmask == VCF_REF) {  // VCF_REF is 0, so handled as a special case
5101
0
        return rec->d.var[ith_allele].type == VCF_REF;
5102
0
    }
5103
0
    return bitmask & rec->d.var[ith_allele].type;
5104
0
}
5105
5106
int bcf_variant_length(bcf1_t *rec, int ith_allele)
5107
0
{
5108
0
    if ( rec->d.var_type==-1 ) {
5109
0
        if (bcf_set_variant_types(rec) != 0) return bcf_int32_missing;
5110
0
    }
5111
0
    if (ith_allele < 0 || ith_allele >= rec->n_allele) return bcf_int32_missing;
5112
0
    return rec->d.var[ith_allele].n;
5113
0
}
5114
5115
int bcf_has_variant_types(bcf1_t *rec, uint32_t bitmask,
5116
                          enum bcf_variant_match mode)
5117
0
{
5118
0
    if ( rec->d.var_type==-1 ) {
5119
0
        if (bcf_set_variant_types(rec) != 0) return -1;
5120
0
    }
5121
0
    uint32_t type = rec->d.var_type;
5122
0
    if ( mode==bcf_match_overlap ) return bitmask & type;
5123
5124
    // VCF_INDEL is always set with VCF_INS and VCF_DEL by bcf_set_variant_type[s], but the bitmask may
5125
    // ask for say `VCF_INS` or `VCF_INDEL` only
5126
0
    if ( bitmask&(VCF_INS|VCF_DEL) && !(bitmask&VCF_INDEL) ) type &= ~VCF_INDEL;
5127
0
    else if ( bitmask&VCF_INDEL && !(bitmask&(VCF_INS|VCF_DEL)) ) type &= ~(VCF_INS|VCF_DEL);
5128
5129
0
    if ( mode==bcf_match_subset )
5130
0
    {
5131
0
        if ( ~bitmask & type ) return 0;
5132
0
        else return bitmask & type;
5133
0
    }
5134
    // mode == bcf_match_exact
5135
0
    return type==bitmask ? type : 0;
5136
0
}
5137
5138
int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
5139
0
{
5140
0
    static int negative_rlen_warned = 0;
5141
0
    int is_end_tag;
5142
5143
    // Is the field already present?
5144
0
    int i, inf_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
5145
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,inf_id) ) return -1;    // No such INFO field in the header
5146
0
    if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
5147
5148
0
    is_end_tag = strcmp(key, "END") == 0;
5149
5150
0
    for (i=0; i<line->n_info; i++)
5151
0
        if ( inf_id==line->d.info[i].key ) break;
5152
0
    bcf_info_t *inf = i==line->n_info ? NULL : &line->d.info[i];
5153
5154
0
    if ( !n || (type==BCF_HT_STR && !values) )
5155
0
    {
5156
0
        if ( n==0 && is_end_tag )
5157
0
            line->rlen = line->n_allele ? strlen(line->d.allele[0]) : 0;
5158
0
        if ( inf )
5159
0
        {
5160
            // Mark the tag for removal, free existing memory if necessary
5161
0
            if ( inf->vptr_free )
5162
0
            {
5163
0
                free(inf->vptr - inf->vptr_off);
5164
0
                inf->vptr_free = 0;
5165
0
            }
5166
0
            line->d.shared_dirty |= BCF1_DIRTY_INF;
5167
0
            inf->vptr = NULL;
5168
0
            inf->vptr_off = inf->vptr_len = 0;
5169
0
        }
5170
0
        return 0;
5171
0
    }
5172
5173
0
    if (is_end_tag)
5174
0
    {
5175
0
        if (n != 1)
5176
0
        {
5177
0
            hts_log_error("END info tag should only have one value at %s:%"PRIhts_pos, bcf_seqname_safe(hdr,line), line->pos+1);
5178
0
            line->errcode |= BCF_ERR_TAG_INVALID;
5179
0
            return -1;
5180
0
        }
5181
0
        if (type != BCF_HT_INT && type != BCF_HT_LONG)
5182
0
        {
5183
0
            hts_log_error("Wrong type (%d) for END info tag at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
5184
0
            line->errcode |= BCF_ERR_TAG_INVALID;
5185
0
            return -1;
5186
0
        }
5187
0
    }
5188
5189
    // Encode the values and determine the size required to accommodate the values
5190
0
    kstring_t str = {0,0,0};
5191
0
    bcf_enc_int1(&str, inf_id);
5192
0
    if ( type==BCF_HT_INT )
5193
0
        bcf_enc_vint(&str, n, (int32_t*)values, -1);
5194
0
    else if ( type==BCF_HT_REAL )
5195
0
        bcf_enc_vfloat(&str, n, (float*)values);
5196
0
    else if ( type==BCF_HT_FLAG || type==BCF_HT_STR )
5197
0
    {
5198
0
        if ( values==NULL )
5199
0
            bcf_enc_size(&str, 0, BCF_BT_NULL);
5200
0
        else
5201
0
            bcf_enc_vchar(&str, strlen((char*)values), (char*)values);
5202
0
    }
5203
#ifdef VCF_ALLOW_INT64
5204
    else if ( type==BCF_HT_LONG )
5205
    {
5206
        if (n != 1) {
5207
            hts_log_error("Only storing a single BCF_HT_LONG value is supported at %s:%"PRIhts_pos, bcf_seqname_safe(hdr,line), line->pos+1);
5208
            abort();
5209
        }
5210
        bcf_enc_long1(&str, *(int64_t *) values);
5211
    }
5212
#endif
5213
0
    else
5214
0
    {
5215
0
        hts_log_error("The type %d not implemented yet at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
5216
0
        abort();
5217
0
    }
5218
5219
    // Is the INFO tag already present
5220
0
    if ( inf )
5221
0
    {
5222
        // Is it big enough to accommodate new block?
5223
0
        if ( inf->vptr && str.l <= inf->vptr_len + inf->vptr_off )
5224
0
        {
5225
0
            if ( str.l != inf->vptr_len + inf->vptr_off ) line->d.shared_dirty |= BCF1_DIRTY_INF;
5226
0
            uint8_t *ptr = inf->vptr - inf->vptr_off;
5227
0
            memcpy(ptr, str.s, str.l);
5228
0
            free(str.s);
5229
0
            int vptr_free = inf->vptr_free;
5230
0
            bcf_unpack_info_core1(ptr, inf);
5231
0
            inf->vptr_free = vptr_free;
5232
0
        }
5233
0
        else
5234
0
        {
5235
0
            if ( inf->vptr_free )
5236
0
                free(inf->vptr - inf->vptr_off);
5237
0
            bcf_unpack_info_core1((uint8_t*)str.s, inf);
5238
0
            inf->vptr_free = 1;
5239
0
            line->d.shared_dirty |= BCF1_DIRTY_INF;
5240
0
        }
5241
0
    }
5242
0
    else
5243
0
    {
5244
        // The tag is not present, create new one
5245
0
        line->n_info++;
5246
0
        hts_expand0(bcf_info_t, line->n_info, line->d.m_info , line->d.info);
5247
0
        inf = &line->d.info[line->n_info-1];
5248
0
        bcf_unpack_info_core1((uint8_t*)str.s, inf);
5249
0
        inf->vptr_free = 1;
5250
0
        line->d.shared_dirty |= BCF1_DIRTY_INF;
5251
0
    }
5252
0
    line->unpacked |= BCF_UN_INFO;
5253
5254
0
   if ( n==1 && is_end_tag) {
5255
0
        hts_pos_t end = type == BCF_HT_INT ? *(int32_t *) values : *(int64_t *) values;
5256
0
        if ( (type == BCF_HT_INT && end!=bcf_int32_missing) || (type == BCF_HT_LONG && end!=bcf_int64_missing) )
5257
0
        {
5258
0
            if ( end <= line->pos )
5259
0
            {
5260
0
                if ( !negative_rlen_warned )
5261
0
                {
5262
0
                    hts_log_warning("INFO/END=%"PRIhts_pos" is smaller than POS at %s:%"PRIhts_pos,end,bcf_seqname_safe(hdr,line),line->pos+1);
5263
0
                    negative_rlen_warned = 1;
5264
0
                }
5265
0
                line->rlen = line->n_allele ? strlen(line->d.allele[0]) : 0;
5266
0
            }
5267
0
            else
5268
0
                line->rlen = end - line->pos;
5269
0
        }
5270
0
    }
5271
0
    return 0;
5272
0
}
5273
5274
int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n)
5275
0
{
5276
0
    if ( !n )
5277
0
        return bcf_update_format(hdr,line,key,NULL,0,BCF_HT_STR);
5278
5279
0
    int i, max_len = 0;
5280
0
    for (i=0; i<n; i++)
5281
0
    {
5282
0
        int len = strlen(values[i]);
5283
0
        if ( len > max_len ) max_len = len;
5284
0
    }
5285
0
    char *out = (char*) malloc(max_len*n);
5286
0
    if ( !out ) return -2;
5287
0
    for (i=0; i<n; i++)
5288
0
    {
5289
0
        char *dst = out+i*max_len;
5290
0
        const char *src = values[i];
5291
0
        int j = 0;
5292
0
        while ( src[j] ) { dst[j] = src[j]; j++; }
5293
0
        for (; j<max_len; j++) dst[j] = 0;
5294
0
    }
5295
0
    int ret = bcf_update_format(hdr,line,key,out,max_len*n,BCF_HT_STR);
5296
0
    free(out);
5297
0
    return ret;
5298
0
}
5299
5300
int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
5301
0
{
5302
    // Is the field already present?
5303
0
    int i, fmt_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
5304
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,fmt_id) )
5305
0
    {
5306
0
        if ( !n ) return 0;
5307
0
        return -1;  // the key not present in the header
5308
0
    }
5309
5310
0
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
5311
5312
0
    for (i=0; i<line->n_fmt; i++)
5313
0
        if ( line->d.fmt[i].id==fmt_id ) break;
5314
0
    bcf_fmt_t *fmt = i==line->n_fmt ? NULL : &line->d.fmt[i];
5315
5316
0
    if ( !n )
5317
0
    {
5318
0
        if ( fmt )
5319
0
        {
5320
            // Mark the tag for removal, free existing memory if necessary
5321
0
            if ( fmt->p_free )
5322
0
            {
5323
0
                free(fmt->p - fmt->p_off);
5324
0
                fmt->p_free = 0;
5325
0
            }
5326
0
            line->d.indiv_dirty = 1;
5327
0
            fmt->p = NULL;
5328
0
        }
5329
0
        return 0;
5330
0
    }
5331
5332
0
    line->n_sample = bcf_hdr_nsamples(hdr);
5333
0
    int nps = n / line->n_sample;  // number of values per sample
5334
0
    assert( nps && nps*line->n_sample==n );     // must be divisible by n_sample
5335
5336
    // Encode the values and determine the size required to accommodate the values
5337
0
    kstring_t str = {0,0,0};
5338
0
    bcf_enc_int1(&str, fmt_id);
5339
0
    assert(values != NULL);
5340
0
    if ( type==BCF_HT_INT )
5341
0
        bcf_enc_vint(&str, n, (int32_t*)values, nps);
5342
0
    else if ( type==BCF_HT_REAL )
5343
0
    {
5344
0
        bcf_enc_size(&str, nps, BCF_BT_FLOAT);
5345
0
        serialize_float_array(&str, nps*line->n_sample, (float *) values);
5346
0
    }
5347
0
    else if ( type==BCF_HT_STR )
5348
0
    {
5349
0
        bcf_enc_size(&str, nps, BCF_BT_CHAR);
5350
0
        kputsn((char*)values, nps*line->n_sample, &str);
5351
0
    }
5352
0
    else
5353
0
    {
5354
0
        hts_log_error("The type %d not implemented yet at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
5355
0
        abort();
5356
0
    }
5357
5358
0
    if ( !fmt )
5359
0
    {
5360
        // Not present, new format field
5361
0
        line->n_fmt++;
5362
0
        hts_expand0(bcf_fmt_t, line->n_fmt, line->d.m_fmt, line->d.fmt);
5363
5364
        // Special case: VCF specification requires that GT is always first
5365
0
        if ( line->n_fmt > 1 && key[0]=='G' && key[1]=='T' && !key[2] )
5366
0
        {
5367
0
            for (i=line->n_fmt-1; i>0; i--)
5368
0
                line->d.fmt[i] = line->d.fmt[i-1];
5369
0
            fmt = &line->d.fmt[0];
5370
0
        }
5371
0
        else
5372
0
            fmt = &line->d.fmt[line->n_fmt-1];
5373
0
        bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
5374
0
        line->d.indiv_dirty = 1;
5375
0
        fmt->p_free = 1;
5376
0
    }
5377
0
    else
5378
0
    {
5379
        // The tag is already present, check if it is big enough to accommodate the new block
5380
0
        if ( fmt->p && str.l <= fmt->p_len + fmt->p_off )
5381
0
        {
5382
            // good, the block is big enough
5383
0
            if ( str.l != fmt->p_len + fmt->p_off ) line->d.indiv_dirty = 1;
5384
0
            uint8_t *ptr = fmt->p - fmt->p_off;
5385
0
            memcpy(ptr, str.s, str.l);
5386
0
            free(str.s);
5387
0
            int p_free = fmt->p_free;
5388
0
            bcf_unpack_fmt_core1(ptr, line->n_sample, fmt);
5389
0
            fmt->p_free = p_free;
5390
0
        }
5391
0
        else
5392
0
        {
5393
0
            if ( fmt->p_free )
5394
0
                free(fmt->p - fmt->p_off);
5395
0
            bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
5396
0
            fmt->p_free = 1;
5397
0
            line->d.indiv_dirty = 1;
5398
0
        }
5399
0
    }
5400
0
    line->unpacked |= BCF_UN_FMT;
5401
0
    return 0;
5402
0
}
5403
5404
5405
int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n)
5406
0
{
5407
0
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
5408
0
    line->d.shared_dirty |= BCF1_DIRTY_FLT;
5409
0
    line->d.n_flt = n;
5410
0
    if ( !n ) return 0;
5411
0
    hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);
5412
0
    int i;
5413
0
    for (i=0; i<n; i++)
5414
0
        line->d.flt[i] = flt_ids[i];
5415
0
    return 0;
5416
0
}
5417
5418
int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id)
5419
0
{
5420
0
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
5421
0
    int i;
5422
0
    for (i=0; i<line->d.n_flt; i++)
5423
0
        if ( flt_id==line->d.flt[i] ) break;
5424
0
    if ( i<line->d.n_flt ) return 0;    // this filter is already set
5425
0
    line->d.shared_dirty |= BCF1_DIRTY_FLT;
5426
0
    if ( flt_id==0 )    // set to PASS
5427
0
        line->d.n_flt = 1;
5428
0
    else if ( line->d.n_flt==1 && line->d.flt[0]==0 )
5429
0
        line->d.n_flt = 1;
5430
0
    else
5431
0
        line->d.n_flt++;
5432
0
    hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);
5433
0
    line->d.flt[line->d.n_flt-1] = flt_id;
5434
0
    return 1;
5435
0
}
5436
int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int pass)
5437
0
{
5438
0
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
5439
0
    int i;
5440
0
    for (i=0; i<line->d.n_flt; i++)
5441
0
        if ( flt_id==line->d.flt[i] ) break;
5442
0
    if ( i==line->d.n_flt ) return 0;   // the filter is not present
5443
0
    line->d.shared_dirty |= BCF1_DIRTY_FLT;
5444
0
    if ( i!=line->d.n_flt-1 ) memmove(line->d.flt+i,line->d.flt+i+1,(line->d.n_flt-i-1)*sizeof(*line->d.flt));
5445
0
    line->d.n_flt--;
5446
0
    if ( !line->d.n_flt && pass ) bcf_add_filter(hdr,line,0);
5447
0
    return 0;
5448
0
}
5449
5450
int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter)
5451
0
{
5452
0
    if ( filter[0]=='.' && !filter[1] ) filter = "PASS";
5453
0
    int id = bcf_hdr_id2int(hdr, BCF_DT_ID, filter);
5454
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FLT,id) ) return -1;  // not defined in the header
5455
5456
0
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
5457
0
    if ( id==0 && !line->d.n_flt) return 1; // PASS
5458
5459
0
    int i;
5460
0
    for (i=0; i<line->d.n_flt; i++)
5461
0
        if ( line->d.flt[i]==id ) return 1;
5462
0
    return 0;
5463
0
}
5464
5465
static inline int _bcf1_sync_alleles(const bcf_hdr_t *hdr, bcf1_t *line, int nals)
5466
0
{
5467
0
    line->d.shared_dirty |= BCF1_DIRTY_ALS;
5468
5469
0
    line->n_allele = nals;
5470
0
    hts_expand(char*, line->n_allele, line->d.m_allele, line->d.allele);
5471
5472
0
    char *als = line->d.als;
5473
0
    int n = 0;
5474
0
    while (n<nals)
5475
0
    {
5476
0
        line->d.allele[n] = als;
5477
0
        while ( *als ) als++;
5478
0
        als++;
5479
0
        n++;
5480
0
    }
5481
5482
    // Update REF length. Note that END is 1-based while line->pos 0-based
5483
0
    bcf_info_t *end_info = bcf_get_info(hdr,line,"END");
5484
0
    if ( end_info )
5485
0
    {
5486
0
        if ( end_info->type==BCF_HT_INT && end_info->v1.i==bcf_int32_missing ) end_info = NULL;
5487
0
        else if ( end_info->type==BCF_HT_LONG && end_info->v1.i==bcf_int64_missing ) end_info = NULL;
5488
0
    }
5489
0
    if ( end_info && end_info->v1.i > line->pos )
5490
0
        line->rlen = end_info->v1.i - line->pos;
5491
0
    else if ( nals > 0 )
5492
0
        line->rlen = strlen(line->d.allele[0]);
5493
0
    else
5494
0
        line->rlen = 0;
5495
5496
0
    return 0;
5497
0
}
5498
int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals)
5499
0
{
5500
0
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
5501
0
    char *free_old = NULL;
5502
0
    char buffer[256];
5503
0
    size_t used = 0;
5504
5505
    // The pointers in alleles may point into the existing line->d.als memory,
5506
    // so care needs to be taken not to clobber them while updating.  Usually
5507
    // they will be short so we can copy through an intermediate buffer.
5508
    // If they're longer, or won't fit in the existing allocation we
5509
    // can allocate a new buffer to write into.  Note that in either case
5510
    // pointers to line->d.als memory in alleles may not be valid when we've
5511
    // finished.
5512
0
    int i;
5513
0
    size_t avail = line->d.m_als < sizeof(buffer) ? line->d.m_als : sizeof(buffer);
5514
0
    for (i=0; i<nals; i++) {
5515
0
        size_t sz = strlen(alleles[i]) + 1;
5516
0
        if (avail - used < sz)
5517
0
            break;
5518
0
        memcpy(buffer + used, alleles[i], sz);
5519
0
        used += sz;
5520
0
    }
5521
5522
    // Did we miss anything?
5523
0
    if (i < nals) {
5524
0
        int j;
5525
0
        size_t needed = used;
5526
0
        char *new_als;
5527
0
        for (j = i; j < nals; j++)
5528
0
            needed += strlen(alleles[j]) + 1;
5529
0
        if (needed < line->d.m_als) // Don't shrink the buffer
5530
0
            needed = line->d.m_als;
5531
0
        if (needed > INT_MAX) {
5532
0
            hts_log_error("REF + alleles too long to fit in a BCF record");
5533
0
            return -1;
5534
0
        }
5535
0
        new_als = malloc(needed);
5536
0
        if (!new_als)
5537
0
            return -1;
5538
0
        free_old = line->d.als;
5539
0
        line->d.als = new_als;
5540
0
        line->d.m_als = needed;
5541
0
    }
5542
5543
    // Copy from the temp buffer to the destination
5544
0
    if (used) {
5545
0
        assert(used <= line->d.m_als);
5546
0
        memcpy(line->d.als, buffer, used);
5547
0
    }
5548
5549
    // Add in any remaining entries - if this happens we will always be
5550
    // writing to a newly-allocated buffer.
5551
0
    for (; i < nals; i++) {
5552
0
        size_t sz = strlen(alleles[i]) + 1;
5553
0
        memcpy(line->d.als + used, alleles[i], sz);
5554
0
        used += sz;
5555
0
    }
5556
5557
0
    if (free_old)
5558
0
        free(free_old);
5559
0
    return _bcf1_sync_alleles(hdr,line,nals);
5560
0
}
5561
5562
int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string)
5563
0
{
5564
0
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
5565
0
    kstring_t tmp;
5566
0
    tmp.l = 0; tmp.s = line->d.als; tmp.m = line->d.m_als;
5567
0
    kputs(alleles_string, &tmp);
5568
0
    line->d.als = tmp.s; line->d.m_als = tmp.m;
5569
5570
0
    int nals = 1;
5571
0
    char *t = line->d.als;
5572
0
    while (*t)
5573
0
    {
5574
0
        if ( *t==',' ) { *t = 0; nals++; }
5575
0
        t++;
5576
0
    }
5577
0
    return _bcf1_sync_alleles(hdr, line, nals);
5578
0
}
5579
5580
int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
5581
0
{
5582
0
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
5583
0
    kstring_t tmp;
5584
0
    tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id;
5585
0
    if ( id )
5586
0
        kputs(id, &tmp);
5587
0
    else
5588
0
        kputs(".", &tmp);
5589
0
    line->d.id = tmp.s; line->d.m_id = tmp.m;
5590
0
    line->d.shared_dirty |= BCF1_DIRTY_ID;
5591
0
    return 0;
5592
0
}
5593
5594
int bcf_add_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
5595
0
{
5596
0
    if ( !id ) return 0;
5597
0
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
5598
5599
0
    kstring_t tmp;
5600
0
    tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id;
5601
5602
0
    int len = strlen(id);
5603
0
    char *dst = line->d.id;
5604
0
    while ( *dst && (dst=strstr(dst,id)) )
5605
0
    {
5606
0
        if ( dst[len]!=0 && dst[len]!=';' ) dst++;              // a prefix, not a match
5607
0
        else if ( dst==line->d.id || dst[-1]==';' ) return 0;   // already present
5608
0
        dst++;  // a suffix, not a match
5609
0
    }
5610
0
    if ( line->d.id && (line->d.id[0]!='.' || line->d.id[1]) )
5611
0
    {
5612
0
        tmp.l = strlen(line->d.id);
5613
0
        kputc(';',&tmp);
5614
0
    }
5615
0
    kputs(id,&tmp);
5616
5617
0
    line->d.id = tmp.s; line->d.m_id = tmp.m;
5618
0
    line->d.shared_dirty |= BCF1_DIRTY_ID;
5619
0
    return 0;
5620
5621
0
}
5622
5623
bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
5624
0
{
5625
0
    int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
5626
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) ) return NULL;   // no such FMT field in the header
5627
0
    return bcf_get_fmt_id(line, id);
5628
0
}
5629
5630
bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
5631
0
{
5632
0
    int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
5633
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,id) ) return NULL;   // no such INFO field in the header
5634
0
    return bcf_get_info_id(line, id);
5635
0
}
5636
5637
bcf_fmt_t *bcf_get_fmt_id(bcf1_t *line, const int id)
5638
0
{
5639
0
    int i;
5640
0
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
5641
0
    for (i=0; i<line->n_fmt; i++)
5642
0
    {
5643
0
        if ( line->d.fmt[i].id==id ) return &line->d.fmt[i];
5644
0
    }
5645
0
    return NULL;
5646
0
}
5647
5648
bcf_info_t *bcf_get_info_id(bcf1_t *line, const int id)
5649
0
{
5650
0
    int i;
5651
0
    if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
5652
0
    for (i=0; i<line->n_info; i++)
5653
0
    {
5654
0
        if ( line->d.info[i].key==id ) return &line->d.info[i];
5655
0
    }
5656
0
    return NULL;
5657
0
}
5658
5659
5660
int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
5661
0
{
5662
0
    int i, ret = -4, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
5663
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,tag_id) ) return -1;    // no such INFO field in the header
5664
0
    if ( bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id)!=(type & 0xff) ) return -2;     // expected different type
5665
5666
0
    if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
5667
5668
0
    for (i=0; i<line->n_info; i++)
5669
0
        if ( line->d.info[i].key==tag_id ) break;
5670
0
    if ( i==line->n_info ) return ( type==BCF_HT_FLAG ) ? 0 : -3;       // the tag is not present in this record
5671
0
    if ( type==BCF_HT_FLAG ) return 1;
5672
5673
0
    bcf_info_t *info = &line->d.info[i];
5674
0
    if ( !info->vptr ) return -3;           // the tag was marked for removal
5675
0
    if ( type==BCF_HT_STR )
5676
0
    {
5677
0
        if ( *ndst < info->len+1 )
5678
0
        {
5679
0
            *ndst = info->len + 1;
5680
0
            *dst  = realloc(*dst, *ndst);
5681
0
        }
5682
0
        memcpy(*dst,info->vptr,info->len);
5683
0
        ((uint8_t*)*dst)[info->len] = 0;
5684
0
        return info->len;
5685
0
    }
5686
5687
    // Make sure the buffer is big enough
5688
0
    int size1;
5689
0
    switch (type) {
5690
0
        case BCF_HT_INT:  size1 = sizeof(int32_t); break;
5691
0
        case BCF_HT_LONG: size1 = sizeof(int64_t); break;
5692
0
        case BCF_HT_REAL: size1 = sizeof(float); break;
5693
0
        default:
5694
0
            hts_log_error("Unexpected output type %d at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
5695
0
            return -2;
5696
0
    }
5697
0
    if ( *ndst < info->len )
5698
0
    {
5699
0
        *ndst = info->len;
5700
0
        *dst  = realloc(*dst, *ndst * size1);
5701
0
    }
5702
5703
0
    #define BRANCH(type_t, convert, is_missing, is_vector_end, set_missing, set_regular, out_type_t) do { \
5704
0
        out_type_t *tmp = (out_type_t *) *dst; \
5705
0
        int j; \
5706
0
        for (j=0; j<info->len; j++) \
5707
0
        { \
5708
0
            type_t p = convert(info->vptr + j * sizeof(type_t)); \
5709
0
            if ( is_vector_end ) break; \
5710
0
            if ( is_missing ) set_missing; \
5711
0
            else set_regular; \
5712
0
            tmp++; \
5713
0
        } \
5714
0
        ret = j; \
5715
0
    } while (0)
5716
0
    switch (info->type) {
5717
0
        case BCF_BT_INT8:
5718
0
            if (type == BCF_HT_LONG) {
5719
0
                BRANCH(int8_t,  le_to_i8,  p==bcf_int8_missing,  p==bcf_int8_vector_end,  *tmp=bcf_int64_missing, *tmp=p, int64_t);
5720
0
            } else {
5721
0
                BRANCH(int8_t,  le_to_i8,  p==bcf_int8_missing,  p==bcf_int8_vector_end,  *tmp=bcf_int32_missing, *tmp=p, int32_t);
5722
0
            }
5723
0
            break;
5724
0
        case BCF_BT_INT16:
5725
0
            if (type == BCF_HT_LONG) {
5726
0
                BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t);
5727
0
            } else {
5728
0
                BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t);
5729
0
            }
5730
0
            break;
5731
0
        case BCF_BT_INT32:
5732
0
            if (type == BCF_HT_LONG) {
5733
0
                BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t); break;
5734
0
            } else {
5735
0
                BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); break;
5736
0
            }
5737
0
        case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set(tmp, p), float); break;
5738
0
        default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, info->type, bcf_seqname_safe(hdr,line), line->pos+1); return -2;
5739
0
    }
5740
0
    #undef BRANCH
5741
0
    return ret;  // set by BRANCH
5742
0
}
5743
5744
int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst)
5745
0
{
5746
0
    int i,tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
5747
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1;    // no such FORMAT field in the header
5748
0
    if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2;     // expected different type
5749
5750
0
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
5751
5752
0
    for (i=0; i<line->n_fmt; i++)
5753
0
        if ( line->d.fmt[i].id==tag_id ) break;
5754
0
    if ( i==line->n_fmt ) return -3;                               // the tag is not present in this record
5755
0
    bcf_fmt_t *fmt = &line->d.fmt[i];
5756
0
    if ( !fmt->p ) return -3;                                      // the tag was marked for removal
5757
5758
0
    int nsmpl = bcf_hdr_nsamples(hdr);
5759
0
    if ( !*dst )
5760
0
    {
5761
0
        *dst = (char**) malloc(sizeof(char*)*nsmpl);
5762
0
        if ( !*dst ) return -4;     // could not alloc
5763
0
        (*dst)[0] = NULL;
5764
0
    }
5765
0
    int n = (fmt->n+1)*nsmpl;
5766
0
    if ( *ndst < n )
5767
0
    {
5768
0
        (*dst)[0] = realloc((*dst)[0], n);
5769
0
        if ( !(*dst)[0] ) return -4;    // could not alloc
5770
0
        *ndst = n;
5771
0
    }
5772
0
    for (i=0; i<nsmpl; i++)
5773
0
    {
5774
0
        uint8_t *src = fmt->p + i*fmt->n;
5775
0
        uint8_t *tmp = (uint8_t*)(*dst)[0] + i*(fmt->n+1);
5776
0
        memcpy(tmp,src,fmt->n);
5777
0
        tmp[fmt->n] = 0;
5778
0
        (*dst)[i] = (char*) tmp;
5779
0
    }
5780
0
    return n;
5781
0
}
5782
5783
int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
5784
0
{
5785
0
    int i,j, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
5786
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1;    // no such FORMAT field in the header
5787
0
    if ( tag[0]=='G' && tag[1]=='T' && tag[2]==0 )
5788
0
    {
5789
        // Ugly: GT field is considered to be a string by the VCF header but BCF represents it as INT.
5790
0
        if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2;
5791
0
    }
5792
0
    else if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=type ) return -2;     // expected different type
5793
5794
0
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
5795
5796
0
    for (i=0; i<line->n_fmt; i++)
5797
0
        if ( line->d.fmt[i].id==tag_id ) break;
5798
0
    if ( i==line->n_fmt ) return -3;                               // the tag is not present in this record
5799
0
    bcf_fmt_t *fmt = &line->d.fmt[i];
5800
0
    if ( !fmt->p ) return -3;                                      // the tag was marked for removal
5801
5802
0
    if ( type==BCF_HT_STR )
5803
0
    {
5804
0
        int n = fmt->n*bcf_hdr_nsamples(hdr);
5805
0
        if ( *ndst < n )
5806
0
        {
5807
0
            *dst  = realloc(*dst, n);
5808
0
            if ( !*dst ) return -4;     // could not alloc
5809
0
            *ndst = n;
5810
0
        }
5811
0
        memcpy(*dst,fmt->p,n);
5812
0
        return n;
5813
0
    }
5814
5815
    // Make sure the buffer is big enough
5816
0
    int nsmpl = bcf_hdr_nsamples(hdr);
5817
0
    int size1 = type==BCF_HT_INT ? sizeof(int32_t) : sizeof(float);
5818
0
    if ( *ndst < fmt->n*nsmpl )
5819
0
    {
5820
0
        *ndst = fmt->n*nsmpl;
5821
0
        *dst  = realloc(*dst, *ndst*size1);
5822
0
        if ( !*dst ) return -4;     // could not alloc
5823
0
    }
5824
5825
0
    #define BRANCH(type_t, convert, is_missing, is_vector_end, set_missing, set_vector_end, set_regular, out_type_t) { \
5826
0
        out_type_t *tmp = (out_type_t *) *dst; \
5827
0
        uint8_t *fmt_p = fmt->p; \
5828
0
        for (i=0; i<nsmpl; i++) \
5829
0
        { \
5830
0
            for (j=0; j<fmt->n; j++) \
5831
0
            { \
5832
0
                type_t p = convert(fmt_p + j * sizeof(type_t)); \
5833
0
                if ( is_missing ) set_missing; \
5834
0
                else if ( is_vector_end ) { set_vector_end; break; } \
5835
0
                else set_regular; \
5836
0
                tmp++; \
5837
0
            } \
5838
0
            for (; j<fmt->n; j++) { set_vector_end; tmp++; } \
5839
0
            fmt_p += fmt->size; \
5840
0
        } \
5841
0
    }
5842
0
    switch (fmt->type) {
5843
0
        case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8, p==bcf_int8_missing,  p==bcf_int8_vector_end,  *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
5844
0
        case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
5845
0
        case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
5846
0
        case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set_vector_end(*tmp), bcf_float_set(tmp, p), float); break;
5847
0
        default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, fmt->type, bcf_seqname_safe(hdr,line), line->pos+1); exit(1);
5848
0
    }
5849
0
    #undef BRANCH
5850
0
    return nsmpl*fmt->n;
5851
0
}
5852
5853
//error description structure definition
5854
typedef struct err_desc {
5855
    int  errorcode;
5856
    const char *description;
5857
}err_desc;
5858
5859
// error descriptions
5860
static const err_desc errdesc_bcf[] = {
5861
    { BCF_ERR_CTG_UNDEF, "Contig not defined in header"},
5862
    { BCF_ERR_TAG_UNDEF, "Tag not defined in header" },
5863
    { BCF_ERR_NCOLS, "Incorrect number of columns" },
5864
    { BCF_ERR_LIMITS, "Limits reached" },
5865
    { BCF_ERR_CHAR, "Invalid character" },
5866
    { BCF_ERR_CTG_INVALID, "Invalid contig" },
5867
    { BCF_ERR_TAG_INVALID, "Invalid tag" },
5868
};
5869
5870
/// append given description to buffer based on available size and add ... when not enough space
5871
    /** @param buffer       buffer to which description to be appended
5872
        @param offset       offset at which to be appended
5873
        @param maxbuffer    maximum size of the buffer
5874
        @param description  the description to be appended
5875
on failure returns -1 - when buffer is not big enough; returns -1 on invalid params and on too small buffer which are improbable due to validation at caller site
5876
on success returns 0
5877
    */
5878
3.52k
static int add_desc_to_buffer(char *buffer, size_t *offset, size_t maxbuffer, const char *description) {
5879
5880
3.52k
    if (!description || !buffer || !offset || (maxbuffer < 4))
5881
0
        return -1;
5882
5883
3.52k
    size_t rembuffer = maxbuffer - *offset;
5884
3.52k
    if (rembuffer > (strlen(description) + (rembuffer == maxbuffer ? 0 : 1))) {    //add description with optionally required ','
5885
3.52k
        *offset += snprintf(buffer + *offset, rembuffer, "%s%s", (rembuffer == maxbuffer)? "": ",", description);
5886
3.52k
    } else {    //not enough space for description, put ...
5887
0
        size_t tmppos = (rembuffer <= 4) ? maxbuffer - 4 : *offset;
5888
0
        snprintf(buffer + tmppos, 4, "...");    //ignore offset update
5889
0
        return -1;
5890
0
    }
5891
3.52k
    return 0;
5892
3.52k
}
5893
5894
//get description for given error code. return NULL on error
5895
1.76k
const char *bcf_strerror(int errorcode, char *buffer, size_t maxbuffer) {
5896
1.76k
    size_t usedup = 0;
5897
1.76k
    int ret = 0;
5898
1.76k
    int idx;
5899
5900
1.76k
    if (!buffer || maxbuffer < 4)
5901
0
        return NULL;           //invalid / insufficient buffer
5902
5903
1.76k
    if (!errorcode) {
5904
0
        buffer[0] = '\0';      //no error, set null
5905
0
        return buffer;
5906
0
    }
5907
5908
14.1k
    for (idx = 0; idx < sizeof(errdesc_bcf) / sizeof(err_desc); ++idx) {
5909
12.3k
        if (errorcode & errdesc_bcf[idx].errorcode) {    //error is set, add description
5910
3.52k
            ret = add_desc_to_buffer(buffer, &usedup, maxbuffer, errdesc_bcf[idx].description);
5911
3.52k
            if (ret < 0)
5912
0
                break;         //not enough space, ... added, no need to continue
5913
5914
3.52k
            errorcode &= ~errdesc_bcf[idx].errorcode;    //reset the error
5915
3.52k
        }
5916
12.3k
    }
5917
5918
1.76k
    if (errorcode && (ret >= 0))  {     //undescribed error is present in error code and had enough buffer, try to add unkonwn error as well§
5919
0
        add_desc_to_buffer(buffer, &usedup, maxbuffer, "Unknown error");
5920
0
    }
5921
1.76k
    return buffer;
5922
1.76k
}
5923