Coverage Report

Created: 2023-06-07 06:43

/src/htslib/vcf.c
Line
Count
Source (jump to first uncovered line)
1
/*  vcf.c -- VCF/BCF API functions.
2
3
    Copyright (C) 2012, 2013 Broad Institute.
4
    Copyright (C) 2012-2023 Genome Research Ltd.
5
    Portions copyright (C) 2014 Intel Corporation.
6
7
    Author: Heng Li <lh3@sanger.ac.uk>
8
9
Permission is hereby granted, free of charge, to any person obtaining a copy
10
of this software and associated documentation files (the "Software"), to deal
11
in the Software without restriction, including without limitation the rights
12
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13
copies of the Software, and to permit persons to whom the Software is
14
furnished to do so, subject to the following conditions:
15
16
The above copyright notice and this permission notice shall be included in
17
all copies or substantial portions of the Software.
18
19
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25
DEALINGS IN THE SOFTWARE.  */
26
27
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
28
#include <config.h>
29
30
#include <stdio.h>
31
#include <assert.h>
32
#include <string.h>
33
#include <strings.h>
34
#include <stdlib.h>
35
#include <limits.h>
36
#include <stdint.h>
37
#include <inttypes.h>
38
#include <errno.h>
39
40
#include "htslib/vcf.h"
41
#include "htslib/bgzf.h"
42
#include "htslib/tbx.h"
43
#include "htslib/hfile.h"
44
#include "hts_internal.h"
45
#include "htslib/hts_endian.h"
46
#include "htslib/khash_str2int.h"
47
#include "htslib/kstring.h"
48
#include "htslib/sam.h"
49
50
#include "htslib/khash.h"
51
KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t)
52
typedef khash_t(vdict) vdict_t;
53
54
KHASH_MAP_INIT_STR(hdict, bcf_hrec_t*)
55
typedef khash_t(hdict) hdict_t;
56
57
58
#include "htslib/kseq.h"
59
HTSLIB_EXPORT
60
uint32_t bcf_float_missing    = 0x7F800001;
61
62
HTSLIB_EXPORT
63
uint32_t bcf_float_vector_end = 0x7F800002;
64
65
HTSLIB_EXPORT
66
uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
67
68
static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, NULL, NULL}, .id = -1 };
69
70
/*
71
    Partial support for 64-bit POS and Number=1 INFO tags.
72
    Notes:
73
     - the support for 64-bit values is motivated by POS and INFO/END for large genomes
74
     - the use of 64-bit values does not conform to the specification
75
     - cannot output 64-bit BCF and if it does, it is not compatible with anything
76
     - experimental, use at your risk
77
*/
78
#ifdef VCF_ALLOW_INT64
79
    #define BCF_MAX_BT_INT64 (0x7fffffffffffffff)       /* INT64_MAX, for internal use only */
80
    #define BCF_MIN_BT_INT64 -9223372036854775800LL     /* INT64_MIN + 8, for internal use only */
81
#endif
82
83
807
#define BCF_IS_64BIT (1<<30)
84
85
86
// Opaque structure with auxilary data which allows to extend bcf_hdr_t without breaking ABI.
87
// Note that this preserving API and ABI requires that the first element is vdict_t struct
88
// rather than a pointer, as user programs may (and in some cases do) access the dictionary
89
// directly as (vdict_t*)hdr->dict.
90
typedef struct
91
{
92
    vdict_t dict;   // bcf_hdr_t.dict[0] vdict_t dictionary which keeps bcf_idinfo_t for BCF_HL_FLT,BCF_HL_INFO,BCF_HL_FMT
93
    hdict_t *gen;   // hdict_t dictionary which keeps bcf_hrec_t* pointers for generic and structured fields
94
}
95
bcf_hdr_aux_t;
96
97
static inline bcf_hdr_aux_t *get_hdr_aux(const bcf_hdr_t *hdr)
98
48.1k
{
99
48.1k
    return (bcf_hdr_aux_t *)hdr->dict[0];
100
48.1k
}
101
102
static char *find_chrom_header_line(char *s)
103
0
{
104
0
    char *nl;
105
0
    if (strncmp(s, "#CHROM\t", 7) == 0) return s;
106
0
    else if ((nl = strstr(s, "\n#CHROM\t")) != NULL) return nl+1;
107
0
    else return NULL;
108
0
}
109
110
/*************************
111
 *** VCF header parser ***
112
 *************************/
113
114
static int bcf_hdr_add_sample_len(bcf_hdr_t *h, const char *s, size_t len)
115
8.89k
{
116
8.89k
    const char *ss = s;
117
9.09k
    while ( *ss && isspace_c(*ss) && ss - s < len) ss++;
118
8.89k
    if ( !*ss || ss - s == len)
119
1
    {
120
1
        hts_log_error("Empty sample name: trailing spaces/tabs in the header line?");
121
1
        return -1;
122
1
    }
123
124
8.89k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_SAMPLE];
125
8.89k
    int ret;
126
8.89k
    char *sdup = malloc(len + 1);
127
8.89k
    if (!sdup) return -1;
128
8.89k
    memcpy(sdup, s, len);
129
8.89k
    sdup[len] = 0;
130
131
    // Ensure space is available in h->samples
132
8.89k
    size_t n = kh_size(d);
133
8.89k
    char **new_samples = realloc(h->samples, sizeof(char*) * (n + 1));
134
8.89k
    if (!new_samples) {
135
0
        free(sdup);
136
0
        return -1;
137
0
    }
138
8.89k
    h->samples = new_samples;
139
140
8.89k
    int k = kh_put(vdict, d, sdup, &ret);
141
8.89k
    if (ret < 0) {
142
0
        free(sdup);
143
0
        return -1;
144
0
    }
145
8.89k
    if (ret) { // absent
146
8.89k
        kh_val(d, k) = bcf_idinfo_def;
147
8.89k
        kh_val(d, k).id = n;
148
8.89k
    } else {
149
0
        hts_log_error("Duplicated sample name '%s'", sdup);
150
0
        free(sdup);
151
0
        return -1;
152
0
    }
153
8.89k
    h->samples[n] = sdup;
154
8.89k
    h->dirty = 1;
155
8.89k
    return 0;
156
8.89k
}
157
158
int bcf_hdr_add_sample(bcf_hdr_t *h, const char *s)
159
0
{
160
0
    if (!s) {
161
        // Allowed for backwards-compatibility, calling with s == NULL
162
        // used to trigger bcf_hdr_sync(h);
163
0
        return 0;
164
0
    }
165
0
    return bcf_hdr_add_sample_len(h, s, strlen(s));
166
0
}
167
168
int HTS_RESULT_USED bcf_hdr_parse_sample_line(bcf_hdr_t *hdr, const char *str)
169
1.51k
{
170
1.51k
    const char *mandatory = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO";
171
1.51k
    if ( strncmp(str,mandatory,strlen(mandatory)) )
172
2
    {
173
2
        hts_log_error("Could not parse the \"#CHROM..\" line, either the fields are incorrect or spaces are present instead of tabs:\n\t%s",str);
174
2
        return -1;
175
2
    }
176
177
1.51k
    const char *beg = str + strlen(mandatory), *end;
178
1.51k
    if ( !*beg || *beg=='\n' ) return 0;
179
380
    if ( strncmp(beg,"\tFORMAT\t",8) )
180
0
    {
181
0
        hts_log_error("Could not parse the \"#CHROM..\" line, either FORMAT is missing or spaces are present instead of tabs:\n\t%s",str);
182
0
        return -1;
183
0
    }
184
380
    beg += 8;
185
186
380
    int ret = 0;
187
8.90k
    while ( *beg )
188
8.89k
    {
189
8.89k
        end = beg;
190
35.8M
        while ( *end && *end!='\t' && *end!='\n' ) end++;
191
8.89k
        if ( bcf_hdr_add_sample_len(hdr, beg, end-beg) < 0 ) ret = -1;
192
8.89k
        if ( !*end || *end=='\n' || ret<0 ) break;
193
8.52k
        beg = end + 1;
194
8.52k
    }
195
380
    return ret;
196
380
}
197
198
int bcf_hdr_sync(bcf_hdr_t *h)
199
19.6k
{
200
19.6k
    int i;
201
78.6k
    for (i = 0; i < 3; i++)
202
58.9k
    {
203
58.9k
        vdict_t *d = (vdict_t*)h->dict[i];
204
58.9k
        khint_t k;
205
58.9k
        if ( h->n[i] < kh_size(d) )
206
379
        {
207
379
            bcf_idpair_t *new_idpair;
208
            // this should be true only for i=2, BCF_DT_SAMPLE
209
379
            new_idpair = (bcf_idpair_t*) realloc(h->id[i], kh_size(d)*sizeof(bcf_idpair_t));
210
379
            if (!new_idpair) return -1;
211
379
            h->n[i] = kh_size(d);
212
379
            h->id[i] = new_idpair;
213
379
        }
214
3.07M
        for (k=kh_begin(d); k<kh_end(d); k++)
215
3.02M
        {
216
3.02M
            if (!kh_exist(d,k)) continue;
217
1.59M
            h->id[i][kh_val(d,k).id].key = kh_key(d,k);
218
1.59M
            h->id[i][kh_val(d,k).id].val = &kh_val(d,k);
219
1.59M
        }
220
58.9k
    }
221
19.6k
    h->dirty = 0;
222
19.6k
    return 0;
223
19.6k
}
224
225
void bcf_hrec_destroy(bcf_hrec_t *hrec)
226
51.5k
{
227
51.5k
    if (!hrec) return;
228
50.1k
    free(hrec->key);
229
50.1k
    if ( hrec->value ) free(hrec->value);
230
50.1k
    int i;
231
153k
    for (i=0; i<hrec->nkeys; i++)
232
103k
    {
233
103k
        free(hrec->keys[i]);
234
103k
        free(hrec->vals[i]);
235
103k
    }
236
50.1k
    free(hrec->keys);
237
50.1k
    free(hrec->vals);
238
50.1k
    free(hrec);
239
50.1k
}
240
241
// Copies all fields except IDX.
242
bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec)
243
0
{
244
0
    int save_errno;
245
0
    bcf_hrec_t *out = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
246
0
    if (!out) return NULL;
247
248
0
    out->type = hrec->type;
249
0
    if ( hrec->key ) {
250
0
        out->key = strdup(hrec->key);
251
0
        if (!out->key) goto fail;
252
0
    }
253
0
    if ( hrec->value ) {
254
0
        out->value = strdup(hrec->value);
255
0
        if (!out->value) goto fail;
256
0
    }
257
0
    out->nkeys = hrec->nkeys;
258
0
    out->keys = (char**) malloc(sizeof(char*)*hrec->nkeys);
259
0
    if (!out->keys) goto fail;
260
0
    out->vals = (char**) malloc(sizeof(char*)*hrec->nkeys);
261
0
    if (!out->vals) goto fail;
262
0
    int i, j = 0;
263
0
    for (i=0; i<hrec->nkeys; i++)
264
0
    {
265
0
        if ( hrec->keys[i] && !strcmp("IDX",hrec->keys[i]) ) continue;
266
0
        if ( hrec->keys[i] ) {
267
0
            out->keys[j] = strdup(hrec->keys[i]);
268
0
            if (!out->keys[j]) goto fail;
269
0
        }
270
0
        if ( hrec->vals[i] ) {
271
0
            out->vals[j] = strdup(hrec->vals[i]);
272
0
            if (!out->vals[j]) goto fail;
273
0
        }
274
0
        j++;
275
0
    }
276
0
    if ( i!=j ) out->nkeys -= i-j;   // IDX was omitted
277
0
    return out;
278
279
0
 fail:
280
0
    save_errno = errno;
281
0
    hts_log_error("%s", strerror(errno));
282
0
    bcf_hrec_destroy(out);
283
0
    errno = save_errno;
284
0
    return NULL;
285
0
}
286
287
void bcf_hrec_debug(FILE *fp, bcf_hrec_t *hrec)
288
0
{
289
0
    fprintf(fp, "key=[%s] value=[%s]", hrec->key, hrec->value?hrec->value:"");
290
0
    int i;
291
0
    for (i=0; i<hrec->nkeys; i++)
292
0
        fprintf(fp, "\t[%s]=[%s]", hrec->keys[i],hrec->vals[i]);
293
0
    fprintf(fp, "\n");
294
0
}
295
296
void bcf_header_debug(bcf_hdr_t *hdr)
297
0
{
298
0
    int i, j;
299
0
    for (i=0; i<hdr->nhrec; i++)
300
0
    {
301
0
        if ( !hdr->hrec[i]->value )
302
0
        {
303
0
            fprintf(stderr, "##%s=<", hdr->hrec[i]->key);
304
0
            fprintf(stderr,"%s=%s", hdr->hrec[i]->keys[0], hdr->hrec[i]->vals[0]);
305
0
            for (j=1; j<hdr->hrec[i]->nkeys; j++)
306
0
                fprintf(stderr,",%s=%s", hdr->hrec[i]->keys[j], hdr->hrec[i]->vals[j]);
307
0
            fprintf(stderr,">\n");
308
0
        }
309
0
        else
310
0
            fprintf(stderr,"##%s=%s\n", hdr->hrec[i]->key,hdr->hrec[i]->value);
311
0
    }
312
0
}
313
314
int bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, size_t len)
315
81.3k
{
316
81.3k
    char **tmp;
317
81.3k
    size_t n = hrec->nkeys + 1;
318
81.3k
    assert(len > 0 && len < SIZE_MAX);
319
81.3k
    tmp = realloc(hrec->keys, sizeof(char*)*n);
320
81.3k
    if (!tmp) return -1;
321
81.3k
    hrec->keys = tmp;
322
81.3k
    tmp = realloc(hrec->vals, sizeof(char*)*n);
323
81.3k
    if (!tmp) return -1;
324
81.3k
    hrec->vals = tmp;
325
326
81.3k
    hrec->keys[hrec->nkeys] = (char*) malloc((len+1)*sizeof(char));
327
81.3k
    if (!hrec->keys[hrec->nkeys]) return -1;
328
81.3k
    memcpy(hrec->keys[hrec->nkeys],str,len);
329
81.3k
    hrec->keys[hrec->nkeys][len] = 0;
330
81.3k
    hrec->vals[hrec->nkeys] = NULL;
331
81.3k
    hrec->nkeys = n;
332
81.3k
    return 0;
333
81.3k
}
334
335
int bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, size_t len, int is_quoted)
336
81.3k
{
337
81.3k
    if ( hrec->vals[i] ) {
338
0
        free(hrec->vals[i]);
339
0
        hrec->vals[i] = NULL;
340
0
    }
341
81.3k
    if ( !str ) return 0;
342
81.3k
    if ( is_quoted )
343
18.5k
    {
344
18.5k
        if (len >= SIZE_MAX - 3) {
345
0
            errno = ENOMEM;
346
0
            return -1;
347
0
        }
348
18.5k
        hrec->vals[i] = (char*) malloc((len+3)*sizeof(char));
349
18.5k
        if (!hrec->vals[i]) return -1;
350
18.5k
        hrec->vals[i][0] = '"';
351
18.5k
        memcpy(&hrec->vals[i][1],str,len);
352
18.5k
        hrec->vals[i][len+1] = '"';
353
18.5k
        hrec->vals[i][len+2] = 0;
354
18.5k
    }
355
62.7k
    else
356
62.7k
    {
357
62.7k
        if (len == SIZE_MAX) {
358
0
            errno = ENOMEM;
359
0
            return -1;
360
0
        }
361
62.7k
        hrec->vals[i] = (char*) malloc((len+1)*sizeof(char));
362
62.7k
        if (!hrec->vals[i]) return -1;
363
62.7k
        memcpy(hrec->vals[i],str,len);
364
62.7k
        hrec->vals[i][len] = 0;
365
62.7k
    }
366
81.3k
    return 0;
367
81.3k
}
368
369
int hrec_add_idx(bcf_hrec_t *hrec, int idx)
370
21.6k
{
371
21.6k
    int n = hrec->nkeys + 1;
372
21.6k
    char **tmp = (char**) realloc(hrec->keys, sizeof(char*)*n);
373
21.6k
    if (!tmp) return -1;
374
21.6k
    hrec->keys = tmp;
375
376
21.6k
    tmp = (char**) realloc(hrec->vals, sizeof(char*)*n);
377
21.6k
    if (!tmp) return -1;
378
21.6k
    hrec->vals = tmp;
379
380
21.6k
    hrec->keys[hrec->nkeys] = strdup("IDX");
381
21.6k
    if (!hrec->keys[hrec->nkeys]) return -1;
382
383
21.6k
    kstring_t str = {0,0,0};
384
21.6k
    if (kputw(idx, &str) < 0) {
385
0
        free(hrec->keys[hrec->nkeys]);
386
0
        return -1;
387
0
    }
388
21.6k
    hrec->vals[hrec->nkeys] = str.s;
389
21.6k
    hrec->nkeys = n;
390
21.6k
    return 0;
391
21.6k
}
392
393
int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key)
394
63.9k
{
395
63.9k
    int i;
396
109k
    for (i=0; i<hrec->nkeys; i++)
397
84.0k
        if ( !strcasecmp(key,hrec->keys[i]) ) return i;
398
25.4k
    return -1;
399
63.9k
}
400
401
static void bcf_hrec_set_type(bcf_hrec_t *hrec)
402
92.8k
{
403
92.8k
    if ( !strcmp(hrec->key, "contig") ) hrec->type = BCF_HL_CTG;
404
69.9k
    else if ( !strcmp(hrec->key, "INFO") ) hrec->type = BCF_HL_INFO;
405
48.6k
    else if ( !strcmp(hrec->key, "FILTER") ) hrec->type = BCF_HL_FLT;
406
37.1k
    else if ( !strcmp(hrec->key, "FORMAT") ) hrec->type = BCF_HL_FMT;
407
25.3k
    else if ( hrec->nkeys>0 ) hrec->type = BCF_HL_STR;
408
18.0k
    else hrec->type = BCF_HL_GEN;
409
92.8k
}
410
411
412
/**
413
    The arrays were generated with
414
415
    valid_ctg:
416
        perl -le '@v = (split(//,q[!#$%&*+./:;=?@^_|~-]),"a"..."z","A"..."Z","0"..."9"); @a = (0) x 256; foreach $c (@v) { $a[ord($c)] = 1; } print join(", ",@a)' | fold -w 48
417
418
    valid_tag:
419
        perl -le '@v = (split(//,q[_.]),"a"..."z","A"..."Z","0"..."9"); @a = (0) x 256; foreach $c (@v) { $a[ord($c)] = 1; } print join(", ",@a)' | fold -w 48
420
*/
421
static const uint8_t valid_ctg[256] =
422
{
423
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
424
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
425
    0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1,
426
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
427
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
428
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
429
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
430
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,
431
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
432
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
433
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
434
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
435
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
436
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
437
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
438
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
439
};
440
static const uint8_t valid_tag[256] =
441
{
442
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
443
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
444
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
445
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
446
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
447
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
448
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
449
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
450
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
451
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
452
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
453
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
454
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
455
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
456
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
457
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
458
};
459
460
/**
461
    bcf_hrec_check() - check the validity of structured header lines
462
463
    Returns 0 on success or negative value on error.
464
465
    Currently the return status is not checked by the caller
466
    and only a warning is printed on stderr. This should be improved
467
    to propagate the error all the way up to the caller and let it
468
    decide what to do: throw an error or proceed anyway.
469
 */
470
static int bcf_hrec_check(bcf_hrec_t *hrec)
471
46.4k
{
472
46.4k
    int i;
473
46.4k
    bcf_hrec_set_type(hrec);
474
475
46.4k
    if ( hrec->type==BCF_HL_CTG )
476
11.4k
    {
477
11.4k
        i = bcf_hrec_find_key(hrec,"ID");
478
11.4k
        if ( i<0 ) goto err_missing_id;
479
10.4k
        char *val = hrec->vals[i];
480
10.4k
        if ( val[0]=='*' || val[0]=='=' || !valid_ctg[(uint8_t)val[0]] ) goto err_invalid_ctg;
481
113k
        while ( *(++val) )
482
110k
            if ( !valid_ctg[(uint8_t)*val] ) goto err_invalid_ctg;
483
3.07k
        return 0;
484
4.28k
    }
485
34.9k
    if ( hrec->type==BCF_HL_INFO )
486
10.6k
    {
487
10.6k
        i = bcf_hrec_find_key(hrec,"ID");
488
10.6k
        if ( i<0 ) goto err_missing_id;
489
7.35k
        char *val = hrec->vals[i];
490
7.35k
        if ( !strcmp(val,"1000G") ) return 0;
491
7.21k
        if ( val[0]=='.' || (val[0]>='0' && val[0]<='9') || !valid_tag[(uint8_t)val[0]] ) goto err_invalid_tag;
492
145k
        while ( *(++val) )
493
144k
            if ( !valid_tag[(uint8_t)*val] ) goto err_invalid_tag;
494
942
        return 0;
495
1.90k
    }
496
24.3k
    if ( hrec->type==BCF_HL_FMT )
497
5.87k
    {
498
5.87k
        i = bcf_hrec_find_key(hrec,"ID");
499
5.87k
        if ( i<0 ) goto err_missing_id;
500
5.09k
        char *val = hrec->vals[i];
501
5.09k
        if ( val[0]=='.' || (val[0]>='0' && val[0]<='9') || !valid_tag[(uint8_t)val[0]] ) goto err_invalid_tag;
502
242k
        while ( *(++val) )
503
241k
            if ( !valid_tag[(uint8_t)*val] ) goto err_invalid_tag;
504
1.13k
        return 0;
505
2.07k
    }
506
18.4k
    return 0;
507
508
5.09k
  err_missing_id:
509
5.09k
    hts_log_warning("Missing ID attribute in one or more header lines");
510
5.09k
    return -1;
511
512
7.33k
  err_invalid_ctg:
513
7.33k
    hts_log_warning("Invalid contig name: \"%s\"", hrec->vals[i]);
514
7.33k
    return -1;
515
516
10.2k
  err_invalid_tag:
517
10.2k
    hts_log_warning("Invalid tag name: \"%s\"", hrec->vals[i]);
518
10.2k
    return -1;
519
24.3k
}
520
521
static inline int is_escaped(const char *min, const char *str)
522
16.7k
{
523
16.7k
    int n = 0;
524
16.7k
    while ( --str>=min && *str=='\\' ) n++;
525
16.7k
    return n%2;
526
16.7k
}
527
528
bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len)
529
53.0k
{
530
53.0k
    bcf_hrec_t *hrec = NULL;
531
53.0k
    const char *p = line;
532
53.0k
    if (p[0] != '#' || p[1] != '#') { *len = 0; return NULL; }
533
51.1k
    p += 2;
534
535
51.1k
    const char *q = p;
536
843k
    while ( *q && *q!='=' && *q != '\n' ) q++;
537
51.1k
    ptrdiff_t n = q-p;
538
51.1k
    if ( *q!='=' || !n ) // wrong format
539
1.04k
        goto malformed_line;
540
541
50.1k
    hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
542
50.1k
    if (!hrec) { *len = -1; return NULL; }
543
50.1k
    hrec->key = (char*) malloc(sizeof(char)*(n+1));
544
50.1k
    if (!hrec->key) goto fail;
545
50.1k
    memcpy(hrec->key,p,n);
546
50.1k
    hrec->key[n] = 0;
547
50.1k
    hrec->type = -1;
548
549
50.1k
    p = ++q;
550
50.1k
    if ( *p!='<' ) // generic field, e.g. ##samtoolsVersion=0.1.18-r579
551
9.70k
    {
552
179M
        while ( *q && *q!='\n' ) q++;
553
9.70k
        hrec->value = (char*) malloc((q-p+1)*sizeof(char));
554
9.70k
        if (!hrec->value) goto fail;
555
9.70k
        memcpy(hrec->value, p, q-p);
556
9.70k
        hrec->value[q-p] = 0;
557
9.70k
        *len = q - line + (*q ? 1 : 0); // Skip \n but not \0
558
9.70k
        return hrec;
559
9.70k
    }
560
561
    // structured line, e.g.
562
    // ##INFO=<ID=PV1,Number=1,Type=Float,Description="P-value for baseQ bias">
563
    // ##PEDIGREE=<Name_0=G0-ID,Name_1=G1-ID,Name_3=GN-ID>
564
40.4k
    int nopen = 1;
565
121k
    while ( *q && *q!='\n' && nopen>0 )
566
85.0k
    {
567
85.0k
        p = ++q;
568
85.0k
        while ( *q && *q==' ' ) { p++; q++; }
569
        // ^[A-Za-z_][0-9A-Za-z_.]*$
570
85.0k
        if (p==q && *q && (isalpha_c(*q) || *q=='_'))
571
83.7k
        {
572
83.7k
            q++;
573
416k
            while ( *q && (isalnum_c(*q) || *q=='_' || *q=='.') ) q++;
574
83.7k
        }
575
85.0k
        n = q-p;
576
85.0k
        int m = 0;
577
85.0k
        while ( *q && *q==' ' ) { q++; m++; }
578
85.0k
        if ( *q!='=' || !n )
579
3.70k
            goto malformed_line;
580
581
81.3k
        if (bcf_hrec_add_key(hrec, p, q-p-m) < 0) goto fail;
582
81.3k
        p = ++q;
583
81.3k
        while ( *q && *q==' ' ) { p++; q++; }
584
585
81.3k
        int quoted = 0;
586
81.3k
        char ending = '\0';
587
81.3k
        switch (*p) {
588
18.5k
        case '"':
589
18.5k
            quoted = 1;
590
18.5k
            ending = '"';
591
18.5k
            p++;
592
18.5k
            break;
593
132
        case '[':
594
132
            quoted = 1;
595
132
            ending = ']';
596
132
            break;
597
81.3k
        }
598
81.3k
        if ( quoted ) q++;
599
1.17G
        while ( *q && *q != '\n' )
600
1.17G
        {
601
1.17G
            if ( quoted ) { if ( *q==ending && !is_escaped(p,q) ) break; }
602
1.17G
            else
603
1.17G
            {
604
1.17G
                if ( *q=='<' ) nopen++;
605
1.17G
                if ( *q=='>' ) nopen--;
606
1.17G
                if ( !nopen ) break;
607
1.17G
                if ( *q==',' && nopen==1 ) break;
608
1.17G
            }
609
1.17G
            q++;
610
1.17G
        }
611
81.3k
        const char *r = q;
612
81.3k
        if (quoted && ending == ']') {
613
132
            if (*q == ending) {
614
130
                r++;
615
130
                q++;
616
130
                quoted = 0;
617
130
            } else {
618
2
                char buffer[320];
619
2
                hts_log_error("Missing ']' in header line %s",
620
2
                              hts_strprint(buffer, sizeof(buffer), '"',
621
2
                                           line, q-line));
622
2
                goto fail;
623
2
            }
624
132
        }
625
81.3k
        while ( r > p && r[-1] == ' ' ) r--;
626
81.3k
        if (bcf_hrec_set_val(hrec, hrec->nkeys-1, p, r-p, quoted) < 0)
627
0
            goto fail;
628
81.3k
        if ( quoted && *q==ending ) q++;
629
81.3k
        if ( *q=='>' )
630
20.5k
        {
631
20.5k
            if (nopen) nopen--;     // this can happen with nested angle brackets <>
632
20.5k
            q++;
633
20.5k
        }
634
81.3k
    }
635
36.7k
    if ( nopen )
636
16.1k
        hts_log_warning("Incomplete header line, trying to proceed anyway:\n\t[%s]\n\t[%d]",line,q[0]);
637
638
    // Skip to end of line
639
36.7k
    int nonspace = 0;
640
36.7k
    p = q;
641
5.32M
    while ( *q && *q!='\n' ) { nonspace |= !isspace_c(*q); q++; }
642
36.7k
    if (nonspace) {
643
666
        char buffer[320];
644
666
        hts_log_warning("Dropped trailing junk from header line '%s'",
645
666
                        hts_strprint(buffer, sizeof(buffer),
646
666
                                     '"', line, q - line));
647
666
    }
648
649
36.7k
    *len = q - line + (*q ? 1 : 0);
650
36.7k
    return hrec;
651
652
2
 fail:
653
2
    *len = -1;
654
2
    bcf_hrec_destroy(hrec);
655
2
    return NULL;
656
657
4.75k
 malformed_line:
658
4.75k
    {
659
4.75k
        char buffer[320];
660
44.3M
        while ( *q && *q!='\n' ) q++;  // Ensure *len includes full line
661
4.75k
        hts_log_error("Could not parse the header line: %s",
662
4.75k
                      hts_strprint(buffer, sizeof(buffer),
663
4.75k
                                   '"', line, q - line));
664
4.75k
        *len = q - line + (*q ? 1 : 0);
665
4.75k
        bcf_hrec_destroy(hrec);
666
4.75k
        return NULL;
667
40.4k
    }
668
40.4k
}
669
670
static int bcf_hdr_set_idx(bcf_hdr_t *hdr, int dict_type, const char *tag, bcf_idinfo_t *idinfo)
671
21.1k
{
672
21.1k
    size_t new_n;
673
674
    // If available, preserve existing IDX
675
21.1k
    if ( idinfo->id==-1 )
676
21.1k
        idinfo->id = hdr->n[dict_type];
677
44
    else if ( idinfo->id < hdr->n[dict_type] && hdr->id[dict_type][idinfo->id].key )
678
0
    {
679
0
        hts_log_error("Conflicting IDX=%d lines in the header dictionary, the new tag is %s",
680
0
            idinfo->id, tag);
681
0
        errno = EINVAL;
682
0
        return -1;
683
0
    }
684
685
21.1k
    new_n = idinfo->id >= hdr->n[dict_type] ? idinfo->id+1 : hdr->n[dict_type];
686
21.1k
    if (hts_resize(bcf_idpair_t, new_n, &hdr->m[dict_type],
687
21.1k
                   &hdr->id[dict_type], HTS_RESIZE_CLEAR)) {
688
0
        return -1;
689
0
    }
690
21.1k
    hdr->n[dict_type] = new_n;
691
692
    // NB: the next kh_put call can invalidate the idinfo pointer, therefore
693
    // we leave it unassigned here. It must be set explicitly in bcf_hdr_sync.
694
21.1k
    hdr->id[dict_type][idinfo->id].key = tag;
695
696
21.1k
    return 0;
697
21.1k
}
698
699
// returns: 1 when hdr needs to be synced, -1 on error, 0 otherwise
700
static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
701
46.4k
{
702
    // contig
703
46.4k
    int i, ret, replacing = 0;
704
46.4k
    khint_t k;
705
46.4k
    char *str = NULL;
706
707
46.4k
    bcf_hrec_set_type(hrec);
708
709
46.4k
    if ( hrec->type==BCF_HL_CTG )
710
11.4k
    {
711
11.4k
        hts_pos_t len = 0;
712
713
        // Get the contig ID ($str) and length ($j)
714
11.4k
        i = bcf_hrec_find_key(hrec,"length");
715
11.4k
        if ( i<0 ) len = 0;
716
418
        else {
717
418
            char *end = hrec->vals[i];
718
418
            len = strtoll(hrec->vals[i], &end, 10);
719
418
            if (end == hrec->vals[i] || len < 0) return 0;
720
418
        }
721
722
11.1k
        i = bcf_hrec_find_key(hrec,"ID");
723
11.1k
        if ( i<0 ) return 0;
724
10.4k
        str = strdup(hrec->vals[i]);
725
10.4k
        if (!str) return -1;
726
727
        // Register in the dictionary
728
10.4k
        vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_CTG];
729
10.4k
        khint_t k = kh_get(vdict, d, str);
730
10.4k
        if ( k != kh_end(d) ) { // already present
731
680
            free(str); str=NULL;
732
680
            if (kh_val(d, k).hrec[0] != NULL) // and not removed
733
680
                return 0;
734
0
            replacing = 1;
735
9.72k
        } else {
736
9.72k
            k = kh_put(vdict, d, str, &ret);
737
9.72k
            if (ret < 0) { free(str); return -1; }
738
9.72k
        }
739
740
9.72k
        int idx = bcf_hrec_find_key(hrec,"IDX");
741
9.72k
        if ( idx!=-1 )
742
3.85k
        {
743
3.85k
            char *tmp = hrec->vals[idx];
744
3.85k
            idx = strtol(hrec->vals[idx], &tmp, 10);
745
3.85k
            if ( *tmp || idx < 0 || idx >= INT_MAX - 1)
746
3.85k
            {
747
3.85k
                if (!replacing) {
748
3.85k
                    kh_del(vdict, d, k);
749
3.85k
                    free(str);
750
3.85k
                }
751
3.85k
                hts_log_warning("Error parsing the IDX tag, skipping");
752
3.85k
                return 0;
753
3.85k
            }
754
3.85k
        }
755
756
5.87k
        kh_val(d, k) = bcf_idinfo_def;
757
5.87k
        kh_val(d, k).id = idx;
758
5.87k
        kh_val(d, k).info[0] = len;
759
5.87k
        kh_val(d, k).hrec[0] = hrec;
760
5.87k
        if (bcf_hdr_set_idx(hdr, BCF_DT_CTG, kh_key(d,k), &kh_val(d,k)) < 0) {
761
0
            if (!replacing) {
762
0
                kh_del(vdict, d, k);
763
0
                free(str);
764
0
            }
765
0
            return -1;
766
0
        }
767
5.87k
        if ( idx==-1 ) {
768
5.86k
            if (hrec_add_idx(hrec, kh_val(d,k).id) < 0) {
769
0
               return -1;
770
0
            }
771
5.86k
        }
772
773
5.87k
        return 1;
774
5.87k
    }
775
776
34.9k
    if ( hrec->type==BCF_HL_STR ) return 1;
777
31.3k
    if ( hrec->type!=BCF_HL_INFO && hrec->type!=BCF_HL_FLT && hrec->type!=BCF_HL_FMT ) return 0;
778
779
    // INFO/FILTER/FORMAT
780
22.2k
    char *id = NULL;
781
22.2k
    uint32_t type = UINT32_MAX, var = UINT32_MAX;
782
22.2k
    int num = -1, idx = -1;
783
77.0k
    for (i=0; i<hrec->nkeys; i++)
784
55.6k
    {
785
55.6k
        if ( !strcmp(hrec->keys[i], "ID") ) id = hrec->vals[i];
786
37.8k
        else if ( !strcmp(hrec->keys[i], "IDX") )
787
1.17k
        {
788
1.17k
            char *tmp = hrec->vals[i];
789
1.17k
            idx = strtol(hrec->vals[i], &tmp, 10);
790
1.17k
            if ( *tmp || idx < 0 || idx >= INT_MAX - 1)
791
944
            {
792
944
                hts_log_warning("Error parsing the IDX tag, skipping");
793
944
                return 0;
794
944
            }
795
1.17k
        }
796
36.7k
        else if ( !strcmp(hrec->keys[i], "Type") )
797
12.5k
        {
798
12.5k
            if ( !strcmp(hrec->vals[i], "Integer") ) type = BCF_HT_INT;
799
11.6k
            else if ( !strcmp(hrec->vals[i], "Float") ) type = BCF_HT_REAL;
800
10.6k
            else if ( !strcmp(hrec->vals[i], "String") ) type = BCF_HT_STR;
801
2.21k
            else if ( !strcmp(hrec->vals[i], "Character") ) type = BCF_HT_STR;
802
2.09k
            else if ( !strcmp(hrec->vals[i], "Flag") ) type = BCF_HT_FLAG;
803
1.66k
            else
804
1.66k
            {
805
1.66k
                hts_log_warning("The type \"%s\" is not supported, assuming \"String\"", hrec->vals[i]);
806
1.66k
                type = BCF_HT_STR;
807
1.66k
            }
808
12.5k
        }
809
24.1k
        else if ( !strcmp(hrec->keys[i], "Number") )
810
9.38k
        {
811
9.38k
            if ( !strcmp(hrec->vals[i],"A") ) var = BCF_VL_A;
812
9.25k
            else if ( !strcmp(hrec->vals[i],"R") ) var = BCF_VL_R;
813
9.00k
            else if ( !strcmp(hrec->vals[i],"G") ) var = BCF_VL_G;
814
8.87k
            else if ( !strcmp(hrec->vals[i],".") ) var = BCF_VL_VAR;
815
8.73k
            else
816
8.73k
            {
817
8.73k
                sscanf(hrec->vals[i],"%d",&num);
818
8.73k
                var = BCF_VL_FIXED;
819
8.73k
            }
820
9.38k
            if (var != BCF_VL_FIXED) num = 0xfffff;
821
9.38k
        }
822
55.6k
    }
823
21.3k
    if (hrec->type == BCF_HL_INFO || hrec->type == BCF_HL_FMT) {
824
15.7k
        if (type == -1) {
825
3.12k
            hts_log_warning("%s %s field has no Type defined. Assuming String",
826
3.12k
                *hrec->key == 'I' ? "An" : "A", hrec->key);
827
3.12k
            type = BCF_HT_STR;
828
3.12k
        }
829
15.7k
        if (var == -1) {
830
6.33k
            hts_log_warning("%s %s field has no Number defined. Assuming '.'",
831
6.33k
                *hrec->key == 'I' ? "An" : "A", hrec->key);
832
6.33k
            var = BCF_VL_VAR;
833
6.33k
        }
834
15.7k
        if ( type==BCF_HT_FLAG && (var!=BCF_VL_FIXED || num!=0) )
835
289
        {
836
289
            hts_log_warning("The definition of Flag \"%s/%s\" is invalid, forcing Number=0", hrec->key,id);
837
289
            var = BCF_VL_FIXED;
838
289
            num = 0;
839
289
        }
840
15.7k
    }
841
21.3k
    uint32_t info = ((((uint32_t)num) & 0xfffff)<<12 |
842
21.3k
                     (var & 0xf) << 8 |
843
21.3k
                     (type & 0xf) << 4 |
844
21.3k
                     (((uint32_t) hrec->type) & 0xf));
845
846
21.3k
    if ( !id ) return 0;
847
17.6k
    str = strdup(id);
848
17.6k
    if (!str) return -1;
849
850
17.6k
    vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_ID];
851
17.6k
    k = kh_get(vdict, d, str);
852
17.6k
    if ( k != kh_end(d) )
853
2.35k
    {
854
        // already present
855
2.35k
        free(str);
856
2.35k
        if ( kh_val(d, k).hrec[info&0xf] ) return 0;
857
497
        kh_val(d, k).info[info&0xf] = info;
858
497
        kh_val(d, k).hrec[info&0xf] = hrec;
859
497
        if ( idx==-1 ) {
860
497
            if (hrec_add_idx(hrec, kh_val(d, k).id) < 0) {
861
0
                return -1;
862
0
            }
863
497
        }
864
497
        return 1;
865
497
    }
866
15.3k
    k = kh_put(vdict, d, str, &ret);
867
15.3k
    if (ret < 0) {
868
0
        free(str);
869
0
        return -1;
870
0
    }
871
15.3k
    kh_val(d, k) = bcf_idinfo_def;
872
15.3k
    kh_val(d, k).info[info&0xf] = info;
873
15.3k
    kh_val(d, k).hrec[info&0xf] = hrec;
874
15.3k
    kh_val(d, k).id = idx;
875
15.3k
    if (bcf_hdr_set_idx(hdr, BCF_DT_ID, kh_key(d,k), &kh_val(d,k)) < 0) {
876
0
        kh_del(vdict, d, k);
877
0
        free(str);
878
0
        return -1;
879
0
    }
880
15.3k
    if ( idx==-1 ) {
881
15.2k
        if (hrec_add_idx(hrec, kh_val(d,k).id) < 0) {
882
0
            return -1;
883
0
        }
884
15.2k
    }
885
886
15.3k
    return 1;
887
15.3k
}
888
889
int bcf_hdr_update_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec, const bcf_hrec_t *tmp)
890
0
{
891
    // currently only for bcf_hdr_set_version
892
0
    assert( hrec->type==BCF_HL_GEN );
893
0
    int ret;
894
0
    khint_t k;
895
0
    bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
896
0
    for (k=kh_begin(aux->gen); k<kh_end(aux->gen); k++)
897
0
    {
898
0
        if ( !kh_exist(aux->gen,k) ) continue;
899
0
        if ( hrec!=(bcf_hrec_t*)kh_val(aux->gen,k) ) continue;
900
0
        break;
901
0
    }
902
0
    assert( k<kh_end(aux->gen) );   // something went wrong, should never happen
903
0
    free((char*)kh_key(aux->gen,k));
904
0
    kh_del(hdict,aux->gen,k);
905
0
    kstring_t str = {0,0,0};
906
0
    if ( ksprintf(&str, "##%s=%s", tmp->key,tmp->value) < 0 )
907
0
    {
908
0
        free(str.s);
909
0
        return -1;
910
0
    }
911
0
    k = kh_put(hdict, aux->gen, str.s, &ret);
912
0
    if ( ret<0 )
913
0
    {
914
0
        free(str.s);
915
0
        return -1;
916
0
    }
917
0
    free(hrec->value);
918
0
    hrec->value = strdup(tmp->value);
919
0
    if ( !hrec->value ) return -1;
920
0
    return 0;
921
0
}
922
923
int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
924
46.4k
{
925
46.4k
    kstring_t str = {0,0,0};
926
46.4k
    bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
927
928
46.4k
    int res;
929
46.4k
    if ( !hrec ) return 0;
930
931
46.4k
    bcf_hrec_check(hrec);   // todo: check return status and propagate errors up
932
933
46.4k
    res = bcf_hdr_register_hrec(hdr,hrec);
934
46.4k
    if (res < 0) return -1;
935
46.4k
    if ( !res )
936
21.0k
    {
937
        // If one of the hashed field, then it is already present
938
21.0k
        if ( hrec->type != BCF_HL_GEN )
939
12.0k
        {
940
12.0k
            bcf_hrec_destroy(hrec);
941
12.0k
            return 0;
942
12.0k
        }
943
944
        // Is one of the generic fields and already present?
945
9.04k
        if ( ksprintf(&str, "##%s=%s", hrec->key,hrec->value) < 0 )
946
0
        {
947
0
            free(str.s);
948
0
            return -1;
949
0
        }
950
9.04k
        khint_t k = kh_get(hdict, aux->gen, str.s);
951
9.04k
        if ( k != kh_end(aux->gen) )
952
3.11k
        {
953
            // duplicate record
954
3.11k
            bcf_hrec_destroy(hrec);
955
3.11k
            free(str.s);
956
3.11k
            return 0;
957
3.11k
        }
958
9.04k
    }
959
960
31.2k
    int i;
961
31.2k
    if ( hrec->type==BCF_HL_STR && (i=bcf_hrec_find_key(hrec,"ID"))>=0 )
962
914
    {
963
914
        if ( ksprintf(&str, "##%s=<ID=%s>", hrec->key,hrec->vals[i]) < 0 )
964
0
        {
965
0
            free(str.s);
966
0
            return -1;
967
0
        }
968
914
        khint_t k = kh_get(hdict, aux->gen, str.s);
969
914
        if ( k != kh_end(aux->gen) )
970
564
        {
971
            // duplicate record
972
564
            bcf_hrec_destroy(hrec);
973
564
            free(str.s);
974
564
            return 0;
975
564
        }
976
914
    }
977
30.7k
    if ( str.s )
978
6.28k
    {
979
6.28k
        khint_t k = kh_put(hdict, aux->gen, str.s, &res);
980
6.28k
        if ( res<0 )
981
0
        {
982
0
            bcf_hrec_destroy(hrec);
983
0
            free(str.s);
984
0
            return -1;
985
0
        }
986
6.28k
        kh_val(aux->gen,k) = hrec;
987
6.28k
    }
988
989
    // New record, needs to be added
990
30.7k
    int n = hdr->nhrec + 1;
991
30.7k
    bcf_hrec_t **new_hrec = realloc(hdr->hrec, n*sizeof(bcf_hrec_t*));
992
30.7k
    if (!new_hrec) return -1;
993
30.7k
    hdr->hrec = new_hrec;
994
30.7k
    hdr->hrec[hdr->nhrec] = hrec;
995
30.7k
    hdr->dirty = 1;
996
30.7k
    hdr->nhrec = n;
997
998
30.7k
    return hrec->type==BCF_HL_GEN ? 0 : 1;
999
30.7k
}
1000
1001
bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class)
1002
0
{
1003
0
    int i;
1004
0
    if ( type==BCF_HL_GEN )
1005
0
    {
1006
        // e.g. ##fileformat=VCFv4.2
1007
        //      ##source=GenomicsDBImport
1008
        //      ##bcftools_viewVersion=1.16-80-gdfdb0923+htslib-1.16-34-g215d364
1009
0
        if ( value )
1010
0
        {
1011
0
            kstring_t str = {0,0,0};
1012
0
            ksprintf(&str, "##%s=%s", key,value);
1013
0
            bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1014
0
            khint_t k = kh_get(hdict, aux->gen, str.s);
1015
0
            free(str.s);
1016
0
            if ( k == kh_end(aux->gen) ) return NULL;
1017
0
            return kh_val(aux->gen, k);
1018
0
        }
1019
0
        for (i=0; i<hdr->nhrec; i++)
1020
0
        {
1021
0
            if ( hdr->hrec[i]->type!=type ) continue;
1022
0
            if ( strcmp(hdr->hrec[i]->key,key) ) continue;
1023
0
            return hdr->hrec[i];
1024
0
        }
1025
0
        return NULL;
1026
0
    }
1027
0
    else if ( type==BCF_HL_STR )
1028
0
    {
1029
        // e.g. ##GATKCommandLine=<ID=GenomicsDBImport,CommandLine="GenomicsDBImport....">
1030
        //      ##ALT=<ID=NON_REF,Description="Represents any possible alternative allele not already represented at this location by REF and ALT">
1031
0
        if (!str_class) return NULL;
1032
0
        if ( !strcmp("ID",key) )
1033
0
        {
1034
0
            kstring_t str = {0,0,0};
1035
0
            ksprintf(&str, "##%s=<%s=%s>",str_class,key,value);
1036
0
            bcf_hdr_aux_t *aux = get_hdr_aux(hdr);
1037
0
            khint_t k = kh_get(hdict, aux->gen, str.s);
1038
0
            free(str.s);
1039
0
            if ( k == kh_end(aux->gen) ) return NULL;
1040
0
            return kh_val(aux->gen, k);
1041
0
        }
1042
0
        for (i=0; i<hdr->nhrec; i++)
1043
0
        {
1044
0
            if ( hdr->hrec[i]->type!=type ) continue;
1045
0
            if ( strcmp(hdr->hrec[i]->key,str_class) ) continue;
1046
0
            int j = bcf_hrec_find_key(hdr->hrec[i],key);
1047
0
            if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],value) ) return hdr->hrec[i];
1048
0
        }
1049
0
        return NULL;
1050
0
    }
1051
0
    vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
1052
0
    khint_t k = kh_get(vdict, d, value);
1053
0
    if ( k == kh_end(d) ) return NULL;
1054
0
    return kh_val(d, k).hrec[type==BCF_HL_CTG?0:type];
1055
0
}
1056
1057
void bcf_hdr_check_sanity(bcf_hdr_t *hdr)
1058
1.51k
{
1059
1.51k
    static int PL_warned = 0, GL_warned = 0;
1060
1061
1.51k
    if ( !PL_warned )
1062
1.51k
    {
1063
1.51k
        int id = bcf_hdr_id2int(hdr, BCF_DT_ID, "PL");
1064
1.51k
        if ( bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) && bcf_hdr_id2length(hdr,BCF_HL_FMT,id)!=BCF_VL_G )
1065
0
        {
1066
0
            hts_log_warning("PL should be declared as Number=G");
1067
0
            PL_warned = 1;
1068
0
        }
1069
1.51k
    }
1070
1.51k
    if ( !GL_warned )
1071
1.51k
    {
1072
1.51k
        int id = bcf_hdr_id2int(hdr, BCF_DT_ID, "GL");
1073
1.51k
        if ( bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) && bcf_hdr_id2length(hdr,BCF_HL_FMT,id)!=BCF_VL_G )
1074
0
        {
1075
0
            hts_log_warning("GL should be declared as Number=G");
1076
0
            GL_warned = 1;
1077
0
        }
1078
1.51k
    }
1079
1.51k
}
1080
1081
int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt)
1082
1.64k
{
1083
1.64k
    int len, done = 0;
1084
1.64k
    char *p = htxt;
1085
1086
    // Check sanity: "fileformat" string must come as first
1087
1.64k
    bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,p,&len);
1088
1.64k
    if ( !hrec || !hrec->key || strcasecmp(hrec->key,"fileformat") )
1089
44
        hts_log_warning("The first line should be ##fileformat; is the VCF/BCF header broken?");
1090
1.64k
    if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
1091
0
        bcf_hrec_destroy(hrec);
1092
0
        return -1;
1093
0
    }
1094
1095
    // The filter PASS must appear first in the dictionary
1096
1.64k
    hrec = bcf_hdr_parse_line(hdr,"##FILTER=<ID=PASS,Description=\"All filters passed\">",&len);
1097
1.64k
    if (!hrec || bcf_hdr_add_hrec(hdr, hrec) < 0) {
1098
0
        bcf_hrec_destroy(hrec);
1099
0
        return -1;
1100
0
    }
1101
1102
    // Parse the whole header
1103
6.16k
    do {
1104
31.1k
        while (NULL != (hrec = bcf_hdr_parse_line(hdr, p, &len))) {
1105
25.0k
            if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
1106
0
                bcf_hrec_destroy(hrec);
1107
0
                return -1;
1108
0
            }
1109
25.0k
            p += len;
1110
25.0k
        }
1111
6.16k
        assert(hrec == NULL);
1112
6.16k
        if (len < 0) {
1113
            // len < 0 indicates out-of-memory, or similar error
1114
0
            hts_log_error("Could not parse header line: %s", strerror(errno));
1115
0
            return -1;
1116
6.16k
        } else if (len > 0) {
1117
            // Bad header line.  bcf_hdr_parse_line() will have logged it.
1118
            // Skip and try again on the next line (p + len will be the start
1119
            // of the next one).
1120
4.35k
            p += len;
1121
4.35k
            continue;
1122
4.35k
        }
1123
1124
        // Next should be the sample line.  If not, it was a malformed
1125
        // header, in which case print a warning and skip (many VCF
1126
        // operations do not really care about a few malformed lines).
1127
        // In the future we may want to add a strict mode that errors in
1128
        // this case.
1129
1.80k
        if ( strncmp("#CHROM\t",p,7) && strncmp("#CHROM ",p,7) ) {
1130
289
            char *eol = strchr(p, '\n');
1131
289
            if (*p != '\0') {
1132
162
                char buffer[320];
1133
162
                hts_log_warning("Could not parse header line: %s",
1134
162
                                hts_strprint(buffer, sizeof(buffer),
1135
162
                                               '"', p,
1136
162
                                               eol ? (eol - p) : SIZE_MAX));
1137
162
            }
1138
289
            if (eol) {
1139
161
                p = eol + 1; // Try from the next line.
1140
161
            } else {
1141
128
                done = -1; // No more lines left, give up.
1142
128
            }
1143
1.51k
        } else {
1144
1.51k
            done = 1; // Sample line found
1145
1.51k
        }
1146
6.16k
    } while (!done);
1147
1148
1.64k
    if (done < 0) {
1149
        // No sample line is fatal.
1150
128
        hts_log_error("Could not parse the header, sample line not found");
1151
128
        return -1;
1152
128
    }
1153
1154
1.51k
    if (bcf_hdr_parse_sample_line(hdr,p) < 0)
1155
3
        return -1;
1156
1.51k
    if (bcf_hdr_sync(hdr) < 0)
1157
0
        return -1;
1158
1.51k
    bcf_hdr_check_sanity(hdr);
1159
1.51k
    return 0;
1160
1.51k
}
1161
1162
int bcf_hdr_append(bcf_hdr_t *hdr, const char *line)
1163
0
{
1164
0
    int len;
1165
0
    bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr, (char*) line, &len);
1166
0
    if ( !hrec ) return -1;
1167
0
    if (bcf_hdr_add_hrec(hdr, hrec) < 0)
1168
0
        return -1;
1169
0
    return 0;
1170
0
}
1171
1172
void bcf_hdr_remove(bcf_hdr_t *hdr, int type, const char *key)
1173
0
{
1174
0
    int i = 0;
1175
0
    bcf_hrec_t *hrec;
1176
0
    if ( !key )
1177
0
    {
1178
        // no key, remove all entries of this type
1179
0
        while ( i<hdr->nhrec )
1180
0
        {
1181
0
            if ( hdr->hrec[i]->type!=type ) { i++; continue; }
1182
0
            hrec = hdr->hrec[i];
1183
1184
0
            if ( type==BCF_HL_FLT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG )
1185
0
            {
1186
0
                int j = bcf_hrec_find_key(hdr->hrec[i], "ID");
1187
0
                if ( j>=0 )
1188
0
                {
1189
0
                    vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
1190
0
                    khint_t k = kh_get(vdict, d, hdr->hrec[i]->vals[j]);
1191
0
                    kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL;
1192
0
                }
1193
0
            }
1194
1195
0
            hdr->dirty = 1;
1196
0
            hdr->nhrec--;
1197
0
            if ( i < hdr->nhrec )
1198
0
                memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
1199
0
            bcf_hrec_destroy(hrec);
1200
0
        }
1201
0
        return;
1202
0
    }
1203
0
    while (1)
1204
0
    {
1205
0
        if ( type==BCF_HL_FLT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG )
1206
0
        {
1207
0
            hrec = bcf_hdr_get_hrec(hdr, type, "ID", key, NULL);
1208
0
            if ( !hrec ) return;
1209
1210
0
            for (i=0; i<hdr->nhrec; i++)
1211
0
                if ( hdr->hrec[i]==hrec ) break;
1212
0
            assert( i<hdr->nhrec );
1213
1214
0
            vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
1215
0
            khint_t k = kh_get(vdict, d, key);
1216
0
            kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL;
1217
0
        }
1218
0
        else
1219
0
        {
1220
0
            for (i=0; i<hdr->nhrec; i++)
1221
0
            {
1222
0
                if ( hdr->hrec[i]->type!=type ) continue;
1223
0
                if ( type==BCF_HL_GEN )
1224
0
                {
1225
0
                    if ( !strcmp(hdr->hrec[i]->key,key) ) break;
1226
0
                }
1227
0
                else
1228
0
                {
1229
                    // not all structured lines have ID, we could be more sophisticated as in bcf_hdr_get_hrec()
1230
0
                    int j = bcf_hrec_find_key(hdr->hrec[i], "ID");
1231
0
                    if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],key) ) break;
1232
0
                }
1233
0
            }
1234
0
            if ( i==hdr->nhrec ) return;
1235
0
            hrec = hdr->hrec[i];
1236
0
        }
1237
1238
0
        hdr->nhrec--;
1239
0
        if ( i < hdr->nhrec )
1240
0
            memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
1241
0
        bcf_hrec_destroy(hrec);
1242
0
        hdr->dirty = 1;
1243
0
    }
1244
0
}
1245
1246
int bcf_hdr_printf(bcf_hdr_t *hdr, const char *fmt, ...)
1247
0
{
1248
0
    char tmp[256], *line = tmp;
1249
0
    va_list ap;
1250
0
    va_start(ap, fmt);
1251
0
    int n = vsnprintf(line, sizeof(tmp), fmt, ap);
1252
0
    va_end(ap);
1253
1254
0
    if (n >= sizeof(tmp)) {
1255
0
        n++; // For trailing NUL
1256
0
        line = (char*)malloc(n);
1257
0
        if (!line)
1258
0
            return -1;
1259
1260
0
        va_start(ap, fmt);
1261
0
        vsnprintf(line, n, fmt, ap);
1262
0
        va_end(ap);
1263
0
    }
1264
1265
0
    int ret = bcf_hdr_append(hdr, line);
1266
1267
0
    if (line != tmp) free(line);
1268
0
    return ret;
1269
0
}
1270
1271
1272
/**********************
1273
 *** BCF header I/O ***
1274
 **********************/
1275
1276
const char *bcf_hdr_get_version(const bcf_hdr_t *hdr)
1277
0
{
1278
0
    bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
1279
0
    if ( !hrec )
1280
0
    {
1281
0
        hts_log_warning("No version string found, assuming VCFv4.2");
1282
0
        return "VCFv4.2";
1283
0
    }
1284
0
    return hrec->value;
1285
0
}
1286
1287
int bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version)
1288
0
{
1289
0
    bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
1290
0
    if ( !hrec )
1291
0
    {
1292
0
        int len;
1293
0
        kstring_t str = {0,0,0};
1294
0
        if ( ksprintf(&str,"##fileformat=%s", version) < 0 ) return -1;
1295
0
        hrec = bcf_hdr_parse_line(hdr, str.s, &len);
1296
0
        free(str.s);
1297
0
    }
1298
0
    else
1299
0
    {
1300
0
        bcf_hrec_t *tmp = bcf_hrec_dup(hrec);
1301
0
        if ( !tmp ) return -1;
1302
0
        free(tmp->value);
1303
0
        tmp->value = strdup(version);
1304
0
        if ( !tmp->value ) return -1;
1305
0
        bcf_hdr_update_hrec(hdr, hrec, tmp);
1306
0
        bcf_hrec_destroy(tmp);
1307
0
    }
1308
0
    hdr->dirty = 1;
1309
0
    return 0; // FIXME: check for errs in this function (return < 0 if so)
1310
0
}
1311
1312
bcf_hdr_t *bcf_hdr_init(const char *mode)
1313
1.67k
{
1314
1.67k
    int i;
1315
1.67k
    bcf_hdr_t *h;
1316
1.67k
    h = (bcf_hdr_t*)calloc(1, sizeof(bcf_hdr_t));
1317
1.67k
    if (!h) return NULL;
1318
6.68k
    for (i = 0; i < 3; ++i)
1319
5.01k
        if ((h->dict[i] = kh_init(vdict)) == NULL) goto fail;
1320
1321
1.67k
    bcf_hdr_aux_t *aux = (bcf_hdr_aux_t*)calloc(1,sizeof(bcf_hdr_aux_t));
1322
1.67k
    if ( !aux ) goto fail;
1323
1.67k
    if ( (aux->gen = kh_init(hdict))==NULL ) { free(aux); goto fail; }
1324
1.67k
    aux->dict = *((vdict_t*)h->dict[0]);
1325
1.67k
    free(h->dict[0]);
1326
1.67k
    h->dict[0] = aux;
1327
1328
1.67k
    if ( strchr(mode,'w') )
1329
0
    {
1330
0
        bcf_hdr_append(h, "##fileformat=VCFv4.2");
1331
        // The filter PASS must appear first in the dictionary
1332
0
        bcf_hdr_append(h, "##FILTER=<ID=PASS,Description=\"All filters passed\">");
1333
0
    }
1334
1.67k
    return h;
1335
1336
0
 fail:
1337
0
    for (i = 0; i < 3; ++i)
1338
0
        kh_destroy(vdict, h->dict[i]);
1339
0
    free(h);
1340
0
    return NULL;
1341
1.67k
}
1342
1343
void bcf_hdr_destroy(bcf_hdr_t *h)
1344
1.67k
{
1345
1.67k
    int i;
1346
1.67k
    khint_t k;
1347
1.67k
    if (!h) return;
1348
6.68k
    for (i = 0; i < 3; ++i) {
1349
5.01k
        vdict_t *d = (vdict_t*)h->dict[i];
1350
5.01k
        if (d == 0) continue;
1351
60.2k
        for (k = kh_begin(d); k != kh_end(d); ++k)
1352
55.2k
            if (kh_exist(d, k)) free((char*)kh_key(d, k));
1353
5.01k
        if ( i==0 )
1354
1.67k
        {
1355
1.67k
            bcf_hdr_aux_t *aux = get_hdr_aux(h);
1356
14.0k
            for (k=kh_begin(aux->gen); k<kh_end(aux->gen); k++)
1357
12.3k
                if ( kh_exist(aux->gen,k) ) free((char*)kh_key(aux->gen,k));
1358
1.67k
            kh_destroy(hdict, aux->gen);
1359
1.67k
        }
1360
5.01k
        kh_destroy(vdict, d);
1361
5.01k
        free(h->id[i]);
1362
5.01k
    }
1363
32.3k
    for (i=0; i<h->nhrec; i++)
1364
30.7k
        bcf_hrec_destroy(h->hrec[i]);
1365
1.67k
    if (h->nhrec) free(h->hrec);
1366
1.67k
    if (h->samples) free(h->samples);
1367
1.67k
    free(h->keep_samples);
1368
1.67k
    free(h->transl[0]); free(h->transl[1]);
1369
1.67k
    free(h->mem.s);
1370
1.67k
    free(h);
1371
1.67k
}
1372
1373
bcf_hdr_t *bcf_hdr_read(htsFile *hfp)
1374
1.67k
{
1375
1.67k
    if (hfp->format.format == vcf)
1376
1.62k
        return vcf_hdr_read(hfp);
1377
48
    if (hfp->format.format != bcf) {
1378
0
        hts_log_error("Input is not detected as bcf or vcf format");
1379
0
        return NULL;
1380
0
    }
1381
1382
48
    assert(hfp->is_bgzf);
1383
1384
48
    BGZF *fp = hfp->fp.bgzf;
1385
48
    uint8_t magic[5];
1386
48
    bcf_hdr_t *h;
1387
48
    h = bcf_hdr_init("r");
1388
48
    if (!h) {
1389
0
        hts_log_error("Failed to allocate bcf header");
1390
0
        return NULL;
1391
0
    }
1392
48
    if (bgzf_read(fp, magic, 5) != 5)
1393
0
    {
1394
0
        hts_log_error("Failed to read the header (reading BCF in text mode?)");
1395
0
        bcf_hdr_destroy(h);
1396
0
        return NULL;
1397
0
    }
1398
48
    if (strncmp((char*)magic, "BCF\2\2", 5) != 0)
1399
3
    {
1400
3
        if (!strncmp((char*)magic, "BCF", 3))
1401
3
            hts_log_error("Invalid BCF2 magic string: only BCFv2.2 is supported");
1402
0
        else
1403
0
            hts_log_error("Invalid BCF2 magic string");
1404
3
        bcf_hdr_destroy(h);
1405
3
        return NULL;
1406
3
    }
1407
45
    uint8_t buf[4];
1408
45
    size_t hlen;
1409
45
    char *htxt = NULL;
1410
45
    if (bgzf_read(fp, buf, 4) != 4) goto fail;
1411
45
    hlen = buf[0] | (buf[1] << 8) | (buf[2] << 16) | ((size_t) buf[3] << 24);
1412
45
    if (hlen >= SIZE_MAX) { errno = ENOMEM; goto fail; }
1413
45
    htxt = (char*)malloc(hlen + 1);
1414
45
    if (!htxt) goto fail;
1415
45
    if (bgzf_read(fp, htxt, hlen) != hlen) goto fail;
1416
44
    htxt[hlen] = '\0'; // Ensure htxt is terminated
1417
44
    if ( bcf_hdr_parse(h, htxt) < 0 ) goto fail;
1418
44
    free(htxt);
1419
44
    return h;
1420
1
 fail:
1421
1
    hts_log_error("Failed to read BCF header");
1422
1
    free(htxt);
1423
1
    bcf_hdr_destroy(h);
1424
1
    return NULL;
1425
44
}
1426
1427
int bcf_hdr_write(htsFile *hfp, bcf_hdr_t *h)
1428
1.51k
{
1429
1.51k
    if (!h) {
1430
0
        errno = EINVAL;
1431
0
        return -1;
1432
0
    }
1433
1.51k
    if ( h->dirty ) {
1434
0
        if (bcf_hdr_sync(h) < 0) return -1;
1435
0
    }
1436
1.51k
    hfp->format.category = variant_data;
1437
1.51k
    if (hfp->format.format == vcf || hfp->format.format == text_format) {
1438
1.51k
        hfp->format.format = vcf;
1439
1.51k
        return vcf_hdr_write(hfp, h);
1440
1.51k
    }
1441
1442
0
    if (hfp->format.format == binary_format)
1443
0
        hfp->format.format = bcf;
1444
1445
0
    kstring_t htxt = {0,0,0};
1446
0
    if (bcf_hdr_format(h, 1, &htxt) < 0) {
1447
0
        free(htxt.s);
1448
0
        return -1;
1449
0
    }
1450
0
    kputc('\0', &htxt); // include the \0 byte
1451
1452
0
    BGZF *fp = hfp->fp.bgzf;
1453
0
    if ( bgzf_write(fp, "BCF\2\2", 5) !=5 ) return -1;
1454
0
    uint8_t hlen[4];
1455
0
    u32_to_le(htxt.l, hlen);
1456
0
    if ( bgzf_write(fp, hlen, 4) !=4 ) return -1;
1457
0
    if ( bgzf_write(fp, htxt.s, htxt.l) != htxt.l ) return -1;
1458
1459
0
    free(htxt.s);
1460
0
    return 0;
1461
0
}
1462
1463
/********************
1464
 *** BCF site I/O ***
1465
 ********************/
1466
1467
bcf1_t *bcf_init()
1468
1.51k
{
1469
1.51k
    bcf1_t *v;
1470
1.51k
    v = (bcf1_t*)calloc(1, sizeof(bcf1_t));
1471
1.51k
    return v;
1472
1.51k
}
1473
1474
void bcf_clear(bcf1_t *v)
1475
198k
{
1476
198k
    int i;
1477
57.5M
    for (i=0; i<v->d.m_info; i++)
1478
57.3M
    {
1479
57.3M
        if ( v->d.info[i].vptr_free )
1480
0
        {
1481
0
            free(v->d.info[i].vptr - v->d.info[i].vptr_off);
1482
0
            v->d.info[i].vptr_free = 0;
1483
0
        }
1484
57.3M
    }
1485
395k
    for (i=0; i<v->d.m_fmt; i++)
1486
197k
    {
1487
197k
        if ( v->d.fmt[i].p_free )
1488
0
        {
1489
0
            free(v->d.fmt[i].p - v->d.fmt[i].p_off);
1490
0
            v->d.fmt[i].p_free = 0;
1491
0
        }
1492
197k
    }
1493
198k
    v->rid = v->pos = v->rlen = v->unpacked = 0;
1494
198k
    bcf_float_set_missing(v->qual);
1495
198k
    v->n_info = v->n_allele = v->n_fmt = v->n_sample = 0;
1496
198k
    v->shared.l = v->indiv.l = 0;
1497
198k
    v->d.var_type = -1;
1498
198k
    v->d.shared_dirty = 0;
1499
198k
    v->d.indiv_dirty  = 0;
1500
198k
    v->d.n_flt = 0;
1501
198k
    v->errcode = 0;
1502
198k
    if (v->d.m_als) v->d.als[0] = 0;
1503
198k
    if (v->d.m_id) v->d.id[0] = 0;
1504
198k
}
1505
1506
void bcf_empty(bcf1_t *v)
1507
1.51k
{
1508
1.51k
    bcf_clear1(v);
1509
1.51k
    free(v->d.id);
1510
1.51k
    free(v->d.als);
1511
1.51k
    free(v->d.allele); free(v->d.flt); free(v->d.info); free(v->d.fmt);
1512
1.51k
    if (v->d.var ) free(v->d.var);
1513
1.51k
    free(v->shared.s); free(v->indiv.s);
1514
1.51k
    memset(&v->d,0,sizeof(v->d));
1515
1.51k
    memset(&v->shared,0,sizeof(v->shared));
1516
1.51k
    memset(&v->indiv,0,sizeof(v->indiv));
1517
1.51k
}
1518
1519
void bcf_destroy(bcf1_t *v)
1520
1.51k
{
1521
1.51k
    if (!v) return;
1522
1.51k
    bcf_empty1(v);
1523
1.51k
    free(v);
1524
1.51k
}
1525
1526
static inline int bcf_read1_core(BGZF *fp, bcf1_t *v)
1527
975
{
1528
975
    uint8_t x[32];
1529
975
    ssize_t ret;
1530
975
    uint32_t shared_len, indiv_len;
1531
975
    if ((ret = bgzf_read(fp, x, 32)) != 32) {
1532
1
        if (ret == 0) return -1;
1533
1
        return -2;
1534
1
    }
1535
974
    bcf_clear1(v);
1536
974
    shared_len = le_to_u32(x);
1537
974
    if (shared_len < 24) return -2;
1538
972
    shared_len -= 24; // to exclude six 32-bit integers
1539
972
    if (ks_resize(&v->shared, shared_len ? shared_len : 1) != 0) return -2;
1540
972
    indiv_len = le_to_u32(x + 4);
1541
972
    if (ks_resize(&v->indiv, indiv_len ? indiv_len : 1) != 0) return -2;
1542
972
    v->rid  = le_to_i32(x + 8);
1543
972
    v->pos  = le_to_u32(x + 12);
1544
972
    if ( v->pos==UINT32_MAX ) v->pos = -1;  // this is for telomere coordinate, e.g. MT:0
1545
972
    v->rlen = le_to_i32(x + 16);
1546
972
    v->qual = le_to_float(x + 20);
1547
972
    v->n_info = le_to_u16(x + 24);
1548
972
    v->n_allele = le_to_u16(x + 26);
1549
972
    v->n_sample = le_to_u32(x + 28) & 0xffffff;
1550
972
    v->n_fmt = x[31];
1551
972
    v->shared.l = shared_len;
1552
972
    v->indiv.l = indiv_len;
1553
    // silent fix of broken BCFs produced by earlier versions of bcf_subset, prior to and including bd6ed8b4
1554
972
    if ( (!v->indiv.l || !v->n_sample) && v->n_fmt ) v->n_fmt = 0;
1555
1556
972
    if (bgzf_read(fp, v->shared.s, v->shared.l) != v->shared.l) return -2;
1557
963
    if (bgzf_read(fp, v->indiv.s, v->indiv.l) != v->indiv.l) return -2;
1558
962
    return 0;
1559
963
}
1560
1561
0
#define bit_array_size(n) ((n)/8+1)
1562
0
#define bit_array_set(a,i)   ((a)[(i)/8] |=   1 << ((i)%8))
1563
0
#define bit_array_clear(a,i) ((a)[(i)/8] &= ~(1 << ((i)%8)))
1564
0
#define bit_array_test(a,i)  ((a)[(i)/8] &   (1 << ((i)%8)))
1565
1566
static int bcf_dec_typed_int1_safe(uint8_t *p, uint8_t *end, uint8_t **q,
1567
3.54k
                                   int32_t *val) {
1568
3.54k
    uint32_t t;
1569
3.54k
    if (end - p < 2) return -1;
1570
3.54k
    t = *p++ & 0xf;
1571
    /* Use if .. else if ... else instead of switch to force order.  Assumption
1572
       is that small integers are more frequent than big ones. */
1573
3.54k
    if (t == BCF_BT_INT8) {
1574
1.08k
        *val = *(int8_t *) p++;
1575
2.45k
    } else {
1576
2.45k
        if (end - p < (1<<bcf_type_shift[t])) return -1;
1577
2.45k
        if (t == BCF_BT_INT16) {
1578
1.70k
            *val = le_to_i16(p);
1579
1.70k
            p += 2;
1580
1.70k
        } else if (t == BCF_BT_INT32) {
1581
732
            *val = le_to_i32(p);
1582
732
            p += 4;
1583
#ifdef VCF_ALLOW_INT64
1584
        } else if (t == BCF_BT_INT64) {
1585
            // This case should never happen because there should be no
1586
            // 64-bit BCFs at all, definitely not coming from htslib
1587
            *val = le_to_i64(p);
1588
            p += 8;
1589
#endif
1590
732
        } else {
1591
25
            return -1;
1592
25
        }
1593
2.45k
    }
1594
3.51k
    *q = p;
1595
3.51k
    return 0;
1596
3.54k
}
1597
1598
static int bcf_dec_size_safe(uint8_t *p, uint8_t *end, uint8_t **q,
1599
12.2k
                             int *num, int *type) {
1600
12.2k
    int r;
1601
12.2k
    if (p >= end) return -1;
1602
12.2k
    *type = *p & 0xf;
1603
12.2k
    if (*p>>4 != 15) {
1604
12.0k
        *q = p + 1;
1605
12.0k
        *num = *p >> 4;
1606
12.0k
        return 0;
1607
12.0k
    }
1608
205
    r = bcf_dec_typed_int1_safe(p + 1, end, q, num);
1609
205
    if (r) return r;
1610
193
    return *num >= 0 ? 0 : -1;
1611
205
}
1612
1613
46
static const char *get_type_name(int type) {
1614
46
    const char *types[9] = {
1615
46
        "null", "int (8-bit)", "int (16 bit)", "int (32 bit)",
1616
46
        "unknown", "float", "unknown", "char", "unknown"
1617
46
    };
1618
46
    int t = (type >= 0 && type < 8) ? type : 8;
1619
46
    return types[t];
1620
46
}
1621
1622
static void bcf_record_check_err(const bcf_hdr_t *hdr, bcf1_t *rec,
1623
417
                                 char *type, uint32_t *reports, int i) {
1624
417
    if (*reports == 0 || hts_verbose >= HTS_LOG_DEBUG)
1625
4
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos
1626
417
                        ": Invalid FORMAT %s %d",
1627
417
                        bcf_seqname_safe(hdr,rec), rec->pos+1, type, i);
1628
417
    (*reports)++;
1629
417
}
1630
1631
962
static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) {
1632
962
    uint8_t *ptr, *end;
1633
962
    size_t bytes;
1634
962
    uint32_t err = 0;
1635
962
    int type = 0;
1636
962
    int num  = 0;
1637
962
    int reflen = 0;
1638
962
    uint32_t i, reports;
1639
962
    const uint32_t is_integer = ((1 << BCF_BT_INT8)  |
1640
962
                                 (1 << BCF_BT_INT16) |
1641
#ifdef VCF_ALLOW_INT64
1642
                                 (1 << BCF_BT_INT64) |
1643
#endif
1644
962
                                 (1 << BCF_BT_INT32));
1645
962
    const uint32_t is_valid_type = (is_integer          |
1646
962
                                    (1 << BCF_BT_NULL)  |
1647
962
                                    (1 << BCF_BT_FLOAT) |
1648
962
                                    (1 << BCF_BT_CHAR));
1649
962
    int32_t max_id = hdr ? hdr->n[BCF_DT_ID] : 0;
1650
1651
    // Check for valid contig ID
1652
962
    if (rec->rid < 0
1653
962
        || (hdr && (rec->rid >= hdr->n[BCF_DT_CTG]
1654
956
                    || hdr->id[BCF_DT_CTG][rec->rid].key == NULL))) {
1655
26
        hts_log_warning("Bad BCF record at %"PRIhts_pos": Invalid %s id %d", rec->pos+1, "CONTIG", rec->rid);
1656
26
        err |= BCF_ERR_CTG_INVALID;
1657
26
    }
1658
1659
    // Check ID
1660
962
    ptr = (uint8_t *) rec->shared.s;
1661
962
    end = ptr + rec->shared.l;
1662
962
    if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
1663
957
    if (type != BCF_BT_CHAR) {
1664
25
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "ID", type, get_type_name(type));
1665
25
        err |= BCF_ERR_TAG_INVALID;
1666
25
    }
1667
957
    bytes = (size_t) num << bcf_type_shift[type];
1668
957
    if (end - ptr < bytes) goto bad_shared;
1669
957
    ptr += bytes;
1670
1671
    // Check REF and ALT
1672
957
    if (rec->n_allele < 1) {
1673
6
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": No REF allele",
1674
6
                        bcf_seqname_safe(hdr,rec), rec->pos+1);
1675
6
        err |= BCF_ERR_TAG_UNDEF;
1676
6
    }
1677
1678
957
    reports = 0;
1679
7.96k
    for (i = 0; i < rec->n_allele; i++) {
1680
7.01k
        if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
1681
7.01k
        if (type != BCF_BT_CHAR) {
1682
6.05k
            if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
1683
20
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "REF/ALT", type, get_type_name(type));
1684
6.05k
            err |= BCF_ERR_CHAR;
1685
6.05k
        }
1686
7.01k
        if (i == 0) reflen = num;
1687
7.01k
        bytes = (size_t) num << bcf_type_shift[type];
1688
7.01k
        if (end - ptr < bytes) goto bad_shared;
1689
7.01k
        ptr += bytes;
1690
7.01k
    }
1691
1692
    // Check FILTER
1693
948
    reports = 0;
1694
948
    if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
1695
947
    if (num > 0) {
1696
5
        bytes = (size_t) num << bcf_type_shift[type];
1697
5
        if (((1 << type) & is_integer) == 0) {
1698
0
            hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "FILTER", type, get_type_name(type));
1699
0
            err |= BCF_ERR_TAG_INVALID;
1700
0
            if (end - ptr < bytes) goto bad_shared;
1701
0
            ptr += bytes;
1702
5
        } else {
1703
5
            if (end - ptr < bytes) goto bad_shared;
1704
525
            for (i = 0; i < num; i++) {
1705
520
                int32_t key = bcf_dec_int1(ptr, type, &ptr);
1706
520
                if (key < 0
1707
520
                    || (hdr && (key >= max_id
1708
478
                                || hdr->id[BCF_DT_ID][key].key == NULL))) {
1709
478
                    if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
1710
5
                        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s id %d", bcf_seqname_safe(hdr,rec), rec->pos+1, "FILTER", key);
1711
478
                    err |= BCF_ERR_TAG_UNDEF;
1712
478
                }
1713
520
            }
1714
5
        }
1715
5
    }
1716
1717
    // Check INFO
1718
947
    reports = 0;
1719
947
    bcf_idpair_t *id_tmp = hdr ? hdr->id[BCF_DT_ID] : NULL;
1720
3.86k
    for (i = 0; i < rec->n_info; i++) {
1721
2.92k
        int32_t key = -1;
1722
2.92k
        if (bcf_dec_typed_int1_safe(ptr, end, &ptr, &key) != 0) goto bad_shared;
1723
2.91k
        if (key < 0 || (hdr && (key >= max_id
1724
2.90k
                                || id_tmp[key].key == NULL))) {
1725
2.90k
            if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
1726
7
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s id %d", bcf_seqname_safe(hdr,rec), rec->pos+1, "INFO", key);
1727
2.90k
            err |= BCF_ERR_TAG_UNDEF;
1728
2.90k
        }
1729
2.91k
        if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
1730
2.91k
        if (((1 << type) & is_valid_type) == 0
1731
2.91k
            || (type == BCF_BT_NULL && num > 0)) {
1732
1.24k
            if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
1733
1
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "INFO", type, get_type_name(type));
1734
1.24k
            err |= BCF_ERR_TAG_INVALID;
1735
1.24k
        }
1736
2.91k
        bytes = (size_t) num << bcf_type_shift[type];
1737
2.91k
        if (end - ptr < bytes) goto bad_shared;
1738
2.91k
        ptr += bytes;
1739
2.91k
    }
1740
1741
    // Check FORMAT and individual information
1742
936
    ptr = (uint8_t *) rec->indiv.s;
1743
936
    end = ptr + rec->indiv.l;
1744
936
    reports = 0;
1745
1.33k
    for (i = 0; i < rec->n_fmt; i++) {
1746
407
        int32_t key = -1;
1747
407
        if (bcf_dec_typed_int1_safe(ptr, end, &ptr, &key) != 0) goto bad_indiv;
1748
404
        if (key < 0
1749
404
            || (hdr && (key >= max_id
1750
404
                        || id_tmp[key].key == NULL))) {
1751
404
            bcf_record_check_err(hdr, rec, "id", &reports, key);
1752
404
            err |= BCF_ERR_TAG_UNDEF;
1753
404
        }
1754
404
        if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_indiv;
1755
404
        if (((1 << type) & is_valid_type) == 0
1756
404
            || (type == BCF_BT_NULL && num > 0)) {
1757
13
            bcf_record_check_err(hdr, rec, "type", &reports, type);
1758
13
            err |= BCF_ERR_TAG_INVALID;
1759
13
        }
1760
404
        bytes = ((size_t) num << bcf_type_shift[type]) * rec->n_sample;
1761
404
        if (end - ptr < bytes) goto bad_indiv;
1762
403
        ptr += bytes;
1763
403
    }
1764
1765
932
    if (!err && rec->rlen < 0) {
1766
        // Treat bad rlen as a warning instead of an error, and try to
1767
        // fix up by using the length of the stored REF allele.
1768
547
        static int warned = 0;
1769
547
        if (!warned) {
1770
1
            hts_log_warning("BCF record at %s:%"PRIhts_pos" has invalid RLEN (%"PRIhts_pos"). "
1771
1
                            "Only one invalid RLEN will be reported.",
1772
1
                            bcf_seqname_safe(hdr,rec), rec->pos+1, rec->rlen);
1773
1
            warned = 1;
1774
1
        }
1775
547
        rec->rlen = reflen >= 0 ? reflen : 0;
1776
547
    }
1777
1778
932
    rec->errcode |= err;
1779
1780
932
    return err ? -2 : 0; // Return -2 so bcf_read() reports an error
1781
1782
26
 bad_shared:
1783
26
    hts_log_error("Bad BCF record at %s:%"PRIhts_pos" - shared section malformed or too short", bcf_seqname_safe(hdr,rec), rec->pos+1);
1784
26
    return -2;
1785
1786
4
 bad_indiv:
1787
4
    hts_log_error("Bad BCF record at %s:%"PRIhts_pos" - individuals section malformed or too short", bcf_seqname_safe(hdr,rec), rec->pos+1);
1788
4
    return -2;
1789
936
}
1790
1791
static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt);
1792
int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec)
1793
0
{
1794
0
    if ( !hdr->keep_samples ) return 0;
1795
0
    if ( !bcf_hdr_nsamples(hdr) )
1796
0
    {
1797
0
        rec->indiv.l = rec->n_sample = 0;
1798
0
        return 0;
1799
0
    }
1800
1801
0
    int i, j;
1802
0
    uint8_t *ptr = (uint8_t*)rec->indiv.s, *dst = NULL, *src;
1803
0
    bcf_dec_t *dec = &rec->d;
1804
0
    hts_expand(bcf_fmt_t, rec->n_fmt, dec->m_fmt, dec->fmt);
1805
0
    for (i=0; i<dec->m_fmt; ++i) dec->fmt[i].p_free = 0;
1806
1807
0
    for (i=0; i<rec->n_fmt; i++)
1808
0
    {
1809
0
        ptr = bcf_unpack_fmt_core1(ptr, rec->n_sample, &dec->fmt[i]);
1810
0
        src = dec->fmt[i].p - dec->fmt[i].size;
1811
0
        if ( dst )
1812
0
        {
1813
0
            memmove(dec->fmt[i-1].p + dec->fmt[i-1].p_len, dec->fmt[i].p - dec->fmt[i].p_off, dec->fmt[i].p_off);
1814
0
            dec->fmt[i].p = dec->fmt[i-1].p + dec->fmt[i-1].p_len + dec->fmt[i].p_off;
1815
0
        }
1816
0
        dst = dec->fmt[i].p;
1817
0
        for (j=0; j<hdr->nsamples_ori; j++)
1818
0
        {
1819
0
            src += dec->fmt[i].size;
1820
0
            if ( !bit_array_test(hdr->keep_samples,j) ) continue;
1821
0
            memmove(dst, src, dec->fmt[i].size);
1822
0
            dst += dec->fmt[i].size;
1823
0
        }
1824
0
        rec->indiv.l -= dec->fmt[i].p_len - (dst - dec->fmt[i].p);
1825
0
        dec->fmt[i].p_len = dst - dec->fmt[i].p;
1826
0
    }
1827
0
    rec->unpacked |= BCF_UN_FMT;
1828
1829
0
    rec->n_sample = bcf_hdr_nsamples(hdr);
1830
0
    return 0;
1831
0
}
1832
1833
int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
1834
197k
{
1835
197k
    if (fp->format.format == vcf) return vcf_read(fp,h,v);
1836
975
    int ret = bcf_read1_core(fp->fp.bgzf, v);
1837
975
    if (ret == 0) ret = bcf_record_check(h, v);
1838
975
    if ( ret!=0 || !h->keep_samples ) return ret;
1839
0
    return bcf_subset_format(h,v);
1840
975
}
1841
1842
int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1843
0
{
1844
0
    bcf1_t *v = (bcf1_t *) vv;
1845
0
    int ret = bcf_read1_core(fp, v);
1846
0
    if (ret == 0) ret = bcf_record_check(NULL, v);
1847
0
    if (ret  >= 0)
1848
0
        *tid = v->rid, *beg = v->pos, *end = v->pos + v->rlen;
1849
0
    return ret;
1850
0
}
1851
1852
static inline int bcf1_sync_id(bcf1_t *line, kstring_t *str)
1853
0
{
1854
    // single typed string
1855
0
    if ( line->d.id && strcmp(line->d.id, ".") ) {
1856
0
        return bcf_enc_vchar(str, strlen(line->d.id), line->d.id);
1857
0
    } else {
1858
0
        return bcf_enc_size(str, 0, BCF_BT_CHAR);
1859
0
    }
1860
0
}
1861
static inline int bcf1_sync_alleles(bcf1_t *line, kstring_t *str)
1862
0
{
1863
    // list of typed strings
1864
0
    int i;
1865
0
    for (i=0; i<line->n_allele; i++) {
1866
0
        if (bcf_enc_vchar(str, strlen(line->d.allele[i]), line->d.allele[i]) < 0)
1867
0
            return -1;
1868
0
    }
1869
0
    if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
1870
0
    return 0;
1871
0
}
1872
static inline int bcf1_sync_filter(bcf1_t *line, kstring_t *str)
1873
0
{
1874
    // typed vector of integers
1875
0
    if ( line->d.n_flt ) {
1876
0
        return bcf_enc_vint(str, line->d.n_flt, line->d.flt, -1);
1877
0
    } else {
1878
0
        return bcf_enc_vint(str, 0, 0, -1);
1879
0
    }
1880
0
}
1881
1882
static inline int bcf1_sync_info(bcf1_t *line, kstring_t *str)
1883
0
{
1884
    // pairs of typed vectors
1885
0
    int i, irm = -1, e = 0;
1886
0
    for (i=0; i<line->n_info; i++)
1887
0
    {
1888
0
        bcf_info_t *info = &line->d.info[i];
1889
0
        if ( !info->vptr )
1890
0
        {
1891
            // marked for removal
1892
0
            if ( irm < 0 ) irm = i;
1893
0
            continue;
1894
0
        }
1895
0
        e |= kputsn_(info->vptr - info->vptr_off, info->vptr_len + info->vptr_off, str) < 0;
1896
0
        if ( irm >=0 )
1897
0
        {
1898
0
            bcf_info_t tmp = line->d.info[irm]; line->d.info[irm] = line->d.info[i]; line->d.info[i] = tmp;
1899
0
            while ( irm<=i && line->d.info[irm].vptr ) irm++;
1900
0
        }
1901
0
    }
1902
0
    if ( irm>=0 ) line->n_info = irm;
1903
0
    return e == 0 ? 0 : -1;
1904
0
}
1905
1906
static int bcf1_sync(bcf1_t *line)
1907
0
{
1908
0
    char *shared_ori = line->shared.s;
1909
0
    size_t prev_len;
1910
1911
0
    kstring_t tmp = {0,0,0};
1912
0
    if ( !line->shared.l )
1913
0
    {
1914
        // New line created via API, BCF data blocks do not exist. Get it ready for BCF output
1915
0
        tmp = line->shared;
1916
0
        bcf1_sync_id(line, &tmp);
1917
0
        line->unpack_size[0] = tmp.l; prev_len = tmp.l;
1918
1919
0
        bcf1_sync_alleles(line, &tmp);
1920
0
        line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
1921
1922
0
        bcf1_sync_filter(line, &tmp);
1923
0
        line->unpack_size[2] = tmp.l - prev_len;
1924
1925
0
        bcf1_sync_info(line, &tmp);
1926
0
        line->shared = tmp;
1927
0
    }
1928
0
    else if ( line->d.shared_dirty )
1929
0
    {
1930
        // The line was edited, update the BCF data block.
1931
1932
0
        if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line,BCF_UN_STR);
1933
1934
        // ptr_ori points to the original unchanged BCF data.
1935
0
        uint8_t *ptr_ori = (uint8_t *) line->shared.s;
1936
1937
        // ID: single typed string
1938
0
        if ( line->d.shared_dirty & BCF1_DIRTY_ID )
1939
0
            bcf1_sync_id(line, &tmp);
1940
0
        else
1941
0
            kputsn_(ptr_ori, line->unpack_size[0], &tmp);
1942
0
        ptr_ori += line->unpack_size[0];
1943
0
        line->unpack_size[0] = tmp.l; prev_len = tmp.l;
1944
1945
        // REF+ALT: list of typed strings
1946
0
        if ( line->d.shared_dirty & BCF1_DIRTY_ALS )
1947
0
            bcf1_sync_alleles(line, &tmp);
1948
0
        else
1949
0
        {
1950
0
            kputsn_(ptr_ori, line->unpack_size[1], &tmp);
1951
0
            if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
1952
0
        }
1953
0
        ptr_ori += line->unpack_size[1];
1954
0
        line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
1955
1956
0
        if ( line->unpacked & BCF_UN_FLT )
1957
0
        {
1958
            // FILTER: typed vector of integers
1959
0
            if ( line->d.shared_dirty & BCF1_DIRTY_FLT )
1960
0
                bcf1_sync_filter(line, &tmp);
1961
0
            else if ( line->d.n_flt )
1962
0
                kputsn_(ptr_ori, line->unpack_size[2], &tmp);
1963
0
            else
1964
0
                bcf_enc_vint(&tmp, 0, 0, -1);
1965
0
            ptr_ori += line->unpack_size[2];
1966
0
            line->unpack_size[2] = tmp.l - prev_len;
1967
1968
0
            if ( line->unpacked & BCF_UN_INFO )
1969
0
            {
1970
                // INFO: pairs of typed vectors
1971
0
                if ( line->d.shared_dirty & BCF1_DIRTY_INF )
1972
0
                {
1973
0
                    bcf1_sync_info(line, &tmp);
1974
0
                    ptr_ori = (uint8_t*)line->shared.s + line->shared.l;
1975
0
                }
1976
0
            }
1977
0
        }
1978
1979
0
        int size = line->shared.l - (size_t)ptr_ori + (size_t)line->shared.s;
1980
0
        if ( size ) kputsn_(ptr_ori, size, &tmp);
1981
1982
0
        free(line->shared.s);
1983
0
        line->shared = tmp;
1984
0
    }
1985
0
    if ( line->shared.s != shared_ori && line->unpacked & BCF_UN_INFO )
1986
0
    {
1987
        // Reallocated line->shared.s block invalidated line->d.info[].vptr pointers
1988
0
        size_t off_new = line->unpack_size[0] + line->unpack_size[1] + line->unpack_size[2];
1989
0
        int i;
1990
0
        for (i=0; i<line->n_info; i++)
1991
0
        {
1992
0
            uint8_t *vptr_free = line->d.info[i].vptr_free ? line->d.info[i].vptr - line->d.info[i].vptr_off : NULL;
1993
0
            line->d.info[i].vptr = (uint8_t*) line->shared.s + off_new + line->d.info[i].vptr_off;
1994
0
            off_new += line->d.info[i].vptr_len + line->d.info[i].vptr_off;
1995
0
            if ( vptr_free )
1996
0
            {
1997
0
                free(vptr_free);
1998
0
                line->d.info[i].vptr_free = 0;
1999
0
            }
2000
0
        }
2001
0
    }
2002
2003
0
    if ( line->n_sample && line->n_fmt && (!line->indiv.l || line->d.indiv_dirty) )
2004
0
    {
2005
        // The genotype fields changed or are not present
2006
0
        tmp.l = tmp.m = 0; tmp.s = NULL;
2007
0
        int i, irm = -1;
2008
0
        for (i=0; i<line->n_fmt; i++)
2009
0
        {
2010
0
            bcf_fmt_t *fmt = &line->d.fmt[i];
2011
0
            if ( !fmt->p )
2012
0
            {
2013
                // marked for removal
2014
0
                if ( irm < 0 ) irm = i;
2015
0
                continue;
2016
0
            }
2017
0
            kputsn_(fmt->p - fmt->p_off, fmt->p_len + fmt->p_off, &tmp);
2018
0
            if ( irm >=0 )
2019
0
            {
2020
0
                bcf_fmt_t tfmt = line->d.fmt[irm]; line->d.fmt[irm] = line->d.fmt[i]; line->d.fmt[i] = tfmt;
2021
0
                while ( irm<=i && line->d.fmt[irm].p ) irm++;
2022
0
            }
2023
2024
0
        }
2025
0
        if ( irm>=0 ) line->n_fmt = irm;
2026
0
        free(line->indiv.s);
2027
0
        line->indiv = tmp;
2028
2029
        // Reallocated line->indiv.s block invalidated line->d.fmt[].p pointers
2030
0
        size_t off_new = 0;
2031
0
        for (i=0; i<line->n_fmt; i++)
2032
0
        {
2033
0
            uint8_t *p_free = line->d.fmt[i].p_free ? line->d.fmt[i].p - line->d.fmt[i].p_off : NULL;
2034
0
            line->d.fmt[i].p = (uint8_t*) line->indiv.s + off_new + line->d.fmt[i].p_off;
2035
0
            off_new += line->d.fmt[i].p_len + line->d.fmt[i].p_off;
2036
0
            if ( p_free )
2037
0
            {
2038
0
                free(p_free);
2039
0
                line->d.fmt[i].p_free = 0;
2040
0
            }
2041
0
        }
2042
0
    }
2043
0
    if ( !line->n_sample ) line->n_fmt = 0;
2044
0
    line->d.shared_dirty = line->d.indiv_dirty = 0;
2045
0
    return 0;
2046
0
}
2047
2048
bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src)
2049
0
{
2050
0
    bcf1_sync(src);
2051
2052
0
    bcf_clear(dst);
2053
0
    dst->rid  = src->rid;
2054
0
    dst->pos  = src->pos;
2055
0
    dst->rlen = src->rlen;
2056
0
    dst->qual = src->qual;
2057
0
    dst->n_info = src->n_info; dst->n_allele = src->n_allele;
2058
0
    dst->n_fmt = src->n_fmt; dst->n_sample = src->n_sample;
2059
2060
0
    if ( dst->shared.m < src->shared.l )
2061
0
    {
2062
0
        dst->shared.s = (char*) realloc(dst->shared.s, src->shared.l);
2063
0
        dst->shared.m = src->shared.l;
2064
0
    }
2065
0
    dst->shared.l = src->shared.l;
2066
0
    memcpy(dst->shared.s,src->shared.s,dst->shared.l);
2067
2068
0
    if ( dst->indiv.m < src->indiv.l )
2069
0
    {
2070
0
        dst->indiv.s = (char*) realloc(dst->indiv.s, src->indiv.l);
2071
0
        dst->indiv.m = src->indiv.l;
2072
0
    }
2073
0
    dst->indiv.l = src->indiv.l;
2074
0
    memcpy(dst->indiv.s,src->indiv.s,dst->indiv.l);
2075
2076
0
    return dst;
2077
0
}
2078
bcf1_t *bcf_dup(bcf1_t *src)
2079
0
{
2080
0
    bcf1_t *out = bcf_init1();
2081
0
    return bcf_copy(out, src);
2082
0
}
2083
2084
int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v)
2085
195k
{
2086
195k
    if ( h->dirty ) {
2087
0
        if (bcf_hdr_sync(h) < 0) return -1;
2088
0
    }
2089
195k
    if ( bcf_hdr_nsamples(h)!=v->n_sample )
2090
79
    {
2091
79
        hts_log_error("Broken VCF record, the number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)",
2092
79
            bcf_seqname_safe(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h));
2093
79
        return -1;
2094
79
    }
2095
2096
195k
    if ( hfp->format.format == vcf || hfp->format.format == text_format )
2097
195k
        return vcf_write(hfp,h,v);
2098
2099
0
    if ( v->errcode )
2100
0
    {
2101
        // vcf_parse1() encountered a new contig or tag, undeclared in the
2102
        // header.  At this point, the header must have been printed,
2103
        // proceeding would lead to a broken BCF file. Errors must be checked
2104
        // and cleared by the caller before we can proceed.
2105
0
        char errdescription[1024] = "";
2106
0
        hts_log_error("Unchecked error (%d %s) at %s:%"PRIhts_pos, v->errcode, bcf_strerror(v->errcode, errdescription, sizeof(errdescription)), bcf_seqname_safe(h,v), v->pos+1);
2107
0
        return -1;
2108
0
    }
2109
0
    bcf1_sync(v);   // check if the BCF record was modified
2110
2111
0
    if ( v->unpacked & BCF_IS_64BIT )
2112
0
    {
2113
0
        hts_log_error("Data at %s:%"PRIhts_pos" contains 64-bit values not representable in BCF. Please use VCF instead", bcf_seqname_safe(h,v), v->pos+1);
2114
0
        return -1;
2115
0
    }
2116
2117
0
    BGZF *fp = hfp->fp.bgzf;
2118
0
    uint8_t x[32];
2119
0
    u32_to_le(v->shared.l + 24, x); // to include six 32-bit integers
2120
0
    u32_to_le(v->indiv.l, x + 4);
2121
0
    i32_to_le(v->rid, x + 8);
2122
0
    u32_to_le(v->pos, x + 12);
2123
0
    u32_to_le(v->rlen, x + 16);
2124
0
    float_to_le(v->qual, x + 20);
2125
0
    u16_to_le(v->n_info, x + 24);
2126
0
    u16_to_le(v->n_allele, x + 26);
2127
0
    u32_to_le((uint32_t)v->n_fmt<<24 | (v->n_sample & 0xffffff), x + 28);
2128
0
    if ( bgzf_write(fp, x, 32) != 32 ) return -1;
2129
0
    if ( bgzf_write(fp, v->shared.s, v->shared.l) != v->shared.l ) return -1;
2130
0
    if ( bgzf_write(fp, v->indiv.s, v->indiv.l) != v->indiv.l ) return -1;
2131
2132
0
    if (hfp->idx) {
2133
0
        if (hts_idx_push(hfp->idx, v->rid, v->pos, v->pos + v->rlen, bgzf_tell(fp), 1) < 0)
2134
0
            return -1;
2135
0
    }
2136
2137
0
    return 0;
2138
0
}
2139
2140
/**********************
2141
 *** VCF header I/O ***
2142
 **********************/
2143
2144
0
static int add_missing_contig_hrec(bcf_hdr_t *h, const char *name) {
2145
0
    bcf_hrec_t *hrec = calloc(1, sizeof(bcf_hrec_t));
2146
0
    int save_errno;
2147
0
    if (!hrec) goto fail;
2148
2149
0
    hrec->key = strdup("contig");
2150
0
    if (!hrec->key) goto fail;
2151
2152
0
    if (bcf_hrec_add_key(hrec, "ID", strlen("ID")) < 0) goto fail;
2153
0
    if (bcf_hrec_set_val(hrec, hrec->nkeys-1, name, strlen(name), 0) < 0)
2154
0
        goto fail;
2155
0
    if (bcf_hdr_add_hrec(h, hrec) < 0)
2156
0
        goto fail;
2157
0
    return 0;
2158
2159
0
 fail:
2160
0
    save_errno = errno;
2161
0
    hts_log_error("%s", strerror(errno));
2162
0
    if (hrec) bcf_hrec_destroy(hrec);
2163
0
    errno = save_errno;
2164
0
    return -1;
2165
0
}
2166
2167
bcf_hdr_t *vcf_hdr_read(htsFile *fp)
2168
1.62k
{
2169
1.62k
    kstring_t txt, *s = &fp->line;
2170
1.62k
    int ret;
2171
1.62k
    bcf_hdr_t *h;
2172
1.62k
    tbx_t *idx = NULL;
2173
1.62k
    const char **names = NULL;
2174
1.62k
    h = bcf_hdr_init("r");
2175
1.62k
    if (!h) {
2176
0
        hts_log_error("Failed to allocate bcf header");
2177
0
        return NULL;
2178
0
    }
2179
1.62k
    txt.l = txt.m = 0; txt.s = 0;
2180
66.8k
    while ((ret = hts_getline(fp, KS_SEP_LINE, s)) >= 0) {
2181
66.7k
        int e = 0;
2182
66.7k
        if (s->l == 0) continue;
2183
66.6k
        if (s->s[0] != '#') {
2184
10
            hts_log_error("No sample line");
2185
10
            goto error;
2186
10
        }
2187
66.6k
        if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here
2188
0
            kstring_t tmp = { 0, 0, NULL };
2189
0
            hFILE *f = hopen(fp->fn_aux, "r");
2190
0
            if (f == NULL) {
2191
0
                hts_log_error("Couldn't open \"%s\"", fp->fn_aux);
2192
0
                goto error;
2193
0
            }
2194
0
            while (tmp.l = 0, kgetline(&tmp, (kgets_func *) hgets, f) >= 0) {
2195
0
                char *tab = strchr(tmp.s, '\t');
2196
0
                if (tab == NULL) continue;
2197
0
                e |= (kputs("##contig=<ID=", &txt) < 0);
2198
0
                e |= (kputsn(tmp.s, tab - tmp.s, &txt) < 0);
2199
0
                e |= (kputs(",length=", &txt) < 0);
2200
0
                e |= (kputl(atol(tab), &txt) < 0);
2201
0
                e |= (kputsn(">\n", 2, &txt) < 0);
2202
0
            }
2203
0
            free(tmp.s);
2204
0
            if (hclose(f) != 0) {
2205
0
                hts_log_error("Error on closing %s", fp->fn_aux);
2206
0
                goto error;
2207
0
            }
2208
0
            if (e) goto error;
2209
0
        }
2210
66.6k
        if (kputsn(s->s, s->l, &txt) < 0) goto error;
2211
66.6k
        if (kputc('\n', &txt) < 0) goto error;
2212
66.6k
        if (s->s[1] != '#') break;
2213
66.6k
    }
2214
1.61k
    if ( ret < -1 ) goto error;
2215
1.60k
    if ( !txt.s )
2216
0
    {
2217
0
        hts_log_error("Could not read the header");
2218
0
        goto error;
2219
0
    }
2220
1.60k
    if ( bcf_hdr_parse(h, txt.s) < 0 ) goto error;
2221
2222
    // check tabix index, are all contigs listed in the header? add the missing ones
2223
1.47k
    idx = tbx_index_load3(fp->fn, NULL, HTS_IDX_SILENT_FAIL);
2224
1.47k
    if ( idx )
2225
0
    {
2226
0
        int i, n, need_sync = 0;
2227
0
        names = tbx_seqnames(idx, &n);
2228
0
        if (!names) goto error;
2229
0
        for (i=0; i<n; i++)
2230
0
        {
2231
0
            bcf_hrec_t *hrec = bcf_hdr_get_hrec(h, BCF_HL_CTG, "ID", (char*) names[i], NULL);
2232
0
            if ( hrec ) continue;
2233
0
            if (add_missing_contig_hrec(h, names[i]) < 0) goto error;
2234
0
            need_sync = 1;
2235
0
        }
2236
0
        if ( need_sync ) {
2237
0
            if (bcf_hdr_sync(h) < 0) goto error;
2238
0
        }
2239
0
        free(names);
2240
0
        tbx_destroy(idx);
2241
0
    }
2242
1.47k
    free(txt.s);
2243
1.47k
    return h;
2244
2245
152
 error:
2246
152
    if (idx) tbx_destroy(idx);
2247
152
    free(names);
2248
152
    free(txt.s);
2249
152
    if (h) bcf_hdr_destroy(h);
2250
152
    return NULL;
2251
1.47k
}
2252
2253
int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname)
2254
0
{
2255
0
    int i = 0, n = 0, save_errno;
2256
0
    char **lines = hts_readlines(fname, &n);
2257
0
    if ( !lines ) return 1;
2258
0
    for (i=0; i<n-1; i++)
2259
0
    {
2260
0
        int k;
2261
0
        bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,lines[i],&k);
2262
0
        if (!hrec) goto fail;
2263
0
        if (bcf_hdr_add_hrec(hdr, hrec) < 0) {
2264
0
            bcf_hrec_destroy(hrec);
2265
0
            goto fail;
2266
0
        }
2267
0
        free(lines[i]);
2268
0
        lines[i] = NULL;
2269
0
    }
2270
0
    if (bcf_hdr_parse_sample_line(hdr, lines[n-1]) < 0) goto fail;
2271
0
    if (bcf_hdr_sync(hdr) < 0) goto fail;
2272
0
    free(lines[n-1]);
2273
0
    free(lines);
2274
0
    return 0;
2275
2276
0
 fail:
2277
0
    save_errno = errno;
2278
0
    for (; i < n; i++)
2279
0
        free(lines[i]);
2280
0
    free(lines);
2281
0
    errno = save_errno;
2282
0
    return 1;
2283
0
}
2284
2285
static int _bcf_hrec_format(const bcf_hrec_t *hrec, int is_bcf, kstring_t *str)
2286
9.18k
{
2287
9.18k
    uint32_t e = 0;
2288
9.18k
    if ( !hrec->value )
2289
4.85k
    {
2290
4.85k
        int j, nout = 0;
2291
4.85k
        e |= ksprintf(str, "##%s=<", hrec->key) < 0;
2292
18.4k
        for (j=0; j<hrec->nkeys; j++)
2293
13.6k
        {
2294
            // do not output IDX if output is VCF
2295
13.6k
            if ( !is_bcf && !strcmp("IDX",hrec->keys[j]) ) continue;
2296
10.2k
            if ( nout ) e |= kputc(',',str) < 0;
2297
10.2k
            e |= ksprintf(str,"%s=%s", hrec->keys[j], hrec->vals[j]) < 0;
2298
10.2k
            nout++;
2299
10.2k
        }
2300
4.85k
        e |= ksprintf(str,">\n") < 0;
2301
4.85k
    }
2302
4.32k
    else
2303
4.32k
        e |= ksprintf(str,"##%s=%s\n", hrec->key,hrec->value) < 0;
2304
2305
9.18k
    return e == 0 ? 0 : -1;
2306
9.18k
}
2307
2308
int bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str)
2309
0
{
2310
0
    return _bcf_hrec_format(hrec,0,str);
2311
0
}
2312
2313
int bcf_hdr_format(const bcf_hdr_t *hdr, int is_bcf, kstring_t *str)
2314
1.51k
{
2315
1.51k
    int i, r = 0;
2316
10.7k
    for (i=0; i<hdr->nhrec; i++)
2317
9.18k
        r |= _bcf_hrec_format(hdr->hrec[i], is_bcf, str) < 0;
2318
2319
1.51k
    r |= ksprintf(str, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO") < 0;
2320
1.51k
    if ( bcf_hdr_nsamples(hdr) )
2321
379
    {
2322
379
        r |= ksprintf(str, "\tFORMAT") < 0;
2323
9.27k
        for (i=0; i<bcf_hdr_nsamples(hdr); i++)
2324
8.89k
            r |= ksprintf(str, "\t%s", hdr->samples[i]) < 0;
2325
379
    }
2326
1.51k
    r |= ksprintf(str, "\n") < 0;
2327
2328
1.51k
    return r ? -1 : 0;
2329
1.51k
}
2330
2331
char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len)
2332
0
{
2333
0
    kstring_t txt = {0,0,0};
2334
0
    if (bcf_hdr_format(hdr, is_bcf, &txt) < 0)
2335
0
        return NULL;
2336
0
    if ( len ) *len = txt.l;
2337
0
    return txt.s;
2338
0
}
2339
2340
const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *n)
2341
0
{
2342
0
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
2343
0
    int i, tid, m = kh_size(d);
2344
0
    const char **names = (const char**) calloc(m,sizeof(const char*));
2345
0
    if ( !names )
2346
0
    {
2347
0
        hts_log_error("Failed to allocate memory");
2348
0
        *n = 0;
2349
0
        return NULL;
2350
0
    }
2351
0
    khint_t k;
2352
0
    for (k=kh_begin(d); k<kh_end(d); k++)
2353
0
    {
2354
0
        if ( !kh_exist(d,k) ) continue;
2355
0
        if ( !kh_val(d, k).hrec[0] ) continue;  // removed via bcf_hdr_remove
2356
0
        tid = kh_val(d,k).id;
2357
0
        if ( tid >= m )
2358
0
        {
2359
            // This can happen after a contig has been removed from BCF header via bcf_hdr_remove()
2360
0
            if ( hts_resize(const char*, tid + 1, &m, &names, HTS_RESIZE_CLEAR)<0 )
2361
0
            {
2362
0
                hts_log_error("Failed to allocate memory");
2363
0
                *n = 0;
2364
0
                free(names);
2365
0
                return NULL;
2366
0
            }
2367
0
            m = tid + 1;
2368
0
        }
2369
0
        names[tid] = kh_key(d,k);
2370
0
    }
2371
    // ensure there are no gaps
2372
0
    for (i=0,tid=0; tid<m; i++,tid++)
2373
0
    {
2374
0
        while ( tid<m && !names[tid] ) tid++;
2375
0
        if ( tid==m ) break;
2376
0
        if ( i==tid ) continue;
2377
0
        names[i] = names[tid];
2378
0
        names[tid] = 0;
2379
0
    }
2380
0
    *n = i;
2381
0
    return names;
2382
0
}
2383
2384
int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h)
2385
1.51k
{
2386
1.51k
    kstring_t htxt = {0,0,0};
2387
1.51k
    if (bcf_hdr_format(h, 0, &htxt) < 0) {
2388
0
        free(htxt.s);
2389
0
        return -1;
2390
0
    }
2391
1.51k
    while (htxt.l && htxt.s[htxt.l-1] == '\0') --htxt.l; // kill trailing zeros
2392
1.51k
    int ret;
2393
1.51k
    if ( fp->format.compression!=no_compression ) {
2394
0
        ret = bgzf_write(fp->fp.bgzf, htxt.s, htxt.l);
2395
0
        if (bgzf_flush(fp->fp.bgzf) != 0) return -1;
2396
1.51k
    } else {
2397
1.51k
        ret = hwrite(fp->fp.hfile, htxt.s, htxt.l);
2398
1.51k
    }
2399
1.51k
    free(htxt.s);
2400
1.51k
    return ret<0 ? -1 : 0;
2401
1.51k
}
2402
2403
/***********************
2404
 *** Typed value I/O ***
2405
 ***********************/
2406
2407
int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
2408
694k
{
2409
694k
    int32_t max = INT32_MIN, min = INT32_MAX;
2410
694k
    int i;
2411
694k
    if (n <= 0) bcf_enc_size(s, 0, BCF_BT_NULL);
2412
694k
    else if (n == 1) bcf_enc_int1(s, a[0]);
2413
687k
    else {
2414
687k
        if (wsize <= 0) wsize = n;
2415
999M
        for (i = 0; i < n; ++i) {
2416
998M
            if (a[i] == bcf_int32_missing || a[i] == bcf_int32_vector_end ) continue;
2417
7.55M
            if (max < a[i]) max = a[i];
2418
7.55M
            if (min > a[i]) min = a[i];
2419
7.55M
        }
2420
687k
        if (max <= BCF_MAX_BT_INT8 && min >= BCF_MIN_BT_INT8) {
2421
30.7k
            bcf_enc_size(s, wsize, BCF_BT_INT8);
2422
63.1M
            for (i = 0; i < n; ++i)
2423
63.1M
                if ( a[i]==bcf_int32_vector_end ) kputc(bcf_int8_vector_end, s);
2424
62.0M
                else if ( a[i]==bcf_int32_missing ) kputc(bcf_int8_missing, s);
2425
1.86M
                else kputc(a[i], s);
2426
656k
        } else if (max <= BCF_MAX_BT_INT16 && min >= BCF_MIN_BT_INT16) {
2427
466k
            uint8_t *p;
2428
466k
            bcf_enc_size(s, wsize, BCF_BT_INT16);
2429
466k
            ks_resize(s, s->l + n * sizeof(int16_t));
2430
466k
            p = (uint8_t *) s->s + s->l;
2431
190M
            for (i = 0; i < n; ++i)
2432
190M
            {
2433
190M
                int16_t x;
2434
190M
                if ( a[i]==bcf_int32_vector_end ) x = bcf_int16_vector_end;
2435
189M
                else if ( a[i]==bcf_int32_missing ) x = bcf_int16_missing;
2436
1.42M
                else x = a[i];
2437
190M
                i16_to_le(x, p);
2438
190M
                p += sizeof(int16_t);
2439
190M
            }
2440
466k
            s->l += n * sizeof(int16_t);
2441
466k
        } else {
2442
189k
            uint8_t *p;
2443
189k
            bcf_enc_size(s, wsize, BCF_BT_INT32);
2444
189k
            ks_resize(s, s->l + n * sizeof(int32_t));
2445
189k
            p = (uint8_t *) s->s + s->l;
2446
745M
            for (i = 0; i < n; ++i) {
2447
745M
                i32_to_le(a[i], p);
2448
745M
                p += sizeof(int32_t);
2449
745M
            }
2450
189k
            s->l += n * sizeof(int32_t);
2451
189k
        }
2452
687k
    }
2453
2454
694k
    return 0; // FIXME: check for errs in this function
2455
694k
}
2456
2457
#ifdef VCF_ALLOW_INT64
2458
static int bcf_enc_long1(kstring_t *s, int64_t x) {
2459
    uint32_t e = 0;
2460
    if (x <= BCF_MAX_BT_INT32 && x >= BCF_MIN_BT_INT32)
2461
        return bcf_enc_int1(s, x);
2462
    if (x == bcf_int64_vector_end) {
2463
        e |= bcf_enc_size(s, 1, BCF_BT_INT8);
2464
        e |= kputc(bcf_int8_vector_end, s) < 0;
2465
    } else if (x == bcf_int64_missing) {
2466
        e |= bcf_enc_size(s, 1, BCF_BT_INT8);
2467
        e |= kputc(bcf_int8_missing, s) < 0;
2468
    } else {
2469
        e |= bcf_enc_size(s, 1, BCF_BT_INT64);
2470
        e |= ks_expand(s, 8);
2471
        if (e == 0) { u64_to_le(x, (uint8_t *) s->s + s->l); s->l += 8; }
2472
    }
2473
    return e == 0 ? 0 : -1;
2474
}
2475
#endif
2476
2477
1.59M
static inline int serialize_float_array(kstring_t *s, size_t n, const float *a) {
2478
1.59M
    uint8_t *p;
2479
1.59M
    size_t i;
2480
1.59M
    size_t bytes = n * sizeof(float);
2481
2482
1.59M
    if (bytes / sizeof(float) != n) return -1;
2483
1.59M
    if (ks_resize(s, s->l + bytes) < 0) return -1;
2484
2485
1.59M
    p = (uint8_t *) s->s + s->l;
2486
502M
    for (i = 0; i < n; i++) {
2487
501M
        float_to_le(a[i], p);
2488
501M
        p += sizeof(float);
2489
501M
    }
2490
1.59M
    s->l += bytes;
2491
2492
1.59M
    return 0;
2493
1.59M
}
2494
2495
int bcf_enc_vfloat(kstring_t *s, int n, float *a)
2496
1.59M
{
2497
1.59M
    assert(n >= 0);
2498
1.59M
    bcf_enc_size(s, n, BCF_BT_FLOAT);
2499
1.59M
    serialize_float_array(s, n, a);
2500
1.59M
    return 0; // FIXME: check for errs in this function
2501
1.59M
}
2502
2503
int bcf_enc_vchar(kstring_t *s, int l, const char *a)
2504
4.66M
{
2505
4.66M
    bcf_enc_size(s, l, BCF_BT_CHAR);
2506
4.66M
    kputsn(a, l, s);
2507
4.66M
    return 0; // FIXME: check for errs in this function
2508
4.66M
}
2509
2510
int bcf_fmt_array(kstring_t *s, int n, int type, void *data)
2511
7.10M
{
2512
7.10M
    int j = 0;
2513
7.10M
    uint32_t e = 0;
2514
7.10M
    if (n == 0) {
2515
5.71M
        return kputc('.', s) >= 0 ? 0 : -1;
2516
5.71M
    }
2517
1.39M
    if (type == BCF_BT_CHAR)
2518
560k
    {
2519
560k
        char *p = (char*)data;
2520
2.60G
        for (j = 0; j < n && *p; ++j, ++p)
2521
2.60G
        {
2522
2.60G
            if ( *p==bcf_str_missing ) e |= kputc('.', s) < 0;
2523
2.59G
            else e |= kputc(*p, s) < 0;
2524
2.60G
        }
2525
560k
    }
2526
834k
    else
2527
834k
    {
2528
834k
        #define BRANCH(type_t, convert, is_missing, is_vector_end, kprint) { \
2529
834k
            uint8_t *p = (uint8_t *) data; \
2530
1.31G
            for (j=0; j<n; j++, p += sizeof(type_t))    \
2531
1.30G
            { \
2532
1.30G
                type_t v = convert(p); \
2533
1.30G
                if ( is_vector_end ) break; \
2534
1.30G
                if ( j ) kputc(',', s); \
2535
1.30G
                if ( is_missing ) kputc('.', s); \
2536
1.30G
                else e |= kprint < 0; \
2537
1.30G
            } \
2538
834k
        }
2539
834k
        switch (type) {
2540
31.2k
            case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8, v==bcf_int8_missing,  v==bcf_int8_vector_end,  kputw(v, s)); break;
2541
416k
            case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, v==bcf_int16_missing, v==bcf_int16_vector_end, kputw(v, s)); break;
2542
166k
            case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, v==bcf_int32_missing, v==bcf_int32_vector_end, kputw(v, s)); break;
2543
220k
            case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, v==bcf_float_missing, v==bcf_float_vector_end, kputd(le_to_float(p), s)); break;
2544
0
            default: hts_log_error("Unexpected type %d", type); exit(1); break;
2545
834k
        }
2546
834k
        #undef BRANCH
2547
834k
    }
2548
1.39M
    return e == 0 ? 0 : -1;
2549
1.39M
}
2550
2551
uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr)
2552
4.28M
{
2553
4.28M
    int x, type;
2554
4.28M
    x = bcf_dec_size(ptr, &ptr, &type);
2555
4.28M
    bcf_fmt_array(s, x, type, ptr);
2556
4.28M
    return ptr + (x << bcf_type_shift[type]);
2557
4.28M
}
2558
2559
/********************
2560
 *** VCF site I/O ***
2561
 ********************/
2562
2563
typedef struct {
2564
    int key, max_m, size, offset;
2565
    uint32_t is_gt:1, max_g:31;
2566
    uint32_t max_l;
2567
    uint32_t y;
2568
    uint8_t *buf;
2569
} fmt_aux_t;
2570
2571
static inline int align_mem(kstring_t *s)
2572
82.1k
{
2573
82.1k
    int e = 0;
2574
82.1k
    if (s->l&7) {
2575
13.4k
        uint64_t zero = 0;
2576
13.4k
        e = kputsn((char*)&zero, 8 - (s->l&7), s) < 0;
2577
13.4k
    }
2578
82.1k
    return e == 0 ? 0 : -1;
2579
82.1k
}
2580
2581
// p,q is the start and the end of the FORMAT field
2582
82.4k
#define MAX_N_FMT 255   /* Limited by size of bcf1_t n_fmt field */
2583
static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q)
2584
5.22k
{
2585
5.22k
    if ( !bcf_hdr_nsamples(h) ) return 0;
2586
2587
3.25k
    static int extreme_val_warned = 0;
2588
3.25k
    char *r, *t;
2589
3.25k
    int j, l, m, g, overflow = 0;
2590
3.25k
    khint_t k;
2591
3.25k
    ks_tokaux_t aux1;
2592
3.25k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
2593
3.25k
    kstring_t *mem = (kstring_t*)&h->mem;
2594
3.25k
    fmt_aux_t fmt[MAX_N_FMT];
2595
3.25k
    mem->l = 0;
2596
2597
3.25k
    char *end = s->s + s->l;
2598
3.25k
    if ( q>=end )
2599
8
    {
2600
8
        hts_log_error("FORMAT column with no sample columns starting at %s:%"PRIhts_pos"", bcf_seqname_safe(h,v), v->pos+1);
2601
8
        v->errcode |= BCF_ERR_NCOLS;
2602
8
        return -1;
2603
8
    }
2604
2605
3.25k
    v->n_fmt = 0;
2606
3.25k
    if ( p[0]=='.' && p[1]==0 ) // FORMAT field is empty "."
2607
146
    {
2608
146
        v->n_sample = bcf_hdr_nsamples(h);
2609
146
        return 0;
2610
146
    }
2611
2612
    // get format information from the dictionary
2613
85.5k
    for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) {
2614
82.4k
        if (j >= MAX_N_FMT) {
2615
1
            v->errcode |= BCF_ERR_LIMITS;
2616
1
            hts_log_error("FORMAT column at %s:%"PRIhts_pos" lists more identifiers than htslib can handle",
2617
1
                bcf_seqname_safe(h,v), v->pos+1);
2618
1
            return -1;
2619
1
        }
2620
2621
82.4k
        *(char*)aux1.p = 0;
2622
82.4k
        k = kh_get(vdict, d, t);
2623
82.4k
        if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_FMT] == 15) {
2624
3.75k
            if ( t[0]=='.' && t[1]==0 )
2625
0
            {
2626
0
                hts_log_error("Invalid FORMAT tag name '.' at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
2627
0
                v->errcode |= BCF_ERR_TAG_INVALID;
2628
0
                return -1;
2629
0
            }
2630
3.75k
            hts_log_warning("FORMAT '%s' at %s:%"PRIhts_pos" is not defined in the header, assuming Type=String", t, bcf_seqname_safe(h,v), v->pos+1);
2631
3.75k
            kstring_t tmp = {0,0,0};
2632
3.75k
            int l;
2633
3.75k
            ksprintf(&tmp, "##FORMAT=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", t);
2634
3.75k
            bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
2635
3.75k
            free(tmp.s);
2636
3.75k
            int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
2637
3.75k
            if (res < 0) bcf_hrec_destroy(hrec);
2638
3.75k
            if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
2639
2640
3.75k
            k = kh_get(vdict, d, t);
2641
3.75k
            v->errcode |= BCF_ERR_TAG_UNDEF;
2642
3.75k
            if (res || k == kh_end(d)) {
2643
2
                hts_log_error("Could not add dummy header for FORMAT '%s' at %s:%"PRIhts_pos, t, bcf_seqname_safe(h,v), v->pos+1);
2644
2
                v->errcode |= BCF_ERR_TAG_INVALID;
2645
2
                return -1;
2646
2
            }
2647
3.75k
        }
2648
82.4k
        fmt[j].max_l = fmt[j].max_m = fmt[j].max_g = 0;
2649
82.4k
        fmt[j].key = kh_val(d, k).id;
2650
82.4k
        fmt[j].is_gt = !strcmp(t, "GT");
2651
82.4k
        fmt[j].y = h->id[0][fmt[j].key].val->info[BCF_HL_FMT];
2652
82.4k
        v->n_fmt++;
2653
82.4k
    }
2654
    // compute max
2655
3.10k
    int n_sample_ori = -1;
2656
3.10k
    r = q + 1;  // r: position in the format string
2657
3.10k
    l = 0, m = g = 1, v->n_sample = 0;  // m: max vector size, l: max field len, g: max number of alleles
2658
27.2k
    while ( r<end )
2659
27.1k
    {
2660
        // can we skip some samples?
2661
27.1k
        if ( h->keep_samples )
2662
0
        {
2663
0
            n_sample_ori++;
2664
0
            if ( !bit_array_test(h->keep_samples,n_sample_ori) )
2665
0
            {
2666
0
                while ( *r!='\t' && r<end ) r++;
2667
0
                if ( *r=='\t' ) { *r = 0; r++; }
2668
0
                continue;
2669
0
            }
2670
0
        }
2671
2672
        // collect fmt stats: max vector size, length, number of alleles
2673
27.1k
        j = 0;  // j-th format field
2674
27.1k
        fmt_aux_t *f = fmt;
2675
15.9M
        for (;;) {
2676
15.9M
            switch (*r) {
2677
8.87M
            case ',':
2678
8.87M
                m++;
2679
8.87M
                break;
2680
2681
12.5k
            case '|':
2682
1.72M
            case '/':
2683
1.72M
                if (f->is_gt) g++;
2684
1.72M
                break;
2685
2686
5.48k
            case '\t':
2687
5.48k
                *r = 0; // fall through
2688
2689
27.1k
            case '\0':
2690
47.3k
            case ':':
2691
47.3k
                if (f->max_m < m) f->max_m = m;
2692
47.3k
                if (f->max_l < l) f->max_l = l;
2693
47.3k
                if (f->is_gt && f->max_g < g) f->max_g = g;
2694
47.3k
                l = 0, m = g = 1;
2695
47.3k
                if ( *r==':' ) {
2696
20.2k
                    j++; f++;
2697
20.2k
                    if ( j>=v->n_fmt ) {
2698
3
                        hts_log_error("Incorrect number of FORMAT fields at %s:%"PRIhts_pos"",
2699
3
                                      h->id[BCF_DT_CTG][v->rid].key, v->pos+1);
2700
3
                        v->errcode |= BCF_ERR_NCOLS;
2701
3
                        return -1;
2702
3
                    }
2703
27.1k
                } else goto end_for;
2704
20.2k
                break;
2705
15.9M
            }
2706
15.9M
            if ( r>=end ) break;
2707
15.9M
            r++; l++;
2708
15.9M
        }
2709
27.1k
    end_for:
2710
27.1k
        v->n_sample++;
2711
27.1k
        if ( v->n_sample == bcf_hdr_nsamples(h) ) break;
2712
24.1k
        r++;
2713
24.1k
    }
2714
2715
    // allocate memory for arrays
2716
85.2k
    for (j = 0; j < v->n_fmt; ++j) {
2717
82.1k
        fmt_aux_t *f = &fmt[j];
2718
82.1k
        if ( !f->max_m ) f->max_m = 1;  // omitted trailing format field
2719
82.1k
        if ((f->y>>4&0xf) == BCF_HT_STR) {
2720
82.1k
            f->size = f->is_gt? f->max_g << 2 : f->max_l;
2721
82.1k
        } else if ((f->y>>4&0xf) == BCF_HT_REAL || (f->y>>4&0xf) == BCF_HT_INT) {
2722
0
            f->size = f->max_m << 2;
2723
0
        } else
2724
0
        {
2725
0
            hts_log_error("The format type %d at %s:%"PRIhts_pos" is currently not supported", f->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1);
2726
0
            v->errcode |= BCF_ERR_TAG_INVALID;
2727
0
            return -1;
2728
0
        }
2729
82.1k
        if (align_mem(mem) < 0) {
2730
0
            hts_log_error("Memory allocation failure at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
2731
0
            v->errcode |= BCF_ERR_LIMITS;
2732
0
            return -1;
2733
0
        }
2734
2735
        // Limit the total memory to ~2Gb per VCF row.  This should mean
2736
        // malformed VCF data is less likely to take excessive memory and/or
2737
        // time.
2738
82.1k
        if ((uint64_t) mem->l + v->n_sample * (uint64_t)f->size > INT_MAX) {
2739
0
            hts_log_error("Excessive memory required by FORMAT fields at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
2740
0
            v->errcode |= BCF_ERR_LIMITS;
2741
0
            return -1;
2742
0
        }
2743
2744
82.1k
        f->offset = mem->l;
2745
82.1k
        if (ks_resize(mem, mem->l + v->n_sample * (size_t)f->size) < 0) {
2746
0
            hts_log_error("Memory allocation failure at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
2747
0
            v->errcode |= BCF_ERR_LIMITS;
2748
0
            return -1;
2749
0
        }
2750
82.1k
        mem->l += v->n_sample * f->size;
2751
82.1k
    }
2752
85.2k
    for (j = 0; j < v->n_fmt; ++j)
2753
82.1k
        fmt[j].buf = (uint8_t*)mem->s + fmt[j].offset;
2754
    // fill the sample fields; at beginning of the loop, t points to the first char of a format
2755
3.09k
    n_sample_ori = -1;
2756
3.09k
    t = q + 1; m = 0;   // m: sample id
2757
29.6k
    while ( t<end )
2758
28.6k
    {
2759
        // can we skip some samples?
2760
28.6k
        if ( h->keep_samples )
2761
0
        {
2762
0
            n_sample_ori++;
2763
0
            if ( !bit_array_test(h->keep_samples,n_sample_ori) )
2764
0
            {
2765
0
                while ( *t && t<end ) t++;
2766
0
                t++;
2767
0
                continue;
2768
0
            }
2769
0
        }
2770
28.6k
        if ( m == bcf_hdr_nsamples(h) ) break;
2771
2772
26.6k
        j = 0; // j-th format field, m-th sample
2773
46.8k
        while ( t < end )
2774
46.6k
        {
2775
46.6k
            fmt_aux_t *z = &fmt[j++];
2776
46.6k
            if (!z->buf) {
2777
0
                hts_log_error("Memory allocation failure for FORMAT field type %d at %s:%"PRIhts_pos,
2778
0
                              z->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1);
2779
0
                v->errcode |= BCF_ERR_LIMITS;
2780
0
                return -1;
2781
0
            }
2782
46.6k
            if ((z->y>>4&0xf) == BCF_HT_STR) {
2783
46.6k
                if (z->is_gt) { // genotypes
2784
1.47k
                    int32_t is_phased = 0;
2785
1.47k
                    uint32_t *x = (uint32_t*)(z->buf + z->size * (size_t)m);
2786
1.47k
                    uint32_t unreadable = 0;
2787
1.47k
                    uint32_t max = 0;
2788
1.47k
                    overflow = 0;
2789
397k
                    for (l = 0;; ++t) {
2790
397k
                        if (*t == '.') {
2791
226k
                            ++t, x[l++] = is_phased;
2792
226k
                        } else {
2793
170k
                            char *tt = t;
2794
170k
                            uint32_t val = hts_str2uint(t, &t, sizeof(val) * CHAR_MAX - 2, &overflow);
2795
170k
                            unreadable |= tt == t;
2796
170k
                            if (max < val) max = val;
2797
170k
                            x[l++] = (val + 1) << 1 | is_phased;
2798
170k
                        }
2799
397k
                        is_phased = (*t == '|');
2800
397k
                        if (*t != '|' && *t != '/') break;
2801
397k
                    }
2802
                    // Possibly check max against v->n_allele instead?
2803
1.47k
                    if (overflow || max > (INT32_MAX >> 1) - 1) {
2804
33
                        hts_log_error("Couldn't read GT data: value too large at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
2805
33
                        return -1;
2806
33
                    }
2807
1.44k
                    if (unreadable) {
2808
18
                        hts_log_error("Couldn't read GT data: value not a number or '.' at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
2809
18
                        return -1;
2810
18
                    }
2811
1.42k
                    if ( !l ) x[l++] = 0;   // An empty field, insert missing value
2812
389k
                    for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
2813
45.1k
                } else {
2814
45.1k
                    char *x = (char*)z->buf + z->size * (size_t)m;
2815
14.7M
                    for (r = t, l = 0; *t != ':' && *t; ++t) x[l++] = *t;
2816
278M
                    for (; l < z->size; ++l) x[l] = 0;
2817
45.1k
                }
2818
46.6k
            } else if ((z->y>>4&0xf) == BCF_HT_INT) {
2819
0
                int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
2820
0
                for (l = 0;; ++t) {
2821
0
                    if (*t == '.') {
2822
0
                        x[l++] = bcf_int32_missing, ++t; // ++t to skip "."
2823
0
                    } else {
2824
0
                        overflow = 0;
2825
0
                        char *te;
2826
0
                        long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
2827
0
                        if ( te==t || overflow || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 )
2828
0
                        {
2829
0
                            if ( !extreme_val_warned )
2830
0
                            {
2831
0
                                hts_log_warning("Extreme FORMAT/%s value encountered and set to missing at %s:%"PRIhts_pos, h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname_safe(h,v), v->pos+1);
2832
0
                                extreme_val_warned = 1;
2833
0
                            }
2834
0
                            tmp_val = bcf_int32_missing;
2835
0
                        }
2836
0
                        x[l++] = tmp_val;
2837
0
                        t = te;
2838
0
                    }
2839
0
                    if (*t != ',') break;
2840
0
                }
2841
0
                if ( !l ) x[l++] = bcf_int32_missing;
2842
0
                for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
2843
0
            } else if ((z->y>>4&0xf) == BCF_HT_REAL) {
2844
0
                float *x = (float*)(z->buf + z->size * (size_t)m);
2845
0
                for (l = 0;; ++t) {
2846
0
                    if (*t == '.' && !isdigit_c(t[1])) {
2847
0
                        bcf_float_set_missing(x[l++]), ++t; // ++t to skip "."
2848
0
                    } else {
2849
0
                        overflow = 0;
2850
0
                        char *te;
2851
0
                        float tmp_val = hts_str2dbl(t, &te, &overflow);
2852
0
                        if ( (te==t || overflow) && !extreme_val_warned )
2853
0
                        {
2854
0
                            hts_log_warning("Extreme FORMAT/%s value encountered at %s:%"PRIhts_pos, h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname(h,v), v->pos+1);
2855
0
                            extreme_val_warned = 1;
2856
0
                        }
2857
0
                        x[l++] = tmp_val;
2858
0
                        t = te;
2859
0
                    }
2860
0
                    if (*t != ',') break;
2861
0
                }
2862
0
                if ( !l ) bcf_float_set_missing(x[l++]);    // An empty field, insert missing value
2863
0
                for (; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]);
2864
0
            } else {
2865
0
                hts_log_error("Unknown FORMAT field type %d at %s:%"PRIhts_pos, z->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1);
2866
0
                v->errcode |= BCF_ERR_TAG_INVALID;
2867
0
                return -1;
2868
0
            }
2869
2870
46.5k
            if (*t == '\0') {
2871
26.3k
                break;
2872
26.3k
            }
2873
20.2k
            else if (*t == ':') {
2874
20.2k
                t++;
2875
20.2k
            }
2876
2
            else {
2877
2
                char buffer[8];
2878
2
                hts_log_error("Invalid character %s in '%s' FORMAT field at %s:%"PRIhts_pos"",
2879
2
                    hts_strprint(buffer, sizeof buffer, '\'', t, 1),
2880
2
                    h->id[BCF_DT_ID][z->key].key, bcf_seqname_safe(h,v), v->pos+1);
2881
2
                v->errcode |= BCF_ERR_CHAR;
2882
2
                return -1;
2883
2
            }
2884
46.5k
        }
2885
2886
2.10M
        for (; j < v->n_fmt; ++j) { // fill end-of-vector values
2887
2.07M
            fmt_aux_t *z = &fmt[j];
2888
2.07M
            if ((z->y>>4&0xf) == BCF_HT_STR) {
2889
2.07M
                if (z->is_gt) {
2890
13.9k
                    int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
2891
13.9k
                    if (z->size) x[0] = bcf_int32_missing;
2892
1.60M
                    for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
2893
2.05M
                } else {
2894
2.05M
                    char *x = (char*)z->buf + z->size * (size_t)m;
2895
2.05M
                    if ( z->size ) x[0] = '.';
2896
232M
                    for (l = 1; l < z->size; ++l) x[l] = 0;
2897
2.05M
                }
2898
2.07M
            } else if ((z->y>>4&0xf) == BCF_HT_INT) {
2899
0
                int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
2900
0
                x[0] = bcf_int32_missing;
2901
0
                for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
2902
0
            } else if ((z->y>>4&0xf) == BCF_HT_REAL) {
2903
0
                float *x = (float*)(z->buf + z->size * (size_t)m);
2904
0
                bcf_float_set_missing(x[0]);
2905
0
                for (l = 1; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]);
2906
0
            }
2907
2.07M
        }
2908
2909
26.5k
        m++; t++;
2910
26.5k
    }
2911
2912
    // write individual genotype information
2913
3.04k
    kstring_t *str = &v->indiv;
2914
3.04k
    int i;
2915
3.04k
    if (v->n_sample > 0) {
2916
84.4k
        for (i = 0; i < v->n_fmt; ++i) {
2917
81.4k
            fmt_aux_t *z = &fmt[i];
2918
81.4k
            bcf_enc_int1(str, z->key);
2919
81.4k
            if ((z->y>>4&0xf) == BCF_HT_STR && !z->is_gt) {
2920
79.9k
                bcf_enc_size(str, z->size, BCF_BT_CHAR);
2921
79.9k
                kputsn((char*)z->buf, z->size * (size_t)v->n_sample, str);
2922
79.9k
            } else if ((z->y>>4&0xf) == BCF_HT_INT || z->is_gt) {
2923
1.45k
                bcf_enc_vint(str, (z->size>>2) * v->n_sample, (int32_t*)z->buf, z->size>>2);
2924
1.45k
            } else {
2925
0
                bcf_enc_size(str, z->size>>2, BCF_BT_FLOAT);
2926
0
                if (serialize_float_array(str, (z->size>>2) * (size_t)v->n_sample,
2927
0
                                          (float *) z->buf) != 0) {
2928
0
                    v->errcode |= BCF_ERR_LIMITS;
2929
0
                    hts_log_error("Out of memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
2930
0
                    return -1;
2931
0
                }
2932
0
            }
2933
81.4k
        }
2934
3.04k
    }
2935
2936
3.04k
    if ( v->n_sample!=bcf_hdr_nsamples(h) )
2937
63
    {
2938
63
        hts_log_error("Number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)",
2939
63
            bcf_seqname_safe(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h));
2940
63
        v->errcode |= BCF_ERR_NCOLS;
2941
63
        return -1;
2942
63
    }
2943
2.98k
    if ( v->indiv.l > 0xffffffff )
2944
0
    {
2945
0
        hts_log_error("The FORMAT at %s:%"PRIhts_pos" is too long", bcf_seqname_safe(h,v), v->pos+1);
2946
0
        v->errcode |= BCF_ERR_LIMITS;
2947
2948
        // Error recovery: return -1 if this is a critical error or 0 if we want to ignore the FORMAT and proceed
2949
0
        v->n_fmt = 0;
2950
0
        return -1;
2951
0
    }
2952
2953
2.98k
    return 0;
2954
2.98k
}
2955
2956
6.11k
static khint_t fix_chromosome(const bcf_hdr_t *h, vdict_t *d, const char *p) {
2957
    // Simple error recovery for chromosomes not defined in the header. It will not help when VCF header has
2958
    // been already printed, but will enable tools like vcfcheck to proceed.
2959
2960
6.11k
    kstring_t tmp = {0,0,0};
2961
6.11k
    khint_t k;
2962
6.11k
    int l;
2963
6.11k
    if (ksprintf(&tmp, "##contig=<ID=%s>", p) < 0)
2964
0
        return kh_end(d);
2965
6.11k
    bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
2966
6.11k
    free(tmp.s);
2967
6.11k
    int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
2968
6.11k
    if (res < 0) bcf_hrec_destroy(hrec);
2969
6.11k
    if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
2970
6.11k
    k = kh_get(vdict, d, p);
2971
2972
6.11k
    return k;
2973
6.11k
}
2974
2975
7.53k
static int vcf_parse_filter(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) {
2976
7.53k
    int i, n_flt = 1, max_n_flt = 0;
2977
7.53k
    char *r, *t;
2978
7.53k
    int32_t *a_flt = NULL;
2979
7.53k
    ks_tokaux_t aux1;
2980
7.53k
    khint_t k;
2981
7.53k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
2982
    // count the number of filters
2983
7.53k
    if (*(q-1) == ';') *(q-1) = 0;
2984
250M
    for (r = p; *r; ++r)
2985
250M
        if (*r == ';') ++n_flt;
2986
7.53k
    if (n_flt > max_n_flt) {
2987
7.53k
        a_flt = malloc(n_flt * sizeof(*a_flt));
2988
7.53k
        if (!a_flt) {
2989
0
            hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
2990
0
            v->errcode |= BCF_ERR_LIMITS; // No appropriate code?
2991
0
            return -1;
2992
0
        }
2993
7.53k
        max_n_flt = n_flt;
2994
7.53k
    }
2995
    // add filters
2996
1.60M
    for (t = kstrtok(p, ";", &aux1), i = 0; t; t = kstrtok(0, 0, &aux1)) {
2997
1.59M
        *(char*)aux1.p = 0;
2998
1.59M
        k = kh_get(vdict, d, t);
2999
1.59M
        if (k == kh_end(d))
3000
3.96k
        {
3001
            // Simple error recovery for FILTERs not defined in the header. It will not help when VCF header has
3002
            // been already printed, but will enable tools like vcfcheck to proceed.
3003
3.96k
            hts_log_warning("FILTER '%s' is not defined in the header", t);
3004
3.96k
            kstring_t tmp = {0,0,0};
3005
3.96k
            int l;
3006
3.96k
            ksprintf(&tmp, "##FILTER=<ID=%s,Description=\"Dummy\">", t);
3007
3.96k
            bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
3008
3.96k
            free(tmp.s);
3009
3.96k
            int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
3010
3.96k
            if (res < 0) bcf_hrec_destroy(hrec);
3011
3.96k
            if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
3012
3.96k
            k = kh_get(vdict, d, t);
3013
3.96k
            v->errcode |= BCF_ERR_TAG_UNDEF;
3014
3.96k
            if (res || k == kh_end(d)) {
3015
18
                hts_log_error("Could not add dummy header for FILTER '%s' at %s:%"PRIhts_pos, t, bcf_seqname_safe(h,v), v->pos+1);
3016
18
                v->errcode |= BCF_ERR_TAG_INVALID;
3017
18
                free(a_flt);
3018
18
                return -1;
3019
18
            }
3020
3.96k
        }
3021
1.59M
        a_flt[i++] = kh_val(d, k).id;
3022
1.59M
    }
3023
3024
7.51k
    bcf_enc_vint(str, n_flt, a_flt, -1);
3025
7.51k
    free(a_flt);
3026
3027
7.51k
    return 0;
3028
7.53k
}
3029
3030
6.51k
static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) {
3031
6.51k
    static int extreme_int_warned = 0, negative_rlen_warned = 0;
3032
6.51k
    int max_n_val = 0, overflow = 0;
3033
6.51k
    char *r, *key;
3034
6.51k
    khint_t k;
3035
6.51k
    vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
3036
6.51k
    int32_t *a_val = NULL;
3037
3038
6.51k
    v->n_info = 0;
3039
6.51k
    if (*(q-1) == ';') *(q-1) = 0;
3040
178M
    for (r = key = p;; ++r) {
3041
178M
        int c;
3042
178M
        char *val, *end;
3043
178M
        if (*r != ';' && *r != '=' && *r != 0) continue;
3044
5.51M
        if (v->n_info == UINT16_MAX) {
3045
9
            hts_log_error("Too many INFO entries at %s:%"PRIhts_pos,
3046
9
                          bcf_seqname_safe(h,v), v->pos+1);
3047
9
            v->errcode |= BCF_ERR_LIMITS;
3048
9
            goto fail;
3049
9
        }
3050
5.51M
        val = end = 0;
3051
5.51M
        c = *r; *r = 0;
3052
5.51M
        if (c == '=') {
3053
5.17M
            val = r + 1;
3054
2.20G
            for (end = val; *end != ';' && *end != 0; ++end);
3055
5.17M
            c = *end; *end = 0;
3056
5.17M
        } else end = r;
3057
5.51M
        if ( !*key ) { if (c==0) break; r = end; key = r + 1; continue; }  // faulty VCF, ";;" in the INFO
3058
5.50M
        k = kh_get(vdict, d, key);
3059
5.50M
        if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_INFO] == 15)
3060
4.71k
        {
3061
4.71k
            hts_log_warning("INFO '%s' is not defined in the header, assuming Type=String", key);
3062
4.71k
            kstring_t tmp = {0,0,0};
3063
4.71k
            int l;
3064
4.71k
            ksprintf(&tmp, "##INFO=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", key);
3065
4.71k
            bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
3066
4.71k
            free(tmp.s);
3067
4.71k
            int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1;
3068
4.71k
            if (res < 0) bcf_hrec_destroy(hrec);
3069
4.71k
            if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h);
3070
4.71k
            k = kh_get(vdict, d, key);
3071
4.71k
            v->errcode |= BCF_ERR_TAG_UNDEF;
3072
4.71k
            if (res || k == kh_end(d)) {
3073
45
                hts_log_error("Could not add dummy header for INFO '%s' at %s:%"PRIhts_pos, key, bcf_seqname_safe(h,v), v->pos+1);
3074
45
                v->errcode |= BCF_ERR_TAG_INVALID;
3075
45
                goto fail;
3076
45
            }
3077
4.71k
        }
3078
5.50M
        uint32_t y = kh_val(d, k).info[BCF_HL_INFO];
3079
5.50M
        ++v->n_info;
3080
5.50M
        bcf_enc_int1(str, kh_val(d, k).id);
3081
5.50M
        if (val == 0) {
3082
335k
            bcf_enc_size(str, 0, BCF_BT_NULL);
3083
5.16M
        } else if ((y>>4&0xf) == BCF_HT_FLAG || (y>>4&0xf) == BCF_HT_STR) { // if Flag has a value, treat it as a string
3084
47.1k
            bcf_enc_vchar(str, end - val, val);
3085
5.12M
        } else { // int/float value/array
3086
5.12M
            int i, n_val;
3087
5.12M
            char *t, *te;
3088
1.89G
            for (t = val, n_val = 1; *t; ++t) // count the number of values
3089
1.88G
                if (*t == ',') ++n_val;
3090
            // Check both int and float size in one step for simplicity
3091
5.12M
            if (n_val > max_n_val) {
3092
2.84k
                int32_t *a_tmp = (int32_t *)realloc(a_val, n_val * sizeof(*a_val));
3093
2.84k
                if (!a_tmp) {
3094
0
                    hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
3095
0
                    v->errcode |= BCF_ERR_LIMITS; // No appropriate code?
3096
0
                    goto fail;
3097
0
                }
3098
2.84k
                a_val = a_tmp;
3099
2.84k
                max_n_val = n_val;
3100
2.84k
            }
3101
5.12M
            if ((y>>4&0xf) == BCF_HT_INT) {
3102
3.52M
                i = 0, t = val;
3103
3.52M
                int64_t val1;
3104
3.52M
                int is_int64 = 0;
3105
#ifdef VCF_ALLOW_INT64
3106
                if ( n_val==1 )
3107
                {
3108
                    overflow = 0;
3109
                    long long int tmp_val = hts_str2int(val, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
3110
                    if ( te==val ) tmp_val = bcf_int32_missing;
3111
                    else if ( overflow || tmp_val<BCF_MIN_BT_INT64 || tmp_val>BCF_MAX_BT_INT64 )
3112
                    {
3113
                        if ( !extreme_int_warned )
3114
                        {
3115
                            hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1);
3116
                            extreme_int_warned = 1;
3117
                        }
3118
                        tmp_val = bcf_int32_missing;
3119
                    }
3120
                    else
3121
                        is_int64 = 1;
3122
                    val1 = tmp_val;
3123
                    t = te;
3124
                    i = 1;  // this is just to avoid adding another nested block...
3125
                }
3126
#endif
3127
1.00G
                for (; i < n_val; ++i, ++t)
3128
997M
                {
3129
997M
                    overflow = 0;
3130
997M
                    long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
3131
997M
                    if ( te==t ) tmp_val = bcf_int32_missing;
3132
9.95M
                    else if ( overflow || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 )
3133
2.10M
                    {
3134
2.10M
                        if ( !extreme_int_warned )
3135
1
                        {
3136
1
                            hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1);
3137
1
                            extreme_int_warned = 1;
3138
1
                        }
3139
2.10M
                        tmp_val = bcf_int32_missing;
3140
2.10M
                    }
3141
997M
                    a_val[i] = tmp_val;
3142
1.19G
                    for (t = te; *t && *t != ','; t++);
3143
997M
                }
3144
3.52M
                if (n_val == 1) {
3145
#ifdef VCF_ALLOW_INT64
3146
                    if ( is_int64 )
3147
                    {
3148
                        v->unpacked |= BCF_IS_64BIT;
3149
                        bcf_enc_long1(str, val1);
3150
                    }
3151
                    else
3152
                        bcf_enc_int1(str, (int32_t)val1);
3153
#else
3154
2.83M
                    val1 = a_val[0];
3155
2.83M
                    bcf_enc_int1(str, (int32_t)val1);
3156
2.83M
#endif
3157
2.83M
                } else {
3158
685k
                    bcf_enc_vint(str, n_val, a_val, -1);
3159
685k
                }
3160
3.52M
                if (n_val==1 && (val1!=bcf_int32_missing || is_int64) && strcmp(key, "END") == 0)
3161
0
                {
3162
0
                    if ( val1 <= v->pos )
3163
0
                    {
3164
0
                        if ( !negative_rlen_warned )
3165
0
                        {
3166
0
                            hts_log_warning("INFO/END=%"PRIhts_pos" is smaller than POS at %s:%"PRIhts_pos,val1,bcf_seqname_safe(h,v),v->pos+1);
3167
0
                            negative_rlen_warned = 1;
3168
0
                        }
3169
0
                    }
3170
0
                    else
3171
0
                        v->rlen = val1 - v->pos;
3172
0
                }
3173
3.52M
            } else if ((y>>4&0xf) == BCF_HT_REAL) {
3174
1.59M
                float *val_f = (float *)a_val;
3175
502M
                for (i = 0, t = val; i < n_val; ++i, ++t)
3176
501M
                {
3177
501M
                    overflow = 0;
3178
501M
                    val_f[i] = hts_str2dbl(t, &te, &overflow);
3179
501M
                    if ( te==t || overflow ) // conversion failed
3180
497M
                        bcf_float_set_missing(val_f[i]);
3181
623M
                    for (t = te; *t && *t != ','; t++);
3182
501M
                }
3183
1.59M
                bcf_enc_vfloat(str, n_val, val_f);
3184
1.59M
            }
3185
5.12M
        }
3186
5.50M
        if (c == 0) break;
3187
5.50M
        r = end;
3188
5.50M
        key = r + 1;
3189
5.50M
    }
3190
3191
6.46k
    free(a_val);
3192
6.46k
    return 0;
3193
3194
54
 fail:
3195
54
    free(a_val);
3196
54
    return -1;
3197
6.51k
}
3198
3199
int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
3200
195k
{
3201
195k
    int i = 0, ret = -2, overflow = 0;
3202
195k
    char *p, *q, *r, *t;
3203
195k
    kstring_t *str;
3204
195k
    khint_t k;
3205
195k
    ks_tokaux_t aux;
3206
3207
195k
    if (!s || !h || !v || !(s->s))
3208
0
        return ret;
3209
3210
    // Assumed in lots of places, but we may as well spot this early
3211
195k
    assert(sizeof(float) == sizeof(int32_t));
3212
3213
195k
    bcf_clear1(v);
3214
195k
    str = &v->shared;
3215
195k
    memset(&aux, 0, sizeof(ks_tokaux_t));
3216
453k
    for (p = kstrtok(s->s, "\t", &aux), i = 0; p; p = kstrtok(0, 0, &aux), ++i) {
3217
263k
        q = (char*)aux.p;
3218
263k
        *q = 0;
3219
263k
        if (i == 0) { // CHROM
3220
195k
            vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
3221
195k
            k = kh_get(vdict, d, p);
3222
195k
            if (k == kh_end(d))
3223
6.11k
            {
3224
6.11k
                hts_log_warning("Contig '%s' is not defined in the header. (Quick workaround: index the file with tabix.)", p);
3225
6.11k
                v->errcode = BCF_ERR_CTG_UNDEF;
3226
6.11k
                if ((k = fix_chromosome(h, d, p)) == kh_end(d)) {
3227
424
                    hts_log_error("Could not add dummy header for contig '%s'", p);
3228
424
                    v->errcode |= BCF_ERR_CTG_INVALID;
3229
424
                    goto err;
3230
424
                }
3231
6.11k
            }
3232
195k
            v->rid = kh_val(d, k).id;
3233
195k
        } else if (i == 1) { // POS
3234
11.6k
            overflow = 0;
3235
11.6k
            char *tmp = p;
3236
11.6k
            v->pos = hts_str2uint(p, &p, 63, &overflow);
3237
11.6k
            if (overflow) {
3238
2
                hts_log_error("Position value '%s' is too large", tmp);
3239
2
                goto err;
3240
11.6k
            } else if ( *p ) {
3241
136
                hts_log_error("Could not parse the position '%s'", tmp);
3242
136
                goto err;
3243
11.5k
            } else {
3244
11.5k
                v->pos -= 1;
3245
11.5k
            }
3246
11.5k
            if (v->pos >= INT32_MAX)
3247
807
                v->unpacked |= BCF_IS_64BIT;
3248
56.0k
        } else if (i == 2) { // ID
3249
10.4k
            if (strcmp(p, ".")) bcf_enc_vchar(str, q - p, p);
3250
173
            else bcf_enc_size(str, 0, BCF_BT_CHAR);
3251
45.6k
        } else if (i == 3) { // REF
3252
9.98k
            bcf_enc_vchar(str, q - p, p);
3253
9.98k
            v->n_allele = 1, v->rlen = q - p;
3254
35.6k
        } else if (i == 4) { // ALT
3255
8.46k
            if (strcmp(p, ".")) {
3256
721M
                for (r = t = p;; ++r) {
3257
721M
                    if (*r == ',' || *r == 0) {
3258
4.59M
                        if (v->n_allele == UINT16_MAX) {
3259
4
                            hts_log_error("Too many ALT alleles at %s:%"PRIhts_pos,
3260
4
                                          bcf_seqname_safe(h,v), v->pos+1);
3261
4
                            v->errcode |= BCF_ERR_LIMITS;
3262
4
                            goto err;
3263
4
                        }
3264
4.59M
                        bcf_enc_vchar(str, r - t, t);
3265
4.59M
                        t = r + 1;
3266
4.59M
                        ++v->n_allele;
3267
4.59M
                    }
3268
721M
                    if (r == q) break;
3269
721M
                }
3270
8.17k
            }
3271
27.1k
        } else if (i == 5) { // QUAL
3272
7.84k
            if (strcmp(p, ".")) v->qual = atof(p);
3273
21
            else bcf_float_set_missing(v->qual);
3274
7.84k
            if ( v->max_unpack && !(v->max_unpack>>1) ) goto end; // BCF_UN_STR
3275
19.3k
        } else if (i == 6) { // FILTER
3276
7.55k
            if (strcmp(p, ".")) {
3277
7.53k
                if (vcf_parse_filter(str, h, v, p, q)) goto err;
3278
7.53k
            } else bcf_enc_vint(str, 0, 0, -1);
3279
7.53k
            if ( v->max_unpack && !(v->max_unpack>>2) ) goto end; // BCF_UN_FLT
3280
11.7k
        } else if (i == 7) { // INFO
3281
6.53k
            if (strcmp(p, ".")) {
3282
6.51k
                if (vcf_parse_info(str, h, v, p, q)) goto err;
3283
6.51k
            }
3284
6.48k
            if ( v->max_unpack && !(v->max_unpack>>3) ) goto end;
3285
6.48k
        } else if (i == 8) {// FORMAT
3286
5.22k
            return vcf_parse_format(s, h, v, p, q) == 0 ? 0 : -2;
3287
5.22k
        }
3288
263k
    }
3289
3290
189k
 end:
3291
189k
    ret = 0;
3292
3293
190k
 err:
3294
190k
    return ret;
3295
189k
}
3296
3297
int vcf_open_mode(char *mode, const char *fn, const char *format)
3298
0
{
3299
0
    if (format == NULL) {
3300
        // Try to pick a format based on the filename extension
3301
0
        char extension[HTS_MAX_EXT_LEN];
3302
0
        if (find_file_extension(fn, extension) < 0) return -1;
3303
0
        return vcf_open_mode(mode, fn, extension);
3304
0
    }
3305
0
    else if (strcasecmp(format, "bcf") == 0) strcpy(mode, "b");
3306
0
    else if (strcasecmp(format, "vcf") == 0) strcpy(mode, "");
3307
0
    else if (strcasecmp(format, "vcf.gz") == 0 || strcasecmp(format, "vcf.bgz") == 0) strcpy(mode, "z");
3308
0
    else return -1;
3309
3310
0
    return 0;
3311
0
}
3312
3313
int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
3314
196k
{
3315
196k
    int ret;
3316
196k
    ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
3317
196k
    if (ret < 0) return ret;
3318
195k
    return vcf_parse1(&fp->line, h, v);
3319
196k
}
3320
3321
static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt)
3322
76.8k
{
3323
76.8k
    uint8_t *ptr_start = ptr;
3324
76.8k
    fmt->id = bcf_dec_typed_int1(ptr, &ptr);
3325
76.8k
    fmt->n = bcf_dec_size(ptr, &ptr, &fmt->type);
3326
76.8k
    fmt->size = fmt->n << bcf_type_shift[fmt->type];
3327
76.8k
    fmt->p = ptr;
3328
76.8k
    fmt->p_off  = ptr - ptr_start;
3329
76.8k
    fmt->p_free = 0;
3330
76.8k
    ptr += n_sample * fmt->size;
3331
76.8k
    fmt->p_len = ptr - fmt->p;
3332
76.8k
    return ptr;
3333
76.8k
}
3334
3335
static inline uint8_t *bcf_unpack_info_core1(uint8_t *ptr, bcf_info_t *info)
3336
4.60M
{
3337
4.60M
    uint8_t *ptr_start = ptr;
3338
4.60M
    info->key = bcf_dec_typed_int1(ptr, &ptr);
3339
4.60M
    info->len = bcf_dec_size(ptr, &ptr, &info->type);
3340
4.60M
    info->vptr = ptr;
3341
4.60M
    info->vptr_off  = ptr - ptr_start;
3342
4.60M
    info->vptr_free = 0;
3343
4.60M
    info->v1.i = 0;
3344
4.60M
    if (info->len == 1) {
3345
3.56M
        if (info->type == BCF_BT_INT8 || info->type == BCF_BT_CHAR) info->v1.i = *(int8_t*)ptr;
3346
2.24M
        else if (info->type == BCF_BT_INT32) info->v1.i = le_to_i32(ptr);
3347
1.48M
        else if (info->type == BCF_BT_FLOAT) info->v1.f = le_to_float(ptr);
3348
469k
        else if (info->type == BCF_BT_INT16) info->v1.i = le_to_i16(ptr);
3349
0
        else if (info->type == BCF_BT_INT64) info->v1.i = le_to_i64(ptr);
3350
3.56M
    }
3351
4.60M
    ptr += info->len << bcf_type_shift[info->type];
3352
4.60M
    info->vptr_len = ptr - info->vptr;
3353
4.60M
    return ptr;
3354
4.60M
}
3355
3356
int bcf_unpack(bcf1_t *b, int which)
3357
195k
{
3358
195k
    if ( !b->shared.l ) return 0; // Building a new BCF record from scratch
3359
11.1k
    uint8_t *ptr = (uint8_t*)b->shared.s, *ptr_ori;
3360
11.1k
    int i;
3361
11.1k
    bcf_dec_t *d = &b->d;
3362
11.1k
    if (which & BCF_UN_FLT) which |= BCF_UN_STR;
3363
11.1k
    if (which & BCF_UN_INFO) which |= BCF_UN_SHR;
3364
11.1k
    if ((which&BCF_UN_STR) && !(b->unpacked&BCF_UN_STR))
3365
11.1k
    {
3366
11.1k
        kstring_t tmp;
3367
3368
        // ID
3369
11.1k
        tmp.l = 0; tmp.s = d->id; tmp.m = d->m_id;
3370
11.1k
        ptr_ori = ptr;
3371
11.1k
        ptr = bcf_fmt_sized_array(&tmp, ptr);
3372
11.1k
        b->unpack_size[0] = ptr - ptr_ori;
3373
11.1k
        kputc('\0', &tmp);
3374
11.1k
        d->id = tmp.s; d->m_id = tmp.m;
3375
3376
        // REF and ALT are in a single block (d->als) and d->alleles are pointers into this block
3377
11.1k
        hts_expand(char*, b->n_allele, d->m_allele, d->allele); // NM: hts_expand() is a macro
3378
11.1k
        tmp.l = 0; tmp.s = d->als; tmp.m = d->m_als;
3379
11.1k
        ptr_ori = ptr;
3380
4.28M
        for (i = 0; i < b->n_allele; ++i) {
3381
            // Use offset within tmp.s as realloc may change pointer
3382
4.27M
            d->allele[i] = (char *)(intptr_t)tmp.l;
3383
4.27M
            ptr = bcf_fmt_sized_array(&tmp, ptr);
3384
4.27M
            kputc('\0', &tmp);
3385
4.27M
        }
3386
11.1k
        b->unpack_size[1] = ptr - ptr_ori;
3387
11.1k
        d->als = tmp.s; d->m_als = tmp.m;
3388
3389
        // Convert our offsets within tmp.s back to pointers again
3390
4.28M
        for (i = 0; i < b->n_allele; ++i)
3391
4.27M
            d->allele[i] = d->als + (ptrdiff_t)d->allele[i];
3392
11.1k
        b->unpacked |= BCF_UN_STR;
3393
11.1k
    }
3394
11.1k
    if ((which&BCF_UN_FLT) && !(b->unpacked&BCF_UN_FLT)) { // FILTER
3395
11.1k
        ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1];
3396
11.1k
        ptr_ori = ptr;
3397
11.1k
        if (*ptr>>4) {
3398
7.32k
            int type;
3399
7.32k
            d->n_flt = bcf_dec_size(ptr, &ptr, &type);
3400
7.32k
            hts_expand(int, d->n_flt, d->m_flt, d->flt);
3401
1.60M
            for (i = 0; i < d->n_flt; ++i)
3402
1.59M
                d->flt[i] = bcf_dec_int1(ptr, type, &ptr);
3403
7.32k
        } else ++ptr, d->n_flt = 0;
3404
11.1k
        b->unpack_size[2] = ptr - ptr_ori;
3405
11.1k
        b->unpacked |= BCF_UN_FLT;
3406
11.1k
    }
3407
11.1k
    if ((which&BCF_UN_INFO) && !(b->unpacked&BCF_UN_INFO)) { // INFO
3408
11.1k
        ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1] + b->unpack_size[2];
3409
11.1k
        hts_expand(bcf_info_t, b->n_info, d->m_info, d->info);
3410
8.09M
        for (i = 0; i < d->m_info; ++i) d->info[i].vptr_free = 0;
3411
4.61M
        for (i = 0; i < b->n_info; ++i)
3412
4.60M
            ptr = bcf_unpack_info_core1(ptr, &d->info[i]);
3413
11.1k
        b->unpacked |= BCF_UN_INFO;
3414
11.1k
    }
3415
11.1k
    if ((which&BCF_UN_FMT) && b->n_sample && !(b->unpacked&BCF_UN_FMT)) { // FORMAT
3416
3.12k
        ptr = (uint8_t*)b->indiv.s;
3417
3.12k
        hts_expand(bcf_fmt_t, b->n_fmt, d->m_fmt, d->fmt);
3418
177k
        for (i = 0; i < d->m_fmt; ++i) d->fmt[i].p_free = 0;
3419
79.9k
        for (i = 0; i < b->n_fmt; ++i)
3420
76.8k
            ptr = bcf_unpack_fmt_core1(ptr, b->n_sample, &d->fmt[i]);
3421
3.12k
        b->unpacked |= BCF_UN_FMT;
3422
3.12k
    }
3423
11.1k
    return 0;
3424
195k
}
3425
3426
int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
3427
195k
{
3428
195k
    int i;
3429
195k
    int32_t max_dt_id = h->n[BCF_DT_ID];
3430
195k
    const char *chrom = bcf_seqname(h, v);
3431
195k
    if (!chrom) {
3432
0
        hts_log_error("Invalid BCF, CONTIG id=%d not present in the header",
3433
0
                      v->rid);
3434
0
        errno = EINVAL;
3435
0
        return -1;
3436
0
    }
3437
195k
    bcf_unpack((bcf1_t*)v, BCF_UN_ALL);
3438
195k
    kputs(chrom, s); // CHROM
3439
195k
    kputc('\t', s); kputll(v->pos + 1, s); // POS
3440
195k
    kputc('\t', s); kputs(v->d.id ? v->d.id : ".", s); // ID
3441
195k
    kputc('\t', s); // REF
3442
195k
    if (v->n_allele > 0) kputs(v->d.allele[0], s);
3443
185k
    else kputc('.', s);
3444
195k
    kputc('\t', s); // ALT
3445
195k
    if (v->n_allele > 1) {
3446
4.27M
        for (i = 1; i < v->n_allele; ++i) {
3447
4.26M
            if (i > 1) kputc(',', s);
3448
4.26M
            kputs(v->d.allele[i], s);
3449
4.26M
        }
3450
187k
    } else kputc('.', s);
3451
195k
    kputc('\t', s); // QUAL
3452
195k
    if ( bcf_float_is_missing(v->qual) ) kputc('.', s); // QUAL
3453
8.54k
    else kputd(v->qual, s);
3454
195k
    kputc('\t', s); // FILTER
3455
195k
    if (v->d.n_flt) {
3456
1.60M
        for (i = 0; i < v->d.n_flt; ++i) {
3457
1.59M
            int32_t idx = v->d.flt[i];
3458
1.59M
            if (idx < 0 || idx >= max_dt_id
3459
1.59M
                || h->id[BCF_DT_ID][idx].key == NULL) {
3460
0
                hts_log_error("Invalid BCF, the FILTER tag id=%d at %s:%"PRIhts_pos" not present in the header",
3461
0
                              idx, bcf_seqname_safe(h, v), v->pos + 1);
3462
0
                errno = EINVAL;
3463
0
                return -1;
3464
0
            }
3465
1.59M
            if (i) kputc(';', s);
3466
1.59M
            kputs(h->id[BCF_DT_ID][idx].key, s);
3467
1.59M
        }
3468
188k
    } else kputc('.', s);
3469
195k
    kputc('\t', s); // INFO
3470
195k
    if (v->n_info) {
3471
3.40k
        int first = 1;
3472
4.60M
        for (i = 0; i < v->n_info; ++i) {
3473
4.60M
            bcf_info_t *z = &v->d.info[i];
3474
4.60M
            if ( !z->vptr ) continue;
3475
4.60M
            if ( !first ) kputc(';', s);
3476
4.60M
            first = 0;
3477
4.60M
            if (z->key < 0 || z->key >= max_dt_id
3478
4.60M
                || h->id[BCF_DT_ID][z->key].key == NULL) {
3479
0
                hts_log_error("Invalid BCF, the INFO tag id=%d is %s at %s:%"PRIhts_pos,
3480
0
                              z->key,
3481
0
                              z->key < 0 ? "negative"
3482
0
                              : (z->key >= max_dt_id ? "too large" : "not present in the header"),
3483
0
                              bcf_seqname_safe(h, v), v->pos+1);
3484
0
                errno = EINVAL;
3485
0
                return -1;
3486
0
            }
3487
4.60M
            kputs(h->id[BCF_DT_ID][z->key].key, s);
3488
4.60M
            if (z->len <= 0) continue;
3489
4.41M
            kputc('=', s);
3490
4.41M
            if (z->len == 1)
3491
3.56M
            {
3492
3.56M
                switch (z->type)
3493
3.56M
                {
3494
1.30M
                    case BCF_BT_INT8:  if ( z->v1.i==bcf_int8_missing ) kputc('.', s); else kputw(z->v1.i, s); break;
3495
469k
                    case BCF_BT_INT16: if ( z->v1.i==bcf_int16_missing ) kputc('.', s); else kputw(z->v1.i, s); break;
3496
766k
                    case BCF_BT_INT32: if ( z->v1.i==bcf_int32_missing ) kputc('.', s); else kputw(z->v1.i, s); break;
3497
0
                    case BCF_BT_INT64: if ( z->v1.i==bcf_int64_missing ) kputc('.', s); else kputll(z->v1.i, s); break;
3498
1.01M
                    case BCF_BT_FLOAT: if ( bcf_float_is_missing(z->v1.f) ) kputc('.', s); else kputd(z->v1.f, s); break;
3499
10.8k
                    case BCF_BT_CHAR:  kputc(z->v1.i, s); break;
3500
0
                    default:
3501
0
                        hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, z->type, bcf_seqname_safe(h, v), v->pos+1);
3502
0
                        errno = EINVAL;
3503
0
                        return -1;
3504
3.56M
                }
3505
3.56M
            }
3506
851k
            else bcf_fmt_array(s, z->len, z->type, z->vptr);
3507
4.41M
        }
3508
3.40k
        if ( first ) kputc('.', s);
3509
192k
    } else kputc('.', s);
3510
    // FORMAT and individual information
3511
195k
    if (v->n_sample)
3512
3.12k
    {
3513
3.12k
        int i,j;
3514
3.12k
        if ( v->n_fmt)
3515
2.98k
        {
3516
2.98k
            int gt_i = -1;
3517
2.98k
            bcf_fmt_t *fmt = v->d.fmt;
3518
2.98k
            int first = 1;
3519
79.8k
            for (i = 0; i < (int)v->n_fmt; ++i) {
3520
76.8k
                if ( !fmt[i].p ) continue;
3521
76.8k
                kputc(!first ? ':' : '\t', s); first = 0;
3522
76.8k
                if (fmt[i].id < 0 || fmt[i].id >= max_dt_id
3523
76.8k
                    || h->id[BCF_DT_ID][fmt[i].id].key == NULL) //!bcf_hdr_idinfo_exists(h,BCF_HL_FMT,fmt[i].id) )
3524
0
                {
3525
0
                    hts_log_error("Invalid BCF, the FORMAT tag id=%d at %s:%"PRIhts_pos" not present in the header", fmt[i].id, bcf_seqname_safe(h, v), v->pos+1);
3526
0
                    errno = EINVAL;
3527
0
                    return -1;
3528
0
                }
3529
76.8k
                kputs(h->id[BCF_DT_ID][fmt[i].id].key, s);
3530
76.8k
                if (strcmp(h->id[BCF_DT_ID][fmt[i].id].key, "GT") == 0) gt_i = i;
3531
76.8k
            }
3532
2.98k
            if ( first ) kputs("\t.", s);
3533
27.9k
            for (j = 0; j < v->n_sample; ++j) {
3534
25.0k
                kputc('\t', s);
3535
25.0k
                first = 1;
3536
2.00M
                for (i = 0; i < (int)v->n_fmt; ++i) {
3537
1.98M
                    bcf_fmt_t *f = &fmt[i];
3538
1.98M
                    if ( !f->p ) continue;
3539
1.98M
                    if (!first) kputc(':', s);
3540
1.98M
                    first = 0;
3541
1.98M
                    if (gt_i == i)
3542
8.88k
                        bcf_format_gt(f,j,s);
3543
1.97M
                    else
3544
1.97M
                        bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size);
3545
1.98M
                }
3546
25.0k
                if ( first ) kputc('.', s);
3547
25.0k
            }
3548
2.98k
        }
3549
146
        else
3550
455
            for (j=0; j<=v->n_sample; j++)
3551
309
                kputs("\t.", s);
3552
3.12k
    }
3553
195k
    kputc('\n', s);
3554
195k
    return 0;
3555
195k
}
3556
3557
int vcf_write_line(htsFile *fp, kstring_t *line)
3558
0
{
3559
0
    int ret;
3560
0
    if ( line->s[line->l-1]!='\n' ) kputc('\n',line);
3561
0
    if ( fp->format.compression!=no_compression )
3562
0
        ret = bgzf_write(fp->fp.bgzf, line->s, line->l);
3563
0
    else
3564
0
        ret = hwrite(fp->fp.hfile, line->s, line->l);
3565
0
    return ret==line->l ? 0 : -1;
3566
0
}
3567
3568
int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
3569
195k
{
3570
195k
    ssize_t ret;
3571
195k
    fp->line.l = 0;
3572
195k
    if (vcf_format1(h, v, &fp->line) != 0)
3573
0
        return -1;
3574
195k
    if ( fp->format.compression!=no_compression ) {
3575
0
        if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
3576
0
            return -1;
3577
0
        if (fp->idx)
3578
0
            hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf));
3579
0
        ret = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
3580
195k
    } else {
3581
195k
        ret = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
3582
195k
    }
3583
3584
195k
    if (fp->idx) {
3585
0
        int tid;
3586
0
        if ((tid = hts_idx_tbi_name(fp->idx, v->rid, bcf_seqname_safe(h, v))) < 0)
3587
0
            return -1;
3588
3589
0
        if (hts_idx_push(fp->idx, tid, v->pos, v->pos + v->rlen, bgzf_tell(fp->fp.bgzf), 1) < 0)
3590
0
            return -1;
3591
0
    }
3592
3593
195k
    return ret==fp->line.l ? 0 : -1;
3594
195k
}
3595
3596
/************************
3597
 * Data access routines *
3598
 ************************/
3599
3600
int bcf_hdr_id2int(const bcf_hdr_t *h, int which, const char *id)
3601
3.03k
{
3602
3.03k
    khint_t k;
3603
3.03k
    vdict_t *d = (vdict_t*)h->dict[which];
3604
3.03k
    k = kh_get(vdict, d, id);
3605
3.03k
    return k == kh_end(d)? -1 : kh_val(d, k).id;
3606
3.03k
}
3607
3608
3609
/********************
3610
 *** BCF indexing ***
3611
 ********************/
3612
3613
// Calculate number of index levels given min_shift and the header contig
3614
// list.  Also returns number of contigs in *nids_out.
3615
static int idx_calc_n_lvls_ids(const bcf_hdr_t *h, int min_shift,
3616
                               int starting_n_lvls, int *nids_out)
3617
0
{
3618
0
    int n_lvls, i, nids = 0;
3619
0
    int64_t max_len = 0, s;
3620
3621
0
    for (i = 0; i < h->n[BCF_DT_CTG]; ++i)
3622
0
    {
3623
0
        if ( !h->id[BCF_DT_CTG][i].val ) continue;
3624
0
        if ( max_len < h->id[BCF_DT_CTG][i].val->info[0] )
3625
0
            max_len = h->id[BCF_DT_CTG][i].val->info[0];
3626
0
        nids++;
3627
0
    }
3628
0
    if ( !max_len ) max_len = (1LL<<31) - 1;  // In case contig line is broken.
3629
0
    max_len += 256;
3630
0
    s = 1LL << (min_shift + starting_n_lvls * 3);
3631
0
    for (n_lvls = starting_n_lvls; max_len > s; ++n_lvls, s <<= 3);
3632
3633
0
    if (nids_out) *nids_out = nids;
3634
0
    return n_lvls;
3635
0
}
3636
3637
hts_idx_t *bcf_index(htsFile *fp, int min_shift)
3638
0
{
3639
0
    int n_lvls;
3640
0
    bcf1_t *b = NULL;
3641
0
    hts_idx_t *idx = NULL;
3642
0
    bcf_hdr_t *h;
3643
0
    int r;
3644
0
    h = bcf_hdr_read(fp);
3645
0
    if ( !h ) return NULL;
3646
0
    int nids = 0;
3647
0
    n_lvls = idx_calc_n_lvls_ids(h, min_shift, 0, &nids);
3648
0
    idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
3649
0
    if (!idx) goto fail;
3650
0
    b = bcf_init1();
3651
0
    if (!b) goto fail;
3652
0
    while ((r = bcf_read1(fp,h, b)) >= 0) {
3653
0
        int ret;
3654
0
        ret = hts_idx_push(idx, b->rid, b->pos, b->pos + b->rlen, bgzf_tell(fp->fp.bgzf), 1);
3655
0
        if (ret < 0) goto fail;
3656
0
    }
3657
0
    if (r < -1) goto fail;
3658
0
    hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf));
3659
0
    bcf_destroy1(b);
3660
0
    bcf_hdr_destroy(h);
3661
0
    return idx;
3662
3663
0
 fail:
3664
0
    hts_idx_destroy(idx);
3665
0
    bcf_destroy1(b);
3666
0
    bcf_hdr_destroy(h);
3667
0
    return NULL;
3668
0
}
3669
3670
hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx)
3671
0
{
3672
0
    return fnidx? hts_idx_load2(fn, fnidx) : bcf_index_load(fn);
3673
0
}
3674
3675
hts_idx_t *bcf_index_load3(const char *fn, const char *fnidx, int flags)
3676
0
{
3677
0
    return hts_idx_load3(fn, fnidx, HTS_FMT_CSI, flags);
3678
0
}
3679
3680
int bcf_index_build3(const char *fn, const char *fnidx, int min_shift, int n_threads)
3681
0
{
3682
0
    htsFile *fp;
3683
0
    hts_idx_t *idx;
3684
0
    tbx_t *tbx;
3685
0
    int ret;
3686
0
    if ((fp = hts_open(fn, "rb")) == 0) return -2;
3687
0
    if (n_threads)
3688
0
        hts_set_threads(fp, n_threads);
3689
0
    if ( fp->format.compression!=bgzf ) { hts_close(fp); return -3; }
3690
0
    switch (fp->format.format) {
3691
0
        case bcf:
3692
0
            if (!min_shift) {
3693
0
                hts_log_error("TBI indices for BCF files are not supported");
3694
0
                ret = -1;
3695
0
            } else {
3696
0
                idx = bcf_index(fp, min_shift);
3697
0
                if (idx) {
3698
0
                    ret = hts_idx_save_as(idx, fn, fnidx, HTS_FMT_CSI);
3699
0
                    if (ret < 0) ret = -4;
3700
0
                    hts_idx_destroy(idx);
3701
0
                }
3702
0
                else ret = -1;
3703
0
            }
3704
0
            break;
3705
3706
0
        case vcf:
3707
0
            tbx = tbx_index(hts_get_bgzfp(fp), min_shift, &tbx_conf_vcf);
3708
0
            if (tbx) {
3709
0
                ret = hts_idx_save_as(tbx->idx, fn, fnidx, min_shift > 0 ? HTS_FMT_CSI : HTS_FMT_TBI);
3710
0
                if (ret < 0) ret = -4;
3711
0
                tbx_destroy(tbx);
3712
0
            }
3713
0
            else ret = -1;
3714
0
            break;
3715
3716
0
        default:
3717
0
            ret = -3;
3718
0
            break;
3719
0
    }
3720
0
    hts_close(fp);
3721
0
    return ret;
3722
0
}
3723
3724
int bcf_index_build2(const char *fn, const char *fnidx, int min_shift)
3725
0
{
3726
0
    return bcf_index_build3(fn, fnidx, min_shift, 0);
3727
0
}
3728
3729
int bcf_index_build(const char *fn, int min_shift)
3730
0
{
3731
0
    return bcf_index_build3(fn, NULL, min_shift, 0);
3732
0
}
3733
3734
// Initialise fp->idx for the current format type.
3735
// This must be called after the header has been written but no other data.
3736
0
static int vcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) {
3737
0
    int n_lvls, fmt;
3738
3739
0
    if (min_shift == 0) {
3740
0
        min_shift = 14;
3741
0
        n_lvls = 5;
3742
0
        fmt = HTS_FMT_TBI;
3743
0
    } else {
3744
        // Set initial n_lvls to match tbx_index()
3745
0
        int starting_n_lvls = (TBX_MAX_SHIFT - min_shift + 2) / 3;
3746
        // Increase if necessary
3747
0
        n_lvls = idx_calc_n_lvls_ids(h, min_shift, starting_n_lvls, NULL);
3748
0
        fmt = HTS_FMT_CSI;
3749
0
    }
3750
3751
0
    fp->idx = hts_idx_init(0, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
3752
0
    if (!fp->idx) return -1;
3753
3754
    // Tabix meta data, added even in CSI for VCF
3755
0
    uint8_t conf[4*7];
3756
0
    u32_to_le(TBX_VCF, conf+0);  // fmt
3757
0
    u32_to_le(1,       conf+4);  // name col
3758
0
    u32_to_le(2,       conf+8);  // beg col
3759
0
    u32_to_le(0,       conf+12); // end col
3760
0
    u32_to_le('#',     conf+16); // comment
3761
0
    u32_to_le(0,       conf+20); // n.skip
3762
0
    u32_to_le(0,       conf+24); // ref name len
3763
0
    if (hts_idx_set_meta(fp->idx, sizeof(conf)*sizeof(*conf), (uint8_t *)conf, 1) < 0) {
3764
0
        hts_idx_destroy(fp->idx);
3765
0
        fp->idx = NULL;
3766
0
        return -1;
3767
0
    }
3768
0
    fp->fnidx = fnidx;
3769
3770
0
    return 0;
3771
0
}
3772
3773
// Initialise fp->idx for the current format type.
3774
// This must be called after the header has been written but no other data.
3775
0
int bcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) {
3776
0
    int n_lvls, nids = 0;
3777
3778
0
    if (fp->format.format == vcf)
3779
0
        return vcf_idx_init(fp, h, min_shift, fnidx);
3780
3781
0
    if (!min_shift)
3782
0
        min_shift = 14;
3783
3784
0
    n_lvls = idx_calc_n_lvls_ids(h, min_shift, 0, &nids);
3785
3786
0
    fp->idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
3787
0
    if (!fp->idx) return -1;
3788
0
    fp->fnidx = fnidx;
3789
3790
0
    return 0;
3791
0
}
3792
3793
// Finishes an index. Call after the last record has been written.
3794
// Returns 0 on success, <0 on failure.
3795
//
3796
// NB: same format as SAM/BAM as it uses bgzf.
3797
0
int bcf_idx_save(htsFile *fp) {
3798
0
    return sam_idx_save(fp);
3799
0
}
3800
3801
/*****************
3802
 *** Utilities ***
3803
 *****************/
3804
3805
int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src)
3806
0
{
3807
0
    int i, ndst_ori = dst->nhrec, need_sync = 0, ret = 0, res;
3808
0
    for (i=0; i<src->nhrec; i++)
3809
0
    {
3810
0
        if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value )
3811
0
        {
3812
0
            int j;
3813
0
            for (j=0; j<ndst_ori; j++)
3814
0
            {
3815
0
                if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
3816
3817
                // Checking only the key part of generic lines, otherwise
3818
                // the VCFs are too verbose. Should we perhaps add a flag
3819
                // to bcf_hdr_combine() and make this optional?
3820
0
                if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break;
3821
0
            }
3822
0
            if ( j>=ndst_ori ) {
3823
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
3824
0
                if (res < 0) return -1;
3825
0
                need_sync += res;
3826
0
            }
3827
0
        }
3828
0
        else if ( src->hrec[i]->type==BCF_HL_STR )
3829
0
        {
3830
            // NB: we are ignoring fields without ID
3831
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
3832
0
            if ( j>=0 )
3833
0
            {
3834
0
                bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key);
3835
0
                if ( !rec ) {
3836
0
                    res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
3837
0
                    if (res < 0) return -1;
3838
0
                    need_sync += res;
3839
0
                }
3840
0
            }
3841
0
        }
3842
0
        else
3843
0
        {
3844
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
3845
0
            assert( j>=0 ); // this should always be true for valid VCFs
3846
3847
0
            bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL);
3848
0
            if ( !rec ) {
3849
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
3850
0
                if (res < 0) return -1;
3851
0
                need_sync += res;
3852
0
            } else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT )
3853
0
            {
3854
                // Check that both records are of the same type. The bcf_hdr_id2length
3855
                // macro cannot be used here because dst header is not synced yet.
3856
0
                vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID];
3857
0
                vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID];
3858
0
                khint_t k_src  = kh_get(vdict, d_src, src->hrec[i]->vals[0]);
3859
0
                khint_t k_dst  = kh_get(vdict, d_dst, src->hrec[i]->vals[0]);
3860
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) )
3861
0
                {
3862
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths",
3863
0
                        src->hrec[i]->vals[0]);
3864
0
                    ret |= 1;
3865
0
                }
3866
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) )
3867
0
                {
3868
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different types",
3869
0
                        src->hrec[i]->vals[0]);
3870
0
                    ret |= 1;
3871
0
                }
3872
0
            }
3873
0
        }
3874
0
    }
3875
0
    if ( need_sync ) {
3876
0
        if (bcf_hdr_sync(dst) < 0) return -1;
3877
0
    }
3878
0
    return ret;
3879
0
}
3880
3881
bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src)
3882
0
{
3883
0
    if ( !dst )
3884
0
    {
3885
        // this will effectively strip existing IDX attributes from src to become dst
3886
0
        dst = bcf_hdr_init("r");
3887
0
        kstring_t htxt = {0,0,0};
3888
0
        if (bcf_hdr_format(src, 0, &htxt) < 0) {
3889
0
            free(htxt.s);
3890
0
            return NULL;
3891
0
        }
3892
0
        if ( bcf_hdr_parse(dst, htxt.s) < 0 ) {
3893
0
            bcf_hdr_destroy(dst);
3894
0
            dst = NULL;
3895
0
        }
3896
0
        free(htxt.s);
3897
0
        return dst;
3898
0
    }
3899
3900
0
    int i, ndst_ori = dst->nhrec, need_sync = 0, res;
3901
0
    for (i=0; i<src->nhrec; i++)
3902
0
    {
3903
0
        if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value )
3904
0
        {
3905
0
            int j;
3906
0
            for (j=0; j<ndst_ori; j++)
3907
0
            {
3908
0
                if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
3909
3910
                // Checking only the key part of generic lines, otherwise
3911
                // the VCFs are too verbose. Should we perhaps add a flag
3912
                // to bcf_hdr_combine() and make this optional?
3913
0
                if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break;
3914
0
            }
3915
0
            if ( j>=ndst_ori ) {
3916
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
3917
0
                if (res < 0) return NULL;
3918
0
                need_sync += res;
3919
0
            }
3920
0
        }
3921
0
        else if ( src->hrec[i]->type==BCF_HL_STR )
3922
0
        {
3923
            // NB: we are ignoring fields without ID
3924
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
3925
0
            if ( j>=0 )
3926
0
            {
3927
0
                bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key);
3928
0
                if ( !rec ) {
3929
0
                    res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
3930
0
                    if (res < 0) return NULL;
3931
0
                    need_sync += res;
3932
0
                }
3933
0
            }
3934
0
        }
3935
0
        else
3936
0
        {
3937
0
            int j = bcf_hrec_find_key(src->hrec[i],"ID");
3938
0
            assert( j>=0 ); // this should always be true for valid VCFs
3939
3940
0
            bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL);
3941
0
            if ( !rec ) {
3942
0
                res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
3943
0
                if (res < 0) return NULL;
3944
0
                need_sync += res;
3945
0
            } else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT )
3946
0
            {
3947
                // Check that both records are of the same type. The bcf_hdr_id2length
3948
                // macro cannot be used here because dst header is not synced yet.
3949
0
                vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID];
3950
0
                vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID];
3951
0
                khint_t k_src  = kh_get(vdict, d_src, src->hrec[i]->vals[0]);
3952
0
                khint_t k_dst  = kh_get(vdict, d_dst, src->hrec[i]->vals[0]);
3953
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) )
3954
0
                {
3955
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths",
3956
0
                        src->hrec[i]->vals[0]);
3957
0
                }
3958
0
                if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) )
3959
0
                {
3960
0
                    hts_log_warning("Trying to combine \"%s\" tag definitions of different types",
3961
0
                        src->hrec[i]->vals[0]);
3962
0
                }
3963
0
            }
3964
0
        }
3965
0
    }
3966
0
    if ( need_sync ) {
3967
0
        if (bcf_hdr_sync(dst) < 0) return NULL;
3968
0
    }
3969
0
    return dst;
3970
0
}
3971
3972
int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *line)
3973
0
{
3974
0
    int i;
3975
0
    if ( line->errcode )
3976
0
    {
3977
0
        char errordescription[1024] = "";
3978
0
        hts_log_error("Unchecked error (%d %s) at %s:%"PRIhts_pos", exiting", line->errcode, bcf_strerror(line->errcode, errordescription, sizeof(errordescription)),  bcf_seqname_safe(src_hdr,line), line->pos+1);
3979
0
        exit(1);
3980
0
    }
3981
0
    if ( src_hdr->ntransl==-1 ) return 0;    // no need to translate, all tags have the same id
3982
0
    if ( !src_hdr->ntransl )  // called for the first time, see what needs translating
3983
0
    {
3984
0
        int dict;
3985
0
        for (dict=0; dict<2; dict++)    // BCF_DT_ID and BCF_DT_CTG
3986
0
        {
3987
0
            src_hdr->transl[dict] = (int*) malloc(src_hdr->n[dict]*sizeof(int));
3988
0
            for (i=0; i<src_hdr->n[dict]; i++)
3989
0
            {
3990
0
                if ( !src_hdr->id[dict][i].key ) // gap left after removed BCF header lines
3991
0
                {
3992
0
                    src_hdr->transl[dict][i] = -1;
3993
0
                    continue;
3994
0
                }
3995
0
                src_hdr->transl[dict][i] = bcf_hdr_id2int(dst_hdr,dict,src_hdr->id[dict][i].key);
3996
0
                if ( src_hdr->transl[dict][i]!=-1 && i!=src_hdr->transl[dict][i] ) src_hdr->ntransl++;
3997
0
            }
3998
0
        }
3999
0
        if ( !src_hdr->ntransl )
4000
0
        {
4001
0
            free(src_hdr->transl[0]); src_hdr->transl[0] = NULL;
4002
0
            free(src_hdr->transl[1]); src_hdr->transl[1] = NULL;
4003
0
            src_hdr->ntransl = -1;
4004
0
        }
4005
0
        if ( src_hdr->ntransl==-1 ) return 0;
4006
0
    }
4007
0
    bcf_unpack(line,BCF_UN_ALL);
4008
4009
    // CHROM
4010
0
    if ( src_hdr->transl[BCF_DT_CTG][line->rid] >=0 ) line->rid = src_hdr->transl[BCF_DT_CTG][line->rid];
4011
4012
    // FILTER
4013
0
    for (i=0; i<line->d.n_flt; i++)
4014
0
    {
4015
0
        int src_id = line->d.flt[i];
4016
0
        if ( src_hdr->transl[BCF_DT_ID][src_id] >=0 )
4017
0
            line->d.flt[i] = src_hdr->transl[BCF_DT_ID][src_id];
4018
0
        line->d.shared_dirty |= BCF1_DIRTY_FLT;
4019
0
    }
4020
4021
    // INFO
4022
0
    for (i=0; i<line->n_info; i++)
4023
0
    {
4024
0
        int src_id = line->d.info[i].key;
4025
0
        int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
4026
0
        if ( dst_id<0 ) continue;
4027
0
        line->d.info[i].key = dst_id;
4028
0
        if ( !line->d.info[i].vptr ) continue;  // skip deleted
4029
0
        int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
4030
0
        int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
4031
0
        if ( src_size==dst_size )   // can overwrite
4032
0
        {
4033
0
            uint8_t *vptr = line->d.info[i].vptr - line->d.info[i].vptr_off;
4034
0
            if ( dst_size==BCF_BT_INT8 ) { vptr[1] = (uint8_t)dst_id; }
4035
0
            else if ( dst_size==BCF_BT_INT16 ) { *(uint16_t*)vptr = (uint16_t)dst_id; }
4036
0
            else { *(uint32_t*)vptr = (uint32_t)dst_id; }
4037
0
        }
4038
0
        else    // must realloc
4039
0
        {
4040
0
            bcf_info_t *info = &line->d.info[i];
4041
0
            kstring_t str = {0,0,0};
4042
0
            bcf_enc_int1(&str, dst_id);
4043
0
            bcf_enc_size(&str, info->len,info->type);
4044
0
            uint32_t vptr_off = str.l;
4045
0
            kputsn((char*)info->vptr, info->vptr_len, &str);
4046
0
            if( info->vptr_free ) free(info->vptr - info->vptr_off);
4047
0
            info->vptr_off = vptr_off;
4048
0
            info->vptr = (uint8_t*)str.s + info->vptr_off;
4049
0
            info->vptr_free = 1;
4050
0
            line->d.shared_dirty |= BCF1_DIRTY_INF;
4051
0
        }
4052
0
    }
4053
4054
    // FORMAT
4055
0
    for (i=0; i<line->n_fmt; i++)
4056
0
    {
4057
0
        int src_id = line->d.fmt[i].id;
4058
0
        int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
4059
0
        if ( dst_id<0 ) continue;
4060
0
        line->d.fmt[i].id = dst_id;
4061
0
        if( !line->d.fmt[i].p ) continue;  // skip deleted
4062
0
        int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
4063
0
        int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
4064
0
        if ( src_size==dst_size )   // can overwrite
4065
0
        {
4066
0
            uint8_t *p = line->d.fmt[i].p - line->d.fmt[i].p_off;    // pointer to the vector size (4bits) and BT type (4bits)
4067
0
            if ( dst_size==BCF_BT_INT8 ) { p[1] = dst_id; }
4068
0
            else if ( dst_size==BCF_BT_INT16 ) { i16_to_le(dst_id, p + 1); }
4069
0
            else { i32_to_le(dst_id, p + 1); }
4070
0
        }
4071
0
        else    // must realloc
4072
0
        {
4073
0
            bcf_fmt_t *fmt = &line->d.fmt[i];
4074
0
            kstring_t str = {0,0,0};
4075
0
            bcf_enc_int1(&str, dst_id);
4076
0
            bcf_enc_size(&str, fmt->n, fmt->type);
4077
0
            uint32_t p_off = str.l;
4078
0
            kputsn((char*)fmt->p, fmt->p_len, &str);
4079
0
            if( fmt->p_free ) free(fmt->p - fmt->p_off);
4080
0
            fmt->p_off = p_off;
4081
0
            fmt->p = (uint8_t*)str.s + fmt->p_off;
4082
0
            fmt->p_free = 1;
4083
0
            line->d.indiv_dirty = 1;
4084
0
        }
4085
0
    }
4086
0
    return 0;
4087
0
}
4088
4089
bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr)
4090
0
{
4091
0
    bcf_hdr_t *hout = bcf_hdr_init("r");
4092
0
    if (!hout) {
4093
0
        hts_log_error("Failed to allocate bcf header");
4094
0
        return NULL;
4095
0
    }
4096
0
    kstring_t htxt = {0,0,0};
4097
0
    if (bcf_hdr_format(hdr, 1, &htxt) < 0) {
4098
0
        free(htxt.s);
4099
0
        return NULL;
4100
0
    }
4101
0
    if ( bcf_hdr_parse(hout, htxt.s) < 0 ) {
4102
0
        bcf_hdr_destroy(hout);
4103
0
        hout = NULL;
4104
0
    }
4105
0
    free(htxt.s);
4106
0
    return hout;
4107
0
}
4108
4109
bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap)
4110
0
{
4111
0
    void *names_hash = khash_str2int_init();
4112
0
    kstring_t htxt = {0,0,0};
4113
0
    kstring_t str = {0,0,0};
4114
0
    bcf_hdr_t *h = bcf_hdr_init("w");
4115
0
    int r = 0;
4116
0
    if (!h || !names_hash) {
4117
0
        hts_log_error("Failed to allocate bcf header");
4118
0
        goto err;
4119
0
    }
4120
0
    if (bcf_hdr_format(h0, 1, &htxt) < 0) {
4121
0
        hts_log_error("Failed to get header text");
4122
0
        goto err;
4123
0
    }
4124
0
    bcf_hdr_set_version(h,bcf_hdr_get_version(h0));
4125
0
    int j;
4126
0
    for (j=0; j<n; j++) imap[j] = -1;
4127
0
    if ( bcf_hdr_nsamples(h0) > 0) {
4128
0
        char *p = find_chrom_header_line(htxt.s);
4129
0
        int i = 0, end = n? 8 : 7;
4130
0
        while ((p = strchr(p, '\t')) != 0 && i < end) ++i, ++p;
4131
0
        if (i != end) {
4132
0
            hts_log_error("Wrong number of columns in header #CHROM line");
4133
0
            goto err;
4134
0
        }
4135
0
        r |= kputsn(htxt.s, p - htxt.s, &str) < 0;
4136
0
        for (i = 0; i < n; ++i) {
4137
0
            if ( khash_str2int_has_key(names_hash,samples[i]) )
4138
0
            {
4139
0
                hts_log_error("Duplicate sample name \"%s\"", samples[i]);
4140
0
                goto err;
4141
0
            }
4142
0
            imap[i] = bcf_hdr_id2int(h0, BCF_DT_SAMPLE, samples[i]);
4143
0
            if (imap[i] < 0) continue;
4144
0
            r |= kputc('\t', &str) < 0;
4145
0
            r |= kputs(samples[i], &str) < 0;
4146
0
            r |= khash_str2int_inc(names_hash,samples[i]) < 0;
4147
0
        }
4148
0
    } else r |= kputsn(htxt.s, htxt.l, &str) < 0;
4149
0
    while (str.l && (!str.s[str.l-1] || str.s[str.l-1]=='\n') ) str.l--; // kill trailing zeros and newlines
4150
0
    r |= kputc('\n',&str) < 0;
4151
0
    if (r) {
4152
0
        hts_log_error("%s", strerror(errno));
4153
0
        goto err;
4154
0
    }
4155
0
    if ( bcf_hdr_parse(h, str.s) < 0 ) {
4156
0
        bcf_hdr_destroy(h);
4157
0
        h = NULL;
4158
0
    }
4159
0
    free(str.s);
4160
0
    free(htxt.s);
4161
0
    khash_str2int_destroy(names_hash);
4162
0
    return h;
4163
4164
0
 err:
4165
0
    ks_free(&str);
4166
0
    ks_free(&htxt);
4167
0
    khash_str2int_destroy(names_hash);
4168
0
    bcf_hdr_destroy(h);
4169
0
    return NULL;
4170
0
}
4171
4172
int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file)
4173
0
{
4174
0
    if ( samples && !strcmp("-",samples) ) return 0;            // keep all samples
4175
4176
0
    int i, narr = bit_array_size(bcf_hdr_nsamples(hdr));
4177
0
    hdr->keep_samples = (uint8_t*) calloc(narr,1);
4178
0
    if (!hdr->keep_samples) return -1;
4179
4180
0
    hdr->nsamples_ori = bcf_hdr_nsamples(hdr);
4181
0
    if ( !samples )
4182
0
    {
4183
        // exclude all samples
4184
0
        khint_t k;
4185
0
        vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE], *new_dict;
4186
0
        new_dict = kh_init(vdict);
4187
0
        if (!new_dict) return -1;
4188
4189
0
        bcf_hdr_nsamples(hdr) = 0;
4190
4191
0
        for (k = kh_begin(d); k != kh_end(d); ++k)
4192
0
            if (kh_exist(d, k)) free((char*)kh_key(d, k));
4193
0
        kh_destroy(vdict, d);
4194
0
        hdr->dict[BCF_DT_SAMPLE] = new_dict;
4195
0
        if (bcf_hdr_sync(hdr) < 0) return -1;
4196
4197
0
        return 0;
4198
0
    }
4199
4200
0
    if ( samples[0]=='^' )
4201
0
        for (i=0; i<bcf_hdr_nsamples(hdr); i++) bit_array_set(hdr->keep_samples,i);
4202
4203
0
    int idx, n, ret = 0;
4204
0
    char **smpls = hts_readlist(samples[0]=='^'?samples+1:samples, is_file, &n);
4205
0
    if ( !smpls ) return -1;
4206
0
    for (i=0; i<n; i++)
4207
0
    {
4208
0
        idx = bcf_hdr_id2int(hdr,BCF_DT_SAMPLE,smpls[i]);
4209
0
        if ( idx<0 )
4210
0
        {
4211
0
            if ( !ret ) ret = i+1;
4212
0
            continue;
4213
0
        }
4214
0
        assert( idx<bcf_hdr_nsamples(hdr) );
4215
0
        if (  samples[0]=='^' )
4216
0
            bit_array_clear(hdr->keep_samples, idx);
4217
0
        else
4218
0
            bit_array_set(hdr->keep_samples, idx);
4219
0
    }
4220
0
    for (i=0; i<n; i++) free(smpls[i]);
4221
0
    free(smpls);
4222
4223
0
    bcf_hdr_nsamples(hdr) = 0;
4224
0
    for (i=0; i<hdr->nsamples_ori; i++)
4225
0
        if ( bit_array_test(hdr->keep_samples,i) ) bcf_hdr_nsamples(hdr)++;
4226
4227
0
    if ( !bcf_hdr_nsamples(hdr) ) { free(hdr->keep_samples); hdr->keep_samples=NULL; }
4228
0
    else
4229
0
    {
4230
        // Make new list and dictionary with desired samples
4231
0
        char **samples = (char**) malloc(sizeof(char*)*bcf_hdr_nsamples(hdr));
4232
0
        vdict_t *new_dict, *d;
4233
0
        int k, res;
4234
0
        if (!samples) return -1;
4235
4236
0
        new_dict = kh_init(vdict);
4237
0
        if (!new_dict) {
4238
0
            free(samples);
4239
0
            return -1;
4240
0
        }
4241
0
        idx = 0;
4242
0
        for (i=0; i<hdr->nsamples_ori; i++) {
4243
0
            if ( bit_array_test(hdr->keep_samples,i) ) {
4244
0
                samples[idx] = hdr->samples[i];
4245
0
                k = kh_put(vdict, new_dict, hdr->samples[i], &res);
4246
0
                if (res < 0) {
4247
0
                    free(samples);
4248
0
                    kh_destroy(vdict, new_dict);
4249
0
                    return -1;
4250
0
                }
4251
0
                kh_val(new_dict, k) = bcf_idinfo_def;
4252
0
                kh_val(new_dict, k).id = idx;
4253
0
                idx++;
4254
0
            }
4255
0
        }
4256
4257
        // Delete desired samples from old dictionary, so we don't free them
4258
0
        d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE];
4259
0
        for (i=0; i < idx; i++) {
4260
0
            int k = kh_get(vdict, d, samples[i]);
4261
0
            if (k < kh_end(d)) kh_del(vdict, d, k);
4262
0
        }
4263
4264
        // Free everything else
4265
0
        for (k = kh_begin(d); k != kh_end(d); ++k)
4266
0
            if (kh_exist(d, k)) free((char*)kh_key(d, k));
4267
0
        kh_destroy(vdict, d);
4268
0
        hdr->dict[BCF_DT_SAMPLE] = new_dict;
4269
4270
0
        free(hdr->samples);
4271
0
        hdr->samples = samples;
4272
4273
0
        if (bcf_hdr_sync(hdr) < 0)
4274
0
            return -1;
4275
0
    }
4276
4277
0
    return ret;
4278
0
}
4279
4280
int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap)
4281
0
{
4282
0
    kstring_t ind;
4283
0
    ind.s = 0; ind.l = ind.m = 0;
4284
0
    if (n) {
4285
0
        bcf_fmt_t fmt[MAX_N_FMT];
4286
0
        int i, j;
4287
0
        uint8_t *ptr = (uint8_t*)v->indiv.s;
4288
0
        for (i = 0; i < v->n_fmt; ++i)
4289
0
            ptr = bcf_unpack_fmt_core1(ptr, v->n_sample, &fmt[i]);
4290
0
        for (i = 0; i < (int)v->n_fmt; ++i) {
4291
0
            bcf_fmt_t *f = &fmt[i];
4292
0
            bcf_enc_int1(&ind, f->id);
4293
0
            bcf_enc_size(&ind, f->n, f->type);
4294
0
            for (j = 0; j < n; ++j)
4295
0
                if (imap[j] >= 0) kputsn((char*)(f->p + imap[j] * f->size), f->size, &ind);
4296
0
        }
4297
0
        for (i = j = 0; j < n; ++j) if (imap[j] >= 0) ++i;
4298
0
        v->n_sample = i;
4299
0
    } else v->n_sample = 0;
4300
0
    if ( !v->n_sample ) v->n_fmt = 0;
4301
0
    free(v->indiv.s);
4302
0
    v->indiv = ind;
4303
0
    v->unpacked &= ~BCF_UN_FMT;    // only BCF is ready for output, VCF will need to unpack again
4304
0
    return 0;
4305
0
}
4306
4307
int bcf_is_snp(bcf1_t *v)
4308
0
{
4309
0
    int i;
4310
0
    bcf_unpack(v, BCF_UN_STR);
4311
0
    for (i = 0; i < v->n_allele; ++i)
4312
0
    {
4313
0
        if ( v->d.allele[i][1]==0 && v->d.allele[i][0]!='*' ) continue;
4314
4315
        // mpileup's <X> allele, see also below. This is not completely satisfactory,
4316
        // a general library is here narrowly tailored to fit samtools.
4317
0
        if ( v->d.allele[i][0]=='<' && v->d.allele[i][1]=='X' && v->d.allele[i][2]=='>' ) continue;
4318
0
        if ( v->d.allele[i][0]=='<' && v->d.allele[i][1]=='*' && v->d.allele[i][2]=='>' ) continue;
4319
4320
0
        break;
4321
0
    }
4322
0
    return i == v->n_allele;
4323
0
}
4324
4325
static void bcf_set_variant_type(const char *ref, const char *alt, bcf_variant_t *var)
4326
0
{
4327
0
    if ( *alt == '*' && !alt[1] ) { var->n = 0; var->type = VCF_OVERLAP; return; }  // overlapping variant
4328
4329
    // The most frequent case
4330
0
    if ( !ref[1] && !alt[1] )
4331
0
    {
4332
0
        if ( *alt == '.' || *ref==*alt ) { var->n = 0; var->type = VCF_REF; return; }
4333
0
        if ( *alt == 'X' ) { var->n = 0; var->type = VCF_REF; return; }  // mpileup's X allele shouldn't be treated as variant
4334
0
        var->n = 1; var->type = VCF_SNP; return;
4335
0
    }
4336
0
    if ( alt[0]=='<' )
4337
0
    {
4338
0
        if ( alt[1]=='X' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; }  // mpileup's X allele shouldn't be treated as variant
4339
0
        if ( alt[1]=='*' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; }
4340
0
        if ( !strcmp("NON_REF>",alt+1) ) { var->n = 0; var->type = VCF_REF; return; }
4341
0
        var->type = VCF_OTHER;
4342
0
        return;
4343
0
    }
4344
4345
    // Catch "joined before" breakend case
4346
0
    if ( alt[0]==']' || alt[0] == '[' )
4347
0
    {
4348
0
        var->type = VCF_BND; return;
4349
0
    }
4350
4351
    // Iterate through alt characters that match the reference
4352
0
    const char *r = ref, *a = alt;
4353
0
    while (*r && *a && toupper_c(*r)==toupper_c(*a) ) { r++; a++; }     // unfortunately, matching REF,ALT case is not guaranteed
4354
4355
0
    if ( *a && !*r )
4356
0
    {
4357
0
        if ( *a==']' || *a=='[' ) { var->type = VCF_BND; return; } // "joined after" breakend
4358
0
        while ( *a ) a++;
4359
0
        var->n = (a-alt)-(r-ref); var->type = VCF_INDEL | VCF_INS; return;
4360
0
    }
4361
0
    else if ( *r && !*a )
4362
0
    {
4363
0
        while ( *r ) r++;
4364
0
        var->n = (a-alt)-(r-ref); var->type = VCF_INDEL | VCF_DEL; return;
4365
0
    }
4366
0
    else if ( !*r && !*a )
4367
0
    {
4368
0
        var->n = 0; var->type = VCF_REF; return;
4369
0
    }
4370
4371
0
    const char *re = r, *ae = a;
4372
0
    while ( re[1] ) re++;
4373
0
    while ( ae[1] ) ae++;
4374
0
    while ( re>r && ae>a && toupper_c(*re)==toupper_c(*ae) ) { re--; ae--; }
4375
0
    if ( ae==a )
4376
0
    {
4377
0
        if ( re==r ) { var->n = 1; var->type = VCF_SNP; return; }
4378
0
        var->n = -(re-r);
4379
0
        if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL | VCF_DEL; return; }
4380
0
        var->type = VCF_OTHER; return;
4381
0
    }
4382
0
    else if ( re==r )
4383
0
    {
4384
0
        var->n = ae-a;
4385
0
        if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL | VCF_INS; return; }
4386
0
        var->type = VCF_OTHER; return;
4387
0
    }
4388
4389
0
    var->type = ( re-r == ae-a ) ? VCF_MNP : VCF_OTHER;
4390
0
    var->n = ( re-r > ae-a ) ? -(re-r+1) : ae-a+1;
4391
4392
    // should do also complex events, SVs, etc...
4393
0
}
4394
4395
static int bcf_set_variant_types(bcf1_t *b)
4396
0
{
4397
0
    if ( !(b->unpacked & BCF_UN_STR) ) bcf_unpack(b, BCF_UN_STR);
4398
0
    bcf_dec_t *d = &b->d;
4399
0
    if ( d->n_var < b->n_allele )
4400
0
    {
4401
0
        bcf_variant_t *new_var = realloc(d->var, sizeof(bcf_variant_t)*b->n_allele);
4402
0
        if (!new_var)
4403
0
            return -1;
4404
0
        d->var = new_var;
4405
0
        d->n_var = b->n_allele;
4406
0
    }
4407
0
    int i;
4408
0
    b->d.var_type = 0;
4409
0
    d->var[0].type = VCF_REF;
4410
0
    d->var[0].n    = 0;
4411
0
    for (i=1; i<b->n_allele; i++)
4412
0
    {
4413
0
        bcf_set_variant_type(d->allele[0],d->allele[i], &d->var[i]);
4414
0
        b->d.var_type |= d->var[i].type;
4415
        //fprintf(stderr,"[set_variant_type] %d   %s %s -> %d %d .. %d\n", b->pos+1,d->allele[0],d->allele[i],d->var[i].type,d->var[i].n, b->d.var_type);
4416
0
    }
4417
0
    return 0;
4418
0
}
4419
4420
// bcf_get_variant_type/bcf_get_variant_types should only return the following,
4421
// to be compatible with callers that are not expecting newer values
4422
// like VCF_INS, VCF_DEL.  The full set is available from the newer
4423
// vcf_has_variant_type* interfaces.
4424
0
#define ORIG_VAR_TYPES (VCF_SNP|VCF_MNP|VCF_INDEL|VCF_OTHER|VCF_BND|VCF_OVERLAP)
4425
int bcf_get_variant_types(bcf1_t *rec)
4426
0
{
4427
0
    if ( rec->d.var_type==-1 ) {
4428
0
        if (bcf_set_variant_types(rec) != 0) {
4429
0
            hts_log_error("Couldn't get variant types: %s", strerror(errno));
4430
0
            exit(1); // Due to legacy API having no way to report failures
4431
0
        }
4432
0
    }
4433
0
    return rec->d.var_type & ORIG_VAR_TYPES;
4434
0
}
4435
4436
int bcf_get_variant_type(bcf1_t *rec, int ith_allele)
4437
0
{
4438
0
    if ( rec->d.var_type==-1 ) {
4439
0
        if (bcf_set_variant_types(rec) != 0) {
4440
0
            hts_log_error("Couldn't get variant types: %s", strerror(errno));
4441
0
            exit(1); // Due to legacy API having no way to report failures
4442
0
        }
4443
0
    }
4444
0
    if (ith_allele < 0 || ith_allele >= rec->n_allele) {
4445
0
        hts_log_error("Requested allele outside valid range");
4446
0
        exit(1);
4447
0
    }
4448
0
    return rec->d.var[ith_allele].type & ORIG_VAR_TYPES;
4449
0
}
4450
#undef ORIG_VAR_TYPES
4451
4452
int bcf_has_variant_type(bcf1_t *rec, int ith_allele, uint32_t bitmask)
4453
0
{
4454
0
    if ( rec->d.var_type==-1 ) {
4455
0
        if (bcf_set_variant_types(rec) != 0) return -1;
4456
0
    }
4457
0
    if (ith_allele < 0 || ith_allele >= rec->n_allele) return -1;
4458
0
    if (bitmask == VCF_REF) {  // VCF_REF is 0, so handled as a special case
4459
0
        return rec->d.var[ith_allele].type == VCF_REF;
4460
0
    }
4461
0
    return bitmask & rec->d.var[ith_allele].type;
4462
0
}
4463
4464
int bcf_variant_length(bcf1_t *rec, int ith_allele)
4465
0
{
4466
0
    if ( rec->d.var_type==-1 ) {
4467
0
        if (bcf_set_variant_types(rec) != 0) return bcf_int32_missing;
4468
0
    }
4469
0
    if (ith_allele < 0 || ith_allele >= rec->n_allele) return bcf_int32_missing;
4470
0
    return rec->d.var[ith_allele].n;
4471
0
}
4472
4473
int bcf_has_variant_types(bcf1_t *rec, uint32_t bitmask,
4474
                          enum bcf_variant_match mode)
4475
0
{
4476
0
    if ( rec->d.var_type==-1 ) {
4477
0
        if (bcf_set_variant_types(rec) != 0) return -1;
4478
0
    }
4479
0
    uint32_t type = rec->d.var_type;
4480
0
    if ( mode==bcf_match_overlap ) return bitmask & type;
4481
4482
    // VCF_INDEL is always set with VCF_INS and VCF_DEL by bcf_set_variant_type[s], but the bitmask may
4483
    // ask for say `VCF_INS` or `VCF_INDEL` only
4484
0
    if ( bitmask&(VCF_INS|VCF_DEL) && !(bitmask&VCF_INDEL) ) type &= ~VCF_INDEL;
4485
0
    else if ( bitmask&VCF_INDEL && !(bitmask&(VCF_INS|VCF_DEL)) ) type &= ~(VCF_INS|VCF_DEL);
4486
4487
0
    if ( mode==bcf_match_subset )
4488
0
    {
4489
0
        if ( ~bitmask & type ) return 0;
4490
0
        else return bitmask & type;
4491
0
    }
4492
    // mode == bcf_match_exact
4493
0
    return type==bitmask ? type : 0;
4494
0
}
4495
4496
int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
4497
0
{
4498
0
    static int negative_rlen_warned = 0;
4499
0
    int is_end_tag;
4500
4501
    // Is the field already present?
4502
0
    int i, inf_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
4503
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,inf_id) ) return -1;    // No such INFO field in the header
4504
0
    if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
4505
4506
0
    is_end_tag = strcmp(key, "END") == 0;
4507
4508
0
    for (i=0; i<line->n_info; i++)
4509
0
        if ( inf_id==line->d.info[i].key ) break;
4510
0
    bcf_info_t *inf = i==line->n_info ? NULL : &line->d.info[i];
4511
4512
0
    if ( !n || (type==BCF_HT_STR && !values) )
4513
0
    {
4514
0
        if ( n==0 && is_end_tag )
4515
0
            line->rlen = line->n_allele ? strlen(line->d.allele[0]) : 0;
4516
0
        if ( inf )
4517
0
        {
4518
            // Mark the tag for removal, free existing memory if necessary
4519
0
            if ( inf->vptr_free )
4520
0
            {
4521
0
                free(inf->vptr - inf->vptr_off);
4522
0
                inf->vptr_free = 0;
4523
0
            }
4524
0
            line->d.shared_dirty |= BCF1_DIRTY_INF;
4525
0
            inf->vptr = NULL;
4526
0
            inf->vptr_off = inf->vptr_len = 0;
4527
0
        }
4528
0
        return 0;
4529
0
    }
4530
4531
0
    if (is_end_tag)
4532
0
    {
4533
0
        if (n != 1)
4534
0
        {
4535
0
            hts_log_error("END info tag should only have one value at %s:%"PRIhts_pos, bcf_seqname_safe(hdr,line), line->pos+1);
4536
0
            line->errcode |= BCF_ERR_TAG_INVALID;
4537
0
            return -1;
4538
0
        }
4539
0
        if (type != BCF_HT_INT && type != BCF_HT_LONG)
4540
0
        {
4541
0
            hts_log_error("Wrong type (%d) for END info tag at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
4542
0
            line->errcode |= BCF_ERR_TAG_INVALID;
4543
0
            return -1;
4544
0
        }
4545
0
    }
4546
4547
    // Encode the values and determine the size required to accommodate the values
4548
0
    kstring_t str = {0,0,0};
4549
0
    bcf_enc_int1(&str, inf_id);
4550
0
    if ( type==BCF_HT_INT )
4551
0
        bcf_enc_vint(&str, n, (int32_t*)values, -1);
4552
0
    else if ( type==BCF_HT_REAL )
4553
0
        bcf_enc_vfloat(&str, n, (float*)values);
4554
0
    else if ( type==BCF_HT_FLAG || type==BCF_HT_STR )
4555
0
    {
4556
0
        if ( values==NULL )
4557
0
            bcf_enc_size(&str, 0, BCF_BT_NULL);
4558
0
        else
4559
0
            bcf_enc_vchar(&str, strlen((char*)values), (char*)values);
4560
0
    }
4561
#ifdef VCF_ALLOW_INT64
4562
    else if ( type==BCF_HT_LONG )
4563
    {
4564
        if (n != 1) {
4565
            hts_log_error("Only storing a single BCF_HT_LONG value is supported at %s:%"PRIhts_pos, bcf_seqname_safe(hdr,line), line->pos+1);
4566
            abort();
4567
        }
4568
        bcf_enc_long1(&str, *(int64_t *) values);
4569
    }
4570
#endif
4571
0
    else
4572
0
    {
4573
0
        hts_log_error("The type %d not implemented yet at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
4574
0
        abort();
4575
0
    }
4576
4577
    // Is the INFO tag already present
4578
0
    if ( inf )
4579
0
    {
4580
        // Is it big enough to accommodate new block?
4581
0
        if ( inf->vptr && str.l <= inf->vptr_len + inf->vptr_off )
4582
0
        {
4583
0
            if ( str.l != inf->vptr_len + inf->vptr_off ) line->d.shared_dirty |= BCF1_DIRTY_INF;
4584
0
            uint8_t *ptr = inf->vptr - inf->vptr_off;
4585
0
            memcpy(ptr, str.s, str.l);
4586
0
            free(str.s);
4587
0
            int vptr_free = inf->vptr_free;
4588
0
            bcf_unpack_info_core1(ptr, inf);
4589
0
            inf->vptr_free = vptr_free;
4590
0
        }
4591
0
        else
4592
0
        {
4593
0
            if ( inf->vptr_free )
4594
0
                free(inf->vptr - inf->vptr_off);
4595
0
            bcf_unpack_info_core1((uint8_t*)str.s, inf);
4596
0
            inf->vptr_free = 1;
4597
0
            line->d.shared_dirty |= BCF1_DIRTY_INF;
4598
0
        }
4599
0
    }
4600
0
    else
4601
0
    {
4602
        // The tag is not present, create new one
4603
0
        line->n_info++;
4604
0
        hts_expand0(bcf_info_t, line->n_info, line->d.m_info , line->d.info);
4605
0
        inf = &line->d.info[line->n_info-1];
4606
0
        bcf_unpack_info_core1((uint8_t*)str.s, inf);
4607
0
        inf->vptr_free = 1;
4608
0
        line->d.shared_dirty |= BCF1_DIRTY_INF;
4609
0
    }
4610
0
    line->unpacked |= BCF_UN_INFO;
4611
4612
0
   if ( n==1 && is_end_tag) {
4613
0
        hts_pos_t end = type == BCF_HT_INT ? *(int32_t *) values : *(int64_t *) values;
4614
0
        if ( (type == BCF_HT_INT && end!=bcf_int32_missing) || (type == BCF_HT_LONG && end!=bcf_int64_missing) )
4615
0
        {
4616
0
            if ( end <= line->pos )
4617
0
            {
4618
0
                if ( !negative_rlen_warned )
4619
0
                {
4620
0
                    hts_log_warning("INFO/END=%"PRIhts_pos" is smaller than POS at %s:%"PRIhts_pos,end,bcf_seqname_safe(hdr,line),line->pos+1);
4621
0
                    negative_rlen_warned = 1;
4622
0
                }
4623
0
                line->rlen = line->n_allele ? strlen(line->d.allele[0]) : 0;
4624
0
            }
4625
0
            else
4626
0
                line->rlen = end - line->pos;
4627
0
        }
4628
0
    }
4629
0
    return 0;
4630
0
}
4631
4632
int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n)
4633
0
{
4634
0
    if ( !n )
4635
0
        return bcf_update_format(hdr,line,key,NULL,0,BCF_HT_STR);
4636
4637
0
    int i, max_len = 0;
4638
0
    for (i=0; i<n; i++)
4639
0
    {
4640
0
        int len = strlen(values[i]);
4641
0
        if ( len > max_len ) max_len = len;
4642
0
    }
4643
0
    char *out = (char*) malloc(max_len*n);
4644
0
    if ( !out ) return -2;
4645
0
    for (i=0; i<n; i++)
4646
0
    {
4647
0
        char *dst = out+i*max_len;
4648
0
        const char *src = values[i];
4649
0
        int j = 0;
4650
0
        while ( src[j] ) { dst[j] = src[j]; j++; }
4651
0
        for (; j<max_len; j++) dst[j] = 0;
4652
0
    }
4653
0
    int ret = bcf_update_format(hdr,line,key,out,max_len*n,BCF_HT_STR);
4654
0
    free(out);
4655
0
    return ret;
4656
0
}
4657
4658
int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
4659
0
{
4660
    // Is the field already present?
4661
0
    int i, fmt_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
4662
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,fmt_id) )
4663
0
    {
4664
0
        if ( !n ) return 0;
4665
0
        return -1;  // the key not present in the header
4666
0
    }
4667
4668
0
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
4669
4670
0
    for (i=0; i<line->n_fmt; i++)
4671
0
        if ( line->d.fmt[i].id==fmt_id ) break;
4672
0
    bcf_fmt_t *fmt = i==line->n_fmt ? NULL : &line->d.fmt[i];
4673
4674
0
    if ( !n )
4675
0
    {
4676
0
        if ( fmt )
4677
0
        {
4678
            // Mark the tag for removal, free existing memory if necessary
4679
0
            if ( fmt->p_free )
4680
0
            {
4681
0
                free(fmt->p - fmt->p_off);
4682
0
                fmt->p_free = 0;
4683
0
            }
4684
0
            line->d.indiv_dirty = 1;
4685
0
            fmt->p = NULL;
4686
0
        }
4687
0
        return 0;
4688
0
    }
4689
4690
0
    line->n_sample = bcf_hdr_nsamples(hdr);
4691
0
    int nps = n / line->n_sample;  // number of values per sample
4692
0
    assert( nps && nps*line->n_sample==n );     // must be divisible by n_sample
4693
4694
    // Encode the values and determine the size required to accommodate the values
4695
0
    kstring_t str = {0,0,0};
4696
0
    bcf_enc_int1(&str, fmt_id);
4697
0
    assert(values != NULL);
4698
0
    if ( type==BCF_HT_INT )
4699
0
        bcf_enc_vint(&str, n, (int32_t*)values, nps);
4700
0
    else if ( type==BCF_HT_REAL )
4701
0
    {
4702
0
        bcf_enc_size(&str, nps, BCF_BT_FLOAT);
4703
0
        serialize_float_array(&str, nps*line->n_sample, (float *) values);
4704
0
    }
4705
0
    else if ( type==BCF_HT_STR )
4706
0
    {
4707
0
        bcf_enc_size(&str, nps, BCF_BT_CHAR);
4708
0
        kputsn((char*)values, nps*line->n_sample, &str);
4709
0
    }
4710
0
    else
4711
0
    {
4712
0
        hts_log_error("The type %d not implemented yet at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
4713
0
        abort();
4714
0
    }
4715
4716
0
    if ( !fmt )
4717
0
    {
4718
        // Not present, new format field
4719
0
        line->n_fmt++;
4720
0
        hts_expand0(bcf_fmt_t, line->n_fmt, line->d.m_fmt, line->d.fmt);
4721
4722
        // Special case: VCF specification requires that GT is always first
4723
0
        if ( line->n_fmt > 1 && key[0]=='G' && key[1]=='T' && !key[2] )
4724
0
        {
4725
0
            for (i=line->n_fmt-1; i>0; i--)
4726
0
                line->d.fmt[i] = line->d.fmt[i-1];
4727
0
            fmt = &line->d.fmt[0];
4728
0
        }
4729
0
        else
4730
0
            fmt = &line->d.fmt[line->n_fmt-1];
4731
0
        bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
4732
0
        line->d.indiv_dirty = 1;
4733
0
        fmt->p_free = 1;
4734
0
    }
4735
0
    else
4736
0
    {
4737
        // The tag is already present, check if it is big enough to accommodate the new block
4738
0
        if ( fmt->p && str.l <= fmt->p_len + fmt->p_off )
4739
0
        {
4740
            // good, the block is big enough
4741
0
            if ( str.l != fmt->p_len + fmt->p_off ) line->d.indiv_dirty = 1;
4742
0
            uint8_t *ptr = fmt->p - fmt->p_off;
4743
0
            memcpy(ptr, str.s, str.l);
4744
0
            free(str.s);
4745
0
            int p_free = fmt->p_free;
4746
0
            bcf_unpack_fmt_core1(ptr, line->n_sample, fmt);
4747
0
            fmt->p_free = p_free;
4748
0
        }
4749
0
        else
4750
0
        {
4751
0
            if ( fmt->p_free )
4752
0
                free(fmt->p - fmt->p_off);
4753
0
            bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
4754
0
            fmt->p_free = 1;
4755
0
            line->d.indiv_dirty = 1;
4756
0
        }
4757
0
    }
4758
0
    line->unpacked |= BCF_UN_FMT;
4759
0
    return 0;
4760
0
}
4761
4762
4763
int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n)
4764
0
{
4765
0
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
4766
0
    line->d.shared_dirty |= BCF1_DIRTY_FLT;
4767
0
    line->d.n_flt = n;
4768
0
    if ( !n ) return 0;
4769
0
    hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);
4770
0
    int i;
4771
0
    for (i=0; i<n; i++)
4772
0
        line->d.flt[i] = flt_ids[i];
4773
0
    return 0;
4774
0
}
4775
4776
int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id)
4777
0
{
4778
0
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
4779
0
    int i;
4780
0
    for (i=0; i<line->d.n_flt; i++)
4781
0
        if ( flt_id==line->d.flt[i] ) break;
4782
0
    if ( i<line->d.n_flt ) return 0;    // this filter is already set
4783
0
    line->d.shared_dirty |= BCF1_DIRTY_FLT;
4784
0
    if ( flt_id==0 )    // set to PASS
4785
0
        line->d.n_flt = 1;
4786
0
    else if ( line->d.n_flt==1 && line->d.flt[0]==0 )
4787
0
        line->d.n_flt = 1;
4788
0
    else
4789
0
        line->d.n_flt++;
4790
0
    hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);
4791
0
    line->d.flt[line->d.n_flt-1] = flt_id;
4792
0
    return 1;
4793
0
}
4794
int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int pass)
4795
0
{
4796
0
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
4797
0
    int i;
4798
0
    for (i=0; i<line->d.n_flt; i++)
4799
0
        if ( flt_id==line->d.flt[i] ) break;
4800
0
    if ( i==line->d.n_flt ) return 0;   // the filter is not present
4801
0
    line->d.shared_dirty |= BCF1_DIRTY_FLT;
4802
0
    if ( i!=line->d.n_flt-1 ) memmove(line->d.flt+i,line->d.flt+i+1,(line->d.n_flt-i-1)*sizeof(*line->d.flt));
4803
0
    line->d.n_flt--;
4804
0
    if ( !line->d.n_flt && pass ) bcf_add_filter(hdr,line,0);
4805
0
    return 0;
4806
0
}
4807
4808
int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter)
4809
0
{
4810
0
    if ( filter[0]=='.' && !filter[1] ) filter = "PASS";
4811
0
    int id = bcf_hdr_id2int(hdr, BCF_DT_ID, filter);
4812
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FLT,id) ) return -1;  // not defined in the header
4813
4814
0
    if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
4815
0
    if ( id==0 && !line->d.n_flt) return 1; // PASS
4816
4817
0
    int i;
4818
0
    for (i=0; i<line->d.n_flt; i++)
4819
0
        if ( line->d.flt[i]==id ) return 1;
4820
0
    return 0;
4821
0
}
4822
4823
static inline int _bcf1_sync_alleles(const bcf_hdr_t *hdr, bcf1_t *line, int nals)
4824
0
{
4825
0
    line->d.shared_dirty |= BCF1_DIRTY_ALS;
4826
4827
0
    line->n_allele = nals;
4828
0
    hts_expand(char*, line->n_allele, line->d.m_allele, line->d.allele);
4829
4830
0
    char *als = line->d.als;
4831
0
    int n = 0;
4832
0
    while (n<nals)
4833
0
    {
4834
0
        line->d.allele[n] = als;
4835
0
        while ( *als ) als++;
4836
0
        als++;
4837
0
        n++;
4838
0
    }
4839
4840
    // Update REF length. Note that END is 1-based while line->pos 0-based
4841
0
    bcf_info_t *end_info = bcf_get_info(hdr,line,"END");
4842
0
    if ( end_info )
4843
0
    {
4844
0
        if ( end_info->type==BCF_HT_INT && end_info->v1.i==bcf_int32_missing ) end_info = NULL;
4845
0
        else if ( end_info->type==BCF_HT_LONG && end_info->v1.i==bcf_int64_missing ) end_info = NULL;
4846
0
    }
4847
0
    if ( end_info && end_info->v1.i > line->pos )
4848
0
        line->rlen = end_info->v1.i - line->pos;
4849
0
    else if ( nals > 0 )
4850
0
        line->rlen = strlen(line->d.allele[0]);
4851
0
    else
4852
0
        line->rlen = 0;
4853
4854
0
    return 0;
4855
0
}
4856
int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals)
4857
0
{
4858
0
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
4859
0
    char *free_old = NULL;
4860
0
    char buffer[256];
4861
0
    size_t used = 0;
4862
4863
    // The pointers in alleles may point into the existing line->d.als memory,
4864
    // so care needs to be taken not to clobber them while updating.  Usually
4865
    // they will be short so we can copy through an intermediate buffer.
4866
    // If they're longer, or won't fit in the existing allocation we
4867
    // can allocate a new buffer to write into.  Note that in either case
4868
    // pointers to line->d.als memory in alleles may not be valid when we've
4869
    // finished.
4870
0
    int i;
4871
0
    size_t avail = line->d.m_als < sizeof(buffer) ? line->d.m_als : sizeof(buffer);
4872
0
    for (i=0; i<nals; i++) {
4873
0
        size_t sz = strlen(alleles[i]) + 1;
4874
0
        if (avail - used < sz)
4875
0
            break;
4876
0
        memcpy(buffer + used, alleles[i], sz);
4877
0
        used += sz;
4878
0
    }
4879
4880
    // Did we miss anything?
4881
0
    if (i < nals) {
4882
0
        int j;
4883
0
        size_t needed = used;
4884
0
        char *new_als;
4885
0
        for (j = i; j < nals; j++)
4886
0
            needed += strlen(alleles[j]) + 1;
4887
0
        if (needed < line->d.m_als) // Don't shrink the buffer
4888
0
            needed = line->d.m_als;
4889
0
        if (needed > INT_MAX) {
4890
0
            hts_log_error("REF + alleles too long to fit in a BCF record");
4891
0
            return -1;
4892
0
        }
4893
0
        new_als = malloc(needed);
4894
0
        if (!new_als)
4895
0
            return -1;
4896
0
        free_old = line->d.als;
4897
0
        line->d.als = new_als;
4898
0
        line->d.m_als = needed;
4899
0
    }
4900
4901
    // Copy from the temp buffer to the destination
4902
0
    if (used) {
4903
0
        assert(used <= line->d.m_als);
4904
0
        memcpy(line->d.als, buffer, used);
4905
0
    }
4906
4907
    // Add in any remaining entries - if this happens we will always be
4908
    // writing to a newly-allocated buffer.
4909
0
    for (; i < nals; i++) {
4910
0
        size_t sz = strlen(alleles[i]) + 1;
4911
0
        memcpy(line->d.als + used, alleles[i], sz);
4912
0
        used += sz;
4913
0
    }
4914
4915
0
    if (free_old)
4916
0
        free(free_old);
4917
0
    return _bcf1_sync_alleles(hdr,line,nals);
4918
0
}
4919
4920
int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string)
4921
0
{
4922
0
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
4923
0
    kstring_t tmp;
4924
0
    tmp.l = 0; tmp.s = line->d.als; tmp.m = line->d.m_als;
4925
0
    kputs(alleles_string, &tmp);
4926
0
    line->d.als = tmp.s; line->d.m_als = tmp.m;
4927
4928
0
    int nals = 1;
4929
0
    char *t = line->d.als;
4930
0
    while (*t)
4931
0
    {
4932
0
        if ( *t==',' ) { *t = 0; nals++; }
4933
0
        t++;
4934
0
    }
4935
0
    return _bcf1_sync_alleles(hdr, line, nals);
4936
0
}
4937
4938
int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
4939
0
{
4940
0
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
4941
0
    kstring_t tmp;
4942
0
    tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id;
4943
0
    if ( id )
4944
0
        kputs(id, &tmp);
4945
0
    else
4946
0
        kputs(".", &tmp);
4947
0
    line->d.id = tmp.s; line->d.m_id = tmp.m;
4948
0
    line->d.shared_dirty |= BCF1_DIRTY_ID;
4949
0
    return 0;
4950
0
}
4951
4952
int bcf_add_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
4953
0
{
4954
0
    if ( !id ) return 0;
4955
0
    if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR);
4956
4957
0
    kstring_t tmp;
4958
0
    tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id;
4959
4960
0
    int len = strlen(id);
4961
0
    char *dst = line->d.id;
4962
0
    while ( *dst && (dst=strstr(dst,id)) )
4963
0
    {
4964
0
        if ( dst[len]!=0 && dst[len]!=';' ) dst++;              // a prefix, not a match
4965
0
        else if ( dst==line->d.id || dst[-1]==';' ) return 0;   // already present
4966
0
        dst++;  // a suffix, not a match
4967
0
    }
4968
0
    if ( line->d.id && (line->d.id[0]!='.' || line->d.id[1]) )
4969
0
    {
4970
0
        tmp.l = strlen(line->d.id);
4971
0
        kputc(';',&tmp);
4972
0
    }
4973
0
    kputs(id,&tmp);
4974
4975
0
    line->d.id = tmp.s; line->d.m_id = tmp.m;
4976
0
    line->d.shared_dirty |= BCF1_DIRTY_ID;
4977
0
    return 0;
4978
4979
0
}
4980
4981
bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
4982
0
{
4983
0
    int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
4984
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) ) return NULL;   // no such FMT field in the header
4985
0
    return bcf_get_fmt_id(line, id);
4986
0
}
4987
4988
bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
4989
0
{
4990
0
    int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
4991
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,id) ) return NULL;   // no such INFO field in the header
4992
0
    return bcf_get_info_id(line, id);
4993
0
}
4994
4995
bcf_fmt_t *bcf_get_fmt_id(bcf1_t *line, const int id)
4996
0
{
4997
0
    int i;
4998
0
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
4999
0
    for (i=0; i<line->n_fmt; i++)
5000
0
    {
5001
0
        if ( line->d.fmt[i].id==id ) return &line->d.fmt[i];
5002
0
    }
5003
0
    return NULL;
5004
0
}
5005
5006
bcf_info_t *bcf_get_info_id(bcf1_t *line, const int id)
5007
0
{
5008
0
    int i;
5009
0
    if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
5010
0
    for (i=0; i<line->n_info; i++)
5011
0
    {
5012
0
        if ( line->d.info[i].key==id ) return &line->d.info[i];
5013
0
    }
5014
0
    return NULL;
5015
0
}
5016
5017
5018
int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
5019
0
{
5020
0
    int i, ret = -4, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
5021
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,tag_id) ) return -1;    // no such INFO field in the header
5022
0
    if ( bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id)!=(type & 0xff) ) return -2;     // expected different type
5023
5024
0
    if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
5025
5026
0
    for (i=0; i<line->n_info; i++)
5027
0
        if ( line->d.info[i].key==tag_id ) break;
5028
0
    if ( i==line->n_info ) return ( type==BCF_HT_FLAG ) ? 0 : -3;       // the tag is not present in this record
5029
0
    if ( type==BCF_HT_FLAG ) return 1;
5030
5031
0
    bcf_info_t *info = &line->d.info[i];
5032
0
    if ( !info->vptr ) return -3;           // the tag was marked for removal
5033
0
    if ( type==BCF_HT_STR )
5034
0
    {
5035
0
        if ( *ndst < info->len+1 )
5036
0
        {
5037
0
            *ndst = info->len + 1;
5038
0
            *dst  = realloc(*dst, *ndst);
5039
0
        }
5040
0
        memcpy(*dst,info->vptr,info->len);
5041
0
        ((uint8_t*)*dst)[info->len] = 0;
5042
0
        return info->len;
5043
0
    }
5044
5045
    // Make sure the buffer is big enough
5046
0
    int size1;
5047
0
    switch (type) {
5048
0
        case BCF_HT_INT:  size1 = sizeof(int32_t); break;
5049
0
        case BCF_HT_LONG: size1 = sizeof(int64_t); break;
5050
0
        case BCF_HT_REAL: size1 = sizeof(float); break;
5051
0
        default:
5052
0
            hts_log_error("Unexpected output type %d at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1);
5053
0
            return -2;
5054
0
    }
5055
0
    if ( *ndst < info->len )
5056
0
    {
5057
0
        *ndst = info->len;
5058
0
        *dst  = realloc(*dst, *ndst * size1);
5059
0
    }
5060
5061
0
    #define BRANCH(type_t, convert, is_missing, is_vector_end, set_missing, set_regular, out_type_t) do { \
5062
0
        out_type_t *tmp = (out_type_t *) *dst; \
5063
0
        int j; \
5064
0
        for (j=0; j<info->len; j++) \
5065
0
        { \
5066
0
            type_t p = convert(info->vptr + j * sizeof(type_t)); \
5067
0
            if ( is_vector_end ) break; \
5068
0
            if ( is_missing ) set_missing; \
5069
0
            else set_regular; \
5070
0
            tmp++; \
5071
0
        } \
5072
0
        ret = j; \
5073
0
    } while (0)
5074
0
    switch (info->type) {
5075
0
        case BCF_BT_INT8:
5076
0
            if (type == BCF_HT_LONG) {
5077
0
                BRANCH(int8_t,  le_to_i8,  p==bcf_int8_missing,  p==bcf_int8_vector_end,  *tmp=bcf_int64_missing, *tmp=p, int64_t);
5078
0
            } else {
5079
0
                BRANCH(int8_t,  le_to_i8,  p==bcf_int8_missing,  p==bcf_int8_vector_end,  *tmp=bcf_int32_missing, *tmp=p, int32_t);
5080
0
            }
5081
0
            break;
5082
0
        case BCF_BT_INT16:
5083
0
            if (type == BCF_HT_LONG) {
5084
0
                BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t);
5085
0
            } else {
5086
0
                BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t);
5087
0
            }
5088
0
            break;
5089
0
        case BCF_BT_INT32:
5090
0
            if (type == BCF_HT_LONG) {
5091
0
                BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t); break;
5092
0
            } else {
5093
0
                BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); break;
5094
0
            }
5095
0
        case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set(tmp, p), float); break;
5096
0
        default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, info->type, bcf_seqname_safe(hdr,line), line->pos+1); return -2;
5097
0
    }
5098
0
    #undef BRANCH
5099
0
    return ret;  // set by BRANCH
5100
0
}
5101
5102
int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst)
5103
0
{
5104
0
    int i,tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
5105
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1;    // no such FORMAT field in the header
5106
0
    if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2;     // expected different type
5107
5108
0
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
5109
5110
0
    for (i=0; i<line->n_fmt; i++)
5111
0
        if ( line->d.fmt[i].id==tag_id ) break;
5112
0
    if ( i==line->n_fmt ) return -3;                               // the tag is not present in this record
5113
0
    bcf_fmt_t *fmt = &line->d.fmt[i];
5114
0
    if ( !fmt->p ) return -3;                                      // the tag was marked for removal
5115
5116
0
    int nsmpl = bcf_hdr_nsamples(hdr);
5117
0
    if ( !*dst )
5118
0
    {
5119
0
        *dst = (char**) malloc(sizeof(char*)*nsmpl);
5120
0
        if ( !*dst ) return -4;     // could not alloc
5121
0
        (*dst)[0] = NULL;
5122
0
    }
5123
0
    int n = (fmt->n+1)*nsmpl;
5124
0
    if ( *ndst < n )
5125
0
    {
5126
0
        (*dst)[0] = realloc((*dst)[0], n);
5127
0
        if ( !(*dst)[0] ) return -4;    // could not alloc
5128
0
        *ndst = n;
5129
0
    }
5130
0
    for (i=0; i<nsmpl; i++)
5131
0
    {
5132
0
        uint8_t *src = fmt->p + i*fmt->n;
5133
0
        uint8_t *tmp = (uint8_t*)(*dst)[0] + i*(fmt->n+1);
5134
0
        memcpy(tmp,src,fmt->n);
5135
0
        tmp[fmt->n] = 0;
5136
0
        (*dst)[i] = (char*) tmp;
5137
0
    }
5138
0
    return n;
5139
0
}
5140
5141
int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
5142
0
{
5143
0
    int i,j, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
5144
0
    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1;    // no such FORMAT field in the header
5145
0
    if ( tag[0]=='G' && tag[1]=='T' && tag[2]==0 )
5146
0
    {
5147
        // Ugly: GT field is considered to be a string by the VCF header but BCF represents it as INT.
5148
0
        if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2;
5149
0
    }
5150
0
    else if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=type ) return -2;     // expected different type
5151
5152
0
    if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
5153
5154
0
    for (i=0; i<line->n_fmt; i++)
5155
0
        if ( line->d.fmt[i].id==tag_id ) break;
5156
0
    if ( i==line->n_fmt ) return -3;                               // the tag is not present in this record
5157
0
    bcf_fmt_t *fmt = &line->d.fmt[i];
5158
0
    if ( !fmt->p ) return -3;                                      // the tag was marked for removal
5159
5160
0
    if ( type==BCF_HT_STR )
5161
0
    {
5162
0
        int n = fmt->n*bcf_hdr_nsamples(hdr);
5163
0
        if ( *ndst < n )
5164
0
        {
5165
0
            *dst  = realloc(*dst, n);
5166
0
            if ( !*dst ) return -4;     // could not alloc
5167
0
            *ndst = n;
5168
0
        }
5169
0
        memcpy(*dst,fmt->p,n);
5170
0
        return n;
5171
0
    }
5172
5173
    // Make sure the buffer is big enough
5174
0
    int nsmpl = bcf_hdr_nsamples(hdr);
5175
0
    int size1 = type==BCF_HT_INT ? sizeof(int32_t) : sizeof(float);
5176
0
    if ( *ndst < fmt->n*nsmpl )
5177
0
    {
5178
0
        *ndst = fmt->n*nsmpl;
5179
0
        *dst  = realloc(*dst, *ndst*size1);
5180
0
        if ( !*dst ) return -4;     // could not alloc
5181
0
    }
5182
5183
0
    #define BRANCH(type_t, convert, is_missing, is_vector_end, set_missing, set_vector_end, set_regular, out_type_t) { \
5184
0
        out_type_t *tmp = (out_type_t *) *dst; \
5185
0
        uint8_t *fmt_p = fmt->p; \
5186
0
        for (i=0; i<nsmpl; i++) \
5187
0
        { \
5188
0
            for (j=0; j<fmt->n; j++) \
5189
0
            { \
5190
0
                type_t p = convert(fmt_p + j * sizeof(type_t)); \
5191
0
                if ( is_missing ) set_missing; \
5192
0
                else if ( is_vector_end ) { set_vector_end; break; } \
5193
0
                else set_regular; \
5194
0
                tmp++; \
5195
0
            } \
5196
0
            for (; j<fmt->n; j++) { set_vector_end; tmp++; } \
5197
0
            fmt_p += fmt->size; \
5198
0
        } \
5199
0
    }
5200
0
    switch (fmt->type) {
5201
0
        case BCF_BT_INT8:  BRANCH(int8_t,  le_to_i8, p==bcf_int8_missing,  p==bcf_int8_vector_end,  *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
5202
0
        case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
5203
0
        case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
5204
0
        case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set_vector_end(*tmp), bcf_float_set(tmp, p), float); break;
5205
0
        default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, fmt->type, bcf_seqname_safe(hdr,line), line->pos+1); exit(1);
5206
0
    }
5207
0
    #undef BRANCH
5208
0
    return nsmpl*fmt->n;
5209
0
}
5210
5211
//error description structure definition
5212
typedef struct err_desc {
5213
    int  errorcode;
5214
    const char *description;
5215
}err_desc;
5216
5217
// error descriptions
5218
static const err_desc errdesc_bcf[] = {
5219
    { BCF_ERR_CTG_UNDEF, "Contig not defined in header"},
5220
    { BCF_ERR_TAG_UNDEF, "Tag not defined in header" },
5221
    { BCF_ERR_NCOLS, "Incorrect number of columns" },
5222
    { BCF_ERR_LIMITS, "Limits reached" },
5223
    { BCF_ERR_CHAR, "Invalid character" },
5224
    { BCF_ERR_CTG_INVALID, "Invalid contig" },
5225
    { BCF_ERR_TAG_INVALID, "Invalid tag" },
5226
};
5227
5228
/// append given description to buffer based on available size and add ... when not enough space
5229
    /** @param buffer       buffer to which description to be appended
5230
        @param offset       offset at which to be appended
5231
        @param maxbuffer    maximum size of the buffer
5232
        @param description  the description to be appended
5233
on failure returns -1 - when buffer is not big enough; returns -1 on invalid params and on too small buffer which are improbable due to validation at caller site
5234
on success returns 0
5235
    */
5236
0
static int add_desc_to_buffer(char *buffer, size_t *offset, size_t maxbuffer, const char *description) {
5237
5238
0
    if (!description || !buffer || !offset || (maxbuffer < 4))
5239
0
        return -1;
5240
5241
0
    size_t rembuffer = maxbuffer - *offset;
5242
0
    if (rembuffer > (strlen(description) + (rembuffer == maxbuffer ? 0 : 1))) {    //add description with optionally required ','
5243
0
        *offset += snprintf(buffer + *offset, rembuffer, "%s%s", (rembuffer == maxbuffer)? "": ",", description);
5244
0
    } else {    //not enough space for description, put ...
5245
0
        size_t tmppos = (rembuffer <= 4) ? maxbuffer - 4 : *offset;
5246
0
        snprintf(buffer + tmppos, 4, "...");    //ignore offset update
5247
0
        return -1;
5248
0
    }
5249
0
    return 0;
5250
0
}
5251
5252
//get description for given error code. return NULL on error
5253
0
const char *bcf_strerror(int errorcode, char *buffer, size_t maxbuffer) {
5254
0
    size_t usedup = 0;
5255
0
    int ret = 0;
5256
0
    int idx;
5257
5258
0
    if (!buffer || maxbuffer < 4)
5259
0
        return NULL;           //invalid / insufficient buffer
5260
5261
0
    if (!errorcode) {
5262
0
        buffer[0] = '\0';      //no error, set null
5263
0
        return buffer;
5264
0
    }
5265
5266
0
    for (idx = 0; idx < sizeof(errdesc_bcf) / sizeof(err_desc); ++idx) {
5267
0
        if (errorcode & errdesc_bcf[idx].errorcode) {    //error is set, add description
5268
0
            ret = add_desc_to_buffer(buffer, &usedup, maxbuffer, errdesc_bcf[idx].description);
5269
0
            if (ret < 0)
5270
0
                break;         //not enough space, ... added, no need to continue
5271
5272
0
            errorcode &= ~errdesc_bcf[idx].errorcode;    //reset the error
5273
0
        }
5274
0
    }
5275
5276
0
    if (errorcode && (ret >= 0))  {     //undescribed error is present in error code and had enough buffer, try to add unkonwn error as well§
5277
0
        add_desc_to_buffer(buffer, &usedup, maxbuffer, "Unknown error");
5278
0
    }
5279
0
    return buffer;
5280
0
}
5281