Line | Count | Source |
1 | | /* vcf.c -- VCF/BCF API functions. |
2 | | |
3 | | Copyright (C) 2012, 2013 Broad Institute. |
4 | | Copyright (C) 2012-2025 Genome Research Ltd. |
5 | | Portions copyright (C) 2014 Intel Corporation. |
6 | | |
7 | | Author: Heng Li <lh3@sanger.ac.uk> |
8 | | |
9 | | Permission is hereby granted, free of charge, to any person obtaining a copy |
10 | | of this software and associated documentation files (the "Software"), to deal |
11 | | in the Software without restriction, including without limitation the rights |
12 | | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
13 | | copies of the Software, and to permit persons to whom the Software is |
14 | | furnished to do so, subject to the following conditions: |
15 | | |
16 | | The above copyright notice and this permission notice shall be included in |
17 | | all copies or substantial portions of the Software. |
18 | | |
19 | | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
20 | | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
21 | | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
22 | | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
23 | | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
24 | | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
25 | | DEALINGS IN THE SOFTWARE. */ |
26 | | |
27 | | #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h |
28 | | #include <config.h> |
29 | | |
30 | | #include <stdio.h> |
31 | | #include <assert.h> |
32 | | #include <string.h> |
33 | | #include <strings.h> |
34 | | #include <stdlib.h> |
35 | | #include <limits.h> |
36 | | #include <stdint.h> |
37 | | #include <inttypes.h> |
38 | | #include <errno.h> |
39 | | |
40 | | #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
41 | | #include "fuzz_settings.h" |
42 | | #endif |
43 | | |
44 | | #include "htslib/vcf.h" |
45 | | #include "htslib/bgzf.h" |
46 | | #include "htslib/tbx.h" |
47 | | #include "htslib/hfile.h" |
48 | | #include "hts_internal.h" |
49 | | #include "htslib/hts_endian.h" |
50 | | #include "htslib/khash_str2int.h" |
51 | | #include "htslib/kstring.h" |
52 | | #include "htslib/sam.h" |
53 | | #include "htslib/khash.h" |
54 | | #include "bgzf_internal.h" |
55 | | |
56 | | #if 0 |
57 | | // This helps on Intel a bit, often 6-7% faster VCF parsing. |
58 | | // Conversely sometimes harms AMD Zen4 as ~9% slower. |
59 | | // Possibly related to IPC differences. However for now it's just a |
60 | | // curiosity we ignore and stick with the simpler code. |
61 | | // |
62 | | // Left here as a hint for future explorers. |
// String equality test: returns 1 when a and b hold the same characters
// up to and including the terminating NUL, 0 otherwise.
static inline int xstreq(const char *a, const char *b) {
    for (; *a != '\0'; a++, b++) {
        if (*a != *b)
            return 0;
    }
    // a is exhausted; the strings match only if b ends here too
    return *b == '\0';
}
68 | | |
69 | | #define KHASH_MAP_INIT_XSTR(name, khval_t) \ |
70 | | KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, xstreq) |
71 | | |
72 | | KHASH_MAP_INIT_XSTR(vdict, bcf_idinfo_t) |
73 | | #else |
74 | | KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t) |
75 | | #endif |
76 | | |
77 | | typedef khash_t(vdict) vdict_t; |
78 | | |
79 | | KHASH_MAP_INIT_STR(hdict, bcf_hrec_t*) |
80 | | typedef khash_t(hdict) hdict_t; |
81 | | |
82 | | |
83 | | #include "htslib/kseq.h" |
// Exported 32-bit pattern marking a missing float value.
// NOTE(review): presumably reinterpreted as the bits of an IEEE-754 float
// by the accessors declared in htslib/vcf.h -- confirm against the header.
HTSLIB_EXPORT
uint32_t bcf_float_missing = 0x7F800001;

// Exported 32-bit pattern marking the end of a float vector.
// NOTE(review): same caveat as bcf_float_missing -- confirm usage in vcf.h.
HTSLIB_EXPORT
uint32_t bcf_float_vector_end = 0x7F800002;

// Exported 16-entry table of shift amounts.
// NOTE(review): looks like it is indexed by BCF type code and yields
// log2(element size in bytes) -- verify against the BCF_BT_* definitions.
HTSLIB_EXPORT
uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

// Template copied into freshly created dictionary entries (see
// bcf_hdr_add_sample_len).  id == -1 means "no index assigned yet", which
// bcf_hdr_set_idx later replaces with the next free slot.
static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, NULL, NULL}, .id = -1 };
94 | | |
95 | | /* |
96 | | Partial support for 64-bit POS and Number=1 INFO tags. |
97 | | Notes: |
98 | | - the support for 64-bit values is motivated by POS and INFO/END for large genomes |
99 | | - the use of 64-bit values does not conform to the specification |
100 | | - cannot output 64-bit BCF and if it does, it is not compatible with anything |
101 | | - experimental, use at your own risk |
102 | | */ |
103 | | #ifdef VCF_ALLOW_INT64 |
104 | | #define BCF_MAX_BT_INT64 (0x7fffffffffffffff) /* INT64_MAX, for internal use only */ |
105 | | #define BCF_MIN_BT_INT64 -9223372036854775800LL /* INT64_MIN + 8, for internal use only */ |
106 | | #endif |
107 | | |
108 | 440 | #define BCF_IS_64BIT (1<<30) |
109 | | |
110 | | |
// Opaque structure with auxiliary data which allows bcf_hdr_t to be extended
// without breaking the ABI.
// Note that preserving the API and ABI requires the first element to be a
// vdict_t struct rather than a pointer, as user programs may (and in some
// cases do) access the dictionary directly as (vdict_t*)hdr->dict.
// get_hdr_aux() relies on this layout: it casts hdr->dict[0] to
// bcf_hdr_aux_t*.
typedef struct
{
    vdict_t dict;   // bcf_hdr_t.dict[0] vdict_t dictionary which keeps bcf_idinfo_t for BCF_HL_FLT,BCF_HL_INFO,BCF_HL_FMT
    hdict_t *gen;   // hdict_t dictionary which keeps bcf_hrec_t* pointers for generic and structured fields
    size_t *key_len;// length of h->id[BCF_DT_ID] strings; freed and reset to NULL by bcf_hdr_sync()
    int version;    //cached version; 0 means "not cached" (see bcf_get_version)
    uint32_t ref_count;   // reference count, low bit indicates bcf_hdr_destroy() has been called;
                          // references are therefore added and dropped in steps of 2
}
bcf_hdr_aux_t;
124 | | |
125 | | static inline bcf_hdr_aux_t *get_hdr_aux(const bcf_hdr_t *hdr) |
126 | 297k | { |
127 | 297k | return (bcf_hdr_aux_t *)hdr->dict[0]; |
128 | 297k | } |
129 | | |
// Version macros.
// A version is packed into one int as major * 1000000 + minor * 1000,
// e.g. 4.2 -> 4002000.
#define VCF_DEF 4002000     /* 4.2, the fallback used by bcf_get_version */
#define VCF44   4004000     /* 4.4 */
#define VCF45   4005000     /* 4.5 */

// Unpack the major / minor component of a packed version
#define VCF_MAJOR_VER(x) ( (x) / 1000000 )
#define VCF_MINOR_VER(x) ( (x) % 1000000 / 1000 )
137 | | |
138 | | /** |
139 | | * bcf_get_version - get the version as int |
140 | | * @param hdr - bcf header, to get version |
141 | | * @param verstr- version string, which is already available |
142 | | * Returns version on success and default version on failure |
143 | | * version = major * 100 * 10000 + minor * 1000 |
144 | | */ |
145 | | static int bcf_get_version(const bcf_hdr_t *hdr, const char *verstr) |
146 | 23.9k | { |
147 | 23.9k | const char *version = NULL, vcf[] = "VCFv"; |
148 | 23.9k | char *major = NULL, *minor = NULL; |
149 | 23.9k | int ver = -1; |
150 | 23.9k | long tmp = 0; |
151 | 23.9k | bcf_hdr_aux_t *aux = NULL; |
152 | | |
153 | 23.9k | if (!hdr && !verstr) { //invalid input |
154 | 0 | goto fail; |
155 | 0 | } |
156 | | |
157 | 23.9k | if (hdr) { |
158 | 20.5k | if ((aux = get_hdr_aux(hdr)) && aux->version != 0) { //use cached version |
159 | 20.2k | return aux->version; |
160 | 20.2k | } |
161 | | //get from header |
162 | 337 | version = bcf_hdr_get_version(hdr); |
163 | 3.34k | } else { |
164 | | //get from version string |
165 | 3.34k | version = verstr; |
166 | 3.34k | } |
167 | 3.67k | if (!(major = strstr(version, vcf))) { //bad format |
168 | 2.50k | goto fail; |
169 | 2.50k | } |
170 | 1.17k | major += sizeof(vcf) - 1; |
171 | 1.17k | if (!(minor = strchr(major, '.'))) { //bad format |
172 | 284 | goto fail; |
173 | 284 | } |
174 | 891 | tmp = strtol(major, NULL, 10); |
175 | 891 | if ((!tmp && errno == EINVAL) || |
176 | 726 | ((tmp == LONG_MIN || tmp == LONG_MAX) && errno == ERANGE)) { //failed |
177 | 167 | goto fail; |
178 | 167 | } |
179 | 724 | ver = tmp * 100 * 10000; |
180 | 724 | tmp = strtol(++minor, NULL, 10); |
181 | 724 | if ((!tmp && errno == EINVAL) || |
182 | 691 | ((tmp == LONG_MIN || tmp == LONG_MAX) && errno == ERANGE)) { //failed |
183 | 112 | goto fail; |
184 | 112 | } |
185 | 612 | ver += tmp * 1000; |
186 | 612 | return ver; |
187 | | |
188 | 3.06k | fail: |
189 | 3.06k | hts_log_warning("Couldn't get VCF version, considering as %d.%d", |
190 | 3.06k | VCF_MAJOR_VER(VCF_DEF), VCF_MINOR_VER(VCF_DEF)); |
191 | 3.06k | return VCF_DEF; |
192 | 724 | } |
193 | | |
194 | | // Header reference counting |
195 | | |
196 | | static void bcf_hdr_incr_ref(bcf_hdr_t *h) |
197 | 1.29k | { |
198 | 1.29k | bcf_hdr_aux_t *aux = get_hdr_aux(h); |
199 | 1.29k | aux->ref_count += 2; |
200 | 1.29k | } |
201 | | |
202 | | static void bcf_hdr_decr_ref(bcf_hdr_t *h) |
203 | 1.29k | { |
204 | 1.29k | bcf_hdr_aux_t *aux = get_hdr_aux(h); |
205 | 1.29k | if (aux->ref_count >= 2) |
206 | 1.29k | aux->ref_count -= 2; |
207 | | |
208 | 1.29k | if (aux->ref_count == 0) |
209 | 1.20k | bcf_hdr_destroy(h); |
210 | 1.29k | } |
211 | | |
212 | | static void hdr_bgzf_private_data_cleanup(void *data) |
213 | 1.29k | { |
214 | 1.29k | bcf_hdr_t *h = (bcf_hdr_t *) data; |
215 | 1.29k | bcf_hdr_decr_ref(h); |
216 | 1.29k | } |
217 | | |
// Locate the "#CHROM\t..." column header line inside header text s.
// Returns a pointer to the '#' of that line (either the very start of s or
// just after a newline), or NULL when no such line exists.
static char *find_chrom_header_line(char *s)
{
    static const char marker[] = "\n#CHROM\t";
    const size_t bare_len = sizeof(marker) - 2;   // marker without '\n' and NUL

    // The text may begin directly with the #CHROM line
    if (strncmp(s, marker + 1, bare_len) == 0)
        return s;

    char *hit = strstr(s, marker);
    return hit ? hit + 1 : NULL;
}
225 | | |
226 | | static int64_t get_rlen(const bcf_hdr_t *h, bcf1_t *v); |
227 | | |
228 | | /************************* |
229 | | *** VCF header parser *** |
230 | | *************************/ |
231 | | |
232 | | static int bcf_hdr_add_sample_len(bcf_hdr_t *h, const char *s, size_t len) |
233 | 4.64k | { |
234 | 4.64k | const char *ss = s; |
235 | 4.79k | while ( *ss && isspace_c(*ss) && ss - s < len) ss++; |
236 | 4.64k | if ( !*ss || ss - s == len) |
237 | 2 | { |
238 | 2 | hts_log_error("Empty sample name: trailing spaces/tabs in the header line?"); |
239 | 2 | return -1; |
240 | 2 | } |
241 | | |
242 | 4.64k | vdict_t *d = (vdict_t*)h->dict[BCF_DT_SAMPLE]; |
243 | 4.64k | int ret; |
244 | 4.64k | char *sdup = malloc(len + 1); |
245 | 4.64k | if (!sdup) return -1; |
246 | 4.64k | memcpy(sdup, s, len); |
247 | 4.64k | sdup[len] = 0; |
248 | | |
249 | | // Ensure space is available in h->samples |
250 | 4.64k | size_t n = kh_size(d); |
251 | 4.64k | char **new_samples = realloc(h->samples, sizeof(char*) * (n + 1)); |
252 | 4.64k | if (!new_samples) { |
253 | 0 | free(sdup); |
254 | 0 | return -1; |
255 | 0 | } |
256 | 4.64k | h->samples = new_samples; |
257 | | |
258 | 4.64k | int k = kh_put(vdict, d, sdup, &ret); |
259 | 4.64k | if (ret < 0) { |
260 | 0 | free(sdup); |
261 | 0 | return -1; |
262 | 0 | } |
263 | 4.64k | if (ret) { // absent |
264 | 4.64k | kh_val(d, k) = bcf_idinfo_def; |
265 | 4.64k | kh_val(d, k).id = n; |
266 | 4.64k | } else { |
267 | 0 | hts_log_error("Duplicated sample name '%s'", sdup); |
268 | 0 | free(sdup); |
269 | 0 | return -1; |
270 | 0 | } |
271 | 4.64k | h->samples[n] = sdup; |
272 | 4.64k | h->dirty = 1; |
273 | 4.64k | return 0; |
274 | 4.64k | } |
275 | | |
276 | | int bcf_hdr_add_sample(bcf_hdr_t *h, const char *s) |
277 | 0 | { |
278 | 0 | if (!s) { |
279 | | // Allowed for backwards-compatibility, calling with s == NULL |
280 | | // used to trigger bcf_hdr_sync(h); |
281 | 0 | return 0; |
282 | 0 | } |
283 | 0 | return bcf_hdr_add_sample_len(h, s, strlen(s)); |
284 | 0 | } |
285 | | |
286 | | int HTS_RESULT_USED bcf_hdr_parse_sample_line(bcf_hdr_t *hdr, const char *str) |
287 | 2.28k | { |
288 | 2.28k | const char *mandatory = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"; |
289 | 2.28k | if ( strncmp(str,mandatory,strlen(mandatory)) ) |
290 | 26 | { |
291 | 26 | hts_log_error("Could not parse the \"#CHROM..\" line, either the fields are incorrect or spaces are present instead of tabs:\n\t%s",str); |
292 | 26 | return -1; |
293 | 26 | } |
294 | | |
295 | 2.25k | const char *beg = str + strlen(mandatory), *end; |
296 | 2.25k | if ( !*beg || *beg=='\n' ) return 0; |
297 | 602 | if ( strncmp(beg,"\tFORMAT\t",8) ) |
298 | 4 | { |
299 | 4 | hts_log_error("Could not parse the \"#CHROM..\" line, either FORMAT is missing or spaces are present instead of tabs:\n\t%s",str); |
300 | 4 | return -1; |
301 | 4 | } |
302 | 598 | beg += 8; |
303 | | |
304 | 598 | int ret = 0; |
305 | 4.65k | while ( *beg ) |
306 | 4.64k | { |
307 | 4.64k | end = beg; |
308 | 464M | while ( *end && *end!='\t' && *end!='\n' ) end++; |
309 | 4.64k | if ( bcf_hdr_add_sample_len(hdr, beg, end-beg) < 0 ) ret = -1; |
310 | 4.64k | if ( !*end || *end=='\n' || ret<0 ) break; |
311 | 4.05k | beg = end + 1; |
312 | 4.05k | } |
313 | 598 | return ret; |
314 | 602 | } |
315 | | |
316 | | int bcf_hdr_sync(bcf_hdr_t *h) |
317 | 64.5k | { |
318 | 64.5k | int i; |
319 | 258k | for (i = 0; i < 3; i++) |
320 | 193k | { |
321 | 193k | vdict_t *d = (vdict_t*)h->dict[i]; |
322 | 193k | khint_t k; |
323 | 193k | if ( h->n[i] < kh_size(d) ) |
324 | 594 | { |
325 | 594 | bcf_idpair_t *new_idpair; |
326 | | // this should be true only for i=2, BCF_DT_SAMPLE |
327 | 594 | new_idpair = (bcf_idpair_t*) realloc(h->id[i], kh_size(d)*sizeof(bcf_idpair_t)); |
328 | 594 | if (!new_idpair) return -1; |
329 | 594 | h->n[i] = kh_size(d); |
330 | 594 | h->id[i] = new_idpair; |
331 | 594 | } |
332 | 2.24G | for (k=kh_begin(d); k<kh_end(d); k++) |
333 | 2.24G | { |
334 | 2.24G | if (!kh_exist(d,k)) continue; |
335 | 13.8M | h->id[i][kh_val(d,k).id].key = kh_key(d,k); |
336 | 13.8M | h->id[i][kh_val(d,k).id].val = &kh_val(d,k); |
337 | 13.8M | } |
338 | 193k | } |
339 | | |
340 | | // Invalidate key length cache |
341 | 64.5k | bcf_hdr_aux_t *aux = get_hdr_aux(h); |
342 | 64.5k | if (aux && aux->key_len) { |
343 | 2.87k | free(aux->key_len); |
344 | 2.87k | aux->key_len = NULL; |
345 | 2.87k | } |
346 | | |
347 | 64.5k | h->dirty = 0; |
348 | 64.5k | return 0; |
349 | 64.5k | } |
350 | | |
351 | | void bcf_hrec_destroy(bcf_hrec_t *hrec) |
352 | 173k | { |
353 | 173k | if (!hrec) return; |
354 | 164k | free(hrec->key); |
355 | 164k | if ( hrec->value ) free(hrec->value); |
356 | 164k | int i; |
357 | 493k | for (i=0; i<hrec->nkeys; i++) |
358 | 328k | { |
359 | 328k | free(hrec->keys[i]); |
360 | 328k | free(hrec->vals[i]); |
361 | 328k | } |
362 | 164k | free(hrec->keys); |
363 | 164k | free(hrec->vals); |
364 | 164k | free(hrec); |
365 | 164k | } |
366 | | |
367 | | // Copies all fields except IDX. |
368 | | bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec) |
369 | 0 | { |
370 | 0 | int save_errno; |
371 | 0 | bcf_hrec_t *out = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t)); |
372 | 0 | if (!out) return NULL; |
373 | | |
374 | 0 | out->type = hrec->type; |
375 | 0 | if ( hrec->key ) { |
376 | 0 | out->key = strdup(hrec->key); |
377 | 0 | if (!out->key) goto fail; |
378 | 0 | } |
379 | 0 | if ( hrec->value ) { |
380 | 0 | out->value = strdup(hrec->value); |
381 | 0 | if (!out->value) goto fail; |
382 | 0 | } |
383 | 0 | out->nkeys = hrec->nkeys; |
384 | 0 | out->keys = (char**) malloc(sizeof(char*)*hrec->nkeys); |
385 | 0 | if (!out->keys) goto fail; |
386 | 0 | out->vals = (char**) malloc(sizeof(char*)*hrec->nkeys); |
387 | 0 | if (!out->vals) goto fail; |
388 | 0 | int i, j = 0; |
389 | 0 | for (i=0; i<hrec->nkeys; i++) |
390 | 0 | { |
391 | 0 | if ( hrec->keys[i] && !strcmp("IDX",hrec->keys[i]) ) continue; |
392 | 0 | if ( hrec->keys[i] ) { |
393 | 0 | out->keys[j] = strdup(hrec->keys[i]); |
394 | 0 | if (!out->keys[j]) goto fail; |
395 | 0 | } |
396 | 0 | if ( hrec->vals[i] ) { |
397 | 0 | out->vals[j] = strdup(hrec->vals[i]); |
398 | 0 | if (!out->vals[j]) goto fail; |
399 | 0 | } |
400 | 0 | j++; |
401 | 0 | } |
402 | 0 | if ( i!=j ) out->nkeys -= i-j; // IDX was omitted |
403 | 0 | return out; |
404 | | |
405 | 0 | fail: |
406 | 0 | save_errno = errno; |
407 | 0 | hts_log_error("%s", strerror(errno)); |
408 | 0 | bcf_hrec_destroy(out); |
409 | 0 | errno = save_errno; |
410 | 0 | return NULL; |
411 | 0 | } |
412 | | |
413 | | void bcf_hrec_debug(FILE *fp, bcf_hrec_t *hrec) |
414 | 0 | { |
415 | 0 | fprintf(fp, "key=[%s] value=[%s]", hrec->key, hrec->value?hrec->value:""); |
416 | 0 | int i; |
417 | 0 | for (i=0; i<hrec->nkeys; i++) |
418 | 0 | fprintf(fp, "\t[%s]=[%s]", hrec->keys[i],hrec->vals[i]); |
419 | 0 | fprintf(fp, "\n"); |
420 | 0 | } |
421 | | |
422 | | void bcf_header_debug(bcf_hdr_t *hdr) |
423 | 0 | { |
424 | 0 | int i, j; |
425 | 0 | for (i=0; i<hdr->nhrec; i++) |
426 | 0 | { |
427 | 0 | if ( !hdr->hrec[i]->value ) |
428 | 0 | { |
429 | 0 | fprintf(stderr, "##%s=<", hdr->hrec[i]->key); |
430 | 0 | fprintf(stderr,"%s=%s", hdr->hrec[i]->keys[0], hdr->hrec[i]->vals[0]); |
431 | 0 | for (j=1; j<hdr->hrec[i]->nkeys; j++) |
432 | 0 | fprintf(stderr,",%s=%s", hdr->hrec[i]->keys[j], hdr->hrec[i]->vals[j]); |
433 | 0 | fprintf(stderr,">\n"); |
434 | 0 | } |
435 | 0 | else |
436 | 0 | fprintf(stderr,"##%s=%s\n", hdr->hrec[i]->key,hdr->hrec[i]->value); |
437 | 0 | } |
438 | 0 | } |
439 | | |
440 | | int bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, size_t len) |
441 | 261k | { |
442 | 261k | char **tmp; |
443 | 261k | size_t n = hrec->nkeys + 1; |
444 | 261k | assert(len > 0 && len < SIZE_MAX); |
445 | 261k | tmp = realloc(hrec->keys, sizeof(char*)*n); |
446 | 261k | if (!tmp) return -1; |
447 | 261k | hrec->keys = tmp; |
448 | 261k | tmp = realloc(hrec->vals, sizeof(char*)*n); |
449 | 261k | if (!tmp) return -1; |
450 | 261k | hrec->vals = tmp; |
451 | | |
452 | 261k | hrec->keys[hrec->nkeys] = (char*) malloc((len+1)*sizeof(char)); |
453 | 261k | if (!hrec->keys[hrec->nkeys]) return -1; |
454 | 261k | memcpy(hrec->keys[hrec->nkeys],str,len); |
455 | 261k | hrec->keys[hrec->nkeys][len] = 0; |
456 | 261k | hrec->vals[hrec->nkeys] = NULL; |
457 | 261k | hrec->nkeys = n; |
458 | 261k | return 0; |
459 | 261k | } |
460 | | |
461 | | int bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, size_t len, int is_quoted) |
462 | 261k | { |
463 | 261k | if ( hrec->vals[i] ) { |
464 | 0 | free(hrec->vals[i]); |
465 | 0 | hrec->vals[i] = NULL; |
466 | 0 | } |
467 | 261k | if ( !str ) return 0; |
468 | 261k | if ( is_quoted ) |
469 | 66.7k | { |
470 | 66.7k | if (len >= SIZE_MAX - 3) { |
471 | 0 | errno = ENOMEM; |
472 | 0 | return -1; |
473 | 0 | } |
474 | 66.7k | hrec->vals[i] = (char*) malloc((len+3)*sizeof(char)); |
475 | 66.7k | if (!hrec->vals[i]) return -1; |
476 | 66.7k | hrec->vals[i][0] = '"'; |
477 | 66.7k | memcpy(&hrec->vals[i][1],str,len); |
478 | 66.7k | hrec->vals[i][len+1] = '"'; |
479 | 66.7k | hrec->vals[i][len+2] = 0; |
480 | 66.7k | } |
481 | 194k | else |
482 | 194k | { |
483 | 194k | if (len == SIZE_MAX) { |
484 | 0 | errno = ENOMEM; |
485 | 0 | return -1; |
486 | 0 | } |
487 | 194k | hrec->vals[i] = (char*) malloc((len+1)*sizeof(char)); |
488 | 194k | if (!hrec->vals[i]) return -1; |
489 | 194k | memcpy(hrec->vals[i],str,len); |
490 | 194k | hrec->vals[i][len] = 0; |
491 | 194k | } |
492 | 261k | return 0; |
493 | 261k | } |
494 | | |
495 | | int hrec_add_idx(bcf_hrec_t *hrec, int idx) |
496 | 67.0k | { |
497 | 67.0k | int n = hrec->nkeys + 1; |
498 | 67.0k | char **tmp = (char**) realloc(hrec->keys, sizeof(char*)*n); |
499 | 67.0k | if (!tmp) return -1; |
500 | 67.0k | hrec->keys = tmp; |
501 | | |
502 | 67.0k | tmp = (char**) realloc(hrec->vals, sizeof(char*)*n); |
503 | 67.0k | if (!tmp) return -1; |
504 | 67.0k | hrec->vals = tmp; |
505 | | |
506 | 67.0k | hrec->keys[hrec->nkeys] = strdup("IDX"); |
507 | 67.0k | if (!hrec->keys[hrec->nkeys]) return -1; |
508 | | |
509 | 67.0k | kstring_t str = {0,0,0}; |
510 | 67.0k | if (kputw(idx, &str) < 0) { |
511 | 0 | free(hrec->keys[hrec->nkeys]); |
512 | 0 | return -1; |
513 | 0 | } |
514 | 67.0k | hrec->vals[hrec->nkeys] = str.s; |
515 | 67.0k | hrec->nkeys = n; |
516 | 67.0k | return 0; |
517 | 67.0k | } |
518 | | |
519 | | int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key) |
520 | 117k | { |
521 | 117k | int i; |
522 | 190k | for (i=0; i<hrec->nkeys; i++) |
523 | 139k | if ( !strcasecmp(key,hrec->keys[i]) ) return i; |
524 | 51.5k | return -1; |
525 | 117k | } |
526 | | |
527 | | static void bcf_hrec_set_type(bcf_hrec_t *hrec) |
528 | 302k | { |
529 | 302k | if ( !strcmp(hrec->key, "contig") ) hrec->type = BCF_HL_CTG; |
530 | 277k | else if ( !strcmp(hrec->key, "INFO") ) hrec->type = BCF_HL_INFO; |
531 | 176k | else if ( !strcmp(hrec->key, "FILTER") ) hrec->type = BCF_HL_FLT; |
532 | 114k | else if ( !strcmp(hrec->key, "FORMAT") ) hrec->type = BCF_HL_FMT; |
533 | 84.4k | else if ( hrec->nkeys>0 ) hrec->type = BCF_HL_STR; |
534 | 63.0k | else hrec->type = BCF_HL_GEN; |
535 | 302k | } |
536 | | |
537 | | |
538 | | /** |
539 | | The arrays were generated with |
540 | | |
541 | | valid_ctg: |
542 | | perl -le '@v = (split(//,q[!#$%&*+./:;=?@^_|~-]),"a"..."z","A"..."Z","0"..."9"); @a = (0) x 256; foreach $c (@v) { $a[ord($c)] = 1; } print join(", ",@a)' | fold -w 48 |
543 | | |
544 | | valid_tag: |
545 | | perl -le '@v = (split(//,q[_.]),"a"..."z","A"..."Z","0"..."9"); @a = (0) x 256; foreach $c (@v) { $a[ord($c)] = 1; } print join(", ",@a)' | fold -w 48 |
546 | | */ |
// Lookup table indexed by byte value: 1 for characters bcf_hrec_check()
// accepts inside a contig ID, 0 otherwise.  Each row covers 16 byte values;
// the trailing comment gives the value of the row's first entry.
static const uint8_t valid_ctg[256] =
{
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 0x00
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 0x10
    0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1,   // 0x20
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,   // 0x30
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   // 0x40
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,   // 0x50
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   // 0x60
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,   // 0x70
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 0x80
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 0x90
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 0xA0
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 0xB0
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 0xC0
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 0xD0
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 0xE0
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0    // 0xF0
};
// Lookup table indexed by byte value: 1 for characters bcf_hrec_check()
// accepts inside an INFO/FORMAT tag ID (letters, digits, '_' and '.'),
// 0 otherwise.  Each row covers 16 byte values; the trailing comment gives
// the value of the row's first entry.
static const uint8_t valid_tag[256] =
{
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 0x00
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 0x10
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,   // 0x20
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,   // 0x30
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   // 0x40
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,   // 0x50
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   // 0x60
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,   // 0x70
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 0x80
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 0x90
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 0xA0
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 0xB0
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 0xC0
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 0xD0
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 0xE0
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0    // 0xF0
};
585 | | |
586 | | /** |
587 | | bcf_hrec_check() - check the validity of structured header lines |
588 | | |
589 | | Returns 0 on success or negative value on error. |
590 | | |
591 | | Currently the return status is not checked by the caller |
592 | | and only a warning is printed on stderr. This should be improved |
593 | | to propagate the error all the way up to the caller and let it |
594 | | decide what to do: throw an error or proceed anyway. |
595 | | */ |
596 | | static int bcf_hrec_check(bcf_hrec_t *hrec) |
597 | 151k | { |
598 | 151k | int i; |
599 | 151k | bcf_hrec_set_type(hrec); |
600 | | |
601 | 151k | if ( hrec->type==BCF_HL_CTG ) |
602 | 12.2k | { |
603 | 12.2k | i = bcf_hrec_find_key(hrec,"ID"); |
604 | 12.2k | if ( i<0 ) goto err_missing_id; |
605 | 6.15k | char *val = hrec->vals[i]; |
606 | 6.15k | if ( val[0]=='*' || val[0]=='=' || !valid_ctg[(uint8_t)val[0]] ) goto err_invalid_ctg; |
607 | 62.4k | while ( *(++val) ) |
608 | 61.7k | if ( !valid_ctg[(uint8_t)*val] ) goto err_invalid_ctg; |
609 | 702 | return 0; |
610 | 1.18k | } |
611 | 138k | if ( hrec->type==BCF_HL_INFO ) |
612 | 50.4k | { |
613 | 50.4k | i = bcf_hrec_find_key(hrec,"ID"); |
614 | 50.4k | if ( i<0 ) goto err_missing_id; |
615 | 32.9k | char *val = hrec->vals[i]; |
616 | 32.9k | if ( !strcmp(val,"1000G") ) return 0; |
617 | 32.9k | if ( val[0]=='.' || (val[0]>='0' && val[0]<='9') || !valid_tag[(uint8_t)val[0]] ) goto err_invalid_tag; |
618 | 14.8k | while ( *(++val) ) |
619 | 12.0k | if ( !valid_tag[(uint8_t)*val] ) goto err_invalid_tag; |
620 | 2.84k | return 0; |
621 | 3.88k | } |
622 | 88.4k | if ( hrec->type==BCF_HL_FMT ) |
623 | 14.9k | { |
624 | 14.9k | i = bcf_hrec_find_key(hrec,"ID"); |
625 | 14.9k | if ( i<0 ) goto err_missing_id; |
626 | 10.1k | char *val = hrec->vals[i]; |
627 | 10.1k | if ( val[0]=='.' || (val[0]>='0' && val[0]<='9') || !valid_tag[(uint8_t)val[0]] ) goto err_invalid_tag; |
628 | 17.4k | while ( *(++val) ) |
629 | 15.0k | if ( !valid_tag[(uint8_t)*val] ) goto err_invalid_tag; |
630 | 2.39k | return 0; |
631 | 7.47k | } |
632 | 73.5k | return 0; |
633 | | |
634 | 28.3k | err_missing_id: |
635 | 28.3k | hts_log_warning("Missing ID attribute in one or more header lines"); |
636 | 28.3k | return -1; |
637 | | |
638 | 5.45k | err_invalid_ctg: |
639 | 5.45k | hts_log_warning("Invalid contig name: \"%s\"", hrec->vals[i]); |
640 | 5.45k | return -1; |
641 | | |
642 | 37.7k | err_invalid_tag: |
643 | 37.7k | hts_log_warning("Invalid tag name: \"%s\"", hrec->vals[i]); |
644 | 37.7k | return -1; |
645 | 88.4k | } |
646 | | |
// Tells whether the character at str is escaped, i.e. preceded by an odd
// number of consecutive backslashes.  min is the lowest address that may be
// inspected; bytes before it are never read and no pointer below min is
// ever formed.
// Returns 1 if escaped, 0 otherwise.
static inline int is_escaped(const char *min, const char *str)
{
    int odd = 0;
    while ( str > min && str[-1]=='\\' )
    {
        str--;
        odd = !odd;     // only the parity of the run matters
    }
    return odd;
}
653 | | |
/*
 * Parse a single "##" header line into a newly allocated bcf_hrec_t.
 *
 * Two forms are recognised:
 *   generic     ##key=value
 *   structured  ##key=<k1=v1,k2="quoted",k3=[bracketed],...>
 *
 * @param h     header; not referenced anywhere in this function body
 * @param line  text to parse; parsing stops at the first '\n' or NUL
 * @param len   output, set as follows:
 *                0   line does not start with "##" (NULL returned)
 *               -1   allocation failure or a missing ']' (NULL returned)
 *               >0   number of bytes consumed, including the newline if one
 *                    terminated the line; set both on success and when the
 *                    line was malformed (NULL returned in the latter case)
 *
 * Returns the record (release with bcf_hrec_destroy()) or NULL.  The record
 * type is left at -1 here; it is assigned later by bcf_hrec_set_type().
 */
bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len)
{
    bcf_hrec_t *hrec = NULL;
    const char *p = line;
    if (p[0] != '#' || p[1] != '#') { *len = 0; return NULL; }
    p += 2;

    // The record key is everything between "##" and the first '='
    const char *q = p;
    while ( *q && *q!='=' && *q != '\n' ) q++;
    ptrdiff_t n = q-p;
    if ( *q!='=' || !n )    // wrong format
        goto malformed_line;

    hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
    if (!hrec) { *len = -1; return NULL; }
    hrec->key = (char*) malloc(sizeof(char)*(n+1));
    if (!hrec->key) goto fail;
    memcpy(hrec->key,p,n);
    hrec->key[n] = 0;
    hrec->type = -1;

    p = ++q;
    if ( *p!='<' )   // generic field, e.g. ##samtoolsVersion=0.1.18-r579
    {
        // The value is the rest of the line, copied verbatim
        while ( *q && *q!='\n' ) q++;
        hrec->value = (char*) malloc((q-p+1)*sizeof(char));
        if (!hrec->value) goto fail;
        memcpy(hrec->value, p, q-p);
        hrec->value[q-p] = 0;
        *len = q - line + (*q ? 1 : 0); // Skip \n but not \0
        return hrec;
    }

    // structured line, e.g.
    // ##INFO=<ID=PV1,Number=1,Type=Float,Description="P-value for baseQ bias">
    // ##PEDIGREE=<Name_0=G0-ID,Name_1=G1-ID,Name_3=GN-ID>
    //
    // nopen tracks the nesting depth of '<' ... '>'.  Each iteration parses
    // one key=value pair; on entry q sits on the '<' or the separator that
    // precedes the pair.
    int nopen = 1;
    while ( *q && *q!='\n' && nopen>0 )
    {
        p = ++q;
        while ( *q && *q==' ' ) { p++; q++; }   // skip spaces before the key
        // ^[A-Za-z_][0-9A-Za-z_.]*$
        if (p==q && *q && (isalpha_c(*q) || *q=='_'))
        {
            q++;
            while ( *q && (isalnum_c(*q) || *q=='_' || *q=='.') ) q++;
        }
        n = q-p;
        int m = 0;      // number of spaces between the key and '='
        while ( *q && *q==' ' ) { q++; m++; }
        if ( *q!='=' || !n )
            goto malformed_line;

        // q-p-m is the key length with the trailing spaces removed
        if (bcf_hrec_add_key(hrec, p, q-p-m) < 0) goto fail;
        p = ++q;
        while ( *q && *q==' ' ) { p++; q++; }   // skip spaces before the value

        // A value may be "double quoted" (the quotes are stripped here and
        // re-added by bcf_hrec_set_val) or [bracketed] (brackets are kept as
        // part of the value, hence p is not advanced for '[').
        int quoted = 0;
        char ending = '\0';
        switch (*p) {
        case '"':
            quoted = 1;
            ending = '"';
            p++;
            break;
        case '[':
            quoted = 1;
            ending = ']';
            break;
        }
        if ( quoted ) q++;
        // Find the end of the value: the unescaped closing delimiter for
        // quoted values; otherwise a top-level ',' or the '>' that closes
        // the outermost bracket.
        while ( *q && *q != '\n' )
        {
            if ( quoted ) { if ( *q==ending && !is_escaped(p,q) ) break; }
            else
            {
                if ( *q=='<' ) nopen++;
                if ( *q=='>' ) nopen--;
                if ( !nopen ) break;
                if ( *q==',' && nopen==1 ) break;
            }
            q++;
        }
        const char *r = q;      // one past the last character of the value
        if (quoted && ending == ']') {
            if (*q == ending) {
                // Keep the ']' in the value and store it without adding
                // double quotes
                r++;
                q++;
                quoted = 0;
            } else {
                char buffer[320];
                hts_log_error("Missing ']' in header line %s",
                              hts_strprint(buffer, sizeof(buffer), '"',
                                           line, q-line));
                goto fail;
            }
        }
        while ( r > p && r[-1] == ' ' ) r--;    // trim trailing spaces
        if (bcf_hrec_set_val(hrec, hrec->nkeys-1, p, r-p, quoted) < 0)
            goto fail;
        if ( quoted && *q==ending ) q++;        // step over the closing quote
        if ( *q=='>' )
        {
            if (nopen) nopen--; // this can happen with nested angle brackets <>
            q++;
        }
    }
    if ( nopen )
        hts_log_warning("Incomplete header line, trying to proceed anyway:\n\t[%s]\n\t[%d]",line,q[0]);

    // Skip to end of line
    int nonspace = 0;
    p = q;
    while ( *q && *q!='\n' ) { nonspace |= !isspace_c(*q); q++; }
    if (nonspace) {
        char buffer[320];
        hts_log_warning("Dropped trailing junk from header line '%s'",
                        hts_strprint(buffer, sizeof(buffer),
                                     '"', line, q - line));
    }

    *len = q - line + (*q ? 1 : 0);
    return hrec;

 fail:
    // Unrecoverable: allocation failure or unterminated '[' value
    *len = -1;
    bcf_hrec_destroy(hrec);
    return NULL;

 malformed_line:
    // Syntax error: report it, but still tell the caller how many bytes the
    // offending line occupies.  hrec may be NULL here, which
    // bcf_hrec_destroy() accepts.
    {
        char buffer[320];
        while ( *q && *q!='\n' ) q++;  // Ensure *len includes full line
        hts_log_error("Could not parse the header line: %s",
                      hts_strprint(buffer, sizeof(buffer),
                                   '"', line, q - line));
        *len = q - line + (*q ? 1 : 0);
        bcf_hrec_destroy(hrec);
        return NULL;
    }
}
795 | | |
796 | | static int bcf_hdr_set_idx(bcf_hdr_t *hdr, int dict_type, const char *tag, bcf_idinfo_t *idinfo) |
797 | 66.0k | { |
798 | 66.0k | size_t new_n; |
799 | | |
800 | | // If available, preserve existing IDX |
801 | 66.0k | if ( idinfo->id==-1 ) |
802 | 65.6k | idinfo->id = hdr->n[dict_type]; |
803 | 336 | else if ( idinfo->id < hdr->n[dict_type] && hdr->id[dict_type][idinfo->id].key ) |
804 | 4 | { |
805 | 4 | hts_log_error("Conflicting IDX=%d lines in the header dictionary, the new tag is %s", |
806 | 4 | idinfo->id, tag); |
807 | 4 | errno = EINVAL; |
808 | 4 | return -1; |
809 | 4 | } |
810 | | |
811 | 66.0k | new_n = idinfo->id >= hdr->n[dict_type] ? idinfo->id+1 : hdr->n[dict_type]; |
812 | 66.0k | #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
813 | | // hts_resize() can attempt to allocate up to 2 * requested items |
814 | 66.0k | if (new_n > FUZZ_ALLOC_LIMIT/(2 * sizeof(bcf_idpair_t))) |
815 | 4 | return -1; |
816 | 65.9k | #endif |
817 | 65.9k | if (hts_resize(bcf_idpair_t, new_n, &hdr->m[dict_type], |
818 | 65.9k | &hdr->id[dict_type], HTS_RESIZE_CLEAR)) { |
819 | 0 | return -1; |
820 | 0 | } |
821 | 65.9k | hdr->n[dict_type] = new_n; |
822 | | |
823 | | // NB: the next kh_put call can invalidate the idinfo pointer, therefore |
824 | | // we leave it unassigned here. It must be set explicitly in bcf_hdr_sync. |
825 | 65.9k | hdr->id[dict_type][idinfo->id].key = tag; |
826 | | |
827 | 65.9k | return 0; |
828 | 65.9k | } |
829 | | |
830 | | // returns: 1 when hdr needs to be synced, -1 on error, 0 otherwise |
831 | | static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) |
832 | 151k | { |
833 | | // contig |
834 | 151k | int i, ret, replacing = 0; |
835 | 151k | khint_t k; |
836 | 151k | char *str = NULL; |
837 | | |
838 | 151k | bcf_hrec_set_type(hrec); |
839 | | |
840 | 151k | if ( hrec->type==BCF_HL_CTG ) |
841 | 12.2k | { |
842 | 12.2k | hts_pos_t len = 0; |
843 | | |
844 | | // Get the contig ID ($str) and length ($j) |
845 | 12.2k | i = bcf_hrec_find_key(hrec,"length"); |
846 | 12.2k | if ( i<0 ) len = 0; |
847 | 2.56k | else { |
848 | 2.56k | char *end = hrec->vals[i]; |
849 | 2.56k | len = strtoll(hrec->vals[i], &end, 10); |
850 | 2.56k | if (end == hrec->vals[i] || len < 0) return 0; |
851 | 2.56k | } |
852 | | |
853 | 11.3k | i = bcf_hrec_find_key(hrec,"ID"); |
854 | 11.3k | if ( i<0 ) return 0; |
855 | 6.15k | str = strdup(hrec->vals[i]); |
856 | 6.15k | if (!str) return -1; |
857 | | |
858 | | // Register in the dictionary |
859 | 6.15k | vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_CTG]; |
860 | 6.15k | khint_t k = kh_get(vdict, d, str); |
861 | 6.15k | if ( k != kh_end(d) ) { // already present |
862 | 866 | free(str); str=NULL; |
863 | 866 | if (kh_val(d, k).hrec[0] != NULL) // and not removed |
864 | 866 | return 0; |
865 | 0 | replacing = 1; |
866 | 5.29k | } else { |
867 | 5.29k | k = kh_put(vdict, d, str, &ret); |
868 | 5.29k | if (ret < 0) { free(str); return -1; } |
869 | 5.29k | } |
870 | | |
871 | 5.29k | int idx = bcf_hrec_find_key(hrec,"IDX"); |
872 | 5.29k | if ( idx!=-1 ) |
873 | 2.52k | { |
874 | 2.52k | char *tmp = hrec->vals[idx]; |
875 | 2.52k | idx = strtol(hrec->vals[idx], &tmp, 10); |
876 | 2.52k | if ( *tmp || idx < 0 || idx >= INT_MAX - 1) |
877 | 2.50k | { |
878 | 2.50k | if (!replacing) { |
879 | 2.50k | kh_del(vdict, d, k); |
880 | 2.50k | free(str); |
881 | 2.50k | } |
882 | 2.50k | hts_log_warning("Error parsing the IDX tag, skipping"); |
883 | 2.50k | return 0; |
884 | 2.50k | } |
885 | 2.52k | } |
886 | | |
887 | 2.78k | kh_val(d, k) = bcf_idinfo_def; |
888 | 2.78k | kh_val(d, k).id = idx; |
889 | 2.78k | kh_val(d, k).info[0] = len; |
890 | 2.78k | kh_val(d, k).hrec[0] = hrec; |
891 | 2.78k | if (bcf_hdr_set_idx(hdr, BCF_DT_CTG, kh_key(d,k), &kh_val(d,k)) < 0) { |
892 | 4 | if (!replacing) { |
893 | 4 | kh_del(vdict, d, k); |
894 | 4 | free(str); |
895 | 4 | } |
896 | 4 | return -1; |
897 | 4 | } |
898 | 2.78k | if ( idx==-1 ) { |
899 | 2.77k | if (hrec_add_idx(hrec, kh_val(d,k).id) < 0) { |
900 | 0 | return -1; |
901 | 0 | } |
902 | 2.77k | } |
903 | | |
904 | 2.78k | return 1; |
905 | 2.78k | } |
906 | | |
907 | 138k | if ( hrec->type==BCF_HL_STR ) return 1; |
908 | 128k | if ( hrec->type!=BCF_HL_INFO && hrec->type!=BCF_HL_FLT && hrec->type!=BCF_HL_FMT ) return 0; |
909 | | |
910 | | // INFO/FILTER/FORMAT |
911 | 96.6k | char *id = NULL; |
912 | 96.6k | uint32_t type = UINT32_MAX, var = UINT32_MAX; |
913 | 96.6k | int num = -1, idx = -1; |
914 | 322k | for (i=0; i<hrec->nkeys; i++) |
915 | 226k | { |
916 | 226k | if ( !strcmp(hrec->keys[i], "ID") ) id = hrec->vals[i]; |
917 | 152k | else if ( !strcmp(hrec->keys[i], "IDX") ) |
918 | 2.71k | { |
919 | 2.71k | char *tmp = hrec->vals[i]; |
920 | 2.71k | idx = strtol(hrec->vals[i], &tmp, 10); |
921 | 2.71k | if ( *tmp || idx < 0 || idx >= INT_MAX - 1) |
922 | 666 | { |
923 | 666 | hts_log_warning("Error parsing the IDX tag, skipping"); |
924 | 666 | return 0; |
925 | 666 | } |
926 | 2.71k | } |
927 | 149k | else if ( !strcmp(hrec->keys[i], "Type") ) |
928 | 42.7k | { |
929 | 42.7k | if ( !strcmp(hrec->vals[i], "Integer") ) type = BCF_HT_INT; |
930 | 41.2k | else if ( !strcmp(hrec->vals[i], "Float") ) type = BCF_HT_REAL; |
931 | 40.2k | else if ( !strcmp(hrec->vals[i], "String") ) type = BCF_HT_STR; |
932 | 9.37k | else if ( !strcmp(hrec->vals[i], "Character") ) type = BCF_HT_STR; |
933 | 8.90k | else if ( !strcmp(hrec->vals[i], "Flag") ) type = BCF_HT_FLAG; |
934 | 7.31k | else |
935 | 7.31k | { |
936 | 7.31k | hts_log_warning("The type \"%s\" is not supported, assuming \"String\"", hrec->vals[i]); |
937 | 7.31k | type = BCF_HT_STR; |
938 | 7.31k | } |
939 | 42.7k | } |
940 | 107k | else if ( !strcmp(hrec->keys[i], "Number") ) |
941 | 35.6k | { |
942 | 35.6k | int is_fmt = hrec->type == BCF_HL_FMT; |
943 | 35.6k | if ( !strcmp(hrec->vals[i],"A") ) var = BCF_VL_A; |
944 | 34.0k | else if ( !strcmp(hrec->vals[i],"R") ) var = BCF_VL_R; |
945 | 33.9k | else if ( !strcmp(hrec->vals[i],"G") ) var = BCF_VL_G; |
946 | 32.7k | else if ( !strcmp(hrec->vals[i],".") ) var = BCF_VL_VAR; |
947 | 32.7k | else if ( is_fmt && !strcmp(hrec->vals[i],"P") ) var = BCF_VL_P; |
948 | 32.7k | else if ( is_fmt && !strcmp(hrec->vals[i],"LA") ) var = BCF_VL_LA; |
949 | 32.7k | else if ( is_fmt && !strcmp(hrec->vals[i],"LR") ) var = BCF_VL_LR; |
950 | 32.7k | else if ( is_fmt && !strcmp(hrec->vals[i],"LG") ) var = BCF_VL_LG; |
951 | 32.7k | else if ( is_fmt && !strcmp(hrec->vals[i],"M") ) var = BCF_VL_M; |
952 | 32.7k | else |
953 | 32.7k | { |
954 | 32.7k | if (sscanf(hrec->vals[i],"%d",&num) == 1) |
955 | 32.2k | var = BCF_VL_FIXED; |
956 | 32.7k | } |
957 | 35.6k | if (var != BCF_VL_FIXED) num = 0xfffff; |
958 | 35.6k | } |
959 | 226k | } |
960 | 95.9k | if (hrec->type == BCF_HL_INFO || hrec->type == BCF_HL_FMT) { |
961 | 64.7k | if (type == -1) { |
962 | 23.2k | hts_log_warning("%s %s field has no Type defined. Assuming String", |
963 | 23.2k | *hrec->key == 'I' ? "An" : "A", hrec->key); |
964 | 23.2k | type = BCF_HT_STR; |
965 | 23.2k | } |
966 | 64.7k | if (var == UINT32_MAX) { |
967 | 29.5k | hts_log_warning("%s %s field has no Number defined. Assuming '.'", |
968 | 29.5k | *hrec->key == 'I' ? "An" : "A", hrec->key); |
969 | 29.5k | var = BCF_VL_VAR; |
970 | 29.5k | } |
971 | 64.7k | if ( type==BCF_HT_FLAG && (var!=BCF_VL_FIXED || num!=0) ) |
972 | 1.15k | { |
973 | 1.15k | hts_log_warning("The definition of Flag \"%s/%s\" is invalid, forcing Number=0", hrec->key,id); |
974 | 1.15k | var = BCF_VL_FIXED; |
975 | 1.15k | num = 0; |
976 | 1.15k | } |
977 | 64.7k | } |
978 | 95.9k | uint32_t info = ((((uint32_t)num) & 0xfffff)<<12 | |
979 | 95.9k | (var & 0xf) << 8 | |
980 | 95.9k | (type & 0xf) << 4 | |
981 | 95.9k | (((uint32_t) hrec->type) & 0xf)); |
982 | | |
983 | 95.9k | if ( !id ) return 0; |
984 | 74.0k | str = strdup(id); |
985 | 74.0k | if (!str) return -1; |
986 | | |
987 | 74.0k | vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_ID]; |
988 | 74.0k | k = kh_get(vdict, d, str); |
989 | 74.0k | if ( k != kh_end(d) ) |
990 | 10.7k | { |
991 | | // already present |
992 | 10.7k | free(str); |
993 | 10.7k | if ( kh_val(d, k).hrec[info&0xf] ) return 0; |
994 | 1.41k | kh_val(d, k).info[info&0xf] = info; |
995 | 1.41k | kh_val(d, k).hrec[info&0xf] = hrec; |
996 | 1.41k | if ( idx==-1 ) { |
997 | 1.41k | if (hrec_add_idx(hrec, kh_val(d, k).id) < 0) { |
998 | 0 | return -1; |
999 | 0 | } |
1000 | 1.41k | } |
1001 | 1.41k | return 1; |
1002 | 1.41k | } |
1003 | 63.2k | k = kh_put(vdict, d, str, &ret); |
1004 | 63.2k | if (ret < 0) { |
1005 | 0 | free(str); |
1006 | 0 | return -1; |
1007 | 0 | } |
1008 | 63.2k | kh_val(d, k) = bcf_idinfo_def; |
1009 | 63.2k | kh_val(d, k).info[info&0xf] = info; |
1010 | 63.2k | kh_val(d, k).hrec[info&0xf] = hrec; |
1011 | 63.2k | kh_val(d, k).id = idx; |
1012 | 63.2k | if (bcf_hdr_set_idx(hdr, BCF_DT_ID, kh_key(d,k), &kh_val(d,k)) < 0) { |
1013 | 4 | kh_del(vdict, d, k); |
1014 | 4 | free(str); |
1015 | 4 | return -1; |
1016 | 4 | } |
1017 | 63.2k | if ( idx==-1 ) { |
1018 | 62.9k | if (hrec_add_idx(hrec, kh_val(d,k).id) < 0) { |
1019 | 0 | return -1; |
1020 | 0 | } |
1021 | 62.9k | } |
1022 | | |
1023 | 63.2k | return 1; |
1024 | 63.2k | } |
1025 | | |
1026 | | static void bcf_hdr_unregister_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) |
1027 | 0 | { |
1028 | 0 | if (hrec->type == BCF_HL_FLT || |
1029 | 0 | hrec->type == BCF_HL_INFO || |
1030 | 0 | hrec->type == BCF_HL_FMT || |
1031 | 0 | hrec->type == BCF_HL_CTG) { |
1032 | 0 | int id = bcf_hrec_find_key(hrec, "ID"); |
1033 | 0 | if (id < 0 || !hrec->vals[id]) |
1034 | 0 | return; |
1035 | 0 | vdict_t *dict = (hrec->type == BCF_HL_CTG |
1036 | 0 | ? (vdict_t*)hdr->dict[BCF_DT_CTG] |
1037 | 0 | : (vdict_t*)hdr->dict[BCF_DT_ID]); |
1038 | 0 | khint_t k = kh_get(vdict, dict, hrec->vals[id]); |
1039 | 0 | if (k != kh_end(dict)) |
1040 | 0 | kh_val(dict, k).hrec[hrec->type==BCF_HL_CTG ? 0 : hrec->type] = NULL; |
1041 | 0 | } |
1042 | 0 | } |
1043 | | |
1044 | | static void bcf_hdr_remove_from_hdict(bcf_hdr_t *hdr, bcf_hrec_t *hrec) |
1045 | 0 | { |
1046 | 0 | kstring_t str = KS_INITIALIZE; |
1047 | 0 | bcf_hdr_aux_t *aux = get_hdr_aux(hdr); |
1048 | 0 | khint_t k; |
1049 | 0 | int id; |
1050 | |
|
1051 | 0 | switch (hrec->type) { |
1052 | 0 | case BCF_HL_GEN: |
1053 | 0 | if (ksprintf(&str, "##%s=%s", hrec->key,hrec->value) < 0) |
1054 | 0 | str.l = 0; |
1055 | 0 | break; |
1056 | 0 | case BCF_HL_STR: |
1057 | 0 | id = bcf_hrec_find_key(hrec, "ID"); |
1058 | 0 | if (id < 0) |
1059 | 0 | return; |
1060 | 0 | if (!hrec->vals[id] || |
1061 | 0 | ksprintf(&str, "##%s=<ID=%s>", hrec->key, hrec->vals[id]) < 0) |
1062 | 0 | str.l = 0; |
1063 | 0 | break; |
1064 | 0 | default: |
1065 | 0 | return; |
1066 | 0 | } |
1067 | 0 | if (str.l) { |
1068 | 0 | k = kh_get(hdict, aux->gen, str.s); |
1069 | 0 | } else { |
1070 | | // Couldn't get a string for some reason, so try the hard way... |
1071 | 0 | for (k = kh_begin(aux->gen); k < kh_end(aux->gen); k++) { |
1072 | 0 | if (kh_exist(aux->gen, k) && kh_val(aux->gen, k) == hrec) |
1073 | 0 | break; |
1074 | 0 | } |
1075 | 0 | } |
1076 | 0 | if (k != kh_end(aux->gen) && kh_val(aux->gen, k) == hrec) { |
1077 | 0 | kh_val(aux->gen, k) = NULL; |
1078 | 0 | free((char *) kh_key(aux->gen, k)); |
1079 | 0 | kh_key(aux->gen, k) = NULL; |
1080 | 0 | kh_del(hdict, aux->gen, k); |
1081 | 0 | } |
1082 | 0 | free(str.s); |
1083 | 0 | } |
1084 | | |
1085 | | int bcf_hdr_update_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec, const bcf_hrec_t *tmp) |
1086 | 0 | { |
1087 | 0 | assert( hrec->type==BCF_HL_GEN ); |
1088 | 0 | int ret; |
1089 | 0 | khint_t k; |
1090 | 0 | bcf_hdr_aux_t *aux = get_hdr_aux(hdr); |
1091 | 0 | for (k=kh_begin(aux->gen); k<kh_end(aux->gen); k++) |
1092 | 0 | { |
1093 | 0 | if ( !kh_exist(aux->gen,k) ) continue; |
1094 | 0 | if ( hrec!=(bcf_hrec_t*)kh_val(aux->gen,k) ) continue; |
1095 | 0 | break; |
1096 | 0 | } |
1097 | 0 | assert( k<kh_end(aux->gen) ); // something went wrong, should never happen |
1098 | 0 | free((char*)kh_key(aux->gen,k)); |
1099 | 0 | kh_del(hdict,aux->gen,k); |
1100 | 0 | kstring_t str = {0,0,0}; |
1101 | 0 | if ( ksprintf(&str, "##%s=%s", tmp->key,tmp->value) < 0 ) |
1102 | 0 | { |
1103 | 0 | free(str.s); |
1104 | 0 | return -1; |
1105 | 0 | } |
1106 | 0 | k = kh_put(hdict, aux->gen, str.s, &ret); |
1107 | 0 | if ( ret<0 ) |
1108 | 0 | { |
1109 | 0 | free(str.s); |
1110 | 0 | return -1; |
1111 | 0 | } |
1112 | 0 | free(hrec->value); |
1113 | 0 | hrec->value = strdup(tmp->value); |
1114 | 0 | if ( !hrec->value ) return -1; |
1115 | 0 | kh_val(aux->gen,k) = hrec; |
1116 | |
|
1117 | 0 | if (!strcmp(hrec->key,"fileformat")) { |
1118 | | //update version |
1119 | 0 | get_hdr_aux(hdr)->version = bcf_get_version(NULL, hrec->value); |
1120 | 0 | } |
1121 | 0 | return 0; |
1122 | 0 | } |
1123 | | |
1124 | | int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) |
1125 | 151k | { |
1126 | 151k | kstring_t str = {0,0,0}; |
1127 | 151k | bcf_hdr_aux_t *aux = get_hdr_aux(hdr); |
1128 | | |
1129 | 151k | int res; |
1130 | 151k | if ( !hrec ) return 0; |
1131 | | |
1132 | 151k | bcf_hrec_check(hrec); // todo: check return status and propagate errors up |
1133 | | |
1134 | 151k | res = bcf_hdr_register_hrec(hdr,hrec); |
1135 | 151k | if (res < 0) return -1; |
1136 | 151k | if ( !res ) |
1137 | 73.0k | { |
1138 | | // If one of the hashed field, then it is already present |
1139 | 73.0k | if ( hrec->type != BCF_HL_GEN ) |
1140 | 41.5k | { |
1141 | 41.5k | bcf_hrec_destroy(hrec); |
1142 | 41.5k | return 0; |
1143 | 41.5k | } |
1144 | | // Is one of the generic fields and already present? |
1145 | 31.5k | if ( ksprintf(&str, "##%s=%s", hrec->key,hrec->value) < 0 ) |
1146 | 0 | { |
1147 | 0 | free(str.s); |
1148 | 0 | return -1; |
1149 | 0 | } |
1150 | 31.5k | khint_t k = kh_get(hdict, aux->gen, str.s); |
1151 | 31.5k | if ( k != kh_end(aux->gen) ) |
1152 | 20.7k | { |
1153 | | // duplicate record |
1154 | 20.7k | bcf_hrec_destroy(hrec); |
1155 | 20.7k | free(str.s); |
1156 | 20.7k | return 0; |
1157 | 20.7k | } |
1158 | 10.7k | if (!strcmp(hrec->key, "fileformat")) { |
1159 | 3.34k | aux->version = bcf_get_version(NULL, hrec->value); |
1160 | 3.34k | } |
1161 | 10.7k | } |
1162 | | |
1163 | 88.8k | int i; |
1164 | 88.8k | if ( hrec->type==BCF_HL_STR && (i=bcf_hrec_find_key(hrec,"ID"))>=0 ) |
1165 | 5.14k | { |
1166 | 5.14k | if ( ksprintf(&str, "##%s=<ID=%s>", hrec->key,hrec->vals[i]) < 0 ) |
1167 | 0 | { |
1168 | 0 | free(str.s); |
1169 | 0 | return -1; |
1170 | 0 | } |
1171 | 5.14k | khint_t k = kh_get(hdict, aux->gen, str.s); |
1172 | 5.14k | if ( k != kh_end(aux->gen) ) |
1173 | 4.18k | { |
1174 | | // duplicate record |
1175 | 4.18k | bcf_hrec_destroy(hrec); |
1176 | 4.18k | free(str.s); |
1177 | 4.18k | return 0; |
1178 | 4.18k | } |
1179 | 5.14k | } |
1180 | | |
1181 | | // New record, needs to be added |
1182 | 84.6k | int n = hdr->nhrec + 1; |
1183 | 84.6k | bcf_hrec_t **new_hrec = realloc(hdr->hrec, n*sizeof(bcf_hrec_t*)); |
1184 | 84.6k | if (!new_hrec) { |
1185 | 0 | free(str.s); |
1186 | 0 | bcf_hdr_unregister_hrec(hdr, hrec); |
1187 | 0 | return -1; |
1188 | 0 | } |
1189 | 84.6k | hdr->hrec = new_hrec; |
1190 | | |
1191 | 84.6k | if ( str.s ) |
1192 | 11.6k | { |
1193 | 11.6k | khint_t k = kh_put(hdict, aux->gen, str.s, &res); |
1194 | 11.6k | if ( res<0 ) |
1195 | 0 | { |
1196 | 0 | free(str.s); |
1197 | 0 | return -1; |
1198 | 0 | } |
1199 | 11.6k | kh_val(aux->gen,k) = hrec; |
1200 | 11.6k | } |
1201 | | |
1202 | 84.6k | hdr->hrec[hdr->nhrec] = hrec; |
1203 | 84.6k | hdr->dirty = 1; |
1204 | 84.6k | hdr->nhrec = n; |
1205 | | |
1206 | 84.6k | return hrec->type==BCF_HL_GEN ? 0 : 1; |
1207 | 84.6k | } |
1208 | | |
1209 | | bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class) |
1210 | 337 | { |
1211 | 337 | int i; |
1212 | 337 | if ( type==BCF_HL_GEN ) |
1213 | 337 | { |
1214 | | // e.g. ##fileformat=VCFv4.2 |
1215 | | // ##source=GenomicsDBImport |
1216 | | // ##bcftools_viewVersion=1.16-80-gdfdb0923+htslib-1.16-34-g215d364 |
1217 | 337 | if ( value ) |
1218 | 0 | { |
1219 | 0 | kstring_t str = {0,0,0}; |
1220 | 0 | ksprintf(&str, "##%s=%s", key,value); |
1221 | 0 | bcf_hdr_aux_t *aux = get_hdr_aux(hdr); |
1222 | 0 | khint_t k = kh_get(hdict, aux->gen, str.s); |
1223 | 0 | free(str.s); |
1224 | 0 | if ( k == kh_end(aux->gen) ) return NULL; |
1225 | 0 | return kh_val(aux->gen, k); |
1226 | 0 | } |
1227 | 901 | for (i=0; i<hdr->nhrec; i++) |
1228 | 591 | { |
1229 | 591 | if ( hdr->hrec[i]->type!=type ) continue; |
1230 | 115 | if ( strcmp(hdr->hrec[i]->key,key) ) continue; |
1231 | 27 | return hdr->hrec[i]; |
1232 | 115 | } |
1233 | 310 | return NULL; |
1234 | 337 | } |
1235 | 0 | else if ( type==BCF_HL_STR ) |
1236 | 0 | { |
1237 | | // e.g. ##GATKCommandLine=<ID=GenomicsDBImport,CommandLine="GenomicsDBImport...."> |
1238 | | // ##ALT=<ID=NON_REF,Description="Represents any possible alternative allele not already represented at this location by REF and ALT"> |
1239 | 0 | if (!str_class) return NULL; |
1240 | 0 | if ( !strcmp("ID",key) ) |
1241 | 0 | { |
1242 | 0 | kstring_t str = {0,0,0}; |
1243 | 0 | ksprintf(&str, "##%s=<%s=%s>",str_class,key,value); |
1244 | 0 | bcf_hdr_aux_t *aux = get_hdr_aux(hdr); |
1245 | 0 | khint_t k = kh_get(hdict, aux->gen, str.s); |
1246 | 0 | free(str.s); |
1247 | 0 | if ( k == kh_end(aux->gen) ) return NULL; |
1248 | 0 | return kh_val(aux->gen, k); |
1249 | 0 | } |
1250 | 0 | for (i=0; i<hdr->nhrec; i++) |
1251 | 0 | { |
1252 | 0 | if ( hdr->hrec[i]->type!=type ) continue; |
1253 | 0 | if ( strcmp(hdr->hrec[i]->key,str_class) ) continue; |
1254 | 0 | int j = bcf_hrec_find_key(hdr->hrec[i],key); |
1255 | 0 | if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],value) ) return hdr->hrec[i]; |
1256 | 0 | } |
1257 | 0 | return NULL; |
1258 | 0 | } |
1259 | 0 | vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID]; |
1260 | 0 | khint_t k = kh_get(vdict, d, value); |
1261 | 0 | if ( k == kh_end(d) ) return NULL; |
1262 | 0 | return kh_val(d, k).hrec[type==BCF_HL_CTG?0:type]; |
1263 | 0 | } |
1264 | | |
1265 | | // Check the VCF header is correctly formatted as per the specification. |
1266 | | // Note the code that calls this doesn't bother to check return values and |
1267 | | // we have so many broken VCFs in the wild that for now we just reprt a |
1268 | | // warning and continue anyway. So currently this is a void function. |
1269 | | void bcf_hdr_check_sanity(bcf_hdr_t *hdr) |
1270 | 2.24k | { |
1271 | 2.24k | int version = bcf_get_version(hdr, NULL); |
1272 | | |
1273 | 2.24k | struct tag { |
1274 | 2.24k | char name[10]; |
1275 | 2.24k | char number_str[3]; |
1276 | 2.24k | int number; |
1277 | 2.24k | int version; |
1278 | 2.24k | int type; |
1279 | 2.24k | }; |
1280 | | |
1281 | 2.24k | char type_str[][8] = {"Flag", "Integer", "Float", "String"}; |
1282 | | |
1283 | 2.24k | struct tag info_tags[] = { |
1284 | 2.24k | {"AD", "R", BCF_VL_R, VCF_DEF, BCF_HT_INT}, |
1285 | 2.24k | {"ADF", "R", BCF_VL_R, VCF_DEF, BCF_HT_INT}, |
1286 | 2.24k | {"ADR", "R", BCF_VL_R, VCF_DEF, BCF_HT_INT}, |
1287 | 2.24k | {"AC", "A", BCF_VL_A, VCF_DEF, BCF_HT_INT}, |
1288 | 2.24k | {"AF", "A", BCF_VL_A, VCF_DEF, BCF_HT_REAL}, |
1289 | 2.24k | {"CIGAR", "A", BCF_VL_A, VCF_DEF, BCF_HT_STR}, |
1290 | 2.24k | {"AA", "1", BCF_VL_FIXED, VCF_DEF, BCF_HT_STR}, |
1291 | 2.24k | {"AN", "1", BCF_VL_FIXED, VCF_DEF, BCF_HT_INT}, |
1292 | 2.24k | {"BQ", "1", BCF_VL_FIXED, VCF_DEF, BCF_HT_REAL}, |
1293 | 2.24k | {"DB", "0", BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG}, |
1294 | 2.24k | {"DP", "1", BCF_VL_FIXED, VCF_DEF, BCF_HT_INT}, |
1295 | 2.24k | {"END", "1", BCF_VL_FIXED, VCF_DEF, BCF_HT_INT}, |
1296 | 2.24k | {"H2", "0", BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG}, |
1297 | 2.24k | {"H3", "0", BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG}, |
1298 | 2.24k | {"MQ", "1", BCF_VL_FIXED, VCF_DEF, BCF_HT_REAL}, |
1299 | 2.24k | {"MQ0", "1", BCF_VL_FIXED, VCF_DEF, BCF_HT_INT}, |
1300 | 2.24k | {"NS", "1", BCF_VL_FIXED, VCF_DEF, BCF_HT_INT}, |
1301 | 2.24k | {"SB", "4", BCF_VL_FIXED, VCF_DEF, BCF_HT_INT}, |
1302 | 2.24k | {"SOMATIC", "0", BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG}, |
1303 | 2.24k | {"VALIDATED", "0", BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG}, |
1304 | 2.24k | {"1000G", "0", BCF_VL_FIXED, VCF_DEF, BCF_HT_FLAG}, |
1305 | 2.24k | }; |
1306 | 2.24k | static int info_warned[sizeof(info_tags)/sizeof(*info_tags)] = {0}; |
1307 | | |
1308 | 2.24k | struct tag fmt_tags[] = { |
1309 | 2.24k | {"AD", "R", BCF_VL_R, VCF_DEF, BCF_HT_INT}, |
1310 | 2.24k | {"ADF", "R", BCF_VL_R, VCF_DEF, BCF_HT_INT}, |
1311 | 2.24k | {"ADR", "R", BCF_VL_R, VCF_DEF, BCF_HT_INT}, |
1312 | 2.24k | {"EC", "A", BCF_VL_A, VCF_DEF, BCF_HT_INT}, |
1313 | 2.24k | {"GL", "G", BCF_VL_G, VCF_DEF, BCF_HT_REAL}, |
1314 | 2.24k | {"GP", "G", BCF_VL_G, VCF_DEF, BCF_HT_REAL}, |
1315 | 2.24k | {"PL", "G", BCF_VL_G, VCF_DEF, BCF_HT_INT}, |
1316 | 2.24k | {"PP", "G", BCF_VL_G, VCF_DEF, BCF_HT_INT}, |
1317 | 2.24k | {"DP", "1", BCF_VL_FIXED, VCF_DEF, BCF_HT_INT}, |
1318 | 2.24k | {"LEN", "1", BCF_VL_FIXED, VCF_DEF, BCF_HT_INT}, |
1319 | 2.24k | {"FT", "1", BCF_VL_FIXED, VCF_DEF, BCF_HT_STR}, |
1320 | 2.24k | {"GQ", "1", BCF_VL_FIXED, VCF_DEF, BCF_HT_INT}, |
1321 | 2.24k | {"GT", "1", BCF_VL_FIXED, VCF_DEF, BCF_HT_STR}, |
1322 | 2.24k | {"HQ", "2", BCF_VL_FIXED, VCF_DEF, BCF_HT_INT}, |
1323 | 2.24k | {"MQ", "1", BCF_VL_FIXED, VCF_DEF, BCF_HT_INT}, |
1324 | 2.24k | {"PQ", "1", BCF_VL_FIXED, VCF_DEF, BCF_HT_INT}, |
1325 | 2.24k | {"PS", "1", BCF_VL_FIXED, VCF_DEF, BCF_HT_INT}, |
1326 | 2.24k | {"PSL", "P", BCF_VL_P, VCF44, BCF_HT_STR}, |
1327 | 2.24k | {"PSO", "P", BCF_VL_P, VCF44, BCF_HT_INT}, |
1328 | 2.24k | {"PSQ", "P", BCF_VL_P, VCF44, BCF_HT_INT}, |
1329 | 2.24k | {"LGL", "LG", BCF_VL_LG, VCF45, BCF_HT_INT}, |
1330 | 2.24k | {"LGP", "LG", BCF_VL_LG, VCF45, BCF_HT_INT}, |
1331 | 2.24k | {"LPL", "LG", BCF_VL_LG, VCF45, BCF_HT_INT}, |
1332 | 2.24k | {"LPP", "LG", BCF_VL_LG, VCF45, BCF_HT_INT}, |
1333 | 2.24k | {"LEC", "LA", BCF_VL_LA, VCF45, BCF_HT_INT}, |
1334 | 2.24k | {"LAD", "LR", BCF_VL_LR, VCF45, BCF_HT_INT}, |
1335 | 2.24k | {"LADF", "LR", BCF_VL_LR, VCF45, BCF_HT_INT}, |
1336 | 2.24k | {"LADR", "LR", BCF_VL_LR, VCF45, BCF_HT_INT}, |
1337 | 2.24k | }; |
1338 | 2.24k | static int fmt_warned[sizeof(fmt_tags)/sizeof(*fmt_tags)] = {0}; |
1339 | | |
1340 | | // Check INFO tag numbers. We shouldn't really permit ".", but it's |
1341 | | // commonly misused so we let it slide unless it's a new tag and the |
1342 | | // file format claims to be new also. We also cannot distinguish between |
1343 | | // Number=1 and Number=2, but we at least report the correct term if we |
1344 | | // get, say, Number=G in its place. |
1345 | | // Also check the types. |
1346 | 2.24k | int i; |
1347 | 49.4k | for (i = 0; i < sizeof(info_tags)/sizeof(*info_tags); i++) { |
1348 | 47.2k | if (info_warned[i]) |
1349 | 0 | continue; |
1350 | 47.2k | int id = bcf_hdr_id2int(hdr, BCF_DT_ID, info_tags[i].name); |
1351 | 47.2k | if (bcf_hdr_idinfo_exists(hdr, BCF_HL_INFO, id)) { |
1352 | 0 | if (bcf_hdr_id2length(hdr, BCF_HL_INFO, id) != info_tags[i].number && |
1353 | 0 | bcf_hdr_id2length(hdr, BCF_HL_INFO, id) != BCF_VL_VAR) { |
1354 | 0 | info_warned[i] = 1; |
1355 | 0 | } else if (bcf_hdr_id2length(hdr, BCF_HL_INFO, id) == BCF_VL_FIXED && |
1356 | 0 | bcf_hdr_id2number(hdr, BCF_HL_INFO, id) != atoi(info_tags[i].number_str)) { |
1357 | 0 | info_warned[i] = 1; |
1358 | 0 | } |
1359 | |
|
1360 | 0 | if (info_warned[i]) { |
1361 | 0 | hts_log_warning("%s should be declared as Number=%s", |
1362 | 0 | info_tags[i].name, info_tags[i].number_str); |
1363 | 0 | } |
1364 | |
|
1365 | 0 | if (bcf_hdr_id2type(hdr, BCF_HL_INFO, id) != info_tags[i].type) { |
1366 | 0 | hts_log_warning("%s should be declared as Type=%s", |
1367 | 0 | info_tags[i].name, type_str[info_tags[i].type]); |
1368 | 0 | info_warned[i] = 1; |
1369 | 0 | } |
1370 | 0 | } |
1371 | 47.2k | } |
1372 | | |
1373 | | // Check FORMAT tag numbers and types. |
1374 | 65.1k | for (i = 0; i < sizeof(fmt_tags)/sizeof(*fmt_tags); i++) { |
1375 | 62.9k | if (fmt_warned[i]) |
1376 | 0 | continue; |
1377 | 62.9k | int id = bcf_hdr_id2int(hdr, BCF_DT_ID, fmt_tags[i].name); |
1378 | 62.9k | if (bcf_hdr_idinfo_exists(hdr, BCF_HL_FMT, id)) { |
1379 | 0 | if (bcf_hdr_id2length(hdr, BCF_HL_FMT, id) != fmt_tags[i].number) { |
1380 | | // Permit "Number=." if this tag predates the vcf version it is |
1381 | | // defined within. This is a common tactic for callers to use |
1382 | | // new tags with older formats in order to avoid parsing failures |
1383 | | // with some software. |
1384 | | // We don't care for 4.3 and earlier as that's more of a wild-west |
1385 | | // and it's not abnormal to see incorrect usage of Number=. there. |
1386 | 0 | if ((version < VCF44 && |
1387 | 0 | bcf_hdr_id2length(hdr, BCF_HL_FMT, id) != BCF_VL_VAR) || |
1388 | 0 | (version >= VCF44 && version >= fmt_tags[i].version)) { |
1389 | 0 | fmt_warned[i] = 1; |
1390 | 0 | } |
1391 | 0 | } else if (bcf_hdr_id2length(hdr, BCF_HL_FMT, id) == BCF_VL_FIXED && |
1392 | 0 | bcf_hdr_id2number(hdr, BCF_HL_FMT, id) != atoi(fmt_tags[i].number_str)) { |
1393 | 0 | fmt_warned[i] = 1; |
1394 | 0 | } |
1395 | |
|
1396 | 0 | if (fmt_warned[i]) { |
1397 | 0 | hts_log_warning("%s should be declared as Number=%s", |
1398 | 0 | fmt_tags[i].name, fmt_tags[i].number_str); |
1399 | 0 | } |
1400 | |
|
1401 | 0 | if (bcf_hdr_id2type(hdr, BCF_HL_FMT, id) != fmt_tags[i].type) { |
1402 | 0 | hts_log_warning("%s should be declared as Type=%s", |
1403 | 0 | fmt_tags[i].name, type_str[fmt_tags[i].type]); |
1404 | 0 | fmt_warned[i] = 1; |
1405 | 0 | } |
1406 | 0 | } |
1407 | 62.9k | } |
1408 | 2.24k | } |
1409 | | |
1410 | | int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt) |
1411 | 2.60k | { |
1412 | 2.60k | int len, done = 0; |
1413 | 2.60k | char *p = htxt; |
1414 | | |
1415 | | // Check sanity: "fileformat" string must come as first |
1416 | 2.60k | bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,p,&len); |
1417 | 2.60k | if ( !hrec || !hrec->key || strcasecmp(hrec->key,"fileformat") ) |
1418 | 166 | hts_log_warning("The first line should be ##fileformat; is the VCF/BCF header broken?"); |
1419 | 2.60k | if (bcf_hdr_add_hrec(hdr, hrec) < 0) { |
1420 | 0 | bcf_hrec_destroy(hrec); |
1421 | 0 | return -1; |
1422 | 0 | } |
1423 | | |
1424 | | // The filter PASS must appear first in the dictionary |
1425 | 2.60k | hrec = bcf_hdr_parse_line(hdr,"##FILTER=<ID=PASS,Description=\"All filters passed\">",&len); |
1426 | 2.60k | if (!hrec || bcf_hdr_add_hrec(hdr, hrec) < 0) { |
1427 | 0 | bcf_hrec_destroy(hrec); |
1428 | 0 | return -1; |
1429 | 0 | } |
1430 | | |
1431 | | // Parse the whole header |
1432 | 25.5k | do { |
1433 | 109k | while (NULL != (hrec = bcf_hdr_parse_line(hdr, p, &len))) { |
1434 | 83.7k | if (bcf_hdr_add_hrec(hdr, hrec) < 0) { |
1435 | 8 | bcf_hrec_destroy(hrec); |
1436 | 8 | return -1; |
1437 | 8 | } |
1438 | 83.7k | p += len; |
1439 | 83.7k | } |
1440 | 25.5k | assert(hrec == NULL); |
1441 | 25.5k | if (len < 0) { |
1442 | | // len < 0 indicates out-of-memory, or similar error |
1443 | 0 | hts_log_error("Could not parse header line: %s", strerror(errno)); |
1444 | 0 | return -1; |
1445 | 25.5k | } else if (len > 0) { |
1446 | | // Bad header line. bcf_hdr_parse_line() will have logged it. |
1447 | | // Skip and try again on the next line (p + len will be the start |
1448 | | // of the next one). |
1449 | 22.4k | p += len; |
1450 | 22.4k | continue; |
1451 | 22.4k | } |
1452 | | |
1453 | | // Next should be the sample line. If not, it was a malformed |
1454 | | // header, in which case print a warning and skip (many VCF |
1455 | | // operations do not really care about a few malformed lines). |
1456 | | // In the future we may want to add a strict mode that errors in |
1457 | | // this case. |
1458 | 3.14k | if ( strncmp("#CHROM\t",p,7) && strncmp("#CHROM ",p,7) ) { |
1459 | 860 | char *eol = strchr(p, '\n'); |
1460 | 860 | if (*p != '\0') { |
1461 | 556 | char buffer[320]; |
1462 | 556 | hts_log_warning("Could not parse header line: %s", |
1463 | 556 | hts_strprint(buffer, sizeof(buffer), |
1464 | 556 | '"', p, |
1465 | 556 | eol ? (eol - p) : SIZE_MAX)); |
1466 | 556 | } |
1467 | 860 | if (eol) { |
1468 | 546 | p = eol + 1; // Try from the next line. |
1469 | 546 | } else { |
1470 | 314 | done = -1; // No more lines left, give up. |
1471 | 314 | } |
1472 | 2.28k | } else { |
1473 | 2.28k | done = 1; // Sample line found |
1474 | 2.28k | } |
1475 | 25.5k | } while (!done); |
1476 | | |
1477 | 2.59k | if (done < 0) { |
1478 | | // No sample line is fatal. |
1479 | 314 | hts_log_error("Could not parse the header, sample line not found"); |
1480 | 314 | return -1; |
1481 | 314 | } |
1482 | | |
1483 | 2.28k | if (bcf_hdr_parse_sample_line(hdr,p) < 0) |
1484 | 32 | return -1; |
1485 | 2.24k | if (bcf_hdr_sync(hdr) < 0) |
1486 | 0 | return -1; |
1487 | 2.24k | bcf_hdr_check_sanity(hdr); |
1488 | 2.24k | return 0; |
1489 | 2.24k | } |
1490 | | |
1491 | | int bcf_hdr_append(bcf_hdr_t *hdr, const char *line) |
1492 | 0 | { |
1493 | 0 | int len; |
1494 | 0 | bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr, (char*) line, &len); |
1495 | 0 | if ( !hrec ) return -1; |
1496 | 0 | if (bcf_hdr_add_hrec(hdr, hrec) < 0) |
1497 | 0 | return -1; |
1498 | 0 | return 0; |
1499 | 0 | } |
1500 | | |
1501 | | void bcf_hdr_remove(bcf_hdr_t *hdr, int type, const char *key) |
1502 | 0 | { |
1503 | 0 | int i = 0; |
1504 | 0 | bcf_hrec_t *hrec; |
1505 | 0 | if ( !key ) |
1506 | 0 | { |
1507 | | // no key, remove all entries of this type |
1508 | 0 | while ( i<hdr->nhrec ) |
1509 | 0 | { |
1510 | 0 | if ( hdr->hrec[i]->type!=type ) { i++; continue; } |
1511 | 0 | hrec = hdr->hrec[i]; |
1512 | 0 | bcf_hdr_unregister_hrec(hdr, hrec); |
1513 | 0 | bcf_hdr_remove_from_hdict(hdr, hrec); |
1514 | 0 | hdr->dirty = 1; |
1515 | 0 | hdr->nhrec--; |
1516 | 0 | if ( i < hdr->nhrec ) |
1517 | 0 | memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*)); |
1518 | 0 | bcf_hrec_destroy(hrec); |
1519 | 0 | } |
1520 | 0 | return; |
1521 | 0 | } |
1522 | 0 | while (1) |
1523 | 0 | { |
1524 | 0 | if ( type==BCF_HL_FLT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG ) |
1525 | 0 | { |
1526 | 0 | hrec = bcf_hdr_get_hrec(hdr, type, "ID", key, NULL); |
1527 | 0 | if ( !hrec ) return; |
1528 | | |
1529 | 0 | for (i=0; i<hdr->nhrec; i++) |
1530 | 0 | if ( hdr->hrec[i]==hrec ) break; |
1531 | 0 | assert( i<hdr->nhrec ); |
1532 | |
|
1533 | 0 | vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID]; |
1534 | 0 | khint_t k = kh_get(vdict, d, key); |
1535 | 0 | kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL; |
1536 | 0 | } |
1537 | 0 | else |
1538 | 0 | { |
1539 | 0 | for (i=0; i<hdr->nhrec; i++) |
1540 | 0 | { |
1541 | 0 | if ( hdr->hrec[i]->type!=type ) continue; |
1542 | 0 | if ( type==BCF_HL_GEN ) |
1543 | 0 | { |
1544 | 0 | if ( !strcmp(hdr->hrec[i]->key,key) ) break; |
1545 | 0 | } |
1546 | 0 | else |
1547 | 0 | { |
1548 | | // not all structured lines have ID, we could be more sophisticated as in bcf_hdr_get_hrec() |
1549 | 0 | int j = bcf_hrec_find_key(hdr->hrec[i], "ID"); |
1550 | 0 | if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],key) ) break; |
1551 | 0 | } |
1552 | 0 | } |
1553 | 0 | if ( i==hdr->nhrec ) return; |
1554 | 0 | hrec = hdr->hrec[i]; |
1555 | 0 | bcf_hdr_remove_from_hdict(hdr, hrec); |
1556 | 0 | } |
1557 | | |
1558 | 0 | hdr->nhrec--; |
1559 | 0 | if ( i < hdr->nhrec ) |
1560 | 0 | memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*)); |
1561 | 0 | bcf_hrec_destroy(hrec); |
1562 | 0 | hdr->dirty = 1; |
1563 | 0 | } |
1564 | 0 | } |
1565 | | |
1566 | | int bcf_hdr_printf(bcf_hdr_t *hdr, const char *fmt, ...) |
1567 | 0 | { |
1568 | 0 | char tmp[256], *line = tmp; |
1569 | 0 | va_list ap; |
1570 | 0 | va_start(ap, fmt); |
1571 | 0 | int n = vsnprintf(line, sizeof(tmp), fmt, ap); |
1572 | 0 | va_end(ap); |
1573 | |
|
1574 | 0 | if (n >= sizeof(tmp)) { |
1575 | 0 | n++; // For trailing NUL |
1576 | 0 | line = (char*)malloc(n); |
1577 | 0 | if (!line) |
1578 | 0 | return -1; |
1579 | | |
1580 | 0 | va_start(ap, fmt); |
1581 | 0 | vsnprintf(line, n, fmt, ap); |
1582 | 0 | va_end(ap); |
1583 | 0 | } |
1584 | | |
1585 | 0 | int ret = bcf_hdr_append(hdr, line); |
1586 | |
|
1587 | 0 | if (line != tmp) free(line); |
1588 | 0 | return ret; |
1589 | 0 | } |
1590 | | |
1591 | | |
1592 | | /********************** |
1593 | | *** BCF header I/O *** |
1594 | | **********************/ |
1595 | | |
1596 | | const char *bcf_hdr_get_version(const bcf_hdr_t *hdr) |
1597 | 337 | { |
1598 | 337 | bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL); |
1599 | 337 | if ( !hrec ) |
1600 | 310 | { |
1601 | 310 | hts_log_warning("No version string found, assuming VCFv4.2"); |
1602 | 310 | return "VCFv4.2"; |
1603 | 310 | } |
1604 | 27 | return hrec->value; |
1605 | 337 | } |
1606 | | |
1607 | | int bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version) |
1608 | 0 | { |
1609 | 0 | bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL); |
1610 | 0 | if ( !hrec ) |
1611 | 0 | { |
1612 | 0 | int len; |
1613 | 0 | kstring_t str = {0,0,0}; |
1614 | 0 | if ( ksprintf(&str,"##fileformat=%s", version) < 0 ) return -1; |
1615 | 0 | hrec = bcf_hdr_parse_line(hdr, str.s, &len); |
1616 | 0 | free(str.s); |
1617 | |
|
1618 | 0 | get_hdr_aux(hdr)->version = bcf_get_version(NULL, hrec->value); |
1619 | 0 | } |
1620 | 0 | else |
1621 | 0 | { |
1622 | 0 | bcf_hrec_t *tmp = bcf_hrec_dup(hrec); |
1623 | 0 | if ( !tmp ) return -1; |
1624 | 0 | free(tmp->value); |
1625 | 0 | tmp->value = strdup(version); |
1626 | 0 | if ( !tmp->value ) return -1; |
1627 | 0 | bcf_hdr_update_hrec(hdr, hrec, tmp); |
1628 | 0 | bcf_hrec_destroy(tmp); |
1629 | 0 | } |
1630 | 0 | hdr->dirty = 1; |
1631 | | //TODO rlen may change, deal with it |
1632 | 0 | return 0; // FIXME: check for errs in this function (return < 0 if so) |
1633 | 0 | } |
1634 | | |
1635 | | bcf_hdr_t *bcf_hdr_init(const char *mode) |
1636 | 2.62k | { |
1637 | 2.62k | int i; |
1638 | 2.62k | bcf_hdr_t *h; |
1639 | 2.62k | h = (bcf_hdr_t*)calloc(1, sizeof(bcf_hdr_t)); |
1640 | 2.62k | if (!h) return NULL; |
1641 | 10.5k | for (i = 0; i < 3; ++i) { |
1642 | 7.88k | if ((h->dict[i] = kh_init(vdict)) == NULL) goto fail; |
1643 | | // Supersize the hash to make collisions very unlikely |
1644 | 7.88k | static int dsize[3] = {16384,16384,2048}; // info, contig, format |
1645 | 7.88k | if (kh_resize(vdict, h->dict[i], dsize[i]) < 0) goto fail; |
1646 | 7.88k | } |
1647 | | |
1648 | 2.62k | bcf_hdr_aux_t *aux = (bcf_hdr_aux_t*)calloc(1,sizeof(bcf_hdr_aux_t)); |
1649 | 2.62k | if ( !aux ) goto fail; |
1650 | 2.62k | if ( (aux->gen = kh_init(hdict))==NULL ) { free(aux); goto fail; } |
1651 | 2.62k | aux->key_len = NULL; |
1652 | 2.62k | aux->dict = *((vdict_t*)h->dict[0]); |
1653 | 2.62k | aux->version = 0; |
1654 | 2.62k | aux->ref_count = 1; |
1655 | 2.62k | free(h->dict[0]); |
1656 | 2.62k | h->dict[0] = aux; |
1657 | | |
1658 | 2.62k | if ( strchr(mode,'w') ) |
1659 | 0 | { |
1660 | 0 | bcf_hdr_append(h, "##fileformat=VCFv4.2"); |
1661 | | // The filter PASS must appear first in the dictionary |
1662 | 0 | bcf_hdr_append(h, "##FILTER=<ID=PASS,Description=\"All filters passed\">"); |
1663 | 0 | aux->version = VCF_DEF; |
1664 | 0 | } |
1665 | 2.62k | return h; |
1666 | | |
1667 | 0 | fail: |
1668 | 0 | for (i = 0; i < 3; ++i) |
1669 | 0 | kh_destroy(vdict, h->dict[i]); |
1670 | 0 | free(h); |
1671 | 0 | return NULL; |
1672 | 2.62k | } |
1673 | | |
1674 | | void bcf_hdr_destroy(bcf_hdr_t *h) |
1675 | 3.83k | { |
1676 | 3.83k | int i; |
1677 | 3.83k | khint_t k; |
1678 | 3.83k | if (!h) return; |
1679 | 3.83k | bcf_hdr_aux_t *aux = get_hdr_aux(h); |
1680 | 3.83k | if (aux->ref_count > 1) // Refs still held, so delay destruction |
1681 | 1.20k | { |
1682 | 1.20k | aux->ref_count &= ~1; |
1683 | 1.20k | return; |
1684 | 1.20k | } |
1685 | 10.5k | for (i = 0; i < 3; ++i) { |
1686 | 7.88k | vdict_t *d = (vdict_t*)h->dict[i]; |
1687 | 7.88k | if (d == 0) continue; |
1688 | 91.5M | for (k = kh_begin(d); k != kh_end(d); ++k) |
1689 | 91.4M | if (kh_exist(d, k)) free((char*)kh_key(d, k)); |
1690 | 7.88k | if ( i==0 ) |
1691 | 2.62k | { |
1692 | 26.2k | for (k=kh_begin(aux->gen); k<kh_end(aux->gen); k++) |
1693 | 23.6k | if ( kh_exist(aux->gen,k) ) free((char*)kh_key(aux->gen,k)); |
1694 | 2.62k | kh_destroy(hdict, aux->gen); |
1695 | 2.62k | free(aux->key_len); // may exist for dict[0] only |
1696 | 2.62k | } |
1697 | 7.88k | kh_destroy(vdict, d); |
1698 | 7.88k | free(h->id[i]); |
1699 | 7.88k | } |
1700 | 87.2k | for (i=0; i<h->nhrec; i++) |
1701 | 84.6k | bcf_hrec_destroy(h->hrec[i]); |
1702 | 2.62k | if (h->nhrec) free(h->hrec); |
1703 | 2.62k | if (h->samples) free(h->samples); |
1704 | 2.62k | free(h->keep_samples); |
1705 | 2.62k | free(h->transl[0]); free(h->transl[1]); |
1706 | 2.62k | free(h->mem.s); |
1707 | 2.62k | free(h); |
1708 | 2.62k | } |
1709 | | |
1710 | | bcf_hdr_t *bcf_hdr_read(htsFile *hfp) |
1711 | 2.62k | { |
1712 | 2.62k | if (hfp->format.format == vcf) |
1713 | 2.46k | return vcf_hdr_read(hfp); |
1714 | 166 | if (hfp->format.format != bcf) { |
1715 | 0 | hts_log_error("Input is not detected as bcf or vcf format"); |
1716 | 0 | return NULL; |
1717 | 0 | } |
1718 | | |
1719 | 166 | assert(hfp->is_bgzf); |
1720 | | |
1721 | 166 | BGZF *fp = hfp->fp.bgzf; |
1722 | 166 | uint8_t magic[5]; |
1723 | 166 | bcf_hdr_t *h; |
1724 | 166 | h = bcf_hdr_init("r"); |
1725 | 166 | if (!h) { |
1726 | 0 | hts_log_error("Failed to allocate bcf header"); |
1727 | 0 | return NULL; |
1728 | 0 | } |
1729 | 166 | if (bgzf_read(fp, magic, 5) != 5) |
1730 | 0 | { |
1731 | 0 | hts_log_error("Failed to read the header (reading BCF in text mode?)"); |
1732 | 0 | bcf_hdr_destroy(h); |
1733 | 0 | return NULL; |
1734 | 0 | } |
1735 | 166 | if (strncmp((char*)magic, "BCF\2\2", 5) != 0) |
1736 | 0 | { |
1737 | 0 | if (!strncmp((char*)magic, "BCF", 3)) |
1738 | 0 | hts_log_error("Invalid BCF2 magic string: only BCFv2.2 is supported"); |
1739 | 0 | else |
1740 | 0 | hts_log_error("Invalid BCF2 magic string"); |
1741 | 0 | bcf_hdr_destroy(h); |
1742 | 0 | return NULL; |
1743 | 0 | } |
1744 | 166 | uint8_t buf[4]; |
1745 | 166 | size_t hlen; |
1746 | 166 | char *htxt = NULL; |
1747 | 166 | if (bgzf_read(fp, buf, 4) != 4) goto fail; |
1748 | 166 | hlen = buf[0] | (buf[1] << 8) | (buf[2] << 16) | ((size_t) buf[3] << 24); |
1749 | 166 | if (hlen >= SIZE_MAX) { errno = ENOMEM; goto fail; } |
1750 | 166 | #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
1751 | 166 | if (hlen > FUZZ_ALLOC_LIMIT/2) { errno = ENOMEM; goto fail; } |
1752 | 166 | #endif |
1753 | 166 | htxt = (char*)malloc(hlen + 1); |
1754 | 166 | if (!htxt) goto fail; |
1755 | 166 | if (bgzf_read(fp, htxt, hlen) != hlen) goto fail; |
1756 | 166 | htxt[hlen] = '\0'; // Ensure htxt is terminated |
1757 | 166 | if ( bcf_hdr_parse(h, htxt) < 0 ) goto fail; |
1758 | 166 | free(htxt); |
1759 | | |
1760 | 166 | bcf_hdr_incr_ref(h); |
1761 | 166 | bgzf_set_private_data(fp, h, hdr_bgzf_private_data_cleanup); |
1762 | | |
1763 | 166 | return h; |
1764 | 0 | fail: |
1765 | 0 | hts_log_error("Failed to read BCF header"); |
1766 | 0 | free(htxt); |
1767 | 0 | bcf_hdr_destroy(h); |
1768 | 0 | return NULL; |
1769 | 166 | } |
1770 | | |
1771 | | int bcf_hdr_write(htsFile *hfp, bcf_hdr_t *h) |
1772 | 2.24k | { |
1773 | 2.24k | if (!h) { |
1774 | 0 | errno = EINVAL; |
1775 | 0 | return -1; |
1776 | 0 | } |
1777 | 2.24k | if ( h->dirty ) { |
1778 | 0 | if (bcf_hdr_sync(h) < 0) return -1; |
1779 | 0 | } |
1780 | 2.24k | hfp->format.category = variant_data; |
1781 | 2.24k | if (hfp->format.format == vcf || hfp->format.format == text_format) { |
1782 | 1.12k | hfp->format.format = vcf; |
1783 | 1.12k | return vcf_hdr_write(hfp, h); |
1784 | 1.12k | } |
1785 | | |
1786 | 1.12k | if (hfp->format.format == binary_format) |
1787 | 1.12k | hfp->format.format = bcf; |
1788 | | |
1789 | 1.12k | kstring_t htxt = {0,0,0}; |
1790 | 1.12k | if (bcf_hdr_format(h, 1, &htxt) < 0) { |
1791 | 0 | free(htxt.s); |
1792 | 0 | return -1; |
1793 | 0 | } |
1794 | 1.12k | kputc('\0', &htxt); // include the \0 byte |
1795 | | |
1796 | 1.12k | BGZF *fp = hfp->fp.bgzf; |
1797 | 1.12k | if ( bgzf_write(fp, "BCF\2\2", 5) !=5 ) return -1; |
1798 | 1.12k | uint8_t hlen[4]; |
1799 | 1.12k | u32_to_le(htxt.l, hlen); |
1800 | 1.12k | if ( bgzf_write(fp, hlen, 4) !=4 ) return -1; |
1801 | 1.12k | if ( bgzf_write(fp, htxt.s, htxt.l) != htxt.l ) return -1; |
1802 | 1.12k | if ( bgzf_flush(fp) < 0) return -1; |
1803 | | |
1804 | 1.12k | bcf_hdr_incr_ref(h); |
1805 | 1.12k | bgzf_set_private_data(fp, h, hdr_bgzf_private_data_cleanup); |
1806 | | |
1807 | 1.12k | free(htxt.s); |
1808 | 1.12k | return 0; |
1809 | 1.12k | } |
1810 | | |
1811 | | /******************** |
1812 | | *** BCF site I/O *** |
1813 | | ********************/ |
1814 | | |
1815 | | bcf1_t *bcf_init(void) |
1816 | 2.24k | { |
1817 | 2.24k | bcf1_t *v; |
1818 | 2.24k | v = (bcf1_t*)calloc(1, sizeof(bcf1_t)); |
1819 | 2.24k | return v; |
1820 | 2.24k | } |
1821 | | |
1822 | | void bcf_clear(bcf1_t *v) |
1823 | 45.1k | { |
1824 | 45.1k | int i; |
1825 | 45.1k | for (i=0; i<v->d.m_info; i++) |
1826 | 0 | { |
1827 | 0 | if ( v->d.info[i].vptr_free ) |
1828 | 0 | { |
1829 | 0 | free(v->d.info[i].vptr - v->d.info[i].vptr_off); |
1830 | 0 | v->d.info[i].vptr_free = 0; |
1831 | 0 | } |
1832 | 0 | } |
1833 | 45.1k | for (i=0; i<v->d.m_fmt; i++) |
1834 | 0 | { |
1835 | 0 | if ( v->d.fmt[i].p_free ) |
1836 | 0 | { |
1837 | 0 | free(v->d.fmt[i].p - v->d.fmt[i].p_off); |
1838 | 0 | v->d.fmt[i].p_free = 0; |
1839 | 0 | } |
1840 | 0 | } |
1841 | 45.1k | v->rid = v->pos = v->rlen = v->unpacked = 0; |
1842 | 45.1k | bcf_float_set_missing(v->qual); |
1843 | 45.1k | v->n_info = v->n_allele = v->n_fmt = v->n_sample = 0; |
1844 | 45.1k | v->shared.l = v->indiv.l = 0; |
1845 | 45.1k | v->d.var_type = -1; |
1846 | 45.1k | v->d.shared_dirty = 0; |
1847 | 45.1k | v->d.indiv_dirty = 0; |
1848 | 45.1k | v->d.n_flt = 0; |
1849 | 45.1k | v->errcode = 0; |
1850 | 45.1k | if (v->d.m_als) v->d.als[0] = 0; |
1851 | 45.1k | if (v->d.m_id) v->d.id[0] = 0; |
1852 | 45.1k | } |
1853 | | |
1854 | | void bcf_empty(bcf1_t *v) |
1855 | 2.24k | { |
1856 | 2.24k | bcf_clear1(v); |
1857 | 2.24k | free(v->d.id); |
1858 | 2.24k | free(v->d.als); |
1859 | 2.24k | free(v->d.allele); free(v->d.flt); free(v->d.info); free(v->d.fmt); |
1860 | 2.24k | if (v->d.var ) free(v->d.var); |
1861 | 2.24k | free(v->shared.s); free(v->indiv.s); |
1862 | 2.24k | memset(&v->d,0,sizeof(v->d)); |
1863 | 2.24k | memset(&v->shared,0,sizeof(v->shared)); |
1864 | 2.24k | memset(&v->indiv,0,sizeof(v->indiv)); |
1865 | 2.24k | } |
1866 | | |
1867 | | void bcf_destroy(bcf1_t *v) |
1868 | 2.24k | { |
1869 | 2.24k | if (!v) return; |
1870 | 2.24k | bcf_empty1(v); |
1871 | 2.24k | free(v); |
1872 | 2.24k | } |
1873 | | |
1874 | | static inline int bcf_read1_core(BGZF *fp, bcf1_t *v) |
1875 | 166 | { |
1876 | 166 | uint8_t x[32]; |
1877 | 166 | ssize_t ret; |
1878 | 166 | uint32_t shared_len, indiv_len; |
1879 | 166 | if ((ret = bgzf_read(fp, x, 32)) != 32) { |
1880 | 0 | if (ret == 0) return -1; |
1881 | 0 | return -2; |
1882 | 0 | } |
1883 | 166 | bcf_clear1(v); |
1884 | 166 | shared_len = le_to_u32(x); |
1885 | 166 | if (shared_len < 24) return -2; |
1886 | 164 | shared_len -= 24; // to exclude six 32-bit integers |
1887 | 164 | indiv_len = le_to_u32(x + 4); |
1888 | 164 | #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
1889 | | // ks_resize() normally allocates 1.5 * requested size to allow for growth |
1890 | 164 | if ((uint64_t) shared_len + indiv_len > FUZZ_ALLOC_LIMIT / 3 * 2) return -2; |
1891 | 160 | #endif |
1892 | 160 | if (ks_resize(&v->shared, shared_len ? shared_len : 1) != 0) return -2; |
1893 | 160 | if (ks_resize(&v->indiv, indiv_len ? indiv_len : 1) != 0) return -2; |
1894 | 160 | v->rid = le_to_i32(x + 8); |
1895 | 160 | v->pos = le_to_u32(x + 12); |
1896 | 160 | if ( v->pos==UINT32_MAX ) v->pos = -1; // this is for telomere coordinate, e.g. MT:0 |
1897 | 160 | v->rlen = le_to_i32(x + 16); |
1898 | 160 | v->qual = le_to_float(x + 20); |
1899 | 160 | v->n_info = le_to_u16(x + 24); |
1900 | 160 | v->n_allele = le_to_u16(x + 26); |
1901 | 160 | v->n_sample = le_to_u32(x + 28) & 0xffffff; |
1902 | 160 | v->n_fmt = x[31]; |
1903 | 160 | v->shared.l = shared_len; |
1904 | 160 | v->indiv.l = indiv_len; |
1905 | | // silent fix of broken BCFs produced by earlier versions of bcf_subset, prior to and including bd6ed8b4 |
1906 | 160 | if ( (!v->indiv.l || !v->n_sample) && v->n_fmt ) v->n_fmt = 0; |
1907 | | |
1908 | 160 | if (bgzf_read(fp, v->shared.s, v->shared.l) != v->shared.l) return -2; |
1909 | 148 | if (bgzf_read(fp, v->indiv.s, v->indiv.l) != v->indiv.l) return -2; |
1910 | 144 | return 0; |
1911 | 148 | } |
1912 | | |
// Helpers for a bit array packed eight bits per array element, with bit i
// stored in element i/8 at bit position i%8.
//   bit_array_size(n)    - (n)/8+1, used as the number of elements for n bits
//   bit_array_set(a,i)   - set bit i of a
//   bit_array_clear(a,i) - clear bit i of a
//   bit_array_test(a,i)  - non-zero if bit i of a is set
#define bit_array_size(n) ((n)/8+1)
#define bit_array_set(a,i) ((a)[(i)/8] |= 1 << ((i)%8))
#define bit_array_clear(a,i) ((a)[(i)/8] &= ~(1 << ((i)%8)))
#define bit_array_test(a,i) ((a)[(i)/8] & (1 << ((i)%8)))
1917 | | |
1918 | | static int bcf_dec_typed_int1_safe(uint8_t *p, uint8_t *end, uint8_t **q, |
1919 | 3.68k | int32_t *val) { |
1920 | 3.68k | uint32_t t; |
1921 | 3.68k | if (end - p < 2) return -1; |
1922 | 3.67k | t = *p++ & 0xf; |
1923 | | /* Use if .. else if ... else instead of switch to force order. Assumption |
1924 | | is that small integers are more frequent than big ones. */ |
1925 | 3.67k | if (t == BCF_BT_INT8) { |
1926 | 1.92k | *val = *(int8_t *) p++; |
1927 | 1.92k | } else { |
1928 | 1.75k | if (end - p < (1<<bcf_type_shift[t])) return -1; |
1929 | 1.74k | if (t == BCF_BT_INT16) { |
1930 | 848 | *val = le_to_i16(p); |
1931 | 848 | p += 2; |
1932 | 898 | } else if (t == BCF_BT_INT32) { |
1933 | 806 | *val = le_to_i32(p); |
1934 | 806 | p += 4; |
1935 | | #ifdef VCF_ALLOW_INT64 |
1936 | | } else if (t == BCF_BT_INT64) { |
1937 | | // This case should never happen because there should be no |
1938 | | // 64-bit BCFs at all, definitely not coming from htslib |
1939 | | *val = le_to_i64(p); |
1940 | | p += 8; |
1941 | | #endif |
1942 | 806 | } else { |
1943 | 92 | return -1; |
1944 | 92 | } |
1945 | 1.74k | } |
1946 | 3.57k | *q = p; |
1947 | 3.57k | return 0; |
1948 | 3.67k | } |
1949 | | |
/*
 * Decode a type/size descriptor from the range [p, end).
 *
 * The low four bits of the first byte give the type.  The high four bits
 * give the element count, unless they are 15, in which case the count
 * follows as a typed integer.
 *
 * Returns 0 on success with *type, *num and *q set; non-zero if the data is
 * too short, the count cannot be decoded, or the count is negative.
 */
static int bcf_dec_size_safe(uint8_t *p, uint8_t *end, uint8_t **q,
                             int *num, int *type) {
    if (p >= end) return -1;

    *type = *p & 0xf;

    if (*p>>4 == 15) {
        // Count did not fit in four bits; it is stored separately
        int status = bcf_dec_typed_int1_safe(p + 1, end, q, num);
        if (status != 0) return status;
        return *num < 0 ? -1 : 0;
    }

    *q = p + 1;
    *num = *p >> 4;
    return 0;
}
1964 | | |
/*
 * Return a printable name for a BCF value type code.  Codes outside the
 * named set map to "unknown".
 */
static const char *get_type_name(int type) {
    switch (type) {
    case 0:  return "null";
    case 1:  return "int (8-bit)";
    case 2:  return "int (16 bit)";
    case 3:  return "int (32 bit)";
    case 5:  return "float";
    case 7:  return "char";
    default: return "unknown";
    }
}
1973 | | |
1974 | | /** |
1975 | | * updatephasing - updates 1st phasing based on other phasing status |
1976 | | * @param p - pointer to phase value array |
1977 | | * @param end - end of array |
1978 | | * @param q - pointer to consumed data |
1979 | | * @param samples - no. of samples in array |
1980 | | * @param ploidy - no. of phasing values per sample |
1981 | | * @param type - value type (one of BCF_BT_...) |
1982 | | * Returns 0 on success and 1 on failure |
1983 | | * Update for haploids made only if it is not unknown (.) |
1984 | | */ |
1985 | | static int updatephasing(uint8_t *p, uint8_t *end, uint8_t **q, int samples, int ploidy, int type) |
1986 | 0 | { |
1987 | 0 | int j, k; |
1988 | 0 | unsigned int inc = 1 << bcf_type_shift[type]; |
1989 | 0 | ptrdiff_t bytes = samples * ploidy * inc; |
1990 | |
|
1991 | 0 | if (samples < 0 || ploidy < 0 || end - p < bytes) |
1992 | 0 | return 1; |
1993 | | |
1994 | | /* |
1995 | | * This works because phasing is stored in the least-significant bit |
1996 | | * of the GT encoding, and the data is always stored little-endian. |
1997 | | * Thus it's possible to get the desired result by doing bit operations |
1998 | | * on the least-significant byte of each value and ignoring the |
1999 | | * higher bytes (for 16-bit and 32-bit values). |
2000 | | */ |
2001 | | |
2002 | 0 | switch (ploidy) { |
2003 | 0 | case 1: |
2004 | | // Trivial case - haploid data is phased by default |
2005 | 0 | for (j = 0; j < samples; ++j) { |
2006 | 0 | if (*p) *p |= 1; //only if not unknown (.) |
2007 | 0 | p += inc; |
2008 | 0 | } |
2009 | 0 | break; |
2010 | 0 | case 2: |
2011 | | // Mostly trivial case - first is phased if second is. |
2012 | 0 | for (j = 0; j < samples; ++j) { |
2013 | 0 | *p |= (p[inc] & 1); |
2014 | 0 | p += 2 * inc; |
2015 | 0 | } |
2016 | 0 | break; |
2017 | 0 | default: |
2018 | | // Generic case - first is phased if all other alleles are. |
2019 | 0 | for (j = 0; j < samples; ++j) { |
2020 | 0 | uint8_t allphased = 1; |
2021 | 0 | for (k = 1; k < ploidy; ++k) |
2022 | 0 | allphased &= (p[inc * k]); |
2023 | 0 | *p |= allphased; |
2024 | 0 | p += ploidy * inc; |
2025 | 0 | } |
2026 | 0 | } |
2027 | 0 | *q = p; |
2028 | 0 | return 0; |
2029 | 0 | } |
2030 | | |
2031 | | static void bcf_record_check_err(const bcf_hdr_t *hdr, bcf1_t *rec, |
2032 | 1.86k | char *type, uint32_t *reports, int i) { |
2033 | 1.86k | if (*reports == 0 || hts_verbose >= HTS_LOG_DEBUG) |
2034 | 52 | hts_log_warning("Bad BCF record at %s:%"PRIhts_pos |
2035 | 1.86k | ": Invalid FORMAT %s %d", |
2036 | 1.86k | bcf_seqname_safe(hdr,rec), rec->pos+1, type, i); |
2037 | 1.86k | (*reports)++; |
2038 | 1.86k | } |
2039 | | |
/*
 * Validate the packed contents of a record that has just been read.
 *
 * The contig id is checked first.  The shared block is then walked in order
 * (ID, REF/ALT alleles, FILTER, INFO), followed by the indiv block (FORMAT
 * fields).  Every typed value is bounds-checked against the end of its
 * block, and ids are checked against the header dictionaries when a header
 * is supplied (hdr may be NULL).
 *
 * Problems that do not prevent further parsing set bits in a local error
 * mask, which is OR-ed into rec->errcode at the end.  Within each section
 * only the first warning is logged unless hts_verbose is at debug level.
 *
 * Returns 0 if the record is valid, -2 otherwise.  When a block is
 * malformed or too short the function returns -2 straight away, without
 * updating rec->errcode.
 */
static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) {
    uint8_t *ptr, *end;
    size_t bytes;
    uint32_t err = 0;
    int type = 0;
    int num = 0;
    uint32_t i, reports;
    // Bit masks indexed by type code, tested with (1 << type) & mask
    const uint32_t is_integer = ((1 << BCF_BT_INT8)  |
                                 (1 << BCF_BT_INT16) |
#ifdef VCF_ALLOW_INT64
                                 (1 << BCF_BT_INT64) |
#endif
                                 (1 << BCF_BT_INT32));
    const uint32_t is_valid_type = (is_integer          |
                                    (1 << BCF_BT_NULL)  |
                                    (1 << BCF_BT_FLOAT) |
                                    (1 << BCF_BT_CHAR));
    int32_t max_id = hdr ? hdr->n[BCF_DT_ID] : 0;
    /* set phasing for 1st allele as in v44 for versions upto v43, to have
    consistent binary values irrespective of version; not run for v >= v44,
    to retain explicit phasing in v44 and higher */
    // idgt is the dictionary id of "GT", or -1 when no fix-up is wanted
    int idgt = hdr ?
                bcf_get_version(hdr, NULL) < VCF44 ?
                    bcf_hdr_id2int(hdr, BCF_DT_ID, "GT") : -1 :
                -1;

    // Check for valid contig ID
    if (rec->rid < 0
        || (hdr && (rec->rid >= hdr->n[BCF_DT_CTG]
                    || hdr->id[BCF_DT_CTG][rec->rid].key == NULL))) {
        hts_log_warning("Bad BCF record at %"PRIhts_pos": Invalid %s id %d", rec->pos+1, "CONTIG", rec->rid);
        err |= BCF_ERR_CTG_INVALID;
    }

    // Check ID
    ptr = (uint8_t *) rec->shared.s;
    end = ptr + rec->shared.l;
    if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
    if (type != BCF_BT_CHAR) {
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "ID", type, get_type_name(type));
        err |= BCF_ERR_TAG_INVALID;
    }
    // Skip over the value: num elements of (1 << shift) bytes each
    bytes = (size_t) num << bcf_type_shift[type];
    if (end - ptr < bytes) goto bad_shared;
    ptr += bytes;

    // Check REF and ALT
    if (rec->n_allele < 1) {
        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": No REF allele",
                        bcf_seqname_safe(hdr,rec), rec->pos+1);
        err |= BCF_ERR_TAG_UNDEF;
    }

    reports = 0;
    for (i = 0; i < rec->n_allele; i++) {
        if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
        if (type != BCF_BT_CHAR) {
            if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "REF/ALT", type, get_type_name(type));
            err |= BCF_ERR_CHAR;
        }
        bytes = (size_t) num << bcf_type_shift[type];
        if (end - ptr < bytes) goto bad_shared;
        ptr += bytes;
    }

    // Check FILTER
    reports = 0;
    if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
    if (num > 0) {
        bytes = (size_t) num << bcf_type_shift[type];
        if (((1 << type) & is_integer) == 0) {
            // Not an integer vector: flag it and skip the data unexamined
            hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "FILTER", type, get_type_name(type));
            err |= BCF_ERR_TAG_INVALID;
            if (end - ptr < bytes) goto bad_shared;
            ptr += bytes;
        } else {
            // The whole vector is bounds-checked before decoding each id
            if (end - ptr < bytes) goto bad_shared;
            for (i = 0; i < num; i++) {
                int32_t key = bcf_dec_int1(ptr, type, &ptr);
                if (key < 0
                    || (hdr && (key >= max_id
                                || hdr->id[BCF_DT_ID][key].key == NULL))) {
                    if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
                        hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s id %d", bcf_seqname_safe(hdr,rec), rec->pos+1, "FILTER", key);
                    err |= BCF_ERR_TAG_UNDEF;
                }
            }
        }
    }

    // Check INFO
    reports = 0;
    bcf_idpair_t *id_tmp = hdr ? hdr->id[BCF_DT_ID] : NULL;
    for (i = 0; i < rec->n_info; i++) {
        // Each entry is a typed-integer key followed by a typed vector
        int32_t key = -1;
        if (bcf_dec_typed_int1_safe(ptr, end, &ptr, &key) != 0) goto bad_shared;
        if (key < 0 || (hdr && (key >= max_id
                                || id_tmp[key].key == NULL))) {
            if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s id %d", bcf_seqname_safe(hdr,rec), rec->pos+1, "INFO", key);
            err |= BCF_ERR_TAG_UNDEF;
        }
        if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
        if (((1 << type) & is_valid_type) == 0
            || (type == BCF_BT_NULL && num > 0)) {
            if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
                hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "INFO", type, get_type_name(type));
            err |= BCF_ERR_TAG_INVALID;
        }
        bytes = (size_t) num << bcf_type_shift[type];
        if (end - ptr < bytes) goto bad_shared;
        ptr += bytes;
    }

    // Check FORMAT and individual information
    ptr = (uint8_t *) rec->indiv.s;
    end = ptr + rec->indiv.l;
    reports = 0;
    for (i = 0; i < rec->n_fmt; i++) {
        int32_t key = -1;
        if (bcf_dec_typed_int1_safe(ptr, end, &ptr, &key) != 0) goto bad_indiv;
        if (key < 0
            || (hdr && (key >= max_id
                        || id_tmp[key].key == NULL))) {
            bcf_record_check_err(hdr, rec, "id", &reports, key);
            err |= BCF_ERR_TAG_UNDEF;
        }
        if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_indiv;
        if (((1 << type) & is_valid_type) == 0
            || (type == BCF_BT_NULL && num > 0)) {
            bcf_record_check_err(hdr, rec, "type", &reports, type);
            err |= BCF_ERR_TAG_INVALID;
        }
        if (idgt >= 0 && idgt == key) {
            // check first GT phasing bit and fix up if necessary
            // (updatephasing() also advances ptr past the GT data on success)
            if (updatephasing(ptr, end, &ptr, rec->n_sample, num, type)) {
                err |= BCF_ERR_TAG_INVALID;
            }
        } else {
            // num values per sample, for every sample
            bytes = ((size_t) num << bcf_type_shift[type]) * rec->n_sample;
            if (end - ptr < bytes) goto bad_indiv;
            ptr += bytes;
        }
    }

    if (!err && rec->rlen < 0) {
        // Treat bad rlen as a warning instead of an error, and try to
        // fix up by using the length of the stored REF allele.
        static int warned = 0;
        if (!warned) {
            hts_log_warning("BCF record at %s:%"PRIhts_pos" has invalid RLEN (%"PRIhts_pos"). "
                            "Only one invalid RLEN will be reported.",
                            bcf_seqname_safe(hdr,rec), rec->pos+1, rec->rlen);
            warned = 1;
        }
        //find rlen considering reflen, END, SVLEN, fmt LEN
        hts_pos_t len = get_rlen(hdr, rec);
        rec->rlen = len >= 0 ? len : 0;
    }

    rec->errcode |= err;

    return err ? -2 : 0; // Return -2 so bcf_read() reports an error

 bad_shared:
    hts_log_error("Bad BCF record at %s:%"PRIhts_pos" - shared section malformed or too short", bcf_seqname_safe(hdr,rec), rec->pos+1);
    return -2;

 bad_indiv:
    hts_log_error("Bad BCF record at %s:%"PRIhts_pos" - individuals section malformed or too short", bcf_seqname_safe(hdr,rec), rec->pos+1);
    return -2;
}
2213 | | |
2214 | | static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt); |
/*
 * Reduce the per-sample (FORMAT) data of a record to the samples selected
 * in the header's keep_samples bit array.
 *
 * Does nothing if the header has no keep_samples array.  If the header has
 * no samples, the indiv block is simply emptied.  Otherwise each FORMAT
 * field is unpacked and the values of the kept samples are moved together
 * in place inside rec->indiv, with rec->indiv.l and each field's p_len
 * shrunk to match.
 *
 * Always returns 0.
 */
int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec)
{
    if ( !hdr->keep_samples ) return 0;
    if ( !bcf_hdr_nsamples(hdr) )
    {
        rec->indiv.l = rec->n_sample = 0;
        return 0;
    }

    int i, j;
    uint8_t *ptr = (uint8_t*)rec->indiv.s, *dst = NULL, *src;
    bcf_dec_t *dec = &rec->d;
    hts_expand(bcf_fmt_t, rec->n_fmt, dec->m_fmt, dec->fmt);
    for (i=0; i<dec->m_fmt; ++i) dec->fmt[i].p_free = 0;

    for (i=0; i<rec->n_fmt; i++)
    {
        ptr = bcf_unpack_fmt_core1(ptr, rec->n_sample, &dec->fmt[i]);
        // Start one element early: src is advanced before each use below
        src = dec->fmt[i].p - dec->fmt[i].size;
        if ( dst )
        {
            // Close the gap left by shrinking the previous field: move the
            // p_off bytes that precede this field's values (presumably its
            // key/type descriptor - confirm) to directly after the previous
            // field's remaining data, and re-point p accordingly.
            memmove(dec->fmt[i-1].p + dec->fmt[i-1].p_len, dec->fmt[i].p - dec->fmt[i].p_off, dec->fmt[i].p_off);
            dec->fmt[i].p = dec->fmt[i-1].p + dec->fmt[i-1].p_len + dec->fmt[i].p_off;
        }
        dst = dec->fmt[i].p;
        // Copy the values of each kept sample down to the write position
        for (j=0; j<hdr->nsamples_ori; j++)
        {
            src += dec->fmt[i].size;
            if ( !bit_array_test(hdr->keep_samples,j) ) continue;
            memmove(dst, src, dec->fmt[i].size);
            dst += dec->fmt[i].size;
        }
        // Shrink the block and the field by the number of bytes dropped
        rec->indiv.l -= dec->fmt[i].p_len - (dst - dec->fmt[i].p);
        dec->fmt[i].p_len = dst - dec->fmt[i].p;
    }
    rec->unpacked |= BCF_UN_FMT;

    rec->n_sample = bcf_hdr_nsamples(hdr);
    return 0;
}
2255 | | |
2256 | | int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) |
2257 | 43.2k | { |
2258 | 43.2k | if (fp->format.format == vcf) return vcf_read(fp, h, v); |
2259 | 166 | if (!h) |
2260 | 0 | h = (const bcf_hdr_t *) bgzf_get_private_data(fp->fp.bgzf); |
2261 | 166 | int ret = bcf_read1_core(fp->fp.bgzf, v); |
2262 | 166 | if (ret == 0) ret = bcf_record_check(h, v); |
2263 | 166 | if ( ret!=0 || !h->keep_samples ) return ret; |
2264 | 0 | return bcf_subset_format(h,v); |
2265 | 166 | } |
2266 | | |
2267 | | int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, hts_pos_t *beg, hts_pos_t *end) |
2268 | 0 | { |
2269 | 0 | bcf1_t *v = (bcf1_t *) vv; |
2270 | 0 | const bcf_hdr_t *hdr = (const bcf_hdr_t *) bgzf_get_private_data(fp); |
2271 | 0 | int ret = bcf_read1_core(fp, v); |
2272 | 0 | if (ret == 0) ret = bcf_record_check(hdr, v); |
2273 | 0 | if (ret >= 0) |
2274 | 0 | *tid = v->rid, *beg = v->pos, *end = v->pos + v->rlen; |
2275 | 0 | return ret; |
2276 | 0 | } |
2277 | | |
2278 | | static inline int bcf1_sync_id(bcf1_t *line, kstring_t *str) |
2279 | 0 | { |
2280 | | // single typed string |
2281 | 0 | if ( line->d.id && strcmp(line->d.id, ".") ) { |
2282 | 0 | return bcf_enc_vchar(str, strlen(line->d.id), line->d.id); |
2283 | 0 | } else { |
2284 | 0 | return bcf_enc_size(str, 0, BCF_BT_CHAR); |
2285 | 0 | } |
2286 | 0 | } |
2287 | | static inline int bcf1_sync_alleles(bcf1_t *line, kstring_t *str) |
2288 | 0 | { |
2289 | | // list of typed strings |
2290 | 0 | int i; |
2291 | 0 | for (i=0; i<line->n_allele; i++) { |
2292 | 0 | if (bcf_enc_vchar(str, strlen(line->d.allele[i]), line->d.allele[i]) < 0) |
2293 | 0 | return -1; |
2294 | 0 | } |
2295 | 0 | if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]); |
2296 | 0 | return 0; |
2297 | 0 | } |
2298 | | static inline int bcf1_sync_filter(bcf1_t *line, kstring_t *str) |
2299 | 0 | { |
2300 | | // typed vector of integers |
2301 | 0 | if ( line->d.n_flt ) { |
2302 | 0 | return bcf_enc_vint(str, line->d.n_flt, line->d.flt, -1); |
2303 | 0 | } else { |
2304 | 0 | return bcf_enc_vint(str, 0, 0, -1); |
2305 | 0 | } |
2306 | 0 | } |
2307 | | |
2308 | | static inline int bcf1_sync_info(bcf1_t *line, kstring_t *str) |
2309 | 0 | { |
2310 | | // pairs of typed vectors |
2311 | 0 | int i, irm = -1, e = 0; |
2312 | 0 | for (i=0; i<line->n_info; i++) |
2313 | 0 | { |
2314 | 0 | bcf_info_t *info = &line->d.info[i]; |
2315 | 0 | if ( !info->vptr ) |
2316 | 0 | { |
2317 | | // marked for removal |
2318 | 0 | if ( irm < 0 ) irm = i; |
2319 | 0 | continue; |
2320 | 0 | } |
2321 | 0 | e |= kputsn_(info->vptr - info->vptr_off, info->vptr_len + info->vptr_off, str) < 0; |
2322 | 0 | if ( irm >=0 ) |
2323 | 0 | { |
2324 | 0 | bcf_info_t tmp = line->d.info[irm]; line->d.info[irm] = line->d.info[i]; line->d.info[i] = tmp; |
2325 | 0 | while ( irm<=i && line->d.info[irm].vptr ) irm++; |
2326 | 0 | } |
2327 | 0 | } |
2328 | 0 | if ( irm>=0 ) line->n_info = irm; |
2329 | 0 | return e == 0 ? 0 : -1; |
2330 | 0 | } |
2331 | | |
2332 | | static int bcf1_sync(bcf1_t *line) |
2333 | 0 | { |
2334 | 0 | char *shared_ori = line->shared.s; |
2335 | 0 | size_t prev_len; |
2336 | |
|
2337 | 0 | kstring_t tmp = {0,0,0}; |
2338 | 0 | if ( !line->shared.l ) |
2339 | 0 | { |
2340 | | // New line created via API, BCF data blocks do not exist. Get it ready for BCF output |
2341 | 0 | tmp = line->shared; |
2342 | 0 | bcf1_sync_id(line, &tmp); |
2343 | 0 | line->unpack_size[0] = tmp.l; prev_len = tmp.l; |
2344 | |
|
2345 | 0 | bcf1_sync_alleles(line, &tmp); |
2346 | 0 | line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l; |
2347 | |
|
2348 | 0 | bcf1_sync_filter(line, &tmp); |
2349 | 0 | line->unpack_size[2] = tmp.l - prev_len; |
2350 | |
|
2351 | 0 | bcf1_sync_info(line, &tmp); |
2352 | 0 | line->shared = tmp; |
2353 | 0 | } |
2354 | 0 | else if ( line->d.shared_dirty ) |
2355 | 0 | { |
2356 | | // The line was edited, update the BCF data block. |
2357 | |
|
2358 | 0 | if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line,BCF_UN_STR); |
2359 | | |
2360 | | // ptr_ori points to the original unchanged BCF data. |
2361 | 0 | uint8_t *ptr_ori = (uint8_t *) line->shared.s; |
2362 | | |
2363 | | // ID: single typed string |
2364 | 0 | if ( line->d.shared_dirty & BCF1_DIRTY_ID ) |
2365 | 0 | bcf1_sync_id(line, &tmp); |
2366 | 0 | else |
2367 | 0 | kputsn_(ptr_ori, line->unpack_size[0], &tmp); |
2368 | 0 | ptr_ori += line->unpack_size[0]; |
2369 | 0 | line->unpack_size[0] = tmp.l; prev_len = tmp.l; |
2370 | | |
2371 | | // REF+ALT: list of typed strings |
2372 | 0 | if ( line->d.shared_dirty & BCF1_DIRTY_ALS ) |
2373 | 0 | bcf1_sync_alleles(line, &tmp); |
2374 | 0 | else |
2375 | 0 | { |
2376 | 0 | kputsn_(ptr_ori, line->unpack_size[1], &tmp); |
2377 | 0 | if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]); |
2378 | 0 | } |
2379 | 0 | ptr_ori += line->unpack_size[1]; |
2380 | 0 | line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l; |
2381 | |
|
2382 | 0 | if ( line->unpacked & BCF_UN_FLT ) |
2383 | 0 | { |
2384 | | // FILTER: typed vector of integers |
2385 | 0 | if ( line->d.shared_dirty & BCF1_DIRTY_FLT ) |
2386 | 0 | bcf1_sync_filter(line, &tmp); |
2387 | 0 | else if ( line->d.n_flt ) |
2388 | 0 | kputsn_(ptr_ori, line->unpack_size[2], &tmp); |
2389 | 0 | else |
2390 | 0 | bcf_enc_vint(&tmp, 0, 0, -1); |
2391 | 0 | ptr_ori += line->unpack_size[2]; |
2392 | 0 | line->unpack_size[2] = tmp.l - prev_len; |
2393 | |
|
2394 | 0 | if ( line->unpacked & BCF_UN_INFO ) |
2395 | 0 | { |
2396 | | // INFO: pairs of typed vectors |
2397 | 0 | if ( line->d.shared_dirty & BCF1_DIRTY_INF ) |
2398 | 0 | { |
2399 | 0 | bcf1_sync_info(line, &tmp); |
2400 | 0 | ptr_ori = (uint8_t*)line->shared.s + line->shared.l; |
2401 | 0 | } |
2402 | 0 | } |
2403 | 0 | } |
2404 | |
|
2405 | 0 | int size = line->shared.l - (size_t)ptr_ori + (size_t)line->shared.s; |
2406 | 0 | if ( size ) kputsn_(ptr_ori, size, &tmp); |
2407 | |
|
2408 | 0 | free(line->shared.s); |
2409 | 0 | line->shared = tmp; |
2410 | 0 | } |
2411 | 0 | if ( line->shared.s != shared_ori && line->unpacked & BCF_UN_INFO ) |
2412 | 0 | { |
2413 | | // Reallocated line->shared.s block invalidated line->d.info[].vptr pointers |
2414 | 0 | size_t off_new = line->unpack_size[0] + line->unpack_size[1] + line->unpack_size[2]; |
2415 | 0 | int i; |
2416 | 0 | for (i=0; i<line->n_info; i++) |
2417 | 0 | { |
2418 | 0 | uint8_t *vptr_free = line->d.info[i].vptr_free ? line->d.info[i].vptr - line->d.info[i].vptr_off : NULL; |
2419 | 0 | line->d.info[i].vptr = (uint8_t*) line->shared.s + off_new + line->d.info[i].vptr_off; |
2420 | 0 | off_new += line->d.info[i].vptr_len + line->d.info[i].vptr_off; |
2421 | 0 | if ( vptr_free ) |
2422 | 0 | { |
2423 | 0 | free(vptr_free); |
2424 | 0 | line->d.info[i].vptr_free = 0; |
2425 | 0 | } |
2426 | 0 | } |
2427 | 0 | } |
2428 | |
|
2429 | 0 | if ( line->n_sample && line->n_fmt && (!line->indiv.l || line->d.indiv_dirty) ) |
2430 | 0 | { |
2431 | | // The genotype fields changed or are not present |
2432 | 0 | tmp.l = tmp.m = 0; tmp.s = NULL; |
2433 | 0 | int i, irm = -1; |
2434 | 0 | for (i=0; i<line->n_fmt; i++) |
2435 | 0 | { |
2436 | 0 | bcf_fmt_t *fmt = &line->d.fmt[i]; |
2437 | 0 | if ( !fmt->p ) |
2438 | 0 | { |
2439 | | // marked for removal |
2440 | 0 | if ( irm < 0 ) irm = i; |
2441 | 0 | continue; |
2442 | 0 | } |
2443 | 0 | kputsn_(fmt->p - fmt->p_off, fmt->p_len + fmt->p_off, &tmp); |
2444 | 0 | if ( irm >=0 ) |
2445 | 0 | { |
2446 | 0 | bcf_fmt_t tfmt = line->d.fmt[irm]; line->d.fmt[irm] = line->d.fmt[i]; line->d.fmt[i] = tfmt; |
2447 | 0 | while ( irm<=i && line->d.fmt[irm].p ) irm++; |
2448 | 0 | } |
2449 | |
|
2450 | 0 | } |
2451 | 0 | if ( irm>=0 ) line->n_fmt = irm; |
2452 | 0 | free(line->indiv.s); |
2453 | 0 | line->indiv = tmp; |
2454 | | |
2455 | | // Reallocated line->indiv.s block invalidated line->d.fmt[].p pointers |
2456 | 0 | size_t off_new = 0; |
2457 | 0 | for (i=0; i<line->n_fmt; i++) |
2458 | 0 | { |
2459 | 0 | uint8_t *p_free = line->d.fmt[i].p_free ? line->d.fmt[i].p - line->d.fmt[i].p_off : NULL; |
2460 | 0 | line->d.fmt[i].p = (uint8_t*) line->indiv.s + off_new + line->d.fmt[i].p_off; |
2461 | 0 | off_new += line->d.fmt[i].p_len + line->d.fmt[i].p_off; |
2462 | 0 | if ( p_free ) |
2463 | 0 | { |
2464 | 0 | free(p_free); |
2465 | 0 | line->d.fmt[i].p_free = 0; |
2466 | 0 | } |
2467 | 0 | } |
2468 | 0 | } |
2469 | 0 | if ( !line->n_sample ) line->n_fmt = 0; |
2470 | 0 | line->d.shared_dirty = line->d.indiv_dirty = 0; |
2471 | 0 | return 0; |
2472 | 0 | } |
2473 | | |
2474 | | bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src) |
2475 | 0 | { |
2476 | 0 | bcf1_sync(src); |
2477 | |
|
2478 | 0 | bcf_clear(dst); |
2479 | 0 | dst->rid = src->rid; |
2480 | 0 | dst->pos = src->pos; |
2481 | 0 | dst->rlen = src->rlen; |
2482 | 0 | dst->qual = src->qual; |
2483 | 0 | dst->n_info = src->n_info; dst->n_allele = src->n_allele; |
2484 | 0 | dst->n_fmt = src->n_fmt; dst->n_sample = src->n_sample; |
2485 | |
|
2486 | 0 | if ( dst->shared.m < src->shared.l ) |
2487 | 0 | { |
2488 | 0 | dst->shared.s = (char*) realloc(dst->shared.s, src->shared.l); |
2489 | 0 | dst->shared.m = src->shared.l; |
2490 | 0 | } |
2491 | 0 | dst->shared.l = src->shared.l; |
2492 | 0 | memcpy(dst->shared.s,src->shared.s,dst->shared.l); |
2493 | |
|
2494 | 0 | if ( dst->indiv.m < src->indiv.l ) |
2495 | 0 | { |
2496 | 0 | dst->indiv.s = (char*) realloc(dst->indiv.s, src->indiv.l); |
2497 | 0 | dst->indiv.m = src->indiv.l; |
2498 | 0 | } |
2499 | 0 | dst->indiv.l = src->indiv.l; |
2500 | 0 | memcpy(dst->indiv.s,src->indiv.s,dst->indiv.l); |
2501 | |
|
2502 | 0 | return dst; |
2503 | 0 | } |
2504 | | bcf1_t *bcf_dup(bcf1_t *src) |
2505 | 0 | { |
2506 | 0 | bcf1_t *out = bcf_init1(); |
2507 | 0 | return bcf_copy(out, src); |
2508 | 0 | } |
2509 | | |
2510 | | int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v) |
2511 | 42.0k | { |
2512 | 42.0k | if ( h->dirty ) { |
2513 | 0 | if (bcf_hdr_sync(h) < 0) return -1; |
2514 | 0 | } |
2515 | 42.0k | if ( bcf_hdr_nsamples(h)!=v->n_sample ) |
2516 | 25 | { |
2517 | 25 | hts_log_error("Broken VCF record, the number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)", |
2518 | 25 | bcf_seqname_safe(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h)); |
2519 | 25 | return -1; |
2520 | 25 | } |
2521 | | |
2522 | 41.9k | if ( hfp->format.format == vcf || hfp->format.format == text_format ) |
2523 | 41.0k | return vcf_write(hfp,h,v); |
2524 | | |
2525 | 968 | if ( v->errcode & ~BCF_ERR_LIMITS ) // todo: unsure about the other BCF_ERR_LIMITS branches in vcf_parse_format_alloc4() |
2526 | 968 | { |
2527 | | // vcf_parse1() encountered a new contig or tag, undeclared in the |
2528 | | // header. At this point, the header must have been printed, |
2529 | | // proceeding would lead to a broken BCF file. Errors must be checked |
2530 | | // and cleared by the caller before we can proceed. |
2531 | 968 | char errdescription[1024] = ""; |
2532 | 968 | hts_log_error("Unchecked error (%d %s) at %s:%"PRIhts_pos, v->errcode, bcf_strerror(v->errcode, errdescription, sizeof(errdescription)), bcf_seqname_safe(h,v), v->pos+1); |
2533 | 968 | return -1; |
2534 | 968 | } |
2535 | 0 | bcf1_sync(v); // check if the BCF record was modified |
2536 | |
|
2537 | 0 | if ( v->unpacked & BCF_IS_64BIT ) |
2538 | 0 | { |
2539 | 0 | hts_log_error("Data at %s:%"PRIhts_pos" contains 64-bit values not representable in BCF. Please use VCF instead", bcf_seqname_safe(h,v), v->pos+1); |
2540 | 0 | return -1; |
2541 | 0 | } |
2542 | | |
2543 | 0 | BGZF *fp = hfp->fp.bgzf; |
2544 | 0 | uint8_t x[32]; |
2545 | 0 | u32_to_le(v->shared.l + 24, x); // to include six 32-bit integers |
2546 | 0 | u32_to_le(v->indiv.l, x + 4); |
2547 | 0 | i32_to_le(v->rid, x + 8); |
2548 | 0 | u32_to_le(v->pos, x + 12); |
2549 | 0 | u32_to_le(v->rlen, x + 16); |
2550 | 0 | float_to_le(v->qual, x + 20); |
2551 | 0 | u16_to_le(v->n_info, x + 24); |
2552 | 0 | u16_to_le(v->n_allele, x + 26); |
2553 | 0 | u32_to_le((uint32_t)v->n_fmt<<24 | (v->n_sample & 0xffffff), x + 28); |
2554 | 0 | if ( bgzf_write(fp, x, 32) != 32 ) return -1; |
2555 | 0 | if ( bgzf_write(fp, v->shared.s, v->shared.l) != v->shared.l ) return -1; |
2556 | 0 | if ( bgzf_write(fp, v->indiv.s, v->indiv.l) != v->indiv.l ) return -1; |
2557 | | |
2558 | 0 | if (hfp->idx) { |
2559 | 0 | if (bgzf_idx_push(fp, hfp->idx, v->rid, v->pos, v->pos + v->rlen, |
2560 | 0 | bgzf_tell(fp), 1) < 0) |
2561 | 0 | return -1; |
2562 | 0 | } |
2563 | | |
2564 | 0 | return 0; |
2565 | 0 | } |
2566 | | |
2567 | | /********************** |
2568 | | *** VCF header I/O *** |
2569 | | **********************/ |
2570 | | |
2571 | 0 | static int add_missing_contig_hrec(bcf_hdr_t *h, const char *name) { |
2572 | 0 | bcf_hrec_t *hrec = calloc(1, sizeof(bcf_hrec_t)); |
2573 | 0 | int save_errno; |
2574 | 0 | if (!hrec) goto fail; |
2575 | | |
2576 | 0 | hrec->key = strdup("contig"); |
2577 | 0 | if (!hrec->key) goto fail; |
2578 | | |
2579 | 0 | if (bcf_hrec_add_key(hrec, "ID", strlen("ID")) < 0) goto fail; |
2580 | 0 | if (bcf_hrec_set_val(hrec, hrec->nkeys-1, name, strlen(name), 0) < 0) |
2581 | 0 | goto fail; |
2582 | 0 | if (bcf_hdr_add_hrec(h, hrec) < 0) |
2583 | 0 | goto fail; |
2584 | 0 | return 0; |
2585 | | |
2586 | 0 | fail: |
2587 | 0 | save_errno = errno; |
2588 | 0 | hts_log_error("%s", strerror(errno)); |
2589 | 0 | if (hrec) bcf_hrec_destroy(hrec); |
2590 | 0 | errno = save_errno; |
2591 | 0 | return -1; |
2592 | 0 | } |
2593 | | |
2594 | | bcf_hdr_t *vcf_hdr_read(htsFile *fp) |
2595 | 2.46k | { |
2596 | 2.46k | kstring_t txt, *s = &fp->line; |
2597 | 2.46k | int ret; |
2598 | 2.46k | bcf_hdr_t *h; |
2599 | 2.46k | tbx_t *idx = NULL; |
2600 | 2.46k | const char **names = NULL; |
2601 | 2.46k | h = bcf_hdr_init("r"); |
2602 | 2.46k | if (!h) { |
2603 | 0 | hts_log_error("Failed to allocate bcf header"); |
2604 | 0 | return NULL; |
2605 | 0 | } |
2606 | 2.46k | txt.l = txt.m = 0; txt.s = 0; |
2607 | 131k | while ((ret = hts_getline(fp, KS_SEP_LINE, s)) >= 0) { |
2608 | 131k | int e = 0; |
2609 | 131k | if (s->l == 0) continue; |
2610 | 117k | if (s->s[0] != '#') { |
2611 | 16 | hts_log_error("No sample line"); |
2612 | 16 | goto error; |
2613 | 16 | } |
2614 | 116k | if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here |
2615 | 0 | kstring_t tmp = { 0, 0, NULL }; |
2616 | 0 | hFILE *f = hopen(fp->fn_aux, "r"); |
2617 | 0 | if (f == NULL) { |
2618 | 0 | hts_log_error("Couldn't open \"%s\"", fp->fn_aux); |
2619 | 0 | goto error; |
2620 | 0 | } |
2621 | 0 | while (tmp.l = 0, kgetline(&tmp, (kgets_func *) hgets, f) >= 0) { |
2622 | 0 | char *tab = strchr(tmp.s, '\t'); |
2623 | 0 | if (tab == NULL) continue; |
2624 | 0 | e |= (kputs("##contig=<ID=", &txt) < 0); |
2625 | 0 | e |= (kputsn(tmp.s, tab - tmp.s, &txt) < 0); |
2626 | 0 | e |= (kputs(",length=", &txt) < 0); |
2627 | 0 | e |= (kputl(atol(tab), &txt) < 0); |
2628 | 0 | e |= (kputsn(">\n", 2, &txt) < 0); |
2629 | 0 | } |
2630 | 0 | free(tmp.s); |
2631 | 0 | if (hclose(f) != 0) { |
2632 | 0 | hts_log_error("Error on closing %s", fp->fn_aux); |
2633 | 0 | goto error; |
2634 | 0 | } |
2635 | 0 | if (e) goto error; |
2636 | 0 | } |
2637 | 116k | if (kputsn(s->s, s->l, &txt) < 0) goto error; |
2638 | 116k | if (kputc('\n', &txt) < 0) goto error; |
2639 | 116k | if (s->s[1] != '#') break; |
2640 | 116k | } |
2641 | 2.44k | if ( ret < -1 ) goto error; |
2642 | 2.43k | if ( !txt.s ) |
2643 | 0 | { |
2644 | 0 | hts_log_error("Could not read the header"); |
2645 | 0 | goto error; |
2646 | 0 | } |
2647 | 2.43k | if ( bcf_hdr_parse(h, txt.s) < 0 ) goto error; |
2648 | | |
2649 | | // check tabix index, are all contigs listed in the header? add the missing ones |
2650 | 2.08k | idx = tbx_index_load3(fp->fn, NULL, HTS_IDX_SILENT_FAIL); |
2651 | 2.08k | if ( idx ) |
2652 | 0 | { |
2653 | 0 | int i, n, need_sync = 0; |
2654 | 0 | names = tbx_seqnames(idx, &n); |
2655 | 0 | if (!names) goto error; |
2656 | 0 | for (i=0; i<n; i++) |
2657 | 0 | { |
2658 | 0 | bcf_hrec_t *hrec = bcf_hdr_get_hrec(h, BCF_HL_CTG, "ID", (char*) names[i], NULL); |
2659 | 0 | if ( hrec ) continue; |
2660 | 0 | if (add_missing_contig_hrec(h, names[i]) < 0) goto error; |
2661 | 0 | need_sync = 1; |
2662 | 0 | } |
2663 | 0 | if ( need_sync ) { |
2664 | 0 | if (bcf_hdr_sync(h) < 0) goto error; |
2665 | 0 | } |
2666 | 0 | free(names); |
2667 | 0 | tbx_destroy(idx); |
2668 | 0 | } |
2669 | 2.08k | free(txt.s); |
2670 | 2.08k | return h; |
2671 | | |
2672 | 380 | error: |
2673 | 380 | if (idx) tbx_destroy(idx); |
2674 | 380 | free(names); |
2675 | 380 | free(txt.s); |
2676 | 380 | if (h) bcf_hdr_destroy(h); |
2677 | 380 | return NULL; |
2678 | 2.08k | } |
2679 | | |
2680 | | int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname) |
2681 | 0 | { |
2682 | 0 | int i = 0, n = 0, save_errno; |
2683 | 0 | char **lines = hts_readlines(fname, &n); |
2684 | 0 | if ( !lines ) return 1; |
2685 | 0 | for (i=0; i<n-1; i++) |
2686 | 0 | { |
2687 | 0 | int k; |
2688 | 0 | bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,lines[i],&k); |
2689 | 0 | if (!hrec) goto fail; |
2690 | 0 | if (bcf_hdr_add_hrec(hdr, hrec) < 0) { |
2691 | 0 | bcf_hrec_destroy(hrec); |
2692 | 0 | goto fail; |
2693 | 0 | } |
2694 | 0 | free(lines[i]); |
2695 | 0 | lines[i] = NULL; |
2696 | 0 | } |
2697 | 0 | if (bcf_hdr_parse_sample_line(hdr, lines[n-1]) < 0) goto fail; |
2698 | 0 | if (bcf_hdr_sync(hdr) < 0) goto fail; |
2699 | 0 | free(lines[n-1]); |
2700 | 0 | free(lines); |
2701 | 0 | return 0; |
2702 | | |
2703 | 0 | fail: |
2704 | 0 | save_errno = errno; |
2705 | 0 | for (; i < n; i++) |
2706 | 0 | free(lines[i]); |
2707 | 0 | free(lines); |
2708 | 0 | errno = save_errno; |
2709 | 0 | return 1; |
2710 | 0 | } |
2711 | | |
2712 | | static int _bcf_hrec_format(const bcf_hrec_t *hrec, int is_bcf, kstring_t *str) |
2713 | 13.4k | { |
2714 | 13.4k | uint32_t e = 0; |
2715 | 13.4k | if ( !hrec->value ) |
2716 | 7.61k | { |
2717 | 7.61k | int j, nout = 0; |
2718 | 7.61k | e |= ksprintf(str, "##%s=<", hrec->key) < 0; |
2719 | 26.4k | for (j=0; j<hrec->nkeys; j++) |
2720 | 18.8k | { |
2721 | | // do not output IDX if output is VCF |
2722 | 18.8k | if ( !is_bcf && !strcmp("IDX",hrec->keys[j]) ) continue; |
2723 | 16.3k | if ( nout ) e |= kputc(',',str) < 0; |
2724 | 16.3k | e |= ksprintf(str,"%s=%s", hrec->keys[j], hrec->vals[j]) < 0; |
2725 | 16.3k | nout++; |
2726 | 16.3k | } |
2727 | 7.61k | e |= ksprintf(str,">\n") < 0; |
2728 | 7.61k | } |
2729 | 5.87k | else |
2730 | 5.87k | e |= ksprintf(str,"##%s=%s\n", hrec->key,hrec->value) < 0; |
2731 | | |
2732 | 13.4k | return e == 0 ? 0 : -1; |
2733 | 13.4k | } |
2734 | | |
2735 | | int bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str) |
2736 | 0 | { |
2737 | 0 | return _bcf_hrec_format(hrec,0,str); |
2738 | 0 | } |
2739 | | |
2740 | | int bcf_hdr_format(const bcf_hdr_t *hdr, int is_bcf, kstring_t *str) |
2741 | 2.24k | { |
2742 | 2.24k | int i, r = 0; |
2743 | 15.7k | for (i=0; i<hdr->nhrec; i++) |
2744 | 13.4k | r |= _bcf_hrec_format(hdr->hrec[i], is_bcf, str) < 0; |
2745 | | |
2746 | 2.24k | r |= ksprintf(str, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO") < 0; |
2747 | 2.24k | if ( bcf_hdr_nsamples(hdr) ) |
2748 | 594 | { |
2749 | 594 | r |= ksprintf(str, "\tFORMAT") < 0; |
2750 | 5.23k | for (i=0; i<bcf_hdr_nsamples(hdr); i++) |
2751 | 4.64k | r |= ksprintf(str, "\t%s", hdr->samples[i]) < 0; |
2752 | 594 | } |
2753 | 2.24k | r |= ksprintf(str, "\n") < 0; |
2754 | | |
2755 | 2.24k | return r ? -1 : 0; |
2756 | 2.24k | } |
2757 | | |
2758 | | char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len) |
2759 | 0 | { |
2760 | 0 | kstring_t txt = {0,0,0}; |
2761 | 0 | if (bcf_hdr_format(hdr, is_bcf, &txt) < 0) |
2762 | 0 | return NULL; |
2763 | 0 | if ( len ) *len = txt.l; |
2764 | 0 | return txt.s; |
2765 | 0 | } |
2766 | | |
2767 | | const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *n) |
2768 | 0 | { |
2769 | 0 | vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG]; |
2770 | 0 | int i, tid, m = kh_size(d); |
2771 | 0 | const char **names = (const char**) calloc(m,sizeof(const char*)); |
2772 | 0 | if ( !names ) |
2773 | 0 | { |
2774 | 0 | hts_log_error("Failed to allocate memory"); |
2775 | 0 | *n = 0; |
2776 | 0 | return NULL; |
2777 | 0 | } |
2778 | 0 | khint_t k; |
2779 | 0 | for (k=kh_begin(d); k<kh_end(d); k++) |
2780 | 0 | { |
2781 | 0 | if ( !kh_exist(d,k) ) continue; |
2782 | 0 | if ( !kh_val(d, k).hrec[0] ) continue; // removed via bcf_hdr_remove |
2783 | 0 | tid = kh_val(d,k).id; |
2784 | 0 | if ( tid >= m ) |
2785 | 0 | { |
2786 | | // This can happen after a contig has been removed from BCF header via bcf_hdr_remove() |
2787 | 0 | if ( hts_resize(const char*, tid + 1, &m, &names, HTS_RESIZE_CLEAR)<0 ) |
2788 | 0 | { |
2789 | 0 | hts_log_error("Failed to allocate memory"); |
2790 | 0 | *n = 0; |
2791 | 0 | free(names); |
2792 | 0 | return NULL; |
2793 | 0 | } |
2794 | 0 | m = tid + 1; |
2795 | 0 | } |
2796 | 0 | names[tid] = kh_key(d,k); |
2797 | 0 | } |
2798 | | // ensure there are no gaps |
2799 | 0 | for (i=0,tid=0; tid<m; i++,tid++) |
2800 | 0 | { |
2801 | 0 | while ( tid<m && !names[tid] ) tid++; |
2802 | 0 | if ( tid==m ) break; |
2803 | 0 | if ( i==tid ) continue; |
2804 | 0 | names[i] = names[tid]; |
2805 | 0 | names[tid] = 0; |
2806 | 0 | } |
2807 | 0 | *n = i; |
2808 | 0 | return names; |
2809 | 0 | } |
2810 | | |
2811 | | int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h) |
2812 | 1.12k | { |
2813 | 1.12k | kstring_t htxt = {0,0,0}; |
2814 | 1.12k | if (bcf_hdr_format(h, 0, &htxt) < 0) { |
2815 | 0 | free(htxt.s); |
2816 | 0 | return -1; |
2817 | 0 | } |
2818 | 1.12k | while (htxt.l && htxt.s[htxt.l-1] == '\0') --htxt.l; // kill trailing zeros |
2819 | 1.12k | int ret; |
2820 | 1.12k | if ( fp->format.compression!=no_compression ) { |
2821 | 0 | ret = bgzf_write(fp->fp.bgzf, htxt.s, htxt.l); |
2822 | 0 | if (bgzf_flush(fp->fp.bgzf) != 0) return -1; |
2823 | 1.12k | } else { |
2824 | 1.12k | ret = hwrite(fp->fp.hfile, htxt.s, htxt.l); |
2825 | 1.12k | } |
2826 | 1.12k | free(htxt.s); |
2827 | 1.12k | return ret<0 ? -1 : 0; |
2828 | 1.12k | } |
2829 | | |
2830 | | /*********************** |
2831 | | *** Typed value I/O *** |
2832 | | ***********************/ |
2833 | | |
2834 | | int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize) |
2835 | 182k | { |
2836 | 182k | int32_t max = INT32_MIN, min = INT32_MAX; |
2837 | 182k | int i; |
2838 | 182k | if (n <= 0) { |
2839 | 3.54k | return bcf_enc_size(s, 0, BCF_BT_NULL); |
2840 | 179k | } else if (n == 1) { |
2841 | 37.3k | return bcf_enc_int1(s, a[0]); |
2842 | 142k | } else { |
2843 | 142k | if (wsize <= 0) wsize = n; |
2844 | | |
2845 | | // Equivalent to: |
2846 | | // for (i = 0; i < n; ++i) { |
2847 | | // if (a[i] == bcf_int32_missing || a[i] == bcf_int32_vector_end ) |
2848 | | // continue; |
2849 | | // if (max < a[i]) max = a[i]; |
2850 | | // if (min > a[i]) min = a[i]; |
2851 | | // } |
2852 | 142k | int max4[4] = {INT32_MIN, INT32_MIN, INT32_MIN, INT32_MIN}; |
2853 | 142k | int min4[4] = {INT32_MAX, INT32_MAX, INT32_MAX, INT32_MAX}; |
2854 | 36.4M | for (i = 0; i < (n&~3); i+=4) { |
2855 | | // bcf_int32_missing == INT32_MIN and |
2856 | | // bcf_int32_vector_end == INT32_MIN+1. |
2857 | | // We skip these, but can mostly avoid explicit checking |
2858 | 36.3M | if (max4[0] < a[i+0]) max4[0] = a[i+0]; |
2859 | 36.3M | if (max4[1] < a[i+1]) max4[1] = a[i+1]; |
2860 | 36.3M | if (max4[2] < a[i+2]) max4[2] = a[i+2]; |
2861 | 36.3M | if (max4[3] < a[i+3]) max4[3] = a[i+3]; |
2862 | 36.3M | if (min4[0] > a[i+0] && a[i+0] > INT32_MIN+1) min4[0] = a[i+0]; |
2863 | 36.3M | if (min4[1] > a[i+1] && a[i+1] > INT32_MIN+1) min4[1] = a[i+1]; |
2864 | 36.3M | if (min4[2] > a[i+2] && a[i+2] > INT32_MIN+1) min4[2] = a[i+2]; |
2865 | 36.3M | if (min4[3] > a[i+3] && a[i+3] > INT32_MIN+1) min4[3] = a[i+3]; |
2866 | 36.3M | } |
2867 | 142k | min = min4[0]; |
2868 | 142k | if (min > min4[1]) min = min4[1]; |
2869 | 142k | if (min > min4[2]) min = min4[2]; |
2870 | 142k | if (min > min4[3]) min = min4[3]; |
2871 | 142k | max = max4[0]; |
2872 | 142k | if (max < max4[1]) max = max4[1]; |
2873 | 142k | if (max < max4[2]) max = max4[2]; |
2874 | 142k | if (max < max4[3]) max = max4[3]; |
2875 | 363k | for (; i < n; ++i) { |
2876 | 221k | if (max < a[i]) max = a[i]; |
2877 | 221k | if (min > a[i] && a[i] > INT32_MIN+1) min = a[i]; |
2878 | 221k | } |
2879 | | |
2880 | 142k | if (max <= BCF_MAX_BT_INT8 && min >= BCF_MIN_BT_INT8) { |
2881 | 16.4k | if (bcf_enc_size(s, wsize, BCF_BT_INT8) < 0 || |
2882 | 16.4k | ks_resize(s, s->l + n) < 0) |
2883 | 0 | return -1; |
2884 | 16.4k | uint8_t *p = (uint8_t *) s->s + s->l; |
2885 | 3.28M | for (i = 0; i < n; ++i, p++) { |
2886 | 3.27M | if ( a[i]==bcf_int32_vector_end ) *p = bcf_int8_vector_end; |
2887 | 3.26M | else if ( a[i]==bcf_int32_missing ) *p = bcf_int8_missing; |
2888 | 104k | else *p = a[i]; |
2889 | 3.27M | } |
2890 | 16.4k | s->l += n; |
2891 | 125k | } else if (max <= BCF_MAX_BT_INT16 && min >= BCF_MIN_BT_INT16) { |
2892 | 88.4k | uint8_t *p; |
2893 | 88.4k | if (bcf_enc_size(s, wsize, BCF_BT_INT16) < 0 || |
2894 | 88.4k | ks_resize(s, s->l + n * sizeof(int16_t)) < 0) |
2895 | 0 | return -1; |
2896 | 88.4k | p = (uint8_t *) s->s + s->l; |
2897 | 32.7M | for (i = 0; i < n; ++i) |
2898 | 32.6M | { |
2899 | 32.6M | int16_t x; |
2900 | 32.6M | if ( a[i]==bcf_int32_vector_end ) x = bcf_int16_vector_end; |
2901 | 32.6M | else if ( a[i]==bcf_int32_missing ) x = bcf_int16_missing; |
2902 | 779k | else x = a[i]; |
2903 | 32.6M | i16_to_le(x, p); |
2904 | 32.6M | p += sizeof(int16_t); |
2905 | 32.6M | } |
2906 | 88.4k | s->l += n * sizeof(int16_t); |
2907 | 88.4k | } else { |
2908 | 37.2k | uint8_t *p; |
2909 | 37.2k | if (bcf_enc_size(s, wsize, BCF_BT_INT32) < 0 || |
2910 | 37.2k | ks_resize(s, s->l + n * sizeof(int32_t)) < 0) |
2911 | 0 | return -1; |
2912 | 37.2k | p = (uint8_t *) s->s + s->l; |
2913 | 109M | for (i = 0; i < n; ++i) { |
2914 | 109M | i32_to_le(a[i], p); |
2915 | 109M | p += sizeof(int32_t); |
2916 | 109M | } |
2917 | 37.2k | s->l += n * sizeof(int32_t); |
2918 | 37.2k | } |
2919 | 142k | } |
2920 | | |
2921 | 142k | return 0; |
2922 | 182k | } |
2923 | | |
2924 | | #ifdef VCF_ALLOW_INT64 |
2925 | | static int bcf_enc_long1(kstring_t *s, int64_t x) { |
2926 | | uint32_t e = 0; |
2927 | | if (x <= BCF_MAX_BT_INT32 && x >= BCF_MIN_BT_INT32) |
2928 | | return bcf_enc_int1(s, x); |
2929 | | if (x == bcf_int64_vector_end) { |
2930 | | e |= bcf_enc_size(s, 1, BCF_BT_INT8); |
2931 | | e |= kputc(bcf_int8_vector_end, s) < 0; |
2932 | | } else if (x == bcf_int64_missing) { |
2933 | | e |= bcf_enc_size(s, 1, BCF_BT_INT8); |
2934 | | e |= kputc(bcf_int8_missing, s) < 0; |
2935 | | } else { |
2936 | | e |= bcf_enc_size(s, 1, BCF_BT_INT64); |
2937 | | e |= ks_expand(s, 8); |
2938 | | if (e == 0) { u64_to_le(x, (uint8_t *) s->s + s->l); s->l += 8; } |
2939 | | } |
2940 | | return e == 0 ? 0 : -1; |
2941 | | } |
2942 | | #endif |
2943 | | |
2944 | 331k | static inline int serialize_float_array(kstring_t *s, size_t n, const float *a) { |
2945 | 331k | uint8_t *p; |
2946 | 331k | size_t i; |
2947 | 331k | size_t bytes = n * sizeof(float); |
2948 | | |
2949 | 331k | if (bytes / sizeof(float) != n) return -1; |
2950 | 331k | if (ks_resize(s, s->l + bytes) < 0) return -1; |
2951 | | |
2952 | 331k | p = (uint8_t *) s->s + s->l; |
2953 | 75.1M | for (i = 0; i < n; i++) { |
2954 | 74.8M | float_to_le(a[i], p); |
2955 | 74.8M | p += sizeof(float); |
2956 | 74.8M | } |
2957 | 331k | s->l += bytes; |
2958 | | |
2959 | 331k | return 0; |
2960 | 331k | } |
2961 | | |
2962 | | int bcf_enc_vfloat(kstring_t *s, int n, float *a) |
2963 | 331k | { |
2964 | 331k | assert(n >= 0); |
2965 | 331k | bcf_enc_size(s, n, BCF_BT_FLOAT); |
2966 | 331k | serialize_float_array(s, n, a); |
2967 | 331k | return 0; // FIXME: check for errs in this function |
2968 | 331k | } |
2969 | | |
2970 | | int bcf_enc_vchar(kstring_t *s, int l, const char *a) |
2971 | 2.09M | { |
2972 | 2.09M | bcf_enc_size(s, l, BCF_BT_CHAR); |
2973 | 2.09M | kputsn(a, l, s); |
2974 | 2.09M | return 0; // FIXME: check for errs in this function |
2975 | 2.09M | } |
2976 | | |
2977 | | // Special case of n==1 as it also occurs quite often in FORMAT data. |
2978 | | // This version is also small enough to get inlined. |
2979 | 6.30k | static inline int bcf_fmt_array1(kstring_t *s, int type, void *data) { |
2980 | 6.30k | uint32_t e = 0; |
2981 | 6.30k | uint8_t *p = (uint8_t *)data; |
2982 | 6.30k | int32_t v; |
2983 | | |
2984 | | // helps gcc more than clang here. In billions of cycles: |
2985 | | // bcf_fmt_array1 bcf_fmt_array |
2986 | | // gcc7: 23.2 24.3 |
2987 | | // gcc13: 21.6 23.0 |
2988 | | // clang13: 27.1 27.8 |
2989 | 6.30k | switch (type) { |
2990 | 6.30k | case BCF_BT_CHAR: |
2991 | 6.30k | e |= kputc_(*p == bcf_str_missing ? '.' : *p, s) < 0; |
2992 | 6.30k | break; |
2993 | | |
2994 | 0 | case BCF_BT_INT8: |
2995 | 0 | if (*(int8_t *)p != bcf_int8_vector_end) { |
2996 | 0 | e |= ((*(int8_t *)p == bcf_int8_missing) |
2997 | 0 | ? kputc_('.', s) |
2998 | 0 | : kputw(*(int8_t *)p, s)) < 0; |
2999 | 0 | } |
3000 | 0 | break; |
3001 | 0 | case BCF_BT_INT16: |
3002 | 0 | v = le_to_i16(p); |
3003 | 0 | if (v != bcf_int16_vector_end) { |
3004 | 0 | e |= (v == bcf_int16_missing |
3005 | 0 | ? kputc_('.', s) |
3006 | 0 | : kputw(v, s)) < 0; |
3007 | 0 | } |
3008 | 0 | break; |
3009 | | |
3010 | 0 | case BCF_BT_INT32: |
3011 | 0 | v = le_to_i32(p); |
3012 | 0 | if (v != bcf_int32_vector_end) { |
3013 | 0 | e |= (v == bcf_int32_missing |
3014 | 0 | ? kputc_('.', s) |
3015 | 0 | : kputw(v, s)) < 0; |
3016 | 0 | } |
3017 | 0 | break; |
3018 | | |
3019 | 0 | case BCF_BT_FLOAT: |
3020 | 0 | v = le_to_u32(p); |
3021 | 0 | if (v != bcf_float_vector_end) { |
3022 | 0 | e |= (v == bcf_float_missing |
3023 | 0 | ? kputc_('.', s) |
3024 | 0 | : kputd(le_to_float(p), s)) < 0; |
3025 | 0 | } |
3026 | 0 | break; |
3027 | | |
3028 | 0 | default: |
3029 | 0 | hts_log_error("Unexpected type %d", type); |
3030 | 0 | return -1; |
3031 | 6.30k | } |
3032 | | |
3033 | 6.30k | return e == 0 ? 0 : -1; |
3034 | 6.30k | } |
3035 | | |
3036 | | int bcf_fmt_array(kstring_t *s, int n, int type, void *data) |
3037 | 1.75M | { |
3038 | 1.75M | int j = 0; |
3039 | 1.75M | uint32_t e = 0; |
3040 | 1.75M | if (n == 0) { |
3041 | 1.03M | return kputc_('.', s) >= 0 ? 0 : -1; |
3042 | 1.03M | } |
3043 | | |
3044 | 721k | if (type == BCF_BT_CHAR) |
3045 | 257k | { |
3046 | 257k | char *p = (char *)data; |
3047 | | |
3048 | | // Note bcf_str_missing is already accounted for in n==0 above. |
3049 | 257k | if (n >= 8) { |
3050 | 64.5k | char *p_end = memchr(p, 0, n); |
3051 | 64.5k | e |= kputsn(p, p_end ? p_end-p : n, s) < 0; |
3052 | 193k | } else { |
3053 | 736k | for (j = 0; j < n && *p; ++j, ++p) |
3054 | 543k | e |= kputc(*p, s) < 0; |
3055 | 193k | } |
3056 | 257k | } |
3057 | 463k | else |
3058 | 463k | { |
3059 | 463k | #define BRANCH(type_t, convert, is_missing, is_vector_end, kprint) { \ |
3060 | 463k | uint8_t *p = (uint8_t *) data; \ |
3061 | 110M | for (j=0; j<n; j++, p += sizeof(type_t)) \ |
3062 | 109M | { \ |
3063 | 109M | type_t v = convert(p); \ |
3064 | 109M | if ( is_vector_end ) break; \ |
3065 | 109M | if ( j ) e |= kputc_(',', s) < 0; \ |
3066 | 109M | e |= (is_missing ? kputc('.', s) : kprint) < 0; \ |
3067 | 109M | } \ |
3068 | 463k | } |
3069 | 463k | switch (type) { |
3070 | 126k | case BCF_BT_INT8: BRANCH(int8_t, le_to_i8, v==bcf_int8_missing, v==bcf_int8_vector_end, kputw(v, s)); break; |
3071 | 83.2k | case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, v==bcf_int16_missing, v==bcf_int16_vector_end, kputw(v, s)); break; |
3072 | 88.4k | case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, v==bcf_int32_missing, v==bcf_int32_vector_end, kputw(v, s)); break; |
3073 | 165k | case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, v==bcf_float_missing, v==bcf_float_vector_end, kputd(le_to_float(p), s)); break; |
3074 | 0 | default: hts_log_error("Unexpected type %d", type); exit(1); break; |
3075 | 463k | } |
3076 | 463k | #undef BRANCH |
3077 | 463k | } |
3078 | 721k | return e == 0 ? 0 : -1; |
3079 | 721k | } |
3080 | | |
3081 | | uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr) |
3082 | 1.19M | { |
3083 | 1.19M | int x, type; |
3084 | 1.19M | x = bcf_dec_size(ptr, &ptr, &type); |
3085 | 1.19M | bcf_fmt_array(s, x, type, ptr); |
3086 | 1.19M | return ptr + (x << bcf_type_shift[type]); |
3087 | 1.19M | } |
3088 | | |
3089 | | /******************** |
3090 | | *** VCF site I/O *** |
3091 | | ********************/ |
3092 | | |
/* Per-FORMAT-field working data used while parsing the sample columns. */
typedef struct {
    int key;            /* index into h->id[BCF_DT_ID] for this tag */
    int max_m;          /* largest element count seen (i.e. commas) */
    int size;           /* bytes per sample: from max_l, or max_g*4 for GT */
    int offset;         /* where buf starts inside h->mem */
    uint32_t is_gt:1;   /* set when the tag is the genotype field */
    uint32_t max_g:31;  /* largest number of genotypes seen */
    uint32_t max_l;     /* longest field length seen */
    uint32_t y;         /* h->id[0][fmt[j].key].val->info[BCF_HL_FMT] */
    uint8_t *buf;       /* points into h->mem */
} fmt_aux_t;
3104 | | |
3105 | | // fmt_aux_t field notes: |
3106 | | // max_* are biggest sizes of the various FORMAT fields across all samples. |
3107 | | // We use these after pivoting the data to ensure easy random access |
3108 | | // of a specific sample. |
3109 | | // |
3110 | | // max_m is only used for type BCF_HT_REAL or BCF_HT_INT |
3111 | | // max_g is only used for is_gt == 1 (will be BCF_HT_STR) |
3112 | | // max_l is only used for is_gt == 0 (will be BCF_HT_STR) |
3113 | | // |
3114 | | // These are computed in vcf_parse_format_max3 and used in |
3115 | | // vcf_parse_format_alloc4 to get the size. |
3116 | | // |
3117 | | // size is computed from max_g, max_l, max_m and is_gt. Once computed |
3118 | | // the max values are never accessed again. |
3119 | | // |
3120 | | // In theory all 4 vars could be coalesced into a single variable, but this |
3121 | | // significantly harms speed (even if done via a union). It's about 25-30% |
3122 | | // slower. |
3123 | | |
3124 | | static inline int align_mem(kstring_t *s) |
3125 | 51.8k | { |
3126 | 51.8k | int e = 0; |
3127 | 51.8k | if (s->l&7) { |
3128 | 12.2k | uint64_t zero = 0; |
3129 | 12.2k | e = kputsn((char*)&zero, 8 - (s->l&7), s) < 0; |
3130 | 12.2k | } |
3131 | 51.8k | return e == 0 ? 0 : -1; |
3132 | 51.8k | } |
3133 | | |
3134 | 52.3k | #define MAX_N_FMT 255 /* Limited by size of bcf1_t n_fmt field */ |
3135 | | |
3136 | | // detect FORMAT "." |
3137 | | static int vcf_parse_format_empty1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, |
3138 | 18.2k | const char *p, const char *q) { |
3139 | 18.2k | const char *end = s->s + s->l; |
3140 | 18.2k | if ( q>=end ) |
3141 | 1 | { |
3142 | 1 | hts_log_error("FORMAT column with no sample columns starting at %s:%"PRIhts_pos"", bcf_seqname_safe(h,v), v->pos+1); |
3143 | 1 | v->errcode |= BCF_ERR_NCOLS; |
3144 | 1 | return -1; |
3145 | 1 | } |
3146 | | |
3147 | 18.2k | v->n_fmt = 0; |
3148 | 18.2k | if ( p[0]=='.' && p[1]==0 ) // FORMAT field is empty "." |
3149 | 8 | { |
3150 | 8 | v->n_sample = bcf_hdr_nsamples(h); |
3151 | 8 | return 1; |
3152 | 8 | } |
3153 | | |
3154 | 18.2k | return 0; |
3155 | 18.2k | } |
3156 | | |
3157 | | // get format information from the dictionary |
3158 | | static int vcf_parse_format_dict2(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, |
3159 | 18.2k | const char *p, const char *q, fmt_aux_t *fmt) { |
3160 | 18.2k | const vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID]; |
3161 | 18.2k | char *t; |
3162 | 18.2k | int j; |
3163 | 18.2k | ks_tokaux_t aux1; |
3164 | | |
3165 | 70.5k | for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) { |
3166 | 52.3k | if (j >= MAX_N_FMT) { |
3167 | 2 | v->errcode |= BCF_ERR_LIMITS; |
3168 | 2 | hts_log_error("FORMAT column at %s:%"PRIhts_pos" lists more identifiers than htslib can handle", |
3169 | 2 | bcf_seqname_safe(h,v), v->pos+1); |
3170 | 2 | return -1; |
3171 | 2 | } |
3172 | | |
3173 | 52.3k | *(char*)aux1.p = 0; |
3174 | 52.3k | khint_t k = kh_get(vdict, d, t); |
3175 | 52.3k | if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_FMT] == 15) { |
3176 | 3.20k | if ( t[0]=='.' && t[1]==0 ) |
3177 | 0 | { |
3178 | 0 | hts_log_error("Invalid FORMAT tag name '.' at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1); |
3179 | 0 | v->errcode |= BCF_ERR_TAG_INVALID; |
3180 | 0 | return -1; |
3181 | 0 | } |
3182 | 3.20k | hts_log_warning("FORMAT '%s' at %s:%"PRIhts_pos" is not defined in the header, assuming Type=String", t, bcf_seqname_safe(h,v), v->pos+1); |
3183 | 3.20k | kstring_t tmp = {0,0,0}; |
3184 | 3.20k | int l; |
3185 | 3.20k | ksprintf(&tmp, "##FORMAT=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", t); |
3186 | 3.20k | bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l); |
3187 | 3.20k | free(tmp.s); |
3188 | 3.20k | int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1; |
3189 | 3.20k | if (res < 0) bcf_hrec_destroy(hrec); |
3190 | 3.20k | if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h); |
3191 | | |
3192 | 3.20k | k = kh_get(vdict, d, t); |
3193 | 3.20k | v->errcode |= BCF_ERR_TAG_UNDEF; |
3194 | 3.20k | if (res || k == kh_end(d)) { |
3195 | 4 | hts_log_error("Could not add dummy header for FORMAT '%s' at %s:%"PRIhts_pos, t, bcf_seqname_safe(h,v), v->pos+1); |
3196 | 4 | v->errcode |= BCF_ERR_TAG_INVALID; |
3197 | 4 | return -1; |
3198 | 4 | } |
3199 | 3.20k | } |
3200 | 52.3k | fmt[j].max_l = fmt[j].max_m = fmt[j].max_g = 0; |
3201 | 52.3k | fmt[j].key = kh_val(d, k).id; |
3202 | 52.3k | fmt[j].is_gt = (t[0] == 'G' && t[1] == 'T' && !t[2]); |
3203 | 52.3k | fmt[j].y = h->id[0][fmt[j].key].val->info[BCF_HL_FMT]; |
3204 | 52.3k | v->n_fmt++; |
3205 | 52.3k | } |
3206 | 18.1k | return 0; |
3207 | 18.2k | } |
3208 | | |
3209 | | // compute max |
3210 | | static int vcf_parse_format_max3(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, |
3211 | 18.1k | char *p, char *q, fmt_aux_t *fmt) { |
3212 | 18.1k | int n_sample_ori = -1; |
3213 | 18.1k | char *r = q + 1; // r: position in the format string |
3214 | 18.1k | int l = 0, m = 1, g = 1, j; |
3215 | 18.1k | v->n_sample = 0; // m: max vector size, l: max field len, g: max number of alleles |
3216 | 18.1k | const char *end = s->s + s->l; |
3217 | | |
3218 | 33.6k | while ( r<end ) |
3219 | 33.6k | { |
3220 | | // can we skip some samples? |
3221 | 33.6k | if ( h->keep_samples ) |
3222 | 0 | { |
3223 | 0 | n_sample_ori++; |
3224 | 0 | if ( !bit_array_test(h->keep_samples,n_sample_ori) ) |
3225 | 0 | { |
3226 | 0 | while ( *r!='\t' && r<end ) r++; |
3227 | 0 | if ( *r=='\t' ) { *r = 0; r++; } |
3228 | 0 | continue; |
3229 | 0 | } |
3230 | 0 | } |
3231 | | |
3232 | | // collect fmt stats: max vector size, length, number of alleles |
3233 | 33.6k | j = 0; // j-th format field |
3234 | 33.6k | fmt_aux_t *f = fmt; |
3235 | 33.6k | static char meta[256] = { |
3236 | | // \0 \t , / : | |
3237 | 33.6k | 1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
3238 | 33.6k | 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1, 0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0, |
3239 | 33.6k | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
3240 | 33.6k | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, |
3241 | 33.6k | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
3242 | 33.6k | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
3243 | 33.6k | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
3244 | 33.6k | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 |
3245 | 33.6k | }; |
3246 | | |
3247 | 33.6k | char *r_start = r; |
3248 | 4.78M | for (;;) { |
3249 | | // Quickly skip ahead to an appropriate meta-character |
3250 | 5.55M | while (!meta[(unsigned char)*r]) r++; |
3251 | | |
3252 | 4.78M | switch (*r) { |
3253 | 4.72M | case ',': |
3254 | 4.72M | m++; |
3255 | 4.72M | break; |
3256 | | |
3257 | 1.99k | case '|': |
3258 | 10.3k | case '/': |
3259 | 10.3k | if (f->is_gt) g++; |
3260 | 10.3k | break; |
3261 | | |
3262 | 18.4k | case '\t': |
3263 | 18.4k | *r = 0; // fall through |
3264 | | |
3265 | 18.4k | default: // valid due to while loop above. |
3266 | 33.6k | case '\0': |
3267 | 45.6k | case ':': |
3268 | 45.6k | l = r - r_start; r_start = r; |
3269 | 45.6k | if (f->max_m < m) f->max_m = m; |
3270 | 45.6k | if (f->max_l < l) f->max_l = l; |
3271 | 45.6k | if (f->is_gt && f->max_g < g) f->max_g = g; |
3272 | 45.6k | l = 0, m = g = 1; |
3273 | 45.6k | if ( *r==':' ) { |
3274 | 12.0k | j++; f++; |
3275 | 12.0k | if ( j>=v->n_fmt ) { |
3276 | 10 | hts_log_error("Incorrect number of FORMAT fields at %s:%"PRIhts_pos"", |
3277 | 10 | h->id[BCF_DT_CTG][v->rid].key, v->pos+1); |
3278 | 10 | v->errcode |= BCF_ERR_NCOLS; |
3279 | 10 | return -1; |
3280 | 10 | } |
3281 | 33.6k | } else goto end_for; |
3282 | 12.0k | break; |
3283 | 4.78M | } |
3284 | 4.74M | if ( r>=end ) break; |
3285 | 4.74M | r++; |
3286 | 4.74M | } |
3287 | 33.6k | end_for: |
3288 | 33.6k | v->n_sample++; |
3289 | 33.6k | if ( v->n_sample == bcf_hdr_nsamples(h) ) break; |
3290 | 15.4k | r++; |
3291 | 15.4k | } |
3292 | | |
3293 | 18.1k | return 0; |
3294 | 18.1k | } |
3295 | | |
3296 | | // allocate memory for arrays |
3297 | | static int vcf_parse_format_alloc4(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, |
3298 | | const char *p, const char *q, |
3299 | 18.1k | fmt_aux_t *fmt) { |
3300 | 18.1k | kstring_t *mem = (kstring_t*)&h->mem; |
3301 | | |
3302 | 18.1k | int j; |
3303 | 69.9k | for (j = 0; j < v->n_fmt; ++j) { |
3304 | 51.8k | fmt_aux_t *f = &fmt[j]; |
3305 | 51.8k | if ( !f->max_m ) f->max_m = 1; // omitted trailing format field |
3306 | | |
3307 | 51.8k | if ((f->y>>4&0xf) == BCF_HT_STR) { |
3308 | 51.8k | f->size = f->is_gt? f->max_g << 2 : f->max_l; |
3309 | 51.8k | } else if ((f->y>>4&0xf) == BCF_HT_REAL || (f->y>>4&0xf) == BCF_HT_INT) { |
3310 | 0 | f->size = f->max_m << 2; |
3311 | 0 | } else { |
3312 | 0 | hts_log_error("The format type %d at %s:%"PRIhts_pos" is currently not supported", f->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1); |
3313 | 0 | v->errcode |= BCF_ERR_TAG_INVALID; |
3314 | 0 | return -1; |
3315 | 0 | } |
3316 | | |
3317 | 51.8k | if (align_mem(mem) < 0) { |
3318 | 0 | hts_log_error("Memory allocation failure at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1); |
3319 | 0 | v->errcode |= BCF_ERR_LIMITS; |
3320 | 0 | return -1; |
3321 | 0 | } |
3322 | | |
3323 | | // Limit the total memory to ~2Gb per VCF row. This should mean |
3324 | | // malformed VCF data is less likely to take excessive memory and/or |
3325 | | // time. |
3326 | 51.8k | if ((uint64_t) mem->l + v->n_sample * (uint64_t)f->size > INT_MAX) { |
3327 | 0 | static int warned = 0; |
3328 | 0 | if ( !warned ) hts_log_warning("Excessive memory required by FORMAT fields at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1); |
3329 | 0 | warned = 1; |
3330 | 0 | v->errcode |= BCF_ERR_LIMITS; |
3331 | 0 | f->size = -1; |
3332 | 0 | f->offset = 0; |
3333 | 0 | continue; |
3334 | 0 | } |
3335 | | |
3336 | 51.8k | f->offset = mem->l; |
3337 | 51.8k | if (ks_resize(mem, mem->l + v->n_sample * (size_t)f->size) < 0) { |
3338 | 0 | hts_log_error("Memory allocation failure at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1); |
3339 | 0 | v->errcode |= BCF_ERR_LIMITS; |
3340 | 0 | return -1; |
3341 | 0 | } |
3342 | 51.8k | mem->l += v->n_sample * f->size; |
3343 | 51.8k | } |
3344 | | |
3345 | 18.1k | { |
3346 | 18.1k | int j; |
3347 | 69.9k | for (j = 0; j < v->n_fmt; ++j) |
3348 | 51.8k | fmt[j].buf = (uint8_t*)mem->s + fmt[j].offset; |
3349 | 18.1k | } |
3350 | | |
3351 | | // check for duplicate tags |
3352 | 18.1k | int i; |
3353 | 51.8k | for (i=1; i<v->n_fmt; i++) |
3354 | 33.6k | { |
3355 | 33.6k | fmt_aux_t *ifmt = &fmt[i]; |
3356 | 33.6k | if ( ifmt->size==-1 ) continue; // already marked for removal |
3357 | 110k | for (j=0; j<i; j++) |
3358 | 92.7k | { |
3359 | 92.7k | fmt_aux_t *jfmt = &fmt[j]; |
3360 | 92.7k | if ( jfmt->size==-1 ) continue; // already marked for removal |
3361 | 53.1k | if ( ifmt->key!=jfmt->key ) continue; |
3362 | 16.3k | static int warned = 0; |
3363 | 16.3k | if ( !warned ) hts_log_warning("Duplicate FORMAT tag %s at %s:%"PRIhts_pos, bcf_hdr_int2id(h,BCF_DT_ID,ifmt->key), bcf_seqname_safe(h,v), v->pos+1); |
3364 | 16.3k | warned = 1; |
3365 | 16.3k | v->errcode |= BCF_ERR_TAG_INVALID; |
3366 | 16.3k | ifmt->size = -1; |
3367 | 16.3k | ifmt->offset = 0; |
3368 | 16.3k | break; |
3369 | 53.1k | } |
3370 | 33.6k | } |
3371 | 18.1k | return 0; |
3372 | 18.1k | } |
3373 | | |
3374 | | // Fill the sample fields |
3375 | | static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, |
3376 | 18.1k | const char *p, const char *q, fmt_aux_t *fmt) { |
3377 | 18.1k | static int extreme_val_warned = 0; |
3378 | 18.1k | int n_sample_ori = -1; |
3379 | | // At beginning of the loop t points to the first char of a format |
3380 | 18.1k | const char *t = q + 1; |
3381 | 18.1k | int m = 0; // m: sample id |
3382 | 18.1k | const int nsamples = bcf_hdr_nsamples(h); |
3383 | 18.1k | const char *end = s->s + s->l; |
3384 | | |
3385 | 18.1k | int ver = bcf_get_version(h, NULL); |
3386 | | |
3387 | 51.6k | while ( t<end ) |
3388 | 50.6k | { |
3389 | | // can we skip some samples? |
3390 | 50.6k | if ( h->keep_samples ) |
3391 | 0 | { |
3392 | 0 | n_sample_ori++; |
3393 | 0 | if ( !bit_array_test(h->keep_samples,n_sample_ori) ) |
3394 | 0 | { |
3395 | 0 | while ( *t && t<end ) t++; |
3396 | 0 | t++; |
3397 | 0 | continue; |
3398 | 0 | } |
3399 | 0 | } |
3400 | 50.6k | if ( m == nsamples ) break; |
3401 | | |
3402 | 33.5k | int j = 0; // j-th format field, m-th sample |
3403 | 45.2k | while ( t < end ) |
3404 | 45.2k | { |
3405 | 45.2k | fmt_aux_t *z = &fmt[j++]; |
3406 | 45.2k | const int htype = z->y>>4&0xf; |
3407 | 45.2k | if (!z->buf) { |
3408 | 0 | hts_log_error("Memory allocation failure for FORMAT field type %d at %s:%"PRIhts_pos, |
3409 | 0 | z->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1); |
3410 | 0 | v->errcode |= BCF_ERR_LIMITS; |
3411 | 0 | return -1; |
3412 | 0 | } |
3413 | | |
3414 | 45.2k | if ( z->size==-1 ) |
3415 | 3.88k | { |
3416 | | // this field is to be ignored, it's either too big or a duplicate |
3417 | 40.7k | while ( *t != ':' && *t ) t++; |
3418 | 3.88k | } |
3419 | 41.3k | else if (htype == BCF_HT_STR) { |
3420 | 41.3k | int l; |
3421 | 41.3k | if (z->is_gt) { |
3422 | | // Genotypes. |
3423 | | //([/|])?<val>)([|/]<val>)+... where <val> is [0-9]+ or ".". |
3424 | 5.36k | int32_t is_phased = 0; |
3425 | 5.36k | uint32_t *x = (uint32_t*)(z->buf + z->size * (size_t)m); |
3426 | 5.36k | uint32_t unreadable = 0; |
3427 | 5.36k | uint32_t max = 0; |
3428 | 5.36k | int overflow = 0, ploidy = 0, anyunphased = 0, \ |
3429 | 5.36k | phasingprfx = 0, unknown1 = 0; |
3430 | | |
3431 | | /* with prefixed phasing, it is explicitly given for 1st one |
3432 | | with non-prefixed, set based on ploidy and phasing of other |
3433 | | alleles. */ |
3434 | 5.36k | if (ver >= VCF44 && (*t == '|' || *t == '/')) { |
3435 | | // cache prefix and phasing status |
3436 | 63 | is_phased = *t++ == '|'; |
3437 | 63 | phasingprfx = 1; |
3438 | 63 | } |
3439 | | |
3440 | 12.1k | for (l = 0;; ++t) { |
3441 | 12.1k | ploidy++; |
3442 | 12.1k | if (*t == '.') { |
3443 | 1.28k | ++t, x[l++] = is_phased; |
3444 | 1.28k | if (l==1) { //for 1st allele only |
3445 | 566 | unknown1 = 1; |
3446 | 566 | } |
3447 | 10.8k | } else { |
3448 | 10.8k | const char *tt = t; |
3449 | 10.8k | uint32_t val; |
3450 | | // Or "v->n_allele < 10", but it doesn't |
3451 | | // seem to be any faster and this feels safer. |
3452 | 10.8k | if (*t >= '0' && *t <= '9' && |
3453 | 10.6k | !(t[1] >= '0' && t[1] <= '9')) { |
3454 | 4.11k | val = *t++ - '0'; |
3455 | 6.73k | } else { |
3456 | 6.73k | val = hts_str2uint(t, (char **)&t, |
3457 | 6.73k | sizeof(val) * CHAR_MAX - 2, |
3458 | 6.73k | &overflow); |
3459 | 6.73k | unreadable |= tt == t; |
3460 | 6.73k | } |
3461 | 10.8k | if (max < val) max = val; |
3462 | 10.8k | x[l++] = (val + 1) << 1 | is_phased; |
3463 | 10.8k | } |
3464 | 12.1k | anyunphased |= (ploidy != 1) && !is_phased; |
3465 | 12.1k | is_phased = (*t == '|'); |
3466 | 12.1k | if (*t != '|' && *t != '/') break; |
3467 | 12.1k | } |
3468 | 5.36k | if (!phasingprfx) { //get GT in v44 way when no prefixed phasing |
3469 | | /* no explicit phasing for 1st allele, set based on |
3470 | | other alleles and ploidy */ |
3471 | 5.30k | if (ploidy == 1) { //implicitly phased |
3472 | 1.19k | if (!unknown1) { |
3473 | 1.12k | x[0] |= 1; |
3474 | 1.12k | } |
3475 | 4.10k | } else { //set by other unphased alleles |
3476 | 4.10k | x[0] |= (anyunphased)? 0 : 1; |
3477 | 4.10k | } |
3478 | 5.30k | } |
3479 | | // Possibly check max against v->n_allele instead? |
3480 | 5.36k | if (overflow || max > (INT32_MAX >> 1) - 1) { |
3481 | 25 | hts_log_error("Couldn't read GT data: value too large at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1); |
3482 | 25 | return -1; |
3483 | 25 | } |
3484 | 5.33k | if (unreadable) { |
3485 | 9 | hts_log_error("Couldn't read GT data: value not a number or '.' at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1); |
3486 | 9 | return -1; |
3487 | 9 | } |
3488 | 5.32k | if ( !l ) x[l++] = 0; // An empty field, insert missing value |
3489 | 5.82k | for (; l < z->size>>2; ++l) |
3490 | 494 | x[l] = bcf_int32_vector_end; |
3491 | | |
3492 | 35.9k | } else { |
3493 | | // Otherwise arbitrary strings |
3494 | 35.9k | char *x = (char*)z->buf + z->size * (size_t)m; |
3495 | 5.29M | for (l = 0; *t != ':' && *t; ++t) |
3496 | 5.25M | x[l++] = *t; |
3497 | 35.9k | if (z->size > l) |
3498 | 15.7k | memset(&x[l], 0, (z->size-l) * sizeof(*x)); |
3499 | 35.9k | } |
3500 | | |
3501 | 41.3k | } else if (htype == BCF_HT_INT) { |
3502 | | // One or more integers in an array |
3503 | 0 | int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m); |
3504 | 0 | int l; |
3505 | 0 | for (l = 0;; ++t) { |
3506 | 0 | if (*t == '.') { |
3507 | 0 | x[l++] = bcf_int32_missing, ++t; // ++t to skip "." |
3508 | 0 | } else { |
3509 | 0 | int overflow = 0; |
3510 | 0 | char *te; |
3511 | 0 | long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow); |
3512 | 0 | if ( te==t || overflow || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 ) |
3513 | 0 | { |
3514 | 0 | if ( !extreme_val_warned ) |
3515 | 0 | { |
3516 | 0 | hts_log_warning("Extreme FORMAT/%s value encountered and set to missing at %s:%"PRIhts_pos, |
3517 | 0 | h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname_safe(h,v), v->pos+1); |
3518 | 0 | extreme_val_warned = 1; |
3519 | 0 | } |
3520 | 0 | tmp_val = bcf_int32_missing; |
3521 | 0 | } |
3522 | 0 | x[l++] = tmp_val; |
3523 | 0 | t = te; |
3524 | 0 | } |
3525 | 0 | if (*t != ',') break; |
3526 | 0 | } |
3527 | 0 | if ( !l ) |
3528 | 0 | x[l++] = bcf_int32_missing; |
3529 | 0 | for (; l < z->size>>2; ++l) |
3530 | 0 | x[l] = bcf_int32_vector_end; |
3531 | |
|
3532 | 0 | } else if (htype == BCF_HT_REAL) { |
3533 | | // One of more floating point values in an array |
3534 | 0 | float *x = (float*)(z->buf + z->size * (size_t)m); |
3535 | 0 | int l; |
3536 | 0 | for (l = 0;; ++t) { |
3537 | 0 | if (*t == '.' && !isdigit_c(t[1])) { |
3538 | 0 | bcf_float_set_missing(x[l++]), ++t; // ++t to skip "." |
3539 | 0 | } else { |
3540 | 0 | int overflow = 0; |
3541 | 0 | char *te; |
3542 | 0 | float tmp_val = hts_str2dbl(t, &te, &overflow); |
3543 | 0 | if ( (te==t || overflow) && !extreme_val_warned ) |
3544 | 0 | { |
3545 | 0 | hts_log_warning("Extreme FORMAT/%s value encountered at %s:%"PRIhts_pos, h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname(h,v), v->pos+1); |
3546 | 0 | extreme_val_warned = 1; |
3547 | 0 | } |
3548 | 0 | x[l++] = tmp_val; |
3549 | 0 | t = te; |
3550 | 0 | } |
3551 | 0 | if (*t != ',') break; |
3552 | 0 | } |
3553 | 0 | if ( !l ) |
3554 | | // An empty field, insert missing value |
3555 | 0 | bcf_float_set_missing(x[l++]); |
3556 | 0 | for (; l < z->size>>2; ++l) |
3557 | 0 | bcf_float_set_vector_end(x[l]); |
3558 | 0 | } else { |
3559 | 0 | hts_log_error("Unknown FORMAT field type %d at %s:%"PRIhts_pos, htype, bcf_seqname_safe(h,v), v->pos+1); |
3560 | 0 | v->errcode |= BCF_ERR_TAG_INVALID; |
3561 | 0 | return -1; |
3562 | 0 | } |
3563 | | |
3564 | 45.2k | if (*t == '\0') { |
3565 | 33.4k | break; |
3566 | 33.4k | } |
3567 | 11.7k | else if (*t == ':') { |
3568 | 11.7k | t++; |
3569 | 11.7k | } |
3570 | 10 | else { |
3571 | 10 | char buffer[8]; |
3572 | 10 | hts_log_error("Invalid character %s in '%s' FORMAT field at %s:%"PRIhts_pos"", |
3573 | 10 | hts_strprint(buffer, sizeof buffer, '\'', t, 1), |
3574 | 10 | h->id[BCF_DT_ID][z->key].key, bcf_seqname_safe(h,v), v->pos+1); |
3575 | 10 | v->errcode |= BCF_ERR_CHAR; |
3576 | 10 | return -1; |
3577 | 10 | } |
3578 | 45.2k | } |
3579 | | |
3580 | | // fill end-of-vector values |
3581 | 353k | for (; j < v->n_fmt; ++j) { |
3582 | 319k | fmt_aux_t *z = &fmt[j]; |
3583 | 319k | const int htype = z->y>>4&0xf; |
3584 | 319k | int l; |
3585 | | |
3586 | 319k | if (z->size == -1) // this field is to be ignored |
3587 | 251k | continue; |
3588 | | |
3589 | 68.2k | if (htype == BCF_HT_STR) { |
3590 | 68.2k | if (z->is_gt) { |
3591 | 9.21k | int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m); |
3592 | 9.21k | if (z->size) x[0] = bcf_int32_missing; |
3593 | 17.9k | for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end; |
3594 | 59.0k | } else { |
3595 | 59.0k | char *x = (char*)z->buf + z->size * (size_t)m; |
3596 | 59.0k | if ( z->size ) { |
3597 | 12.9k | x[0] = '.'; |
3598 | 12.9k | memset(&x[1], 0, (z->size-1) * sizeof(*x)); |
3599 | 12.9k | } |
3600 | 59.0k | } |
3601 | 68.2k | } else if (htype == BCF_HT_INT) { |
3602 | 0 | int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m); |
3603 | 0 | x[0] = bcf_int32_missing; |
3604 | 0 | for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end; |
3605 | 0 | } else if (htype == BCF_HT_REAL) { |
3606 | 0 | float *x = (float*)(z->buf + z->size * (size_t)m); |
3607 | 0 | bcf_float_set_missing(x[0]); |
3608 | 0 | for (l = 1; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]); |
3609 | 0 | } |
3610 | 68.2k | } |
3611 | | |
3612 | 33.4k | m++; t++; |
3613 | 33.4k | } |
3614 | | |
3615 | 18.1k | return 0; |
3616 | 18.1k | } |
3617 | | |
3618 | | // write individual genotype information |
3619 | | static int vcf_parse_format_gt6(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, |
3620 | 18.1k | const char *p, const char *q, fmt_aux_t *fmt) { |
3621 | 18.1k | kstring_t *str = &v->indiv; |
3622 | 18.1k | int i, need_downsize = 0; |
3623 | 18.1k | if (v->n_sample > 0) { |
3624 | 69.6k | for (i = 0; i < v->n_fmt; ++i) { |
3625 | 51.4k | fmt_aux_t *z = &fmt[i]; |
3626 | 51.4k | if ( z->size==-1 ) { |
3627 | 16.1k | need_downsize = 1; |
3628 | 16.1k | continue; |
3629 | 16.1k | } |
3630 | 35.3k | bcf_enc_int1(str, z->key); |
3631 | 35.3k | if ((z->y>>4&0xf) == BCF_HT_STR && !z->is_gt) { |
3632 | 30.2k | bcf_enc_size(str, z->size, BCF_BT_CHAR); |
3633 | 30.2k | kputsn((char*)z->buf, z->size * (size_t)v->n_sample, str); |
3634 | 30.2k | } else if ((z->y>>4&0xf) == BCF_HT_INT || z->is_gt) { |
3635 | 5.09k | bcf_enc_vint(str, (z->size>>2) * v->n_sample, (int32_t*)z->buf, z->size>>2); |
3636 | 5.09k | } else { |
3637 | 0 | bcf_enc_size(str, z->size>>2, BCF_BT_FLOAT); |
3638 | 0 | if (serialize_float_array(str, (z->size>>2) * (size_t)v->n_sample, |
3639 | 0 | (float *) z->buf) != 0) { |
3640 | 0 | v->errcode |= BCF_ERR_LIMITS; |
3641 | 0 | hts_log_error("Out of memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1); |
3642 | 0 | return -1; |
3643 | 0 | } |
3644 | 0 | } |
3645 | 35.3k | } |
3646 | | |
3647 | 18.1k | } |
3648 | 18.1k | if ( need_downsize ) { |
3649 | 5.33k | i = 0; |
3650 | 33.7k | while ( i < v->n_fmt ) { |
3651 | 28.4k | if ( fmt[i].size==-1 ) |
3652 | 16.1k | { |
3653 | 16.1k | v->n_fmt--; |
3654 | 16.1k | if ( i < v->n_fmt ) memmove(&fmt[i],&fmt[i+1],sizeof(*fmt)*(v->n_fmt-i)); |
3655 | 16.1k | } |
3656 | 12.2k | else |
3657 | 12.2k | i++; |
3658 | 28.4k | } |
3659 | 5.33k | } |
3660 | 18.1k | return 0; |
3661 | 18.1k | } |
3662 | | |
3663 | | // validity checking |
3664 | 18.1k | static int vcf_parse_format_check7(const bcf_hdr_t *h, bcf1_t *v) { |
3665 | 18.1k | if ( v->n_sample!=bcf_hdr_nsamples(h) ) |
3666 | 43 | { |
3667 | 43 | hts_log_error("Number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)", |
3668 | 43 | bcf_seqname_safe(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h)); |
3669 | 43 | v->errcode |= BCF_ERR_NCOLS; |
3670 | 43 | return -1; |
3671 | 43 | } |
3672 | 18.0k | if ( v->indiv.l > 0xffffffff ) |
3673 | 0 | { |
3674 | 0 | hts_log_error("The FORMAT at %s:%"PRIhts_pos" is too long", bcf_seqname_safe(h,v), v->pos+1); |
3675 | 0 | v->errcode |= BCF_ERR_LIMITS; |
3676 | | |
3677 | | // Error recovery: return -1 if this is a critical error or 0 if we want to ignore the FORMAT and proceed |
3678 | 0 | v->n_fmt = 0; |
3679 | 0 | return -1; |
3680 | 0 | } |
3681 | | |
3682 | 18.0k | return 0; |
3683 | 18.0k | } |
3684 | | |
3685 | | // p,q is the start and the end of the FORMAT field |
3686 | | static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, |
3687 | | char *p, char *q) |
3688 | 37.7k | { |
3689 | 37.7k | if ( !bcf_hdr_nsamples(h) ) return 0; |
3690 | 18.2k | kstring_t *mem = (kstring_t*)&h->mem; |
3691 | 18.2k | mem->l = 0; |
3692 | | |
3693 | 18.2k | fmt_aux_t fmt[MAX_N_FMT]; |
3694 | | |
3695 | | // detect FORMAT "." |
3696 | 18.2k | int ret; // +ve = ok, -ve = err |
3697 | 18.2k | if ((ret = vcf_parse_format_empty1(s, h, v, p, q))) |
3698 | 9 | return ret ? 0 : -1; |
3699 | | |
3700 | | // get format information from the dictionary |
3701 | 18.2k | if (vcf_parse_format_dict2(s, h, v, p, q, fmt) < 0) |
3702 | 6 | return -1; |
3703 | | |
3704 | | // FORMAT data is per-sample A:B:C A:B:C A:B:C ... but in memory it is |
3705 | | // stored as per-type arrays AAA... BBB... CCC... This is basically |
3706 | | // a data rotation or pivot. |
3707 | | |
3708 | | // The size of elements in the array grow to their maximum needed, |
3709 | | // permitting fast random access. This means however we have to first |
3710 | | // scan the whole FORMAT line to find the maximum of each type, and |
3711 | | // then scan it again to find the store the data. |
3712 | | // We break this down into compute-max, allocate, fill-out-buffers |
3713 | | |
3714 | | // TODO: ? |
3715 | | // The alternative would be to pivot on the first pass, with fixed |
3716 | | // size entries for numerics and concatenated strings otherwise, also |
3717 | | // tracking maximum sizes. Then on a second pass we reallocate and |
3718 | | // copy the data again to a uniformly sized array. Two passes through |
3719 | | // memory, but without doubling string parsing. |
3720 | | |
3721 | | // compute max |
3722 | 18.1k | if (vcf_parse_format_max3(s, h, v, p, q, fmt) < 0) |
3723 | 10 | return -1; |
3724 | | |
3725 | | // allocate memory for arrays |
3726 | 18.1k | if (vcf_parse_format_alloc4(s, h, v, p, q, fmt) < 0) |
3727 | 0 | return -1; |
3728 | | |
3729 | | // fill the sample fields; at beginning of the loop |
3730 | 18.1k | if (vcf_parse_format_fill5(s, h, v, p, q, fmt) < 0) |
3731 | 44 | return -1; |
3732 | | |
3733 | | // write individual genotype information |
3734 | 18.1k | if (vcf_parse_format_gt6(s, h, v, p, q, fmt) < 0) |
3735 | 0 | return -1; |
3736 | | |
3737 | | // validity checking |
3738 | 18.1k | if (vcf_parse_format_check7(h, v) < 0) |
3739 | 43 | return -1; |
3740 | | |
3741 | 18.0k | return 0; |
3742 | 18.1k | } |
3743 | | |
3744 | 2.74k | static khint_t fix_chromosome(const bcf_hdr_t *h, vdict_t *d, const char *p) { |
3745 | | // Simple error recovery for chromosomes not defined in the header. It will not help when VCF header has |
3746 | | // been already printed, but will enable tools like vcfcheck to proceed. |
3747 | | |
3748 | 2.74k | kstring_t tmp = {0,0,0}; |
3749 | 2.74k | khint_t k; |
3750 | 2.74k | int l; |
3751 | 2.74k | if (ksprintf(&tmp, "##contig=<ID=%s>", p) < 0) |
3752 | 0 | return kh_end(d); |
3753 | 2.74k | bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l); |
3754 | 2.74k | free(tmp.s); |
3755 | 2.74k | int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1; |
3756 | 2.74k | if (res < 0) bcf_hrec_destroy(hrec); |
3757 | 2.74k | if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h); |
3758 | 2.74k | k = kh_get(vdict, d, p); |
3759 | | |
3760 | 2.74k | return k; |
3761 | 2.74k | } |
3762 | | |
3763 | 38.9k | static int vcf_parse_filter(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) { |
3764 | 38.9k | int i, n_flt = 1, max_n_flt = 0; |
3765 | 38.9k | char *r, *t; |
3766 | 38.9k | int32_t *a_flt = NULL; |
3767 | 38.9k | ks_tokaux_t aux1; |
3768 | 38.9k | khint_t k; |
3769 | 38.9k | vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID]; |
3770 | | // count the number of filters |
3771 | 38.9k | if (*(q-1) == ';') *(q-1) = 0; |
3772 | 224M | for (r = p; *r; ++r) |
3773 | 224M | if (*r == ';') ++n_flt; |
3774 | 38.9k | if (n_flt > max_n_flt) { |
3775 | 38.9k | a_flt = malloc(n_flt * sizeof(*a_flt)); |
3776 | 38.9k | if (!a_flt) { |
3777 | 0 | hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1); |
3778 | 0 | v->errcode |= BCF_ERR_LIMITS; // No appropriate code? |
3779 | 0 | return -1; |
3780 | 0 | } |
3781 | 38.9k | max_n_flt = n_flt; |
3782 | 38.9k | } |
3783 | | // add filters |
3784 | 1.05M | for (t = kstrtok(p, ";", &aux1), i = 0; t; t = kstrtok(0, 0, &aux1)) { |
3785 | 1.01M | *(char*)aux1.p = 0; |
3786 | 1.01M | k = kh_get(vdict, d, t); |
3787 | 1.01M | if (k == kh_end(d)) |
3788 | 28.6k | { |
3789 | | // Simple error recovery for FILTERs not defined in the header. It will not help when VCF header has |
3790 | | // been already printed, but will enable tools like vcfcheck to proceed. |
3791 | 28.6k | hts_log_warning("FILTER '%s' is not defined in the header", t); |
3792 | 28.6k | kstring_t tmp = {0,0,0}; |
3793 | 28.6k | int l; |
3794 | 28.6k | ksprintf(&tmp, "##FILTER=<ID=%s,Description=\"Dummy\">", t); |
3795 | 28.6k | bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l); |
3796 | 28.6k | free(tmp.s); |
3797 | 28.6k | int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1; |
3798 | 28.6k | if (res < 0) bcf_hrec_destroy(hrec); |
3799 | 28.6k | if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h); |
3800 | 28.6k | k = kh_get(vdict, d, t); |
3801 | 28.6k | v->errcode |= BCF_ERR_TAG_UNDEF; |
3802 | 28.6k | if (res || k == kh_end(d)) { |
3803 | 36 | hts_log_error("Could not add dummy header for FILTER '%s' at %s:%"PRIhts_pos, t, bcf_seqname_safe(h,v), v->pos+1); |
3804 | 36 | v->errcode |= BCF_ERR_TAG_INVALID; |
3805 | 36 | free(a_flt); |
3806 | 36 | return -1; |
3807 | 36 | } |
3808 | 28.6k | } |
3809 | 1.01M | a_flt[i++] = kh_val(d, k).id; |
3810 | 1.01M | } |
3811 | | |
3812 | 38.9k | bcf_enc_vint(str, n_flt, a_flt, -1); |
3813 | 38.9k | free(a_flt); |
3814 | | |
3815 | 38.9k | return 0; |
3816 | 38.9k | } |
3817 | | |
3818 | 41.6k | static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) { |
3819 | 41.6k | static int extreme_int_warned = 0, negative_rlen_warned = 0; |
3820 | 41.6k | int max_n_val = 0, overflow = 0; |
3821 | 41.6k | char *r, *key; |
3822 | 41.6k | khint_t k; |
3823 | 41.6k | vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID]; |
3824 | 41.6k | int32_t *a_val = NULL; |
3825 | | |
3826 | 41.6k | v->n_info = 0; |
3827 | 41.6k | if (*(q-1) == ';') *(q-1) = 0; |
3828 | 2.70M | for (r = key = p;; ++r) { |
3829 | 2.70M | int c; |
3830 | 2.70M | char *val, *end; |
3831 | 246M | while (*r > '=' || (*r != ';' && *r != '=' && *r != 0)) r++; |
3832 | 2.70M | if (v->n_info == UINT16_MAX) { |
3833 | 1 | hts_log_error("Too many INFO entries at %s:%"PRIhts_pos, |
3834 | 1 | bcf_seqname_safe(h,v), v->pos+1); |
3835 | 1 | v->errcode |= BCF_ERR_LIMITS; |
3836 | 1 | goto fail; |
3837 | 1 | } |
3838 | 2.70M | val = end = NULL; |
3839 | 2.70M | c = *r; *r = 0; |
3840 | 2.70M | if (c == '=') { |
3841 | 991k | val = r + 1; |
3842 | | |
3843 | 325M | for (end = val; *end != ';' && *end != 0; ++end); |
3844 | 991k | c = *end; *end = 0; |
3845 | 1.71M | } else end = r; |
3846 | 2.70M | if ( !*key ) { if (c==0) break; r = end; key = r + 1; continue; } // faulty VCF, ";;" in the INFO |
3847 | 2.64M | k = kh_get(vdict, d, key); |
3848 | 2.64M | if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_INFO] == 15) |
3849 | 27.8k | { |
3850 | 27.8k | hts_log_warning("INFO '%s' is not defined in the header, assuming Type=String", key); |
3851 | 27.8k | kstring_t tmp = {0,0,0}; |
3852 | 27.8k | int l; |
3853 | 27.8k | ksprintf(&tmp, "##INFO=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", key); |
3854 | 27.8k | bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l); |
3855 | 27.8k | free(tmp.s); |
3856 | 27.8k | int res = hrec ? bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) : -1; |
3857 | 27.8k | if (res < 0) bcf_hrec_destroy(hrec); |
3858 | 27.8k | if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h); |
3859 | 27.8k | k = kh_get(vdict, d, key); |
3860 | 27.8k | v->errcode |= BCF_ERR_TAG_UNDEF; |
3861 | 27.8k | if (res || k == kh_end(d)) { |
3862 | 55 | hts_log_error("Could not add dummy header for INFO '%s' at %s:%"PRIhts_pos, key, bcf_seqname_safe(h,v), v->pos+1); |
3863 | 55 | v->errcode |= BCF_ERR_TAG_INVALID; |
3864 | 55 | goto fail; |
3865 | 55 | } |
3866 | 27.8k | } |
3867 | 2.64M | uint32_t y = kh_val(d, k).info[BCF_HL_INFO]; |
3868 | 2.64M | ++v->n_info; |
3869 | 2.64M | bcf_enc_int1(str, kh_val(d, k).id); |
3870 | 2.64M | if (val == 0) { |
3871 | 1.65M | bcf_enc_size(str, 0, BCF_BT_NULL); |
3872 | 1.65M | } else if ((y>>4&0xf) == BCF_HT_FLAG || (y>>4&0xf) == BCF_HT_STR) { // if Flag has a value, treat it as a string |
3873 | 55.0k | bcf_enc_vchar(str, end - val, val); |
3874 | 936k | } else { // int/float value/array |
3875 | 936k | int i, n_val; |
3876 | 936k | char *t, *te; |
3877 | 314M | for (t = val, n_val = 1; *t; ++t) // count the number of values |
3878 | 313M | if (*t == ',') ++n_val; |
3879 | | // Check both int and float size in one step for simplicity |
3880 | 936k | if (n_val > max_n_val) { |
3881 | 2.99k | int32_t *a_tmp = (int32_t *)realloc(a_val, n_val * sizeof(*a_val)); |
3882 | 2.99k | if (!a_tmp) { |
3883 | 0 | hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1); |
3884 | 0 | v->errcode |= BCF_ERR_LIMITS; // No appropriate code? |
3885 | 0 | goto fail; |
3886 | 0 | } |
3887 | 2.99k | a_val = a_tmp; |
3888 | 2.99k | max_n_val = n_val; |
3889 | 2.99k | } |
3890 | 936k | if ((y>>4&0xf) == BCF_HT_INT) { |
3891 | 604k | i = 0, t = val; |
3892 | 604k | int64_t val1; |
3893 | 604k | int is_int64 = 0; |
3894 | | #ifdef VCF_ALLOW_INT64 |
3895 | | if ( n_val==1 ) |
3896 | | { |
3897 | | overflow = 0; |
3898 | | long long int tmp_val = hts_str2int(val, &te, sizeof(tmp_val)*CHAR_BIT, &overflow); |
3899 | | if ( te==val ) tmp_val = bcf_int32_missing; |
3900 | | else if ( overflow || tmp_val<BCF_MIN_BT_INT64 || tmp_val>BCF_MAX_BT_INT64 ) |
3901 | | { |
3902 | | if ( !extreme_int_warned ) |
3903 | | { |
3904 | | hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1); |
3905 | | extreme_int_warned = 1; |
3906 | | } |
3907 | | tmp_val = bcf_int32_missing; |
3908 | | } |
3909 | | else |
3910 | | is_int64 = 1; |
3911 | | val1 = tmp_val; |
3912 | | t = te; |
3913 | | i = 1; // this is just to avoid adding another nested block... |
3914 | | } |
3915 | | #endif |
3916 | 145M | for (; i < n_val; ++i, ++t) |
3917 | 145M | { |
3918 | 145M | overflow = 0; |
3919 | 145M | long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow); |
3920 | 145M | if ( te==t ) tmp_val = bcf_int32_missing; |
3921 | 1.15M | else if ( overflow || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 ) |
3922 | 137k | { |
3923 | 137k | if ( !extreme_int_warned ) |
3924 | 1 | { |
3925 | 1 | hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1); |
3926 | 1 | extreme_int_warned = 1; |
3927 | 1 | } |
3928 | 137k | tmp_val = bcf_int32_missing; |
3929 | 137k | } |
3930 | 145M | a_val[i] = tmp_val; |
3931 | 197M | for (t = te; *t && *t != ','; t++); |
3932 | 145M | } |
3933 | 604k | if (n_val == 1) { |
3934 | | #ifdef VCF_ALLOW_INT64 |
3935 | | if ( is_int64 ) |
3936 | | { |
3937 | | v->unpacked |= BCF_IS_64BIT; |
3938 | | bcf_enc_long1(str, val1); |
3939 | | } |
3940 | | else |
3941 | | bcf_enc_int1(str, (int32_t)val1); |
3942 | | #else |
3943 | 469k | val1 = a_val[0]; |
3944 | 469k | bcf_enc_int1(str, (int32_t)val1); |
3945 | 469k | #endif |
3946 | 469k | } else { |
3947 | 135k | bcf_enc_vint(str, n_val, a_val, -1); |
3948 | 135k | } |
3949 | 604k | if (n_val==1 && (val1!=bcf_int32_missing || is_int64) |
3950 | 388k | && memcmp(key, "END", 4) == 0) |
3951 | 0 | { |
3952 | 0 | if ( val1 <= v->pos ) |
3953 | 0 | { |
3954 | 0 | if ( !negative_rlen_warned ) |
3955 | 0 | { |
3956 | 0 | hts_log_warning("INFO/END=%"PRIhts_pos" is smaller than POS at %s:%"PRIhts_pos,val1,bcf_seqname_safe(h,v),v->pos+1); |
3957 | 0 | negative_rlen_warned = 1; |
3958 | 0 | } |
3959 | 0 | } |
3960 | 0 | } |
3961 | 604k | } else if ((y>>4&0xf) == BCF_HT_REAL) { |
3962 | 331k | float *val_f = (float *)a_val; |
3963 | 75.1M | for (i = 0, t = val; i < n_val; ++i, ++t) |
3964 | 74.8M | { |
3965 | 74.8M | overflow = 0; |
3966 | 74.8M | val_f[i] = hts_str2dbl(t, &te, &overflow); |
3967 | 74.8M | if ( te==t || overflow ) // conversion failed |
3968 | 73.5M | bcf_float_set_missing(val_f[i]); |
3969 | 106M | for (t = te; *t && *t != ','; t++); |
3970 | 74.8M | } |
3971 | 331k | bcf_enc_vfloat(str, n_val, val_f); |
3972 | 331k | } |
3973 | 936k | } |
3974 | 2.64M | if (c == 0) break; |
3975 | 2.62M | r = end; |
3976 | 2.62M | key = r + 1; |
3977 | 2.62M | } |
3978 | | |
3979 | 41.6k | free(a_val); |
3980 | 41.6k | return 0; |
3981 | | |
3982 | 56 | fail: |
3983 | 56 | free(a_val); |
3984 | 56 | return -1; |
3985 | 41.6k | } |
3986 | | |
/*
 * Parse one tab-separated VCF text line held in `s` into the record `v`.
 *
 * The fixed columns are consumed in order: CHROM, POS, ID, REF, ALT, QUAL,
 * FILTER, INFO and then the optional FORMAT/sample columns.  Each field is
 * NUL-terminated in place inside s->s as it is tokenised, so the caller's
 * buffer is modified.  Parsing stops early (successfully) after QUAL, FILTER
 * or INFO depending on v->max_unpack.
 *
 * s  Line to parse; it is grown by 4 bytes and zero-padded before parsing.
 * h  Header used to look up the contig name; also handed to the
 *    FILTER/INFO/FORMAT sub-parsers.
 * v  Record to fill; it is reset with bcf_clear1() first.
 *
 * Returns 0 on success and -2 on any failure (NULL argument, failed resize,
 * or a missing/malformed field).  Some failures additionally set bits in
 * v->errcode.
 */
int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
{
    int ret = -2, overflow = 0;
    char *p, *q, *r, *t;
    kstring_t *str;
    khint_t k;
    ks_tokaux_t aux;

    // NOT_DOT(p) is non-zero unless the field at p is exactly ".".
    // Earlier formulations are kept below for reference; the 2-byte memcmp
    // is the one in use.
//#define NOT_DOT(p) strcmp((p), ".")
//#define NOT_DOT(p) (!(*p == '.' && !p[1]))
//#define NOT_DOT(p) ((*p) != '.' || (p)[1])
//#define NOT_DOT(p) (q-p != 1 || memcmp(p, ".\0", 2))
#define NOT_DOT(p) (memcmp(p, ".\0", 2))

    if (!s || !h || !v || !(s->s))
        return ret;

    // Assumed in lots of places, but we may as well spot this early
    assert(sizeof(float) == sizeof(int32_t));

    // Ensure the string we parse has room to permit some overflow during
    // parsing.  E.g. to do memcmp(key, "END", 4) in vcf_parse_info instead
    // of the more straightforward looking strcmp, giving a speed advantage.
    if (ks_resize(s, s->l+4) < 0)
        return -2;

    // Force our memory to be initialised so we avoid the technicality of
    // undefined behaviour in using a 4-byte memcmp.  (The reality is this
    // almost certainly is never detected by the compiler so has no impact,
    // but equally so this code has minimal (often beneficial) impact on
    // performance too.)
    s->s[s->l+0] = 0;
    s->s[s->l+1] = 0;
    s->s[s->l+2] = 0;
    s->s[s->l+3] = 0;

    bcf_clear1(v);
    str = &v->shared;   // ID, alleles, FILTER and INFO are encoded into here
    memset(&aux, 0, sizeof(ks_tokaux_t));

    // CHROM
    // For every field: p is the field start, q (aux.p) is the separator that
    // followed it, which is overwritten with NUL to terminate the field.
    if (!(p = kstrtok(s->s, "\t", &aux)))
        goto err;
    *(q = (char*)aux.p) = 0;

    vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
    k = kh_get(vdict, d, p);
    if (k == kh_end(d)) {
        // Unknown contig: flag it on the record, then try to recover through
        // fix_chromosome().  Note that BCF_ERR_CTG_UNDEF stays set in
        // v->errcode even when the recovery succeeds.
        hts_log_warning("Contig '%s' is not defined in the header. (Quick workaround: index the file with tabix.)", p);
        v->errcode = BCF_ERR_CTG_UNDEF;
        if ((k = fix_chromosome(h, d, p)) == kh_end(d)) {
            hts_log_error("Could not add dummy header for contig '%s'", p);
            v->errcode |= BCF_ERR_CTG_INVALID;
            goto err;
        }
    }
    v->rid = kh_val(d, k).id;

    // POS
    if (!(p = kstrtok(0, 0, &aux)))
        goto err;
    *(q = (char*)aux.p) = 0;

    overflow = 0;
    char *tmp = p;      // keep the field start for the error messages below
    // NOTE(review): 62 is presumably the maximum bit width accepted by
    // hts_str2uint() - confirm against its declaration.
    v->pos = hts_str2uint(p, &p, 62, &overflow);
    if (overflow) {
        hts_log_error("Position value '%s' is too large", tmp);
        goto err;
    } else if ( *p ) {
        // Trailing characters after the number: not a clean integer
        hts_log_error("Could not parse the position '%s'", tmp);
        goto err;
    } else {
        // Stored one less than the text value (it is printed as pos+1)
        v->pos -= 1;
    }
    if (v->pos >= INT32_MAX)
        v->unpacked |= BCF_IS_64BIT;

    // ID
    if (!(p = kstrtok(0, 0, &aux)))
        goto err;
    *(q = (char*)aux.p) = 0;

    // A "." ID is encoded as a zero-length character vector
    if (NOT_DOT(p)) bcf_enc_vchar(str, q - p, p);
    else bcf_enc_size(str, 0, BCF_BT_CHAR);

    // REF
    if (!(p = kstrtok(0, 0, &aux)))
        goto err;
    *(q = (char*)aux.p) = 0;

    bcf_enc_vchar(str, q - p, p);
    // rlen is provisionally the REF length; it is recomputed via get_rlen()
    // at the "end" label.
    v->n_allele = 1, v->rlen = q - p;

    // ALT
    if (!(p = kstrtok(0, 0, &aux)))
        goto err;
    *(q = (char*)aux.p) = 0;

    if (NOT_DOT(p)) {
        // Walk p..q inclusive; t marks the start of the current allele and
        // each ',' (or the terminating NUL at q) closes one allele.
        for (r = t = p;; ++r) {
            if (*r == ',' || *r == 0) {
                if (v->n_allele == UINT16_MAX) {
                    hts_log_error("Too many ALT alleles at %s:%"PRIhts_pos,
                                  bcf_seqname_safe(h,v), v->pos+1);
                    v->errcode |= BCF_ERR_LIMITS;
                    goto err;
                }
                bcf_enc_vchar(str, r - t, t);
                t = r + 1;
                ++v->n_allele;
            }
            if (r == q) break;
        }
    }

    // QUAL
    if (!(p = kstrtok(0, 0, &aux)))
        goto err;
    *(q = (char*)aux.p) = 0;

    // atof() gives no error indication, so unparsable text is not rejected here
    if (NOT_DOT(p)) v->qual = atof(p);
    else bcf_float_set_missing(v->qual);
    // Early exit: max_unpack is non-zero but has no bits set above bit 0
    if ( v->max_unpack && !(v->max_unpack>>1) ) goto end; // BCF_UN_STR

    // FILTER
    if (!(p = kstrtok(0, 0, &aux)))
        goto err;
    *(q = (char*)aux.p) = 0;

    if (NOT_DOT(p)) {
        if (vcf_parse_filter(str, h, v, p, q)) {
            goto err;
        }
    } else bcf_enc_vint(str, 0, 0, -1);     // "." => empty integer vector
    // Early exit: max_unpack is non-zero but has no bits set above bit 1
    if ( v->max_unpack && !(v->max_unpack>>2) ) goto end; // BCF_UN_FLT

    // INFO
    if (!(p = kstrtok(0, 0, &aux)))
        goto err;
    *(q = (char*)aux.p) = 0;

    // A "." INFO column adds nothing to the shared block
    if (NOT_DOT(p)) {
        if (vcf_parse_info(str, h, v, p, q)) {
            goto err;
        }
    }
    // Early exit: max_unpack is non-zero but has no bits set above bit 2
    if ( v->max_unpack && !(v->max_unpack>>3) ) goto end;

    // FORMAT; optional
    p = kstrtok(0, 0, &aux);
    if (p) {
        *(q = (char*)aux.p) = 0;

        // Note this is handed the whole line `s`, not only the FORMAT token
        if (vcf_parse_format(s, h, v, p, q)) {
            goto err;
        }
    }

 end:
    v->rlen = get_rlen(h, v); //set rlen based on version
    ret = 0;

 err:
    // Reached with ret still -2 on every failure path
    return ret;
}
4153 | | |
4154 | | int vcf_open_mode(char *mode, const char *fn, const char *format) |
4155 | 0 | { |
4156 | 0 | if (format == NULL) { |
4157 | | // Try to pick a format based on the filename extension |
4158 | 0 | char extension[HTS_MAX_EXT_LEN]; |
4159 | 0 | if (find_file_extension(fn, extension) < 0) return -1; |
4160 | 0 | return vcf_open_mode(mode, fn, extension); |
4161 | 0 | } |
4162 | 0 | else if (strcasecmp(format, "bcf") == 0) strcpy(mode, "b"); |
4163 | 0 | else if (strcasecmp(format, "vcf") == 0) strcpy(mode, ""); |
4164 | 0 | else if (strcasecmp(format, "vcf.gz") == 0 || strcasecmp(format, "vcf.bgz") == 0) strcpy(mode, "z"); |
4165 | 0 | else return -1; |
4166 | | |
4167 | 0 | return 0; |
4168 | 0 | } |
4169 | | |
4170 | | int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) |
4171 | 43.1k | { |
4172 | 43.1k | int ret; |
4173 | 43.1k | ret = hts_getline(fp, KS_SEP_LINE, &fp->line); |
4174 | 43.1k | if (ret < 0) return ret; |
4175 | 42.7k | return vcf_parse1(&fp->line, h, v); |
4176 | 43.1k | } |
4177 | | |
4178 | | static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt) |
4179 | 0 | { |
4180 | 0 | uint8_t *ptr_start = ptr; |
4181 | 0 | fmt->id = bcf_dec_typed_int1(ptr, &ptr); |
4182 | 0 | fmt->n = bcf_dec_size(ptr, &ptr, &fmt->type); |
4183 | 0 | fmt->size = fmt->n << bcf_type_shift[fmt->type]; |
4184 | 0 | fmt->p = ptr; |
4185 | 0 | fmt->p_off = ptr - ptr_start; |
4186 | 0 | fmt->p_free = 0; |
4187 | 0 | ptr += n_sample * fmt->size; |
4188 | 0 | fmt->p_len = ptr - fmt->p; |
4189 | 0 | return ptr; |
4190 | 0 | } |
4191 | | |
4192 | | static inline uint8_t *bcf_unpack_info_core1(uint8_t *ptr, bcf_info_t *info) |
4193 | 693 | { |
4194 | 693 | uint8_t *ptr_start = ptr; |
4195 | 693 | int64_t len = 0; |
4196 | 693 | info->key = bcf_dec_typed_int1(ptr, &ptr); |
4197 | 693 | len = info->len = bcf_dec_size(ptr, &ptr, &info->type); |
4198 | 693 | info->vptr = ptr; |
4199 | 693 | info->vptr_off = ptr - ptr_start; |
4200 | 693 | info->vptr_free = 0; |
4201 | 693 | info->v1.i = 0; |
4202 | 693 | if (info->len == 1) { |
4203 | 134 | switch(info->type) { |
4204 | 0 | case BCF_BT_INT8: |
4205 | 134 | case BCF_BT_CHAR: |
4206 | 134 | info->v1.i = *(int8_t*)ptr; |
4207 | 134 | break; |
4208 | 0 | case BCF_BT_INT16: |
4209 | 0 | info->v1.i = le_to_i16(ptr); |
4210 | 0 | len <<= 1; |
4211 | 0 | break; |
4212 | 0 | case BCF_BT_INT32: |
4213 | 0 | info->v1.i = le_to_i32(ptr); |
4214 | 0 | len <<= 2; |
4215 | 0 | break; |
4216 | 0 | case BCF_BT_FLOAT: |
4217 | 0 | info->v1.f = le_to_float(ptr); |
4218 | 0 | len <<= 2; |
4219 | 0 | break; |
4220 | 0 | case BCF_BT_INT64: |
4221 | 0 | info->v1.i = le_to_i64(ptr); |
4222 | 0 | len <<= 3; |
4223 | 0 | break; |
4224 | 134 | } |
4225 | 559 | } else { |
4226 | 559 | len <<= bcf_type_shift[info->type]; |
4227 | 559 | } |
4228 | 693 | ptr += len; |
4229 | | |
4230 | 693 | info->vptr_len = ptr - info->vptr; |
4231 | 693 | return ptr; |
4232 | 693 | } |
4233 | | |
4234 | | int bcf_unpack(bcf1_t *b, int which) |
4235 | 41.0k | { |
4236 | 41.0k | if ( !b->shared.l ) return 0; // Building a new BCF record from scratch |
4237 | 41.0k | uint8_t *ptr = (uint8_t*)b->shared.s, *ptr_ori; |
4238 | 41.0k | int i; |
4239 | 41.0k | bcf_dec_t *d = &b->d; |
4240 | 41.0k | if (which & BCF_UN_FLT) which |= BCF_UN_STR; |
4241 | 41.0k | if (which & BCF_UN_INFO) which |= BCF_UN_SHR; |
4242 | 41.0k | if ((which&BCF_UN_STR) && !(b->unpacked&BCF_UN_STR)) |
4243 | 41.0k | { |
4244 | 41.0k | kstring_t tmp; |
4245 | | |
4246 | | // ID |
4247 | 41.0k | tmp.l = 0; tmp.s = d->id; tmp.m = d->m_id; |
4248 | 41.0k | ptr_ori = ptr; |
4249 | 41.0k | ptr = bcf_fmt_sized_array(&tmp, ptr); |
4250 | 41.0k | b->unpack_size[0] = ptr - ptr_ori; |
4251 | 41.0k | kputc_('\0', &tmp); |
4252 | 41.0k | d->id = tmp.s; d->m_id = tmp.m; |
4253 | | |
4254 | | // REF and ALT are in a single block (d->als) and d->alleles are pointers into this block |
4255 | 41.0k | hts_expand(char*, b->n_allele, d->m_allele, d->allele); // NM: hts_expand() is a macro |
4256 | 41.0k | tmp.l = 0; tmp.s = d->als; tmp.m = d->m_als; |
4257 | 41.0k | ptr_ori = ptr; |
4258 | 1.19M | for (i = 0; i < b->n_allele; ++i) { |
4259 | | // Use offset within tmp.s as realloc may change pointer |
4260 | 1.15M | d->allele[i] = (char *)(intptr_t)tmp.l; |
4261 | 1.15M | ptr = bcf_fmt_sized_array(&tmp, ptr); |
4262 | 1.15M | kputc_('\0', &tmp); |
4263 | 1.15M | } |
4264 | 41.0k | b->unpack_size[1] = ptr - ptr_ori; |
4265 | 41.0k | d->als = tmp.s; d->m_als = tmp.m; |
4266 | | |
4267 | | // Convert our offsets within tmp.s back to pointers again |
4268 | 1.19M | for (i = 0; i < b->n_allele; ++i) |
4269 | 1.15M | d->allele[i] = d->als + (ptrdiff_t)d->allele[i]; |
4270 | 41.0k | b->unpacked |= BCF_UN_STR; |
4271 | 41.0k | } |
4272 | 41.0k | if ((which&BCF_UN_FLT) && !(b->unpacked&BCF_UN_FLT)) { // FILTER |
4273 | 41.0k | ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1]; |
4274 | 41.0k | ptr_ori = ptr; |
4275 | 41.0k | if (*ptr>>4) { |
4276 | 37.8k | int type; |
4277 | 37.8k | d->n_flt = bcf_dec_size(ptr, &ptr, &type); |
4278 | 37.8k | hts_expand(int, d->n_flt, d->m_flt, d->flt); |
4279 | 583k | for (i = 0; i < d->n_flt; ++i) |
4280 | 545k | d->flt[i] = bcf_dec_int1(ptr, type, &ptr); |
4281 | 37.8k | } else ++ptr, d->n_flt = 0; |
4282 | 41.0k | b->unpack_size[2] = ptr - ptr_ori; |
4283 | 41.0k | b->unpacked |= BCF_UN_FLT; |
4284 | 41.0k | } |
4285 | 41.0k | if ((which&BCF_UN_INFO) && !(b->unpacked&BCF_UN_INFO)) { // INFO |
4286 | 0 | ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1] + b->unpack_size[2]; |
4287 | 0 | hts_expand(bcf_info_t, b->n_info, d->m_info, d->info); |
4288 | 0 | for (i = 0; i < d->m_info; ++i) d->info[i].vptr_free = 0; |
4289 | 0 | for (i = 0; i < b->n_info; ++i) |
4290 | 0 | ptr = bcf_unpack_info_core1(ptr, &d->info[i]); |
4291 | 0 | b->unpacked |= BCF_UN_INFO; |
4292 | 0 | } |
4293 | 41.0k | if ((which&BCF_UN_FMT) && b->n_sample && !(b->unpacked&BCF_UN_FMT)) { // FORMAT |
4294 | 0 | ptr = (uint8_t*)b->indiv.s; |
4295 | 0 | hts_expand(bcf_fmt_t, b->n_fmt, d->m_fmt, d->fmt); |
4296 | 0 | for (i = 0; i < d->m_fmt; ++i) d->fmt[i].p_free = 0; |
4297 | 0 | for (i = 0; i < b->n_fmt; ++i) |
4298 | 0 | ptr = bcf_unpack_fmt_core1(ptr, b->n_sample, &d->fmt[i]); |
4299 | 0 | b->unpacked |= BCF_UN_FMT; |
4300 | 0 | } |
4301 | 41.0k | return 0; |
4302 | 41.0k | } |
4303 | | |
4304 | | int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s) |
4305 | 41.0k | { |
4306 | 41.0k | int i; |
4307 | 41.0k | int32_t max_dt_id = h->n[BCF_DT_ID]; |
4308 | 41.0k | const char *chrom = bcf_seqname(h, v); |
4309 | 41.0k | if (!chrom) { |
4310 | 0 | hts_log_error("Invalid BCF, CONTIG id=%d not present in the header", |
4311 | 0 | v->rid); |
4312 | 0 | errno = EINVAL; |
4313 | 0 | return -1; |
4314 | 0 | } |
4315 | | |
4316 | 41.0k | bcf_unpack((bcf1_t*)v, BCF_UN_ALL & ~(BCF_UN_INFO|BCF_UN_FMT)); |
4317 | | |
4318 | | // Cache of key lengths so we don't keep repeatedly using them. |
4319 | | // This assumes we're not modifying the header between successive calls |
4320 | | // to vcf_format, but that would lead to many other forms of breakage |
4321 | | // so it feels like a valid assumption to make. |
4322 | | // |
4323 | | // We cannot just do this in bcf_hdr_sync as some code (eg bcftools |
4324 | | // annotate) manipulates the headers directly without calling sync to |
4325 | | // refresh the data structures. So we must do just-in-time length |
4326 | | // calculation during writes instead. |
4327 | 41.0k | bcf_hdr_aux_t *aux = get_hdr_aux(h); |
4328 | 41.0k | if (!aux->key_len) { |
4329 | 3.57k | if (!(aux->key_len = calloc(h->n[BCF_DT_ID]+1, sizeof(*aux->key_len)))) |
4330 | 0 | return -1; |
4331 | 3.57k | } |
4332 | 41.0k | size_t *key_len = aux->key_len; |
4333 | | |
4334 | 41.0k | kputs(chrom, s); // CHROM |
4335 | 41.0k | kputc_('\t', s); kputll(v->pos + 1, s); // POS |
4336 | 41.0k | kputc_('\t', s); kputs(v->d.id ? v->d.id : ".", s); // ID |
4337 | 41.0k | kputc_('\t', s); // REF |
4338 | 41.0k | if (v->n_allele > 0) kputs(v->d.allele[0], s); |
4339 | 0 | else kputc_('.', s); |
4340 | 41.0k | kputc_('\t', s); // ALT |
4341 | 41.0k | if (v->n_allele > 1) { |
4342 | 1.14M | for (i = 1; i < v->n_allele; ++i) { |
4343 | 1.10M | if (i > 1) kputc_(',', s); |
4344 | 1.10M | kputs(v->d.allele[i], s); |
4345 | 1.10M | } |
4346 | 39.9k | } else kputc_('.', s); |
4347 | 41.0k | kputc_('\t', s); // QUAL |
4348 | 41.0k | if ( bcf_float_is_missing(v->qual) ) kputc_('.', s); // QUAL |
4349 | 39.9k | else kputd(v->qual, s); |
4350 | 41.0k | kputc_('\t', s); // FILTER |
4351 | 41.0k | if (v->d.n_flt) { |
4352 | 583k | for (i = 0; i < v->d.n_flt; ++i) { |
4353 | 545k | int32_t idx = v->d.flt[i]; |
4354 | 545k | if (idx < 0 || idx >= max_dt_id |
4355 | 545k | || h->id[BCF_DT_ID][idx].key == NULL) { |
4356 | 0 | hts_log_error("Invalid BCF, the FILTER tag id=%d at %s:%"PRIhts_pos" not present in the header", |
4357 | 0 | idx, bcf_seqname_safe(h, v), v->pos + 1); |
4358 | 0 | errno = EINVAL; |
4359 | 0 | return -1; |
4360 | 0 | } |
4361 | 545k | if (i) kputc_(';', s); |
4362 | 545k | if (!key_len[idx]) |
4363 | 64.2k | key_len[idx] = strlen(h->id[BCF_DT_ID][idx].key); |
4364 | 545k | kputsn(h->id[BCF_DT_ID][idx].key, key_len[idx], s); |
4365 | 545k | } |
4366 | 37.8k | } else kputc_('.', s); |
4367 | | |
4368 | 41.0k | kputc_('\t', s); // INFO |
4369 | 41.0k | if (v->n_info) { |
4370 | 14.7k | uint8_t *ptr = v->shared.s |
4371 | 14.7k | ? (uint8_t *)v->shared.s + v->unpack_size[0] + |
4372 | 14.7k | v->unpack_size[1] + v->unpack_size[2] |
4373 | 14.7k | : NULL; |
4374 | 14.7k | int first = 1; |
4375 | 14.7k | bcf_info_t *info = v->d.info; |
4376 | | |
4377 | | // Note if we duplicate this code into custom packed and unpacked |
4378 | | // implementations then we gain a bit more speed, particularly with |
4379 | | // clang 13 (up to 5%). Not sure why this is, but code duplication |
4380 | | // isn't pleasant and it's still faster adding packed support than |
4381 | | // not so it's a win, just not as good as it should be. |
4382 | 14.7k | const int info_packed = !(v->unpacked & BCF_UN_INFO) && v->shared.l; |
4383 | 1.39M | for (i = 0; i < v->n_info; ++i) { |
4384 | 1.38M | bcf_info_t in, *z; |
4385 | 1.38M | if (info_packed) { |
4386 | | // Use a local bcf_info_t when data is packed |
4387 | 1.38M | z = ∈ |
4388 | 1.38M | z->key = bcf_dec_typed_int1(ptr, &ptr); |
4389 | 1.38M | z->len = bcf_dec_size(ptr, &ptr, &z->type); |
4390 | 1.38M | z->vptr = ptr; |
4391 | 1.38M | ptr += z->len << bcf_type_shift[z->type]; |
4392 | 1.38M | } else { |
4393 | | // Else previously unpacked INFO struct |
4394 | 0 | z = &info[i]; |
4395 | | |
4396 | | // Also potentially since deleted |
4397 | 0 | if ( !z->vptr ) continue; |
4398 | 0 | } |
4399 | | |
4400 | 1.38M | bcf_idpair_t *id = z->key >= 0 && z->key < max_dt_id |
4401 | 1.38M | ? &h->id[BCF_DT_ID][z->key] |
4402 | 1.38M | : NULL; |
4403 | | |
4404 | 1.38M | if (!id || !id->key) { |
4405 | 0 | hts_log_error("Invalid BCF, the INFO tag id=%d is %s at %s:%"PRIhts_pos, |
4406 | 0 | z->key, |
4407 | 0 | z->key < 0 ? "negative" |
4408 | 0 | : (z->key >= max_dt_id ? "too large" : "not present in the header"), |
4409 | 0 | bcf_seqname_safe(h, v), v->pos+1); |
4410 | 0 | errno = EINVAL; |
4411 | 0 | return -1; |
4412 | 0 | } |
4413 | | |
4414 | | // KEY |
4415 | 1.38M | if (!key_len[z->key]) |
4416 | 20.7k | key_len[z->key] = strlen(id->key); |
4417 | 1.38M | size_t id_len = key_len[z->key]; |
4418 | 1.38M | if (ks_resize(s, s->l + 3 + id_len) < 0) |
4419 | 0 | return -1; |
4420 | 1.38M | char *sptr = s->s + s->l; |
4421 | 1.38M | if ( !first ) { |
4422 | 1.36M | *sptr++ = ';'; |
4423 | 1.36M | s->l++; |
4424 | 1.36M | } |
4425 | 1.38M | first = 0; |
4426 | 1.38M | memcpy(sptr, id->key, id_len); |
4427 | 1.38M | s->l += id_len; |
4428 | | |
4429 | | // VALUE |
4430 | 1.38M | if (z->len <= 0) continue; |
4431 | 491k | sptr[id_len] = '='; |
4432 | 491k | s->l++; |
4433 | | |
4434 | 491k | if (z->len != 1 || info_packed) { |
4435 | 491k | bcf_fmt_array(s, z->len, z->type, z->vptr); |
4436 | 491k | } else { |
4437 | | // Single length vectors are unpacked into their |
4438 | | // own info.v1 union and handled separately. |
4439 | 0 | if (z->type == BCF_BT_FLOAT) { |
4440 | 0 | if ( bcf_float_is_missing(z->v1.f) ) |
4441 | 0 | kputc_('.', s); |
4442 | 0 | else |
4443 | 0 | kputd(z->v1.f, s); |
4444 | 0 | } else if (z->type == BCF_BT_CHAR) { |
4445 | 0 | kputc_(z->v1.i, s); |
4446 | 0 | } else if (z->type < BCF_BT_INT64) { |
4447 | 0 | int64_t missing[] = { |
4448 | 0 | 0, // BCF_BT_NULL |
4449 | 0 | bcf_int8_missing, |
4450 | 0 | bcf_int16_missing, |
4451 | 0 | bcf_int32_missing, |
4452 | 0 | }; |
4453 | 0 | if (z->v1.i == missing[z->type]) |
4454 | 0 | kputc_('.', s); |
4455 | 0 | else |
4456 | 0 | kputw(z->v1.i, s); |
4457 | 0 | } else if (z->type == BCF_BT_INT64) { |
4458 | 0 | if (z->v1.i == bcf_int64_missing) |
4459 | 0 | kputc_('.', s); |
4460 | 0 | else |
4461 | 0 | kputll(z->v1.i, s); |
4462 | 0 | } else { |
4463 | 0 | hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, z->type, bcf_seqname_safe(h, v), v->pos+1); |
4464 | 0 | errno = EINVAL; |
4465 | 0 | return -1; |
4466 | 0 | } |
4467 | 0 | } |
4468 | 491k | } |
4469 | 14.7k | if ( first ) kputc_('.', s); |
4470 | 26.2k | } else kputc_('.', s); |
4471 | | |
4472 | | // FORMAT and individual information |
4473 | 41.0k | if (v->n_sample) { |
4474 | 17.8k | int i,j; |
4475 | 17.8k | if ( v->n_fmt) { |
4476 | 17.8k | uint8_t *ptr = (uint8_t *)v->indiv.s; |
4477 | 17.8k | int gt_i = -1; |
4478 | 17.8k | bcf_fmt_t *fmt = v->d.fmt; |
4479 | 17.8k | int first = 1, ret = 0; |
4480 | 17.8k | int fmt_packed = !(v->unpacked & BCF_UN_FMT); |
4481 | | |
4482 | 17.8k | if (fmt_packed) { |
4483 | | // Local fmt as we have an array of num FORMAT keys, |
4484 | | // each of which points to N.Sample values. |
4485 | | |
4486 | | // No real gain to be had in handling unpacked data here, |
4487 | | // but it doesn't cost us much in complexity either and |
4488 | | // it gives us flexibility. |
4489 | 17.8k | fmt = malloc(v->n_fmt * sizeof(*fmt)); |
4490 | 17.8k | if (!fmt) |
4491 | 0 | return -1; |
4492 | 17.8k | } |
4493 | | |
4494 | | // KEYS |
4495 | 52.0k | for (i = 0; i < (int)v->n_fmt; ++i) { |
4496 | 34.1k | bcf_fmt_t *z; |
4497 | 34.1k | z = &fmt[i]; |
4498 | 34.1k | if (fmt_packed) { |
4499 | 34.1k | z->id = bcf_dec_typed_int1(ptr, &ptr); |
4500 | 34.1k | z->n = bcf_dec_size(ptr, &ptr, &z->type); |
4501 | 34.1k | z->p = ptr; |
4502 | 34.1k | z->size = z->n << bcf_type_shift[z->type]; |
4503 | 34.1k | ptr += v->n_sample * z->size; |
4504 | 34.1k | } |
4505 | 34.1k | if ( !z->p ) continue; |
4506 | 34.1k | kputc_(!first ? ':' : '\t', s); first = 0; |
4507 | | |
4508 | 34.1k | bcf_idpair_t *id = z->id >= 0 && z->id < max_dt_id |
4509 | 34.1k | ? &h->id[BCF_DT_ID][z->id] |
4510 | 34.1k | : NULL; |
4511 | | |
4512 | 34.1k | if (!id || !id->key) { |
4513 | 0 | hts_log_error("Invalid BCF, the FORMAT tag id=%d at %s:%"PRIhts_pos" not present in the header", z->id, bcf_seqname_safe(h, v), v->pos+1); |
4514 | 0 | errno = EINVAL; |
4515 | 0 | if (fmt_packed) |
4516 | 0 | free(fmt); |
4517 | 0 | return -1; |
4518 | 0 | } |
4519 | | |
4520 | 34.1k | if (!key_len[z->id]) |
4521 | 17.6k | key_len[z->id] = strlen(id->key); |
4522 | 34.1k | size_t id_len = key_len[z->id]; |
4523 | 34.1k | kputsn(id->key, id_len, s); |
4524 | 34.1k | if (id_len == 2 && id->key[0] == 'G' && id->key[1] == 'T') |
4525 | 4.98k | gt_i = i; |
4526 | 34.1k | } |
4527 | 17.8k | if ( first ) kputsn("\t.", 2, s); |
4528 | | |
4529 | | // VALUES per sample |
4530 | 49.0k | for (j = 0; j < v->n_sample; ++j) { |
4531 | 31.2k | kputc_('\t', s); |
4532 | 31.2k | first = 1; |
4533 | 31.2k | bcf_fmt_t *f = fmt; |
4534 | 94.4k | for (i = 0; i < (int)v->n_fmt; i++, f++) { |
4535 | 77.0k | if ( !f->p ) continue; |
4536 | 77.0k | if (!first) kputc_(':', s); |
4537 | 77.0k | first = 0; |
4538 | 77.0k | if (gt_i == i) { |
4539 | 13.8k | if ((ret = bcf_format_gt_v2(h, f,j,s)) < 0) { |
4540 | 0 | hts_log_error("Failed to format GT value for sample %d, returned %d", i, ret); |
4541 | 0 | errno = EINVAL; |
4542 | 0 | if (fmt_packed) |
4543 | 0 | free(fmt); |
4544 | 0 | return -1; |
4545 | 0 | } |
4546 | 13.8k | break; |
4547 | 13.8k | } |
4548 | 63.2k | else if (f->n == 1) |
4549 | 4.59k | bcf_fmt_array1(s, f->type, f->p + j * (size_t)f->size); |
4550 | 58.6k | else |
4551 | 58.6k | bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size); |
4552 | 77.0k | } |
4553 | | |
4554 | | // Simpler loop post GT and at least 1 iteration |
4555 | 45.6k | for (i++, f++; i < (int)v->n_fmt; i++, f++) { |
4556 | 14.4k | if ( !f->p ) continue; |
4557 | 14.4k | kputc_(':', s); |
4558 | 14.4k | if (f->n == 1) |
4559 | 1.71k | bcf_fmt_array1(s, f->type, f->p + j * (size_t)f->size); |
4560 | 12.7k | else |
4561 | 12.7k | bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size); |
4562 | 14.4k | } |
4563 | 31.2k | if ( first ) kputc_('.', s); |
4564 | 31.2k | } |
4565 | 17.8k | if (fmt_packed) |
4566 | 17.8k | free(fmt); |
4567 | 17.8k | } |
4568 | 7 | else |
4569 | 21 | for (j=0; j<=v->n_sample; j++) |
4570 | 14 | kputsn("\t.", 2, s); |
4571 | 17.8k | } |
4572 | 41.0k | kputc('\n', s); |
4573 | 41.0k | return 0; |
4574 | 41.0k | } |
4575 | | |
4576 | | int vcf_write_line(htsFile *fp, kstring_t *line) |
4577 | 0 | { |
4578 | 0 | int ret; |
4579 | 0 | if ( line->s[line->l-1]!='\n' ) kputc('\n',line); |
4580 | 0 | if ( fp->format.compression!=no_compression ) |
4581 | 0 | ret = bgzf_write(fp->fp.bgzf, line->s, line->l); |
4582 | 0 | else |
4583 | 0 | ret = hwrite(fp->fp.hfile, line->s, line->l); |
4584 | 0 | return ret==line->l ? 0 : -1; |
4585 | 0 | } |
4586 | | |
4587 | | int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) |
4588 | 41.0k | { |
4589 | 41.0k | ssize_t ret; |
4590 | 41.0k | fp->line.l = 0; |
4591 | 41.0k | if (vcf_format1(h, v, &fp->line) != 0) |
4592 | 0 | return -1; |
4593 | 41.0k | if ( fp->format.compression!=no_compression ) { |
4594 | 0 | if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0) |
4595 | 0 | return -1; |
4596 | 0 | if (fp->idx && !fp->fp.bgzf->mt) |
4597 | 0 | hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf)); |
4598 | 0 | ret = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l); |
4599 | 41.0k | } else { |
4600 | 41.0k | ret = hwrite(fp->fp.hfile, fp->line.s, fp->line.l); |
4601 | 41.0k | } |
4602 | | |
4603 | 41.0k | if (fp->idx && fp->format.compression == bgzf) { |
4604 | 0 | int tid; |
4605 | 0 | if ((tid = hts_idx_tbi_name(fp->idx, v->rid, bcf_seqname_safe(h, v))) < 0) |
4606 | 0 | return -1; |
4607 | | |
4608 | 0 | if (bgzf_idx_push(fp->fp.bgzf, fp->idx, |
4609 | 0 | tid, v->pos, v->pos + v->rlen, |
4610 | 0 | bgzf_tell(fp->fp.bgzf), 1) < 0) |
4611 | 0 | return -1; |
4612 | 0 | } |
4613 | | |
4614 | 41.0k | return ret==fp->line.l ? 0 : -1; |
4615 | 41.0k | } |
4616 | | |
4617 | | /************************ |
4618 | | * Data access routines * |
4619 | | ************************/ |
4620 | | |
4621 | | int bcf_hdr_id2int(const bcf_hdr_t *h, int which, const char *id) |
4622 | 152k | { |
4623 | 152k | khint_t k; |
4624 | 152k | vdict_t *d = (vdict_t*)h->dict[which]; |
4625 | 152k | k = kh_get(vdict, d, id); |
4626 | 152k | return k == kh_end(d)? -1 : kh_val(d, k).id; |
4627 | 152k | } |
4628 | | |
4629 | | |
4630 | | /******************** |
4631 | | *** BCF indexing *** |
4632 | | ********************/ |
4633 | | |
4634 | | // Calculate number of index levels given min_shift and the header contig |
4635 | | // list. Also returns number of contigs in *nids_out. |
4636 | | static int idx_calc_n_lvls_ids(const bcf_hdr_t *h, int min_shift, |
4637 | | int starting_n_lvls, int *nids_out) |
4638 | 0 | { |
4639 | 0 | int n_lvls, i, nids = 0; |
4640 | 0 | int64_t max_len = 0, s; |
4641 | |
|
4642 | 0 | for (i = 0; i < h->n[BCF_DT_CTG]; ++i) |
4643 | 0 | { |
4644 | 0 | if ( !h->id[BCF_DT_CTG][i].val ) continue; |
4645 | 0 | if ( max_len < h->id[BCF_DT_CTG][i].val->info[0] ) |
4646 | 0 | max_len = h->id[BCF_DT_CTG][i].val->info[0]; |
4647 | 0 | nids++; |
4648 | 0 | } |
4649 | 0 | if ( !max_len ) max_len = (1LL<<31) - 1; // In case contig line is broken. |
4650 | 0 | max_len += 256; |
4651 | 0 | s = hts_bin_maxpos(min_shift, starting_n_lvls); |
4652 | 0 | for (n_lvls = starting_n_lvls; max_len > s; ++n_lvls, s <<= 3); |
4653 | |
|
4654 | 0 | if (nids_out) *nids_out = nids; |
4655 | 0 | return n_lvls; |
4656 | 0 | } |
4657 | | |
4658 | | hts_idx_t *bcf_index(htsFile *fp, int min_shift) |
4659 | 0 | { |
4660 | 0 | int n_lvls; |
4661 | 0 | bcf1_t *b = NULL; |
4662 | 0 | hts_idx_t *idx = NULL; |
4663 | 0 | bcf_hdr_t *h; |
4664 | 0 | int r; |
4665 | 0 | h = bcf_hdr_read(fp); |
4666 | 0 | if ( !h ) return NULL; |
4667 | 0 | int nids = 0; |
4668 | 0 | n_lvls = idx_calc_n_lvls_ids(h, min_shift, 0, &nids); |
4669 | 0 | idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls); |
4670 | 0 | if (!idx) goto fail; |
4671 | 0 | b = bcf_init1(); |
4672 | 0 | if (!b) goto fail; |
4673 | 0 | while ((r = bcf_read1(fp,h, b)) >= 0) { |
4674 | 0 | int ret; |
4675 | 0 | ret = hts_idx_push(idx, b->rid, b->pos, b->pos + b->rlen, bgzf_tell(fp->fp.bgzf), 1); |
4676 | 0 | if (ret < 0) goto fail; |
4677 | 0 | } |
4678 | 0 | if (r < -1) goto fail; |
4679 | 0 | hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf)); |
4680 | 0 | bcf_destroy1(b); |
4681 | 0 | bcf_hdr_destroy(h); |
4682 | 0 | return idx; |
4683 | | |
4684 | 0 | fail: |
4685 | 0 | hts_idx_destroy(idx); |
4686 | 0 | bcf_destroy1(b); |
4687 | 0 | bcf_hdr_destroy(h); |
4688 | 0 | return NULL; |
4689 | 0 | } |
4690 | | |
4691 | | hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx) |
4692 | 0 | { |
4693 | 0 | return fnidx? hts_idx_load2(fn, fnidx) : bcf_index_load(fn); |
4694 | 0 | } |
4695 | | |
4696 | | hts_idx_t *bcf_index_load3(const char *fn, const char *fnidx, int flags) |
4697 | 0 | { |
4698 | 0 | return hts_idx_load3(fn, fnidx, HTS_FMT_CSI, flags); |
4699 | 0 | } |
4700 | | |
4701 | | int bcf_index_build3(const char *fn, const char *fnidx, int min_shift, int n_threads) |
4702 | 0 | { |
4703 | 0 | htsFile *fp; |
4704 | 0 | hts_idx_t *idx; |
4705 | 0 | tbx_t *tbx; |
4706 | 0 | int ret; |
4707 | 0 | if ((fp = hts_open(fn, "rb")) == 0) return -2; |
4708 | 0 | if (n_threads) |
4709 | 0 | hts_set_threads(fp, n_threads); |
4710 | 0 | if ( fp->format.compression!=bgzf ) { hts_close(fp); return -3; } |
4711 | 0 | switch (fp->format.format) { |
4712 | 0 | case bcf: |
4713 | 0 | if (!min_shift) { |
4714 | 0 | hts_log_error("TBI indices for BCF files are not supported"); |
4715 | 0 | ret = -1; |
4716 | 0 | } else { |
4717 | 0 | idx = bcf_index(fp, min_shift); |
4718 | 0 | if (idx) { |
4719 | 0 | ret = hts_idx_save_as(idx, fn, fnidx, HTS_FMT_CSI); |
4720 | 0 | if (ret < 0) ret = -4; |
4721 | 0 | hts_idx_destroy(idx); |
4722 | 0 | } |
4723 | 0 | else ret = -1; |
4724 | 0 | } |
4725 | 0 | break; |
4726 | | |
4727 | 0 | case vcf: |
4728 | 0 | tbx = tbx_index(hts_get_bgzfp(fp), min_shift, &tbx_conf_vcf); |
4729 | 0 | if (tbx) { |
4730 | 0 | ret = hts_idx_save_as(tbx->idx, fn, fnidx, min_shift > 0 ? HTS_FMT_CSI : HTS_FMT_TBI); |
4731 | 0 | if (ret < 0) ret = -4; |
4732 | 0 | tbx_destroy(tbx); |
4733 | 0 | } |
4734 | 0 | else ret = -1; |
4735 | 0 | break; |
4736 | | |
4737 | 0 | default: |
4738 | 0 | ret = -3; |
4739 | 0 | break; |
4740 | 0 | } |
4741 | 0 | hts_close(fp); |
4742 | 0 | return ret; |
4743 | 0 | } |
4744 | | |
/*
 * Same as bcf_index_build3() with n_threads fixed at 0.
 */
int bcf_index_build2(const char *fn, const char *fnidx, int min_shift)
{
    const int n_threads = 0;

    return bcf_index_build3(fn, fnidx, min_shift, n_threads);
}
4749 | | |
/*
 * Same as bcf_index_build3() with a NULL index file name and n_threads
 * fixed at 0.
 */
int bcf_index_build(const char *fn, int min_shift)
{
    const char *fnidx = NULL;
    const int n_threads = 0;

    return bcf_index_build3(fn, fnidx, min_shift, n_threads);
}
4754 | | |
4755 | | // Initialise fp->idx for the current format type. |
4756 | | // This must be called after the header has been written but no other data. |
4757 | 0 | static int vcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) { |
4758 | 0 | int n_lvls, fmt; |
4759 | |
|
4760 | 0 | if (min_shift == 0) { |
4761 | 0 | min_shift = 14; |
4762 | 0 | n_lvls = 5; |
4763 | 0 | fmt = HTS_FMT_TBI; |
4764 | 0 | } else { |
4765 | | // Set initial n_lvls to match tbx_index() |
4766 | 0 | int starting_n_lvls = (TBX_MAX_SHIFT - min_shift + 2) / 3; |
4767 | | // Increase if necessary |
4768 | 0 | n_lvls = idx_calc_n_lvls_ids(h, min_shift, starting_n_lvls, NULL); |
4769 | 0 | fmt = HTS_FMT_CSI; |
4770 | 0 | } |
4771 | |
|
4772 | 0 | fp->idx = hts_idx_init(0, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls); |
4773 | 0 | if (!fp->idx) return -1; |
4774 | | |
4775 | | // Tabix meta data, added even in CSI for VCF |
4776 | 0 | uint8_t conf[4*7]; |
4777 | 0 | u32_to_le(TBX_VCF, conf+0); // fmt |
4778 | 0 | u32_to_le(1, conf+4); // name col |
4779 | 0 | u32_to_le(2, conf+8); // beg col |
4780 | 0 | u32_to_le(0, conf+12); // end col |
4781 | 0 | u32_to_le('#', conf+16); // comment |
4782 | 0 | u32_to_le(0, conf+20); // n.skip |
4783 | 0 | u32_to_le(0, conf+24); // ref name len |
4784 | 0 | if (hts_idx_set_meta(fp->idx, sizeof(conf)*sizeof(*conf), (uint8_t *)conf, 1) < 0) { |
4785 | 0 | hts_idx_destroy(fp->idx); |
4786 | 0 | fp->idx = NULL; |
4787 | 0 | return -1; |
4788 | 0 | } |
4789 | 0 | fp->fnidx = fnidx; |
4790 | |
|
4791 | 0 | return 0; |
4792 | 0 | } |
4793 | | |
4794 | | // Initialise fp->idx for the current format type. |
4795 | | // This must be called after the header has been written but no other data. |
4796 | 0 | int bcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) { |
4797 | 0 | int n_lvls, nids = 0; |
4798 | |
|
4799 | 0 | if (fp->format.compression != bgzf) { |
4800 | 0 | hts_log_error("Indexing is only supported on BGZF-compressed files"); |
4801 | 0 | return -3; // Matches no-compression return for bcf_index_build3() |
4802 | 0 | } |
4803 | | |
4804 | 0 | if (fp->format.format == vcf) |
4805 | 0 | return vcf_idx_init(fp, h, min_shift, fnidx); |
4806 | | |
4807 | 0 | if (!min_shift) |
4808 | 0 | min_shift = 14; |
4809 | |
|
4810 | 0 | n_lvls = idx_calc_n_lvls_ids(h, min_shift, 0, &nids); |
4811 | |
|
4812 | 0 | fp->idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls); |
4813 | 0 | if (!fp->idx) return -1; |
4814 | 0 | fp->fnidx = fnidx; |
4815 | |
|
4816 | 0 | return 0; |
4817 | 0 | } |
4818 | | |
4819 | | // Finishes an index. Call after the last record has been written. |
4820 | | // Returns 0 on success, <0 on failure. |
4821 | | // |
4822 | | // NB: same format as SAM/BAM as it uses bgzf. |
4823 | 0 | int bcf_idx_save(htsFile *fp) { |
4824 | 0 | return sam_idx_save(fp); |
4825 | 0 | } |
4826 | | |
4827 | | /***************** |
4828 | | *** Utilities *** |
4829 | | *****************/ |
4830 | | |
4831 | | int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src) |
4832 | 0 | { |
4833 | 0 | int i, ndst_ori = dst->nhrec, need_sync = 0, ret = 0, res; |
4834 | 0 | for (i=0; i<src->nhrec; i++) |
4835 | 0 | { |
4836 | 0 | if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value ) |
4837 | 0 | { |
4838 | 0 | int j; |
4839 | 0 | for (j=0; j<ndst_ori; j++) |
4840 | 0 | { |
4841 | 0 | if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue; |
4842 | | |
4843 | | // Checking only the key part of generic lines, otherwise |
4844 | | // the VCFs are too verbose. Should we perhaps add a flag |
4845 | | // to bcf_hdr_combine() and make this optional? |
4846 | 0 | if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break; |
4847 | 0 | } |
4848 | 0 | if ( j>=ndst_ori ) { |
4849 | 0 | res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i])); |
4850 | 0 | if (res < 0) return -1; |
4851 | 0 | need_sync += res; |
4852 | 0 | } |
4853 | 0 | } |
4854 | 0 | else if ( src->hrec[i]->type==BCF_HL_STR ) |
4855 | 0 | { |
4856 | | // NB: we are ignoring fields without ID |
4857 | 0 | int j = bcf_hrec_find_key(src->hrec[i],"ID"); |
4858 | 0 | if ( j>=0 ) |
4859 | 0 | { |
4860 | 0 | bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key); |
4861 | 0 | if ( !rec ) { |
4862 | 0 | res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i])); |
4863 | 0 | if (res < 0) return -1; |
4864 | 0 | need_sync += res; |
4865 | 0 | } |
4866 | 0 | } |
4867 | 0 | } |
4868 | 0 | else |
4869 | 0 | { |
4870 | 0 | int j = bcf_hrec_find_key(src->hrec[i],"ID"); |
4871 | 0 | assert( j>=0 ); // this should always be true for valid VCFs |
4872 | |
|
4873 | 0 | bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL); |
4874 | 0 | if ( !rec ) { |
4875 | 0 | res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i])); |
4876 | 0 | if (res < 0) return -1; |
4877 | 0 | need_sync += res; |
4878 | 0 | } else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT ) |
4879 | 0 | { |
4880 | | // Check that both records are of the same type. The bcf_hdr_id2length |
4881 | | // macro cannot be used here because dst header is not synced yet. |
4882 | 0 | vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID]; |
4883 | 0 | vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID]; |
4884 | 0 | khint_t k_src = kh_get(vdict, d_src, src->hrec[i]->vals[0]); |
4885 | 0 | khint_t k_dst = kh_get(vdict, d_dst, src->hrec[i]->vals[0]); |
4886 | 0 | if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) ) |
4887 | 0 | { |
4888 | 0 | hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths", |
4889 | 0 | src->hrec[i]->vals[0]); |
4890 | 0 | ret |= 1; |
4891 | 0 | } |
4892 | 0 | if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) ) |
4893 | 0 | { |
4894 | 0 | hts_log_warning("Trying to combine \"%s\" tag definitions of different types", |
4895 | 0 | src->hrec[i]->vals[0]); |
4896 | 0 | ret |= 1; |
4897 | 0 | } |
4898 | 0 | } |
4899 | 0 | } |
4900 | 0 | } |
4901 | 0 | if ( need_sync ) { |
4902 | 0 | if (bcf_hdr_sync(dst) < 0) return -1; |
4903 | 0 | } |
4904 | 0 | return ret; |
4905 | 0 | } |
4906 | | |
4907 | | bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src) |
4908 | 0 | { |
4909 | 0 | if ( !dst ) |
4910 | 0 | { |
4911 | | // this will effectively strip existing IDX attributes from src to become dst |
4912 | 0 | dst = bcf_hdr_init("r"); |
4913 | 0 | kstring_t htxt = {0,0,0}; |
4914 | 0 | if (bcf_hdr_format(src, 0, &htxt) < 0) { |
4915 | 0 | free(htxt.s); |
4916 | 0 | return NULL; |
4917 | 0 | } |
4918 | 0 | if ( bcf_hdr_parse(dst, htxt.s) < 0 ) { |
4919 | 0 | bcf_hdr_destroy(dst); |
4920 | 0 | dst = NULL; |
4921 | 0 | } |
4922 | 0 | free(htxt.s); |
4923 | 0 | return dst; |
4924 | 0 | } |
4925 | | |
4926 | 0 | int i, ndst_ori = dst->nhrec, need_sync = 0, res; |
4927 | 0 | for (i=0; i<src->nhrec; i++) |
4928 | 0 | { |
4929 | 0 | if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value ) |
4930 | 0 | { |
4931 | 0 | int j; |
4932 | 0 | for (j=0; j<ndst_ori; j++) |
4933 | 0 | { |
4934 | 0 | if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue; |
4935 | | |
4936 | | // Checking only the key part of generic lines, otherwise |
4937 | | // the VCFs are too verbose. Should we perhaps add a flag |
4938 | | // to bcf_hdr_combine() and make this optional? |
4939 | 0 | if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break; |
4940 | 0 | } |
4941 | 0 | if ( j>=ndst_ori ) { |
4942 | 0 | res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i])); |
4943 | 0 | if (res < 0) return NULL; |
4944 | 0 | need_sync += res; |
4945 | 0 | } |
4946 | 0 | else if ( !strcmp(src->hrec[i]->key,"fileformat") ) |
4947 | 0 | { |
4948 | 0 | int ver_src = bcf_get_version(src,src->hrec[i]->value); |
4949 | 0 | int ver_dst = bcf_get_version(dst,dst->hrec[j]->value); |
4950 | 0 | if ( ver_src > ver_dst ) |
4951 | 0 | { |
4952 | 0 | if (bcf_hdr_set_version(dst,src->hrec[i]->value) < 0) |
4953 | 0 | return NULL; |
4954 | 0 | need_sync = 1; |
4955 | 0 | } |
4956 | 0 | } |
4957 | 0 | } |
4958 | 0 | else if ( src->hrec[i]->type==BCF_HL_STR ) |
4959 | 0 | { |
4960 | | // NB: we are ignoring fields without ID |
4961 | 0 | int j = bcf_hrec_find_key(src->hrec[i],"ID"); |
4962 | 0 | if ( j>=0 ) |
4963 | 0 | { |
4964 | 0 | bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key); |
4965 | 0 | if ( !rec ) { |
4966 | 0 | res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i])); |
4967 | 0 | if (res < 0) return NULL; |
4968 | 0 | need_sync += res; |
4969 | 0 | } |
4970 | 0 | } |
4971 | 0 | } |
4972 | 0 | else |
4973 | 0 | { |
4974 | 0 | int j = bcf_hrec_find_key(src->hrec[i],"ID"); |
4975 | 0 | assert( j>=0 ); // this should always be true for valid VCFs |
4976 | |
|
4977 | 0 | bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL); |
4978 | 0 | if ( !rec ) { |
4979 | 0 | res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i])); |
4980 | 0 | if (res < 0) return NULL; |
4981 | 0 | need_sync += res; |
4982 | 0 | } else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT ) |
4983 | 0 | { |
4984 | | // Check that both records are of the same type. The bcf_hdr_id2length |
4985 | | // macro cannot be used here because dst header is not synced yet. |
4986 | 0 | vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID]; |
4987 | 0 | vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID]; |
4988 | 0 | khint_t k_src = kh_get(vdict, d_src, src->hrec[i]->vals[0]); |
4989 | 0 | khint_t k_dst = kh_get(vdict, d_dst, src->hrec[i]->vals[0]); |
4990 | 0 | if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) ) |
4991 | 0 | { |
4992 | 0 | hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths", |
4993 | 0 | src->hrec[i]->vals[0]); |
4994 | 0 | } |
4995 | 0 | if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) ) |
4996 | 0 | { |
4997 | 0 | hts_log_warning("Trying to combine \"%s\" tag definitions of different types", |
4998 | 0 | src->hrec[i]->vals[0]); |
4999 | 0 | } |
5000 | 0 | } |
5001 | 0 | } |
5002 | 0 | } |
5003 | 0 | if ( need_sync ) { |
5004 | 0 | if (bcf_hdr_sync(dst) < 0) return NULL; |
5005 | 0 | } |
5006 | 0 | return dst; |
5007 | 0 | } |
5008 | | |
5009 | | int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *line) |
5010 | 0 | { |
5011 | 0 | int i; |
5012 | 0 | if ( line->errcode ) |
5013 | 0 | { |
5014 | 0 | char errordescription[1024] = ""; |
5015 | 0 | hts_log_error("Unchecked error (%d %s) at %s:%"PRIhts_pos", exiting", line->errcode, bcf_strerror(line->errcode, errordescription, sizeof(errordescription)), bcf_seqname_safe(src_hdr,line), line->pos+1); |
5016 | 0 | exit(1); |
5017 | 0 | } |
5018 | 0 | if ( src_hdr->ntransl==-1 ) return 0; // no need to translate, all tags have the same id |
5019 | 0 | if ( !src_hdr->ntransl ) // called for the first time, see what needs translating |
5020 | 0 | { |
5021 | 0 | int dict; |
5022 | 0 | for (dict=0; dict<2; dict++) // BCF_DT_ID and BCF_DT_CTG |
5023 | 0 | { |
5024 | 0 | src_hdr->transl[dict] = (int*) malloc(src_hdr->n[dict]*sizeof(int)); |
5025 | 0 | for (i=0; i<src_hdr->n[dict]; i++) |
5026 | 0 | { |
5027 | 0 | if ( !src_hdr->id[dict][i].key ) // gap left after removed BCF header lines |
5028 | 0 | { |
5029 | 0 | src_hdr->transl[dict][i] = -1; |
5030 | 0 | continue; |
5031 | 0 | } |
5032 | 0 | src_hdr->transl[dict][i] = bcf_hdr_id2int(dst_hdr,dict,src_hdr->id[dict][i].key); |
5033 | 0 | if ( src_hdr->transl[dict][i]!=-1 && i!=src_hdr->transl[dict][i] ) src_hdr->ntransl++; |
5034 | 0 | } |
5035 | 0 | } |
5036 | 0 | if ( !src_hdr->ntransl ) |
5037 | 0 | { |
5038 | 0 | free(src_hdr->transl[0]); src_hdr->transl[0] = NULL; |
5039 | 0 | free(src_hdr->transl[1]); src_hdr->transl[1] = NULL; |
5040 | 0 | src_hdr->ntransl = -1; |
5041 | 0 | } |
5042 | 0 | if ( src_hdr->ntransl==-1 ) return 0; |
5043 | 0 | } |
5044 | 0 | bcf_unpack(line,BCF_UN_ALL); |
5045 | | |
5046 | | // CHROM |
5047 | 0 | if ( src_hdr->transl[BCF_DT_CTG][line->rid] >=0 ) line->rid = src_hdr->transl[BCF_DT_CTG][line->rid]; |
5048 | | |
5049 | | // FILTER |
5050 | 0 | for (i=0; i<line->d.n_flt; i++) |
5051 | 0 | { |
5052 | 0 | int src_id = line->d.flt[i]; |
5053 | 0 | if ( src_hdr->transl[BCF_DT_ID][src_id] >=0 ) |
5054 | 0 | line->d.flt[i] = src_hdr->transl[BCF_DT_ID][src_id]; |
5055 | 0 | line->d.shared_dirty |= BCF1_DIRTY_FLT; |
5056 | 0 | } |
5057 | | |
5058 | | // INFO |
5059 | 0 | for (i=0; i<line->n_info; i++) |
5060 | 0 | { |
5061 | 0 | int src_id = line->d.info[i].key; |
5062 | 0 | int dst_id = src_hdr->transl[BCF_DT_ID][src_id]; |
5063 | 0 | if ( dst_id<0 ) continue; |
5064 | 0 | line->d.info[i].key = dst_id; |
5065 | 0 | if ( !line->d.info[i].vptr ) continue; // skip deleted |
5066 | 0 | int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8; |
5067 | 0 | int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8; |
5068 | 0 | if ( src_size==dst_size ) // can overwrite |
5069 | 0 | { |
5070 | 0 | uint8_t *vptr = line->d.info[i].vptr - line->d.info[i].vptr_off; |
5071 | 0 | if ( dst_size==BCF_BT_INT8 ) { vptr[1] = (uint8_t)dst_id; } |
5072 | 0 | else if ( dst_size==BCF_BT_INT16 ) { *(uint16_t*)vptr = (uint16_t)dst_id; } |
5073 | 0 | else { *(uint32_t*)vptr = (uint32_t)dst_id; } |
5074 | 0 | } |
5075 | 0 | else // must realloc |
5076 | 0 | { |
5077 | 0 | bcf_info_t *info = &line->d.info[i]; |
5078 | 0 | kstring_t str = {0,0,0}; |
5079 | 0 | bcf_enc_int1(&str, dst_id); |
5080 | 0 | bcf_enc_size(&str, info->len,info->type); |
5081 | 0 | uint32_t vptr_off = str.l; |
5082 | 0 | kputsn((char*)info->vptr, info->vptr_len, &str); |
5083 | 0 | if( info->vptr_free ) free(info->vptr - info->vptr_off); |
5084 | 0 | info->vptr_off = vptr_off; |
5085 | 0 | info->vptr = (uint8_t*)str.s + info->vptr_off; |
5086 | 0 | info->vptr_free = 1; |
5087 | 0 | line->d.shared_dirty |= BCF1_DIRTY_INF; |
5088 | 0 | } |
5089 | 0 | } |
5090 | | |
5091 | | // FORMAT |
5092 | 0 | for (i=0; i<line->n_fmt; i++) |
5093 | 0 | { |
5094 | 0 | int src_id = line->d.fmt[i].id; |
5095 | 0 | int dst_id = src_hdr->transl[BCF_DT_ID][src_id]; |
5096 | 0 | if ( dst_id<0 ) continue; |
5097 | 0 | line->d.fmt[i].id = dst_id; |
5098 | 0 | if( !line->d.fmt[i].p ) continue; // skip deleted |
5099 | 0 | int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8; |
5100 | 0 | int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8; |
5101 | 0 | if ( src_size==dst_size ) // can overwrite |
5102 | 0 | { |
5103 | 0 | uint8_t *p = line->d.fmt[i].p - line->d.fmt[i].p_off; // pointer to the vector size (4bits) and BT type (4bits) |
5104 | 0 | if ( dst_size==BCF_BT_INT8 ) { p[1] = dst_id; } |
5105 | 0 | else if ( dst_size==BCF_BT_INT16 ) { i16_to_le(dst_id, p + 1); } |
5106 | 0 | else { i32_to_le(dst_id, p + 1); } |
5107 | 0 | } |
5108 | 0 | else // must realloc |
5109 | 0 | { |
5110 | 0 | bcf_fmt_t *fmt = &line->d.fmt[i]; |
5111 | 0 | kstring_t str = {0,0,0}; |
5112 | 0 | bcf_enc_int1(&str, dst_id); |
5113 | 0 | bcf_enc_size(&str, fmt->n, fmt->type); |
5114 | 0 | uint32_t p_off = str.l; |
5115 | 0 | kputsn((char*)fmt->p, fmt->p_len, &str); |
5116 | 0 | if( fmt->p_free ) free(fmt->p - fmt->p_off); |
5117 | 0 | fmt->p_off = p_off; |
5118 | 0 | fmt->p = (uint8_t*)str.s + fmt->p_off; |
5119 | 0 | fmt->p_free = 1; |
5120 | 0 | line->d.indiv_dirty = 1; |
5121 | 0 | } |
5122 | 0 | } |
5123 | 0 | return 0; |
5124 | 0 | } |
5125 | | |
5126 | | bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr) |
5127 | 0 | { |
5128 | 0 | bcf_hdr_t *hout = bcf_hdr_init("r"); |
5129 | 0 | if (!hout) { |
5130 | 0 | hts_log_error("Failed to allocate bcf header"); |
5131 | 0 | return NULL; |
5132 | 0 | } |
5133 | 0 | kstring_t htxt = {0,0,0}; |
5134 | 0 | if (bcf_hdr_format(hdr, 1, &htxt) < 0) { |
5135 | 0 | free(htxt.s); |
5136 | 0 | return NULL; |
5137 | 0 | } |
5138 | 0 | if ( bcf_hdr_parse(hout, htxt.s) < 0 ) { |
5139 | 0 | bcf_hdr_destroy(hout); |
5140 | 0 | hout = NULL; |
5141 | 0 | } |
5142 | 0 | free(htxt.s); |
5143 | 0 | return hout; |
5144 | 0 | } |
5145 | | |
5146 | | bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap) |
5147 | 0 | { |
5148 | 0 | void *names_hash = khash_str2int_init(); |
5149 | 0 | kstring_t htxt = {0,0,0}; |
5150 | 0 | kstring_t str = {0,0,0}; |
5151 | 0 | bcf_hdr_t *h = bcf_hdr_init("w"); |
5152 | 0 | int r = 0; |
5153 | 0 | if (!h || !names_hash) { |
5154 | 0 | hts_log_error("Failed to allocate bcf header"); |
5155 | 0 | goto err; |
5156 | 0 | } |
5157 | 0 | if (bcf_hdr_format(h0, 1, &htxt) < 0) { |
5158 | 0 | hts_log_error("Failed to get header text"); |
5159 | 0 | goto err; |
5160 | 0 | } |
5161 | 0 | bcf_hdr_set_version(h,bcf_hdr_get_version(h0)); |
5162 | 0 | int j; |
5163 | 0 | for (j=0; j<n; j++) imap[j] = -1; |
5164 | 0 | if ( bcf_hdr_nsamples(h0) > 0) { |
5165 | 0 | char *p = find_chrom_header_line(htxt.s); |
5166 | 0 | int i = 0, end = n? 8 : 7; |
5167 | 0 | while ((p = strchr(p, '\t')) != 0 && i < end) ++i, ++p; |
5168 | 0 | if (i != end) { |
5169 | 0 | hts_log_error("Wrong number of columns in header #CHROM line"); |
5170 | 0 | goto err; |
5171 | 0 | } |
5172 | 0 | r |= kputsn(htxt.s, p - htxt.s, &str) < 0; |
5173 | 0 | for (i = 0; i < n; ++i) { |
5174 | 0 | if ( khash_str2int_has_key(names_hash,samples[i]) ) |
5175 | 0 | { |
5176 | 0 | hts_log_error("Duplicate sample name \"%s\"", samples[i]); |
5177 | 0 | goto err; |
5178 | 0 | } |
5179 | 0 | imap[i] = bcf_hdr_id2int(h0, BCF_DT_SAMPLE, samples[i]); |
5180 | 0 | if (imap[i] < 0) continue; |
5181 | 0 | r |= kputc('\t', &str) < 0; |
5182 | 0 | r |= kputs(samples[i], &str) < 0; |
5183 | 0 | r |= khash_str2int_inc(names_hash,samples[i]) < 0; |
5184 | 0 | } |
5185 | 0 | } else r |= kputsn(htxt.s, htxt.l, &str) < 0; |
5186 | 0 | while (str.l && (!str.s[str.l-1] || str.s[str.l-1]=='\n') ) str.l--; // kill trailing zeros and newlines |
5187 | 0 | r |= kputc('\n',&str) < 0; |
5188 | 0 | if (r) { |
5189 | 0 | hts_log_error("%s", strerror(errno)); |
5190 | 0 | goto err; |
5191 | 0 | } |
5192 | 0 | if ( bcf_hdr_parse(h, str.s) < 0 ) { |
5193 | 0 | bcf_hdr_destroy(h); |
5194 | 0 | h = NULL; |
5195 | 0 | } |
5196 | 0 | free(str.s); |
5197 | 0 | free(htxt.s); |
5198 | 0 | khash_str2int_destroy(names_hash); |
5199 | 0 | return h; |
5200 | | |
5201 | 0 | err: |
5202 | 0 | ks_free(&str); |
5203 | 0 | ks_free(&htxt); |
5204 | 0 | khash_str2int_destroy(names_hash); |
5205 | 0 | bcf_hdr_destroy(h); |
5206 | 0 | return NULL; |
5207 | 0 | } |
5208 | | |
5209 | | int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file) |
5210 | 0 | { |
5211 | 0 | if ( samples && !strcmp("-",samples) ) return 0; // keep all samples |
5212 | | |
5213 | 0 | int i, narr = bit_array_size(bcf_hdr_nsamples(hdr)); |
5214 | 0 | hdr->keep_samples = (uint8_t*) calloc(narr,1); |
5215 | 0 | if (!hdr->keep_samples) return -1; |
5216 | | |
5217 | 0 | hdr->nsamples_ori = bcf_hdr_nsamples(hdr); |
5218 | 0 | if ( !samples ) |
5219 | 0 | { |
5220 | | // exclude all samples |
5221 | 0 | khint_t k; |
5222 | 0 | vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE], *new_dict; |
5223 | 0 | new_dict = kh_init(vdict); |
5224 | 0 | if (!new_dict) return -1; |
5225 | | |
5226 | 0 | bcf_hdr_nsamples(hdr) = 0; |
5227 | |
|
5228 | 0 | for (k = kh_begin(d); k != kh_end(d); ++k) |
5229 | 0 | if (kh_exist(d, k)) free((char*)kh_key(d, k)); |
5230 | 0 | kh_destroy(vdict, d); |
5231 | 0 | hdr->dict[BCF_DT_SAMPLE] = new_dict; |
5232 | 0 | if (bcf_hdr_sync(hdr) < 0) return -1; |
5233 | | |
5234 | 0 | return 0; |
5235 | 0 | } |
5236 | | |
5237 | 0 | if ( samples[0]=='^' ) |
5238 | 0 | for (i=0; i<bcf_hdr_nsamples(hdr); i++) bit_array_set(hdr->keep_samples,i); |
5239 | |
|
5240 | 0 | int idx, n, ret = 0; |
5241 | 0 | char **smpls = hts_readlist(samples[0]=='^'?samples+1:samples, is_file, &n); |
5242 | 0 | if ( !smpls ) return -1; |
5243 | 0 | for (i=0; i<n; i++) |
5244 | 0 | { |
5245 | 0 | idx = bcf_hdr_id2int(hdr,BCF_DT_SAMPLE,smpls[i]); |
5246 | 0 | if ( idx<0 ) |
5247 | 0 | { |
5248 | 0 | if ( !ret ) ret = i+1; |
5249 | 0 | continue; |
5250 | 0 | } |
5251 | 0 | assert( idx<bcf_hdr_nsamples(hdr) ); |
5252 | 0 | if ( samples[0]=='^' ) |
5253 | 0 | bit_array_clear(hdr->keep_samples, idx); |
5254 | 0 | else |
5255 | 0 | bit_array_set(hdr->keep_samples, idx); |
5256 | 0 | } |
5257 | 0 | for (i=0; i<n; i++) free(smpls[i]); |
5258 | 0 | free(smpls); |
5259 | |
|
5260 | 0 | bcf_hdr_nsamples(hdr) = 0; |
5261 | 0 | for (i=0; i<hdr->nsamples_ori; i++) |
5262 | 0 | if ( bit_array_test(hdr->keep_samples,i) ) bcf_hdr_nsamples(hdr)++; |
5263 | |
|
5264 | 0 | if ( !bcf_hdr_nsamples(hdr) ) { free(hdr->keep_samples); hdr->keep_samples=NULL; } |
5265 | 0 | else |
5266 | 0 | { |
5267 | | // Make new list and dictionary with desired samples |
5268 | 0 | char **samples = (char**) malloc(sizeof(char*)*bcf_hdr_nsamples(hdr)); |
5269 | 0 | vdict_t *new_dict, *d; |
5270 | 0 | int k, res; |
5271 | 0 | if (!samples) return -1; |
5272 | | |
5273 | 0 | new_dict = kh_init(vdict); |
5274 | 0 | if (!new_dict) { |
5275 | 0 | free(samples); |
5276 | 0 | return -1; |
5277 | 0 | } |
5278 | 0 | idx = 0; |
5279 | 0 | for (i=0; i<hdr->nsamples_ori; i++) { |
5280 | 0 | if ( bit_array_test(hdr->keep_samples,i) ) { |
5281 | 0 | samples[idx] = hdr->samples[i]; |
5282 | 0 | k = kh_put(vdict, new_dict, hdr->samples[i], &res); |
5283 | 0 | if (res < 0) { |
5284 | 0 | free(samples); |
5285 | 0 | kh_destroy(vdict, new_dict); |
5286 | 0 | return -1; |
5287 | 0 | } |
5288 | 0 | kh_val(new_dict, k) = bcf_idinfo_def; |
5289 | 0 | kh_val(new_dict, k).id = idx; |
5290 | 0 | idx++; |
5291 | 0 | } |
5292 | 0 | } |
5293 | | |
5294 | | // Delete desired samples from old dictionary, so we don't free them |
5295 | 0 | d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE]; |
5296 | 0 | for (i=0; i < idx; i++) { |
5297 | 0 | int k = kh_get(vdict, d, samples[i]); |
5298 | 0 | if (k < kh_end(d)) kh_del(vdict, d, k); |
5299 | 0 | } |
5300 | | |
5301 | | // Free everything else |
5302 | 0 | for (k = kh_begin(d); k != kh_end(d); ++k) |
5303 | 0 | if (kh_exist(d, k)) free((char*)kh_key(d, k)); |
5304 | 0 | kh_destroy(vdict, d); |
5305 | 0 | hdr->dict[BCF_DT_SAMPLE] = new_dict; |
5306 | |
|
5307 | 0 | free(hdr->samples); |
5308 | 0 | hdr->samples = samples; |
5309 | |
|
5310 | 0 | if (bcf_hdr_sync(hdr) < 0) |
5311 | 0 | return -1; |
5312 | 0 | } |
5313 | | |
5314 | 0 | return ret; |
5315 | 0 | } |
5316 | | |
5317 | | int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap) |
5318 | 0 | { |
5319 | 0 | kstring_t ind; |
5320 | 0 | ind.s = 0; ind.l = ind.m = 0; |
5321 | 0 | if (n) { |
5322 | 0 | bcf_fmt_t fmt[MAX_N_FMT]; |
5323 | 0 | int i, j; |
5324 | 0 | uint8_t *ptr = (uint8_t*)v->indiv.s; |
5325 | 0 | for (i = 0; i < v->n_fmt; ++i) |
5326 | 0 | ptr = bcf_unpack_fmt_core1(ptr, v->n_sample, &fmt[i]); |
5327 | 0 | for (i = 0; i < (int)v->n_fmt; ++i) { |
5328 | 0 | bcf_fmt_t *f = &fmt[i]; |
5329 | 0 | bcf_enc_int1(&ind, f->id); |
5330 | 0 | bcf_enc_size(&ind, f->n, f->type); |
5331 | 0 | for (j = 0; j < n; ++j) |
5332 | 0 | if (imap[j] >= 0) kputsn((char*)(f->p + imap[j] * f->size), f->size, &ind); |
5333 | 0 | } |
5334 | 0 | for (i = j = 0; j < n; ++j) if (imap[j] >= 0) ++i; |
5335 | 0 | v->n_sample = i; |
5336 | 0 | } else v->n_sample = 0; |
5337 | 0 | if ( !v->n_sample ) v->n_fmt = 0; |
5338 | 0 | free(v->indiv.s); |
5339 | 0 | v->indiv = ind; |
5340 | 0 | v->unpacked &= ~BCF_UN_FMT; // only BCF is ready for output, VCF will need to unpack again |
5341 | 0 | return 0; |
5342 | 0 | } |
5343 | | |
5344 | | int bcf_is_snp(bcf1_t *v) |
5345 | 0 | { |
5346 | 0 | int i; |
5347 | 0 | bcf_unpack(v, BCF_UN_STR); |
5348 | 0 | for (i = 0; i < v->n_allele; ++i) |
5349 | 0 | { |
5350 | 0 | if ( v->d.allele[i][1]==0 && v->d.allele[i][0]!='*' ) continue; |
5351 | | |
5352 | | // mpileup's <X> allele, see also below. This is not completely satisfactory, |
5353 | | // a general library is here narrowly tailored to fit samtools. |
5354 | 0 | if ( v->d.allele[i][0]=='<' && v->d.allele[i][1]=='X' && v->d.allele[i][2]=='>' ) continue; |
5355 | 0 | if ( v->d.allele[i][0]=='<' && v->d.allele[i][1]=='*' && v->d.allele[i][2]=='>' ) continue; |
5356 | | |
5357 | 0 | break; |
5358 | 0 | } |
5359 | 0 | return i == v->n_allele; |
5360 | 0 | } |
5361 | | |
5362 | | static void bcf_set_variant_type(const char *ref, const char *alt, bcf_variant_t *var) |
5363 | 0 | { |
5364 | 0 | if ( *alt == '*' && !alt[1] ) { var->n = 0; var->type = VCF_OVERLAP; return; } // overlapping variant |
5365 | | |
5366 | | // The most frequent case |
5367 | 0 | if ( !ref[1] && !alt[1] ) |
5368 | 0 | { |
5369 | 0 | if ( *alt == '.' || *ref==*alt ) { var->n = 0; var->type = VCF_REF; return; } |
5370 | 0 | if ( *alt == 'X' ) { var->n = 0; var->type = VCF_REF; return; } // mpileup's X allele shouldn't be treated as variant |
5371 | 0 | var->n = 1; var->type = VCF_SNP; return; |
5372 | 0 | } |
5373 | 0 | if ( alt[0]=='<' ) |
5374 | 0 | { |
5375 | 0 | if ( alt[1]=='X' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; } // mpileup's X allele shouldn't be treated as variant |
5376 | 0 | if ( alt[1]=='*' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; } |
5377 | 0 | if ( !strcmp("NON_REF>",alt+1) ) { var->n = 0; var->type = VCF_REF; return; } |
5378 | 0 | var->type = VCF_OTHER; |
5379 | 0 | return; |
5380 | 0 | } |
5381 | | |
5382 | | // Catch "joined before" breakend case |
5383 | 0 | if ( alt[0]==']' || alt[0] == '[' ) |
5384 | 0 | { |
5385 | 0 | var->type = VCF_BND; return; |
5386 | 0 | } |
5387 | | |
5388 | | // Iterate through alt characters that match the reference |
5389 | 0 | const char *r = ref, *a = alt; |
5390 | 0 | while (*r && *a && toupper_c(*r)==toupper_c(*a) ) { r++; a++; } // unfortunately, matching REF,ALT case is not guaranteed |
5391 | |
|
5392 | 0 | if ( *a && !*r ) |
5393 | 0 | { |
5394 | 0 | while ( *a ) a++; |
5395 | 0 | if ( *(a-1)==']' || *(a-1)=='[' ) { var->type = VCF_BND; return; } // "joined after" breakend |
5396 | 0 | var->n = (a-alt)-(r-ref); var->type = VCF_INDEL | VCF_INS; return; |
5397 | 0 | } |
5398 | 0 | else if ( *r && !*a ) |
5399 | 0 | { |
5400 | 0 | while ( *r ) r++; |
5401 | 0 | var->n = (a-alt)-(r-ref); var->type = VCF_INDEL | VCF_DEL; return; |
5402 | 0 | } |
5403 | 0 | else if ( !*r && !*a ) |
5404 | 0 | { |
5405 | 0 | var->n = 0; var->type = VCF_REF; return; |
5406 | 0 | } |
5407 | | |
5408 | 0 | const char *re = r, *ae = a; |
5409 | 0 | while ( re[1] ) re++; |
5410 | 0 | while ( ae[1] ) ae++; |
5411 | 0 | if ( ae[0]==']' || ae[0]=='[' ) { var->type = VCF_BND; return; } // "joined after" breakend |
5412 | 0 | while ( re>r && ae>a && toupper_c(*re)==toupper_c(*ae) ) { re--; ae--; } |
5413 | 0 | if ( ae==a ) |
5414 | 0 | { |
5415 | 0 | if ( re==r ) { var->n = 1; var->type = VCF_SNP; return; } |
5416 | 0 | var->n = -(re-r); |
5417 | 0 | if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL | VCF_DEL; return; } |
5418 | 0 | var->type = VCF_OTHER; return; |
5419 | 0 | } |
5420 | 0 | else if ( re==r ) |
5421 | 0 | { |
5422 | 0 | var->n = ae-a; |
5423 | 0 | if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL | VCF_INS; return; } |
5424 | 0 | var->type = VCF_OTHER; return; |
5425 | 0 | } |
5426 | | |
5427 | 0 | var->type = ( re-r == ae-a ) ? VCF_MNP : VCF_OTHER; |
5428 | 0 | var->n = ( re-r > ae-a ) ? -(re-r+1) : ae-a+1; |
5429 | | |
5430 | | // should do also complex events, SVs, etc... |
5431 | 0 | } |
5432 | | |
5433 | | static int bcf_set_variant_types(bcf1_t *b) |
5434 | 0 | { |
5435 | 0 | if ( !(b->unpacked & BCF_UN_STR) ) bcf_unpack(b, BCF_UN_STR); |
5436 | 0 | bcf_dec_t *d = &b->d; |
5437 | 0 | if ( d->n_var < b->n_allele ) |
5438 | 0 | { |
5439 | 0 | bcf_variant_t *new_var = realloc(d->var, sizeof(bcf_variant_t)*b->n_allele); |
5440 | 0 | if (!new_var) |
5441 | 0 | return -1; |
5442 | 0 | d->var = new_var; |
5443 | 0 | d->n_var = b->n_allele; |
5444 | 0 | } |
5445 | 0 | int i; |
5446 | 0 | b->d.var_type = 0; |
5447 | 0 | d->var[0].type = VCF_REF; |
5448 | 0 | d->var[0].n = 0; |
5449 | 0 | for (i=1; i<b->n_allele; i++) |
5450 | 0 | { |
5451 | 0 | bcf_set_variant_type(d->allele[0],d->allele[i], &d->var[i]); |
5452 | 0 | b->d.var_type |= d->var[i].type; |
5453 | | //fprintf(stderr,"[set_variant_type] %d %s %s -> %d %d .. %d\n", b->pos+1,d->allele[0],d->allele[i],d->var[i].type,d->var[i].n, b->d.var_type); |
5454 | 0 | } |
5455 | 0 | return 0; |
5456 | 0 | } |
5457 | | |
5458 | | // bcf_get_variant_type/bcf_get_variant_types should only return the following, |
5459 | | // to be compatible with callers that are not expecting newer values |
5460 | | // like VCF_INS, VCF_DEL. The full set is available from the newer |
5461 | | // vcf_has_variant_type* interfaces. |
5462 | 0 | #define ORIG_VAR_TYPES (VCF_SNP|VCF_MNP|VCF_INDEL|VCF_OTHER|VCF_BND|VCF_OVERLAP) |
5463 | | int bcf_get_variant_types(bcf1_t *rec) |
5464 | 0 | { |
5465 | 0 | if ( rec->d.var_type==-1 ) { |
5466 | 0 | if (bcf_set_variant_types(rec) != 0) { |
5467 | 0 | hts_log_error("Couldn't get variant types: %s", strerror(errno)); |
5468 | 0 | exit(1); // Due to legacy API having no way to report failures |
5469 | 0 | } |
5470 | 0 | } |
5471 | 0 | return rec->d.var_type & ORIG_VAR_TYPES; |
5472 | 0 | } |
5473 | | |
5474 | | int bcf_get_variant_type(bcf1_t *rec, int ith_allele) |
5475 | 0 | { |
5476 | 0 | if ( rec->d.var_type==-1 ) { |
5477 | 0 | if (bcf_set_variant_types(rec) != 0) { |
5478 | 0 | hts_log_error("Couldn't get variant types: %s", strerror(errno)); |
5479 | 0 | exit(1); // Due to legacy API having no way to report failures |
5480 | 0 | } |
5481 | 0 | } |
5482 | 0 | if (ith_allele < 0 || ith_allele >= rec->n_allele) { |
5483 | 0 | hts_log_error("Requested allele outside valid range"); |
5484 | 0 | exit(1); |
5485 | 0 | } |
5486 | 0 | return rec->d.var[ith_allele].type & ORIG_VAR_TYPES; |
5487 | 0 | } |
5488 | | #undef ORIG_VAR_TYPES |
5489 | | |
5490 | | int bcf_has_variant_type(bcf1_t *rec, int ith_allele, uint32_t bitmask) |
5491 | 0 | { |
5492 | 0 | if ( rec->d.var_type==-1 ) { |
5493 | 0 | if (bcf_set_variant_types(rec) != 0) return -1; |
5494 | 0 | } |
5495 | 0 | if (ith_allele < 0 || ith_allele >= rec->n_allele) return -1; |
5496 | 0 | if (bitmask == VCF_REF) { // VCF_REF is 0, so handled as a special case |
5497 | 0 | return rec->d.var[ith_allele].type == VCF_REF; |
5498 | 0 | } |
5499 | 0 | return bitmask & rec->d.var[ith_allele].type; |
5500 | 0 | } |
5501 | | |
5502 | | int bcf_variant_length(bcf1_t *rec, int ith_allele) |
5503 | 0 | { |
5504 | 0 | if ( rec->d.var_type==-1 ) { |
5505 | 0 | if (bcf_set_variant_types(rec) != 0) return bcf_int32_missing; |
5506 | 0 | } |
5507 | 0 | if (ith_allele < 0 || ith_allele >= rec->n_allele) return bcf_int32_missing; |
5508 | 0 | return rec->d.var[ith_allele].n; |
5509 | 0 | } |
5510 | | |
5511 | | int bcf_has_variant_types(bcf1_t *rec, uint32_t bitmask, |
5512 | | enum bcf_variant_match mode) |
5513 | 0 | { |
5514 | 0 | if ( rec->d.var_type==-1 ) { |
5515 | 0 | if (bcf_set_variant_types(rec) != 0) return -1; |
5516 | 0 | } |
5517 | 0 | uint32_t type = rec->d.var_type; |
5518 | 0 | if ( mode==bcf_match_overlap ) return bitmask & type; |
5519 | | |
5520 | | // VCF_INDEL is always set with VCF_INS and VCF_DEL by bcf_set_variant_type[s], but the bitmask may |
5521 | | // ask for say `VCF_INS` or `VCF_INDEL` only |
5522 | 0 | if ( bitmask&(VCF_INS|VCF_DEL) && !(bitmask&VCF_INDEL) ) type &= ~VCF_INDEL; |
5523 | 0 | else if ( bitmask&VCF_INDEL && !(bitmask&(VCF_INS|VCF_DEL)) ) type &= ~(VCF_INS|VCF_DEL); |
5524 | |
|
5525 | 0 | if ( mode==bcf_match_subset ) |
5526 | 0 | { |
5527 | 0 | if ( ~bitmask & type ) return 0; |
5528 | 0 | else return bitmask & type; |
5529 | 0 | } |
5530 | | // mode == bcf_match_exact |
5531 | 0 | if ( bitmask==VCF_REF ) return type==bitmask ? 1 : 0; |
5532 | 0 | return type==bitmask ? type : 0; |
5533 | 0 | } |
5534 | | |
5535 | | int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type) |
5536 | 0 | { |
5537 | 0 | static int negative_rlen_warned = 0; |
5538 | 0 | int is_end_tag, is_svlen_tag = 0; |
5539 | | |
5540 | | // Is the field already present? |
5541 | 0 | int i, inf_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key); |
5542 | 0 | if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,inf_id) ) return -1; // No such INFO field in the header |
5543 | 0 | if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO); |
5544 | |
|
5545 | 0 | is_end_tag = strcmp(key, "END") == 0; |
5546 | 0 | is_svlen_tag = strcmp(key, "SVLEN") == 0; |
5547 | |
|
5548 | 0 | for (i=0; i<line->n_info; i++) |
5549 | 0 | if ( inf_id==line->d.info[i].key ) break; |
5550 | 0 | bcf_info_t *inf = i==line->n_info ? NULL : &line->d.info[i]; |
5551 | |
|
5552 | 0 | if ( !n || (type==BCF_HT_STR && !values) ) |
5553 | 0 | { |
5554 | 0 | if ( inf ) |
5555 | 0 | { |
5556 | | // Mark the tag for removal, free existing memory if necessary |
5557 | 0 | if ( inf->vptr_free ) |
5558 | 0 | { |
5559 | 0 | free(inf->vptr - inf->vptr_off); |
5560 | 0 | inf->vptr_free = 0; |
5561 | 0 | } |
5562 | 0 | line->d.shared_dirty |= BCF1_DIRTY_INF; |
5563 | 0 | inf->vptr = NULL; |
5564 | 0 | inf->vptr_off = inf->vptr_len = 0; |
5565 | 0 | } |
5566 | 0 | if ( n==0 && (is_end_tag || is_svlen_tag) ) { |
5567 | 0 | line->rlen = get_rlen(hdr, line); |
5568 | 0 | } |
5569 | 0 | return 0; |
5570 | 0 | } |
5571 | | |
5572 | 0 | if (is_end_tag) |
5573 | 0 | { |
5574 | 0 | if (n != 1) |
5575 | 0 | { |
5576 | 0 | hts_log_error("END info tag should only have one value at %s:%"PRIhts_pos, bcf_seqname_safe(hdr,line), line->pos+1); |
5577 | 0 | line->errcode |= BCF_ERR_TAG_INVALID; |
5578 | 0 | return -1; |
5579 | 0 | } |
5580 | 0 | if (type != BCF_HT_INT && type != BCF_HT_LONG) |
5581 | 0 | { |
5582 | 0 | hts_log_error("Wrong type (%d) for END info tag at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1); |
5583 | 0 | line->errcode |= BCF_ERR_TAG_INVALID; |
5584 | 0 | return -1; |
5585 | 0 | } |
5586 | 0 | } |
5587 | | |
5588 | | // Encode the values and determine the size required to accommodate the values |
5589 | 0 | kstring_t str = {0,0,0}; |
5590 | 0 | bcf_enc_int1(&str, inf_id); |
5591 | 0 | if ( type==BCF_HT_INT ) |
5592 | 0 | bcf_enc_vint(&str, n, (int32_t*)values, -1); |
5593 | 0 | else if ( type==BCF_HT_REAL ) |
5594 | 0 | bcf_enc_vfloat(&str, n, (float*)values); |
5595 | 0 | else if ( type==BCF_HT_FLAG || type==BCF_HT_STR ) |
5596 | 0 | { |
5597 | 0 | if ( values==NULL ) |
5598 | 0 | bcf_enc_size(&str, 0, BCF_BT_NULL); |
5599 | 0 | else |
5600 | 0 | bcf_enc_vchar(&str, strlen((char*)values), (char*)values); |
5601 | 0 | } |
5602 | | #ifdef VCF_ALLOW_INT64 |
5603 | | else if ( type==BCF_HT_LONG ) |
5604 | | { |
5605 | | if (n != 1) { |
5606 | | hts_log_error("Only storing a single BCF_HT_LONG value is supported at %s:%"PRIhts_pos, bcf_seqname_safe(hdr,line), line->pos+1); |
5607 | | abort(); |
5608 | | } |
5609 | | bcf_enc_long1(&str, *(int64_t *) values); |
5610 | | } |
5611 | | #endif |
5612 | 0 | else |
5613 | 0 | { |
5614 | 0 | hts_log_error("The type %d not implemented yet at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1); |
5615 | 0 | abort(); |
5616 | 0 | } |
5617 | | |
5618 | | // Is the INFO tag already present |
5619 | 0 | if ( inf ) |
5620 | 0 | { |
5621 | | // Is it big enough to accommodate new block? |
5622 | 0 | if ( inf->vptr && str.l <= inf->vptr_len + inf->vptr_off ) |
5623 | 0 | { |
5624 | 0 | if ( str.l != inf->vptr_len + inf->vptr_off ) line->d.shared_dirty |= BCF1_DIRTY_INF; |
5625 | 0 | uint8_t *ptr = inf->vptr - inf->vptr_off; |
5626 | 0 | memcpy(ptr, str.s, str.l); |
5627 | 0 | free(str.s); |
5628 | 0 | int vptr_free = inf->vptr_free; |
5629 | 0 | bcf_unpack_info_core1(ptr, inf); |
5630 | 0 | inf->vptr_free = vptr_free; |
5631 | 0 | } |
5632 | 0 | else |
5633 | 0 | { |
5634 | 0 | if ( inf->vptr_free ) |
5635 | 0 | free(inf->vptr - inf->vptr_off); |
5636 | 0 | bcf_unpack_info_core1((uint8_t*)str.s, inf); |
5637 | 0 | inf->vptr_free = 1; |
5638 | 0 | line->d.shared_dirty |= BCF1_DIRTY_INF; |
5639 | 0 | } |
5640 | 0 | } |
5641 | 0 | else |
5642 | 0 | { |
5643 | | // The tag is not present, create new one |
5644 | 0 | line->n_info++; |
5645 | 0 | hts_expand0(bcf_info_t, line->n_info, line->d.m_info , line->d.info); |
5646 | 0 | inf = &line->d.info[line->n_info-1]; |
5647 | 0 | bcf_unpack_info_core1((uint8_t*)str.s, inf); |
5648 | 0 | inf->vptr_free = 1; |
5649 | 0 | line->d.shared_dirty |= BCF1_DIRTY_INF; |
5650 | 0 | } |
5651 | 0 | line->unpacked |= BCF_UN_INFO; |
5652 | |
|
5653 | 0 | if ( n==1 && is_end_tag) { |
5654 | 0 | hts_pos_t end = type == BCF_HT_INT ? *(int32_t *) values : *(int64_t *) values; |
5655 | 0 | if ( (type == BCF_HT_INT && end!=bcf_int32_missing) || (type == BCF_HT_LONG && end!=bcf_int64_missing) ) |
5656 | 0 | { |
5657 | 0 | if ( end <= line->pos ) |
5658 | 0 | { |
5659 | 0 | if ( !negative_rlen_warned ) |
5660 | 0 | { |
5661 | 0 | hts_log_warning("INFO/END=%"PRIhts_pos" is smaller than POS at %s:%"PRIhts_pos,end,bcf_seqname_safe(hdr,line),line->pos+1); |
5662 | 0 | negative_rlen_warned = 1; |
5663 | 0 | } |
5664 | 0 | } |
5665 | 0 | } |
5666 | 0 | } |
5667 | 0 | if (is_svlen_tag || is_end_tag) { |
5668 | 0 | line->rlen = get_rlen(hdr, line); |
5669 | 0 | } |
5670 | 0 | return 0; |
5671 | 0 | } |
5672 | | |
5673 | | int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n) |
5674 | 0 | { |
5675 | 0 | if ( !n ) |
5676 | 0 | return bcf_update_format(hdr,line,key,NULL,0,BCF_HT_STR); |
5677 | | |
5678 | 0 | int i, max_len = 0; |
5679 | 0 | for (i=0; i<n; i++) |
5680 | 0 | { |
5681 | 0 | int len = strlen(values[i]); |
5682 | 0 | if ( len > max_len ) max_len = len; |
5683 | 0 | } |
5684 | 0 | char *out = (char*) malloc(max_len*n); |
5685 | 0 | if ( !out ) return -2; |
5686 | 0 | for (i=0; i<n; i++) |
5687 | 0 | { |
5688 | 0 | char *dst = out+i*max_len; |
5689 | 0 | const char *src = values[i]; |
5690 | 0 | int j = 0; |
5691 | 0 | while ( src[j] ) { dst[j] = src[j]; j++; } |
5692 | 0 | for (; j<max_len; j++) dst[j] = 0; |
5693 | 0 | } |
5694 | 0 | int ret = bcf_update_format(hdr,line,key,out,max_len*n,BCF_HT_STR); |
5695 | 0 | free(out); |
5696 | 0 | return ret; |
5697 | 0 | } |
5698 | | |
5699 | | int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type) |
5700 | 0 | { |
5701 | | // Is the field already present? |
5702 | 0 | int i, fmt_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key); |
5703 | 0 | int is_len = 0; |
5704 | 0 | if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,fmt_id) ) |
5705 | 0 | { |
5706 | 0 | if ( !n ) return 0; |
5707 | 0 | return -1; // the key not present in the header |
5708 | 0 | } |
5709 | | |
5710 | 0 | if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT); |
5711 | |
|
5712 | 0 | for (i=0; i<line->n_fmt; i++) |
5713 | 0 | if ( line->d.fmt[i].id==fmt_id ) break; |
5714 | 0 | bcf_fmt_t *fmt = i==line->n_fmt ? NULL : &line->d.fmt[i]; |
5715 | |
|
5716 | 0 | is_len = strcmp(key, "LEN") == 0; |
5717 | 0 | if ( !n ) |
5718 | 0 | { |
5719 | 0 | if ( fmt ) |
5720 | 0 | { |
5721 | | // Mark the tag for removal, free existing memory if necessary |
5722 | 0 | if ( fmt->p_free ) |
5723 | 0 | { |
5724 | 0 | free(fmt->p - fmt->p_off); |
5725 | 0 | fmt->p_free = 0; |
5726 | 0 | } |
5727 | 0 | line->d.indiv_dirty = 1; |
5728 | 0 | fmt->p = NULL; |
5729 | 0 | } |
5730 | 0 | if (is_len) { |
5731 | 0 | line->rlen = get_rlen(hdr, line); |
5732 | 0 | } |
5733 | 0 | return 0; |
5734 | 0 | } |
5735 | | |
5736 | 0 | line->n_sample = bcf_hdr_nsamples(hdr); |
5737 | 0 | int nps = n / line->n_sample; // number of values per sample |
5738 | 0 | assert( nps && nps*line->n_sample==n ); // must be divisible by n_sample |
5739 | | |
5740 | | // Encode the values and determine the size required to accommodate the values |
5741 | 0 | kstring_t str = {0,0,0}; |
5742 | 0 | bcf_enc_int1(&str, fmt_id); |
5743 | 0 | assert(values != NULL); |
5744 | 0 | if ( type==BCF_HT_INT ) |
5745 | 0 | bcf_enc_vint(&str, n, (int32_t*)values, nps); |
5746 | 0 | else if ( type==BCF_HT_REAL ) |
5747 | 0 | { |
5748 | 0 | bcf_enc_size(&str, nps, BCF_BT_FLOAT); |
5749 | 0 | serialize_float_array(&str, nps*line->n_sample, (float *) values); |
5750 | 0 | } |
5751 | 0 | else if ( type==BCF_HT_STR ) |
5752 | 0 | { |
5753 | 0 | bcf_enc_size(&str, nps, BCF_BT_CHAR); |
5754 | 0 | kputsn((char*)values, nps*line->n_sample, &str); |
5755 | 0 | } |
5756 | 0 | else |
5757 | 0 | { |
5758 | 0 | hts_log_error("The type %d not implemented yet at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1); |
5759 | 0 | abort(); |
5760 | 0 | } |
5761 | | |
5762 | 0 | if ( !fmt ) |
5763 | 0 | { |
5764 | | // Not present, new format field |
5765 | 0 | line->n_fmt++; |
5766 | 0 | hts_expand0(bcf_fmt_t, line->n_fmt, line->d.m_fmt, line->d.fmt); |
5767 | | |
5768 | | // Special case: VCF specification requires that GT is always first |
5769 | 0 | if ( line->n_fmt > 1 && key[0]=='G' && key[1]=='T' && !key[2] ) |
5770 | 0 | { |
5771 | 0 | for (i=line->n_fmt-1; i>0; i--) |
5772 | 0 | line->d.fmt[i] = line->d.fmt[i-1]; |
5773 | 0 | fmt = &line->d.fmt[0]; |
5774 | 0 | } |
5775 | 0 | else |
5776 | 0 | fmt = &line->d.fmt[line->n_fmt-1]; |
5777 | 0 | bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt); |
5778 | 0 | line->d.indiv_dirty = 1; |
5779 | 0 | fmt->p_free = 1; |
5780 | 0 | } |
5781 | 0 | else |
5782 | 0 | { |
5783 | | // The tag is already present, check if it is big enough to accommodate the new block |
5784 | 0 | if ( fmt->p && str.l <= fmt->p_len + fmt->p_off ) |
5785 | 0 | { |
5786 | | // good, the block is big enough |
5787 | 0 | if ( str.l != fmt->p_len + fmt->p_off ) line->d.indiv_dirty = 1; |
5788 | 0 | uint8_t *ptr = fmt->p - fmt->p_off; |
5789 | 0 | memcpy(ptr, str.s, str.l); |
5790 | 0 | free(str.s); |
5791 | 0 | int p_free = fmt->p_free; |
5792 | 0 | bcf_unpack_fmt_core1(ptr, line->n_sample, fmt); |
5793 | 0 | fmt->p_free = p_free; |
5794 | 0 | } |
5795 | 0 | else |
5796 | 0 | { |
5797 | 0 | if ( fmt->p_free ) |
5798 | 0 | free(fmt->p - fmt->p_off); |
5799 | 0 | bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt); |
5800 | 0 | fmt->p_free = 1; |
5801 | 0 | line->d.indiv_dirty = 1; |
5802 | 0 | } |
5803 | 0 | } |
5804 | 0 | line->unpacked |= BCF_UN_FMT; |
5805 | |
|
5806 | 0 | if (is_len) { |
5807 | 0 | line->rlen = get_rlen(hdr, line); |
5808 | 0 | } |
5809 | 0 | return 0; |
5810 | 0 | } |
5811 | | |
5812 | | |
5813 | | int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n) |
5814 | 0 | { |
5815 | 0 | if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT); |
5816 | 0 | line->d.shared_dirty |= BCF1_DIRTY_FLT; |
5817 | 0 | line->d.n_flt = n; |
5818 | 0 | if ( !n ) return 0; |
5819 | 0 | hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt); |
5820 | 0 | int i; |
5821 | 0 | for (i=0; i<n; i++) |
5822 | 0 | line->d.flt[i] = flt_ids[i]; |
5823 | 0 | return 0; |
5824 | 0 | } |
5825 | | |
5826 | | int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id) |
5827 | 0 | { |
5828 | 0 | if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT); |
5829 | 0 | int i; |
5830 | 0 | for (i=0; i<line->d.n_flt; i++) |
5831 | 0 | if ( flt_id==line->d.flt[i] ) break; |
5832 | 0 | if ( i<line->d.n_flt ) return 0; // this filter is already set |
5833 | 0 | line->d.shared_dirty |= BCF1_DIRTY_FLT; |
5834 | 0 | if ( flt_id==0 ) // set to PASS |
5835 | 0 | line->d.n_flt = 1; |
5836 | 0 | else if ( line->d.n_flt==1 && line->d.flt[0]==0 ) |
5837 | 0 | line->d.n_flt = 1; |
5838 | 0 | else |
5839 | 0 | line->d.n_flt++; |
5840 | 0 | hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt); |
5841 | 0 | line->d.flt[line->d.n_flt-1] = flt_id; |
5842 | 0 | return 1; |
5843 | 0 | } |
5844 | | int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int pass) |
5845 | 0 | { |
5846 | 0 | if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT); |
5847 | 0 | int i; |
5848 | 0 | for (i=0; i<line->d.n_flt; i++) |
5849 | 0 | if ( flt_id==line->d.flt[i] ) break; |
5850 | 0 | if ( i==line->d.n_flt ) return 0; // the filter is not present |
5851 | 0 | line->d.shared_dirty |= BCF1_DIRTY_FLT; |
5852 | 0 | if ( i!=line->d.n_flt-1 ) memmove(line->d.flt+i,line->d.flt+i+1,(line->d.n_flt-i-1)*sizeof(*line->d.flt)); |
5853 | 0 | line->d.n_flt--; |
5854 | 0 | if ( !line->d.n_flt && pass ) bcf_add_filter(hdr,line,0); |
5855 | 0 | return 0; |
5856 | 0 | } |
5857 | | |
5858 | | int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter) |
5859 | 0 | { |
5860 | 0 | if ( filter[0]=='.' && !filter[1] ) filter = "PASS"; |
5861 | 0 | int id = bcf_hdr_id2int(hdr, BCF_DT_ID, filter); |
5862 | 0 | if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FLT,id) ) return -1; // not defined in the header |
5863 | | |
5864 | 0 | if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT); |
5865 | 0 | if ( id==0 && !line->d.n_flt) return 1; // PASS |
5866 | | |
5867 | 0 | int i; |
5868 | 0 | for (i=0; i<line->d.n_flt; i++) |
5869 | 0 | if ( line->d.flt[i]==id ) return 1; |
5870 | 0 | return 0; |
5871 | 0 | } |
5872 | | |
5873 | | static inline int _bcf1_sync_alleles(const bcf_hdr_t *hdr, bcf1_t *line, int nals) |
5874 | 0 | { |
5875 | 0 | line->d.shared_dirty |= BCF1_DIRTY_ALS; |
5876 | 0 | line->d.var_type = -1; |
5877 | |
|
5878 | 0 | line->n_allele = nals; |
5879 | 0 | hts_expand(char*, line->n_allele, line->d.m_allele, line->d.allele); |
5880 | |
|
5881 | 0 | char *als = line->d.als; |
5882 | 0 | int n = 0; |
5883 | 0 | while (n<nals) |
5884 | 0 | { |
5885 | 0 | line->d.allele[n] = als; |
5886 | 0 | while ( *als ) als++; |
5887 | 0 | als++; |
5888 | 0 | n++; |
5889 | 0 | } |
5890 | | // Update REF length. Note that END is 1-based while line->pos 0-based |
5891 | 0 | line->rlen = get_rlen(hdr, line); |
5892 | |
|
5893 | 0 | return 0; |
5894 | 0 | } |
5895 | | int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals) |
5896 | 0 | { |
5897 | 0 | if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR); |
5898 | 0 | char *free_old = NULL; |
5899 | 0 | char buffer[256]; |
5900 | 0 | size_t used = 0; |
5901 | | |
5902 | | // The pointers in alleles may point into the existing line->d.als memory, |
5903 | | // so care needs to be taken not to clobber them while updating. Usually |
5904 | | // they will be short so we can copy through an intermediate buffer. |
5905 | | // If they're longer, or won't fit in the existing allocation we |
5906 | | // can allocate a new buffer to write into. Note that in either case |
5907 | | // pointers to line->d.als memory in alleles may not be valid when we've |
5908 | | // finished. |
5909 | 0 | int i; |
5910 | 0 | size_t avail = line->d.m_als < sizeof(buffer) ? line->d.m_als : sizeof(buffer); |
5911 | 0 | for (i=0; i<nals; i++) { |
5912 | 0 | size_t sz = strlen(alleles[i]) + 1; |
5913 | 0 | if (avail - used < sz) |
5914 | 0 | break; |
5915 | 0 | memcpy(buffer + used, alleles[i], sz); |
5916 | 0 | used += sz; |
5917 | 0 | } |
5918 | | |
5919 | | // Did we miss anything? |
5920 | 0 | if (i < nals) { |
5921 | 0 | int j; |
5922 | 0 | size_t needed = used; |
5923 | 0 | char *new_als; |
5924 | 0 | for (j = i; j < nals; j++) |
5925 | 0 | needed += strlen(alleles[j]) + 1; |
5926 | 0 | if (needed < line->d.m_als) // Don't shrink the buffer |
5927 | 0 | needed = line->d.m_als; |
5928 | 0 | if (needed > INT_MAX) { |
5929 | 0 | hts_log_error("REF + alleles too long to fit in a BCF record"); |
5930 | 0 | return -1; |
5931 | 0 | } |
5932 | 0 | new_als = malloc(needed); |
5933 | 0 | if (!new_als) |
5934 | 0 | return -1; |
5935 | 0 | free_old = line->d.als; |
5936 | 0 | line->d.als = new_als; |
5937 | 0 | line->d.m_als = needed; |
5938 | 0 | } |
5939 | | |
5940 | | // Copy from the temp buffer to the destination |
5941 | 0 | if (used) { |
5942 | 0 | assert(used <= line->d.m_als); |
5943 | 0 | memcpy(line->d.als, buffer, used); |
5944 | 0 | } |
5945 | | |
5946 | | // Add in any remaining entries - if this happens we will always be |
5947 | | // writing to a newly-allocated buffer. |
5948 | 0 | for (; i < nals; i++) { |
5949 | 0 | size_t sz = strlen(alleles[i]) + 1; |
5950 | 0 | memcpy(line->d.als + used, alleles[i], sz); |
5951 | 0 | used += sz; |
5952 | 0 | } |
5953 | |
|
5954 | 0 | if (free_old) |
5955 | 0 | free(free_old); |
5956 | 0 | return _bcf1_sync_alleles(hdr,line,nals); |
5957 | 0 | } |
5958 | | |
5959 | | int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string) |
5960 | 0 | { |
5961 | 0 | if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR); |
5962 | 0 | kstring_t tmp; |
5963 | 0 | tmp.l = 0; tmp.s = line->d.als; tmp.m = line->d.m_als; |
5964 | 0 | kputs(alleles_string, &tmp); |
5965 | 0 | line->d.als = tmp.s; line->d.m_als = tmp.m; |
5966 | |
|
5967 | 0 | int nals = 1; |
5968 | 0 | char *t = line->d.als; |
5969 | 0 | while (*t) |
5970 | 0 | { |
5971 | 0 | if ( *t==',' ) { *t = 0; nals++; } |
5972 | 0 | t++; |
5973 | 0 | } |
5974 | 0 | return _bcf1_sync_alleles(hdr, line, nals); |
5975 | 0 | } |
5976 | | |
5977 | | int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id) |
5978 | 0 | { |
5979 | 0 | if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR); |
5980 | 0 | kstring_t tmp; |
5981 | 0 | tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id; |
5982 | 0 | if ( id ) |
5983 | 0 | kputs(id, &tmp); |
5984 | 0 | else |
5985 | 0 | kputs(".", &tmp); |
5986 | 0 | line->d.id = tmp.s; line->d.m_id = tmp.m; |
5987 | 0 | line->d.shared_dirty |= BCF1_DIRTY_ID; |
5988 | 0 | return 0; |
5989 | 0 | } |
5990 | | |
5991 | | int bcf_add_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id) |
5992 | 0 | { |
5993 | 0 | if ( !id ) return 0; |
5994 | 0 | if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR); |
5995 | |
|
5996 | 0 | kstring_t tmp; |
5997 | 0 | tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id; |
5998 | |
|
5999 | 0 | int len = strlen(id); |
6000 | 0 | char *dst = line->d.id; |
6001 | 0 | while ( *dst && (dst=strstr(dst,id)) ) |
6002 | 0 | { |
6003 | 0 | if ( dst[len]!=0 && dst[len]!=';' ) dst++; // a prefix, not a match |
6004 | 0 | else if ( dst==line->d.id || dst[-1]==';' ) return 0; // already present |
6005 | 0 | dst++; // a suffix, not a match |
6006 | 0 | } |
6007 | 0 | if ( line->d.id && (line->d.id[0]!='.' || line->d.id[1]) ) |
6008 | 0 | { |
6009 | 0 | tmp.l = strlen(line->d.id); |
6010 | 0 | kputc(';',&tmp); |
6011 | 0 | } |
6012 | 0 | kputs(id,&tmp); |
6013 | |
|
6014 | 0 | line->d.id = tmp.s; line->d.m_id = tmp.m; |
6015 | 0 | line->d.shared_dirty |= BCF1_DIRTY_ID; |
6016 | 0 | return 0; |
6017 | |
|
6018 | 0 | } |
6019 | | |
6020 | | bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key) |
6021 | 0 | { |
6022 | 0 | int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key); |
6023 | 0 | if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) ) return NULL; // no such FMT field in the header |
6024 | 0 | return bcf_get_fmt_id(line, id); |
6025 | 0 | } |
6026 | | |
6027 | | bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key) |
6028 | 0 | { |
6029 | 0 | int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key); |
6030 | 0 | if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,id) ) return NULL; // no such INFO field in the header |
6031 | 0 | return bcf_get_info_id(line, id); |
6032 | 0 | } |
6033 | | |
6034 | | bcf_fmt_t *bcf_get_fmt_id(bcf1_t *line, const int id) |
6035 | 0 | { |
6036 | 0 | int i; |
6037 | 0 | if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT); |
6038 | 0 | for (i=0; i<line->n_fmt; i++) |
6039 | 0 | { |
6040 | 0 | if ( line->d.fmt[i].id==id ) return &line->d.fmt[i]; |
6041 | 0 | } |
6042 | 0 | return NULL; |
6043 | 0 | } |
6044 | | |
6045 | | bcf_info_t *bcf_get_info_id(bcf1_t *line, const int id) |
6046 | 0 | { |
6047 | 0 | int i; |
6048 | 0 | if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO); |
6049 | 0 | for (i=0; i<line->n_info; i++) |
6050 | 0 | { |
6051 | 0 | if ( line->d.info[i].key==id ) return &line->d.info[i]; |
6052 | 0 | } |
6053 | 0 | return NULL; |
6054 | 0 | } |
6055 | | |
6056 | | |
6057 | | int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type) |
6058 | 0 | { |
6059 | 0 | int i, ret = -4, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag); |
6060 | 0 | if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,tag_id) ) return -1; // no such INFO field in the header |
6061 | 0 | if ( bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id)!=(type & 0xff) ) return -2; // expected different type |
6062 | | |
6063 | 0 | if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO); |
6064 | |
|
6065 | 0 | for (i=0; i<line->n_info; i++) |
6066 | 0 | if ( line->d.info[i].key==tag_id ) break; |
6067 | 0 | if ( i==line->n_info ) return ( type==BCF_HT_FLAG ) ? 0 : -3; // the tag is not present in this record |
6068 | 0 | if ( type==BCF_HT_FLAG ) return 1; |
6069 | | |
6070 | 0 | bcf_info_t *info = &line->d.info[i]; |
6071 | 0 | if ( !info->vptr ) return -3; // the tag was marked for removal |
6072 | 0 | if ( type==BCF_HT_STR ) |
6073 | 0 | { |
6074 | 0 | if ( *ndst < info->len+1 ) |
6075 | 0 | { |
6076 | 0 | *ndst = info->len + 1; |
6077 | 0 | *dst = realloc(*dst, *ndst); |
6078 | 0 | } |
6079 | 0 | memcpy(*dst,info->vptr,info->len); |
6080 | 0 | ((uint8_t*)*dst)[info->len] = 0; |
6081 | 0 | return info->len; |
6082 | 0 | } |
6083 | | |
6084 | | // Make sure the buffer is big enough |
6085 | 0 | int size1; |
6086 | 0 | switch (type) { |
6087 | 0 | case BCF_HT_INT: size1 = sizeof(int32_t); break; |
6088 | 0 | case BCF_HT_LONG: size1 = sizeof(int64_t); break; |
6089 | 0 | case BCF_HT_REAL: size1 = sizeof(float); break; |
6090 | 0 | default: |
6091 | 0 | hts_log_error("Unexpected output type %d at %s:%"PRIhts_pos, type, bcf_seqname_safe(hdr,line), line->pos+1); |
6092 | 0 | return -2; |
6093 | 0 | } |
6094 | 0 | if ( *ndst < info->len ) |
6095 | 0 | { |
6096 | 0 | *ndst = info->len; |
6097 | 0 | *dst = realloc(*dst, *ndst * size1); |
6098 | 0 | } |
6099 | |
|
6100 | 0 | #define BRANCH(type_t, convert, is_missing, is_vector_end, set_missing, set_regular, out_type_t) do { \ |
6101 | 0 | out_type_t *tmp = (out_type_t *) *dst; \ |
6102 | 0 | int j; \ |
6103 | 0 | for (j=0; j<info->len; j++) \ |
6104 | 0 | { \ |
6105 | 0 | type_t p = convert(info->vptr + j * sizeof(type_t)); \ |
6106 | 0 | if ( is_vector_end ) break; \ |
6107 | 0 | if ( is_missing ) set_missing; \ |
6108 | 0 | else set_regular; \ |
6109 | 0 | tmp++; \ |
6110 | 0 | } \ |
6111 | 0 | ret = j; \ |
6112 | 0 | } while (0) |
6113 | 0 | switch (info->type) { |
6114 | 0 | case BCF_BT_INT8: |
6115 | 0 | if (type == BCF_HT_LONG) { |
6116 | 0 | BRANCH(int8_t, le_to_i8, p==bcf_int8_missing, p==bcf_int8_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t); |
6117 | 0 | } else { |
6118 | 0 | BRANCH(int8_t, le_to_i8, p==bcf_int8_missing, p==bcf_int8_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); |
6119 | 0 | } |
6120 | 0 | break; |
6121 | 0 | case BCF_BT_INT16: |
6122 | 0 | if (type == BCF_HT_LONG) { |
6123 | 0 | BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t); |
6124 | 0 | } else { |
6125 | 0 | BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); |
6126 | 0 | } |
6127 | 0 | break; |
6128 | 0 | case BCF_BT_INT32: |
6129 | 0 | if (type == BCF_HT_LONG) { |
6130 | 0 | BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t); break; |
6131 | 0 | } else { |
6132 | 0 | BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); break; |
6133 | 0 | } |
6134 | 0 | case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set(tmp, p), float); break; |
6135 | 0 | default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, info->type, bcf_seqname_safe(hdr,line), line->pos+1); return -2; |
6136 | 0 | } |
6137 | 0 | #undef BRANCH |
6138 | 0 | return ret; // set by BRANCH |
6139 | 0 | } |
6140 | | |
6141 | | int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst) |
6142 | 0 | { |
6143 | 0 | int i,tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag); |
6144 | 0 | if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1; // no such FORMAT field in the header |
6145 | 0 | if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2; // expected different type |
6146 | | |
6147 | 0 | if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT); |
6148 | |
|
6149 | 0 | for (i=0; i<line->n_fmt; i++) |
6150 | 0 | if ( line->d.fmt[i].id==tag_id ) break; |
6151 | 0 | if ( i==line->n_fmt ) return -3; // the tag is not present in this record |
6152 | 0 | bcf_fmt_t *fmt = &line->d.fmt[i]; |
6153 | 0 | if ( !fmt->p ) return -3; // the tag was marked for removal |
6154 | | |
6155 | 0 | int nsmpl = bcf_hdr_nsamples(hdr); |
6156 | 0 | if ( !*dst ) |
6157 | 0 | { |
6158 | 0 | *dst = (char**) malloc(sizeof(char*)*nsmpl); |
6159 | 0 | if ( !*dst ) return -4; // could not alloc |
6160 | 0 | (*dst)[0] = NULL; |
6161 | 0 | } |
6162 | 0 | int n = (fmt->n+1)*nsmpl; |
6163 | 0 | if ( *ndst < n ) |
6164 | 0 | { |
6165 | 0 | (*dst)[0] = realloc((*dst)[0], n); |
6166 | 0 | if ( !(*dst)[0] ) return -4; // could not alloc |
6167 | 0 | *ndst = n; |
6168 | 0 | } |
6169 | 0 | for (i=0; i<nsmpl; i++) |
6170 | 0 | { |
6171 | 0 | uint8_t *src = fmt->p + i*fmt->n; |
6172 | 0 | uint8_t *tmp = (uint8_t*)(*dst)[0] + i*(fmt->n+1); |
6173 | 0 | memcpy(tmp,src,fmt->n); |
6174 | 0 | tmp[fmt->n] = 0; |
6175 | 0 | (*dst)[i] = (char*) tmp; |
6176 | 0 | } |
6177 | 0 | return n; |
6178 | 0 | } |
6179 | | |
6180 | | int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type) |
6181 | 0 | { |
6182 | 0 | int i,j, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag); |
6183 | 0 | if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1; // no such FORMAT field in the header |
6184 | 0 | if ( tag[0]=='G' && tag[1]=='T' && tag[2]==0 ) |
6185 | 0 | { |
6186 | | // Ugly: GT field is considered to be a string by the VCF header but BCF represents it as INT. |
6187 | 0 | if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2; |
6188 | 0 | } |
6189 | 0 | else if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=type ) return -2; // expected different type |
6190 | | |
6191 | 0 | if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT); |
6192 | |
|
6193 | 0 | for (i=0; i<line->n_fmt; i++) |
6194 | 0 | if ( line->d.fmt[i].id==tag_id ) break; |
6195 | 0 | if ( i==line->n_fmt ) return -3; // the tag is not present in this record |
6196 | 0 | bcf_fmt_t *fmt = &line->d.fmt[i]; |
6197 | 0 | if ( !fmt->p ) return -3; // the tag was marked for removal |
6198 | | |
6199 | 0 | if ( type==BCF_HT_STR ) |
6200 | 0 | { |
6201 | 0 | int n = fmt->n*bcf_hdr_nsamples(hdr); |
6202 | 0 | if ( *ndst < n ) |
6203 | 0 | { |
6204 | 0 | *dst = realloc(*dst, n); |
6205 | 0 | if ( !*dst ) return -4; // could not alloc |
6206 | 0 | *ndst = n; |
6207 | 0 | } |
6208 | 0 | memcpy(*dst,fmt->p,n); |
6209 | 0 | return n; |
6210 | 0 | } |
6211 | | |
6212 | | // Make sure the buffer is big enough |
6213 | 0 | int nsmpl = bcf_hdr_nsamples(hdr); |
6214 | 0 | int size1 = type==BCF_HT_INT ? sizeof(int32_t) : sizeof(float); |
6215 | 0 | if ( *ndst < fmt->n*nsmpl ) |
6216 | 0 | { |
6217 | 0 | *ndst = fmt->n*nsmpl; |
6218 | 0 | *dst = realloc(*dst, *ndst*size1); |
6219 | 0 | if ( !*dst ) return -4; // could not alloc |
6220 | 0 | } |
6221 | | |
6222 | 0 | #define BRANCH(type_t, convert, is_missing, is_vector_end, set_missing, set_vector_end, set_regular, out_type_t) { \ |
6223 | 0 | out_type_t *tmp = (out_type_t *) *dst; \ |
6224 | 0 | uint8_t *fmt_p = fmt->p; \ |
6225 | 0 | for (i=0; i<nsmpl; i++) \ |
6226 | 0 | { \ |
6227 | 0 | for (j=0; j<fmt->n; j++) \ |
6228 | 0 | { \ |
6229 | 0 | type_t p = convert(fmt_p + j * sizeof(type_t)); \ |
6230 | 0 | if ( is_missing ) set_missing; \ |
6231 | 0 | else if ( is_vector_end ) { set_vector_end; break; } \ |
6232 | 0 | else set_regular; \ |
6233 | 0 | tmp++; \ |
6234 | 0 | } \ |
6235 | 0 | for (; j<fmt->n; j++) { set_vector_end; tmp++; } \ |
6236 | 0 | fmt_p += fmt->size; \ |
6237 | 0 | } \ |
6238 | 0 | } |
6239 | 0 | switch (fmt->type) { |
6240 | 0 | case BCF_BT_INT8: BRANCH(int8_t, le_to_i8, p==bcf_int8_missing, p==bcf_int8_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break; |
6241 | 0 | case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break; |
6242 | 0 | case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break; |
6243 | 0 | case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set_vector_end(*tmp), bcf_float_set(tmp, p), float); break; |
6244 | 0 | default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, fmt->type, bcf_seqname_safe(hdr,line), line->pos+1); exit(1); |
6245 | 0 | } |
6246 | 0 | #undef BRANCH |
6247 | | |
6248 | 0 | return nsmpl*fmt->n; |
6249 | 0 | } |
6250 | | |
// One entry of an error description table: an error code and its text
typedef struct err_desc {
    int errorcode;              // error code this entry describes
    const char *description;    // text describing that error
} err_desc;
6256 | | |
6257 | | // error descriptions |
6258 | | static const err_desc errdesc_bcf[] = { |
6259 | | { BCF_ERR_CTG_UNDEF, "Contig not defined in header"}, |
6260 | | { BCF_ERR_TAG_UNDEF, "Tag not defined in header" }, |
6261 | | { BCF_ERR_NCOLS, "Incorrect number of columns" }, |
6262 | | { BCF_ERR_LIMITS, "Limits reached" }, |
6263 | | { BCF_ERR_CHAR, "Invalid character" }, |
6264 | | { BCF_ERR_CTG_INVALID, "Invalid contig" }, |
6265 | | { BCF_ERR_TAG_INVALID, "Invalid tag" }, |
6266 | | }; |
6267 | | |
/// Append a description to a comma-separated list held in a fixed-size buffer
/** @param buffer       buffer holding the list built so far
    @param offset       in/out: number of characters already in the buffer
    @param maxbuffer    total size of the buffer, at least 4
    @param description  text to append
    The first description is written as is, later ones are preceded by ','.
    If the text (and its separator) does not fit with room for the
    terminating NUL, "..." is written instead: at the current offset when
    more than 4 bytes remain, otherwise over the last 4 bytes of the buffer.
    The offset is not advanced in that case.
    Returns 0 on success, -1 if an argument is invalid or the text did not fit
*/
static int add_desc_to_buffer(char *buffer, size_t *offset, size_t maxbuffer, const char *description) {

    if (!description || !buffer || !offset || (maxbuffer < 4))
        return -1;

    const size_t room = maxbuffer - *offset;
    // Nothing written yet means no separator is needed
    const char *separator = (room == maxbuffer) ? "" : ",";
    const size_t wanted = strlen(description) + strlen(separator);

    if (room <= wanted) {
        // Does not fit: leave an ellipsis and do not advance the offset
        size_t dots_at = (room <= 4) ? maxbuffer - 4 : *offset;
        snprintf(buffer + dots_at, 4, "...");
        return -1;
    }

    *offset += snprintf(buffer + *offset, room, "%s%s", separator, description);
    return 0;
}
6291 | | |
6292 | | //get description for given error code. return NULL on error |
6293 | 968 | const char *bcf_strerror(int errorcode, char *buffer, size_t maxbuffer) { |
6294 | 968 | size_t usedup = 0; |
6295 | 968 | int ret = 0; |
6296 | 968 | int idx; |
6297 | | |
6298 | 968 | if (!buffer || maxbuffer < 4) |
6299 | 0 | return NULL; //invalid / insufficient buffer |
6300 | | |
6301 | 968 | if (!errorcode) { |
6302 | 0 | buffer[0] = '\0'; //no error, set null |
6303 | 0 | return buffer; |
6304 | 0 | } |
6305 | | |
6306 | 7.74k | for (idx = 0; idx < sizeof(errdesc_bcf) / sizeof(err_desc); ++idx) { |
6307 | 6.77k | if (errorcode & errdesc_bcf[idx].errorcode) { //error is set, add description |
6308 | 2.00k | ret = add_desc_to_buffer(buffer, &usedup, maxbuffer, errdesc_bcf[idx].description); |
6309 | 2.00k | if (ret < 0) |
6310 | 0 | break; //not enough space, ... added, no need to continue |
6311 | | |
6312 | 2.00k | errorcode &= ~errdesc_bcf[idx].errorcode; //reset the error |
6313 | 2.00k | } |
6314 | 6.77k | } |
6315 | | |
6316 | 968 | if (errorcode && (ret >= 0)) { //undescribed error is present in error code and had enough buffer, try to add unkonwn error as well§ |
6317 | 0 | add_desc_to_buffer(buffer, &usedup, maxbuffer, "Unknown error"); |
6318 | 0 | } |
6319 | 968 | return buffer; |
6320 | 968 | } |
6321 | | |
6322 | | /** |
6323 | | * bcf_format_gt_v2 - formats GT information on a string |
6324 | | * @param hdr - bcf header, to get version |
6325 | | * @param fmt - pointer to bcf format data |
6326 | | * @param isample - position of interested sample in data |
6327 | | * @param str - pointer to output string |
6328 | | * Returns 0 on success and -1 on failure |
6329 | | * This method is preferred over bcf_format_gt as this supports vcf4.4 and |
6330 | | * prefixed phasing. Explicit / prefixed phasing for 1st allele is used only |
6331 | | * when it is a must to correctly express phasing. |
6332 | | * correctly express phasing. |
6333 | | */ |
6334 | | int bcf_format_gt_v2(const bcf_hdr_t *hdr, bcf_fmt_t *fmt, int isample, kstring_t *str) |
6335 | 13.8k | { |
6336 | 13.8k | uint32_t e = 0; |
6337 | 13.8k | int ploidy = 1, anyunphased = 0; |
6338 | 13.8k | int32_t val0 = 0; |
6339 | 13.8k | size_t pos = str ? str->l : 0; |
6340 | | |
6341 | 13.8k | #define BRANCH(type_t, convert, missing, vector_end) { \ |
6342 | 13.2k | uint8_t *ptr = fmt->p + isample*fmt->size; \ |
6343 | 13.2k | int i; \ |
6344 | 30.9k | for (i=0; i<fmt->n; i++, ptr += sizeof(type_t)) \ |
6345 | 24.9k | { \ |
6346 | 24.9k | type_t val = convert(ptr); \ |
6347 | 24.9k | if ( val == vector_end ) break; \ |
6348 | 24.9k | if (!i) { val0 = val; } \ |
6349 | 17.7k | if (i) { \ |
6350 | 4.43k | e |= kputc("/|"[val & 1], str) < 0; \ |
6351 | 4.43k | anyunphased |= !(val & 1); \ |
6352 | 4.43k | } \ |
6353 | 17.7k | if (!(val >> 1)) e |= kputc('.', str) < 0; \ |
6354 | 17.7k | else e |= kputw((val >> 1) - 1, str) < 0; \ |
6355 | 17.7k | } \ |
6356 | 13.2k | if (i == 0) e |= kputc('.', str) < 0; \ |
6357 | 13.2k | ploidy = i; \ |
6358 | 13.2k | } |
6359 | 13.8k | switch (fmt->type) { |
6360 | 6.46k | case BCF_BT_INT8: BRANCH(int8_t, le_to_i8, bcf_int8_missing, |
6361 | 6.46k | bcf_int8_vector_end); break; |
6362 | 2.20k | case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, bcf_int16_missing, |
6363 | 2.20k | bcf_int16_vector_end); break; |
6364 | 4.60k | case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, bcf_int32_missing, |
6365 | 4.60k | bcf_int32_vector_end); break; |
6366 | 606 | case BCF_BT_NULL: e |= kputc('.', str) < 0; break; |
6367 | 0 | default: hts_log_error("Unexpected type %d", fmt->type); return -2; |
6368 | 13.8k | } |
6369 | 13.8k | #undef BRANCH |
6370 | | |
6371 | 13.8k | if (hdr && get_hdr_aux(hdr)->version >= VCF44) { |
6372 | | //output which supports prefixed phasing |
6373 | | |
6374 | | /* update 1st allele's phasing if required and append rest to it. |
6375 | | use prefixed phasing only when it is a must. i.e. without which the |
6376 | | inferred value will be incorrect */ |
6377 | 6.38k | if (val0 & 1) { |
6378 | | /* 1st one is phased, if ploidy is > 1 and an unphased allele exists |
6379 | | need to specify explicitly */ |
6380 | 810 | e |= (ploidy > 1 && anyunphased) ? |
6381 | 49 | (kinsert_char('|', pos, str) < 0) : |
6382 | 810 | (ploidy <= 1 && !((val0 >> 1)) ? //|. needs explicit o/p |
6383 | 0 | (kinsert_char('|', pos, str) < 0) : |
6384 | 761 | 0); |
6385 | 5.57k | } else { |
6386 | | /* 1st allele is unphased, if ploidy is = 1 or allele is '.' or |
6387 | | ploidy > 1 and no other unphased allele exist, need to specify |
6388 | | explicitly */ |
6389 | 5.57k | e |= ((ploidy <= 1 && val0 != 0) || (ploidy > 1 && !anyunphased)) ? |
6390 | 2.93k | (kinsert_char('/', pos, str) < 0) : |
6391 | 5.57k | 0; |
6392 | 5.57k | } |
6393 | 6.38k | } |
6394 | 13.8k | return e == 0 ? 0 : -1; |
6395 | 13.8k | } |
6396 | | |
6397 | | /** |
6398 | | * get_rlen - calculates and returns rlen value |
6399 | | * @param h - bcf header |
6400 | | * @param v - bcf data |
6401 | | * Returns rlen calculated on success and -1 on failure. |
6402 | | * rlen calculation is dependent on vcf version and a few other field data. |
6403 | | * When bcf decoded data is available, refers it. When not available, retrieves |
6404 | | * required field data by seeking on the data stream. |
6405 | | * Ideally pos & version be set appropriately before any info/format field |
6406 | | * update to have proper rlen calculation. |
6407 | | * As version is not kept properly updated in practice, it is ignored in calcs. |
6408 | | */ |
6409 | | static int64_t get_rlen(const bcf_hdr_t *h, bcf1_t *v) |
6410 | 42.0k | { |
6411 | 42.0k | uint8_t *f = (uint8_t*)v->shared.s, *t = NULL, |
6412 | 42.0k | *e = (uint8_t*)v->shared.s + v->shared.l; |
6413 | 42.0k | int size, type, id, lenid, endid, svlenid, i, bad, gvcf = 0, use_svlen = 0; |
6414 | 42.0k | bcf_info_t *endinfo = NULL, *svleninfo = NULL, end_lcl, svlen_lcl; |
6415 | 42.0k | bcf_fmt_t *lenfmt = NULL, len_lcl; |
6416 | | |
6417 | | //holds SVLEN allele status for the max no of alleles |
6418 | 42.0k | uint8_t svlenals[8192]; |
6419 | | //pos from info END, fmt LEN, info SVLEN |
6420 | 42.0k | hts_pos_t end = 0, end_fmtlen = 0, end_svlen = 0, hpos; |
6421 | 42.0k | int64_t len_ref = 0, len = 0, tmp; |
6422 | 42.0k | endid = bcf_hdr_id2int(h, BCF_DT_ID, "END"); |
6423 | | |
6424 | | //initialise bytes which are to be used |
6425 | 42.0k | memset(svlenals, 0, 1 + v->n_allele / 8); |
6426 | | |
6427 | | //use decoded data where ever available and where not, get from stream |
6428 | 42.0k | if (v->unpacked & BCF_UN_STR || v->d.shared_dirty & BCF1_DIRTY_ALS) { |
6429 | 0 | for (i = 1; i < v->n_allele; ++i) { |
6430 | | // check only symbolic alt alleles |
6431 | 0 | if (v->d.allele[i][0] != '<') |
6432 | 0 | continue; |
6433 | 0 | if (svlen_on_ref_for_vcf_alt(v->d.allele[i], -1)) { |
6434 | | // del, dup or cnv allele, note to check corresponding svlen val |
6435 | 0 | svlenals[i >> 3] |= 1 << (i & 7); |
6436 | 0 | use_svlen = 1; |
6437 | 0 | } else if (!strcmp(v->d.allele[i], "<*>") || |
6438 | 0 | !strcmp(v->d.allele[i], "<NON_REF>")) { |
6439 | 0 | gvcf = 1; //gvcf present, have to check for LEN field |
6440 | 0 | } |
6441 | 0 | } |
6442 | 0 | f += v->unpack_size[0] + v->unpack_size[1]; |
6443 | 0 | len_ref = v->n_allele ? strlen(v->d.allele[0]) : 0; |
6444 | 42.0k | } else if (f < e) { |
6445 | | //skip ID |
6446 | 42.0k | size = bcf_dec_size(f, &f, &type); |
6447 | 42.0k | f += size << bcf_type_shift[type]; |
6448 | | // REF, ALT |
6449 | 1.69M | for (i = 0; i < v->n_allele; ++i) { |
6450 | | //check all alleles, w/o NUL |
6451 | 1.64M | size = bcf_dec_size(f, &f, &type); |
6452 | 1.64M | if (!i) { //REF length |
6453 | 42.0k | len_ref = size; |
6454 | 1.60M | } else if (size > 0 && *f == '<') { |
6455 | 1.95k | if (svlen_on_ref_for_vcf_alt((char *) f, size)) { |
6456 | | // del, dup or cnv allele, note to check corresponding svlen val |
6457 | 0 | svlenals[i >> 3] |= 1 << (i & 7); |
6458 | 0 | use_svlen = 1; |
6459 | 1.95k | } else if ((size == 3 && !strncmp((char*)f, "<*>", size)) || |
6460 | 1.95k | (size == 9 && !strncmp((char*)f, "<NON_REF>", size))) { |
6461 | 47 | gvcf = 1; //gvcf present, have to check for LEN field |
6462 | 47 | } |
6463 | 1.95k | } |
6464 | 1.64M | f += size << bcf_type_shift[type]; |
6465 | 1.64M | } |
6466 | 42.0k | } |
6467 | | // FILTER |
6468 | 42.0k | if (v->unpacked & BCF_UN_FLT) { |
6469 | 0 | f += v->unpack_size[2]; |
6470 | 42.0k | } else if (f < e) { |
6471 | 42.0k | size = bcf_dec_size(f, &f, &type); |
6472 | 42.0k | f += size << bcf_type_shift[type]; |
6473 | 42.0k | } |
6474 | | |
6475 | | // Only do SVLEN lookup if there are suitable symbolic alleles |
6476 | 42.0k | svlenid = use_svlen ? bcf_hdr_id2int(h, BCF_DT_ID, "SVLEN") : -1; |
6477 | | |
6478 | | // INFO |
6479 | 42.0k | if (svlenid >= 0 || endid >= 0 ) { //only if end/svlen present |
6480 | 9.46k | if (v->unpacked & BCF_UN_INFO || v->d.shared_dirty & BCF1_DIRTY_INF) { |
6481 | 0 | endinfo = bcf_get_info(h, v, "END"); |
6482 | 0 | svleninfo = bcf_get_info(h, v, "SVLEN"); |
6483 | 9.46k | } else if (f < e) { |
6484 | 11.6k | for (i = 0; i < v->n_info; ++i) { |
6485 | 6.93k | id = bcf_dec_typed_int1(f, &t); |
6486 | 6.93k | if (id == endid) { //END |
6487 | 693 | t = bcf_unpack_info_core1(f, &end_lcl); |
6488 | 693 | endinfo = &end_lcl; |
6489 | 693 | if (svleninfo || svlenid < 0) { |
6490 | 693 | break; //already got svlen or no need to search further |
6491 | 693 | } |
6492 | 6.24k | } else if (id == svlenid) { //SVLEN |
6493 | 0 | t = bcf_unpack_info_core1(f, &svlen_lcl); |
6494 | 0 | svleninfo = &svlen_lcl; |
6495 | 0 | if (endinfo || endid < 0 ) { |
6496 | 0 | break; //already got end or no need to search further |
6497 | 0 | } |
6498 | 6.24k | } else { |
6499 | 6.24k | f = t; |
6500 | 6.24k | size = bcf_dec_size(f, &t, &type); |
6501 | 6.24k | t += size << bcf_type_shift[type]; |
6502 | 6.24k | } |
6503 | 6.24k | f = t; |
6504 | 6.24k | } |
6505 | 5.41k | } |
6506 | 9.46k | } |
6507 | | |
6508 | | // Only do LEN lookup if a <*> allele was found |
6509 | 42.0k | lenid = gvcf ? bcf_hdr_id2int(h, BCF_DT_ID, "LEN") : -1; |
6510 | | |
6511 | | // FORMAT |
6512 | 42.0k | if (lenid >= 0) { |
6513 | | //with LEN and has gvcf allele |
6514 | 0 | f = (uint8_t*)v->indiv.s; t = NULL; e = (uint8_t*)v->indiv.s + v->indiv.l; |
6515 | 0 | if (v->unpacked & BCF_UN_FMT || v->d.indiv_dirty) { |
6516 | 0 | lenfmt = bcf_get_fmt(h, v, "LEN"); |
6517 | 0 | } else if (f < e) { |
6518 | 0 | for (i = 0; i < v->n_fmt; ++i) { |
6519 | 0 | id = bcf_dec_typed_int1(f, &t); |
6520 | 0 | if (id == lenid) { |
6521 | 0 | t = bcf_unpack_fmt_core1(f, v->n_sample, &len_lcl); |
6522 | 0 | lenfmt = &len_lcl; |
6523 | 0 | break; //that's all needed |
6524 | 0 | } else { |
6525 | 0 | f = t; |
6526 | 0 | size = bcf_dec_size(f, &t, &type); |
6527 | 0 | t += size * v->n_sample << bcf_type_shift[type]; |
6528 | 0 | } |
6529 | 0 | f = t; |
6530 | 0 | } |
6531 | 0 | } |
6532 | 0 | } |
6533 | | //got required data, find end and rlen |
6534 | 42.0k | if (endinfo && endinfo->vptr) { //end position given by info END |
6535 | | //end info exists, not being deleted |
6536 | 693 | end = endinfo->v1.i; |
6537 | 693 | switch(endinfo->type) { |
6538 | 0 | case BCF_BT_INT8: end = end == bcf_int8_missing ? 0 : end; break; |
6539 | 0 | case BCF_BT_INT16: end = end == bcf_int16_missing ? 0 : end; break; |
6540 | 0 | case BCF_BT_INT32: end = end == bcf_int32_missing ? 0 : end; break; |
6541 | 0 | case BCF_BT_INT64: end = end == bcf_int64_missing ? 0 : end; break; |
6542 | 693 | default: end = 0; break; //invalid |
6543 | 693 | } |
6544 | 693 | } |
6545 | | |
6546 | 42.0k | if (svleninfo && svleninfo->vptr) { |
6547 | | //svlen info exists, not being deleted |
6548 | 0 | bad = 0; |
6549 | | //get largest svlen corresponding to a <DEL> symbolic allele |
6550 | 0 | for (i = 0; i < svleninfo->len && i + 1 < v->n_allele; ++i) { |
6551 | 0 | if (!(svlenals[i >> 3] & (1 << ((i + 1) & 7)))) |
6552 | 0 | continue; |
6553 | | |
6554 | 0 | switch(svleninfo->type) { |
6555 | 0 | case BCF_BT_INT8: |
6556 | 0 | tmp = le_to_i8(&svleninfo->vptr[i]); |
6557 | 0 | tmp = tmp == bcf_int8_missing ? 0 : tmp; |
6558 | 0 | break; |
6559 | 0 | case BCF_BT_INT16: |
6560 | 0 | tmp = le_to_i16(&svleninfo->vptr[i * 2]); |
6561 | 0 | tmp = tmp == bcf_int16_missing ? 0 : tmp; |
6562 | 0 | break; |
6563 | 0 | case BCF_BT_INT32: |
6564 | 0 | tmp = le_to_i32(&svleninfo->vptr[i * 4]); |
6565 | 0 | tmp = tmp == bcf_int32_missing ? 0 : tmp; |
6566 | 0 | break; |
6567 | 0 | case BCF_BT_INT64: |
6568 | 0 | tmp = le_to_i64(&svleninfo->vptr[i * 8]); |
6569 | 0 | tmp = tmp == bcf_int64_missing ? 0 : tmp; |
6570 | 0 | break; |
6571 | 0 | default: //invalid |
6572 | 0 | tmp = 0; |
6573 | 0 | bad = 1; |
6574 | 0 | break; |
6575 | 0 | } |
6576 | 0 | if (bad) { //stop svlen check |
6577 | 0 | len = 0; |
6578 | 0 | break; |
6579 | 0 | } |
6580 | | |
6581 | 0 | tmp = tmp < 0 ? llabs(tmp) : tmp; |
6582 | 0 | if (len < tmp) len = tmp; |
6583 | 0 | } |
6584 | 0 | } |
6585 | 42.0k | if ((!svleninfo || !len) && end) { //no svlen, infer from end |
6586 | 0 | len = end > v->pos ? end - v->pos - 1 : 0; |
6587 | 0 | } |
6588 | 42.0k | end_svlen = v->pos + len + 1; //end position found from SVLEN |
6589 | | |
6590 | 42.0k | len = 0; |
6591 | 42.0k | if (lenfmt && lenfmt->p) { |
6592 | | //fmt len exists, not being deleted, has gvcf and version >= 4.5 |
6593 | 0 | int j = 0; |
6594 | 0 | int64_t offset = 0; |
6595 | 0 | bad = 0; |
6596 | 0 | for (i = 0; i < v->n_sample; ++i) { |
6597 | 0 | for (j = 0; j < lenfmt->n; ++j) { |
6598 | 0 | switch(lenfmt->type) { |
6599 | 0 | case BCF_BT_INT8: |
6600 | 0 | tmp = le_to_i8(lenfmt->p + offset + j); |
6601 | 0 | tmp = tmp == bcf_int8_missing ? 0 : tmp; |
6602 | 0 | break; |
6603 | 0 | case BCF_BT_INT16: |
6604 | 0 | tmp = le_to_i16(lenfmt->p + offset + j * 2); |
6605 | 0 | tmp = tmp == bcf_int16_missing ? 0 : tmp; |
6606 | 0 | break; |
6607 | 0 | case BCF_BT_INT32: |
6608 | 0 | tmp = le_to_i32(lenfmt->p + offset + j * 4); |
6609 | 0 | tmp = tmp == bcf_int32_missing ? 0 : tmp; |
6610 | 0 | break; |
6611 | 0 | case BCF_BT_INT64: |
6612 | 0 | tmp = le_to_i64(lenfmt->p + offset + j * 8); |
6613 | 0 | tmp = tmp == bcf_int64_missing ? 0 : tmp; |
6614 | 0 | break; |
6615 | 0 | default: //invalid |
6616 | 0 | bad = 1; |
6617 | 0 | break; |
6618 | 0 | } |
6619 | 0 | if (bad) { //stop LEN check |
6620 | 0 | len = 0; |
6621 | 0 | break; |
6622 | 0 | } |
6623 | | //assumes only gvcf have valid LEN |
6624 | 0 | if (len < tmp) len = tmp; |
6625 | 0 | } |
6626 | 0 | offset += j << bcf_type_shift[lenfmt->type]; |
6627 | 0 | } |
6628 | 0 | } |
6629 | 42.0k | if ((!lenfmt || !len) && end) { //no fmt len, infer from end |
6630 | 0 | len = end > v->pos ? end - v->pos : 0; |
6631 | 0 | } |
6632 | 42.0k | end_fmtlen = v->pos + len; //end position found from LEN |
6633 | | |
6634 | | //get largest pos, based on END, SVLEN, fmt LEN and length using it |
6635 | 42.0k | hpos = end < end_svlen ? |
6636 | 6.40k | end_svlen < end_fmtlen ? end_fmtlen : end_svlen : |
6637 | 42.0k | end < end_fmtlen ? end_fmtlen : end; |
6638 | 42.0k | len = hpos - v->pos; |
6639 | | |
6640 | | //NOTE: 'end' calculation be in sync with tbx.c:tbx_parse1 |
6641 | | |
6642 | | /* rlen to be calculated based on version, END, SVLEN, fmt LEN, ref len. |
6643 | | Relevance of these fields vary across different vcf versions. |
6644 | | Many times, these info/fmt fields are used without version updates; |
6645 | | hence these fields are used for calculation disregarding vcf version */ |
6646 | 42.0k | return len < len_ref ? len_ref : len; |
6647 | 42.0k | } |