Line | Count | Source (jump to first uncovered line) |
1 | | /* tbx.c -- tabix API functions. |
2 | | |
3 | | Copyright (C) 2009, 2010, 2012-2015, 2017-2020, 2022-2023, 2025 Genome Research Ltd. |
4 | | Copyright (C) 2010-2012 Broad Institute. |
5 | | |
6 | | Author: Heng Li <lh3@sanger.ac.uk> |
7 | | |
8 | | Permission is hereby granted, free of charge, to any person obtaining a copy |
9 | | of this software and associated documentation files (the "Software"), to deal |
10 | | in the Software without restriction, including without limitation the rights |
11 | | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
12 | | copies of the Software, and to permit persons to whom the Software is |
13 | | furnished to do so, subject to the following conditions: |
14 | | |
15 | | The above copyright notice and this permission notice shall be included in |
16 | | all copies or substantial portions of the Software. |
17 | | |
18 | | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
19 | | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
20 | | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
21 | | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
22 | | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
23 | | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
24 | | DEALINGS IN THE SOFTWARE. */ |
25 | | |
26 | | #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h |
27 | | #include <config.h> |
28 | | |
29 | | #include <stdlib.h> |
30 | | #include <string.h> |
31 | | #include <stdio.h> |
32 | | #include <assert.h> |
33 | | #include <errno.h> |
34 | | #include "htslib/tbx.h" |
35 | | #include "htslib/bgzf.h" |
36 | | #include "htslib/hts_endian.h" |
37 | | #include "hts_internal.h" |
38 | | |
39 | | #include "htslib/khash.h" |
40 | | KHASH_DECLARE(s2i, kh_cstr_t, int64_t) |
41 | | |
42 | | HTSLIB_EXPORT |
43 | | const tbx_conf_t tbx_conf_gff = { 0, 1, 4, 5, '#', 0 }; |
44 | | |
45 | | HTSLIB_EXPORT |
46 | | const tbx_conf_t tbx_conf_bed = { TBX_UCSC, 1, 2, 3, '#', 0 }; |
47 | | |
48 | | HTSLIB_EXPORT |
49 | | const tbx_conf_t tbx_conf_psltbl = { TBX_UCSC, 15, 17, 18, '#', 0 }; |
50 | | |
51 | | HTSLIB_EXPORT |
52 | | const tbx_conf_t tbx_conf_sam = { TBX_SAM, 3, 4, 0, '@', 0 }; |
53 | | |
54 | | HTSLIB_EXPORT |
55 | | const tbx_conf_t tbx_conf_vcf = { TBX_VCF, 1, 2, 0, '#', 0 }; |
56 | | const tbx_conf_t tbx_conf_gaf = { TBX_GAF, 1, 6, 0, '#', 0 }; |
57 | | |
58 | | typedef struct { |
59 | | int64_t beg, end; |
60 | | char *ss, *se; |
61 | | int tid; |
62 | | } tbx_intv_t; |
63 | | |
64 | | static inline int get_tid(tbx_t *tbx, const char *ss, int is_add) |
65 | 0 | { |
66 | 0 | khint_t k; |
67 | 0 | khash_t(s2i) *d; |
68 | 0 | if ((tbx->conf.preset&0xffff) == TBX_GAF) return(0); |
69 | 0 | if (tbx->dict == 0) tbx->dict = kh_init(s2i); |
70 | 0 | if (!tbx->dict) return -1; // Out of memory |
71 | 0 | d = (khash_t(s2i)*)tbx->dict; |
72 | 0 | if (is_add) { |
73 | 0 | int absent; |
74 | 0 | k = kh_put(s2i, d, ss, &absent); |
75 | 0 | if (absent < 0) { |
76 | 0 | return -1; // Out of memory |
77 | 0 | } else if (absent) { |
78 | 0 | char *ss_dup = strdup(ss); |
79 | 0 | if (ss_dup) { |
80 | 0 | kh_key(d, k) = ss_dup; |
81 | 0 | kh_val(d, k) = kh_size(d) - 1; |
82 | 0 | } else { |
83 | 0 | kh_del(s2i, d, k); |
84 | 0 | return -1; // Out of memory |
85 | 0 | } |
86 | 0 | } |
87 | 0 | } else k = kh_get(s2i, d, ss); |
88 | 0 | return k == kh_end(d)? -1 : kh_val(d, k); |
89 | 0 | } |
90 | | |
91 | | int tbx_name2id(tbx_t *tbx, const char *ss) |
92 | 0 | { |
93 | 0 | return get_tid(tbx, ss, 0); |
94 | 0 | } |
95 | | |
96 | | int tbx_parse1(const tbx_conf_t *conf, size_t len, char *line, tbx_intv_t *intv) |
97 | 0 | { |
98 | 0 | size_t i, b = 0; |
99 | 0 | int id = 1, getlen = 0, alcnt = 0, haveins = 0, lenpos = -1; |
100 | 0 | char *s, *t; |
101 | 0 | uint8_t insals[8192]; |
102 | 0 | int64_t reflen = 0, svlen = 0, fmtlen = 0, tmp = 0; |
103 | |
|
104 | 0 | intv->ss = intv->se = 0; intv->beg = intv->end = -1; |
105 | 0 | for (i = 0; i <= len; ++i) { |
106 | 0 | if (line[i] == '\t' || line[i] == 0) { |
107 | 0 | if (id == conf->sc) { |
108 | 0 | intv->ss = line + b; intv->se = line + i; |
109 | 0 | } else if (id == conf->bc) { |
110 | | // here ->beg is 0-based. |
111 | 0 | if ((conf->preset&0xffff) == TBX_GAF){ |
112 | | // if gaf find the smallest and largest node id |
113 | 0 | char *t; |
114 | 0 | int64_t nodeid = -1; |
115 | 0 | for (s = line + b + 1; s < line + i;) { |
116 | 0 | nodeid = strtoll(s, &t, 0); |
117 | 0 | if(intv->beg == -1){ |
118 | 0 | intv->beg = intv->end = nodeid; |
119 | 0 | } else { |
120 | 0 | if(nodeid < intv->beg){ |
121 | 0 | intv->beg = nodeid; |
122 | 0 | } |
123 | |
|
124 | 0 | if(nodeid > intv->end){ |
125 | 0 | intv->end = nodeid; |
126 | 0 | } |
127 | 0 | } |
128 | 0 | s = t + 1; |
129 | 0 | } |
130 | 0 | } else { |
131 | 0 | intv->beg = strtoll(line + b, &s, 0); |
132 | |
|
133 | 0 | if (conf->bc <= conf->ec) // don't overwrite an already set end point |
134 | 0 | intv->end = intv->beg; |
135 | |
|
136 | 0 | if ( s==line+b ) return -1; // expected int |
137 | | |
138 | 0 | if (!(conf->preset&TBX_UCSC)) |
139 | 0 | --intv->beg; |
140 | 0 | else if (conf->bc <= conf->ec) |
141 | 0 | ++intv->end; |
142 | |
|
143 | 0 | if (intv->beg < 0) { |
144 | 0 | hts_log_warning("Coordinate <= 0 detected. " |
145 | 0 | "Did you forget to use the -0 option?"); |
146 | 0 | intv->beg = 0; |
147 | 0 | } |
148 | 0 | if (intv->end < 1) intv->end = 1; |
149 | 0 | } |
150 | 0 | } else { |
151 | 0 | if ((conf->preset&0xffff) == TBX_GENERIC) { |
152 | 0 | if (id == conf->ec) |
153 | 0 | { |
154 | 0 | intv->end = strtoll(line + b, &s, 0); |
155 | 0 | if ( s==line+b ) return -1; // expected int |
156 | 0 | } |
157 | 0 | } else if ((conf->preset&0xffff) == TBX_SAM) { |
158 | 0 | if (id == 6) { // CIGAR |
159 | 0 | int l = 0; |
160 | 0 | char *t; |
161 | 0 | for (s = line + b; s < line + i;) { |
162 | 0 | long x = strtol(s, &t, 10); |
163 | 0 | char op = toupper_c(*t); |
164 | 0 | if (op == 'M' || op == 'D' || op == 'N') l += x; |
165 | 0 | s = t + 1; |
166 | 0 | } |
167 | 0 | if (l == 0) l = 1; |
168 | 0 | intv->end = intv->beg + l; |
169 | 0 | } |
170 | 0 | } else if ((conf->preset&0xffff) == TBX_VCF) { |
171 | 0 | if (id == 4) { //ref allele |
172 | 0 | if (b < i) intv->end = intv->beg + (i - b); |
173 | 0 | ++alcnt; |
174 | 0 | reflen = i - b; |
175 | 0 | } if (id == 5) { //alt allele |
176 | 0 | int lastbyte = 0, c = line[i]; |
177 | 0 | insals[lastbyte] = 0; |
178 | 0 | line[i] = 0; |
179 | 0 | s = line + b; |
180 | 0 | do { |
181 | 0 | t = strchr(s, ','); |
182 | 0 | if (alcnt >> 3 != lastbyte) { //initialize insals |
183 | 0 | lastbyte = alcnt >> 3; |
184 | 0 | insals[lastbyte] = 0; |
185 | 0 | } |
186 | 0 | ++alcnt; |
187 | 0 | if (t) { |
188 | 0 | *t = 0; |
189 | 0 | } |
190 | 0 | if (s[0] == '<') { |
191 | 0 | if (!strcmp("<INS>", s)) { //note inserts |
192 | 0 | insals[lastbyte] |= 1 << ((alcnt - 1) & 7); |
193 | 0 | haveins = 1; |
194 | 0 | } else if (!strcmp("<*>", s) || |
195 | 0 | !strcmp("<NON_REF>", s)) { //note gvcf |
196 | 0 | getlen = 1; |
197 | 0 | } |
198 | 0 | } |
199 | 0 | if (t) { |
200 | 0 | *t = ','; |
201 | 0 | s = t + 1; |
202 | 0 | } |
203 | 0 | } while (t && alcnt < 65536); //max allcnt is 65535 |
204 | 0 | line[i] = c; |
205 | 0 | } else if (id == 8) { //INFO, look for "END=" / "SVLEN" |
206 | 0 | int c = line[i], d = 1; |
207 | 0 | line[i] = 0; |
208 | 0 | s = strstr(line + b, "END="); |
209 | 0 | if (s == line + b) s += 4; |
210 | 0 | else if (s) { |
211 | 0 | s = strstr(line + b, ";END="); |
212 | 0 | if (s) s += 5; |
213 | 0 | } |
214 | 0 | if (s && *s != '.') { |
215 | 0 | long long end = strtoll(s, &s, 0); |
216 | 0 | if (end <= intv->beg) { |
217 | 0 | static int reported = 0; |
218 | 0 | if (!reported) { |
219 | 0 | int l = intv->ss ? (int) (intv->se - intv->ss) : 0; |
220 | 0 | hts_log_warning("VCF INFO/END=%lld is smaller than POS at %.*s:%"PRIhts_pos"\n" |
221 | 0 | "This tag will be ignored. " |
222 | 0 | "Note: only one invalid END tag will be reported.", |
223 | 0 | end, l >= 0 ? l : 0, |
224 | 0 | intv->ss ? intv->ss : "", |
225 | 0 | intv->beg); |
226 | 0 | reported = 1; |
227 | 0 | } |
228 | 0 | } else { |
229 | 0 | intv->end = end; |
230 | 0 | } |
231 | 0 | } |
232 | 0 | s = strstr(line + b, "SVLEN="); |
233 | 0 | if (s == line + b) s += 6; //at start of info |
234 | 0 | else if (s) { //not at the start |
235 | 0 | s = strstr(line + b, ";SVLEN="); |
236 | 0 | if (s) s += 7; |
237 | 0 | } |
238 | 0 | while (s && d < alcnt) { |
239 | 0 | t = strchr(s, ','); |
240 | 0 | if ((haveins) && (insals[d >> 3] & (1 << (d & 7)))) { |
241 | 0 | tmp = 1; //<INS> |
242 | 0 | } else { |
243 | 0 | tmp = atoll(s); |
244 | 0 | tmp = tmp < 0 ? llabs(tmp) : tmp; |
245 | 0 | } |
246 | 0 | svlen = svlen < tmp ? tmp : svlen; |
247 | 0 | s = t ? t + 1 : NULL; |
248 | 0 | ++d; |
249 | 0 | } |
250 | 0 | line[i] = c; |
251 | 0 | } else if (getlen && id == 9 ) { //FORMAT |
252 | 0 | int c = line[i], pos = -1; |
253 | 0 | line[i] = 0; |
254 | 0 | s = line + b; |
255 | 0 | while (s) { |
256 | 0 | ++pos; |
257 | 0 | if (!(t = strchr(s, ':'))) { //no further fields |
258 | 0 | if (!strcmp(s, "LEN")) { |
259 | 0 | lenpos = pos; |
260 | 0 | } |
261 | 0 | break; //not present at all! |
262 | 0 | } else { |
263 | 0 | *t = '\0'; |
264 | 0 | if (!strcmp(s, "LEN")) { |
265 | 0 | lenpos = pos; |
266 | 0 | *t = ':'; |
267 | 0 | break; |
268 | 0 | } |
269 | 0 | *t = ':'; |
270 | 0 | s = t + 1; //check next one |
271 | 0 | } |
272 | 0 | } |
273 | 0 | line[i] = c; |
274 | 0 | if (lenpos == -1) { //not present |
275 | 0 | break; |
276 | 0 | } |
277 | 0 | } else if (id > 9 && getlen && lenpos != -1) { |
278 | | //get LEN from sample |
279 | 0 | int c = line[i], d = 0; |
280 | 0 | line[i] = 0; tmp = 0; |
281 | 0 | s = line + b; |
282 | 0 | for (d = 0; d <= lenpos; ++d) { |
283 | 0 | if (d == lenpos) { |
284 | 0 | tmp = atoll(s); |
285 | 0 | break; |
286 | 0 | } |
287 | 0 | if ((t = strchr(s, ':'))) { |
288 | 0 | s = t + 1; |
289 | 0 | } else { |
290 | 0 | break; //not in sycn with fmt def! |
291 | 0 | } |
292 | 0 | } |
293 | 0 | fmtlen = fmtlen < tmp ? tmp : fmtlen; |
294 | 0 | line[i] = c; |
295 | 0 | } |
296 | 0 | } |
297 | 0 | } |
298 | 0 | b = i + 1; //beginning if current field |
299 | 0 | ++id; |
300 | 0 | } |
301 | 0 | } |
302 | 0 | if ((conf->preset&0xffff) == TBX_VCF) { |
303 | 0 | tmp = reflen < svlen ? |
304 | 0 | svlen < fmtlen ? fmtlen : svlen : |
305 | 0 | reflen < fmtlen ? fmtlen : reflen ; |
306 | 0 | tmp += intv->beg; |
307 | 0 | intv->end = intv->end < tmp ? tmp : intv->end; |
308 | | |
309 | | //NOTE: 'end' calculation be in sync with end/rlen in vcf.c:get_rlen |
310 | 0 | } |
311 | 0 | if (intv->ss == 0 || intv->se == 0 || intv->beg < 0 || intv->end < 0) return -1; |
312 | 0 | return 0; |
313 | 0 | } |
314 | | |
315 | | static inline int get_intv(tbx_t *tbx, kstring_t *str, tbx_intv_t *intv, int is_add) |
316 | 0 | { |
317 | 0 | if (tbx_parse1(&tbx->conf, str->l, str->s, intv) == 0) { |
318 | 0 | int c = *intv->se; |
319 | 0 | *intv->se = '\0'; |
320 | 0 | if ((tbx->conf.preset&0xffff) == TBX_GAF){ |
321 | 0 | intv->tid = 0; |
322 | 0 | } else { |
323 | 0 | intv->tid = get_tid(tbx, intv->ss, is_add); |
324 | 0 | } |
325 | 0 | *intv->se = c; |
326 | 0 | if (intv->tid < 0) return -2; // get_tid out of memory |
327 | 0 | return (intv->beg >= 0 && intv->end >= 0)? 0 : -1; |
328 | 0 | } else { |
329 | 0 | char *type = NULL; |
330 | 0 | switch (tbx->conf.preset&0xffff) |
331 | 0 | { |
332 | 0 | case TBX_SAM: type = "TBX_SAM"; break; |
333 | 0 | case TBX_VCF: type = "TBX_VCF"; break; |
334 | 0 | case TBX_GAF: type = "TBX_GAF"; break; |
335 | 0 | case TBX_UCSC: type = "TBX_UCSC"; break; |
336 | 0 | default: type = "TBX_GENERIC"; break; |
337 | 0 | } |
338 | 0 | if (hts_is_utf16_text(str)) |
339 | 0 | hts_log_error("Failed to parse %s: offending line appears to be encoded as UTF-16", type); |
340 | 0 | else |
341 | 0 | hts_log_error("Failed to parse %s: was wrong -p [type] used?\nThe offending line was: \"%s\"", |
342 | 0 | type, str->s); |
343 | 0 | return -1; |
344 | 0 | } |
345 | 0 | } |
346 | | |
347 | | /* |
348 | | * Called by tabix iterator to read the next record. |
349 | | * Returns >= 0 on success |
350 | | * -1 on EOF |
351 | | * <= -2 on error |
352 | | */ |
353 | | int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, hts_pos_t *beg, hts_pos_t *end) |
354 | 0 | { |
355 | 0 | tbx_t *tbx = (tbx_t *) tbxv; |
356 | 0 | kstring_t *s = (kstring_t *) sv; |
357 | 0 | int ret; |
358 | 0 | if ((ret = bgzf_getline(fp, '\n', s)) >= 0) { |
359 | 0 | tbx_intv_t intv; |
360 | 0 | if (get_intv(tbx, s, &intv, 0) < 0) |
361 | 0 | return -2; |
362 | 0 | *tid = intv.tid; *beg = intv.beg; *end = intv.end; |
363 | 0 | } |
364 | 0 | return ret; |
365 | 0 | } |
366 | | |
367 | | static int tbx_set_meta(tbx_t *tbx) |
368 | 0 | { |
369 | 0 | int i, l = 0, l_nm; |
370 | 0 | uint32_t x[7]; |
371 | 0 | char **name; |
372 | 0 | uint8_t *meta; |
373 | 0 | khint_t k; |
374 | 0 | khash_t(s2i) *d = (khash_t(s2i)*)tbx->dict; |
375 | |
|
376 | 0 | memcpy(x, &tbx->conf, 24); |
377 | 0 | name = (char**)malloc(sizeof(char*) * kh_size(d)); |
378 | 0 | if (!name) return -1; |
379 | 0 | for (k = kh_begin(d), l = 0; k != kh_end(d); ++k) { |
380 | 0 | if (!kh_exist(d, k)) continue; |
381 | 0 | name[kh_val(d, k)] = (char*)kh_key(d, k); |
382 | 0 | l += strlen(kh_key(d, k)) + 1; // +1 to include '\0' |
383 | 0 | } |
384 | 0 | l_nm = x[6] = l; |
385 | 0 | meta = (uint8_t*)malloc(l_nm + 28); |
386 | 0 | if (!meta) { free(name); return -1; } |
387 | 0 | if (ed_is_big()) |
388 | 0 | for (i = 0; i < 7; ++i) |
389 | 0 | x[i] = ed_swap_4(x[i]); |
390 | 0 | memcpy(meta, x, 28); |
391 | 0 | for (l = 28, i = 0; i < (int)kh_size(d); ++i) { |
392 | 0 | int x = strlen(name[i]) + 1; |
393 | 0 | memcpy(meta + l, name[i], x); |
394 | 0 | l += x; |
395 | 0 | } |
396 | 0 | free(name); |
397 | 0 | hts_idx_set_meta(tbx->idx, l, meta, 0); |
398 | 0 | return 0; |
399 | 0 | } |
400 | | |
401 | | // Minimal effort parser to extract reference length out of VCF header line |
402 | | // This is used only used to adjust the number of levels if necessary, |
403 | | // so not a major problem if it doesn't always work. |
404 | | static void adjust_max_ref_len_vcf(const char *str, int64_t *max_ref_len) |
405 | 0 | { |
406 | 0 | const char *ptr; |
407 | 0 | int64_t len; |
408 | 0 | if (strncmp(str, "##contig", 8) != 0) return; |
409 | 0 | ptr = strstr(str + 8, "length"); |
410 | 0 | if (!ptr) return; |
411 | 0 | for (ptr += 6; *ptr == ' ' || *ptr == '='; ptr++) {} |
412 | 0 | len = strtoll(ptr, NULL, 10); |
413 | 0 | if (*max_ref_len < len) *max_ref_len = len; |
414 | 0 | } |
415 | | |
416 | | // Same for sam files |
417 | | static void adjust_max_ref_len_sam(const char *str, int64_t *max_ref_len) |
418 | 0 | { |
419 | 0 | const char *ptr; |
420 | 0 | int64_t len; |
421 | 0 | if (strncmp(str, "@SQ", 3) != 0) return; |
422 | 0 | ptr = strstr(str + 3, "\tLN:"); |
423 | 0 | if (!ptr) return; |
424 | 0 | ptr += 4; |
425 | 0 | len = strtoll(ptr, NULL, 10); |
426 | 0 | if (*max_ref_len < len) *max_ref_len = len; |
427 | 0 | } |
428 | | |
429 | | // Adjusts number of levels if not big enough. This can happen for |
430 | | // files with very large contigs. |
431 | | static int adjust_n_lvls(int min_shift, int n_lvls, int64_t max_len) |
432 | 0 | { |
433 | 0 | int64_t s = hts_bin_maxpos(min_shift, n_lvls); |
434 | 0 | max_len += 256; |
435 | 0 | for (; max_len > s; ++n_lvls, s <<= 3) {} |
436 | 0 | return n_lvls; |
437 | 0 | } |
438 | | |
439 | | tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf) |
440 | 0 | { |
441 | 0 | tbx_t *tbx; |
442 | 0 | kstring_t str; |
443 | 0 | int ret, first = 0, n_lvls, fmt; |
444 | 0 | int64_t lineno = 0; |
445 | 0 | uint64_t last_off = 0; |
446 | 0 | tbx_intv_t intv; |
447 | 0 | int64_t max_ref_len = 0; |
448 | |
|
449 | 0 | str.s = 0; str.l = str.m = 0; |
450 | 0 | tbx = (tbx_t*)calloc(1, sizeof(tbx_t)); |
451 | 0 | if (!tbx) return NULL; |
452 | 0 | tbx->conf = *conf; |
453 | 0 | if (min_shift > 0) n_lvls = (TBX_MAX_SHIFT - min_shift + 2) / 3, fmt = HTS_FMT_CSI; |
454 | 0 | else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_TBI; |
455 | 0 | while ((ret = bgzf_getline(fp, '\n', &str)) >= 0) { |
456 | 0 | ++lineno; |
457 | 0 | if (str.s[0] == tbx->conf.meta_char && fmt == HTS_FMT_CSI) { |
458 | 0 | switch (tbx->conf.preset) { |
459 | 0 | case TBX_SAM: |
460 | 0 | adjust_max_ref_len_sam(str.s, &max_ref_len); break; |
461 | 0 | case TBX_VCF: |
462 | 0 | adjust_max_ref_len_vcf(str.s, &max_ref_len); break; |
463 | 0 | default: |
464 | 0 | break; |
465 | 0 | } |
466 | 0 | } |
467 | 0 | if (lineno <= tbx->conf.line_skip || str.s[0] == tbx->conf.meta_char) { |
468 | 0 | last_off = bgzf_tell(fp); |
469 | 0 | continue; |
470 | 0 | } |
471 | 0 | if (first == 0) { |
472 | 0 | if (fmt == HTS_FMT_CSI) { |
473 | 0 | if (!max_ref_len) |
474 | 0 | max_ref_len = (int64_t)100*1024*1024*1024; // 100G default |
475 | 0 | n_lvls = adjust_n_lvls(min_shift, n_lvls, max_ref_len); |
476 | 0 | } |
477 | 0 | tbx->idx = hts_idx_init(0, fmt, last_off, min_shift, n_lvls); |
478 | 0 | if (!tbx->idx) goto fail; |
479 | 0 | first = 1; |
480 | 0 | } |
481 | 0 | ret = get_intv(tbx, &str, &intv, 1); |
482 | 0 | if (ret < 0) goto fail; // Out of memory or unparsable lines |
483 | 0 | if (hts_idx_push(tbx->idx, intv.tid, intv.beg, intv.end, |
484 | 0 | bgzf_tell(fp), 1) < 0) { |
485 | 0 | goto fail; |
486 | 0 | } |
487 | 0 | } |
488 | 0 | if (ret < -1) goto fail; |
489 | 0 | if ( !tbx->idx ) tbx->idx = hts_idx_init(0, fmt, last_off, min_shift, n_lvls); // empty file |
490 | 0 | if (!tbx->idx) goto fail; |
491 | 0 | if ( !tbx->dict ) tbx->dict = kh_init(s2i); |
492 | 0 | if (!tbx->dict) goto fail; |
493 | 0 | if (hts_idx_finish(tbx->idx, bgzf_tell(fp)) != 0) goto fail; |
494 | 0 | if (tbx_set_meta(tbx) != 0) goto fail; |
495 | 0 | free(str.s); |
496 | 0 | return tbx; |
497 | | |
498 | 0 | fail: |
499 | 0 | free(str.s); |
500 | 0 | tbx_destroy(tbx); |
501 | 0 | return NULL; |
502 | 0 | } |
503 | | |
504 | | void tbx_destroy(tbx_t *tbx) |
505 | 0 | { |
506 | 0 | khash_t(s2i) *d = (khash_t(s2i)*)tbx->dict; |
507 | 0 | if (d != NULL) |
508 | 0 | { |
509 | 0 | khint_t k; |
510 | 0 | for (k = kh_begin(d); k != kh_end(d); ++k) |
511 | 0 | if (kh_exist(d, k)) free((char*)kh_key(d, k)); |
512 | 0 | } |
513 | 0 | hts_idx_destroy(tbx->idx); |
514 | 0 | kh_destroy(s2i, d); |
515 | 0 | free(tbx); |
516 | 0 | } |
517 | | |
518 | | int tbx_index_build3(const char *fn, const char *fnidx, int min_shift, int n_threads, const tbx_conf_t *conf) |
519 | 0 | { |
520 | 0 | tbx_t *tbx; |
521 | 0 | BGZF *fp; |
522 | 0 | int ret; |
523 | 0 | if ((fp = bgzf_open(fn, "r")) == 0) return -1; |
524 | 0 | if ( n_threads ) bgzf_mt(fp, n_threads, 256); |
525 | 0 | if ( bgzf_compression(fp) != bgzf ) { bgzf_close(fp); return -2; } |
526 | 0 | tbx = tbx_index(fp, min_shift, conf); |
527 | 0 | bgzf_close(fp); |
528 | 0 | if ( !tbx ) return -1; |
529 | 0 | ret = hts_idx_save_as(tbx->idx, fn, fnidx, min_shift > 0? HTS_FMT_CSI : HTS_FMT_TBI); |
530 | 0 | tbx_destroy(tbx); |
531 | 0 | return ret; |
532 | 0 | } |
533 | | |
534 | | int tbx_index_build2(const char *fn, const char *fnidx, int min_shift, const tbx_conf_t *conf) |
535 | 0 | { |
536 | 0 | return tbx_index_build3(fn, fnidx, min_shift, 0, conf); |
537 | 0 | } |
538 | | |
539 | | int tbx_index_build(const char *fn, int min_shift, const tbx_conf_t *conf) |
540 | 0 | { |
541 | 0 | return tbx_index_build3(fn, NULL, min_shift, 0, conf); |
542 | 0 | } |
543 | | |
544 | | static tbx_t *index_load(const char *fn, const char *fnidx, int flags) |
545 | 2.75k | { |
546 | 2.75k | tbx_t *tbx; |
547 | 2.75k | uint8_t *meta; |
548 | 2.75k | char *nm, *p; |
549 | 2.75k | uint32_t l_meta, l_nm; |
550 | 2.75k | tbx = (tbx_t*)calloc(1, sizeof(tbx_t)); |
551 | 2.75k | if (!tbx) |
552 | 0 | return NULL; |
553 | 2.75k | tbx->idx = hts_idx_load3(fn, fnidx, HTS_FMT_TBI, flags); |
554 | 2.75k | if ( !tbx->idx ) |
555 | 2.75k | { |
556 | 2.75k | free(tbx); |
557 | 2.75k | return NULL; |
558 | 2.75k | } |
559 | 0 | meta = hts_idx_get_meta(tbx->idx, &l_meta); |
560 | 0 | if ( !meta || l_meta < 28) goto invalid; |
561 | | |
562 | 0 | tbx->conf.preset = le_to_i32(&meta[0]); |
563 | 0 | tbx->conf.sc = le_to_i32(&meta[4]); |
564 | 0 | tbx->conf.bc = le_to_i32(&meta[8]); |
565 | 0 | tbx->conf.ec = le_to_i32(&meta[12]); |
566 | 0 | tbx->conf.meta_char = le_to_i32(&meta[16]); |
567 | 0 | tbx->conf.line_skip = le_to_i32(&meta[20]); |
568 | 0 | l_nm = le_to_u32(&meta[24]); |
569 | 0 | if (l_nm > l_meta - 28) goto invalid; |
570 | | |
571 | 0 | p = nm = (char*)meta + 28; |
572 | | // This assumes meta is NUL-terminated, so we can merrily strlen away. |
573 | | // hts_idx_load_local() assures this for us by adding a NUL on the end |
574 | | // of whatever it reads. |
575 | 0 | for (; p - nm < l_nm; p += strlen(p) + 1) { |
576 | 0 | if (get_tid(tbx, p, 1) < 0) { |
577 | 0 | hts_log_error("%s", strerror(errno)); |
578 | 0 | goto fail; |
579 | 0 | } |
580 | 0 | } |
581 | 0 | return tbx; |
582 | | |
583 | 0 | invalid: |
584 | 0 | hts_log_error("Invalid index header for %s", fnidx ? fnidx : fn); |
585 | |
|
586 | 0 | fail: |
587 | 0 | tbx_destroy(tbx); |
588 | 0 | return NULL; |
589 | 0 | } |
590 | | |
591 | | tbx_t *tbx_index_load3(const char *fn, const char *fnidx, int flags) |
592 | 2.75k | { |
593 | 2.75k | return index_load(fn, fnidx, flags); |
594 | 2.75k | } |
595 | | |
596 | | tbx_t *tbx_index_load2(const char *fn, const char *fnidx) |
597 | 0 | { |
598 | 0 | return index_load(fn, fnidx, 1); |
599 | 0 | } |
600 | | |
601 | | tbx_t *tbx_index_load(const char *fn) |
602 | 0 | { |
603 | 0 | return index_load(fn, NULL, 1); |
604 | 0 | } |
605 | | |
606 | | const char **tbx_seqnames(tbx_t *tbx, int *n) |
607 | 0 | { |
608 | 0 | khash_t(s2i) *d = (khash_t(s2i)*)tbx->dict; |
609 | 0 | if (d == NULL) |
610 | 0 | { |
611 | 0 | *n = 0; |
612 | 0 | return calloc(1, sizeof(char *)); |
613 | 0 | } |
614 | 0 | int tid, m = kh_size(d); |
615 | 0 | const char **names = (const char**) calloc(m,sizeof(const char*)); |
616 | 0 | khint_t k; |
617 | 0 | if (!names) { |
618 | 0 | *n = 0; |
619 | 0 | return NULL; |
620 | 0 | } |
621 | 0 | for (k=kh_begin(d); k<kh_end(d); k++) |
622 | 0 | { |
623 | 0 | if ( !kh_exist(d,k) ) continue; |
624 | 0 | tid = kh_val(d,k); |
625 | 0 | assert( tid<m ); |
626 | 0 | names[tid] = kh_key(d,k); |
627 | 0 | } |
628 | | // sanity check: there should be no gaps |
629 | 0 | for (tid=0; tid<m; tid++) |
630 | 0 | assert(names[tid]); |
631 | 0 | *n = m; |
632 | 0 | return names; |
633 | 0 | } |
634 | | |