Coverage Report

Created: 2025-10-28 07:42

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/tbx.c
Line
Count
Source
1
/*  tbx.c -- tabix API functions.
2
3
    Copyright (C) 2009, 2010, 2012-2015, 2017-2020, 2022-2023, 2025 Genome Research Ltd.
4
    Copyright (C) 2010-2012 Broad Institute.
5
6
    Author: Heng Li <lh3@sanger.ac.uk>
7
8
Permission is hereby granted, free of charge, to any person obtaining a copy
9
of this software and associated documentation files (the "Software"), to deal
10
in the Software without restriction, including without limitation the rights
11
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
copies of the Software, and to permit persons to whom the Software is
13
furnished to do so, subject to the following conditions:
14
15
The above copyright notice and this permission notice shall be included in
16
all copies or substantial portions of the Software.
17
18
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24
DEALINGS IN THE SOFTWARE.  */
25
26
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
27
#include <config.h>
28
29
#include <stdlib.h>
30
#include <string.h>
31
#include <stdio.h>
32
#include <assert.h>
33
#include <errno.h>
34
#include "htslib/tbx.h"
35
#include "htslib/bgzf.h"
36
#include "htslib/hts_endian.h"
37
#include "hts_internal.h"
38
39
#include "htslib/khash.h"
40
KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
41
42
HTSLIB_EXPORT
43
const tbx_conf_t tbx_conf_gff = { 0, 1, 4, 5, '#', 0 };
44
45
HTSLIB_EXPORT
46
const tbx_conf_t tbx_conf_bed = { TBX_UCSC, 1, 2, 3, '#', 0 };
47
48
HTSLIB_EXPORT
49
const tbx_conf_t tbx_conf_psltbl = { TBX_UCSC, 15, 17, 18, '#', 0 };
50
51
HTSLIB_EXPORT
52
const tbx_conf_t tbx_conf_sam = { TBX_SAM, 3, 4, 0, '@', 0 };
53
54
HTSLIB_EXPORT
55
const tbx_conf_t tbx_conf_vcf = { TBX_VCF, 1, 2, 0, '#', 0 };
56
const tbx_conf_t tbx_conf_gaf = { TBX_GAF, 1, 6, 0, '#', 0 };
57
58
/* Result of parsing one text record with tbx_parse1(). */
typedef struct {
    int64_t beg;    // interval start (tbx_parse1 stores it 0-based)
    int64_t end;    // interval end
    char *ss;       // first character of the sequence-name field in the line
    char *se;       // one past the last character of the sequence-name field
    int tid;        // numeric sequence id, filled in by get_intv()
} tbx_intv_t;
63
64
/*
 * Map a sequence name to its numeric id using the name dictionary in tbx.
 *
 * tbx     index handle; its dictionary is created on first use
 * ss      NUL-terminated sequence name
 * is_add  non-zero to insert unknown names (the name is duplicated and
 *         given the next free id), zero to look up only
 *
 * Returns the id, 0 unconditionally for the GAF preset, or -1 when the name
 * is unknown (lookup only) or memory could not be allocated.
 */
static inline int get_tid(tbx_t *tbx, const char *ss, int is_add)
{
    khash_t(s2i) *dict;
    khint_t slot;

    if ((tbx->conf.preset & 0xffff) == TBX_GAF)
        return 0;

    if (tbx->dict == 0)
        tbx->dict = kh_init(s2i);
    if (!tbx->dict)
        return -1; // Out of memory
    dict = (khash_t(s2i) *) tbx->dict;

    if (!is_add) {
        slot = kh_get(s2i, dict, ss);
    } else {
        int status;

        slot = kh_put(s2i, dict, ss, &status);
        if (status < 0)
            return -1; // Out of memory

        if (status != 0) {
            // New entry: the table must own its key, so replace the caller's
            // pointer with a private copy before anything else can see it.
            char *copy = strdup(ss);
            if (copy == NULL) {
                kh_del(s2i, dict, slot);
                return -1; // Out of memory
            }
            kh_key(dict, slot) = copy;
            kh_val(dict, slot) = kh_size(dict) - 1;
        }
    }

    if (slot == kh_end(dict))
        return -1;
    return kh_val(dict, slot);
}
90
91
/*
 * Look up the numeric id of sequence name ss without adding it.
 * Returns whatever get_tid() reports: the id, or -1 if the name is not in
 * the dictionary (or the dictionary could not be allocated).
 */
int tbx_name2id(tbx_t *tbx, const char *ss)
{
    const int lookup_only = 0;

    return get_tid(tbx, ss, lookup_only);
}
95
96
/*
 * Parse one tab-separated text record into an interval.
 *
 * conf  column layout and preset to apply
 * len   length of line; the scan also examines line[len], which is expected
 *       to be the terminating NUL so that the last field is processed
 * line  record text.  Some fields are temporarily NUL-terminated in place
 *       while they are inspected; every modified byte is restored.
 * intv  receives ss/se (span of the sequence-name field) and beg/end.
 *       tid is not touched here.
 *
 * beg is stored 0-based.  How end is obtained depends on the preset:
 *   generic  value of the end column
 *   SAM      beg plus the reference span of the CIGAR (M/D/N operations)
 *   VCF      the largest of beg + REF length, INFO/END, beg + SVLEN (for
 *            ALT alleles selected by svlen_on_ref_for_vcf_alt()) and
 *            beg + FORMAT/LEN (only when a <*> or <NON_REF> ALT is present)
 *   GAF      beg/end are the smallest/largest node id in the begin column
 *
 * Returns 0 on success, -1 if a coordinate is not an integer or the
 * sequence name / coordinates could not be determined.
 */
int tbx_parse1(const tbx_conf_t *conf, size_t len, char *line, tbx_intv_t *intv)
{
    size_t i, b = 0;
    int id = 1, getlen = 0, alcnt = 0, use_svlen = 0, lenpos = -1;
    char *s, *t;
    uint8_t svlenals[8192];     // one bit per allele: "consult SVLEN"
    int64_t reflen = 0, svlen = 0, fmtlen = 0, tmp = 0;

    intv->ss = intv->se = 0; intv->beg = intv->end = -1;
    for (i = 0; i <= len; ++i) {
        if (line[i] == '\t' || line[i] == 0) {
            // line[b..i) is field number id (1-based)
            if (id == conf->sc) {
                intv->ss = line + b; intv->se = line + i;
            } else if (id == conf->bc) {
                // here ->beg is 0-based.
                if ((conf->preset&0xffff) == TBX_GAF){
                    // if gaf find the smallest and largest node id
                    int64_t nodeid = -1;
                    // The first character of the field is skipped, and one
                    // character is skipped after every parsed number.
                    for (s = line + b + 1; s < line + i;) {
                        nodeid = strtoll(s, &t, 0);
                        if(intv->beg == -1){
                            intv->beg = intv->end = nodeid;
                        } else {
                            if(nodeid < intv->beg){
                                intv->beg = nodeid;
                            }

                            if(nodeid > intv->end){
                                intv->end = nodeid;
                            }
                        }
                        s = t + 1;
                    }
                } else {
                    intv->beg = strtoll(line + b, &s, 0);

                    if (conf->bc <= conf->ec) // don't overwrite an already set end point
                        intv->end = intv->beg;

                    if ( s==line+b ) return -1; // expected int

                    if (!(conf->preset&TBX_UCSC))
                        --intv->beg;
                    else if (conf->bc <= conf->ec)
                        ++intv->end;

                    if (intv->beg < 0) {
                        hts_log_warning("Coordinate <= 0 detected. "
                                        "Did you forget to use the -0 option?");
                        intv->beg = 0;
                    }
                    if (intv->end < 1) intv->end = 1;
                }
            } else {
                if ((conf->preset&0xffff) == TBX_GENERIC) {
                    if (id == conf->ec)
                    {
                        intv->end = strtoll(line + b, &s, 0);
                        if ( s==line+b ) return -1; // expected int
                    }
                } else if ((conf->preset&0xffff) == TBX_SAM) {
                    if (id == 6) { // CIGAR
                        // Accumulate in 64 bits: reference spans of 2^31 or
                        // more must not be truncated.
                        int64_t span = 0;
                        for (s = line + b; s < line + i;) {
                            long long x = strtoll(s, &t, 10);
                            char op = toupper_c(*t);
                            if (op == 'M' || op == 'D' || op == 'N') span += x;
                            s = t + 1;
                        }
                        if (span == 0) span = 1;
                        intv->end = intv->beg + span;
                    }
                } else if ((conf->preset&0xffff) == TBX_VCF) {
                    if (id == 4) { //ref allele
                        if (b < i) intv->end = intv->beg + (i - b);
                        ++alcnt;
                        reflen = i - b;
                    } else if (id == 5) {    //alt allele
                        int lastbyte = 0, c = line[i];
                        svlenals[lastbyte] = 0;
                        line[i] = 0;
                        s = line + b;
                        do {
                            t = strchr(s, ',');
                            if (alcnt >> 3 != lastbyte) {   //initialize next flag byte
                                lastbyte = alcnt >> 3;
                                svlenals[lastbyte] = 0;
                            }
                            ++alcnt;
                            if (t) {
                                *t = 0;
                            }
                            if (svlen_on_ref_for_vcf_alt(s, -1)) {
                                // Need to check SVLEN for this ALT
                                svlenals[lastbyte] |= 1 << ((alcnt - 1) & 7);
                                use_svlen = 1;
                            } else if (!strcmp("<*>", s) ||
                                       !strcmp("<NON_REF>", s)) {  //note gvcf
                                getlen = 1;
                            }
                            if (t) {
                                *t = ',';
                                s = t + 1;
                            }
                        } while (t && alcnt < 65536);   //max alcnt is 65535
                        line[i] = c;
                    } else if (id == 8) { //INFO, look for "END=" / "SVLEN"
                        int c = line[i], d = 1;
                        line[i] = 0;
                        s = strstr(line + b, "END=");
                        if (s == line + b) s += 4;
                        else if (s) {
                            // Only accept END as a whole key, not as the
                            // tail of another key such as "CIEND=".
                            s = strstr(line + b, ";END=");
                            if (s) s += 5;
                        }
                        if (s && *s != '.') {
                            long long end = strtoll(s, &s, 0);
                            if (end <= intv->beg) {
                                static int reported = 0;
                                if (!reported) {
                                    int l = intv->ss ? (int) (intv->se - intv->ss) : 0;
                                    hts_log_warning("VCF INFO/END=%lld is smaller than POS at %.*s:%"PRIhts_pos"\n"
                                                    "This tag will be ignored. "
                                                    "Note: only one invalid END tag will be reported.",
                                                    end, l >= 0 ? l : 0,
                                                    intv->ss ? intv->ss : "",
                                                    intv->beg);
                                    reported = 1;
                                }
                            } else {
                                intv->end = end;
                            }
                        }
                        s = strstr(line + b, "SVLEN=");
                        if (s == line + b) s += 6;  //at start of info
                        else if (s) {               //not at the start
                            s = strstr(line + b, ";SVLEN=");
                            if (s) s += 7;
                        }
                        // One SVLEN value per ALT allele; d is the allele
                        // number (the reference allele is 0).
                        while (s && d < alcnt) {
                            t = strchr(s, ',');
                            if ((use_svlen) && (svlenals[d >> 3] & (1 << (d & 7)))) {
                                // <DEL> symbolic allele
                                // strtoll clamps on overflow, unlike atoll
                                // whose behaviour is then undefined.
                                tmp = strtoll(s, NULL, 10);
                                if (tmp == INT64_MIN) tmp = INT64_MAX; // |min| not representable
                                else if (tmp < 0) tmp = -tmp;
                            } else {
                                tmp = 1;
                            }
                            svlen = svlen < tmp ? tmp : svlen;
                            s = t ? t + 1 : NULL;
                            ++d;
                        }
                        line[i] = c;
                    } else if (getlen && id == 9 ) {    //FORMAT
                        int c = line[i], pos = -1;
                        line[i] = 0;
                        s = line + b;
                        while (s) {
                            ++pos;
                            if (!(t = strchr(s, ':'))) {    //no further fields
                                if (!strcmp(s, "LEN")) {
                                    lenpos = pos;
                                }
                                break;  //not present at all!
                            } else {
                                *t = '\0';
                                if (!strcmp(s, "LEN")) {
                                    lenpos = pos;
                                    *t = ':';
                                    break;
                                }
                                *t = ':';
                                s = t + 1;  //check next one
                            }
                        }
                        line[i] = c;
                        if (lenpos == -1) { //not present
                            break;  // nothing more to learn from the samples
                        }
                    } else if (id > 9 && getlen && lenpos != -1) {
                        //get LEN from sample
                        int c = line[i], d = 0;
                        line[i] = 0; tmp = 0;
                        s = line + b;
                        for (d = 0; d <= lenpos; ++d) {
                            if (d == lenpos) {
                                tmp = strtoll(s, NULL, 10);
                                break;
                            }
                            if ((t = strchr(s, ':'))) {
                                s = t + 1;
                            } else {
                                break;    //not in sync with fmt def!
                            }
                        }
                        fmtlen = fmtlen < tmp ? tmp : fmtlen;
                        line[i] = c;
                    }
                }
            }
            b = i + 1;  //beginning of the next field
            ++id;
        }
    }
    if ((conf->preset&0xffff) == TBX_VCF) {
        // Longest of the REF, SVLEN and FORMAT/LEN derived lengths
        tmp = reflen;
        if (tmp < svlen) tmp = svlen;
        if (tmp < fmtlen) tmp = fmtlen;
        // tmp is never negative here; saturate rather than overflow
        if (intv->beg > 0 && tmp > INT64_MAX - intv->beg) tmp = INT64_MAX;
        else tmp += intv->beg;
        intv->end = intv->end < tmp ? tmp : intv->end;

        //NOTE: 'end' calculation be in sync with end/rlen in vcf.c:get_rlen
    }
    if (intv->ss == 0 || intv->se == 0 || intv->beg < 0 || intv->end < 0) return -1;
    return 0;
}
314
315
/*
 * Parse a line into intv and resolve its sequence name to an id.
 *
 * tbx     index handle supplying the configuration and name dictionary
 * str     the line; str->s is modified temporarily and restored
 * intv    receives the parsed interval and tid
 * is_add  passed on to get_tid(): non-zero adds unknown names
 *
 * Returns 0 on success, -1 if the line could not be parsed (an error is
 * logged) and -2 if get_tid() failed.
 */
static inline int get_intv(tbx_t *tbx, kstring_t *str, tbx_intv_t *intv, int is_add)
{
    if (tbx_parse1(&tbx->conf, str->l, str->s, intv) == 0) {
        // NUL-terminate the sequence name in place for the lookup
        int c = *intv->se;
        *intv->se = '\0';
        if ((tbx->conf.preset&0xffff) == TBX_GAF){
            intv->tid = 0;
        } else {
            intv->tid = get_tid(tbx, intv->ss, is_add);
        }
        *intv->se = c;
        if (intv->tid < 0) return -2;  // get_tid out of memory
        return (intv->beg >= 0 && intv->end >= 0)? 0 : -1;
    } else {
        const char *type;
        switch (tbx->conf.preset&0xffff)
        {
            case TBX_SAM: type = "TBX_SAM"; break;
            case TBX_VCF: type = "TBX_VCF"; break;
            case TBX_GAF: type = "TBX_GAF"; break;
            default:
                // TBX_UCSC is a flag outside the low 16 bits, so it can
                // never match a case label of the masked value; test the
                // flag itself.
                type = (tbx->conf.preset & TBX_UCSC) ? "TBX_UCSC"
                                                     : "TBX_GENERIC";
                break;
        }
        if (hts_is_utf16_text(str))
            hts_log_error("Failed to parse %s: offending line appears to be encoded as UTF-16", type);
        else
            hts_log_error("Failed to parse %s: was wrong -p [type] used?\nThe offending line was: \"%s\"",
                type, str->s);
        return -1;
    }
}
346
347
/*
348
 * Called by tabix iterator to read the next record.
349
 * Returns    >=  0 on success
350
 *               -1 on EOF
351
 *            <= -2 on error
352
 */
353
int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, hts_pos_t *beg, hts_pos_t *end)
354
0
{
355
0
    tbx_t *tbx = (tbx_t *) tbxv;
356
0
    kstring_t *s = (kstring_t *) sv;
357
0
    int ret;
358
359
    // Get a line until either EOF or a non-meta character
360
0
    do {
361
0
        ret = bgzf_getline(fp, '\n', s);
362
0
    } while (ret >= 0 && s->l && *s->s == tbx->conf.meta_char);
363
364
    // Parse line
365
0
    if (ret >= 0)  {
366
0
        tbx_intv_t intv;
367
0
        if (get_intv(tbx, s, &intv, 0) < 0)
368
0
            return -2;
369
0
        *tid = intv.tid; *beg = intv.beg; *end = intv.end;
370
0
    }
371
372
0
    return ret;
373
0
}
374
375
/*
 * Build the index meta block from tbx->conf and the name dictionary and
 * hand it to the index with hts_idx_set_meta().
 *
 * Layout: seven 32-bit little-endian words (the first 24 bytes of the
 * configuration followed by the total length of the names), then every
 * sequence name, NUL-terminated, in id order.
 *
 * Returns 0 on success, -1 if memory could not be allocated.
 */
static int tbx_set_meta(tbx_t *tbx)
{
    int i, l = 0, l_nm;
    uint32_t x[7];
    char **name;
    uint8_t *meta;
    khint_t k;
    khash_t(s2i) *d = (khash_t(s2i)*)tbx->dict;
    size_t n_names = kh_size(d);

    memcpy(x, &tbx->conf, 24);
    // malloc(0) is allowed to return NULL; an empty dictionary must not be
    // mistaken for an allocation failure, so always ask for at least one slot.
    name = (char**)malloc(sizeof(char*) * (n_names ? n_names : 1));
    if (!name) return -1;
    // Order the names by id and total up their lengths
    for (k = kh_begin(d), l = 0; k != kh_end(d); ++k) {
        if (!kh_exist(d, k)) continue;
        name[kh_val(d, k)] = (char*)kh_key(d, k);
        l += strlen(kh_key(d, k)) + 1; // +1 to include '\0'
    }
    l_nm = x[6] = l;
    meta = (uint8_t*)malloc(l_nm + 28);
    if (!meta) { free(name); return -1; }
    // The header words are stored little-endian
    if (ed_is_big())
        for (i = 0; i < 7; ++i)
            x[i] = ed_swap_4(x[i]);
    memcpy(meta, x, 28);
    for (l = 28, i = 0; i < (int)kh_size(d); ++i) {
        int name_len = strlen(name[i]) + 1;
        memcpy(meta + l, name[i], name_len);
        l += name_len;
    }
    free(name);
    // Called with is_copy == 0, so meta is handed over rather than copied
    hts_idx_set_meta(tbx->idx, l, meta, 0);
    return 0;
}
408
409
// Minimal effort parser to extract the reference length from a VCF
// "##contig" header line and raise *max_ref_len to it if it is larger.
// This is only used to adjust the number of levels if necessary,
// so it is not a major problem if it doesn't always work.
static void adjust_max_ref_len_vcf(const char *str, int64_t *max_ref_len)
{
    static const char prefix[] = "##contig";
    static const char key[] = "length";
    const size_t prefix_len = sizeof(prefix) - 1;
    const char *cursor;
    int64_t parsed;

    if (strncmp(str, prefix, prefix_len) != 0)
        return;

    cursor = strstr(str + prefix_len, key);
    if (cursor == NULL)
        return;

    // Step over the key and any mix of spaces and '=' that follows it
    cursor += sizeof(key) - 1;
    while (*cursor == ' ' || *cursor == '=')
        cursor++;

    parsed = strtoll(cursor, NULL, 10);
    if (parsed > *max_ref_len)
        *max_ref_len = parsed;
}
423
424
// Minimal effort parser for SAM "@SQ" header lines: raises *max_ref_len to
// the value of the tab-preceded "LN:" field when that value is larger.
static void adjust_max_ref_len_sam(const char *str, int64_t *max_ref_len)
{
    static const char tag[] = "\tLN:";
    const char *field;
    int64_t parsed;

    if (strncmp(str, "@SQ", 3) != 0)
        return;

    field = strstr(str + 3, tag);
    if (field == NULL)
        return;

    parsed = strtoll(field + (sizeof(tag) - 1), NULL, 10);
    if (parsed > *max_ref_len)
        *max_ref_len = parsed;
}
436
437
// Adjusts number of levels if not big enough.  This can happen for
// files with very large contigs.
//
// Returns the smallest level count >= n_lvls whose maximum position, as
// given by hts_bin_maxpos() and multiplied by 8 for each extra level,
// reaches max_len + 256.  max_len may come straight from a file header, so
// the arithmetic is guarded: the padded length saturates at INT64_MAX and
// the search stops once another level would overflow a signed 64-bit value.
static int adjust_n_lvls(int min_shift, int n_lvls, int64_t max_len)
{
    int64_t s = hts_bin_maxpos(min_shift, n_lvls);

    if (max_len > INT64_MAX - 256)
        max_len = INT64_MAX;
    else
        max_len += 256;

    // s * 8 is only evaluated when it is representable; for such s it is
    // the same value the former "s <<= 3" produced.
    while (max_len > s && s > 0 && s <= INT64_MAX / 8) {
        ++n_lvls;
        s *= 8;
    }
    return n_lvls;
}
446
447
/*
 * Build an in-memory tabix index by reading every line from fp.
 *
 * fp         BGZF stream positioned at the start of the data
 * min_shift  > 0 selects a CSI index with that minimum shift; otherwise a
 *            TBI index (min_shift 14, 5 levels) is built
 * conf       column layout / preset; copied into the returned object
 *
 * Lines numbered up to conf->line_skip and lines starting with the meta
 * character are not indexed.  For CSI, SAM and VCF header lines are scanned
 * for reference lengths so that enough levels are allocated; if none is
 * found a length of 100G is assumed.
 *
 * Returns a new tbx_t (release with tbx_destroy()), or NULL on read,
 * parse, indexing or allocation failure.
 */
tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf)
{
    tbx_t *tbx;
    kstring_t str;
    int ret, first = 0, n_lvls, fmt;
    int64_t lineno = 0;
    uint64_t last_off = 0;
    tbx_intv_t intv;
    int64_t max_ref_len = 0;

    str.s = 0; str.l = str.m = 0;
    tbx = (tbx_t*)calloc(1, sizeof(tbx_t));
    if (!tbx) return NULL;
    tbx->conf = *conf;
    if (min_shift > 0) {
        n_lvls = (TBX_MAX_SHIFT - min_shift + 2) / 3;
        fmt = HTS_FMT_CSI;
    } else {
        min_shift = 14;
        n_lvls = 5;
        fmt = HTS_FMT_TBI;
    }
    while ((ret = bgzf_getline(fp, '\n', &str)) >= 0) {
        ++lineno;
        if (str.s[0] == tbx->conf.meta_char && fmt == HTS_FMT_CSI) {
            // Mask off the flag bits, as every other preset test in this
            // file does, before comparing with the preset types.
            switch (tbx->conf.preset & 0xffff) {
                case TBX_SAM:
                    adjust_max_ref_len_sam(str.s, &max_ref_len); break;
                case TBX_VCF:
                    adjust_max_ref_len_vcf(str.s, &max_ref_len); break;
                default:
                    break;
            }
        }
        if (lineno <= tbx->conf.line_skip || str.s[0] == tbx->conf.meta_char) {
            // Remember where the header ends; it becomes the index's
            // initial offset.
            last_off = bgzf_tell(fp);
            continue;
        }
        if (first == 0) {
            // First data line: the header has been seen in full, so the
            // index can now be sized and created.
            if (fmt == HTS_FMT_CSI) {
                if (!max_ref_len)
                    max_ref_len = (int64_t)100*1024*1024*1024; // 100G default
                n_lvls = adjust_n_lvls(min_shift, n_lvls, max_ref_len);
            }
            tbx->idx = hts_idx_init(0, fmt, last_off, min_shift, n_lvls);
            if (!tbx->idx) goto fail;
            first = 1;
        }
        ret = get_intv(tbx, &str, &intv, 1);
        if (ret < 0) goto fail;  // Out of memory or unparsable lines
        if (hts_idx_push(tbx->idx, intv.tid, intv.beg, intv.end,
                         bgzf_tell(fp), 1) < 0) {
            goto fail;
        }
    }
    if (ret < -1) goto fail;    // -1 is end of file, anything lower an error
    if ( !tbx->idx ) tbx->idx = hts_idx_init(0, fmt, last_off, min_shift, n_lvls);   // empty file
    if (!tbx->idx) goto fail;
    if ( !tbx->dict ) tbx->dict = kh_init(s2i);
    if (!tbx->dict) goto fail;
    if (hts_idx_finish(tbx->idx, bgzf_tell(fp)) != 0) goto fail;
    if (tbx_set_meta(tbx) != 0) goto fail;
    free(str.s);
    return tbx;

 fail:
    free(str.s);
    tbx_destroy(tbx);
    return NULL;
}
511
512
/*
 * Release a tbx_t: the duplicated sequence names held as dictionary keys,
 * the index, the dictionary and finally the structure itself.
 * Passing NULL is a no-op.
 */
void tbx_destroy(tbx_t *tbx)
{
    khash_t(s2i) *d;

    if (tbx == NULL)
        return;

    d = (khash_t(s2i)*)tbx->dict;
    if (d != NULL)
    {
        khint_t k;
        // The keys were strdup()ed by get_tid() and are owned by the table
        for (k = kh_begin(d); k != kh_end(d); ++k)
            if (kh_exist(d, k)) free((char*)kh_key(d, k));
    }
    hts_idx_destroy(tbx->idx);
    kh_destroy(s2i, d);
    free(tbx);
}
525
526
int tbx_index_build3(const char *fn, const char *fnidx, int min_shift, int n_threads, const tbx_conf_t *conf)
527
0
{
528
0
    tbx_t *tbx;
529
0
    BGZF *fp;
530
0
    int ret;
531
0
    if ((fp = bgzf_open(fn, "r")) == 0) return -1;
532
0
    if ( n_threads ) bgzf_mt(fp, n_threads, 256);
533
0
    if ( bgzf_compression(fp) != bgzf ) { bgzf_close(fp); return -2; }
534
0
    tbx = tbx_index(fp, min_shift, conf);
535
0
    bgzf_close(fp);
536
0
    if ( !tbx ) return -1;
537
0
    ret = hts_idx_save_as(tbx->idx, fn, fnidx, min_shift > 0? HTS_FMT_CSI : HTS_FMT_TBI);
538
0
    tbx_destroy(tbx);
539
0
    return ret;
540
0
}
541
542
/*
 * Same as tbx_index_build3() with n_threads set to 0.
 */
int tbx_index_build2(const char *fn, const char *fnidx, int min_shift, const tbx_conf_t *conf)
{
    const int n_threads = 0;

    return tbx_index_build3(fn, fnidx, min_shift, n_threads, conf);
}
546
547
/*
 * Same as tbx_index_build3() with a NULL index file name and n_threads
 * set to 0.
 */
int tbx_index_build(const char *fn, int min_shift, const tbx_conf_t *conf)
{
    const char *fnidx = NULL;
    const int n_threads = 0;

    return tbx_index_build3(fn, fnidx, min_shift, n_threads, conf);
}
551
552
static tbx_t *index_load(const char *fn, const char *fnidx, int flags)
553
2.96k
{
554
2.96k
    tbx_t *tbx;
555
2.96k
    uint8_t *meta;
556
2.96k
    char *nm, *p;
557
2.96k
    uint32_t l_meta, l_nm;
558
2.96k
    tbx = (tbx_t*)calloc(1, sizeof(tbx_t));
559
2.96k
    if (!tbx)
560
0
        return NULL;
561
2.96k
    tbx->idx = hts_idx_load3(fn, fnidx, HTS_FMT_TBI, flags);
562
2.96k
    if ( !tbx->idx )
563
2.96k
    {
564
2.96k
        free(tbx);
565
2.96k
        return NULL;
566
2.96k
    }
567
0
    meta = hts_idx_get_meta(tbx->idx, &l_meta);
568
0
    if ( !meta || l_meta < 28) goto invalid;
569
570
0
    tbx->conf.preset = le_to_i32(&meta[0]);
571
0
    tbx->conf.sc = le_to_i32(&meta[4]);
572
0
    tbx->conf.bc = le_to_i32(&meta[8]);
573
0
    tbx->conf.ec = le_to_i32(&meta[12]);
574
0
    tbx->conf.meta_char = le_to_i32(&meta[16]);
575
0
    tbx->conf.line_skip = le_to_i32(&meta[20]);
576
0
    l_nm = le_to_u32(&meta[24]);
577
0
    if (l_nm > l_meta - 28) goto invalid;
578
579
0
    p = nm = (char*)meta + 28;
580
    // This assumes meta is NUL-terminated, so we can merrily strlen away.
581
    // hts_idx_load_local() assures this for us by adding a NUL on the end
582
    // of whatever it reads.
583
0
    for (; p - nm < l_nm; p += strlen(p) + 1) {
584
0
        if (get_tid(tbx, p, 1) < 0) {
585
0
            hts_log_error("%s", strerror(errno));
586
0
            goto fail;
587
0
        }
588
0
    }
589
0
    return tbx;
590
591
0
 invalid:
592
0
    hts_log_error("Invalid index header for %s", fnidx ? fnidx : fn);
593
594
0
 fail:
595
0
    tbx_destroy(tbx);
596
0
    return NULL;
597
0
}
598
599
/*
 * Load a tabix index, forwarding flags unchanged to hts_idx_load3().
 * Returns the loaded tbx_t or NULL on failure.
 */
tbx_t *tbx_index_load3(const char *fn, const char *fnidx, int flags)
{
    tbx_t *loaded = index_load(fn, fnidx, flags);

    return loaded;
}
603
604
/*
 * Load a tabix index for fn from the explicitly named index fnidx, with
 * the flags argument of hts_idx_load3() set to 1.
 * Returns the loaded tbx_t or NULL on failure.
 */
tbx_t *tbx_index_load2(const char *fn, const char *fnidx)
{
    const int flags = 1;

    return index_load(fn, fnidx, flags);
}
608
609
/*
 * Load the tabix index for fn with a NULL index file name and the flags
 * argument of hts_idx_load3() set to 1.
 * Returns the loaded tbx_t or NULL on failure.
 */
tbx_t *tbx_index_load(const char *fn)
{
    const char *fnidx = NULL;
    const int flags = 1;

    return index_load(fn, fnidx, flags);
}
613
614
/*
 * Return the sequence names known to the index, ordered by id.
 *
 * tbx  index handle
 * n    receives the number of names
 *
 * The returned array is allocated here; the strings it points to are the
 * dictionary's own keys and are not copied.  Returns NULL (with *n set to
 * 0) only if the array could not be allocated; an index without names
 * yields a valid array and *n == 0.
 */
const char **tbx_seqnames(tbx_t *tbx, int *n)
{
    khash_t(s2i) *d = (khash_t(s2i)*)tbx->dict;
    if (d == NULL)
    {
        *n = 0;
        return calloc(1, sizeof(char *));
    }
    int tid, m = kh_size(d);
    // calloc(0, ...) may return NULL, which would be reported as a failure
    // below.  Ask for at least one element, as the d == NULL case does.
    const char **names = (const char**) calloc(m > 0 ? m : 1, sizeof(const char*));
    khint_t k;
    if (!names) {
        *n = 0;
        return NULL;
    }
    for (k=kh_begin(d); k<kh_end(d); k++)
    {
        if ( !kh_exist(d,k) ) continue;
        tid = kh_val(d,k);
        assert( tid<m );
        names[tid] = kh_key(d,k);
    }
    // sanity check: there should be no gaps
    for (tid=0; tid<m; tid++)
        assert(names[tid]);
    *n = m;
    return names;
}
642