Coverage Report

Created: 2025-07-11 06:53

/src/htslib/tbx.c
Line
Count
Source (jump to first uncovered line)
1
/*  tbx.c -- tabix API functions.
2
3
    Copyright (C) 2009, 2010, 2012-2015, 2017-2020, 2022-2023, 2025 Genome Research Ltd.
4
    Copyright (C) 2010-2012 Broad Institute.
5
6
    Author: Heng Li <lh3@sanger.ac.uk>
7
8
Permission is hereby granted, free of charge, to any person obtaining a copy
9
of this software and associated documentation files (the "Software"), to deal
10
in the Software without restriction, including without limitation the rights
11
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
copies of the Software, and to permit persons to whom the Software is
13
furnished to do so, subject to the following conditions:
14
15
The above copyright notice and this permission notice shall be included in
16
all copies or substantial portions of the Software.
17
18
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24
DEALINGS IN THE SOFTWARE.  */
25
26
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
27
#include <config.h>
28
29
#include <stdlib.h>
30
#include <string.h>
31
#include <stdio.h>
32
#include <assert.h>
33
#include <errno.h>
34
#include "htslib/tbx.h"
35
#include "htslib/bgzf.h"
36
#include "htslib/hts_endian.h"
37
#include "hts_internal.h"
38
39
#include "htslib/khash.h"
40
KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
41
42
HTSLIB_EXPORT
43
const tbx_conf_t tbx_conf_gff = { 0, 1, 4, 5, '#', 0 };
44
45
HTSLIB_EXPORT
46
const tbx_conf_t tbx_conf_bed = { TBX_UCSC, 1, 2, 3, '#', 0 };
47
48
HTSLIB_EXPORT
49
const tbx_conf_t tbx_conf_psltbl = { TBX_UCSC, 15, 17, 18, '#', 0 };
50
51
HTSLIB_EXPORT
52
const tbx_conf_t tbx_conf_sam = { TBX_SAM, 3, 4, 0, '@', 0 };
53
54
HTSLIB_EXPORT
55
const tbx_conf_t tbx_conf_vcf = { TBX_VCF, 1, 2, 0, '#', 0 };
56
const tbx_conf_t tbx_conf_gaf = { TBX_GAF, 1, 6, 0, '#', 0 };
57
58
typedef struct {
59
    int64_t beg, end;
60
    char *ss, *se;
61
    int tid;
62
} tbx_intv_t;
63
64
static inline int get_tid(tbx_t *tbx, const char *ss, int is_add)
65
0
{
66
0
    khint_t k;
67
0
    khash_t(s2i) *d;
68
0
    if ((tbx->conf.preset&0xffff) == TBX_GAF) return(0);
69
0
    if (tbx->dict == 0) tbx->dict = kh_init(s2i);
70
0
    if (!tbx->dict) return -1; // Out of memory
71
0
    d = (khash_t(s2i)*)tbx->dict;
72
0
    if (is_add) {
73
0
        int absent;
74
0
        k = kh_put(s2i, d, ss, &absent);
75
0
        if (absent < 0) {
76
0
            return -1; // Out of memory
77
0
        } else if (absent) {
78
0
            char *ss_dup = strdup(ss);
79
0
            if (ss_dup) {
80
0
                kh_key(d, k) = ss_dup;
81
0
                kh_val(d, k) = kh_size(d) - 1;
82
0
            } else {
83
0
                kh_del(s2i, d, k);
84
0
                return -1; // Out of memory
85
0
            }
86
0
        }
87
0
    } else k = kh_get(s2i, d, ss);
88
0
    return k == kh_end(d)? -1 : kh_val(d, k);
89
0
}
90
91
int tbx_name2id(tbx_t *tbx, const char *ss)
92
0
{
93
0
    return get_tid(tbx, ss, 0);
94
0
}
95
96
int tbx_parse1(const tbx_conf_t *conf, size_t len, char *line, tbx_intv_t *intv)
97
0
{
98
0
    size_t i, b = 0;
99
0
    int id = 1, getlen = 0, alcnt = 0, haveins = 0, lenpos = -1;
100
0
    char *s, *t;
101
0
    uint8_t insals[8192];
102
0
    int64_t reflen = 0, svlen = 0, fmtlen = 0, tmp = 0;
103
104
0
    intv->ss = intv->se = 0; intv->beg = intv->end = -1;
105
0
    for (i = 0; i <= len; ++i) {
106
0
        if (line[i] == '\t' || line[i] == 0) {
107
0
            if (id == conf->sc) {
108
0
                intv->ss = line + b; intv->se = line + i;
109
0
            } else if (id == conf->bc) {
110
                // here ->beg is 0-based.
111
0
                if ((conf->preset&0xffff) == TBX_GAF){
112
                    // if gaf find the smallest and largest node id
113
0
                    char *t;
114
0
                    int64_t nodeid = -1;
115
0
                    for (s = line + b + 1; s < line + i;) {
116
0
                        nodeid = strtoll(s, &t, 0);
117
0
                        if(intv->beg == -1){
118
0
                            intv->beg = intv->end = nodeid;
119
0
                        } else {
120
0
                            if(nodeid < intv->beg){
121
0
                                intv->beg = nodeid;
122
0
                            }
123
124
0
                            if(nodeid > intv->end){
125
0
                                intv->end = nodeid;
126
0
                            }
127
0
                        }
128
0
                        s = t + 1;
129
0
                    }
130
0
                } else {
131
0
                    intv->beg = strtoll(line + b, &s, 0);
132
133
0
                    if (conf->bc <= conf->ec) // don't overwrite an already set end point
134
0
                        intv->end = intv->beg;
135
136
0
                    if ( s==line+b ) return -1; // expected int
137
138
0
                    if (!(conf->preset&TBX_UCSC))
139
0
                        --intv->beg;
140
0
                    else if (conf->bc <= conf->ec)
141
0
                        ++intv->end;
142
143
0
                    if (intv->beg < 0) {
144
0
                        hts_log_warning("Coordinate <= 0 detected. "
145
0
                                        "Did you forget to use the -0 option?");
146
0
                        intv->beg = 0;
147
0
                    }
148
0
                    if (intv->end < 1) intv->end = 1;
149
0
                }
150
0
            } else {
151
0
                if ((conf->preset&0xffff) == TBX_GENERIC) {
152
0
                    if (id == conf->ec)
153
0
                    {
154
0
                        intv->end = strtoll(line + b, &s, 0);
155
0
                        if ( s==line+b ) return -1; // expected int
156
0
                    }
157
0
                } else if ((conf->preset&0xffff) == TBX_SAM) {
158
0
                    if (id == 6) { // CIGAR
159
0
                        int l = 0;
160
0
                        char *t;
161
0
                        for (s = line + b; s < line + i;) {
162
0
                            long x = strtol(s, &t, 10);
163
0
                            char op = toupper_c(*t);
164
0
                            if (op == 'M' || op == 'D' || op == 'N') l += x;
165
0
                            s = t + 1;
166
0
                        }
167
0
                        if (l == 0) l = 1;
168
0
                        intv->end = intv->beg + l;
169
0
                    }
170
0
                } else if ((conf->preset&0xffff) == TBX_VCF) {
171
0
                    if (id == 4) { //ref allele
172
0
                        if (b < i) intv->end = intv->beg + (i - b);
173
0
                        ++alcnt;
174
0
                        reflen = i - b;
175
0
                    } if (id == 5) {    //alt allele
176
0
                        int lastbyte = 0, c = line[i];
177
0
                        insals[lastbyte] = 0;
178
0
                        line[i] = 0;
179
0
                        s = line + b;
180
0
                        do {
181
0
                            t = strchr(s, ',');
182
0
                            if (alcnt >> 3 != lastbyte) {   //initialize insals
183
0
                                lastbyte = alcnt >> 3;
184
0
                                insals[lastbyte] = 0;
185
0
                            }
186
0
                            ++alcnt;
187
0
                            if (t) {
188
0
                                *t = 0;
189
0
                            }
190
0
                            if (s[0] == '<') {
191
0
                                if (!strcmp("<INS>", s)) {  //note inserts
192
0
                                    insals[lastbyte] |= 1 << ((alcnt - 1) & 7);
193
0
                                    haveins = 1;
194
0
                                } else if (!strcmp("<*>", s) ||
195
0
                                    !strcmp("<NON_REF>", s)) {  //note gvcf
196
0
                                    getlen = 1;
197
0
                                }
198
0
                            }
199
0
                            if (t) {
200
0
                                *t = ',';
201
0
                                s = t + 1;
202
0
                            }
203
0
                        } while (t && alcnt < 65536);   //max allcnt is 65535
204
0
                        line[i] = c;
205
0
                    } else if (id == 8) { //INFO, look for "END=" / "SVLEN"
206
0
                        int c = line[i], d = 1;
207
0
                        line[i] = 0;
208
0
                        s = strstr(line + b, "END=");
209
0
                        if (s == line + b) s += 4;
210
0
                        else if (s) {
211
0
                            s = strstr(line + b, ";END=");
212
0
                            if (s) s += 5;
213
0
                        }
214
0
                        if (s && *s != '.') {
215
0
                            long long end = strtoll(s, &s, 0);
216
0
                            if (end <= intv->beg) {
217
0
                                static int reported = 0;
218
0
                                if (!reported) {
219
0
                                    int l = intv->ss ? (int) (intv->se - intv->ss) : 0;
220
0
                                    hts_log_warning("VCF INFO/END=%lld is smaller than POS at %.*s:%"PRIhts_pos"\n"
221
0
                                                    "This tag will be ignored. "
222
0
                                                    "Note: only one invalid END tag will be reported.",
223
0
                                                    end, l >= 0 ? l : 0,
224
0
                                                    intv->ss ? intv->ss : "",
225
0
                                                    intv->beg);
226
0
                                    reported = 1;
227
0
                                }
228
0
                            } else {
229
0
                                intv->end = end;
230
0
                            }
231
0
                        }
232
0
                        s = strstr(line + b, "SVLEN=");
233
0
                        if (s == line + b) s += 6;  //at start of info
234
0
                        else if (s) {               //not at the start
235
0
                            s = strstr(line + b, ";SVLEN=");
236
0
                            if (s) s += 7;
237
0
                        }
238
0
                        while (s && d < alcnt) {
239
0
                            t = strchr(s, ',');
240
0
                            if ((haveins) && (insals[d >> 3] & (1 << (d & 7)))) {
241
0
                                tmp = 1;    //<INS>
242
0
                            } else {
243
0
                                tmp = atoll(s);
244
0
                                tmp = tmp < 0 ? llabs(tmp) : tmp;
245
0
                            }
246
0
                            svlen = svlen < tmp ? tmp : svlen;
247
0
                            s = t ? t + 1 : NULL;
248
0
                            ++d;
249
0
                        }
250
0
                        line[i] = c;
251
0
                    } else if (getlen && id == 9 ) {    //FORMAT
252
0
                        int c = line[i], pos = -1;
253
0
                        line[i] = 0;
254
0
                        s = line + b;
255
0
                        while (s) {
256
0
                            ++pos;
257
0
                            if (!(t = strchr(s, ':'))) {    //no further fields
258
0
                                if (!strcmp(s, "LEN")) {
259
0
                                    lenpos = pos;
260
0
                                }
261
0
                                break;  //not present at all!
262
0
                            } else {
263
0
                                *t = '\0';
264
0
                                if (!strcmp(s, "LEN")) {
265
0
                                    lenpos = pos;
266
0
                                    *t = ':';
267
0
                                    break;
268
0
                                }
269
0
                                *t = ':';
270
0
                                s = t + 1;  //check next one
271
0
                            }
272
0
                        }
273
0
                        line[i] = c;
274
0
                        if (lenpos == -1) { //not present
275
0
                            break;
276
0
                        }
277
0
                    } else if (id > 9 && getlen && lenpos != -1) {
278
                        //get LEN from sample
279
0
                        int c = line[i], d = 0;
280
0
                        line[i] = 0; tmp = 0;
281
0
                        s = line + b;
282
0
                        for (d = 0; d <= lenpos; ++d) {
283
0
                            if (d == lenpos) {
284
0
                                tmp = atoll(s);
285
0
                                break;
286
0
                            }
287
0
                            if ((t = strchr(s, ':'))) {
288
0
                                s = t + 1;
289
0
                            } else {
290
0
                                break;    //not in sycn with fmt def!
291
0
                            }
292
0
                        }
293
0
                        fmtlen = fmtlen < tmp ? tmp : fmtlen;
294
0
                        line[i] = c;
295
0
                    }
296
0
                }
297
0
            }
298
0
            b = i + 1;  //beginning if current field
299
0
            ++id;
300
0
        }
301
0
    }
302
0
    if ((conf->preset&0xffff) == TBX_VCF) {
303
0
        tmp = reflen < svlen ?
304
0
                svlen < fmtlen ? fmtlen : svlen :
305
0
                reflen < fmtlen ? fmtlen : reflen ;
306
0
        tmp += intv->beg;
307
0
        intv->end = intv->end < tmp ? tmp : intv->end;
308
309
        //NOTE: 'end' calculation be in sync with end/rlen in vcf.c:get_rlen
310
0
    }
311
0
    if (intv->ss == 0 || intv->se == 0 || intv->beg < 0 || intv->end < 0) return -1;
312
0
    return 0;
313
0
}
314
315
static inline int get_intv(tbx_t *tbx, kstring_t *str, tbx_intv_t *intv, int is_add)
316
0
{
317
0
    if (tbx_parse1(&tbx->conf, str->l, str->s, intv) == 0) {
318
0
        int c = *intv->se;
319
0
        *intv->se = '\0';
320
0
        if ((tbx->conf.preset&0xffff) == TBX_GAF){
321
0
            intv->tid = 0;
322
0
        } else {
323
0
            intv->tid = get_tid(tbx, intv->ss, is_add);
324
0
        }
325
0
        *intv->se = c;
326
0
        if (intv->tid < 0) return -2;  // get_tid out of memory
327
0
        return (intv->beg >= 0 && intv->end >= 0)? 0 : -1;
328
0
    } else {
329
0
        char *type = NULL;
330
0
        switch (tbx->conf.preset&0xffff)
331
0
        {
332
0
            case TBX_SAM: type = "TBX_SAM"; break;
333
0
            case TBX_VCF: type = "TBX_VCF"; break;
334
0
            case TBX_GAF: type = "TBX_GAF"; break;
335
0
            case TBX_UCSC: type = "TBX_UCSC"; break;
336
0
            default: type = "TBX_GENERIC"; break;
337
0
        }
338
0
        if (hts_is_utf16_text(str))
339
0
            hts_log_error("Failed to parse %s: offending line appears to be encoded as UTF-16", type);
340
0
        else
341
0
            hts_log_error("Failed to parse %s: was wrong -p [type] used?\nThe offending line was: \"%s\"",
342
0
                type, str->s);
343
0
        return -1;
344
0
    }
345
0
}
346
347
/*
348
 * Called by tabix iterator to read the next record.
349
 * Returns    >=  0 on success
350
 *               -1 on EOF
351
 *            <= -2 on error
352
 */
353
int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, hts_pos_t *beg, hts_pos_t *end)
354
0
{
355
0
    tbx_t *tbx = (tbx_t *) tbxv;
356
0
    kstring_t *s = (kstring_t *) sv;
357
0
    int ret;
358
0
    if ((ret = bgzf_getline(fp, '\n', s)) >= 0) {
359
0
        tbx_intv_t intv;
360
0
        if (get_intv(tbx, s, &intv, 0) < 0)
361
0
            return -2;
362
0
        *tid = intv.tid; *beg = intv.beg; *end = intv.end;
363
0
    }
364
0
    return ret;
365
0
}
366
367
static int tbx_set_meta(tbx_t *tbx)
368
0
{
369
0
    int i, l = 0, l_nm;
370
0
    uint32_t x[7];
371
0
    char **name;
372
0
    uint8_t *meta;
373
0
    khint_t k;
374
0
    khash_t(s2i) *d = (khash_t(s2i)*)tbx->dict;
375
376
0
    memcpy(x, &tbx->conf, 24);
377
0
    name = (char**)malloc(sizeof(char*) * kh_size(d));
378
0
    if (!name) return -1;
379
0
    for (k = kh_begin(d), l = 0; k != kh_end(d); ++k) {
380
0
        if (!kh_exist(d, k)) continue;
381
0
        name[kh_val(d, k)] = (char*)kh_key(d, k);
382
0
        l += strlen(kh_key(d, k)) + 1; // +1 to include '\0'
383
0
    }
384
0
    l_nm = x[6] = l;
385
0
    meta = (uint8_t*)malloc(l_nm + 28);
386
0
    if (!meta) { free(name); return -1; }
387
0
    if (ed_is_big())
388
0
        for (i = 0; i < 7; ++i)
389
0
            x[i] = ed_swap_4(x[i]);
390
0
    memcpy(meta, x, 28);
391
0
    for (l = 28, i = 0; i < (int)kh_size(d); ++i) {
392
0
        int x = strlen(name[i]) + 1;
393
0
        memcpy(meta + l, name[i], x);
394
0
        l += x;
395
0
    }
396
0
    free(name);
397
0
    hts_idx_set_meta(tbx->idx, l, meta, 0);
398
0
    return 0;
399
0
}
400
401
// Minimal effort parser to extract reference length out of VCF header line
402
// This is used only used to adjust the number of levels if necessary,
403
// so not a major problem if it doesn't always work.
404
static void adjust_max_ref_len_vcf(const char *str, int64_t *max_ref_len)
405
0
{
406
0
    const char *ptr;
407
0
    int64_t len;
408
0
    if (strncmp(str, "##contig", 8) != 0) return;
409
0
    ptr = strstr(str + 8, "length");
410
0
    if (!ptr) return;
411
0
    for (ptr += 6; *ptr == ' ' || *ptr == '='; ptr++) {}
412
0
    len = strtoll(ptr, NULL, 10);
413
0
    if (*max_ref_len < len) *max_ref_len = len;
414
0
}
415
416
// Same for sam files
417
static void adjust_max_ref_len_sam(const char *str, int64_t *max_ref_len)
418
0
{
419
0
    const char *ptr;
420
0
    int64_t len;
421
0
    if (strncmp(str, "@SQ", 3) != 0) return;
422
0
    ptr = strstr(str + 3, "\tLN:");
423
0
    if (!ptr) return;
424
0
    ptr += 4;
425
0
    len = strtoll(ptr, NULL, 10);
426
0
    if (*max_ref_len < len) *max_ref_len = len;
427
0
}
428
429
// Adjusts number of levels if not big enough.  This can happen for
430
// files with very large contigs.
431
static int adjust_n_lvls(int min_shift, int n_lvls, int64_t max_len)
432
0
{
433
0
    int64_t s = hts_bin_maxpos(min_shift, n_lvls);
434
0
    max_len += 256;
435
0
    for (; max_len > s; ++n_lvls, s <<= 3) {}
436
0
    return n_lvls;
437
0
}
438
439
tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf)
440
0
{
441
0
    tbx_t *tbx;
442
0
    kstring_t str;
443
0
    int ret, first = 0, n_lvls, fmt;
444
0
    int64_t lineno = 0;
445
0
    uint64_t last_off = 0;
446
0
    tbx_intv_t intv;
447
0
    int64_t max_ref_len = 0;
448
449
0
    str.s = 0; str.l = str.m = 0;
450
0
    tbx = (tbx_t*)calloc(1, sizeof(tbx_t));
451
0
    if (!tbx) return NULL;
452
0
    tbx->conf = *conf;
453
0
    if (min_shift > 0) n_lvls = (TBX_MAX_SHIFT - min_shift + 2) / 3, fmt = HTS_FMT_CSI;
454
0
    else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_TBI;
455
0
    while ((ret = bgzf_getline(fp, '\n', &str)) >= 0) {
456
0
        ++lineno;
457
0
        if (str.s[0] == tbx->conf.meta_char && fmt == HTS_FMT_CSI) {
458
0
            switch (tbx->conf.preset) {
459
0
                case TBX_SAM:
460
0
                    adjust_max_ref_len_sam(str.s, &max_ref_len); break;
461
0
                case TBX_VCF:
462
0
                    adjust_max_ref_len_vcf(str.s, &max_ref_len); break;
463
0
                default:
464
0
                    break;
465
0
            }
466
0
        }
467
0
        if (lineno <= tbx->conf.line_skip || str.s[0] == tbx->conf.meta_char) {
468
0
            last_off = bgzf_tell(fp);
469
0
            continue;
470
0
        }
471
0
        if (first == 0) {
472
0
            if (fmt == HTS_FMT_CSI) {
473
0
                if (!max_ref_len)
474
0
                    max_ref_len = (int64_t)100*1024*1024*1024; // 100G default
475
0
                n_lvls = adjust_n_lvls(min_shift, n_lvls, max_ref_len);
476
0
            }
477
0
            tbx->idx = hts_idx_init(0, fmt, last_off, min_shift, n_lvls);
478
0
            if (!tbx->idx) goto fail;
479
0
            first = 1;
480
0
        }
481
0
        ret = get_intv(tbx, &str, &intv, 1);
482
0
        if (ret < 0) goto fail;  // Out of memory or unparsable lines
483
0
        if (hts_idx_push(tbx->idx, intv.tid, intv.beg, intv.end,
484
0
                         bgzf_tell(fp), 1) < 0) {
485
0
            goto fail;
486
0
        }
487
0
    }
488
0
    if (ret < -1) goto fail;
489
0
    if ( !tbx->idx ) tbx->idx = hts_idx_init(0, fmt, last_off, min_shift, n_lvls);   // empty file
490
0
    if (!tbx->idx) goto fail;
491
0
    if ( !tbx->dict ) tbx->dict = kh_init(s2i);
492
0
    if (!tbx->dict) goto fail;
493
0
    if (hts_idx_finish(tbx->idx, bgzf_tell(fp)) != 0) goto fail;
494
0
    if (tbx_set_meta(tbx) != 0) goto fail;
495
0
    free(str.s);
496
0
    return tbx;
497
498
0
 fail:
499
0
    free(str.s);
500
0
    tbx_destroy(tbx);
501
0
    return NULL;
502
0
}
503
504
void tbx_destroy(tbx_t *tbx)
505
0
{
506
0
    khash_t(s2i) *d = (khash_t(s2i)*)tbx->dict;
507
0
    if (d != NULL)
508
0
    {
509
0
        khint_t k;
510
0
        for (k = kh_begin(d); k != kh_end(d); ++k)
511
0
            if (kh_exist(d, k)) free((char*)kh_key(d, k));
512
0
    }
513
0
    hts_idx_destroy(tbx->idx);
514
0
    kh_destroy(s2i, d);
515
0
    free(tbx);
516
0
}
517
518
int tbx_index_build3(const char *fn, const char *fnidx, int min_shift, int n_threads, const tbx_conf_t *conf)
519
0
{
520
0
    tbx_t *tbx;
521
0
    BGZF *fp;
522
0
    int ret;
523
0
    if ((fp = bgzf_open(fn, "r")) == 0) return -1;
524
0
    if ( n_threads ) bgzf_mt(fp, n_threads, 256);
525
0
    if ( bgzf_compression(fp) != bgzf ) { bgzf_close(fp); return -2; }
526
0
    tbx = tbx_index(fp, min_shift, conf);
527
0
    bgzf_close(fp);
528
0
    if ( !tbx ) return -1;
529
0
    ret = hts_idx_save_as(tbx->idx, fn, fnidx, min_shift > 0? HTS_FMT_CSI : HTS_FMT_TBI);
530
0
    tbx_destroy(tbx);
531
0
    return ret;
532
0
}
533
534
int tbx_index_build2(const char *fn, const char *fnidx, int min_shift, const tbx_conf_t *conf)
535
0
{
536
0
    return tbx_index_build3(fn, fnidx, min_shift, 0, conf);
537
0
}
538
539
int tbx_index_build(const char *fn, int min_shift, const tbx_conf_t *conf)
540
0
{
541
0
    return tbx_index_build3(fn, NULL, min_shift, 0, conf);
542
0
}
543
544
static tbx_t *index_load(const char *fn, const char *fnidx, int flags)
545
2.75k
{
546
2.75k
    tbx_t *tbx;
547
2.75k
    uint8_t *meta;
548
2.75k
    char *nm, *p;
549
2.75k
    uint32_t l_meta, l_nm;
550
2.75k
    tbx = (tbx_t*)calloc(1, sizeof(tbx_t));
551
2.75k
    if (!tbx)
552
0
        return NULL;
553
2.75k
    tbx->idx = hts_idx_load3(fn, fnidx, HTS_FMT_TBI, flags);
554
2.75k
    if ( !tbx->idx )
555
2.75k
    {
556
2.75k
        free(tbx);
557
2.75k
        return NULL;
558
2.75k
    }
559
0
    meta = hts_idx_get_meta(tbx->idx, &l_meta);
560
0
    if ( !meta || l_meta < 28) goto invalid;
561
562
0
    tbx->conf.preset = le_to_i32(&meta[0]);
563
0
    tbx->conf.sc = le_to_i32(&meta[4]);
564
0
    tbx->conf.bc = le_to_i32(&meta[8]);
565
0
    tbx->conf.ec = le_to_i32(&meta[12]);
566
0
    tbx->conf.meta_char = le_to_i32(&meta[16]);
567
0
    tbx->conf.line_skip = le_to_i32(&meta[20]);
568
0
    l_nm = le_to_u32(&meta[24]);
569
0
    if (l_nm > l_meta - 28) goto invalid;
570
571
0
    p = nm = (char*)meta + 28;
572
    // This assumes meta is NUL-terminated, so we can merrily strlen away.
573
    // hts_idx_load_local() assures this for us by adding a NUL on the end
574
    // of whatever it reads.
575
0
    for (; p - nm < l_nm; p += strlen(p) + 1) {
576
0
        if (get_tid(tbx, p, 1) < 0) {
577
0
            hts_log_error("%s", strerror(errno));
578
0
            goto fail;
579
0
        }
580
0
    }
581
0
    return tbx;
582
583
0
 invalid:
584
0
    hts_log_error("Invalid index header for %s", fnidx ? fnidx : fn);
585
586
0
 fail:
587
0
    tbx_destroy(tbx);
588
0
    return NULL;
589
0
}
590
591
tbx_t *tbx_index_load3(const char *fn, const char *fnidx, int flags)
592
2.75k
{
593
2.75k
    return index_load(fn, fnidx, flags);
594
2.75k
}
595
596
tbx_t *tbx_index_load2(const char *fn, const char *fnidx)
597
0
{
598
0
    return index_load(fn, fnidx, 1);
599
0
}
600
601
tbx_t *tbx_index_load(const char *fn)
602
0
{
603
0
    return index_load(fn, NULL, 1);
604
0
}
605
606
const char **tbx_seqnames(tbx_t *tbx, int *n)
607
0
{
608
0
    khash_t(s2i) *d = (khash_t(s2i)*)tbx->dict;
609
0
    if (d == NULL)
610
0
    {
611
0
        *n = 0;
612
0
        return calloc(1, sizeof(char *));
613
0
    }
614
0
    int tid, m = kh_size(d);
615
0
    const char **names = (const char**) calloc(m,sizeof(const char*));
616
0
    khint_t k;
617
0
    if (!names) {
618
0
        *n = 0;
619
0
        return NULL;
620
0
    }
621
0
    for (k=kh_begin(d); k<kh_end(d); k++)
622
0
    {
623
0
        if ( !kh_exist(d,k) ) continue;
624
0
        tid = kh_val(d,k);
625
0
        assert( tid<m );
626
0
        names[tid] = kh_key(d,k);
627
0
    }
628
    // sanity check: there should be no gaps
629
0
    for (tid=0; tid<m; tid++)
630
0
        assert(names[tid]);
631
0
    *n = m;
632
0
    return names;
633
0
}
634