Coverage Report

Created: 2026-03-25 06:43

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/faidx.c
Line
Count
Source
1
/*  faidx.c -- FASTA and FASTQ random access.
2
3
    Copyright (C) 2008, 2009, 2013-2020, 2022, 2024 Genome Research Ltd.
4
    Portions copyright (C) 2011 Broad Institute.
5
6
    Author: Heng Li <lh3@sanger.ac.uk>
7
8
Permission is hereby granted, free of charge, to any person obtaining a copy
9
of this software and associated documentation files (the "Software"), to deal
10
in the Software without restriction, including without limitation the rights
11
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
copies of the Software, and to permit persons to whom the Software is
13
furnished to do so, subject to the following conditions:
14
15
The above copyright notice and this permission notice shall be included in
16
all copies or substantial portions of the Software.
17
18
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24
DEALINGS IN THE SOFTWARE.  */
25
26
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
27
#include <config.h>
28
29
#include <ctype.h>
30
#include <string.h>
31
#include <stdlib.h>
32
#include <stdio.h>
33
#include <inttypes.h>
34
#include <errno.h>
35
#include <limits.h>
36
#include <unistd.h>
37
#include <assert.h>
38
39
#include "htslib/bgzf.h"
40
#include "htslib/faidx.h"
41
#include "htslib/hfile.h"
42
#include "htslib/khash.h"
43
#include "htslib/kstring.h"
44
#include "hts_internal.h"
45
46
// Faster isgraph; assumes ASCII
47
2.33M
static inline int isgraph_(unsigned char c) {
48
2.33M
    return c > ' ' && c <= '~';
49
2.33M
}
50
51
#ifdef isgraph
52
#  undef isgraph
53
#endif
54
2.33M
#define isgraph isgraph_
55
56
// An optimised bgzf_getc.
57
// We could consider moving this to bgzf.h, but our own code uses it here only.
58
6.83M
static inline int bgzf_getc_(BGZF *fp) {
59
6.83M
    if (fp->block_offset+1 < fp->block_length) {
60
6.83M
        int c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++];
61
6.83M
        fp->uncompressed_address++;
62
6.83M
        return c;
63
6.83M
    }
64
65
867
    return bgzf_getc(fp);
66
6.83M
}
67
6.83M
#define bgzf_getc bgzf_getc_
68
69
typedef struct {
70
    int id; // faidx_t->name[id] is for this struct.
71
    uint32_t line_len, line_blen;
72
    uint64_t len;
73
    uint64_t seq_offset;
74
    uint64_t qual_offset;
75
} faidx1_t;
76
KHASH_MAP_INIT_STR(s, faidx1_t)
77
78
struct faidx_t {
79
    BGZF *bgzf;
80
    int n, m;
81
    char **name;
82
    khash_t(s) *hash;
83
    enum fai_format_options format;
84
};
85
86
static int fai_name2id(void *v, const char *ref)
87
0
{
88
0
    faidx_t *fai = (faidx_t *)v;
89
0
    khint_t k = kh_get(s, fai->hash, ref);
90
0
    return k == kh_end(fai->hash) ? -1 : kh_val(fai->hash, k).id;
91
0
}
92
93
static inline int fai_insert_index(faidx_t *idx, const char *name, uint64_t len, uint32_t line_len, uint32_t line_blen, uint64_t seq_offset, uint64_t qual_offset)
94
6.58k
{
95
6.58k
    if (!name) {
96
0
        hts_log_error("Malformed line");
97
0
        return -1;
98
0
    }
99
100
6.58k
    char *name_key = strdup(name);
101
6.58k
    int absent;
102
6.58k
    khint_t k = kh_put(s, idx->hash, name_key, &absent);
103
6.58k
    faidx1_t *v = &kh_value(idx->hash, k);
104
105
6.58k
    if (! absent) {
106
5.03k
        hts_log_warning("Ignoring duplicate sequence \"%s\" at byte offset %" PRIu64, name, seq_offset);
107
5.03k
        free(name_key);
108
5.03k
        return 0;
109
5.03k
    }
110
111
1.54k
    if (idx->n == idx->m) {
112
166
        char **tmp;
113
166
        idx->m = idx->m? idx->m<<1 : 16;
114
166
        if (!(tmp = (char**)realloc(idx->name, sizeof(char*) * idx->m))) {
115
0
            hts_log_error("Out of memory");
116
0
            return -1;
117
0
        }
118
166
        idx->name = tmp;
119
166
    }
120
1.54k
    v->id = idx->n;
121
1.54k
    idx->name[idx->n++] = name_key;
122
1.54k
    v->len = len;
123
1.54k
    v->line_len = line_len;
124
1.54k
    v->line_blen = line_blen;
125
1.54k
    v->seq_offset = seq_offset;
126
1.54k
    v->qual_offset = qual_offset;
127
128
1.54k
    return 0;
129
1.54k
}
130
131
132
251
static faidx_t *fai_build_core(BGZF *bgzf) {
133
251
    kstring_t name = { 0, 0, NULL };
134
251
    int c, read_done, line_num;
135
251
    faidx_t *idx;
136
251
    uint64_t seq_offset, qual_offset;
137
251
    uint64_t seq_len, qual_len;
138
251
    uint64_t char_len, cl, line_len, ll;
139
251
    enum read_state {OUT_READ, IN_NAME, IN_SEQ, SEQ_END, IN_QUAL} state;
140
141
251
    idx = (faidx_t*)calloc(1, sizeof(faidx_t));
142
251
    idx->hash = kh_init(s);
143
251
    idx->format = FAI_NONE;
144
145
251
    state = OUT_READ, read_done = 0, line_num = 1;
146
251
    seq_offset = qual_offset = seq_len = qual_len = char_len = cl = line_len = ll = 0;
147
148
29.7k
    while ((c = bgzf_getc(bgzf)) >= 0) {
149
29.6k
        switch (state) {
150
3.46k
            case OUT_READ:
151
3.46k
                switch (c) {
152
2.80k
                    case '>':
153
2.80k
                        if (idx->format == FAI_FASTQ) {
154
0
                            hts_log_error("Found '>' in a FASTQ file, error at line %d", line_num);
155
0
                            goto fail;
156
0
                        }
157
158
2.80k
                        idx->format = FAI_FASTA;
159
2.80k
                        state = IN_NAME;
160
2.80k
                    break;
161
162
111
                    case '@':
163
111
                        if (idx->format == FAI_FASTA) {
164
11
                            hts_log_error("Found '@' in a FASTA file, error at line %d", line_num);
165
11
                            goto fail;
166
11
                        }
167
168
100
                        idx->format = FAI_FASTQ;
169
100
                        state = IN_NAME;
170
100
                    break;
171
172
78
                    case '\r':
173
                        // Blank line with cr-lf ending?
174
78
                        if ((c = bgzf_getc(bgzf)) == '\n') {
175
69
                            line_num++;
176
69
                        } else {
177
9
                            hts_log_error("Format error, carriage return not followed by new line at line %d", line_num);
178
9
                            goto fail;
179
9
                        }
180
69
                    break;
181
182
447
                    case '\n':
183
                        // just move onto the next line
184
447
                        line_num++;
185
447
                    break;
186
187
19
                    default: {
188
19
                        char s[4] = { '"', c, '"', '\0' };
189
19
                        hts_log_error("Format error, unexpected %s at line %d", isprint(c) ? s : "character", line_num);
190
19
                        goto fail;
191
78
                    }
192
3.46k
                }
193
3.42k
            break;
194
195
9.64k
            case IN_NAME:
196
9.64k
                if (read_done) {
197
6.53k
                    if (fai_insert_index(idx, name.s, seq_len, line_len, char_len, seq_offset, qual_offset) != 0)
198
0
                        goto fail;
199
200
6.53k
                    read_done = 0;
201
6.53k
                }
202
203
9.64k
                name.l = 0;
204
205
3.62M
                do {
206
3.62M
                    if (!isspace(c)) {
207
3.61M
                        kputc(c, &name);
208
3.61M
                    } else if (name.l > 0 || c == '\n') {
209
9.59k
                        break;
210
9.59k
                    }
211
3.62M
                } while ((c = bgzf_getc(bgzf)) >= 0);
212
213
9.64k
                kputsn("", 0, &name);
214
215
9.64k
                if (c < 0) {
216
49
                    hts_log_error("The last entry '%s' has no sequence at line %d", name.s, line_num);
217
49
                    goto fail;
218
49
                }
219
220
                // read the rest of the line if necessary
221
853k
                if (c != '\n') while ((c = bgzf_getc(bgzf)) >= 0 && c != '\n');
222
223
9.59k
                state = IN_SEQ; seq_len = qual_len = char_len = line_len = 0;
224
9.59k
                seq_offset = bgzf_utell(bgzf);
225
9.59k
                line_num++;
226
9.59k
            break;
227
228
16.4k
            case IN_SEQ:
229
16.4k
                if (idx->format == FAI_FASTA) {
230
16.2k
                    if (c == '\n') {
231
1.60k
                        state = OUT_READ;
232
1.60k
                        line_num++;
233
1.60k
                        continue;
234
14.6k
                    } else if (c == '>') {
235
6.73k
                        state = IN_NAME;
236
6.73k
                        continue;
237
6.73k
                    }
238
16.2k
                } else if (idx->format == FAI_FASTQ) {
239
232
                    if (c == '+') {
240
19
                        state = IN_QUAL;
241
2.05k
                        if (c != '\n') while ((c = bgzf_getc(bgzf)) >= 0 && c != '\n');
242
19
                        qual_offset = bgzf_utell(bgzf);
243
19
                        line_num++;
244
19
                        continue;
245
213
                    } else if (c == '\n') {
246
5
                        hts_log_error("Inlined empty line is not allowed in sequence '%s' at line %d", name.s, line_num);
247
5
                        goto fail;
248
5
                    }
249
232
                }
250
251
8.09k
                ll = cl = 0;
252
253
8.09k
                if (idx->format == FAI_FASTA) read_done = 1;
254
255
2.33M
                do {
256
2.33M
                    ll++;
257
2.33M
                    if (isgraph(c)) cl++;
258
2.33M
                } while ((c = bgzf_getc(bgzf)) >= 0 && c != '\n');
259
260
8.09k
                ll++; seq_len += cl;
261
262
8.09k
                if (line_len == 0) {
263
6.69k
                    line_len = ll;
264
6.69k
                    char_len = cl;
265
6.69k
                } else if (line_len > ll) {
266
267
1.15k
                    if (idx->format == FAI_FASTA)
268
1.10k
                        state = OUT_READ;
269
57
                    else
270
57
                        state = SEQ_END;
271
272
1.15k
                } else if (line_len < ll) {
273
12
                    hts_log_error("Different line length in sequence '%s' at line %d", name.s, line_num);
274
12
                    goto fail;
275
12
                }
276
277
8.08k
                line_num++;
278
8.08k
            break;
279
280
51
            case SEQ_END:
281
51
                if (c == '+') {
282
46
                    state = IN_QUAL;
283
2.25k
                    while ((c = bgzf_getc(bgzf)) >= 0 && c != '\n');
284
46
                    qual_offset = bgzf_utell(bgzf);
285
46
                    line_num++;
286
46
                } else {
287
5
                    hts_log_error("Format error, expecting '+', got '%c' at line %d", c, line_num);
288
5
                    goto fail;
289
5
                }
290
46
            break;
291
292
69
            case IN_QUAL:
293
69
                if (c == '\n') {
294
0
                    if (!read_done) {
295
0
                        hts_log_error("Inlined empty line is not allowed in quality of sequence '%s' at line %d", name.s, line_num);
296
0
                        goto fail;
297
0
                    }
298
299
0
                    state = OUT_READ;
300
0
                    line_num++;
301
0
                    continue;
302
69
                } else if (c == '@' && read_done) {
303
8
                    state = IN_NAME;
304
8
                    continue;
305
8
                }
306
307
61
                ll = cl = 0;
308
309
770
                do {
310
770
                    ll++;
311
770
                    if (isgraph(c)) cl++;
312
770
                } while ((c = bgzf_getc(bgzf)) >= 0 && c != '\n');
313
314
61
                ll++; qual_len += cl;
315
316
61
                if (line_len < ll) {
317
27
                    hts_log_error("Quality line length too long in '%s' at line %d", name.s, line_num);
318
27
                    goto fail;
319
34
                } else if (qual_len == seq_len) {
320
8
                    read_done = 1;
321
26
                } else if (qual_len > seq_len) {
322
0
                    hts_log_error("Quality length longer than sequence in '%s' at line %d", name.s, line_num);
323
0
                    goto fail;
324
26
                } else if (line_len > ll) {
325
15
                    hts_log_error("Quality line length too short in '%s' at line %d", name.s, line_num);
326
15
                    goto fail;
327
15
                }
328
329
19
                line_num++;
330
19
            break;
331
29.6k
        }
332
29.6k
    }
333
334
99
    if (read_done) {
335
44
        if (fai_insert_index(idx, name.s, seq_len, line_len, char_len, seq_offset, qual_offset) != 0)
336
0
            goto fail;
337
55
    } else {
338
55
        hts_log_error("File truncated at line %d", line_num);
339
55
        goto fail;
340
55
    }
341
342
44
    free(name.s);
343
44
    return idx;
344
345
207
fail:
346
207
    free(name.s);
347
207
    fai_destroy(idx);
348
207
    return NULL;
349
99
}
350
351
352
0
static int fai_save(const faidx_t *fai, hFILE *fp) {
353
0
    khint_t k;
354
0
    int i;
355
0
    char buf[96]; // Must be big enough for format below.
356
357
0
    for (i = 0; i < fai->n; ++i) {
358
0
        faidx1_t x;
359
0
        k = kh_get(s, fai->hash, fai->name[i]);
360
0
        assert(k < kh_end(fai->hash));
361
0
        x = kh_value(fai->hash, k);
362
363
0
        if (fai->format == FAI_FASTA) {
364
0
            snprintf(buf, sizeof(buf),
365
0
                 "\t%"PRIu64"\t%"PRIu64"\t%"PRIu32"\t%"PRIu32"\n",
366
0
                 x.len, x.seq_offset, x.line_blen, x.line_len);
367
0
        } else {
368
0
            snprintf(buf, sizeof(buf),
369
0
                 "\t%"PRIu64"\t%"PRIu64"\t%"PRIu32"\t%"PRIu32"\t%"PRIu64"\n",
370
0
                 x.len, x.seq_offset, x.line_blen, x.line_len, x.qual_offset);
371
0
        }
372
373
0
        if (hputs(fai->name[i], fp) != 0) return -1;
374
0
        if (hputs(buf, fp) != 0) return -1;
375
0
    }
376
0
    return 0;
377
0
}
378
379
380
static faidx_t *fai_read(hFILE *fp, const char *fname, int format)
381
0
{
382
0
    faidx_t *fai;
383
0
    char *buf = NULL, *p;
384
0
    ssize_t l, lnum = 1;
385
386
0
    fai = (faidx_t*)calloc(1, sizeof(faidx_t));
387
0
    if (!fai) return NULL;
388
389
0
    fai->hash = kh_init(s);
390
0
    if (!fai->hash) goto fail;
391
392
0
    buf = (char*)calloc(0x10000, 1);
393
0
    if (!buf) goto fail;
394
395
0
    while ((l = hgetln(buf, 0x10000, fp)) > 0) {
396
0
        uint32_t line_len, line_blen, n;
397
0
        uint64_t len;
398
0
        uint64_t seq_offset;
399
0
        uint64_t qual_offset = 0;
400
401
0
        for (p = buf; *p && !isspace_c(*p); ++p);
402
403
0
        if (p - buf < l) {
404
0
            *p = 0; ++p;
405
0
        }
406
407
0
        if (format == FAI_FASTA) {
408
0
            n = sscanf(p, "%"SCNu64"%"SCNu64"%"SCNu32"%"SCNu32, &len, &seq_offset, &line_blen, &line_len);
409
410
0
            if (n != 4) {
411
0
                hts_log_error("Could not understand FASTA index %s line %zd", fname, lnum);
412
0
                goto fail;
413
0
            }
414
0
        } else {
415
0
            n = sscanf(p, "%"SCNu64"%"SCNu64"%"SCNu32"%"SCNu32"%"SCNu64, &len, &seq_offset, &line_blen, &line_len, &qual_offset);
416
417
0
            if (n != 5) {
418
0
                if (n == 4) {
419
0
                    hts_log_error("Possibly this is a FASTA index, try using faidx.  Problem in %s line %zd", fname, lnum);
420
0
                } else {
421
0
                    hts_log_error("Could not understand FASTQ index %s line %zd", fname, lnum);
422
0
                }
423
424
0
                goto fail;
425
0
            }
426
0
        }
427
428
0
        if (fai_insert_index(fai, buf, len, line_len, line_blen, seq_offset, qual_offset) != 0) {
429
0
            goto fail;
430
0
        }
431
432
0
        if (buf[l - 1] == '\n') ++lnum;
433
0
    }
434
435
0
    if (l < 0) {
436
0
        hts_log_error("Error while reading %s: %s", fname, strerror(errno));
437
0
        goto fail;
438
0
    }
439
0
    free(buf);
440
0
    return fai;
441
442
0
 fail:
443
0
    free(buf);
444
0
    fai_destroy(fai);
445
0
    return NULL;
446
0
}
447
448
void fai_destroy(faidx_t *fai)
449
591
{
450
591
    int i;
451
591
    if (!fai) return;
452
1.79k
    for (i = 0; i < fai->n; ++i) free(fai->name[i]);
453
251
    free(fai->name);
454
251
    kh_destroy(s, fai->hash);
455
251
    if (fai->bgzf) bgzf_close(fai->bgzf);
456
251
    free(fai);
457
251
}
458
459
460
static int fai_build3_core(const char *fn, const char *fnfai, const char *fngzi)
461
384
{
462
384
    kstring_t fai_kstr = { 0, 0, NULL };
463
384
    kstring_t gzi_kstr = { 0, 0, NULL };
464
384
    BGZF *bgzf = NULL;
465
384
    hFILE *fp = NULL;
466
384
    faidx_t *fai = NULL;
467
384
    int save_errno, res;
468
384
    char *file_type;
469
470
384
    bgzf = bgzf_open(fn, "r");
471
472
384
    if ( !bgzf ) {
473
133
        hts_log_error("Failed to open the file %s : %s", fn, strerror(errno));
474
133
        goto fail;
475
133
    }
476
477
251
    if ( bgzf->is_compressed ) {
478
7
        if (bgzf_index_build_init(bgzf) != 0) {
479
0
            hts_log_error("Failed to allocate bgzf index");
480
0
            goto fail;
481
0
        }
482
7
    }
483
484
251
    fai = fai_build_core(bgzf);
485
486
251
    if ( !fai ) {
487
207
        if (bgzf->is_compressed && bgzf->is_gzip) {
488
7
            hts_log_error("Cannot index files compressed with gzip, please use bgzip");
489
7
        }
490
207
        goto fail;
491
207
    }
492
493
44
    if (fai->format == FAI_FASTA) {
494
44
        file_type   = "FASTA";
495
44
    } else {
496
0
        file_type   = "FASTQ";
497
0
    }
498
499
44
    if (!fnfai) {
500
44
        if (ksprintf(&fai_kstr, "%s.fai", fn) < 0) goto fail;
501
44
        fnfai = fai_kstr.s;
502
44
    }
503
504
44
    if (!fngzi) {
505
44
        if (ksprintf(&gzi_kstr, "%s.gzi", fn) < 0) goto fail;
506
44
        fngzi = gzi_kstr.s;
507
44
    }
508
509
44
    if ( bgzf->is_compressed ) {
510
0
        if (bgzf_index_dump(bgzf, fngzi, NULL) < 0) {
511
0
            hts_log_error("Failed to make bgzf index %s", fngzi);
512
0
            goto fail;
513
0
        }
514
0
    }
515
516
44
    res = bgzf_close(bgzf);
517
44
    bgzf = NULL;
518
519
44
    if (res < 0) {
520
0
        hts_log_error("Error on closing %s : %s", fn, strerror(errno));
521
0
        goto fail;
522
0
    }
523
524
44
    fp = hopen(fnfai, "wb");
525
526
44
    if ( !fp ) {
527
44
        hts_log_error("Failed to open %s index %s : %s", file_type, fnfai, strerror(errno));
528
44
        goto fail;
529
44
    }
530
531
0
    if (fai_save(fai, fp) != 0) {
532
0
        hts_log_error("Failed to write %s index %s : %s", file_type, fnfai, strerror(errno));
533
0
        goto fail;
534
0
    }
535
536
0
    if (hclose(fp) != 0) {
537
0
        hts_log_error("Failed on closing %s index %s : %s", file_type, fnfai, strerror(errno));
538
0
        goto fail;
539
0
    }
540
541
0
    free(fai_kstr.s);
542
0
    free(gzi_kstr.s);
543
0
    fai_destroy(fai);
544
0
    return 0;
545
546
384
 fail:
547
384
    save_errno = errno;
548
384
    free(fai_kstr.s);
549
384
    free(gzi_kstr.s);
550
384
    bgzf_close(bgzf);
551
384
    fai_destroy(fai);
552
384
    errno = save_errno;
553
384
    return -1;
554
0
}
555
556
557
384
int fai_build3(const char *fn, const char *fnfai, const char *fngzi) {
558
384
    return fai_build3_core(fn, fnfai, fngzi);
559
384
}
560
561
562
384
int fai_build(const char *fn) {
563
384
    return fai_build3(fn, NULL, NULL);
564
384
}
565
566
567
static faidx_t *fai_load3_core(const char *fn, const char *fnfai, const char *fngzi,
568
                   int flags, int format)
569
0
{
570
0
    kstring_t fai_kstr = { 0, 0, NULL };
571
0
    kstring_t gzi_kstr = { 0, 0, NULL };
572
0
    hFILE *fp = NULL;
573
0
    faidx_t *fai = NULL;
574
0
    int res, gzi_index_needed = 0;
575
0
    char *file_type;
576
577
0
    if (format == FAI_FASTA) {
578
0
        file_type   = "FASTA";
579
0
    } else {
580
0
        file_type   = "FASTQ";
581
0
    }
582
583
0
    if (fn == NULL)
584
0
        return NULL;
585
586
0
    if (fnfai == NULL) {
587
0
        if (ksprintf(&fai_kstr, "%s.fai", fn) < 0) goto fail;
588
0
        fnfai = fai_kstr.s;
589
0
    }
590
0
    if (fngzi == NULL) {
591
0
        if (ksprintf(&gzi_kstr, "%s.gzi", fn) < 0) goto fail;
592
0
        fngzi = gzi_kstr.s;
593
0
    }
594
595
0
    fp = hopen(fnfai, "rb");
596
597
0
    if (fp) {
598
        // index file present, check if a compressed index is needed
599
0
        hFILE *gz = NULL;
600
0
        BGZF *bgzf = bgzf_open(fn, "rb");
601
602
0
        if (bgzf == 0) {
603
0
            hts_log_error("Failed to open %s file %s", file_type, fn);
604
0
            goto fail;
605
0
        }
606
607
0
        if (bgzf_compression(bgzf) == 2) { // BGZF compression
608
0
            if ((gz = hopen(fngzi, "rb")) == 0) {
609
610
0
                if (!(flags & FAI_CREATE) || errno != ENOENT) {
611
0
                    hts_log_error("Failed to open %s index %s: %s", file_type, fngzi, strerror(errno));
612
0
                    bgzf_close(bgzf);
613
0
                    goto fail;
614
0
                }
615
616
0
                gzi_index_needed = 1;
617
0
                res = hclose(fp); // closed as going to be re-indexed
618
619
0
                if (res < 0) {
620
0
                    hts_log_error("Failed on closing %s index %s : %s", file_type, fnfai, strerror(errno));
621
0
                    goto fail;
622
0
                }
623
0
            } else {
624
0
                res = hclose(gz);
625
626
0
                if (res < 0) {
627
0
                    hts_log_error("Failed on closing %s index %s : %s", file_type, fngzi, strerror(errno));
628
0
                    goto fail;
629
0
                }
630
0
            }
631
0
        }
632
633
0
        bgzf_close(bgzf);
634
0
    }
635
636
0
    if (fp == 0 || gzi_index_needed) {
637
0
        if (!(flags & FAI_CREATE) || errno != ENOENT) {
638
0
            hts_log_error("Failed to open %s index %s: %s", file_type, fnfai, strerror(errno));
639
0
            goto fail;
640
0
        }
641
642
0
        hts_log_info("Build %s index", file_type);
643
644
0
        if (fai_build3_core(fn, fnfai, fngzi) < 0) {
645
0
            goto fail;
646
0
        }
647
648
0
        fp = hopen(fnfai, "rb");
649
0
        if (fp == 0) {
650
0
            hts_log_error("Failed to open %s index %s: %s", file_type, fnfai, strerror(errno));
651
0
            goto fail;
652
0
        }
653
0
    }
654
655
0
    fai = fai_read(fp, fnfai, format);
656
0
    if (fai == NULL) {
657
0
        hts_log_error("Failed to read %s index %s", file_type, fnfai);
658
0
        goto fail;
659
0
    }
660
661
0
    res = hclose(fp);
662
0
    fp = NULL;
663
0
    if (res < 0) {
664
0
        hts_log_error("Failed on closing %s index %s : %s", file_type, fnfai, strerror(errno));
665
0
        goto fail;
666
0
    }
667
668
0
    fai->bgzf = bgzf_open(fn, "rb");
669
0
    if (fai->bgzf == 0) {
670
0
        hts_log_error("Failed to open %s file %s", file_type, fn);
671
0
        goto fail;
672
0
    }
673
674
0
    if ( fai->bgzf->is_compressed==1 ) {
675
0
        if ( bgzf_index_load(fai->bgzf, fngzi, NULL) < 0 ) {
676
0
            hts_log_error("Failed to load .gzi index: %s", fngzi);
677
0
            goto fail;
678
0
        }
679
0
    }
680
0
    free(fai_kstr.s);
681
0
    free(gzi_kstr.s);
682
0
    return fai;
683
684
0
 fail:
685
0
    if (fai) fai_destroy(fai);
686
0
    if (fp) hclose_abruptly(fp);
687
0
    free(fai_kstr.s);
688
0
    free(gzi_kstr.s);
689
0
    return NULL;
690
0
}
691
692
693
faidx_t *fai_load3(const char *fn, const char *fnfai, const char *fngzi,
694
0
                   int flags) {
695
0
    return fai_load3_core(fn, fnfai, fngzi, flags, FAI_FASTA);
696
0
}
697
698
699
faidx_t *fai_load(const char *fn)
700
0
{
701
0
    return fai_load3(fn, NULL, NULL, FAI_CREATE);
702
0
}
703
704
705
faidx_t *fai_load3_format(const char *fn, const char *fnfai, const char *fngzi,
706
0
                   int flags, enum fai_format_options format) {
707
0
    return fai_load3_core(fn, fnfai, fngzi, flags, format);
708
0
}
709
710
711
0
faidx_t *fai_load_format(const char *fn, enum fai_format_options format) {
712
0
    return fai_load3_format(fn, NULL, NULL, FAI_CREATE, format);
713
0
}
714
715
716
static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val,
717
0
                          uint64_t offset, hts_pos_t beg, hts_pos_t end, hts_pos_t *len) {
718
0
    char *buffer, *s;
719
0
    ssize_t nread, remaining, firstline_len, firstline_blen;
720
0
    int ret;
721
722
0
    if ((uint64_t) end - (uint64_t) beg >= SIZE_MAX - 2) {
723
0
        hts_log_error("Range %"PRId64"..%"PRId64" too big", beg, end);
724
0
        *len = -1;
725
0
        return NULL;
726
0
    }
727
728
0
    if (val->line_blen <= 0) {
729
0
        hts_log_error("Invalid line length in index: %d", val->line_blen);
730
0
        *len = -1;
731
0
        return NULL;
732
0
    }
733
734
0
    ret = bgzf_useek(fai->bgzf,
735
0
                     offset
736
0
                     + beg / val->line_blen * val->line_len
737
0
                     + beg % val->line_blen, SEEK_SET);
738
739
0
    if (ret < 0) {
740
0
        *len = -1;
741
0
        hts_log_error("Failed to retrieve block. (Seeking in a compressed, .gzi unindexed, file?)");
742
0
        return NULL;
743
0
    }
744
745
    // Over-allocate so there is extra space for one end-of-line sequence
746
0
    buffer = (char*)malloc((size_t) end - beg + val->line_len - val->line_blen + 1);
747
0
    if (!buffer) {
748
0
        *len = -1;
749
0
        return NULL;
750
0
    }
751
752
0
    remaining = *len = end - beg;
753
0
    firstline_blen = val->line_blen - beg % val->line_blen;
754
755
    // Special case when the entire interval requested is within a single FASTA/Q line
756
0
    if (remaining <= firstline_blen) {
757
0
        nread = bgzf_read_small(fai->bgzf, buffer, remaining);
758
0
        if (nread < remaining) goto error;
759
0
        buffer[nread] = '\0';
760
0
        return buffer;
761
0
    }
762
763
0
    s = buffer;
764
0
    firstline_len = val->line_len - beg % val->line_blen;
765
766
    // Read the (partial) first line and its line terminator, but increment  s  past the
767
    // line contents only, so the terminator characters will be overwritten by the next line.
768
0
    nread = bgzf_read_small(fai->bgzf, s, firstline_len);
769
0
    if (nread < firstline_len) goto error;
770
0
    s += firstline_blen;
771
0
    remaining -= firstline_blen;
772
773
    // Similarly read complete lines and their line terminator characters, but overwrite the latter.
774
0
    while (remaining > val->line_blen) {
775
0
        nread = bgzf_read_small(fai->bgzf, s, val->line_len);
776
0
        if (nread < (ssize_t) val->line_len) goto error;
777
0
        s += val->line_blen;
778
0
        remaining -= val->line_blen;
779
0
    }
780
781
0
    if (remaining > 0) {
782
0
        nread = bgzf_read_small(fai->bgzf, s, remaining);
783
0
        if (nread < remaining) goto error;
784
0
        s += remaining;
785
0
    }
786
787
0
    *s = '\0';
788
0
    return buffer;
789
790
0
error:
791
0
    hts_log_error("Failed to retrieve block: %s",
792
0
                  (nread == 0)? "unexpected end of file" : "error reading file");
793
0
    free(buffer);
794
0
    *len = -1;
795
0
    return NULL;
796
0
}
797
798
static int fai_get_val(const faidx_t *fai, const char *str,
799
0
                       hts_pos_t *len, faidx1_t *val, hts_pos_t *fbeg, hts_pos_t *fend) {
800
0
    khiter_t iter;
801
0
    khash_t(s) *h;
802
0
    int id;
803
0
    hts_pos_t beg, end;
804
805
0
    if (!fai_parse_region(fai, str, &id, &beg, &end, 0)) {
806
0
        hts_log_warning("Reference %s not found in FASTA file, returning empty sequence", str);
807
0
        *len = -2;
808
0
        return 1;
809
0
    }
810
811
0
    h = fai->hash;
812
0
    iter = kh_get(s, h, faidx_iseq(fai, id));
813
0
    if (iter >= kh_end(h)) {
814
        // should have already been caught above
815
0
        abort();
816
0
    }
817
0
    *val = kh_value(h, iter);
818
819
0
    if (beg >= val->len) beg = val->len;
820
0
    if (end >= val->len) end = val->len;
821
0
    if (beg > end) beg = end;
822
823
0
    *fbeg = beg;
824
0
    *fend = end;
825
826
0
    return 0;
827
0
}
828
829
/*
830
 *  The internal still has line_blen as uint32_t, but our references
831
 *  can be longer, so for future proofing we use hts_pos_t.  We also needed
832
 *  a signed value so we can return negatives as an error.
833
 */
834
hts_pos_t fai_line_length(const faidx_t *fai, const char *str)
835
0
{
836
0
    faidx1_t val;
837
0
    int64_t beg, end;
838
0
    hts_pos_t len;
839
840
0
    if (fai_get_val(fai, str, &len, &val, &beg, &end))
841
0
        return -1;
842
0
    else
843
0
        return val.line_blen;
844
0
}
845
846
char *fai_fetch64(const faidx_t *fai, const char *str, hts_pos_t *len)
847
0
{
848
0
    faidx1_t val;
849
0
    int64_t beg, end;
850
851
0
    if (fai_get_val(fai, str, len, &val, &beg, &end)) {
852
0
        return NULL;
853
0
    }
854
855
    // now retrieve the sequence
856
0
    return fai_retrieve(fai, &val, val.seq_offset, beg, end, len);
857
0
}
858
859
char *fai_fetch(const faidx_t *fai, const char *str, int *len)
860
0
{
861
0
    hts_pos_t len64;
862
0
    char *ret = fai_fetch64(fai, str, &len64);
863
0
    *len = len64 < INT_MAX ? len64 : INT_MAX; // trunc
864
0
    return ret;
865
0
}
866
867
0
char *fai_fetchqual64(const faidx_t *fai, const char *str, hts_pos_t *len) {
868
0
    faidx1_t val;
869
0
    int64_t beg, end;
870
871
0
    if (fai_get_val(fai, str, len, &val, &beg, &end)) {
872
0
        return NULL;
873
0
    }
874
875
    // now retrieve the sequence
876
0
    return fai_retrieve(fai, &val, val.qual_offset, beg, end, len);
877
0
}
878
879
0
char *fai_fetchqual(const faidx_t *fai, const char *str, int *len) {
880
0
    hts_pos_t len64;
881
0
    char *ret = fai_fetchqual64(fai, str, &len64);
882
0
    *len = len64 < INT_MAX ? len64 : INT_MAX; // trunc
883
0
    return ret;
884
0
}
885
886
int faidx_fetch_nseq(const faidx_t *fai)
887
0
{
888
0
    return fai->n;
889
0
}
890
891
int faidx_nseq(const faidx_t *fai)
892
0
{
893
0
    return fai->n;
894
0
}
895
896
const char *faidx_iseq(const faidx_t *fai, int i)
897
0
{
898
0
    return fai->name[i];
899
0
}
900
901
hts_pos_t faidx_seq_len64(const faidx_t *fai, const char *seq)
902
0
{
903
0
    khint_t k = kh_get(s, fai->hash, seq);
904
0
    if ( k == kh_end(fai->hash) ) return -1;
905
0
    return kh_val(fai->hash, k).len;
906
0
}
907
908
int faidx_seq_len(const faidx_t *fai, const char *seq)
909
0
{
910
0
    hts_pos_t len = faidx_seq_len64(fai, seq);
911
0
    return len < INT_MAX ? len : INT_MAX;
912
0
}
913
914
static int faidx_adjust_position(const faidx_t *fai, int end_adjust,
915
                                 faidx1_t *val_out, const char *c_name,
916
                                 hts_pos_t *p_beg_i, hts_pos_t *p_end_i,
917
0
                                 hts_pos_t *len) {
918
0
    khiter_t iter;
919
0
    faidx1_t *val;
920
921
    // Adjust position
922
0
    iter = kh_get(s, fai->hash, c_name);
923
924
0
    if (iter == kh_end(fai->hash)) {
925
0
        if (len)
926
0
            *len = -2;
927
0
        hts_log_error("The sequence \"%s\" was not found", c_name);
928
0
        return 1;
929
0
    }
930
931
0
    val = &kh_value(fai->hash, iter);
932
933
0
    if (val_out)
934
0
        *val_out = *val;
935
936
0
    if(*p_end_i < *p_beg_i)
937
0
        *p_beg_i = *p_end_i;
938
939
0
    if(*p_beg_i < 0)
940
0
        *p_beg_i = 0;
941
0
    else if(val->len <= *p_beg_i)
942
0
        *p_beg_i = val->len;
943
944
0
    if(*p_end_i < 0)
945
0
        *p_end_i = 0;
946
0
    else if(val->len <= *p_end_i)
947
0
        *p_end_i = val->len - end_adjust;
948
949
0
    return 0;
950
0
}
951
952
int fai_adjust_region(const faidx_t *fai, int tid,
953
                      hts_pos_t *beg, hts_pos_t *end)
954
0
{
955
0
    hts_pos_t orig_beg, orig_end;
956
957
0
    if (!fai || !beg || !end || tid < 0 || tid >= fai->n)
958
0
        return -1;
959
960
0
    orig_beg = *beg;
961
0
    orig_end = *end;
962
0
    if (faidx_adjust_position(fai, 0, NULL, fai->name[tid], beg, end, NULL) != 0) {
963
0
        hts_log_error("Inconsistent faidx internal state - couldn't find \"%s\"",
964
0
                      fai->name[tid]);
965
0
        return -1;
966
0
    }
967
968
0
    return ((orig_beg != *beg ? 1 : 0) |
969
0
            (orig_end != *end && orig_end < HTS_POS_MAX ? 2 : 0));
970
0
}
971
972
char *faidx_fetch_seq64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len)
973
0
{
974
0
    faidx1_t val;
975
976
    // Adjust position
977
0
    if (faidx_adjust_position(fai, 1, &val, c_name, &p_beg_i, &p_end_i, len)) {
978
0
        return NULL;
979
0
    }
980
981
    // Now retrieve the sequence
982
0
    return fai_retrieve(fai, &val, val.seq_offset, p_beg_i, p_end_i + 1, len);
983
0
}
984
985
char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len)
986
0
{
987
0
    hts_pos_t len64;
988
0
    char *ret = faidx_fetch_seq64(fai, c_name, p_beg_i, p_end_i, &len64);
989
0
    *len = len64 < INT_MAX ? len64 : INT_MAX;  // trunc
990
0
    return ret;
991
0
}
992
993
char *faidx_fetch_qual64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len)
994
0
{
995
0
    faidx1_t val;
996
997
    // Adjust position
998
0
    if (faidx_adjust_position(fai, 1, &val, c_name, &p_beg_i, &p_end_i, len)) {
999
0
        return NULL;
1000
0
    }
1001
1002
    // Now retrieve the sequence
1003
0
    return fai_retrieve(fai, &val, val.qual_offset, p_beg_i, p_end_i + 1, len);
1004
0
}
1005
1006
char *faidx_fetch_qual(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len)
1007
0
{
1008
0
    hts_pos_t len64;
1009
0
    char *ret = faidx_fetch_qual64(fai, c_name, p_beg_i, p_end_i, &len64);
1010
0
    *len = len64 < INT_MAX ? len64 : INT_MAX;  // trunc
1011
0
    return ret;
1012
0
}
1013
1014
int faidx_has_seq(const faidx_t *fai, const char *seq)
1015
0
{
1016
0
    khiter_t iter = kh_get(s, fai->hash, seq);
1017
0
    if (iter == kh_end(fai->hash)) return 0;
1018
0
    return 1;
1019
0
}
1020
1021
const char *fai_parse_region(const faidx_t *fai, const char *s,
1022
                             int *tid, hts_pos_t *beg, hts_pos_t *end,
1023
                             int flags)
1024
0
{
1025
0
    return hts_parse_region(s, tid, beg, end, (hts_name2id_f)fai_name2id, (void *)fai, flags);
1026
0
}
1027
1028
0
void fai_set_cache_size(faidx_t *fai, int cache_size) {
1029
0
    bgzf_set_cache_size(fai->bgzf, cache_size);
1030
0
}
1031
1032
// Adds a thread pool to the underlying BGZF layer.
1033
0
int fai_thread_pool(faidx_t *fai, struct hts_tpool *pool, int qsize) {
1034
0
    return bgzf_thread_pool(fai->bgzf, pool, qsize);
1035
0
}
1036
1037
0
char *fai_path(const char *fa) {
1038
0
    char *fai = NULL;
1039
0
    if (!fa) {
1040
0
        hts_log_error("No reference file specified");
1041
0
    } else {
1042
0
        char *fai_tmp = strstr(fa, HTS_IDX_DELIM);
1043
0
        if (fai_tmp) {
1044
0
            fai_tmp += strlen(HTS_IDX_DELIM);
1045
0
            fai = strdup(fai_tmp);
1046
0
            if (!fai)
1047
0
                hts_log_error("Failed to allocate memory");
1048
0
        } else {
1049
0
            if (hisremote(fa)) {
1050
0
                fai = hts_idx_locatefn(fa, ".fai");       // get the remote fai file name, if any, but do not download the file
1051
0
                if (!fai)
1052
0
                    hts_log_error("Failed to locate index file for remote reference file '%s'", fa);
1053
0
            } else{
1054
0
                if (hts_idx_check_local(fa, HTS_FMT_FAI, &fai) == 0 && fai) {
1055
0
                    if (fai_build3(fa, fai, NULL) == -1) {      // create local fai file by indexing local fasta
1056
0
                        hts_log_error("Failed to build index file for reference file '%s'", fa);
1057
0
                        free(fai);
1058
0
                        fai = NULL;
1059
0
                    }
1060
0
                }
1061
0
            }
1062
0
        }
1063
0
    }
1064
1065
0
    return fai;
1066
0
}