Coverage Report

Created: 2026-05-30 06:09

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/faidx.c
Line
Count
Source
1
/*  faidx.c -- FASTA and FASTQ random access.
2
3
    Copyright (C) 2008, 2009, 2013-2020, 2022, 2024 Genome Research Ltd.
4
    Portions copyright (C) 2011 Broad Institute.
5
6
    Author: Heng Li <lh3@sanger.ac.uk>
7
8
Permission is hereby granted, free of charge, to any person obtaining a copy
9
of this software and associated documentation files (the "Software"), to deal
10
in the Software without restriction, including without limitation the rights
11
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
copies of the Software, and to permit persons to whom the Software is
13
furnished to do so, subject to the following conditions:
14
15
The above copyright notice and this permission notice shall be included in
16
all copies or substantial portions of the Software.
17
18
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24
DEALINGS IN THE SOFTWARE.  */
25
26
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
27
#include <config.h>
28
29
#include <ctype.h>
30
#include <string.h>
31
#include <stdlib.h>
32
#include <stdio.h>
33
#include <inttypes.h>
34
#include <errno.h>
35
#include <limits.h>
36
#include <unistd.h>
37
#include <assert.h>
38
39
#include "htslib/bgzf.h"
40
#include "htslib/faidx.h"
41
#include "htslib/hfile.h"
42
#include "htslib/hts_alloc.h"
43
#include "htslib/khash.h"
44
#include "htslib/kstring.h"
45
#include "hts_internal.h"
46
47
// Faster isgraph; assumes ASCII
48
0
static inline int isgraph_(unsigned char c) {
49
0
    return c > ' ' && c <= '~';
50
0
}
51
52
#ifdef isgraph
53
#  undef isgraph
54
#endif
55
0
#define isgraph isgraph_
56
57
// An optimised bgzf_getc.
58
// We could consider moving this to bgzf.h, but our own code uses it here only.
59
0
static inline int bgzf_getc_(BGZF *fp) {
60
0
    if (fp->block_offset+1 < fp->block_length) {
61
0
        int c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++];
62
0
        fp->uncompressed_address++;
63
0
        return c;
64
0
    }
65
66
0
    return bgzf_getc(fp);
67
0
}
68
0
#define bgzf_getc bgzf_getc_
69
70
typedef struct {
71
    int id; // faidx_t->name[id] is for this struct.
72
    uint32_t line_extra;  // Bytes to skip at the end of each line
73
                          // Usually 1 for \n or 2 for \r\n ending
74
    uint64_t line_blen;   // Number of bases (or quality values) on each line
75
                          // Total line length = line_blen + line_extra
76
    uint64_t len;         // Length of entire sequence
77
    uint64_t seq_offset;  // File offset to start of sequence
78
    uint64_t qual_offset; // File offset to start of quality values (if fastq)
79
} faidx1_t;
80
KHASH_MAP_INIT_STR(s, faidx1_t)
81
82
struct faidx_t {
83
    BGZF *bgzf;
84
    int n, m;
85
    char **name;
86
    khash_t(s) *hash;
87
    enum fai_format_options format;
88
};
89
90
static int fai_name2id(void *v, const char *ref)
91
0
{
92
0
    faidx_t *fai = (faidx_t *)v;
93
0
    khint_t k = kh_get(s, fai->hash, ref);
94
0
    return k == kh_end(fai->hash) ? -1 : kh_val(fai->hash, k).id;
95
0
}
96
97
static inline int fai_insert_index(faidx_t *idx, const char *name, uint64_t len, uint64_t line_len, uint64_t line_blen, uint64_t seq_offset, uint64_t qual_offset)
98
0
{
99
0
    if (!name) {
100
0
        hts_log_error("Malformed line: no name");
101
0
        return -1;
102
0
    }
103
0
    if (line_len > HTS_POS_MAX || line_blen > HTS_POS_MAX) {
104
0
        hts_log_error("Malformed line: length out of bounds");
105
0
        return -1;
106
0
    }
107
0
    if (line_len < line_blen) {
108
0
        hts_log_error("Malformed line: width (%"PRIu64") less than base count (%"PRIu64")",
109
0
                      line_len, line_blen);
110
0
        return -1;
111
0
    }
112
0
    if (line_len - line_blen > UINT32_MAX) {
113
0
        hts_log_error("Malformed line: difference between width (%"PRIu64") and base count (%"PRIu64") too large",
114
0
                      line_len, line_blen);
115
0
        return -1;
116
0
    }
117
118
0
    char *name_key = strdup(name);
119
0
    int absent;
120
0
    khint_t k = kh_put(s, idx->hash, name_key, &absent);
121
0
    faidx1_t *v = &kh_value(idx->hash, k);
122
123
0
    if (! absent) {
124
0
        hts_log_warning("Ignoring duplicate sequence \"%s\" at byte offset %" PRIu64, name, seq_offset);
125
0
        free(name_key);
126
0
        return 0;
127
0
    }
128
129
0
    if (idx->n == idx->m) {
130
0
        char **tmp;
131
0
        idx->m = idx->m? idx->m<<1 : 16;
132
0
        if (!(tmp = hts_realloc_p(idx->name, sizeof(char*), idx->m))) {
133
0
            hts_log_error("Out of memory");
134
0
            return -1;
135
0
        }
136
0
        idx->name = tmp;
137
0
    }
138
0
    v->id = idx->n;
139
0
    idx->name[idx->n++] = name_key;
140
0
    v->len = len;
141
0
    v->line_extra = (uint32_t) (line_len - line_blen);
142
0
    v->line_blen = line_blen;
143
0
    v->seq_offset = seq_offset;
144
0
    v->qual_offset = qual_offset;
145
146
0
    return 0;
147
0
}
148
149
150
0
static faidx_t *fai_build_core(BGZF *bgzf) {
151
0
    kstring_t name = { 0, 0, NULL };
152
0
    int c, read_done, line_num;
153
0
    faidx_t *idx;
154
0
    uint64_t seq_offset, qual_offset;
155
0
    uint64_t seq_len, qual_len;
156
0
    uint64_t char_len, cl, line_len, ll;
157
0
    enum read_state {OUT_READ, IN_NAME, IN_SEQ, SEQ_END, IN_QUAL} state;
158
159
0
    idx = (faidx_t*)calloc(1, sizeof(faidx_t));
160
0
    idx->hash = kh_init(s);
161
0
    idx->format = FAI_NONE;
162
163
0
    state = OUT_READ, read_done = 0, line_num = 1;
164
0
    seq_offset = qual_offset = seq_len = qual_len = char_len = cl = line_len = ll = 0;
165
166
0
    while ((c = bgzf_getc(bgzf)) >= 0) {
167
0
        switch (state) {
168
0
            case OUT_READ:
169
0
                switch (c) {
170
0
                    case '>':
171
0
                        if (idx->format == FAI_FASTQ) {
172
0
                            hts_log_error("Found '>' in a FASTQ file, error at line %d", line_num);
173
0
                            goto fail;
174
0
                        }
175
176
0
                        idx->format = FAI_FASTA;
177
0
                        state = IN_NAME;
178
0
                    break;
179
180
0
                    case '@':
181
0
                        if (idx->format == FAI_FASTA) {
182
0
                            hts_log_error("Found '@' in a FASTA file, error at line %d", line_num);
183
0
                            goto fail;
184
0
                        }
185
186
0
                        idx->format = FAI_FASTQ;
187
0
                        state = IN_NAME;
188
0
                    break;
189
190
0
                    case '\r':
191
                        // Blank line with cr-lf ending?
192
0
                        if ((c = bgzf_getc(bgzf)) == '\n') {
193
0
                            line_num++;
194
0
                        } else {
195
0
                            hts_log_error("Format error, carriage return not followed by new line at line %d", line_num);
196
0
                            goto fail;
197
0
                        }
198
0
                    break;
199
200
0
                    case '\n':
201
                        // just move onto the next line
202
0
                        line_num++;
203
0
                    break;
204
205
0
                    default: {
206
0
                        char s[4] = { '"', c, '"', '\0' };
207
0
                        hts_log_error("Format error, unexpected %s at line %d", isprint(c) ? s : "character", line_num);
208
0
                        goto fail;
209
0
                    }
210
0
                }
211
0
            break;
212
213
0
            case IN_NAME:
214
0
                if (read_done) {
215
0
                    if (fai_insert_index(idx, name.s, seq_len, line_len, char_len, seq_offset, qual_offset) != 0)
216
0
                        goto fail;
217
218
0
                    read_done = 0;
219
0
                }
220
221
0
                name.l = 0;
222
223
0
                do {
224
0
                    if (!isspace(c)) {
225
0
                        kputc(c, &name);
226
0
                    } else if (name.l > 0 || c == '\n') {
227
0
                        break;
228
0
                    }
229
0
                } while ((c = bgzf_getc(bgzf)) >= 0);
230
231
0
                kputsn("", 0, &name);
232
233
0
                if (c < 0) {
234
0
                    hts_log_error("The last entry '%s' has no sequence at line %d", name.s, line_num);
235
0
                    goto fail;
236
0
                }
237
238
                // read the rest of the line if necessary
239
0
                if (c != '\n') while ((c = bgzf_getc(bgzf)) >= 0 && c != '\n');
240
241
0
                state = IN_SEQ; seq_len = qual_len = char_len = line_len = 0;
242
0
                seq_offset = bgzf_utell(bgzf);
243
0
                line_num++;
244
0
            break;
245
246
0
            case IN_SEQ:
247
0
                if (idx->format == FAI_FASTA) {
248
0
                    if (c == '\n') {
249
0
                        state = OUT_READ;
250
0
                        line_num++;
251
0
                        continue;
252
0
                    } else if (c == '>') {
253
0
                        state = IN_NAME;
254
0
                        continue;
255
0
                    }
256
0
                } else if (idx->format == FAI_FASTQ) {
257
0
                    if (c == '+') {
258
0
                        state = IN_QUAL;
259
0
                        if (c != '\n') while ((c = bgzf_getc(bgzf)) >= 0 && c != '\n');
260
0
                        qual_offset = bgzf_utell(bgzf);
261
0
                        line_num++;
262
0
                        continue;
263
0
                    } else if (c == '\n') {
264
0
                        hts_log_error("Inlined empty line is not allowed in sequence '%s' at line %d", name.s, line_num);
265
0
                        goto fail;
266
0
                    }
267
0
                }
268
269
0
                ll = cl = 0;
270
271
0
                if (idx->format == FAI_FASTA) read_done = 1;
272
273
0
                do {
274
0
                    ll++;
275
0
                    if (isgraph(c)) cl++;
276
0
                } while ((c = bgzf_getc(bgzf)) >= 0 && c != '\n');
277
278
0
                ll++; seq_len += cl;
279
280
0
                if (line_len == 0) {
281
0
                    line_len = ll;
282
0
                    char_len = cl;
283
0
                } else if (line_len > ll) {
284
285
0
                    if (idx->format == FAI_FASTA)
286
0
                        state = OUT_READ;
287
0
                    else
288
0
                        state = SEQ_END;
289
290
0
                } else if (line_len < ll) {
291
0
                    hts_log_error("Different line length in sequence '%s' at line %d", name.s, line_num);
292
0
                    goto fail;
293
0
                }
294
295
0
                line_num++;
296
0
            break;
297
298
0
            case SEQ_END:
299
0
                if (c == '+') {
300
0
                    state = IN_QUAL;
301
0
                    while ((c = bgzf_getc(bgzf)) >= 0 && c != '\n');
302
0
                    qual_offset = bgzf_utell(bgzf);
303
0
                    line_num++;
304
0
                } else {
305
0
                    hts_log_error("Format error, expecting '+', got '%c' at line %d", c, line_num);
306
0
                    goto fail;
307
0
                }
308
0
            break;
309
310
0
            case IN_QUAL:
311
0
                if (c == '\n') {
312
0
                    if (!read_done) {
313
0
                        hts_log_error("Inlined empty line is not allowed in quality of sequence '%s' at line %d", name.s, line_num);
314
0
                        goto fail;
315
0
                    }
316
317
0
                    state = OUT_READ;
318
0
                    line_num++;
319
0
                    continue;
320
0
                } else if (c == '@' && read_done) {
321
0
                    state = IN_NAME;
322
0
                    continue;
323
0
                }
324
325
0
                ll = cl = 0;
326
327
0
                do {
328
0
                    ll++;
329
0
                    if (isgraph(c)) cl++;
330
0
                } while ((c = bgzf_getc(bgzf)) >= 0 && c != '\n');
331
332
0
                ll++; qual_len += cl;
333
334
0
                if (line_len < ll) {
335
0
                    hts_log_error("Quality line length too long in '%s' at line %d", name.s, line_num);
336
0
                    goto fail;
337
0
                } else if (qual_len == seq_len) {
338
0
                    read_done = 1;
339
0
                } else if (qual_len > seq_len) {
340
0
                    hts_log_error("Quality length longer than sequence in '%s' at line %d", name.s, line_num);
341
0
                    goto fail;
342
0
                } else if (line_len > ll) {
343
0
                    hts_log_error("Quality line length too short in '%s' at line %d", name.s, line_num);
344
0
                    goto fail;
345
0
                }
346
347
0
                line_num++;
348
0
            break;
349
0
        }
350
0
    }
351
352
0
    if (read_done) {
353
0
        if (fai_insert_index(idx, name.s, seq_len, line_len, char_len, seq_offset, qual_offset) != 0)
354
0
            goto fail;
355
0
    } else {
356
0
        hts_log_error("File truncated at line %d", line_num);
357
0
        goto fail;
358
0
    }
359
360
0
    free(name.s);
361
0
    return idx;
362
363
0
fail:
364
0
    free(name.s);
365
0
    fai_destroy(idx);
366
0
    return NULL;
367
0
}
368
369
370
0
static int fai_save(const faidx_t *fai, hFILE *fp) {
371
0
    khint_t k;
372
0
    int i;
373
0
    char buf[96]; // Must be big enough for format below.
374
375
0
    for (i = 0; i < fai->n; ++i) {
376
0
        faidx1_t x;
377
0
        k = kh_get(s, fai->hash, fai->name[i]);
378
0
        assert(k < kh_end(fai->hash));
379
0
        x = kh_value(fai->hash, k);
380
381
0
        if (fai->format == FAI_FASTA) {
382
0
            snprintf(buf, sizeof(buf),
383
0
                 "\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\n",
384
0
                 x.len, x.seq_offset, x.line_blen, x.line_blen + x.line_extra);
385
0
        } else {
386
0
            snprintf(buf, sizeof(buf),
387
0
                 "\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\n",
388
0
                     x.len, x.seq_offset, x.line_blen,
389
0
                     x.line_blen + x.line_extra, x.qual_offset);
390
0
        }
391
392
0
        if (hputs(fai->name[i], fp) != 0) return -1;
393
0
        if (hputs(buf, fp) != 0) return -1;
394
0
    }
395
0
    return 0;
396
0
}
397
398
399
static faidx_t *fai_read(hFILE *fp, const char *fname, int format)
400
0
{
401
0
    faidx_t *fai;
402
0
    char *buf = NULL, *p;
403
0
    ssize_t l, lnum = 1;
404
405
0
    fai = (faidx_t*)calloc(1, sizeof(faidx_t));
406
0
    if (!fai) return NULL;
407
408
0
    fai->hash = kh_init(s);
409
0
    if (!fai->hash) goto fail;
410
411
0
    buf = (char*)calloc(0x10000, 1);
412
0
    if (!buf) goto fail;
413
414
0
    while ((l = hgetln(buf, 0x10000, fp)) > 0) {
415
0
        uint64_t line_len, line_blen;
416
0
        uint64_t len;
417
0
        uint64_t seq_offset;
418
0
        uint64_t qual_offset = 0;
419
0
        int n;
420
421
0
        for (p = buf; *p && !isspace_c(*p); ++p);
422
423
0
        if (p - buf < l) {
424
0
            *p = 0; ++p;
425
0
        }
426
427
0
        if (format == FAI_FASTA) {
428
0
            n = sscanf(p, "%"SCNu64"%"SCNu64"%"SCNu64"%"SCNu64, &len, &seq_offset, &line_blen, &line_len);
429
430
0
            if (n != 4) {
431
0
                hts_log_error("Could not understand FASTA index %s line %zd", fname, lnum);
432
0
                goto fail;
433
0
            }
434
0
        } else {
435
0
            n = sscanf(p, "%"SCNu64"%"SCNu64"%"SCNu64"%"SCNu64"%"SCNu64, &len, &seq_offset, &line_blen, &line_len, &qual_offset);
436
437
0
            if (n != 5) {
438
0
                if (n == 4) {
439
0
                    hts_log_error("Possibly this is a FASTA index, try using faidx.  Problem in %s line %zd", fname, lnum);
440
0
                } else {
441
0
                    hts_log_error("Could not understand FASTQ index %s line %zd", fname, lnum);
442
0
                }
443
444
0
                goto fail;
445
0
            }
446
0
        }
447
448
0
        if (fai_insert_index(fai, buf, len, line_len, line_blen, seq_offset, qual_offset) != 0) {
449
0
            goto fail;
450
0
        }
451
452
0
        if (buf[l - 1] == '\n') ++lnum;
453
0
    }
454
455
0
    if (l < 0) {
456
0
        hts_log_error("Error while reading %s: %s", fname, strerror(errno));
457
0
        goto fail;
458
0
    }
459
0
    free(buf);
460
0
    return fai;
461
462
0
 fail:
463
0
    free(buf);
464
0
    fai_destroy(fai);
465
0
    return NULL;
466
0
}
467
468
void fai_destroy(faidx_t *fai)
469
2
{
470
2
    int i;
471
2
    if (!fai) return;
472
0
    for (i = 0; i < fai->n; ++i) free(fai->name[i]);
473
0
    free(fai->name);
474
0
    kh_destroy(s, fai->hash);
475
0
    if (fai->bgzf) bgzf_close(fai->bgzf);
476
0
    free(fai);
477
0
}
478
479
480
static int fai_build3_core(const char *fn, const char *fnfai, const char *fngzi)
481
2
{
482
2
    kstring_t fai_kstr = { 0, 0, NULL };
483
2
    kstring_t gzi_kstr = { 0, 0, NULL };
484
2
    BGZF *bgzf = NULL;
485
2
    hFILE *fp = NULL;
486
2
    faidx_t *fai = NULL;
487
2
    int save_errno, res;
488
2
    char *file_type;
489
490
2
    bgzf = bgzf_open(fn, "r");
491
492
2
    if ( !bgzf ) {
493
2
        hts_log_error("Failed to open the file %s : %s", fn, strerror(errno));
494
2
        goto fail;
495
2
    }
496
497
0
    if ( bgzf->is_compressed ) {
498
0
        if (bgzf_index_build_init(bgzf) != 0) {
499
0
            hts_log_error("Failed to allocate bgzf index");
500
0
            goto fail;
501
0
        }
502
0
    }
503
504
0
    fai = fai_build_core(bgzf);
505
506
0
    if ( !fai ) {
507
0
        if (bgzf->is_compressed && bgzf->is_gzip) {
508
0
            hts_log_error("Cannot index files compressed with gzip, please use bgzip");
509
0
        }
510
0
        goto fail;
511
0
    }
512
513
0
    if (fai->format == FAI_FASTA) {
514
0
        file_type   = "FASTA";
515
0
    } else {
516
0
        file_type   = "FASTQ";
517
0
    }
518
519
0
    if (!fnfai) {
520
0
        if (ksprintf(&fai_kstr, "%s.fai", fn) < 0) goto fail;
521
0
        fnfai = fai_kstr.s;
522
0
    }
523
524
0
    if (!fngzi) {
525
0
        if (ksprintf(&gzi_kstr, "%s.gzi", fn) < 0) goto fail;
526
0
        fngzi = gzi_kstr.s;
527
0
    }
528
529
0
    if ( bgzf->is_compressed ) {
530
0
        if (bgzf_index_dump(bgzf, fngzi, NULL) < 0) {
531
0
            hts_log_error("Failed to make bgzf index %s", fngzi);
532
0
            goto fail;
533
0
        }
534
0
    }
535
536
0
    res = bgzf_close(bgzf);
537
0
    bgzf = NULL;
538
539
0
    if (res < 0) {
540
0
        hts_log_error("Error on closing %s : %s", fn, strerror(errno));
541
0
        goto fail;
542
0
    }
543
544
0
    fp = hopen(fnfai, "wb");
545
546
0
    if ( !fp ) {
547
0
        hts_log_error("Failed to open %s index %s : %s", file_type, fnfai, strerror(errno));
548
0
        goto fail;
549
0
    }
550
551
0
    if (fai_save(fai, fp) != 0) {
552
0
        hts_log_error("Failed to write %s index %s : %s", file_type, fnfai, strerror(errno));
553
0
        goto fail;
554
0
    }
555
556
0
    if (hclose(fp) != 0) {
557
0
        hts_log_error("Failed on closing %s index %s : %s", file_type, fnfai, strerror(errno));
558
0
        goto fail;
559
0
    }
560
561
0
    free(fai_kstr.s);
562
0
    free(gzi_kstr.s);
563
0
    fai_destroy(fai);
564
0
    return 0;
565
566
2
 fail:
567
2
    save_errno = errno;
568
2
    free(fai_kstr.s);
569
2
    free(gzi_kstr.s);
570
2
    bgzf_close(bgzf);
571
2
    fai_destroy(fai);
572
2
    errno = save_errno;
573
2
    return -1;
574
0
}
575
576
577
2
int fai_build3(const char *fn, const char *fnfai, const char *fngzi) {
578
2
    return fai_build3_core(fn, fnfai, fngzi);
579
2
}
580
581
582
2
int fai_build(const char *fn) {
583
2
    return fai_build3(fn, NULL, NULL);
584
2
}
585
586
587
static faidx_t *fai_load3_core(const char *fn, const char *fnfai, const char *fngzi,
588
                   int flags, int format)
589
0
{
590
0
    kstring_t fai_kstr = { 0, 0, NULL };
591
0
    kstring_t gzi_kstr = { 0, 0, NULL };
592
0
    hFILE *fp = NULL;
593
0
    faidx_t *fai = NULL;
594
0
    int res, gzi_index_needed = 0;
595
0
    char *file_type;
596
597
0
    if (format == FAI_FASTA) {
598
0
        file_type   = "FASTA";
599
0
    } else {
600
0
        file_type   = "FASTQ";
601
0
    }
602
603
0
    if (fn == NULL)
604
0
        return NULL;
605
606
0
    if (fnfai == NULL) {
607
0
        if (ksprintf(&fai_kstr, "%s.fai", fn) < 0) goto fail;
608
0
        fnfai = fai_kstr.s;
609
0
    }
610
0
    if (fngzi == NULL) {
611
0
        if (ksprintf(&gzi_kstr, "%s.gzi", fn) < 0) goto fail;
612
0
        fngzi = gzi_kstr.s;
613
0
    }
614
615
0
    fp = hopen(fnfai, "rb");
616
617
0
    if (fp) {
618
        // index file present, check if a compressed index is needed
619
0
        hFILE *gz = NULL;
620
0
        BGZF *bgzf = bgzf_open(fn, "rb");
621
622
0
        if (bgzf == 0) {
623
0
            hts_log_error("Failed to open %s file %s", file_type, fn);
624
0
            goto fail;
625
0
        }
626
627
0
        if (bgzf_compression(bgzf) == 2) { // BGZF compression
628
0
            if ((gz = hopen(fngzi, "rb")) == 0) {
629
630
0
                if (!(flags & FAI_CREATE) || errno != ENOENT) {
631
0
                    hts_log_error("Failed to open %s index %s: %s", file_type, fngzi, strerror(errno));
632
0
                    bgzf_close(bgzf);
633
0
                    goto fail;
634
0
                }
635
636
0
                gzi_index_needed = 1;
637
0
                res = hclose(fp); // closed as going to be re-indexed
638
639
0
                if (res < 0) {
640
0
                    hts_log_error("Failed on closing %s index %s : %s", file_type, fnfai, strerror(errno));
641
0
                    goto fail;
642
0
                }
643
0
            } else {
644
0
                res = hclose(gz);
645
646
0
                if (res < 0) {
647
0
                    hts_log_error("Failed on closing %s index %s : %s", file_type, fngzi, strerror(errno));
648
0
                    goto fail;
649
0
                }
650
0
            }
651
0
        }
652
653
0
        bgzf_close(bgzf);
654
0
    }
655
656
0
    if (fp == 0 || gzi_index_needed) {
657
0
        if (!(flags & FAI_CREATE) || errno != ENOENT) {
658
0
            hts_log_error("Failed to open %s index %s: %s", file_type, fnfai, strerror(errno));
659
0
            goto fail;
660
0
        }
661
662
0
        hts_log_info("Build %s index", file_type);
663
664
0
        if (fai_build3_core(fn, fnfai, fngzi) < 0) {
665
0
            goto fail;
666
0
        }
667
668
0
        fp = hopen(fnfai, "rb");
669
0
        if (fp == 0) {
670
0
            hts_log_error("Failed to open %s index %s: %s", file_type, fnfai, strerror(errno));
671
0
            goto fail;
672
0
        }
673
0
    }
674
675
0
    fai = fai_read(fp, fnfai, format);
676
0
    if (fai == NULL) {
677
0
        hts_log_error("Failed to read %s index %s", file_type, fnfai);
678
0
        goto fail;
679
0
    }
680
681
0
    res = hclose(fp);
682
0
    fp = NULL;
683
0
    if (res < 0) {
684
0
        hts_log_error("Failed on closing %s index %s : %s", file_type, fnfai, strerror(errno));
685
0
        goto fail;
686
0
    }
687
688
0
    fai->bgzf = bgzf_open(fn, "rb");
689
0
    if (fai->bgzf == 0) {
690
0
        hts_log_error("Failed to open %s file %s", file_type, fn);
691
0
        goto fail;
692
0
    }
693
694
0
    if ( fai->bgzf->is_compressed==1 ) {
695
0
        if ( bgzf_index_load(fai->bgzf, fngzi, NULL) < 0 ) {
696
0
            hts_log_error("Failed to load .gzi index: %s", fngzi);
697
0
            goto fail;
698
0
        }
699
0
    }
700
0
    free(fai_kstr.s);
701
0
    free(gzi_kstr.s);
702
0
    return fai;
703
704
0
 fail:
705
0
    if (fai) fai_destroy(fai);
706
0
    if (fp) hclose_abruptly(fp);
707
0
    free(fai_kstr.s);
708
0
    free(gzi_kstr.s);
709
0
    return NULL;
710
0
}
711
712
713
faidx_t *fai_load3(const char *fn, const char *fnfai, const char *fngzi,
714
0
                   int flags) {
715
0
    return fai_load3_core(fn, fnfai, fngzi, flags, FAI_FASTA);
716
0
}
717
718
719
faidx_t *fai_load(const char *fn)
720
0
{
721
0
    return fai_load3(fn, NULL, NULL, FAI_CREATE);
722
0
}
723
724
725
faidx_t *fai_load3_format(const char *fn, const char *fnfai, const char *fngzi,
726
0
                   int flags, enum fai_format_options format) {
727
0
    return fai_load3_core(fn, fnfai, fngzi, flags, format);
728
0
}
729
730
731
0
faidx_t *fai_load_format(const char *fn, enum fai_format_options format) {
732
0
    return fai_load3_format(fn, NULL, NULL, FAI_CREATE, format);
733
0
}
734
735
736
static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val,
737
0
                          uint64_t offset, hts_pos_t beg, hts_pos_t end, hts_pos_t *len) {
738
0
    char *buffer, *s;
739
0
    ssize_t nread, remaining, firstline_len, firstline_blen;
740
0
    int ret;
741
742
0
    if ((uint64_t) end - (uint64_t) beg >= SIZE_MAX - 2) {
743
0
        hts_log_error("Range %"PRId64"..%"PRId64" too big", beg, end);
744
0
        *len = -1;
745
0
        return NULL;
746
0
    }
747
748
0
    if (val->line_blen <= 0) {
749
0
        hts_log_error("Invalid line length in index: %"PRIu64, val->line_blen);
750
0
        *len = -1;
751
0
        return NULL;
752
0
    }
753
754
0
    uint64_t line_len = val->line_blen + val->line_extra;
755
756
0
    ret = bgzf_useek(fai->bgzf,
757
0
                     offset
758
0
                     + beg / val->line_blen * line_len
759
0
                     + beg % val->line_blen, SEEK_SET);
760
761
0
    if (ret < 0) {
762
0
        *len = -1;
763
0
        hts_log_error("Failed to retrieve block. (Seeking in a compressed, .gzi unindexed, file?)");
764
0
        return NULL;
765
0
    }
766
767
    // Over-allocate so there is extra space for one end-of-line sequence
768
0
    buffer = hts_malloc(hts_add_sat3(end - beg, line_len - val->line_blen, 1));
769
0
    if (!buffer) {
770
0
        *len = -1;
771
0
        return NULL;
772
0
    }
773
774
0
    remaining = *len = end - beg;
775
0
    firstline_blen = val->line_blen - beg % val->line_blen;
776
777
    // Special case when the entire interval requested is within a single FASTA/Q line
778
0
    if (remaining <= firstline_blen) {
779
0
        nread = bgzf_read_small(fai->bgzf, buffer, remaining);
780
0
        if (nread < remaining) goto error;
781
0
        buffer[nread] = '\0';
782
0
        return buffer;
783
0
    }
784
785
0
    s = buffer;
786
0
    firstline_len = line_len - beg % val->line_blen;
787
788
    // Read the (partial) first line and its line terminator, but increment  s  past the
789
    // line contents only, so the terminator characters will be overwritten by the next line.
790
0
    nread = bgzf_read_small(fai->bgzf, s, firstline_len);
791
0
    if (nread < firstline_len) goto error;
792
0
    s += firstline_blen;
793
0
    remaining -= firstline_blen;
794
795
    // Similarly read complete lines and their line terminator characters, but overwrite the latter.
796
0
    while (remaining > val->line_blen) {
797
0
        nread = bgzf_read_small(fai->bgzf, s, line_len);
798
0
        if (nread < (ssize_t) line_len) goto error;
799
0
        s += val->line_blen;
800
0
        remaining -= val->line_blen;
801
0
    }
802
803
0
    if (remaining > 0) {
804
0
        nread = bgzf_read_small(fai->bgzf, s, remaining);
805
0
        if (nread < remaining) goto error;
806
0
        s += remaining;
807
0
    }
808
809
0
    *s = '\0';
810
0
    return buffer;
811
812
0
error:
813
0
    hts_log_error("Failed to retrieve block: %s",
814
0
                  (nread == 0)? "unexpected end of file" : "error reading file");
815
0
    free(buffer);
816
0
    *len = -1;
817
0
    return NULL;
818
0
}
819
820
static int fai_get_val(const faidx_t *fai, const char *str,
821
0
                       hts_pos_t *len, faidx1_t *val, hts_pos_t *fbeg, hts_pos_t *fend) {
822
0
    khiter_t iter;
823
0
    khash_t(s) *h;
824
0
    int id;
825
0
    hts_pos_t beg, end;
826
827
0
    if (!fai_parse_region(fai, str, &id, &beg, &end, 0)) {
828
0
        hts_log_warning("Reference %s not found in FASTA file, returning empty sequence", str);
829
0
        *len = -2;
830
0
        return 1;
831
0
    }
832
833
0
    h = fai->hash;
834
0
    iter = kh_get(s, h, faidx_iseq(fai, id));
835
0
    if (iter >= kh_end(h)) {
836
        // should have already been caught above
837
0
        abort();
838
0
    }
839
0
    *val = kh_value(h, iter);
840
841
0
    if (beg >= val->len) beg = val->len;
842
0
    if (end >= val->len) end = val->len;
843
0
    if (beg > end) beg = end;
844
845
0
    *fbeg = beg;
846
0
    *fend = end;
847
848
0
    return 0;
849
0
}
850
851
/*
852
 *  Returns hts_pos_t for historic reasons (line_blen was uint32_t,
853
 *  but this function returned a wider value in case it was expanded
854
 *  which has now happened).  It also needs to return a negative value
855
 *  on error.
856
 */
857
hts_pos_t fai_line_length(const faidx_t *fai, const char *str)
858
0
{
859
0
    faidx1_t val;
860
0
    int64_t beg, end;
861
0
    hts_pos_t len;
862
863
0
    if (fai_get_val(fai, str, &len, &val, &beg, &end))
864
0
        return -1;
865
0
    else
866
0
        return (val.line_blen <= HTS_POS_MAX
867
0
                ? (hts_pos_t) val.line_blen
868
0
                : (hts_pos_t) -1);
869
0
}
870
871
char *fai_fetch64(const faidx_t *fai, const char *str, hts_pos_t *len)
872
0
{
873
0
    faidx1_t val;
874
0
    int64_t beg, end;
875
876
0
    if (fai_get_val(fai, str, len, &val, &beg, &end)) {
877
0
        return NULL;
878
0
    }
879
880
    // now retrieve the sequence
881
0
    return fai_retrieve(fai, &val, val.seq_offset, beg, end, len);
882
0
}
883
884
char *fai_fetch(const faidx_t *fai, const char *str, int *len)
885
0
{
886
0
    hts_pos_t len64;
887
0
    char *ret = fai_fetch64(fai, str, &len64);
888
0
    *len = len64 < INT_MAX ? len64 : INT_MAX; // trunc
889
0
    return ret;
890
0
}
891
892
0
char *fai_fetchqual64(const faidx_t *fai, const char *str, hts_pos_t *len) {
893
0
    faidx1_t val;
894
0
    int64_t beg, end;
895
896
0
    if (fai_get_val(fai, str, len, &val, &beg, &end)) {
897
0
        return NULL;
898
0
    }
899
900
    // now retrieve the sequence
901
0
    return fai_retrieve(fai, &val, val.qual_offset, beg, end, len);
902
0
}
903
904
0
char *fai_fetchqual(const faidx_t *fai, const char *str, int *len) {
905
0
    hts_pos_t len64;
906
0
    char *ret = fai_fetchqual64(fai, str, &len64);
907
0
    *len = len64 < INT_MAX ? len64 : INT_MAX; // trunc
908
0
    return ret;
909
0
}
910
911
int faidx_fetch_nseq(const faidx_t *fai)
912
0
{
913
0
    return fai->n;
914
0
}
915
916
int faidx_nseq(const faidx_t *fai)
917
0
{
918
0
    return fai->n;
919
0
}
920
921
const char *faidx_iseq(const faidx_t *fai, int i)
922
0
{
923
0
    return fai->name[i];
924
0
}
925
926
hts_pos_t faidx_seq_len64(const faidx_t *fai, const char *seq)
927
0
{
928
0
    khint_t k = kh_get(s, fai->hash, seq);
929
0
    if ( k == kh_end(fai->hash) ) return -1;
930
0
    return kh_val(fai->hash, k).len;
931
0
}
932
933
int faidx_seq_len(const faidx_t *fai, const char *seq)
934
0
{
935
0
    hts_pos_t len = faidx_seq_len64(fai, seq);
936
0
    return len < INT_MAX ? len : INT_MAX;
937
0
}
938
939
static int faidx_adjust_position(const faidx_t *fai, int end_adjust,
940
                                 faidx1_t *val_out, const char *c_name,
941
                                 hts_pos_t *p_beg_i, hts_pos_t *p_end_i,
942
0
                                 hts_pos_t *len) {
943
0
    khiter_t iter;
944
0
    faidx1_t *val;
945
946
    // Adjust position
947
0
    iter = kh_get(s, fai->hash, c_name);
948
949
0
    if (iter == kh_end(fai->hash)) {
950
0
        if (len)
951
0
            *len = -2;
952
0
        hts_log_error("The sequence \"%s\" was not found", c_name);
953
0
        return 1;
954
0
    }
955
956
0
    val = &kh_value(fai->hash, iter);
957
958
0
    if (val_out)
959
0
        *val_out = *val;
960
961
0
    if(*p_end_i < *p_beg_i)
962
0
        *p_beg_i = *p_end_i;
963
964
0
    if(*p_beg_i < 0)
965
0
        *p_beg_i = 0;
966
0
    else if(val->len <= *p_beg_i)
967
0
        *p_beg_i = val->len;
968
969
0
    if(*p_end_i < 0)
970
0
        *p_end_i = 0;
971
0
    else if(val->len <= *p_end_i)
972
0
        *p_end_i = val->len - end_adjust;
973
974
0
    return 0;
975
0
}
976
977
int fai_adjust_region(const faidx_t *fai, int tid,
978
                      hts_pos_t *beg, hts_pos_t *end)
979
0
{
980
0
    hts_pos_t orig_beg, orig_end;
981
982
0
    if (!fai || !beg || !end || tid < 0 || tid >= fai->n)
983
0
        return -1;
984
985
0
    orig_beg = *beg;
986
0
    orig_end = *end;
987
0
    if (faidx_adjust_position(fai, 0, NULL, fai->name[tid], beg, end, NULL) != 0) {
988
0
        hts_log_error("Inconsistent faidx internal state - couldn't find \"%s\"",
989
0
                      fai->name[tid]);
990
0
        return -1;
991
0
    }
992
993
0
    return ((orig_beg != *beg ? 1 : 0) |
994
0
            (orig_end != *end && orig_end < HTS_POS_MAX ? 2 : 0));
995
0
}
996
997
char *faidx_fetch_seq64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len)
998
0
{
999
0
    faidx1_t val;
1000
1001
    // Adjust position
1002
0
    if (faidx_adjust_position(fai, 1, &val, c_name, &p_beg_i, &p_end_i, len)) {
1003
0
        return NULL;
1004
0
    }
1005
1006
    // Now retrieve the sequence
1007
0
    return fai_retrieve(fai, &val, val.seq_offset, p_beg_i, p_end_i + 1, len);
1008
0
}
1009
1010
char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len)
1011
0
{
1012
0
    hts_pos_t len64;
1013
0
    char *ret = faidx_fetch_seq64(fai, c_name, p_beg_i, p_end_i, &len64);
1014
0
    *len = len64 < INT_MAX ? len64 : INT_MAX;  // trunc
1015
0
    return ret;
1016
0
}
1017
1018
char *faidx_fetch_qual64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len)
1019
0
{
1020
0
    faidx1_t val;
1021
1022
    // Adjust position
1023
0
    if (faidx_adjust_position(fai, 1, &val, c_name, &p_beg_i, &p_end_i, len)) {
1024
0
        return NULL;
1025
0
    }
1026
1027
    // Now retrieve the sequence
1028
0
    return fai_retrieve(fai, &val, val.qual_offset, p_beg_i, p_end_i + 1, len);
1029
0
}
1030
1031
char *faidx_fetch_qual(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len)
1032
0
{
1033
0
    hts_pos_t len64;
1034
0
    char *ret = faidx_fetch_qual64(fai, c_name, p_beg_i, p_end_i, &len64);
1035
0
    *len = len64 < INT_MAX ? len64 : INT_MAX;  // trunc
1036
0
    return ret;
1037
0
}
1038
1039
int faidx_has_seq(const faidx_t *fai, const char *seq)
1040
0
{
1041
0
    khiter_t iter = kh_get(s, fai->hash, seq);
1042
0
    if (iter == kh_end(fai->hash)) return 0;
1043
0
    return 1;
1044
0
}
1045
1046
const char *fai_parse_region(const faidx_t *fai, const char *s,
1047
                             int *tid, hts_pos_t *beg, hts_pos_t *end,
1048
                             int flags)
1049
0
{
1050
0
    return hts_parse_region(s, tid, beg, end, fai_name2id, (void *)fai, flags);
1051
0
}
1052
1053
0
void fai_set_cache_size(faidx_t *fai, int cache_size) {
1054
0
    bgzf_set_cache_size(fai->bgzf, cache_size);
1055
0
}
1056
1057
// Adds a thread pool to the underlying BGZF layer.
1058
0
int fai_thread_pool(faidx_t *fai, struct hts_tpool *pool, int qsize) {
1059
0
    return bgzf_thread_pool(fai->bgzf, pool, qsize);
1060
0
}
1061
1062
0
char *fai_path(const char *fa) {
1063
0
    char *fai = NULL;
1064
0
    if (!fa) {
1065
0
        hts_log_error("No reference file specified");
1066
0
    } else {
1067
0
        char *fai_tmp = strstr(fa, HTS_IDX_DELIM);
1068
0
        if (fai_tmp) {
1069
0
            fai_tmp += strlen(HTS_IDX_DELIM);
1070
0
            fai = strdup(fai_tmp);
1071
0
            if (!fai)
1072
0
                hts_log_error("Failed to allocate memory");
1073
0
        } else {
1074
0
            if (hisremote(fa)) {
1075
0
                fai = hts_idx_locatefn(fa, ".fai");       // get the remote fai file name, if any, but do not download the file
1076
0
                if (!fai)
1077
0
                    hts_log_error("Failed to locate index file for remote reference file '%s'", fa);
1078
0
            } else{
1079
0
                if (hts_idx_check_local(fa, HTS_FMT_FAI, &fai) == 0 && fai) {
1080
0
                    if (fai_build3(fa, fai, NULL) == -1) {      // create local fai file by indexing local fasta
1081
0
                        hts_log_error("Failed to build index file for reference file '%s'", fa);
1082
0
                        free(fai);
1083
0
                        fai = NULL;
1084
0
                    }
1085
0
                }
1086
0
            }
1087
0
        }
1088
0
    }
1089
1090
0
    return fai;
1091
0
}