Coverage Report

Created: 2025-08-10 06:30

/src/htslib/faidx.c
Line
Count
Source (jump to first uncovered line)
1
/*  faidx.c -- FASTA and FASTQ random access.
2
3
    Copyright (C) 2008, 2009, 2013-2020, 2022, 2024 Genome Research Ltd.
4
    Portions copyright (C) 2011 Broad Institute.
5
6
    Author: Heng Li <lh3@sanger.ac.uk>
7
8
Permission is hereby granted, free of charge, to any person obtaining a copy
9
of this software and associated documentation files (the "Software"), to deal
10
in the Software without restriction, including without limitation the rights
11
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
copies of the Software, and to permit persons to whom the Software is
13
furnished to do so, subject to the following conditions:
14
15
The above copyright notice and this permission notice shall be included in
16
all copies or substantial portions of the Software.
17
18
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24
DEALINGS IN THE SOFTWARE.  */
25
26
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
27
#include <config.h>
28
29
#include <ctype.h>
30
#include <string.h>
31
#include <stdlib.h>
32
#include <stdio.h>
33
#include <inttypes.h>
34
#include <errno.h>
35
#include <limits.h>
36
#include <unistd.h>
37
#include <assert.h>
38
39
#include "htslib/bgzf.h"
40
#include "htslib/faidx.h"
41
#include "htslib/hfile.h"
42
#include "htslib/khash.h"
43
#include "htslib/kstring.h"
44
#include "hts_internal.h"
45
46
// Faster isgraph; assumes ASCII
47
1.15M
static inline int isgraph_(unsigned char c) {
48
1.15M
    return c > ' ' && c <= '~';
49
1.15M
}
50
51
#ifdef isgraph
52
#  undef isgraph
53
#endif
54
1.15M
#define isgraph isgraph_
55
56
// An optimised bgzf_getc.
57
// We could consider moving this to bgzf.h, but our own code uses it here only.
58
4.35M
static inline int bgzf_getc_(BGZF *fp) {
59
4.35M
    if (fp->block_offset+1 < fp->block_length) {
60
4.35M
        int c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++];
61
4.35M
        fp->uncompressed_address++;
62
4.35M
        return c;
63
4.35M
    }
64
65
2.58k
    return bgzf_getc(fp);
66
4.35M
}
67
4.35M
#define bgzf_getc bgzf_getc_
68
69
typedef struct {
70
    int id; // faidx_t->name[id] is for this struct.
71
    uint32_t line_len, line_blen;
72
    uint64_t len;
73
    uint64_t seq_offset;
74
    uint64_t qual_offset;
75
} faidx1_t;
76
KHASH_MAP_INIT_STR(s, faidx1_t)
77
78
struct faidx_t {
79
    BGZF *bgzf;
80
    int n, m;
81
    char **name;
82
    khash_t(s) *hash;
83
    enum fai_format_options format;
84
};
85
86
static int fai_name2id(void *v, const char *ref)
87
0
{
88
0
    faidx_t *fai = (faidx_t *)v;
89
0
    khint_t k = kh_get(s, fai->hash, ref);
90
0
    return k == kh_end(fai->hash) ? -1 : kh_val(fai->hash, k).id;
91
0
}
92
93
static inline int fai_insert_index(faidx_t *idx, const char *name, uint64_t len, uint32_t line_len, uint32_t line_blen, uint64_t seq_offset, uint64_t qual_offset)
94
20.5k
{
95
20.5k
    if (!name) {
96
0
        hts_log_error("Malformed line");
97
0
        return -1;
98
0
    }
99
100
20.5k
    char *name_key = strdup(name);
101
20.5k
    int absent;
102
20.5k
    khint_t k = kh_put(s, idx->hash, name_key, &absent);
103
20.5k
    faidx1_t *v = &kh_value(idx->hash, k);
104
105
20.5k
    if (! absent) {
106
16.8k
        hts_log_warning("Ignoring duplicate sequence \"%s\" at byte offset %" PRIu64, name, seq_offset);
107
16.8k
        free(name_key);
108
16.8k
        return 0;
109
16.8k
    }
110
111
3.63k
    if (idx->n == idx->m) {
112
482
        char **tmp;
113
482
        idx->m = idx->m? idx->m<<1 : 16;
114
482
        if (!(tmp = (char**)realloc(idx->name, sizeof(char*) * idx->m))) {
115
0
            hts_log_error("Out of memory");
116
0
            return -1;
117
0
        }
118
482
        idx->name = tmp;
119
482
    }
120
3.63k
    v->id = idx->n;
121
3.63k
    idx->name[idx->n++] = name_key;
122
3.63k
    v->len = len;
123
3.63k
    v->line_len = line_len;
124
3.63k
    v->line_blen = line_blen;
125
3.63k
    v->seq_offset = seq_offset;
126
3.63k
    v->qual_offset = qual_offset;
127
128
3.63k
    return 0;
129
3.63k
}
130
131
132
889
static faidx_t *fai_build_core(BGZF *bgzf) {
133
889
    kstring_t name = { 0, 0, NULL };
134
889
    int c, read_done, line_num;
135
889
    faidx_t *idx;
136
889
    uint64_t seq_offset, qual_offset;
137
889
    uint64_t seq_len, qual_len;
138
889
    uint64_t char_len, cl, line_len, ll;
139
889
    enum read_state {OUT_READ, IN_NAME, IN_SEQ, SEQ_END, IN_QUAL} state;
140
141
889
    idx = (faidx_t*)calloc(1, sizeof(faidx_t));
142
889
    idx->hash = kh_init(s);
143
889
    idx->format = FAI_NONE;
144
145
889
    state = OUT_READ, read_done = 0, line_num = 1;
146
889
    seq_offset = qual_offset = seq_len = qual_len = char_len = cl = line_len = ll = 0;
147
148
92.9k
    while ((c = bgzf_getc(bgzf)) >= 0) {
149
92.6k
        switch (state) {
150
10.4k
            case OUT_READ:
151
10.4k
                switch (c) {
152
8.92k
                    case '>':
153
8.92k
                        if (idx->format == FAI_FASTQ) {
154
11
                            hts_log_error("Found '>' in a FASTQ file, error at line %d", line_num);
155
11
                            goto fail;
156
11
                        }
157
158
8.91k
                        idx->format = FAI_FASTA;
159
8.91k
                        state = IN_NAME;
160
8.91k
                    break;
161
162
411
                    case '@':
163
411
                        if (idx->format == FAI_FASTA) {
164
6
                            hts_log_error("Found '@' in a FASTA file, error at line %d", line_num);
165
6
                            goto fail;
166
6
                        }
167
168
405
                        idx->format = FAI_FASTQ;
169
405
                        state = IN_NAME;
170
405
                    break;
171
172
38
                    case '\r':
173
                        // Blank line with cr-lf ending?
174
38
                        if ((c = bgzf_getc(bgzf)) == '\n') {
175
18
                            line_num++;
176
20
                        } else {
177
20
                            hts_log_error("Format error, carriage return not followed by new line at line %d", line_num);
178
20
                            goto fail;
179
20
                        }
180
18
                    break;
181
182
984
                    case '\n':
183
                        // just move onto the next line
184
984
                        line_num++;
185
984
                    break;
186
187
83
                    default: {
188
83
                        char s[4] = { '"', c, '"', '\0' };
189
83
                        hts_log_error("Format error, unexpected %s at line %d", isprint(c) ? s : "character", line_num);
190
83
                        goto fail;
191
38
                    }
192
10.4k
                }
193
10.3k
            break;
194
195
30.1k
            case IN_NAME:
196
30.1k
                if (read_done) {
197
20.3k
                    if (fai_insert_index(idx, name.s, seq_len, line_len, char_len, seq_offset, qual_offset) != 0)
198
0
                        goto fail;
199
200
20.3k
                    read_done = 0;
201
20.3k
                }
202
203
30.1k
                name.l = 0;
204
205
892k
                do {
206
892k
                    if (!isspace(c)) {
207
859k
                        kputc(c, &name);
208
859k
                    } else if (name.l > 0 || c == '\n') {
209
29.9k
                        break;
210
29.9k
                    }
211
892k
                } while ((c = bgzf_getc(bgzf)) >= 0);
212
213
0
                kputsn("", 0, &name);
214
215
30.1k
                if (c < 0) {
216
205
                    hts_log_error("The last entry '%s' has no sequence at line %d", name.s, line_num);
217
205
                    goto fail;
218
205
                }
219
220
                // read the rest of the line if necessary
221
2.23M
                if (c != '\n') while ((c = bgzf_getc(bgzf)) >= 0 && c != '\n');
222
223
29.9k
                state = IN_SEQ; seq_len = qual_len = char_len = line_len = 0;
224
29.9k
                seq_offset = bgzf_utell(bgzf);
225
29.9k
                line_num++;
226
29.9k
            break;
227
228
51.6k
            case IN_SEQ:
229
51.6k
                if (idx->format == FAI_FASTA) {
230
51.0k
                    if (c == '\n') {
231
5.96k
                        state = OUT_READ;
232
5.96k
                        line_num++;
233
5.96k
                        continue;
234
45.0k
                    } else if (c == '>') {
235
20.8k
                        state = IN_NAME;
236
20.8k
                        continue;
237
20.8k
                    }
238
51.0k
                } else if (idx->format == FAI_FASTQ) {
239
590
                    if (c == '+') {
240
89
                        state = IN_QUAL;
241
2.24k
                        if (c != '\n') while ((c = bgzf_getc(bgzf)) >= 0 && c != '\n');
242
89
                        qual_offset = bgzf_utell(bgzf);
243
89
                        line_num++;
244
89
                        continue;
245
501
                    } else if (c == '\n') {
246
10
                        hts_log_error("Inlined empty line is not allowed in sequence '%s' at line %d", name.s, line_num);
247
10
                        goto fail;
248
10
                    }
249
590
                }
250
251
24.7k
                ll = cl = 0;
252
253
24.7k
                if (idx->format == FAI_FASTA) read_done = 1;
254
255
1.08M
                do {
256
1.08M
                    ll++;
257
1.08M
                    if (isgraph(c)) cl++;
258
1.08M
                } while ((c = bgzf_getc(bgzf)) >= 0 && c != '\n');
259
260
24.7k
                ll++; seq_len += cl;
261
262
24.7k
                if (line_len == 0) {
263
20.8k
                    line_len = ll;
264
20.8k
                    char_len = cl;
265
20.8k
                } else if (line_len > ll) {
266
267
2.78k
                    if (idx->format == FAI_FASTA)
268
2.60k
                        state = OUT_READ;
269
177
                    else
270
177
                        state = SEQ_END;
271
272
2.78k
                } else if (line_len < ll) {
273
45
                    hts_log_error("Different line length in sequence '%s' at line %d", name.s, line_num);
274
45
                    goto fail;
275
45
                }
276
277
24.6k
                line_num++;
278
24.6k
            break;
279
280
162
            case SEQ_END:
281
162
                if (c == '+') {
282
151
                    state = IN_QUAL;
283
6.59k
                    while ((c = bgzf_getc(bgzf)) >= 0 && c != '\n');
284
151
                    qual_offset = bgzf_utell(bgzf);
285
151
                    line_num++;
286
151
                } else {
287
11
                    hts_log_error("Format error, expecting '+', got '%c' at line %d", c, line_num);
288
11
                    goto fail;
289
11
                }
290
151
            break;
291
292
258
            case IN_QUAL:
293
258
                if (c == '\n') {
294
22
                    if (!read_done) {
295
10
                        hts_log_error("Inlined empty line is not allowed in quality of sequence '%s' at line %d", name.s, line_num);
296
10
                        goto fail;
297
10
                    }
298
299
12
                    state = OUT_READ;
300
12
                    line_num++;
301
12
                    continue;
302
236
                } else if (c == '@' && read_done) {
303
10
                    state = IN_NAME;
304
10
                    continue;
305
10
                }
306
307
226
                ll = cl = 0;
308
309
71.4k
                do {
310
71.4k
                    ll++;
311
71.4k
                    if (isgraph(c)) cl++;
312
71.4k
                } while ((c = bgzf_getc(bgzf)) >= 0 && c != '\n');
313
314
226
                ll++; qual_len += cl;
315
316
226
                if (line_len < ll) {
317
93
                    hts_log_error("Quality line length too long in '%s' at line %d", name.s, line_num);
318
93
                    goto fail;
319
133
                } else if (qual_len == seq_len) {
320
52
                    read_done = 1;
321
81
                } else if (qual_len > seq_len) {
322
11
                    hts_log_error("Quality length longer than sequence in '%s' at line %d", name.s, line_num);
323
11
                    goto fail;
324
70
                } else if (line_len > ll) {
325
41
                    hts_log_error("Quality line length too short in '%s' at line %d", name.s, line_num);
326
41
                    goto fail;
327
41
                }
328
329
81
                line_num++;
330
81
            break;
331
92.6k
        }
332
92.6k
    }
333
334
343
    if (read_done) {
335
176
        if (fai_insert_index(idx, name.s, seq_len, line_len, char_len, seq_offset, qual_offset) != 0)
336
0
            goto fail;
337
176
    } else {
338
167
        hts_log_error("File truncated at line %d", line_num);
339
167
        goto fail;
340
167
    }
341
342
176
    free(name.s);
343
176
    return idx;
344
345
713
fail:
346
713
    free(name.s);
347
713
    fai_destroy(idx);
348
713
    return NULL;
349
343
}
350
351
352
0
static int fai_save(const faidx_t *fai, hFILE *fp) {
353
0
    khint_t k;
354
0
    int i;
355
0
    char buf[96]; // Must be big enough for format below.
356
357
0
    for (i = 0; i < fai->n; ++i) {
358
0
        faidx1_t x;
359
0
        k = kh_get(s, fai->hash, fai->name[i]);
360
0
        assert(k < kh_end(fai->hash));
361
0
        x = kh_value(fai->hash, k);
362
363
0
        if (fai->format == FAI_FASTA) {
364
0
            snprintf(buf, sizeof(buf),
365
0
                 "\t%"PRIu64"\t%"PRIu64"\t%"PRIu32"\t%"PRIu32"\n",
366
0
                 x.len, x.seq_offset, x.line_blen, x.line_len);
367
0
        } else {
368
0
            snprintf(buf, sizeof(buf),
369
0
                 "\t%"PRIu64"\t%"PRIu64"\t%"PRIu32"\t%"PRIu32"\t%"PRIu64"\n",
370
0
                 x.len, x.seq_offset, x.line_blen, x.line_len, x.qual_offset);
371
0
        }
372
373
0
        if (hputs(fai->name[i], fp) != 0) return -1;
374
0
        if (hputs(buf, fp) != 0) return -1;
375
0
    }
376
0
    return 0;
377
0
}
378
379
380
static faidx_t *fai_read(hFILE *fp, const char *fname, int format)
381
0
{
382
0
    faidx_t *fai;
383
0
    char *buf = NULL, *p;
384
0
    ssize_t l, lnum = 1;
385
386
0
    fai = (faidx_t*)calloc(1, sizeof(faidx_t));
387
0
    if (!fai) return NULL;
388
389
0
    fai->hash = kh_init(s);
390
0
    if (!fai->hash) goto fail;
391
392
0
    buf = (char*)calloc(0x10000, 1);
393
0
    if (!buf) goto fail;
394
395
0
    while ((l = hgetln(buf, 0x10000, fp)) > 0) {
396
0
        uint32_t line_len, line_blen, n;
397
0
        uint64_t len;
398
0
        uint64_t seq_offset;
399
0
        uint64_t qual_offset = 0;
400
401
0
        for (p = buf; *p && !isspace_c(*p); ++p);
402
403
0
        if (p - buf < l) {
404
0
            *p = 0; ++p;
405
0
        }
406
407
0
        if (format == FAI_FASTA) {
408
0
            n = sscanf(p, "%"SCNu64"%"SCNu64"%"SCNu32"%"SCNu32, &len, &seq_offset, &line_blen, &line_len);
409
410
0
            if (n != 4) {
411
0
                hts_log_error("Could not understand FASTA index %s line %zd", fname, lnum);
412
0
                goto fail;
413
0
            }
414
0
        } else {
415
0
            n = sscanf(p, "%"SCNu64"%"SCNu64"%"SCNu32"%"SCNu32"%"SCNu64, &len, &seq_offset, &line_blen, &line_len, &qual_offset);
416
417
0
            if (n != 5) {
418
0
                if (n == 4) {
419
0
                    hts_log_error("Possibly this is a FASTA index, try using faidx.  Problem in %s line %zd", fname, lnum);
420
0
                } else {
421
0
                    hts_log_error("Could not understand FASTQ index %s line %zd", fname, lnum);
422
0
                }
423
424
0
                goto fail;
425
0
            }
426
0
        }
427
428
0
        if (fai_insert_index(fai, buf, len, line_len, line_blen, seq_offset, qual_offset) != 0) {
429
0
            goto fail;
430
0
        }
431
432
0
        if (buf[l - 1] == '\n') ++lnum;
433
0
    }
434
435
0
    if (l < 0) {
436
0
        hts_log_error("Error while reading %s: %s", fname, strerror(errno));
437
0
        goto fail;
438
0
    }
439
0
    free(buf);
440
0
    return fai;
441
442
0
 fail:
443
0
    free(buf);
444
0
    fai_destroy(fai);
445
0
    return NULL;
446
0
}
447
448
void fai_destroy(faidx_t *fai)
449
1.95k
{
450
1.95k
    int i;
451
1.95k
    if (!fai) return;
452
4.52k
    for (i = 0; i < fai->n; ++i) free(fai->name[i]);
453
889
    free(fai->name);
454
889
    kh_destroy(s, fai->hash);
455
889
    if (fai->bgzf) bgzf_close(fai->bgzf);
456
889
    free(fai);
457
889
}
458
459
460
static int fai_build3_core(const char *fn, const char *fnfai, const char *fngzi)
461
1.24k
{
462
1.24k
    kstring_t fai_kstr = { 0, 0, NULL };
463
1.24k
    kstring_t gzi_kstr = { 0, 0, NULL };
464
1.24k
    BGZF *bgzf = NULL;
465
1.24k
    hFILE *fp = NULL;
466
1.24k
    faidx_t *fai = NULL;
467
1.24k
    int save_errno, res;
468
1.24k
    char *file_type;
469
470
1.24k
    bgzf = bgzf_open(fn, "r");
471
472
1.24k
    if ( !bgzf ) {
473
357
        hts_log_error("Failed to open the file %s : %s", fn, strerror(errno));
474
357
        goto fail;
475
357
    }
476
477
889
    if ( bgzf->is_compressed ) {
478
12
        if (bgzf_index_build_init(bgzf) != 0) {
479
0
            hts_log_error("Failed to allocate bgzf index");
480
0
            goto fail;
481
0
        }
482
12
    }
483
484
889
    fai = fai_build_core(bgzf);
485
486
889
    if ( !fai ) {
487
713
        if (bgzf->is_compressed && bgzf->is_gzip) {
488
12
            hts_log_error("Cannot index files compressed with gzip, please use bgzip");
489
12
        }
490
713
        goto fail;
491
713
    }
492
493
176
    if (fai->format == FAI_FASTA) {
494
163
        file_type   = "FASTA";
495
163
    } else {
496
13
        file_type   = "FASTQ";
497
13
    }
498
499
176
    if (!fnfai) {
500
176
        if (ksprintf(&fai_kstr, "%s.fai", fn) < 0) goto fail;
501
176
        fnfai = fai_kstr.s;
502
176
    }
503
504
176
    if (!fngzi) {
505
176
        if (ksprintf(&gzi_kstr, "%s.gzi", fn) < 0) goto fail;
506
176
        fngzi = gzi_kstr.s;
507
176
    }
508
509
176
    if ( bgzf->is_compressed ) {
510
0
        if (bgzf_index_dump(bgzf, fngzi, NULL) < 0) {
511
0
            hts_log_error("Failed to make bgzf index %s", fngzi);
512
0
            goto fail;
513
0
        }
514
0
    }
515
516
176
    res = bgzf_close(bgzf);
517
176
    bgzf = NULL;
518
519
176
    if (res < 0) {
520
0
        hts_log_error("Error on closing %s : %s", fn, strerror(errno));
521
0
        goto fail;
522
0
    }
523
524
176
    fp = hopen(fnfai, "wb");
525
526
176
    if ( !fp ) {
527
176
        hts_log_error("Failed to open %s index %s : %s", file_type, fnfai, strerror(errno));
528
176
        goto fail;
529
176
    }
530
531
0
    if (fai_save(fai, fp) != 0) {
532
0
        hts_log_error("Failed to write %s index %s : %s", file_type, fnfai, strerror(errno));
533
0
        goto fail;
534
0
    }
535
536
0
    if (hclose(fp) != 0) {
537
0
        hts_log_error("Failed on closing %s index %s : %s", file_type, fnfai, strerror(errno));
538
0
        goto fail;
539
0
    }
540
541
0
    free(fai_kstr.s);
542
0
    free(gzi_kstr.s);
543
0
    fai_destroy(fai);
544
0
    return 0;
545
546
1.24k
 fail:
547
1.24k
    save_errno = errno;
548
1.24k
    free(fai_kstr.s);
549
1.24k
    free(gzi_kstr.s);
550
1.24k
    bgzf_close(bgzf);
551
1.24k
    fai_destroy(fai);
552
1.24k
    errno = save_errno;
553
1.24k
    return -1;
554
0
}
555
556
557
1.24k
int fai_build3(const char *fn, const char *fnfai, const char *fngzi) {
558
1.24k
    return fai_build3_core(fn, fnfai, fngzi);
559
1.24k
}
560
561
562
1.24k
int fai_build(const char *fn) {
563
1.24k
    return fai_build3(fn, NULL, NULL);
564
1.24k
}
565
566
567
static faidx_t *fai_load3_core(const char *fn, const char *fnfai, const char *fngzi,
568
                   int flags, int format)
569
0
{
570
0
    kstring_t fai_kstr = { 0, 0, NULL };
571
0
    kstring_t gzi_kstr = { 0, 0, NULL };
572
0
    hFILE *fp = NULL;
573
0
    faidx_t *fai = NULL;
574
0
    int res, gzi_index_needed = 0;
575
0
    char *file_type;
576
577
0
    if (format == FAI_FASTA) {
578
0
        file_type   = "FASTA";
579
0
    } else {
580
0
        file_type   = "FASTQ";
581
0
    }
582
583
0
    if (fn == NULL)
584
0
        return NULL;
585
586
0
    if (fnfai == NULL) {
587
0
        if (ksprintf(&fai_kstr, "%s.fai", fn) < 0) goto fail;
588
0
        fnfai = fai_kstr.s;
589
0
    }
590
0
    if (fngzi == NULL) {
591
0
        if (ksprintf(&gzi_kstr, "%s.gzi", fn) < 0) goto fail;
592
0
        fngzi = gzi_kstr.s;
593
0
    }
594
595
0
    fp = hopen(fnfai, "rb");
596
597
0
    if (fp) {
598
        // index file present, check if a compressed index is needed
599
0
        hFILE *gz = NULL;
600
0
        BGZF *bgzf = bgzf_open(fn, "rb");
601
602
0
        if (bgzf == 0) {
603
0
            hts_log_error("Failed to open %s file %s", file_type, fn);
604
0
            goto fail;
605
0
        }
606
607
0
        if (bgzf_compression(bgzf) == 2) { // BGZF compression
608
0
            if ((gz = hopen(fngzi, "rb")) == 0) {
609
610
0
                if (!(flags & FAI_CREATE) || errno != ENOENT) {
611
0
                    hts_log_error("Failed to open %s index %s: %s", file_type, fngzi, strerror(errno));
612
0
                    bgzf_close(bgzf);
613
0
                    goto fail;
614
0
                }
615
616
0
                gzi_index_needed = 1;
617
0
                res = hclose(fp); // closed as going to be re-indexed
618
619
0
                if (res < 0) {
620
0
                    hts_log_error("Failed on closing %s index %s : %s", file_type, fnfai, strerror(errno));
621
0
                    goto fail;
622
0
                }
623
0
            } else {
624
0
                res = hclose(gz);
625
626
0
                if (res < 0) {
627
0
                    hts_log_error("Failed on closing %s index %s : %s", file_type, fngzi, strerror(errno));
628
0
                    goto fail;
629
0
                }
630
0
            }
631
0
        }
632
633
0
        bgzf_close(bgzf);
634
0
    }
635
636
0
    if (fp == 0 || gzi_index_needed) {
637
0
        if (!(flags & FAI_CREATE) || errno != ENOENT) {
638
0
            hts_log_error("Failed to open %s index %s: %s", file_type, fnfai, strerror(errno));
639
0
            goto fail;
640
0
        }
641
642
0
        hts_log_info("Build %s index", file_type);
643
644
0
        if (fai_build3_core(fn, fnfai, fngzi) < 0) {
645
0
            goto fail;
646
0
        }
647
648
0
        fp = hopen(fnfai, "rb");
649
0
        if (fp == 0) {
650
0
            hts_log_error("Failed to open %s index %s: %s", file_type, fnfai, strerror(errno));
651
0
            goto fail;
652
0
        }
653
0
    }
654
655
0
    fai = fai_read(fp, fnfai, format);
656
0
    if (fai == NULL) {
657
0
        hts_log_error("Failed to read %s index %s", file_type, fnfai);
658
0
        goto fail;
659
0
    }
660
661
0
    res = hclose(fp);
662
0
    fp = NULL;
663
0
    if (res < 0) {
664
0
        hts_log_error("Failed on closing %s index %s : %s", file_type, fnfai, strerror(errno));
665
0
        goto fail;
666
0
    }
667
668
0
    fai->bgzf = bgzf_open(fn, "rb");
669
0
    if (fai->bgzf == 0) {
670
0
        hts_log_error("Failed to open %s file %s", file_type, fn);
671
0
        goto fail;
672
0
    }
673
674
0
    if ( fai->bgzf->is_compressed==1 ) {
675
0
        if ( bgzf_index_load(fai->bgzf, fngzi, NULL) < 0 ) {
676
0
            hts_log_error("Failed to load .gzi index: %s", fngzi);
677
0
            goto fail;
678
0
        }
679
0
    }
680
0
    free(fai_kstr.s);
681
0
    free(gzi_kstr.s);
682
0
    return fai;
683
684
0
 fail:
685
0
    if (fai) fai_destroy(fai);
686
0
    if (fp) hclose_abruptly(fp);
687
0
    free(fai_kstr.s);
688
0
    free(gzi_kstr.s);
689
0
    return NULL;
690
0
}
691
692
693
faidx_t *fai_load3(const char *fn, const char *fnfai, const char *fngzi,
694
0
                   int flags) {
695
0
    return fai_load3_core(fn, fnfai, fngzi, flags, FAI_FASTA);
696
0
}
697
698
699
faidx_t *fai_load(const char *fn)
700
0
{
701
0
    return fai_load3(fn, NULL, NULL, FAI_CREATE);
702
0
}
703
704
705
faidx_t *fai_load3_format(const char *fn, const char *fnfai, const char *fngzi,
706
0
                   int flags, enum fai_format_options format) {
707
0
    return fai_load3_core(fn, fnfai, fngzi, flags, format);
708
0
}
709
710
711
0
faidx_t *fai_load_format(const char *fn, enum fai_format_options format) {
712
0
    return fai_load3_format(fn, NULL, NULL, FAI_CREATE, format);
713
0
}
714
715
716
static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val,
717
0
                          uint64_t offset, hts_pos_t beg, hts_pos_t end, hts_pos_t *len) {
718
0
    char *buffer, *s;
719
0
    ssize_t nread, remaining, firstline_len, firstline_blen;
720
0
    int ret;
721
722
0
    if ((uint64_t) end - (uint64_t) beg >= SIZE_MAX - 2) {
723
0
        hts_log_error("Range %"PRId64"..%"PRId64" too big", beg, end);
724
0
        *len = -1;
725
0
        return NULL;
726
0
    }
727
728
0
    if (val->line_blen <= 0) {
729
0
        hts_log_error("Invalid line length in index: %d", val->line_blen);
730
0
        *len = -1;
731
0
        return NULL;
732
0
    }
733
734
0
    ret = bgzf_useek(fai->bgzf,
735
0
                     offset
736
0
                     + beg / val->line_blen * val->line_len
737
0
                     + beg % val->line_blen, SEEK_SET);
738
739
0
    if (ret < 0) {
740
0
        *len = -1;
741
0
        hts_log_error("Failed to retrieve block. (Seeking in a compressed, .gzi unindexed, file?)");
742
0
        return NULL;
743
0
    }
744
745
    // Over-allocate so there is extra space for one end-of-line sequence
746
0
    buffer = (char*)malloc((size_t) end - beg + val->line_len - val->line_blen + 1);
747
0
    if (!buffer) {
748
0
        *len = -1;
749
0
        return NULL;
750
0
    }
751
752
0
    remaining = *len = end - beg;
753
0
    firstline_blen = val->line_blen - beg % val->line_blen;
754
755
    // Special case when the entire interval requested is within a single FASTA/Q line
756
0
    if (remaining <= firstline_blen) {
757
0
        nread = bgzf_read_small(fai->bgzf, buffer, remaining);
758
0
        if (nread < remaining) goto error;
759
0
        buffer[nread] = '\0';
760
0
        return buffer;
761
0
    }
762
763
0
    s = buffer;
764
0
    firstline_len = val->line_len - beg % val->line_blen;
765
766
    // Read the (partial) first line and its line terminator, but increment  s  past the
767
    // line contents only, so the terminator characters will be overwritten by the next line.
768
0
    nread = bgzf_read_small(fai->bgzf, s, firstline_len);
769
0
    if (nread < firstline_len) goto error;
770
0
    s += firstline_blen;
771
0
    remaining -= firstline_blen;
772
773
    // Similarly read complete lines and their line terminator characters, but overwrite the latter.
774
0
    while (remaining > val->line_blen) {
775
0
        nread = bgzf_read_small(fai->bgzf, s, val->line_len);
776
0
        if (nread < (ssize_t) val->line_len) goto error;
777
0
        s += val->line_blen;
778
0
        remaining -= val->line_blen;
779
0
    }
780
781
0
    if (remaining > 0) {
782
0
        nread = bgzf_read_small(fai->bgzf, s, remaining);
783
0
        if (nread < remaining) goto error;
784
0
        s += remaining;
785
0
    }
786
787
0
    *s = '\0';
788
0
    return buffer;
789
790
0
error:
791
0
    hts_log_error("Failed to retrieve block: %s",
792
0
                  (nread == 0)? "unexpected end of file" : "error reading file");
793
0
    free(buffer);
794
0
    *len = -1;
795
0
    return NULL;
796
0
}
797
798
static int fai_get_val(const faidx_t *fai, const char *str,
799
0
                       hts_pos_t *len, faidx1_t *val, hts_pos_t *fbeg, hts_pos_t *fend) {
800
0
    khiter_t iter;
801
0
    khash_t(s) *h;
802
0
    int id;
803
0
    hts_pos_t beg, end;
804
805
0
    if (!fai_parse_region(fai, str, &id, &beg, &end, 0)) {
806
0
        hts_log_warning("Reference %s not found in FASTA file, returning empty sequence", str);
807
0
        *len = -2;
808
0
        return 1;
809
0
    }
810
811
0
    h = fai->hash;
812
0
    iter = kh_get(s, h, faidx_iseq(fai, id));
813
0
    if (iter >= kh_end(h)) {
814
        // should have already been caught above
815
0
        abort();
816
0
    }
817
0
    *val = kh_value(h, iter);
818
819
0
    if (beg >= val->len) beg = val->len;
820
0
    if (end >= val->len) end = val->len;
821
0
    if (beg > end) beg = end;
822
823
0
    *fbeg = beg;
824
0
    *fend = end;
825
826
0
    return 0;
827
0
}
828
829
/*
830
 *  The internal still has line_blen as uint32_t, but our references
831
 *  can be longer, so for future proofing we use hts_pos_t.  We also needed
832
 *  a signed value so we can return negatives as an error.
833
 */
834
hts_pos_t fai_line_length(const faidx_t *fai, const char *str)
835
0
{
836
0
    faidx1_t val;
837
0
    int64_t beg, end;
838
0
    hts_pos_t len;
839
840
0
    if (fai_get_val(fai, str, &len, &val, &beg, &end))
841
0
        return -1;
842
0
    else
843
0
        return val.line_blen;
844
0
}
845
846
char *fai_fetch64(const faidx_t *fai, const char *str, hts_pos_t *len)
847
0
{
848
0
    faidx1_t val;
849
0
    int64_t beg, end;
850
851
0
    if (fai_get_val(fai, str, len, &val, &beg, &end)) {
852
0
        return NULL;
853
0
    }
854
855
    // now retrieve the sequence
856
0
    return fai_retrieve(fai, &val, val.seq_offset, beg, end, len);
857
0
}
858
859
char *fai_fetch(const faidx_t *fai, const char *str, int *len)
860
0
{
861
0
    hts_pos_t len64;
862
0
    char *ret = fai_fetch64(fai, str, &len64);
863
0
    *len = len64 < INT_MAX ? len64 : INT_MAX; // trunc
864
0
    return ret;
865
0
}
866
867
0
char *fai_fetchqual64(const faidx_t *fai, const char *str, hts_pos_t *len) {
868
0
    faidx1_t val;
869
0
    int64_t beg, end;
870
871
0
    if (fai_get_val(fai, str, len, &val, &beg, &end)) {
872
0
        return NULL;
873
0
    }
874
875
    // now retrieve the sequence
876
0
    return fai_retrieve(fai, &val, val.qual_offset, beg, end, len);
877
0
}
878
879
0
char *fai_fetchqual(const faidx_t *fai, const char *str, int *len) {
880
0
    hts_pos_t len64;
881
0
    char *ret = fai_fetchqual64(fai, str, &len64);
882
0
    *len = len64 < INT_MAX ? len64 : INT_MAX; // trunc
883
0
    return ret;
884
0
}
885
886
int faidx_fetch_nseq(const faidx_t *fai)
887
0
{
888
0
    return fai->n;
889
0
}
890
891
int faidx_nseq(const faidx_t *fai)
892
0
{
893
0
    return fai->n;
894
0
}
895
896
const char *faidx_iseq(const faidx_t *fai, int i)
897
0
{
898
0
    return fai->name[i];
899
0
}
900
901
hts_pos_t faidx_seq_len64(const faidx_t *fai, const char *seq)
902
0
{
903
0
    khint_t k = kh_get(s, fai->hash, seq);
904
0
    if ( k == kh_end(fai->hash) ) return -1;
905
0
    return kh_val(fai->hash, k).len;
906
0
}
907
908
int faidx_seq_len(const faidx_t *fai, const char *seq)
909
0
{
910
0
    hts_pos_t len = faidx_seq_len64(fai, seq);
911
0
    return len < INT_MAX ? len : INT_MAX;
912
0
}
913
914
static int faidx_adjust_position(const faidx_t *fai, int end_adjust,
915
                                 faidx1_t *val_out, const char *c_name,
916
                                 hts_pos_t *p_beg_i, hts_pos_t *p_end_i,
917
0
                                 hts_pos_t *len) {
918
0
    khiter_t iter;
919
0
    faidx1_t *val;
920
921
    // Adjust position
922
0
    iter = kh_get(s, fai->hash, c_name);
923
924
0
    if (iter == kh_end(fai->hash)) {
925
0
        if (len)
926
0
            *len = -2;
927
0
        hts_log_error("The sequence \"%s\" was not found", c_name);
928
0
        return 1;
929
0
    }
930
931
0
    val = &kh_value(fai->hash, iter);
932
933
0
    if (val_out)
934
0
        *val_out = *val;
935
936
0
    if(*p_end_i < *p_beg_i)
937
0
        *p_beg_i = *p_end_i;
938
939
0
    if(*p_beg_i < 0)
940
0
        *p_beg_i = 0;
941
0
    else if(val->len <= *p_beg_i)
942
0
        *p_beg_i = val->len;
943
944
0
    if(*p_end_i < 0)
945
0
        *p_end_i = 0;
946
0
    else if(val->len <= *p_end_i)
947
0
        *p_end_i = val->len - end_adjust;
948
949
0
    return 0;
950
0
}
951
952
int fai_adjust_region(const faidx_t *fai, int tid,
953
                      hts_pos_t *beg, hts_pos_t *end)
954
0
{
955
0
    hts_pos_t orig_beg, orig_end;
956
957
0
    if (!fai || !beg || !end || tid < 0 || tid >= fai->n)
958
0
        return -1;
959
960
0
    orig_beg = *beg;
961
0
    orig_end = *end;
962
0
    if (faidx_adjust_position(fai, 0, NULL, fai->name[tid], beg, end, NULL) != 0) {
963
0
        hts_log_error("Inconsistent faidx internal state - couldn't find \"%s\"",
964
0
                      fai->name[tid]);
965
0
        return -1;
966
0
    }
967
968
0
    return ((orig_beg != *beg ? 1 : 0) |
969
0
            (orig_end != *end && orig_end < HTS_POS_MAX ? 2 : 0));
970
0
}
971
972
char *faidx_fetch_seq64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len)
973
0
{
974
0
    faidx1_t val;
975
976
    // Adjust position
977
0
    if (faidx_adjust_position(fai, 1, &val, c_name, &p_beg_i, &p_end_i, len)) {
978
0
        return NULL;
979
0
    }
980
981
    // Now retrieve the sequence
982
0
    return fai_retrieve(fai, &val, val.seq_offset, p_beg_i, p_end_i + 1, len);
983
0
}
984
985
char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len)
986
0
{
987
0
    hts_pos_t len64;
988
0
    char *ret = faidx_fetch_seq64(fai, c_name, p_beg_i, p_end_i, &len64);
989
0
    *len = len64 < INT_MAX ? len64 : INT_MAX;  // trunc
990
0
    return ret;
991
0
}
992
993
char *faidx_fetch_qual64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len)
994
0
{
995
0
    faidx1_t val;
996
997
    // Adjust position
998
0
    if (faidx_adjust_position(fai, 1, &val, c_name, &p_beg_i, &p_end_i, len)) {
999
0
        return NULL;
1000
0
    }
1001
1002
    // Now retrieve the sequence
1003
0
    return fai_retrieve(fai, &val, val.qual_offset, p_beg_i, p_end_i + 1, len);
1004
0
}
1005
1006
char *faidx_fetch_qual(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len)
1007
0
{
1008
0
    hts_pos_t len64;
1009
0
    char *ret = faidx_fetch_qual64(fai, c_name, p_beg_i, p_end_i, &len64);
1010
0
    *len = len64 < INT_MAX ? len64 : INT_MAX;  // trunc
1011
0
    return ret;
1012
0
}
1013
1014
int faidx_has_seq(const faidx_t *fai, const char *seq)
1015
0
{
1016
0
    khiter_t iter = kh_get(s, fai->hash, seq);
1017
0
    if (iter == kh_end(fai->hash)) return 0;
1018
0
    return 1;
1019
0
}
1020
1021
const char *fai_parse_region(const faidx_t *fai, const char *s,
1022
                             int *tid, hts_pos_t *beg, hts_pos_t *end,
1023
                             int flags)
1024
0
{
1025
0
    return hts_parse_region(s, tid, beg, end, (hts_name2id_f)fai_name2id, (void *)fai, flags);
1026
0
}
1027
1028
0
void fai_set_cache_size(faidx_t *fai, int cache_size) {
1029
0
    bgzf_set_cache_size(fai->bgzf, cache_size);
1030
0
}
1031
1032
// Adds a thread pool to the underlying BGZF layer.
1033
0
int fai_thread_pool(faidx_t *fai, struct hts_tpool *pool, int qsize) {
1034
0
    return bgzf_thread_pool(fai->bgzf, pool, qsize);
1035
0
}
1036
1037
0
char *fai_path(const char *fa) {
1038
0
    char *fai = NULL;
1039
0
    if (!fa) {
1040
0
        hts_log_error("No reference file specified");
1041
0
    } else {
1042
0
        char *fai_tmp = strstr(fa, HTS_IDX_DELIM);
1043
0
        if (fai_tmp) {
1044
0
            fai_tmp += strlen(HTS_IDX_DELIM);
1045
0
            fai = strdup(fai_tmp);
1046
0
            if (!fai)
1047
0
                hts_log_error("Failed to allocate memory");
1048
0
        } else {
1049
0
            if (hisremote(fa)) {
1050
0
                fai = hts_idx_locatefn(fa, ".fai");       // get the remote fai file name, if any, but do not download the file
1051
0
                if (!fai)
1052
0
                    hts_log_error("Failed to locate index file for remote reference file '%s'", fa);
1053
0
            } else{
1054
0
                if (hts_idx_check_local(fa, HTS_FMT_FAI, &fai) == 0 && fai) {
1055
0
                    if (fai_build3(fa, fai, NULL) == -1) {      // create local fai file by indexing local fasta
1056
0
                        hts_log_error("Failed to build index file for reference file '%s'", fa);
1057
0
                        free(fai);
1058
0
                        fai = NULL;
1059
0
                    }
1060
0
                }
1061
0
            }
1062
0
        }
1063
0
    }
1064
1065
0
    return fai;
1066
0
}