Coverage Report

Created: 2025-12-14 06:43

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/faidx.c
Line
Count
Source
1
/*  faidx.c -- FASTA and FASTQ random access.
2
3
    Copyright (C) 2008, 2009, 2013-2020, 2022, 2024 Genome Research Ltd.
4
    Portions copyright (C) 2011 Broad Institute.
5
6
    Author: Heng Li <lh3@sanger.ac.uk>
7
8
Permission is hereby granted, free of charge, to any person obtaining a copy
9
of this software and associated documentation files (the "Software"), to deal
10
in the Software without restriction, including without limitation the rights
11
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
copies of the Software, and to permit persons to whom the Software is
13
furnished to do so, subject to the following conditions:
14
15
The above copyright notice and this permission notice shall be included in
16
all copies or substantial portions of the Software.
17
18
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24
DEALINGS IN THE SOFTWARE.  */
25
26
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
27
#include <config.h>
28
29
#include <ctype.h>
30
#include <string.h>
31
#include <stdlib.h>
32
#include <stdio.h>
33
#include <inttypes.h>
34
#include <errno.h>
35
#include <limits.h>
36
#include <unistd.h>
37
#include <assert.h>
38
39
#include "htslib/bgzf.h"
40
#include "htslib/faidx.h"
41
#include "htslib/hfile.h"
42
#include "htslib/khash.h"
43
#include "htslib/kstring.h"
44
#include "hts_internal.h"
45
46
// Faster isgraph; assumes ASCII
47
81.6k
static inline int isgraph_(unsigned char c) {
48
81.6k
    return c > ' ' && c <= '~';
49
81.6k
}
50
51
#ifdef isgraph
52
#  undef isgraph
53
#endif
54
81.6k
#define isgraph isgraph_
55
56
// An optimised bgzf_getc.
57
// We could consider moving this to bgzf.h, but our own code uses it here only.
58
1.66M
static inline int bgzf_getc_(BGZF *fp) {
59
1.66M
    if (fp->block_offset+1 < fp->block_length) {
60
1.65M
        int c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++];
61
1.65M
        fp->uncompressed_address++;
62
1.65M
        return c;
63
1.65M
    }
64
65
781
    return bgzf_getc(fp);
66
1.66M
}
67
1.66M
#define bgzf_getc bgzf_getc_
68
69
typedef struct {
70
    int id; // faidx_t->name[id] is for this struct.
71
    uint32_t line_len, line_blen;
72
    uint64_t len;
73
    uint64_t seq_offset;
74
    uint64_t qual_offset;
75
} faidx1_t;
76
KHASH_MAP_INIT_STR(s, faidx1_t)
77
78
struct faidx_t {
79
    BGZF *bgzf;
80
    int n, m;
81
    char **name;
82
    khash_t(s) *hash;
83
    enum fai_format_options format;
84
};
85
86
static int fai_name2id(void *v, const char *ref)
87
0
{
88
0
    faidx_t *fai = (faidx_t *)v;
89
0
    khint_t k = kh_get(s, fai->hash, ref);
90
0
    return k == kh_end(fai->hash) ? -1 : kh_val(fai->hash, k).id;
91
0
}
92
93
static inline int fai_insert_index(faidx_t *idx, const char *name, uint64_t len, uint32_t line_len, uint32_t line_blen, uint64_t seq_offset, uint64_t qual_offset)
94
1.19k
{
95
1.19k
    if (!name) {
96
0
        hts_log_error("Malformed line");
97
0
        return -1;
98
0
    }
99
100
1.19k
    char *name_key = strdup(name);
101
1.19k
    int absent;
102
1.19k
    khint_t k = kh_put(s, idx->hash, name_key, &absent);
103
1.19k
    faidx1_t *v = &kh_value(idx->hash, k);
104
105
1.19k
    if (! absent) {
106
569
        hts_log_warning("Ignoring duplicate sequence \"%s\" at byte offset %" PRIu64, name, seq_offset);
107
569
        free(name_key);
108
569
        return 0;
109
569
    }
110
111
624
    if (idx->n == idx->m) {
112
117
        char **tmp;
113
117
        idx->m = idx->m? idx->m<<1 : 16;
114
117
        if (!(tmp = (char**)realloc(idx->name, sizeof(char*) * idx->m))) {
115
0
            hts_log_error("Out of memory");
116
0
            return -1;
117
0
        }
118
117
        idx->name = tmp;
119
117
    }
120
624
    v->id = idx->n;
121
624
    idx->name[idx->n++] = name_key;
122
624
    v->len = len;
123
624
    v->line_len = line_len;
124
624
    v->line_blen = line_blen;
125
624
    v->seq_offset = seq_offset;
126
624
    v->qual_offset = qual_offset;
127
128
624
    return 0;
129
624
}
130
131
132
258
static faidx_t *fai_build_core(BGZF *bgzf) {
133
258
    kstring_t name = { 0, 0, NULL };
134
258
    int c, read_done, line_num;
135
258
    faidx_t *idx;
136
258
    uint64_t seq_offset, qual_offset;
137
258
    uint64_t seq_len, qual_len;
138
258
    uint64_t char_len, cl, line_len, ll;
139
258
    enum read_state {OUT_READ, IN_NAME, IN_SEQ, SEQ_END, IN_QUAL} state;
140
141
258
    idx = (faidx_t*)calloc(1, sizeof(faidx_t));
142
258
    idx->hash = kh_init(s);
143
258
    idx->format = FAI_NONE;
144
145
258
    state = OUT_READ, read_done = 0, line_num = 1;
146
258
    seq_offset = qual_offset = seq_len = qual_len = char_len = cl = line_len = ll = 0;
147
148
6.02k
    while ((c = bgzf_getc(bgzf)) >= 0) {
149
5.90k
        switch (state) {
150
856
            case OUT_READ:
151
856
                switch (c) {
152
602
                    case '>':
153
602
                        if (idx->format == FAI_FASTQ) {
154
1
                            hts_log_error("Found '>' in a FASTQ file, error at line %d", line_num);
155
1
                            goto fail;
156
1
                        }
157
158
601
                        idx->format = FAI_FASTA;
159
601
                        state = IN_NAME;
160
601
                    break;
161
162
124
                    case '@':
163
124
                        if (idx->format == FAI_FASTA) {
164
1
                            hts_log_error("Found '@' in a FASTA file, error at line %d", line_num);
165
1
                            goto fail;
166
1
                        }
167
168
123
                        idx->format = FAI_FASTQ;
169
123
                        state = IN_NAME;
170
123
                    break;
171
172
42
                    case '\r':
173
                        // Blank line with cr-lf ending?
174
42
                        if ((c = bgzf_getc(bgzf)) == '\n') {
175
30
                            line_num++;
176
30
                        } else {
177
12
                            hts_log_error("Format error, carriage return not followed by new line at line %d", line_num);
178
12
                            goto fail;
179
12
                        }
180
30
                    break;
181
182
72
                    case '\n':
183
                        // just move onto the next line
184
72
                        line_num++;
185
72
                    break;
186
187
16
                    default: {
188
16
                        char s[4] = { '"', c, '"', '\0' };
189
16
                        hts_log_error("Format error, unexpected %s at line %d", isprint(c) ? s : "character", line_num);
190
16
                        goto fail;
191
42
                    }
192
856
                }
193
826
            break;
194
195
1.82k
            case IN_NAME:
196
1.82k
                if (read_done) {
197
1.15k
                    if (fai_insert_index(idx, name.s, seq_len, line_len, char_len, seq_offset, qual_offset) != 0)
198
0
                        goto fail;
199
200
1.15k
                    read_done = 0;
201
1.15k
                }
202
203
1.82k
                name.l = 0;
204
205
211k
                do {
206
211k
                    if (!isspace(c)) {
207
209k
                        kputc(c, &name);
208
209k
                    } else if (name.l > 0 || c == '\n') {
209
1.77k
                        break;
210
1.77k
                    }
211
211k
                } while ((c = bgzf_getc(bgzf)) >= 0);
212
213
1.82k
                kputsn("", 0, &name);
214
215
1.82k
                if (c < 0) {
216
56
                    hts_log_error("The last entry '%s' has no sequence at line %d", name.s, line_num);
217
56
                    goto fail;
218
56
                }
219
220
                // read the rest of the line if necessary
221
1.35M
                if (c != '\n') while ((c = bgzf_getc(bgzf)) >= 0 && c != '\n');
222
223
1.77k
                state = IN_SEQ; seq_len = qual_len = char_len = line_len = 0;
224
1.77k
                seq_offset = bgzf_utell(bgzf);
225
1.77k
                line_num++;
226
1.77k
            break;
227
228
3.10k
            case IN_SEQ:
229
3.10k
                if (idx->format == FAI_FASTA) {
230
2.87k
                    if (c == '\n') {
231
216
                        state = OUT_READ;
232
216
                        line_num++;
233
216
                        continue;
234
2.65k
                    } else if (c == '>') {
235
1.10k
                        state = IN_NAME;
236
1.10k
                        continue;
237
1.10k
                    }
238
2.87k
                } else if (idx->format == FAI_FASTQ) {
239
226
                    if (c == '+') {
240
35
                        state = IN_QUAL;
241
1.77k
                        if (c != '\n') while ((c = bgzf_getc(bgzf)) >= 0 && c != '\n');
242
35
                        qual_offset = bgzf_utell(bgzf);
243
35
                        line_num++;
244
35
                        continue;
245
191
                    } else if (c == '\n') {
246
4
                        hts_log_error("Inlined empty line is not allowed in sequence '%s' at line %d", name.s, line_num);
247
4
                        goto fail;
248
4
                    }
249
226
                }
250
251
1.74k
                ll = cl = 0;
252
253
1.74k
                if (idx->format == FAI_FASTA) read_done = 1;
254
255
80.8k
                do {
256
80.8k
                    ll++;
257
80.8k
                    if (isgraph(c)) cl++;
258
80.8k
                } while ((c = bgzf_getc(bgzf)) >= 0 && c != '\n');
259
260
1.74k
                ll++; seq_len += cl;
261
262
1.74k
                if (line_len == 0) {
263
1.28k
                    line_len = ll;
264
1.28k
                    char_len = cl;
265
1.28k
                } else if (line_len > ll) {
266
267
341
                    if (idx->format == FAI_FASTA)
268
296
                        state = OUT_READ;
269
45
                    else
270
45
                        state = SEQ_END;
271
272
341
                } else if (line_len < ll) {
273
12
                    hts_log_error("Different line length in sequence '%s' at line %d", name.s, line_num);
274
12
                    goto fail;
275
12
                }
276
277
1.73k
                line_num++;
278
1.73k
            break;
279
280
42
            case SEQ_END:
281
42
                if (c == '+') {
282
41
                    state = IN_QUAL;
283
1.18k
                    while ((c = bgzf_getc(bgzf)) >= 0 && c != '\n');
284
41
                    qual_offset = bgzf_utell(bgzf);
285
41
                    line_num++;
286
41
                } else {
287
1
                    hts_log_error("Format error, expecting '+', got '%c' at line %d", c, line_num);
288
1
                    goto fail;
289
1
                }
290
41
            break;
291
292
83
            case IN_QUAL:
293
83
                if (c == '\n') {
294
2
                    if (!read_done) {
295
1
                        hts_log_error("Inlined empty line is not allowed in quality of sequence '%s' at line %d", name.s, line_num);
296
1
                        goto fail;
297
1
                    }
298
299
1
                    state = OUT_READ;
300
1
                    line_num++;
301
1
                    continue;
302
81
                } else if (c == '@' && read_done) {
303
7
                    state = IN_NAME;
304
7
                    continue;
305
7
                }
306
307
74
                ll = cl = 0;
308
309
882
                do {
310
882
                    ll++;
311
882
                    if (isgraph(c)) cl++;
312
882
                } while ((c = bgzf_getc(bgzf)) >= 0 && c != '\n');
313
314
74
                ll++; qual_len += cl;
315
316
74
                if (line_len < ll) {
317
27
                    hts_log_error("Quality line length too long in '%s' at line %d", name.s, line_num);
318
27
                    goto fail;
319
47
                } else if (qual_len == seq_len) {
320
19
                    read_done = 1;
321
28
                } else if (qual_len > seq_len) {
322
4
                    hts_log_error("Quality length longer than sequence in '%s' at line %d", name.s, line_num);
323
4
                    goto fail;
324
24
                } else if (line_len > ll) {
325
9
                    hts_log_error("Quality line length too short in '%s' at line %d", name.s, line_num);
326
9
                    goto fail;
327
9
                }
328
329
34
                line_num++;
330
34
            break;
331
5.90k
        }
332
5.90k
    }
333
334
114
    if (read_done) {
335
43
        if (fai_insert_index(idx, name.s, seq_len, line_len, char_len, seq_offset, qual_offset) != 0)
336
0
            goto fail;
337
71
    } else {
338
71
        hts_log_error("File truncated at line %d", line_num);
339
71
        goto fail;
340
71
    }
341
342
43
    free(name.s);
343
43
    return idx;
344
345
215
fail:
346
215
    free(name.s);
347
215
    fai_destroy(idx);
348
215
    return NULL;
349
114
}
350
351
352
0
static int fai_save(const faidx_t *fai, hFILE *fp) {
353
0
    khint_t k;
354
0
    int i;
355
0
    char buf[96]; // Must be big enough for format below.
356
357
0
    for (i = 0; i < fai->n; ++i) {
358
0
        faidx1_t x;
359
0
        k = kh_get(s, fai->hash, fai->name[i]);
360
0
        assert(k < kh_end(fai->hash));
361
0
        x = kh_value(fai->hash, k);
362
363
0
        if (fai->format == FAI_FASTA) {
364
0
            snprintf(buf, sizeof(buf),
365
0
                 "\t%"PRIu64"\t%"PRIu64"\t%"PRIu32"\t%"PRIu32"\n",
366
0
                 x.len, x.seq_offset, x.line_blen, x.line_len);
367
0
        } else {
368
0
            snprintf(buf, sizeof(buf),
369
0
                 "\t%"PRIu64"\t%"PRIu64"\t%"PRIu32"\t%"PRIu32"\t%"PRIu64"\n",
370
0
                 x.len, x.seq_offset, x.line_blen, x.line_len, x.qual_offset);
371
0
        }
372
373
0
        if (hputs(fai->name[i], fp) != 0) return -1;
374
0
        if (hputs(buf, fp) != 0) return -1;
375
0
    }
376
0
    return 0;
377
0
}
378
379
380
static faidx_t *fai_read(hFILE *fp, const char *fname, int format)
381
0
{
382
0
    faidx_t *fai;
383
0
    char *buf = NULL, *p;
384
0
    ssize_t l, lnum = 1;
385
386
0
    fai = (faidx_t*)calloc(1, sizeof(faidx_t));
387
0
    if (!fai) return NULL;
388
389
0
    fai->hash = kh_init(s);
390
0
    if (!fai->hash) goto fail;
391
392
0
    buf = (char*)calloc(0x10000, 1);
393
0
    if (!buf) goto fail;
394
395
0
    while ((l = hgetln(buf, 0x10000, fp)) > 0) {
396
0
        uint32_t line_len, line_blen, n;
397
0
        uint64_t len;
398
0
        uint64_t seq_offset;
399
0
        uint64_t qual_offset = 0;
400
401
0
        for (p = buf; *p && !isspace_c(*p); ++p);
402
403
0
        if (p - buf < l) {
404
0
            *p = 0; ++p;
405
0
        }
406
407
0
        if (format == FAI_FASTA) {
408
0
            n = sscanf(p, "%"SCNu64"%"SCNu64"%"SCNu32"%"SCNu32, &len, &seq_offset, &line_blen, &line_len);
409
410
0
            if (n != 4) {
411
0
                hts_log_error("Could not understand FASTA index %s line %zd", fname, lnum);
412
0
                goto fail;
413
0
            }
414
0
        } else {
415
0
            n = sscanf(p, "%"SCNu64"%"SCNu64"%"SCNu32"%"SCNu32"%"SCNu64, &len, &seq_offset, &line_blen, &line_len, &qual_offset);
416
417
0
            if (n != 5) {
418
0
                if (n == 4) {
419
0
                    hts_log_error("Possibly this is a FASTA index, try using faidx.  Problem in %s line %zd", fname, lnum);
420
0
                } else {
421
0
                    hts_log_error("Could not understand FASTQ index %s line %zd", fname, lnum);
422
0
                }
423
424
0
                goto fail;
425
0
            }
426
0
        }
427
428
0
        if (fai_insert_index(fai, buf, len, line_len, line_blen, seq_offset, qual_offset) != 0) {
429
0
            goto fail;
430
0
        }
431
432
0
        if (buf[l - 1] == '\n') ++lnum;
433
0
    }
434
435
0
    if (l < 0) {
436
0
        hts_log_error("Error while reading %s: %s", fname, strerror(errno));
437
0
        goto fail;
438
0
    }
439
0
    free(buf);
440
0
    return fai;
441
442
0
 fail:
443
0
    free(buf);
444
0
    fai_destroy(fai);
445
0
    return NULL;
446
0
}
447
448
void fai_destroy(faidx_t *fai)
449
667
{
450
667
    int i;
451
667
    if (!fai) return;
452
882
    for (i = 0; i < fai->n; ++i) free(fai->name[i]);
453
258
    free(fai->name);
454
258
    kh_destroy(s, fai->hash);
455
258
    if (fai->bgzf) bgzf_close(fai->bgzf);
456
258
    free(fai);
457
258
}
458
459
460
static int fai_build3_core(const char *fn, const char *fnfai, const char *fngzi)
461
452
{
462
452
    kstring_t fai_kstr = { 0, 0, NULL };
463
452
    kstring_t gzi_kstr = { 0, 0, NULL };
464
452
    BGZF *bgzf = NULL;
465
452
    hFILE *fp = NULL;
466
452
    faidx_t *fai = NULL;
467
452
    int save_errno, res;
468
452
    char *file_type;
469
470
452
    bgzf = bgzf_open(fn, "r");
471
472
452
    if ( !bgzf ) {
473
194
        hts_log_error("Failed to open the file %s : %s", fn, strerror(errno));
474
194
        goto fail;
475
194
    }
476
477
258
    if ( bgzf->is_compressed ) {
478
7
        if (bgzf_index_build_init(bgzf) != 0) {
479
0
            hts_log_error("Failed to allocate bgzf index");
480
0
            goto fail;
481
0
        }
482
7
    }
483
484
258
    fai = fai_build_core(bgzf);
485
486
258
    if ( !fai ) {
487
215
        if (bgzf->is_compressed && bgzf->is_gzip) {
488
7
            hts_log_error("Cannot index files compressed with gzip, please use bgzip");
489
7
        }
490
215
        goto fail;
491
215
    }
492
493
43
    if (fai->format == FAI_FASTA) {
494
41
        file_type   = "FASTA";
495
41
    } else {
496
2
        file_type   = "FASTQ";
497
2
    }
498
499
43
    if (!fnfai) {
500
43
        if (ksprintf(&fai_kstr, "%s.fai", fn) < 0) goto fail;
501
43
        fnfai = fai_kstr.s;
502
43
    }
503
504
43
    if (!fngzi) {
505
43
        if (ksprintf(&gzi_kstr, "%s.gzi", fn) < 0) goto fail;
506
43
        fngzi = gzi_kstr.s;
507
43
    }
508
509
43
    if ( bgzf->is_compressed ) {
510
0
        if (bgzf_index_dump(bgzf, fngzi, NULL) < 0) {
511
0
            hts_log_error("Failed to make bgzf index %s", fngzi);
512
0
            goto fail;
513
0
        }
514
0
    }
515
516
43
    res = bgzf_close(bgzf);
517
43
    bgzf = NULL;
518
519
43
    if (res < 0) {
520
0
        hts_log_error("Error on closing %s : %s", fn, strerror(errno));
521
0
        goto fail;
522
0
    }
523
524
43
    fp = hopen(fnfai, "wb");
525
526
43
    if ( !fp ) {
527
43
        hts_log_error("Failed to open %s index %s : %s", file_type, fnfai, strerror(errno));
528
43
        goto fail;
529
43
    }
530
531
0
    if (fai_save(fai, fp) != 0) {
532
0
        hts_log_error("Failed to write %s index %s : %s", file_type, fnfai, strerror(errno));
533
0
        goto fail;
534
0
    }
535
536
0
    if (hclose(fp) != 0) {
537
0
        hts_log_error("Failed on closing %s index %s : %s", file_type, fnfai, strerror(errno));
538
0
        goto fail;
539
0
    }
540
541
0
    free(fai_kstr.s);
542
0
    free(gzi_kstr.s);
543
0
    fai_destroy(fai);
544
0
    return 0;
545
546
452
 fail:
547
452
    save_errno = errno;
548
452
    free(fai_kstr.s);
549
452
    free(gzi_kstr.s);
550
452
    bgzf_close(bgzf);
551
452
    fai_destroy(fai);
552
452
    errno = save_errno;
553
452
    return -1;
554
0
}
555
556
557
452
int fai_build3(const char *fn, const char *fnfai, const char *fngzi) {
558
452
    return fai_build3_core(fn, fnfai, fngzi);
559
452
}
560
561
562
452
int fai_build(const char *fn) {
563
452
    return fai_build3(fn, NULL, NULL);
564
452
}
565
566
567
static faidx_t *fai_load3_core(const char *fn, const char *fnfai, const char *fngzi,
568
                   int flags, int format)
569
0
{
570
0
    kstring_t fai_kstr = { 0, 0, NULL };
571
0
    kstring_t gzi_kstr = { 0, 0, NULL };
572
0
    hFILE *fp = NULL;
573
0
    faidx_t *fai = NULL;
574
0
    int res, gzi_index_needed = 0;
575
0
    char *file_type;
576
577
0
    if (format == FAI_FASTA) {
578
0
        file_type   = "FASTA";
579
0
    } else {
580
0
        file_type   = "FASTQ";
581
0
    }
582
583
0
    if (fn == NULL)
584
0
        return NULL;
585
586
0
    if (fnfai == NULL) {
587
0
        if (ksprintf(&fai_kstr, "%s.fai", fn) < 0) goto fail;
588
0
        fnfai = fai_kstr.s;
589
0
    }
590
0
    if (fngzi == NULL) {
591
0
        if (ksprintf(&gzi_kstr, "%s.gzi", fn) < 0) goto fail;
592
0
        fngzi = gzi_kstr.s;
593
0
    }
594
595
0
    fp = hopen(fnfai, "rb");
596
597
0
    if (fp) {
598
        // index file present, check if a compressed index is needed
599
0
        hFILE *gz = NULL;
600
0
        BGZF *bgzf = bgzf_open(fn, "rb");
601
602
0
        if (bgzf == 0) {
603
0
            hts_log_error("Failed to open %s file %s", file_type, fn);
604
0
            goto fail;
605
0
        }
606
607
0
        if (bgzf_compression(bgzf) == 2) { // BGZF compression
608
0
            if ((gz = hopen(fngzi, "rb")) == 0) {
609
610
0
                if (!(flags & FAI_CREATE) || errno != ENOENT) {
611
0
                    hts_log_error("Failed to open %s index %s: %s", file_type, fngzi, strerror(errno));
612
0
                    bgzf_close(bgzf);
613
0
                    goto fail;
614
0
                }
615
616
0
                gzi_index_needed = 1;
617
0
                res = hclose(fp); // closed as going to be re-indexed
618
619
0
                if (res < 0) {
620
0
                    hts_log_error("Failed on closing %s index %s : %s", file_type, fnfai, strerror(errno));
621
0
                    goto fail;
622
0
                }
623
0
            } else {
624
0
                res = hclose(gz);
625
626
0
                if (res < 0) {
627
0
                    hts_log_error("Failed on closing %s index %s : %s", file_type, fngzi, strerror(errno));
628
0
                    goto fail;
629
0
                }
630
0
            }
631
0
        }
632
633
0
        bgzf_close(bgzf);
634
0
    }
635
636
0
    if (fp == 0 || gzi_index_needed) {
637
0
        if (!(flags & FAI_CREATE) || errno != ENOENT) {
638
0
            hts_log_error("Failed to open %s index %s: %s", file_type, fnfai, strerror(errno));
639
0
            goto fail;
640
0
        }
641
642
0
        hts_log_info("Build %s index", file_type);
643
644
0
        if (fai_build3_core(fn, fnfai, fngzi) < 0) {
645
0
            goto fail;
646
0
        }
647
648
0
        fp = hopen(fnfai, "rb");
649
0
        if (fp == 0) {
650
0
            hts_log_error("Failed to open %s index %s: %s", file_type, fnfai, strerror(errno));
651
0
            goto fail;
652
0
        }
653
0
    }
654
655
0
    fai = fai_read(fp, fnfai, format);
656
0
    if (fai == NULL) {
657
0
        hts_log_error("Failed to read %s index %s", file_type, fnfai);
658
0
        goto fail;
659
0
    }
660
661
0
    res = hclose(fp);
662
0
    fp = NULL;
663
0
    if (res < 0) {
664
0
        hts_log_error("Failed on closing %s index %s : %s", file_type, fnfai, strerror(errno));
665
0
        goto fail;
666
0
    }
667
668
0
    fai->bgzf = bgzf_open(fn, "rb");
669
0
    if (fai->bgzf == 0) {
670
0
        hts_log_error("Failed to open %s file %s", file_type, fn);
671
0
        goto fail;
672
0
    }
673
674
0
    if ( fai->bgzf->is_compressed==1 ) {
675
0
        if ( bgzf_index_load(fai->bgzf, fngzi, NULL) < 0 ) {
676
0
            hts_log_error("Failed to load .gzi index: %s", fngzi);
677
0
            goto fail;
678
0
        }
679
0
    }
680
0
    free(fai_kstr.s);
681
0
    free(gzi_kstr.s);
682
0
    return fai;
683
684
0
 fail:
685
0
    if (fai) fai_destroy(fai);
686
0
    if (fp) hclose_abruptly(fp);
687
0
    free(fai_kstr.s);
688
0
    free(gzi_kstr.s);
689
0
    return NULL;
690
0
}
691
692
693
faidx_t *fai_load3(const char *fn, const char *fnfai, const char *fngzi,
694
0
                   int flags) {
695
0
    return fai_load3_core(fn, fnfai, fngzi, flags, FAI_FASTA);
696
0
}
697
698
699
faidx_t *fai_load(const char *fn)
700
0
{
701
0
    return fai_load3(fn, NULL, NULL, FAI_CREATE);
702
0
}
703
704
705
faidx_t *fai_load3_format(const char *fn, const char *fnfai, const char *fngzi,
706
0
                   int flags, enum fai_format_options format) {
707
0
    return fai_load3_core(fn, fnfai, fngzi, flags, format);
708
0
}
709
710
711
0
faidx_t *fai_load_format(const char *fn, enum fai_format_options format) {
712
0
    return fai_load3_format(fn, NULL, NULL, FAI_CREATE, format);
713
0
}
714
715
716
static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val,
717
0
                          uint64_t offset, hts_pos_t beg, hts_pos_t end, hts_pos_t *len) {
718
0
    char *buffer, *s;
719
0
    ssize_t nread, remaining, firstline_len, firstline_blen;
720
0
    int ret;
721
722
0
    if ((uint64_t) end - (uint64_t) beg >= SIZE_MAX - 2) {
723
0
        hts_log_error("Range %"PRId64"..%"PRId64" too big", beg, end);
724
0
        *len = -1;
725
0
        return NULL;
726
0
    }
727
728
0
    if (val->line_blen <= 0) {
729
0
        hts_log_error("Invalid line length in index: %d", val->line_blen);
730
0
        *len = -1;
731
0
        return NULL;
732
0
    }
733
734
0
    ret = bgzf_useek(fai->bgzf,
735
0
                     offset
736
0
                     + beg / val->line_blen * val->line_len
737
0
                     + beg % val->line_blen, SEEK_SET);
738
739
0
    if (ret < 0) {
740
0
        *len = -1;
741
0
        hts_log_error("Failed to retrieve block. (Seeking in a compressed, .gzi unindexed, file?)");
742
0
        return NULL;
743
0
    }
744
745
    // Over-allocate so there is extra space for one end-of-line sequence
746
0
    buffer = (char*)malloc((size_t) end - beg + val->line_len - val->line_blen + 1);
747
0
    if (!buffer) {
748
0
        *len = -1;
749
0
        return NULL;
750
0
    }
751
752
0
    remaining = *len = end - beg;
753
0
    firstline_blen = val->line_blen - beg % val->line_blen;
754
755
    // Special case when the entire interval requested is within a single FASTA/Q line
756
0
    if (remaining <= firstline_blen) {
757
0
        nread = bgzf_read_small(fai->bgzf, buffer, remaining);
758
0
        if (nread < remaining) goto error;
759
0
        buffer[nread] = '\0';
760
0
        return buffer;
761
0
    }
762
763
0
    s = buffer;
764
0
    firstline_len = val->line_len - beg % val->line_blen;
765
766
    // Read the (partial) first line and its line terminator, but increment  s  past the
767
    // line contents only, so the terminator characters will be overwritten by the next line.
768
0
    nread = bgzf_read_small(fai->bgzf, s, firstline_len);
769
0
    if (nread < firstline_len) goto error;
770
0
    s += firstline_blen;
771
0
    remaining -= firstline_blen;
772
773
    // Similarly read complete lines and their line terminator characters, but overwrite the latter.
774
0
    while (remaining > val->line_blen) {
775
0
        nread = bgzf_read_small(fai->bgzf, s, val->line_len);
776
0
        if (nread < (ssize_t) val->line_len) goto error;
777
0
        s += val->line_blen;
778
0
        remaining -= val->line_blen;
779
0
    }
780
781
0
    if (remaining > 0) {
782
0
        nread = bgzf_read_small(fai->bgzf, s, remaining);
783
0
        if (nread < remaining) goto error;
784
0
        s += remaining;
785
0
    }
786
787
0
    *s = '\0';
788
0
    return buffer;
789
790
0
error:
791
0
    hts_log_error("Failed to retrieve block: %s",
792
0
                  (nread == 0)? "unexpected end of file" : "error reading file");
793
0
    free(buffer);
794
0
    *len = -1;
795
0
    return NULL;
796
0
}
797
798
static int fai_get_val(const faidx_t *fai, const char *str,
799
0
                       hts_pos_t *len, faidx1_t *val, hts_pos_t *fbeg, hts_pos_t *fend) {
800
0
    khiter_t iter;
801
0
    khash_t(s) *h;
802
0
    int id;
803
0
    hts_pos_t beg, end;
804
805
0
    if (!fai_parse_region(fai, str, &id, &beg, &end, 0)) {
806
0
        hts_log_warning("Reference %s not found in FASTA file, returning empty sequence", str);
807
0
        *len = -2;
808
0
        return 1;
809
0
    }
810
811
0
    h = fai->hash;
812
0
    iter = kh_get(s, h, faidx_iseq(fai, id));
813
0
    if (iter >= kh_end(h)) {
814
        // should have already been caught above
815
0
        abort();
816
0
    }
817
0
    *val = kh_value(h, iter);
818
819
0
    if (beg >= val->len) beg = val->len;
820
0
    if (end >= val->len) end = val->len;
821
0
    if (beg > end) beg = end;
822
823
0
    *fbeg = beg;
824
0
    *fend = end;
825
826
0
    return 0;
827
0
}
828
829
/*
830
 *  The internal still has line_blen as uint32_t, but our references
831
 *  can be longer, so for future proofing we use hts_pos_t.  We also needed
832
 *  a signed value so we can return negatives as an error.
833
 */
834
hts_pos_t fai_line_length(const faidx_t *fai, const char *str)
835
0
{
836
0
    faidx1_t val;
837
0
    int64_t beg, end;
838
0
    hts_pos_t len;
839
840
0
    if (fai_get_val(fai, str, &len, &val, &beg, &end))
841
0
        return -1;
842
0
    else
843
0
        return val.line_blen;
844
0
}
845
846
char *fai_fetch64(const faidx_t *fai, const char *str, hts_pos_t *len)
847
0
{
848
0
    faidx1_t val;
849
0
    int64_t beg, end;
850
851
0
    if (fai_get_val(fai, str, len, &val, &beg, &end)) {
852
0
        return NULL;
853
0
    }
854
855
    // now retrieve the sequence
856
0
    return fai_retrieve(fai, &val, val.seq_offset, beg, end, len);
857
0
}
858
859
char *fai_fetch(const faidx_t *fai, const char *str, int *len)
860
0
{
861
0
    hts_pos_t len64;
862
0
    char *ret = fai_fetch64(fai, str, &len64);
863
0
    *len = len64 < INT_MAX ? len64 : INT_MAX; // trunc
864
0
    return ret;
865
0
}
866
867
0
char *fai_fetchqual64(const faidx_t *fai, const char *str, hts_pos_t *len) {
868
0
    faidx1_t val;
869
0
    int64_t beg, end;
870
871
0
    if (fai_get_val(fai, str, len, &val, &beg, &end)) {
872
0
        return NULL;
873
0
    }
874
875
    // now retrieve the sequence
876
0
    return fai_retrieve(fai, &val, val.qual_offset, beg, end, len);
877
0
}
878
879
0
char *fai_fetchqual(const faidx_t *fai, const char *str, int *len) {
880
0
    hts_pos_t len64;
881
0
    char *ret = fai_fetchqual64(fai, str, &len64);
882
0
    *len = len64 < INT_MAX ? len64 : INT_MAX; // trunc
883
0
    return ret;
884
0
}
885
886
int faidx_fetch_nseq(const faidx_t *fai)
887
0
{
888
0
    return fai->n;
889
0
}
890
891
int faidx_nseq(const faidx_t *fai)
892
0
{
893
0
    return fai->n;
894
0
}
895
896
const char *faidx_iseq(const faidx_t *fai, int i)
897
0
{
898
0
    return fai->name[i];
899
0
}
900
901
hts_pos_t faidx_seq_len64(const faidx_t *fai, const char *seq)
902
0
{
903
0
    khint_t k = kh_get(s, fai->hash, seq);
904
0
    if ( k == kh_end(fai->hash) ) return -1;
905
0
    return kh_val(fai->hash, k).len;
906
0
}
907
908
int faidx_seq_len(const faidx_t *fai, const char *seq)
909
0
{
910
0
    hts_pos_t len = faidx_seq_len64(fai, seq);
911
0
    return len < INT_MAX ? len : INT_MAX;
912
0
}
913
914
static int faidx_adjust_position(const faidx_t *fai, int end_adjust,
915
                                 faidx1_t *val_out, const char *c_name,
916
                                 hts_pos_t *p_beg_i, hts_pos_t *p_end_i,
917
0
                                 hts_pos_t *len) {
918
0
    khiter_t iter;
919
0
    faidx1_t *val;
920
921
    // Adjust position
922
0
    iter = kh_get(s, fai->hash, c_name);
923
924
0
    if (iter == kh_end(fai->hash)) {
925
0
        if (len)
926
0
            *len = -2;
927
0
        hts_log_error("The sequence \"%s\" was not found", c_name);
928
0
        return 1;
929
0
    }
930
931
0
    val = &kh_value(fai->hash, iter);
932
933
0
    if (val_out)
934
0
        *val_out = *val;
935
936
0
    if(*p_end_i < *p_beg_i)
937
0
        *p_beg_i = *p_end_i;
938
939
0
    if(*p_beg_i < 0)
940
0
        *p_beg_i = 0;
941
0
    else if(val->len <= *p_beg_i)
942
0
        *p_beg_i = val->len;
943
944
0
    if(*p_end_i < 0)
945
0
        *p_end_i = 0;
946
0
    else if(val->len <= *p_end_i)
947
0
        *p_end_i = val->len - end_adjust;
948
949
0
    return 0;
950
0
}
951
952
int fai_adjust_region(const faidx_t *fai, int tid,
953
                      hts_pos_t *beg, hts_pos_t *end)
954
0
{
955
0
    hts_pos_t orig_beg, orig_end;
956
957
0
    if (!fai || !beg || !end || tid < 0 || tid >= fai->n)
958
0
        return -1;
959
960
0
    orig_beg = *beg;
961
0
    orig_end = *end;
962
0
    if (faidx_adjust_position(fai, 0, NULL, fai->name[tid], beg, end, NULL) != 0) {
963
0
        hts_log_error("Inconsistent faidx internal state - couldn't find \"%s\"",
964
0
                      fai->name[tid]);
965
0
        return -1;
966
0
    }
967
968
0
    return ((orig_beg != *beg ? 1 : 0) |
969
0
            (orig_end != *end && orig_end < HTS_POS_MAX ? 2 : 0));
970
0
}
971
972
char *faidx_fetch_seq64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len)
973
0
{
974
0
    faidx1_t val;
975
976
    // Adjust position
977
0
    if (faidx_adjust_position(fai, 1, &val, c_name, &p_beg_i, &p_end_i, len)) {
978
0
        return NULL;
979
0
    }
980
981
    // Now retrieve the sequence
982
0
    return fai_retrieve(fai, &val, val.seq_offset, p_beg_i, p_end_i + 1, len);
983
0
}
984
985
char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len)
986
0
{
987
0
    hts_pos_t len64;
988
0
    char *ret = faidx_fetch_seq64(fai, c_name, p_beg_i, p_end_i, &len64);
989
0
    *len = len64 < INT_MAX ? len64 : INT_MAX;  // trunc
990
0
    return ret;
991
0
}
992
993
char *faidx_fetch_qual64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len)
994
0
{
995
0
    faidx1_t val;
996
997
    // Adjust position
998
0
    if (faidx_adjust_position(fai, 1, &val, c_name, &p_beg_i, &p_end_i, len)) {
999
0
        return NULL;
1000
0
    }
1001
1002
    // Now retrieve the sequence
1003
0
    return fai_retrieve(fai, &val, val.qual_offset, p_beg_i, p_end_i + 1, len);
1004
0
}
1005
1006
char *faidx_fetch_qual(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len)
1007
0
{
1008
0
    hts_pos_t len64;
1009
0
    char *ret = faidx_fetch_qual64(fai, c_name, p_beg_i, p_end_i, &len64);
1010
0
    *len = len64 < INT_MAX ? len64 : INT_MAX;  // trunc
1011
0
    return ret;
1012
0
}
1013
1014
int faidx_has_seq(const faidx_t *fai, const char *seq)
1015
0
{
1016
0
    khiter_t iter = kh_get(s, fai->hash, seq);
1017
0
    if (iter == kh_end(fai->hash)) return 0;
1018
0
    return 1;
1019
0
}
1020
1021
const char *fai_parse_region(const faidx_t *fai, const char *s,
1022
                             int *tid, hts_pos_t *beg, hts_pos_t *end,
1023
                             int flags)
1024
0
{
1025
0
    return hts_parse_region(s, tid, beg, end, (hts_name2id_f)fai_name2id, (void *)fai, flags);
1026
0
}
1027
1028
0
void fai_set_cache_size(faidx_t *fai, int cache_size) {
1029
0
    bgzf_set_cache_size(fai->bgzf, cache_size);
1030
0
}
1031
1032
// Adds a thread pool to the underlying BGZF layer.
1033
0
int fai_thread_pool(faidx_t *fai, struct hts_tpool *pool, int qsize) {
1034
0
    return bgzf_thread_pool(fai->bgzf, pool, qsize);
1035
0
}
1036
1037
0
char *fai_path(const char *fa) {
1038
0
    char *fai = NULL;
1039
0
    if (!fa) {
1040
0
        hts_log_error("No reference file specified");
1041
0
    } else {
1042
0
        char *fai_tmp = strstr(fa, HTS_IDX_DELIM);
1043
0
        if (fai_tmp) {
1044
0
            fai_tmp += strlen(HTS_IDX_DELIM);
1045
0
            fai = strdup(fai_tmp);
1046
0
            if (!fai)
1047
0
                hts_log_error("Failed to allocate memory");
1048
0
        } else {
1049
0
            if (hisremote(fa)) {
1050
0
                fai = hts_idx_locatefn(fa, ".fai");       // get the remote fai file name, if any, but do not download the file
1051
0
                if (!fai)
1052
0
                    hts_log_error("Failed to locate index file for remote reference file '%s'", fa);
1053
0
            } else{
1054
0
                if (hts_idx_check_local(fa, HTS_FMT_FAI, &fai) == 0 && fai) {
1055
0
                    if (fai_build3(fa, fai, NULL) == -1) {      // create local fai file by indexing local fasta
1056
0
                        hts_log_error("Failed to build index file for reference file '%s'", fa);
1057
0
                        free(fai);
1058
0
                        fai = NULL;
1059
0
                    }
1060
0
                }
1061
0
            }
1062
0
        }
1063
0
    }
1064
1065
0
    return fai;
1066
0
}