Coverage Report

Created: 2023-11-19 06:13

/src/htslib/faidx.c
Line
Count
Source (jump to first uncovered line)
1
/*  faidx.c -- FASTA and FASTQ random access.
2
3
    Copyright (C) 2008, 2009, 2013-2020, 2022 Genome Research Ltd.
4
    Portions copyright (C) 2011 Broad Institute.
5
6
    Author: Heng Li <lh3@sanger.ac.uk>
7
8
Permission is hereby granted, free of charge, to any person obtaining a copy
9
of this software and associated documentation files (the "Software"), to deal
10
in the Software without restriction, including without limitation the rights
11
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
copies of the Software, and to permit persons to whom the Software is
13
furnished to do so, subject to the following conditions:
14
15
The above copyright notice and this permission notice shall be included in
16
all copies or substantial portions of the Software.
17
18
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24
DEALINGS IN THE SOFTWARE.  */
25
26
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
27
#include <config.h>
28
29
#include <ctype.h>
30
#include <string.h>
31
#include <stdlib.h>
32
#include <stdio.h>
33
#include <inttypes.h>
34
#include <errno.h>
35
#include <limits.h>
36
#include <unistd.h>
37
#include <assert.h>
38
39
#include "htslib/bgzf.h"
40
#include "htslib/faidx.h"
41
#include "htslib/hfile.h"
42
#include "htslib/khash.h"
43
#include "htslib/kstring.h"
44
#include "hts_internal.h"
45
46
typedef struct {
47
    int id; // faidx_t->name[id] is for this struct.
48
    uint32_t line_len, line_blen;
49
    uint64_t len;
50
    uint64_t seq_offset;
51
    uint64_t qual_offset;
52
} faidx1_t;
53
KHASH_MAP_INIT_STR(s, faidx1_t)
54
55
struct faidx_t {
56
    BGZF *bgzf;
57
    int n, m;
58
    char **name;
59
    khash_t(s) *hash;
60
    enum fai_format_options format;
61
};
62
63
static int fai_name2id(void *v, const char *ref)
64
0
{
65
0
    faidx_t *fai = (faidx_t *)v;
66
0
    khint_t k = kh_get(s, fai->hash, ref);
67
0
    return k == kh_end(fai->hash) ? -1 : kh_val(fai->hash, k).id;
68
0
}
69
70
static inline int fai_insert_index(faidx_t *idx, const char *name, uint64_t len, uint32_t line_len, uint32_t line_blen, uint64_t seq_offset, uint64_t qual_offset)
71
0
{
72
0
    if (!name) {
73
0
        hts_log_error("Malformed line");
74
0
        return -1;
75
0
    }
76
77
0
    char *name_key = strdup(name);
78
0
    int absent;
79
0
    khint_t k = kh_put(s, idx->hash, name_key, &absent);
80
0
    faidx1_t *v = &kh_value(idx->hash, k);
81
82
0
    if (! absent) {
83
0
        hts_log_warning("Ignoring duplicate sequence \"%s\" at byte offset %" PRIu64, name, seq_offset);
84
0
        free(name_key);
85
0
        return 0;
86
0
    }
87
88
0
    if (idx->n == idx->m) {
89
0
        char **tmp;
90
0
        idx->m = idx->m? idx->m<<1 : 16;
91
0
        if (!(tmp = (char**)realloc(idx->name, sizeof(char*) * idx->m))) {
92
0
            hts_log_error("Out of memory");
93
0
            return -1;
94
0
        }
95
0
        idx->name = tmp;
96
0
    }
97
0
    v->id = idx->n;
98
0
    idx->name[idx->n++] = name_key;
99
0
    v->len = len;
100
0
    v->line_len = line_len;
101
0
    v->line_blen = line_blen;
102
0
    v->seq_offset = seq_offset;
103
0
    v->qual_offset = qual_offset;
104
105
0
    return 0;
106
0
}
107
108
109
0
static faidx_t *fai_build_core(BGZF *bgzf) {
110
0
    kstring_t name = { 0, 0, NULL };
111
0
    int c, read_done, line_num;
112
0
    faidx_t *idx;
113
0
    uint64_t seq_offset, qual_offset;
114
0
    uint64_t seq_len, qual_len;
115
0
    uint64_t char_len, cl, line_len, ll;
116
0
    enum read_state {OUT_READ, IN_NAME, IN_SEQ, SEQ_END, IN_QUAL} state;
117
118
0
    idx = (faidx_t*)calloc(1, sizeof(faidx_t));
119
0
    idx->hash = kh_init(s);
120
0
    idx->format = FAI_NONE;
121
122
0
    state = OUT_READ, read_done = 0, line_num = 1;
123
0
    seq_offset = qual_offset = seq_len = qual_len = char_len = cl = line_len = ll = 0;
124
125
0
    while ((c = bgzf_getc(bgzf)) >= 0) {
126
0
        switch (state) {
127
0
            case OUT_READ:
128
0
                switch (c) {
129
0
                    case '>':
130
0
                        if (idx->format == FAI_FASTQ) {
131
0
                            hts_log_error("Found '>' in a FASTQ file, error at line %d", line_num);
132
0
                            goto fail;
133
0
                        }
134
135
0
                        idx->format = FAI_FASTA;
136
0
                        state = IN_NAME;
137
0
                    break;
138
139
0
                    case '@':
140
0
                        if (idx->format == FAI_FASTA) {
141
0
                            hts_log_error("Found '@' in a FASTA file, error at line %d", line_num);
142
0
                            goto fail;
143
0
                        }
144
145
0
                        idx->format = FAI_FASTQ;
146
0
                        state = IN_NAME;
147
0
                    break;
148
149
0
                    case '\r':
150
                        // Blank line with cr-lf ending?
151
0
                        if ((c = bgzf_getc(bgzf)) == '\n') {
152
0
                            line_num++;
153
0
                        } else {
154
0
                            hts_log_error("Format error, carriage return not followed by new line at line %d", line_num);
155
0
                            goto fail;
156
0
                        }
157
0
                    break;
158
159
0
                    case '\n':
160
                        // just move onto the next line
161
0
                        line_num++;
162
0
                    break;
163
164
0
                    default: {
165
0
                        char s[4] = { '"', c, '"', '\0' };
166
0
                        hts_log_error("Format error, unexpected %s at line %d", isprint(c) ? s : "character", line_num);
167
0
                        goto fail;
168
0
                    }
169
0
                }
170
0
            break;
171
172
0
            case IN_NAME:
173
0
                if (read_done) {
174
0
                    if (fai_insert_index(idx, name.s, seq_len, line_len, char_len, seq_offset, qual_offset) != 0)
175
0
                        goto fail;
176
177
0
                    read_done = 0;
178
0
                }
179
180
0
                name.l = 0;
181
182
0
                do {
183
0
                    if (!isspace(c)) {
184
0
                        kputc(c, &name);
185
0
                    } else if (name.l > 0 || c == '\n') {
186
0
                        break;
187
0
                    }
188
0
                } while ((c = bgzf_getc(bgzf)) >= 0);
189
190
0
                kputsn("", 0, &name);
191
192
0
                if (c < 0) {
193
0
                    hts_log_error("The last entry '%s' has no sequence", name.s);
194
0
                    goto fail;
195
0
                }
196
197
                // read the rest of the line if necessary
198
0
                if (c != '\n') while ((c = bgzf_getc(bgzf)) >= 0 && c != '\n');
199
200
0
                state = IN_SEQ; seq_len = qual_len = char_len = line_len = 0;
201
0
                seq_offset = bgzf_utell(bgzf);
202
0
                line_num++;
203
0
            break;
204
205
0
            case IN_SEQ:
206
0
                if (idx->format == FAI_FASTA) {
207
0
                    if (c == '\n') {
208
0
                        state = OUT_READ;
209
0
                        line_num++;
210
0
                        continue;
211
0
                    } else if (c == '>') {
212
0
                        state = IN_NAME;
213
0
                        continue;
214
0
                    }
215
0
                } else if (idx->format == FAI_FASTQ) {
216
0
                    if (c == '+') {
217
0
                        state = IN_QUAL;
218
0
                        if (c != '\n') while ((c = bgzf_getc(bgzf)) >= 0 && c != '\n');
219
0
                        qual_offset = bgzf_utell(bgzf);
220
0
                        line_num++;
221
0
                        continue;
222
0
                    } else if (c == '\n') {
223
0
                        hts_log_error("Inlined empty line is not allowed in sequence '%s' at line %d", name.s, line_num);
224
0
                        goto fail;
225
0
                    }
226
0
                }
227
228
0
                ll = cl = 0;
229
230
0
                if (idx->format == FAI_FASTA) read_done = 1;
231
232
0
                do {
233
0
                    ll++;
234
0
                    if (isgraph(c)) cl++;
235
0
                } while ((c = bgzf_getc(bgzf)) >= 0 && c != '\n');
236
237
0
                ll++; seq_len += cl;
238
239
0
                if (line_len == 0) {
240
0
                    line_len = ll;
241
0
                    char_len = cl;
242
0
                } else if (line_len > ll) {
243
244
0
                    if (idx->format == FAI_FASTA)
245
0
                        state = OUT_READ;
246
0
                    else
247
0
                        state = SEQ_END;
248
249
0
                } else if (line_len < ll) {
250
0
                    hts_log_error("Different line length in sequence '%s'", name.s);
251
0
                    goto fail;
252
0
                }
253
254
0
                line_num++;
255
0
            break;
256
257
0
            case SEQ_END:
258
0
                if (c == '+') {
259
0
                    state = IN_QUAL;
260
0
                    while ((c = bgzf_getc(bgzf)) >= 0 && c != '\n');
261
0
                    qual_offset = bgzf_utell(bgzf);
262
0
                    line_num++;
263
0
                } else {
264
0
                    hts_log_error("Format error, expecting '+', got '%c' at line %d", c, line_num);
265
0
                    goto fail;
266
0
                }
267
0
            break;
268
269
0
            case IN_QUAL:
270
0
                if (c == '\n') {
271
0
                    if (!read_done) {
272
0
                        hts_log_error("Inlined empty line is not allowed in quality of sequence '%s'", name.s);
273
0
                        goto fail;
274
0
                    }
275
276
0
                    state = OUT_READ;
277
0
                    line_num++;
278
0
                    continue;
279
0
                } else if (c == '@' && read_done) {
280
0
                    state = IN_NAME;
281
0
                    continue;
282
0
                }
283
284
0
                ll = cl = 0;
285
286
0
                do {
287
0
                    ll++;
288
0
                    if (isgraph(c)) cl++;
289
0
                } while ((c = bgzf_getc(bgzf)) >= 0 && c != '\n');
290
291
0
                ll++; qual_len += cl;
292
293
0
                if (line_len < ll) {
294
0
                    hts_log_error("Quality line length too long in '%s' at line %d", name.s, line_num);
295
0
                    goto fail;
296
0
                } else if (qual_len == seq_len) {
297
0
                    read_done = 1;
298
0
                } else if (qual_len > seq_len) {
299
0
                    hts_log_error("Quality length longer than sequence in '%s' at line %d", name.s, line_num);
300
0
                    goto fail;
301
0
                } else if (line_len > ll) {
302
0
                    hts_log_error("Quality line length too short in '%s' at line %d", name.s, line_num);
303
0
                    goto fail;
304
0
                }
305
306
0
                line_num++;
307
0
            break;
308
0
        }
309
0
    }
310
311
0
    if (read_done) {
312
0
        if (fai_insert_index(idx, name.s, seq_len, line_len, char_len, seq_offset, qual_offset) != 0)
313
0
            goto fail;
314
0
    } else {
315
0
        goto fail;
316
0
    }
317
318
0
    free(name.s);
319
0
    return idx;
320
321
0
fail:
322
0
    free(name.s);
323
0
    fai_destroy(idx);
324
0
    return NULL;
325
0
}
326
327
328
0
static int fai_save(const faidx_t *fai, hFILE *fp) {
329
0
    khint_t k;
330
0
    int i;
331
0
    char buf[96]; // Must be big enough for format below.
332
333
0
    for (i = 0; i < fai->n; ++i) {
334
0
        faidx1_t x;
335
0
        k = kh_get(s, fai->hash, fai->name[i]);
336
0
        assert(k < kh_end(fai->hash));
337
0
        x = kh_value(fai->hash, k);
338
339
0
        if (fai->format == FAI_FASTA) {
340
0
            snprintf(buf, sizeof(buf),
341
0
                 "\t%"PRIu64"\t%"PRIu64"\t%"PRIu32"\t%"PRIu32"\n",
342
0
                 x.len, x.seq_offset, x.line_blen, x.line_len);
343
0
        } else {
344
0
            snprintf(buf, sizeof(buf),
345
0
                 "\t%"PRIu64"\t%"PRIu64"\t%"PRIu32"\t%"PRIu32"\t%"PRIu64"\n",
346
0
                 x.len, x.seq_offset, x.line_blen, x.line_len, x.qual_offset);
347
0
        }
348
349
0
        if (hputs(fai->name[i], fp) != 0) return -1;
350
0
        if (hputs(buf, fp) != 0) return -1;
351
0
    }
352
0
    return 0;
353
0
}
354
355
356
static faidx_t *fai_read(hFILE *fp, const char *fname, int format)
357
0
{
358
0
    faidx_t *fai;
359
0
    char *buf = NULL, *p;
360
0
    ssize_t l, lnum = 1;
361
362
0
    fai = (faidx_t*)calloc(1, sizeof(faidx_t));
363
0
    if (!fai) return NULL;
364
365
0
    fai->hash = kh_init(s);
366
0
    if (!fai->hash) goto fail;
367
368
0
    buf = (char*)calloc(0x10000, 1);
369
0
    if (!buf) goto fail;
370
371
0
    while ((l = hgetln(buf, 0x10000, fp)) > 0) {
372
0
        uint32_t line_len, line_blen, n;
373
0
        uint64_t len;
374
0
        uint64_t seq_offset;
375
0
        uint64_t qual_offset = 0;
376
377
0
        for (p = buf; *p && !isspace_c(*p); ++p);
378
379
0
        if (p - buf < l) {
380
0
            *p = 0; ++p;
381
0
        }
382
383
0
        if (format == FAI_FASTA) {
384
0
            n = sscanf(p, "%"SCNu64"%"SCNu64"%"SCNu32"%"SCNu32, &len, &seq_offset, &line_blen, &line_len);
385
386
0
            if (n != 4) {
387
0
                hts_log_error("Could not understand FASTA index %s line %zd", fname, lnum);
388
0
                goto fail;
389
0
            }
390
0
        } else {
391
0
            n = sscanf(p, "%"SCNu64"%"SCNu64"%"SCNu32"%"SCNu32"%"SCNu64, &len, &seq_offset, &line_blen, &line_len, &qual_offset);
392
393
0
            if (n != 5) {
394
0
                if (n == 4) {
395
0
                    hts_log_error("Possibly this is a FASTA index, try using faidx.  Problem in %s line %zd", fname, lnum);
396
0
                } else {
397
0
                    hts_log_error("Could not understand FASTQ index %s line %zd", fname, lnum);
398
0
                }
399
400
0
                goto fail;
401
0
            }
402
0
        }
403
404
0
        if (fai_insert_index(fai, buf, len, line_len, line_blen, seq_offset, qual_offset) != 0) {
405
0
            goto fail;
406
0
        }
407
408
0
        if (buf[l - 1] == '\n') ++lnum;
409
0
    }
410
411
0
    if (l < 0) {
412
0
        hts_log_error("Error while reading %s: %s", fname, strerror(errno));
413
0
        goto fail;
414
0
    }
415
0
    free(buf);
416
0
    return fai;
417
418
0
 fail:
419
0
    free(buf);
420
0
    fai_destroy(fai);
421
0
    return NULL;
422
0
}
423
424
void fai_destroy(faidx_t *fai)
425
0
{
426
0
    int i;
427
0
    if (!fai) return;
428
0
    for (i = 0; i < fai->n; ++i) free(fai->name[i]);
429
0
    free(fai->name);
430
0
    kh_destroy(s, fai->hash);
431
0
    if (fai->bgzf) bgzf_close(fai->bgzf);
432
0
    free(fai);
433
0
}
434
435
436
static int fai_build3_core(const char *fn, const char *fnfai, const char *fngzi)
437
0
{
438
0
    kstring_t fai_kstr = { 0, 0, NULL };
439
0
    kstring_t gzi_kstr = { 0, 0, NULL };
440
0
    BGZF *bgzf = NULL;
441
0
    hFILE *fp = NULL;
442
0
    faidx_t *fai = NULL;
443
0
    int save_errno, res;
444
0
    char *file_type;
445
446
0
    bgzf = bgzf_open(fn, "r");
447
448
0
    if ( !bgzf ) {
449
0
        hts_log_error("Failed to open the file %s", fn);
450
0
        goto fail;
451
0
    }
452
453
0
    if ( bgzf->is_compressed ) {
454
0
        if (bgzf_index_build_init(bgzf) != 0) {
455
0
            hts_log_error("Failed to allocate bgzf index");
456
0
            goto fail;
457
0
        }
458
0
    }
459
460
0
    fai = fai_build_core(bgzf);
461
462
0
    if ( !fai ) {
463
0
        if (bgzf->is_compressed && bgzf->is_gzip) {
464
0
            hts_log_error("Cannot index files compressed with gzip, please use bgzip");
465
0
        }
466
0
        goto fail;
467
0
    }
468
469
0
    if (fai->format == FAI_FASTA) {
470
0
        file_type   = "FASTA";
471
0
    } else {
472
0
        file_type   = "FASTQ";
473
0
    }
474
475
0
    if (!fnfai) {
476
0
        if (ksprintf(&fai_kstr, "%s.fai", fn) < 0) goto fail;
477
0
        fnfai = fai_kstr.s;
478
0
    }
479
480
0
    if (!fngzi) {
481
0
        if (ksprintf(&gzi_kstr, "%s.gzi", fn) < 0) goto fail;
482
0
        fngzi = gzi_kstr.s;
483
0
    }
484
485
0
    if ( bgzf->is_compressed ) {
486
0
        if (bgzf_index_dump(bgzf, fngzi, NULL) < 0) {
487
0
            hts_log_error("Failed to make bgzf index %s", fngzi);
488
0
            goto fail;
489
0
        }
490
0
    }
491
492
0
    res = bgzf_close(bgzf);
493
0
    bgzf = NULL;
494
495
0
    if (res < 0) {
496
0
        hts_log_error("Error on closing %s : %s", fn, strerror(errno));
497
0
        goto fail;
498
0
    }
499
500
0
    fp = hopen(fnfai, "wb");
501
502
0
    if ( !fp ) {
503
0
        hts_log_error("Failed to open %s index %s : %s", file_type, fnfai, strerror(errno));
504
0
        goto fail;
505
0
    }
506
507
0
    if (fai_save(fai, fp) != 0) {
508
0
        hts_log_error("Failed to write %s index %s : %s", file_type, fnfai, strerror(errno));
509
0
        goto fail;
510
0
    }
511
512
0
    if (hclose(fp) != 0) {
513
0
        hts_log_error("Failed on closing %s index %s : %s", file_type, fnfai, strerror(errno));
514
0
        goto fail;
515
0
    }
516
517
0
    free(fai_kstr.s);
518
0
    free(gzi_kstr.s);
519
0
    fai_destroy(fai);
520
0
    return 0;
521
522
0
 fail:
523
0
    save_errno = errno;
524
0
    free(fai_kstr.s);
525
0
    free(gzi_kstr.s);
526
0
    bgzf_close(bgzf);
527
0
    fai_destroy(fai);
528
0
    errno = save_errno;
529
0
    return -1;
530
0
}
531
532
533
0
int fai_build3(const char *fn, const char *fnfai, const char *fngzi) {
534
0
    return fai_build3_core(fn, fnfai, fngzi);
535
0
}
536
537
538
0
int fai_build(const char *fn) {
539
0
    return fai_build3(fn, NULL, NULL);
540
0
}
541
542
543
static faidx_t *fai_load3_core(const char *fn, const char *fnfai, const char *fngzi,
544
                   int flags, int format)
545
0
{
546
0
    kstring_t fai_kstr = { 0, 0, NULL };
547
0
    kstring_t gzi_kstr = { 0, 0, NULL };
548
0
    hFILE *fp = NULL;
549
0
    faidx_t *fai = NULL;
550
0
    int res, gzi_index_needed = 0;
551
0
    char *file_type;
552
553
0
    if (format == FAI_FASTA) {
554
0
        file_type   = "FASTA";
555
0
    } else {
556
0
        file_type   = "FASTQ";
557
0
    }
558
559
0
    if (fn == NULL)
560
0
        return NULL;
561
562
0
    if (fnfai == NULL) {
563
0
        if (ksprintf(&fai_kstr, "%s.fai", fn) < 0) goto fail;
564
0
        fnfai = fai_kstr.s;
565
0
    }
566
0
    if (fngzi == NULL) {
567
0
        if (ksprintf(&gzi_kstr, "%s.gzi", fn) < 0) goto fail;
568
0
        fngzi = gzi_kstr.s;
569
0
    }
570
571
0
    fp = hopen(fnfai, "rb");
572
573
0
    if (fp) {
574
        // index file present, check if a compressed index is needed
575
0
        hFILE *gz = NULL;
576
0
        BGZF *bgzf = bgzf_open(fn, "rb");
577
578
0
        if (bgzf == 0) {
579
0
            hts_log_error("Failed to open %s file %s", file_type, fn);
580
0
            goto fail;
581
0
        }
582
583
0
        if (bgzf_compression(bgzf) == 2) { // BGZF compression
584
0
            if ((gz = hopen(fngzi, "rb")) == 0) {
585
586
0
                if (!(flags & FAI_CREATE) || errno != ENOENT) {
587
0
                    hts_log_error("Failed to open %s index %s: %s", file_type, fngzi, strerror(errno));
588
0
                    bgzf_close(bgzf);
589
0
                    goto fail;
590
0
                }
591
592
0
                gzi_index_needed = 1;
593
0
                res = hclose(fp); // closed as going to be re-indexed
594
595
0
                if (res < 0) {
596
0
                    hts_log_error("Failed on closing %s index %s : %s", file_type, fnfai, strerror(errno));
597
0
                    goto fail;
598
0
                }
599
0
            } else {
600
0
                res = hclose(gz);
601
602
0
                if (res < 0) {
603
0
                    hts_log_error("Failed on closing %s index %s : %s", file_type, fngzi, strerror(errno));
604
0
                    goto fail;
605
0
                }
606
0
            }
607
0
        }
608
609
0
        bgzf_close(bgzf);
610
0
    }
611
612
0
    if (fp == 0 || gzi_index_needed) {
613
0
        if (!(flags & FAI_CREATE) || errno != ENOENT) {
614
0
            hts_log_error("Failed to open %s index %s: %s", file_type, fnfai, strerror(errno));
615
0
            goto fail;
616
0
        }
617
618
0
        hts_log_info("Build %s index", file_type);
619
620
0
        if (fai_build3_core(fn, fnfai, fngzi) < 0) {
621
0
            goto fail;
622
0
        }
623
624
0
        fp = hopen(fnfai, "rb");
625
0
        if (fp == 0) {
626
0
            hts_log_error("Failed to open %s index %s: %s", file_type, fnfai, strerror(errno));
627
0
            goto fail;
628
0
        }
629
0
    }
630
631
0
    fai = fai_read(fp, fnfai, format);
632
0
    if (fai == NULL) {
633
0
        hts_log_error("Failed to read %s index %s", file_type, fnfai);
634
0
        goto fail;
635
0
    }
636
637
0
    res = hclose(fp);
638
0
    fp = NULL;
639
0
    if (res < 0) {
640
0
        hts_log_error("Failed on closing %s index %s : %s", file_type, fnfai, strerror(errno));
641
0
        goto fail;
642
0
    }
643
644
0
    fai->bgzf = bgzf_open(fn, "rb");
645
0
    if (fai->bgzf == 0) {
646
0
        hts_log_error("Failed to open %s file %s", file_type, fn);
647
0
        goto fail;
648
0
    }
649
650
0
    if ( fai->bgzf->is_compressed==1 ) {
651
0
        if ( bgzf_index_load(fai->bgzf, fngzi, NULL) < 0 ) {
652
0
            hts_log_error("Failed to load .gzi index: %s", fngzi);
653
0
            goto fail;
654
0
        }
655
0
    }
656
0
    free(fai_kstr.s);
657
0
    free(gzi_kstr.s);
658
0
    return fai;
659
660
0
 fail:
661
0
    if (fai) fai_destroy(fai);
662
0
    if (fp) hclose_abruptly(fp);
663
0
    free(fai_kstr.s);
664
0
    free(gzi_kstr.s);
665
0
    return NULL;
666
0
}
667
668
669
faidx_t *fai_load3(const char *fn, const char *fnfai, const char *fngzi,
670
0
                   int flags) {
671
0
    return fai_load3_core(fn, fnfai, fngzi, flags, FAI_FASTA);
672
0
}
673
674
675
faidx_t *fai_load(const char *fn)
676
0
{
677
0
    return fai_load3(fn, NULL, NULL, FAI_CREATE);
678
0
}
679
680
681
faidx_t *fai_load3_format(const char *fn, const char *fnfai, const char *fngzi,
682
0
                   int flags, enum fai_format_options format) {
683
0
    return fai_load3_core(fn, fnfai, fngzi, flags, format);
684
0
}
685
686
687
0
faidx_t *fai_load_format(const char *fn, enum fai_format_options format) {
688
0
    return fai_load3_format(fn, NULL, NULL, FAI_CREATE, format);
689
0
}
690
691
692
static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val,
693
0
                          uint64_t offset, hts_pos_t beg, hts_pos_t end, hts_pos_t *len) {
694
0
    char *s;
695
0
    size_t l;
696
0
    int c = 0;
697
0
    int ret;
698
699
0
    if ((uint64_t) end - (uint64_t) beg >= SIZE_MAX - 2) {
700
0
        hts_log_error("Range %"PRId64"..%"PRId64" too big", beg, end);
701
0
        *len = -1;
702
0
        return NULL;
703
0
    }
704
705
0
    if (val->line_blen <= 0) {
706
0
        hts_log_error("Invalid line length in index: %d", val->line_blen);
707
0
        *len = -1;
708
0
        return NULL;
709
0
    }
710
711
0
    ret = bgzf_useek(fai->bgzf,
712
0
                     offset
713
0
                     + beg / val->line_blen * val->line_len
714
0
                     + beg % val->line_blen, SEEK_SET);
715
716
0
    if (ret < 0) {
717
0
        *len = -1;
718
0
        hts_log_error("Failed to retrieve block. (Seeking in a compressed, .gzi unindexed, file?)");
719
0
        return NULL;
720
0
    }
721
722
0
    l = 0;
723
0
    s = (char*)malloc((size_t) end - beg + 2);
724
0
    if (!s) {
725
0
        *len = -1;
726
0
        return NULL;
727
0
    }
728
729
0
    while ( l < end - beg && (c=bgzf_getc(fai->bgzf))>=0 )
730
0
        if (isgraph(c)) s[l++] = c;
731
0
    if (c < 0) {
732
0
        hts_log_error("Failed to retrieve block: %s",
733
0
            c == -1 ? "unexpected end of file" : "error reading file");
734
0
        free(s);
735
0
        *len = -1;
736
0
        return NULL;
737
0
    }
738
739
0
    s[l] = '\0';
740
0
    *len = l;
741
0
    return s;
742
0
}
743
744
static int fai_get_val(const faidx_t *fai, const char *str,
745
0
                       hts_pos_t *len, faidx1_t *val, hts_pos_t *fbeg, hts_pos_t *fend) {
746
0
    khiter_t iter;
747
0
    khash_t(s) *h;
748
0
    int id;
749
0
    hts_pos_t beg, end;
750
751
0
    if (!fai_parse_region(fai, str, &id, &beg, &end, 0)) {
752
0
        hts_log_warning("Reference %s not found in FASTA file, returning empty sequence", str);
753
0
        *len = -2;
754
0
        return 1;
755
0
    }
756
757
0
    h = fai->hash;
758
0
    iter = kh_get(s, h, faidx_iseq(fai, id));
759
0
    if (iter >= kh_end(h)) {
760
        // should have already been caught above
761
0
        abort();
762
0
    }
763
0
    *val = kh_value(h, iter);
764
765
0
    if (beg >= val->len) beg = val->len;
766
0
    if (end >= val->len) end = val->len;
767
0
    if (beg > end) beg = end;
768
769
0
    *fbeg = beg;
770
0
    *fend = end;
771
772
0
    return 0;
773
0
}
774
775
/*
776
 *  The internal still has line_blen as uint32_t, but our references
777
 *  can be longer, so for future proofing we use hts_pos_t.  We also needed
778
 *  a signed value so we can return negatives as an error.
779
 */
780
hts_pos_t fai_line_length(const faidx_t *fai, const char *str)
781
0
{
782
0
    faidx1_t val;
783
0
    int64_t beg, end;
784
0
    hts_pos_t len;
785
786
0
    if (fai_get_val(fai, str, &len, &val, &beg, &end))
787
0
        return -1;
788
0
    else
789
0
        return val.line_blen;
790
0
}
791
792
char *fai_fetch64(const faidx_t *fai, const char *str, hts_pos_t *len)
793
0
{
794
0
    faidx1_t val;
795
0
    int64_t beg, end;
796
797
0
    if (fai_get_val(fai, str, len, &val, &beg, &end)) {
798
0
        return NULL;
799
0
    }
800
801
    // now retrieve the sequence
802
0
    return fai_retrieve(fai, &val, val.seq_offset, beg, end, len);
803
0
}
804
805
char *fai_fetch(const faidx_t *fai, const char *str, int *len)
806
0
{
807
0
    hts_pos_t len64;
808
0
    char *ret = fai_fetch64(fai, str, &len64);
809
0
    *len = len64 < INT_MAX ? len64 : INT_MAX; // trunc
810
0
    return ret;
811
0
}
812
813
0
char *fai_fetchqual64(const faidx_t *fai, const char *str, hts_pos_t *len) {
814
0
    faidx1_t val;
815
0
    int64_t beg, end;
816
817
0
    if (fai_get_val(fai, str, len, &val, &beg, &end)) {
818
0
        return NULL;
819
0
    }
820
821
    // now retrieve the sequence
822
0
    return fai_retrieve(fai, &val, val.qual_offset, beg, end, len);
823
0
}
824
825
0
char *fai_fetchqual(const faidx_t *fai, const char *str, int *len) {
826
0
    hts_pos_t len64;
827
0
    char *ret = fai_fetchqual64(fai, str, &len64);
828
0
    *len = len64 < INT_MAX ? len64 : INT_MAX; // trunc
829
0
    return ret;
830
0
}
831
832
int faidx_fetch_nseq(const faidx_t *fai)
833
0
{
834
0
    return fai->n;
835
0
}
836
837
int faidx_nseq(const faidx_t *fai)
838
0
{
839
0
    return fai->n;
840
0
}
841
842
const char *faidx_iseq(const faidx_t *fai, int i)
843
0
{
844
0
    return fai->name[i];
845
0
}
846
847
hts_pos_t faidx_seq_len64(const faidx_t *fai, const char *seq)
848
0
{
849
0
    khint_t k = kh_get(s, fai->hash, seq);
850
0
    if ( k == kh_end(fai->hash) ) return -1;
851
0
    return kh_val(fai->hash, k).len;
852
0
}
853
854
int faidx_seq_len(const faidx_t *fai, const char *seq)
855
0
{
856
0
    hts_pos_t len = faidx_seq_len64(fai, seq);
857
0
    return len < INT_MAX ? len : INT_MAX;
858
0
}
859
860
static int faidx_adjust_position(const faidx_t *fai, int end_adjust,
861
                                 faidx1_t *val_out, const char *c_name,
862
                                 hts_pos_t *p_beg_i, hts_pos_t *p_end_i,
863
0
                                 hts_pos_t *len) {
864
0
    khiter_t iter;
865
0
    faidx1_t *val;
866
867
    // Adjust position
868
0
    iter = kh_get(s, fai->hash, c_name);
869
870
0
    if (iter == kh_end(fai->hash)) {
871
0
        if (len)
872
0
            *len = -2;
873
0
        hts_log_error("The sequence \"%s\" was not found", c_name);
874
0
        return 1;
875
0
    }
876
877
0
    val = &kh_value(fai->hash, iter);
878
879
0
    if (val_out)
880
0
        *val_out = *val;
881
882
0
    if(*p_end_i < *p_beg_i)
883
0
        *p_beg_i = *p_end_i;
884
885
0
    if(*p_beg_i < 0)
886
0
        *p_beg_i = 0;
887
0
    else if(val->len <= *p_beg_i)
888
0
        *p_beg_i = val->len;
889
890
0
    if(*p_end_i < 0)
891
0
        *p_end_i = 0;
892
0
    else if(val->len <= *p_end_i)
893
0
        *p_end_i = val->len - end_adjust;
894
895
0
    return 0;
896
0
}
897
898
int fai_adjust_region(const faidx_t *fai, int tid,
899
                      hts_pos_t *beg, hts_pos_t *end)
900
0
{
901
0
    hts_pos_t orig_beg, orig_end;
902
903
0
    if (!fai || !beg || !end || tid < 0 || tid >= fai->n)
904
0
        return -1;
905
906
0
    orig_beg = *beg;
907
0
    orig_end = *end;
908
0
    if (faidx_adjust_position(fai, 0, NULL, fai->name[tid], beg, end, NULL) != 0) {
909
0
        hts_log_error("Inconsistent faidx internal state - couldn't find \"%s\"",
910
0
                      fai->name[tid]);
911
0
        return -1;
912
0
    }
913
914
0
    return ((orig_beg != *beg ? 1 : 0) |
915
0
            (orig_end != *end && orig_end < HTS_POS_MAX ? 2 : 0));
916
0
}
917
918
char *faidx_fetch_seq64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len)
919
0
{
920
0
    faidx1_t val;
921
922
    // Adjust position
923
0
    if (faidx_adjust_position(fai, 1, &val, c_name, &p_beg_i, &p_end_i, len)) {
924
0
        return NULL;
925
0
    }
926
927
    // Now retrieve the sequence
928
0
    return fai_retrieve(fai, &val, val.seq_offset, p_beg_i, p_end_i + 1, len);
929
0
}
930
931
char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len)
932
0
{
933
0
    hts_pos_t len64;
934
0
    char *ret = faidx_fetch_seq64(fai, c_name, p_beg_i, p_end_i, &len64);
935
0
    *len = len64 < INT_MAX ? len64 : INT_MAX;  // trunc
936
0
    return ret;
937
0
}
938
939
char *faidx_fetch_qual64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len)
940
0
{
941
0
    faidx1_t val;
942
943
    // Adjust position
944
0
    if (faidx_adjust_position(fai, 1, &val, c_name, &p_beg_i, &p_end_i, len)) {
945
0
        return NULL;
946
0
    }
947
948
    // Now retrieve the sequence
949
0
    return fai_retrieve(fai, &val, val.qual_offset, p_beg_i, p_end_i + 1, len);
950
0
}
951
952
char *faidx_fetch_qual(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len)
953
0
{
954
0
    hts_pos_t len64;
955
0
    char *ret = faidx_fetch_qual64(fai, c_name, p_beg_i, p_end_i, &len64);
956
0
    *len = len64 < INT_MAX ? len64 : INT_MAX;  // trunc
957
0
    return ret;
958
0
}
959
960
int faidx_has_seq(const faidx_t *fai, const char *seq)
961
0
{
962
0
    khiter_t iter = kh_get(s, fai->hash, seq);
963
0
    if (iter == kh_end(fai->hash)) return 0;
964
0
    return 1;
965
0
}
966
967
const char *fai_parse_region(const faidx_t *fai, const char *s,
968
                             int *tid, hts_pos_t *beg, hts_pos_t *end,
969
                             int flags)
970
0
{
971
0
    return hts_parse_region(s, tid, beg, end, (hts_name2id_f)fai_name2id, (void *)fai, flags);
972
0
}
973
974
0
void fai_set_cache_size(faidx_t *fai, int cache_size) {
975
0
    bgzf_set_cache_size(fai->bgzf, cache_size);
976
0
}
977
978
// Adds a thread pool to the underlying BGZF layer.
979
0
int fai_thread_pool(faidx_t *fai, struct hts_tpool *pool, int qsize) {
980
0
    return bgzf_thread_pool(fai->bgzf, pool, qsize);
981
0
}
982
983
0
char *fai_path(const char *fa) {
984
0
    char *fai = NULL;
985
0
    if (!fa) {
986
0
        hts_log_error("No reference file specified");
987
0
    } else {
988
0
        char *fai_tmp = strstr(fa, HTS_IDX_DELIM);
989
0
        if (fai_tmp) {
990
0
            fai_tmp += strlen(HTS_IDX_DELIM);
991
0
            fai = strdup(fai_tmp);
992
0
            if (!fai)
993
0
                hts_log_error("Failed to allocate memory");
994
0
        } else {
995
0
            if (hisremote(fa)) {
996
0
                fai = hts_idx_locatefn(fa, ".fai");       // get the remote fai file name, if any, but do not download the file
997
0
                if (!fai)
998
0
                    hts_log_error("Failed to locate index file for remote reference file '%s'", fa);
999
0
            } else{
1000
0
                if (hts_idx_check_local(fa, HTS_FMT_FAI, &fai) == 0 && fai) {
1001
0
                    if (fai_build3(fa, fai, NULL) == -1) {      // create local fai file by indexing local fasta
1002
0
                        hts_log_error("Failed to build index file for reference file '%s'", fa);
1003
0
                        free(fai);
1004
0
                        fai = NULL;
1005
0
                    }
1006
0
                }
1007
0
            }
1008
0
        }
1009
0
    }
1010
1011
0
    return fai;
1012
0
}