Line | Count | Source (jump to first uncovered line) |
1 | | /* faidx.c -- FASTA and FASTQ random access. |
2 | | |
3 | | Copyright (C) 2008, 2009, 2013-2020, 2022 Genome Research Ltd. |
4 | | Portions copyright (C) 2011 Broad Institute. |
5 | | |
6 | | Author: Heng Li <lh3@sanger.ac.uk> |
7 | | |
8 | | Permission is hereby granted, free of charge, to any person obtaining a copy |
9 | | of this software and associated documentation files (the "Software"), to deal |
10 | | in the Software without restriction, including without limitation the rights |
11 | | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
12 | | copies of the Software, and to permit persons to whom the Software is |
13 | | furnished to do so, subject to the following conditions: |
14 | | |
15 | | The above copyright notice and this permission notice shall be included in |
16 | | all copies or substantial portions of the Software. |
17 | | |
18 | | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
19 | | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
20 | | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
21 | | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
22 | | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
23 | | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
24 | | DEALINGS IN THE SOFTWARE. */ |
25 | | |
26 | | #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h |
27 | | #include <config.h> |
28 | | |
29 | | #include <ctype.h> |
30 | | #include <string.h> |
31 | | #include <stdlib.h> |
32 | | #include <stdio.h> |
33 | | #include <inttypes.h> |
34 | | #include <errno.h> |
35 | | #include <limits.h> |
36 | | #include <unistd.h> |
37 | | #include <assert.h> |
38 | | |
39 | | #include "htslib/bgzf.h" |
40 | | #include "htslib/faidx.h" |
41 | | #include "htslib/hfile.h" |
42 | | #include "htslib/khash.h" |
43 | | #include "htslib/kstring.h" |
44 | | #include "hts_internal.h" |
45 | | |
46 | | typedef struct { |
47 | | int id; // faidx_t->name[id] is for this struct. |
48 | | uint32_t line_len, line_blen; |
49 | | uint64_t len; |
50 | | uint64_t seq_offset; |
51 | | uint64_t qual_offset; |
52 | | } faidx1_t; |
53 | | KHASH_MAP_INIT_STR(s, faidx1_t) |
54 | | |
55 | | struct faidx_t { |
56 | | BGZF *bgzf; |
57 | | int n, m; |
58 | | char **name; |
59 | | khash_t(s) *hash; |
60 | | enum fai_format_options format; |
61 | | }; |
62 | | |
63 | | static int fai_name2id(void *v, const char *ref) |
64 | 0 | { |
65 | 0 | faidx_t *fai = (faidx_t *)v; |
66 | 0 | khint_t k = kh_get(s, fai->hash, ref); |
67 | 0 | return k == kh_end(fai->hash) ? -1 : kh_val(fai->hash, k).id; |
68 | 0 | } |
69 | | |
70 | | static inline int fai_insert_index(faidx_t *idx, const char *name, uint64_t len, uint32_t line_len, uint32_t line_blen, uint64_t seq_offset, uint64_t qual_offset) |
71 | 0 | { |
72 | 0 | if (!name) { |
73 | 0 | hts_log_error("Malformed line"); |
74 | 0 | return -1; |
75 | 0 | } |
76 | | |
77 | 0 | char *name_key = strdup(name); |
78 | 0 | int absent; |
79 | 0 | khint_t k = kh_put(s, idx->hash, name_key, &absent); |
80 | 0 | faidx1_t *v = &kh_value(idx->hash, k); |
81 | |
|
82 | 0 | if (! absent) { |
83 | 0 | hts_log_warning("Ignoring duplicate sequence \"%s\" at byte offset %" PRIu64, name, seq_offset); |
84 | 0 | free(name_key); |
85 | 0 | return 0; |
86 | 0 | } |
87 | | |
88 | 0 | if (idx->n == idx->m) { |
89 | 0 | char **tmp; |
90 | 0 | idx->m = idx->m? idx->m<<1 : 16; |
91 | 0 | if (!(tmp = (char**)realloc(idx->name, sizeof(char*) * idx->m))) { |
92 | 0 | hts_log_error("Out of memory"); |
93 | 0 | return -1; |
94 | 0 | } |
95 | 0 | idx->name = tmp; |
96 | 0 | } |
97 | 0 | v->id = idx->n; |
98 | 0 | idx->name[idx->n++] = name_key; |
99 | 0 | v->len = len; |
100 | 0 | v->line_len = line_len; |
101 | 0 | v->line_blen = line_blen; |
102 | 0 | v->seq_offset = seq_offset; |
103 | 0 | v->qual_offset = qual_offset; |
104 | |
|
105 | 0 | return 0; |
106 | 0 | } |
107 | | |
108 | | |
109 | 0 | static faidx_t *fai_build_core(BGZF *bgzf) { |
110 | 0 | kstring_t name = { 0, 0, NULL }; |
111 | 0 | int c, read_done, line_num; |
112 | 0 | faidx_t *idx; |
113 | 0 | uint64_t seq_offset, qual_offset; |
114 | 0 | uint64_t seq_len, qual_len; |
115 | 0 | uint64_t char_len, cl, line_len, ll; |
116 | 0 | enum read_state {OUT_READ, IN_NAME, IN_SEQ, SEQ_END, IN_QUAL} state; |
117 | |
|
118 | 0 | idx = (faidx_t*)calloc(1, sizeof(faidx_t)); |
119 | 0 | idx->hash = kh_init(s); |
120 | 0 | idx->format = FAI_NONE; |
121 | |
|
122 | 0 | state = OUT_READ, read_done = 0, line_num = 1; |
123 | 0 | seq_offset = qual_offset = seq_len = qual_len = char_len = cl = line_len = ll = 0; |
124 | |
|
125 | 0 | while ((c = bgzf_getc(bgzf)) >= 0) { |
126 | 0 | switch (state) { |
127 | 0 | case OUT_READ: |
128 | 0 | switch (c) { |
129 | 0 | case '>': |
130 | 0 | if (idx->format == FAI_FASTQ) { |
131 | 0 | hts_log_error("Found '>' in a FASTQ file, error at line %d", line_num); |
132 | 0 | goto fail; |
133 | 0 | } |
134 | | |
135 | 0 | idx->format = FAI_FASTA; |
136 | 0 | state = IN_NAME; |
137 | 0 | break; |
138 | | |
139 | 0 | case '@': |
140 | 0 | if (idx->format == FAI_FASTA) { |
141 | 0 | hts_log_error("Found '@' in a FASTA file, error at line %d", line_num); |
142 | 0 | goto fail; |
143 | 0 | } |
144 | | |
145 | 0 | idx->format = FAI_FASTQ; |
146 | 0 | state = IN_NAME; |
147 | 0 | break; |
148 | | |
149 | 0 | case '\r': |
150 | | // Blank line with cr-lf ending? |
151 | 0 | if ((c = bgzf_getc(bgzf)) == '\n') { |
152 | 0 | line_num++; |
153 | 0 | } else { |
154 | 0 | hts_log_error("Format error, carriage return not followed by new line at line %d", line_num); |
155 | 0 | goto fail; |
156 | 0 | } |
157 | 0 | break; |
158 | | |
159 | 0 | case '\n': |
160 | | // just move onto the next line |
161 | 0 | line_num++; |
162 | 0 | break; |
163 | | |
164 | 0 | default: { |
165 | 0 | char s[4] = { '"', c, '"', '\0' }; |
166 | 0 | hts_log_error("Format error, unexpected %s at line %d", isprint(c) ? s : "character", line_num); |
167 | 0 | goto fail; |
168 | 0 | } |
169 | 0 | } |
170 | 0 | break; |
171 | | |
172 | 0 | case IN_NAME: |
173 | 0 | if (read_done) { |
174 | 0 | if (fai_insert_index(idx, name.s, seq_len, line_len, char_len, seq_offset, qual_offset) != 0) |
175 | 0 | goto fail; |
176 | | |
177 | 0 | read_done = 0; |
178 | 0 | } |
179 | | |
180 | 0 | name.l = 0; |
181 | |
|
182 | 0 | do { |
183 | 0 | if (!isspace(c)) { |
184 | 0 | kputc(c, &name); |
185 | 0 | } else if (name.l > 0 || c == '\n') { |
186 | 0 | break; |
187 | 0 | } |
188 | 0 | } while ((c = bgzf_getc(bgzf)) >= 0); |
189 | | |
190 | 0 | kputsn("", 0, &name); |
191 | |
|
192 | 0 | if (c < 0) { |
193 | 0 | hts_log_error("The last entry '%s' has no sequence", name.s); |
194 | 0 | goto fail; |
195 | 0 | } |
196 | | |
197 | | // read the rest of the line if necessary |
198 | 0 | if (c != '\n') while ((c = bgzf_getc(bgzf)) >= 0 && c != '\n'); |
199 | |
|
200 | 0 | state = IN_SEQ; seq_len = qual_len = char_len = line_len = 0; |
201 | 0 | seq_offset = bgzf_utell(bgzf); |
202 | 0 | line_num++; |
203 | 0 | break; |
204 | | |
205 | 0 | case IN_SEQ: |
206 | 0 | if (idx->format == FAI_FASTA) { |
207 | 0 | if (c == '\n') { |
208 | 0 | state = OUT_READ; |
209 | 0 | line_num++; |
210 | 0 | continue; |
211 | 0 | } else if (c == '>') { |
212 | 0 | state = IN_NAME; |
213 | 0 | continue; |
214 | 0 | } |
215 | 0 | } else if (idx->format == FAI_FASTQ) { |
216 | 0 | if (c == '+') { |
217 | 0 | state = IN_QUAL; |
218 | 0 | if (c != '\n') while ((c = bgzf_getc(bgzf)) >= 0 && c != '\n'); |
219 | 0 | qual_offset = bgzf_utell(bgzf); |
220 | 0 | line_num++; |
221 | 0 | continue; |
222 | 0 | } else if (c == '\n') { |
223 | 0 | hts_log_error("Inlined empty line is not allowed in sequence '%s' at line %d", name.s, line_num); |
224 | 0 | goto fail; |
225 | 0 | } |
226 | 0 | } |
227 | | |
228 | 0 | ll = cl = 0; |
229 | |
|
230 | 0 | if (idx->format == FAI_FASTA) read_done = 1; |
231 | |
|
232 | 0 | do { |
233 | 0 | ll++; |
234 | 0 | if (isgraph(c)) cl++; |
235 | 0 | } while ((c = bgzf_getc(bgzf)) >= 0 && c != '\n'); |
236 | |
|
237 | 0 | ll++; seq_len += cl; |
238 | |
|
239 | 0 | if (line_len == 0) { |
240 | 0 | line_len = ll; |
241 | 0 | char_len = cl; |
242 | 0 | } else if (line_len > ll) { |
243 | |
|
244 | 0 | if (idx->format == FAI_FASTA) |
245 | 0 | state = OUT_READ; |
246 | 0 | else |
247 | 0 | state = SEQ_END; |
248 | |
|
249 | 0 | } else if (line_len < ll) { |
250 | 0 | hts_log_error("Different line length in sequence '%s'", name.s); |
251 | 0 | goto fail; |
252 | 0 | } |
253 | | |
254 | 0 | line_num++; |
255 | 0 | break; |
256 | | |
257 | 0 | case SEQ_END: |
258 | 0 | if (c == '+') { |
259 | 0 | state = IN_QUAL; |
260 | 0 | while ((c = bgzf_getc(bgzf)) >= 0 && c != '\n'); |
261 | 0 | qual_offset = bgzf_utell(bgzf); |
262 | 0 | line_num++; |
263 | 0 | } else { |
264 | 0 | hts_log_error("Format error, expecting '+', got '%c' at line %d", c, line_num); |
265 | 0 | goto fail; |
266 | 0 | } |
267 | 0 | break; |
268 | | |
269 | 0 | case IN_QUAL: |
270 | 0 | if (c == '\n') { |
271 | 0 | if (!read_done) { |
272 | 0 | hts_log_error("Inlined empty line is not allowed in quality of sequence '%s'", name.s); |
273 | 0 | goto fail; |
274 | 0 | } |
275 | | |
276 | 0 | state = OUT_READ; |
277 | 0 | line_num++; |
278 | 0 | continue; |
279 | 0 | } else if (c == '@' && read_done) { |
280 | 0 | state = IN_NAME; |
281 | 0 | continue; |
282 | 0 | } |
283 | | |
284 | 0 | ll = cl = 0; |
285 | |
|
286 | 0 | do { |
287 | 0 | ll++; |
288 | 0 | if (isgraph(c)) cl++; |
289 | 0 | } while ((c = bgzf_getc(bgzf)) >= 0 && c != '\n'); |
290 | |
|
291 | 0 | ll++; qual_len += cl; |
292 | |
|
293 | 0 | if (line_len < ll) { |
294 | 0 | hts_log_error("Quality line length too long in '%s' at line %d", name.s, line_num); |
295 | 0 | goto fail; |
296 | 0 | } else if (qual_len == seq_len) { |
297 | 0 | read_done = 1; |
298 | 0 | } else if (qual_len > seq_len) { |
299 | 0 | hts_log_error("Quality length longer than sequence in '%s' at line %d", name.s, line_num); |
300 | 0 | goto fail; |
301 | 0 | } else if (line_len > ll) { |
302 | 0 | hts_log_error("Quality line length too short in '%s' at line %d", name.s, line_num); |
303 | 0 | goto fail; |
304 | 0 | } |
305 | | |
306 | 0 | line_num++; |
307 | 0 | break; |
308 | 0 | } |
309 | 0 | } |
310 | | |
311 | 0 | if (read_done) { |
312 | 0 | if (fai_insert_index(idx, name.s, seq_len, line_len, char_len, seq_offset, qual_offset) != 0) |
313 | 0 | goto fail; |
314 | 0 | } else { |
315 | 0 | goto fail; |
316 | 0 | } |
317 | | |
318 | 0 | free(name.s); |
319 | 0 | return idx; |
320 | | |
321 | 0 | fail: |
322 | 0 | free(name.s); |
323 | 0 | fai_destroy(idx); |
324 | 0 | return NULL; |
325 | 0 | } |
326 | | |
327 | | |
328 | 0 | static int fai_save(const faidx_t *fai, hFILE *fp) { |
329 | 0 | khint_t k; |
330 | 0 | int i; |
331 | 0 | char buf[96]; // Must be big enough for format below. |
332 | |
|
333 | 0 | for (i = 0; i < fai->n; ++i) { |
334 | 0 | faidx1_t x; |
335 | 0 | k = kh_get(s, fai->hash, fai->name[i]); |
336 | 0 | assert(k < kh_end(fai->hash)); |
337 | 0 | x = kh_value(fai->hash, k); |
338 | |
|
339 | 0 | if (fai->format == FAI_FASTA) { |
340 | 0 | snprintf(buf, sizeof(buf), |
341 | 0 | "\t%"PRIu64"\t%"PRIu64"\t%"PRIu32"\t%"PRIu32"\n", |
342 | 0 | x.len, x.seq_offset, x.line_blen, x.line_len); |
343 | 0 | } else { |
344 | 0 | snprintf(buf, sizeof(buf), |
345 | 0 | "\t%"PRIu64"\t%"PRIu64"\t%"PRIu32"\t%"PRIu32"\t%"PRIu64"\n", |
346 | 0 | x.len, x.seq_offset, x.line_blen, x.line_len, x.qual_offset); |
347 | 0 | } |
348 | |
|
349 | 0 | if (hputs(fai->name[i], fp) != 0) return -1; |
350 | 0 | if (hputs(buf, fp) != 0) return -1; |
351 | 0 | } |
352 | 0 | return 0; |
353 | 0 | } |
354 | | |
355 | | |
356 | | static faidx_t *fai_read(hFILE *fp, const char *fname, int format) |
357 | 0 | { |
358 | 0 | faidx_t *fai; |
359 | 0 | char *buf = NULL, *p; |
360 | 0 | ssize_t l, lnum = 1; |
361 | |
|
362 | 0 | fai = (faidx_t*)calloc(1, sizeof(faidx_t)); |
363 | 0 | if (!fai) return NULL; |
364 | | |
365 | 0 | fai->hash = kh_init(s); |
366 | 0 | if (!fai->hash) goto fail; |
367 | | |
368 | 0 | buf = (char*)calloc(0x10000, 1); |
369 | 0 | if (!buf) goto fail; |
370 | | |
371 | 0 | while ((l = hgetln(buf, 0x10000, fp)) > 0) { |
372 | 0 | uint32_t line_len, line_blen, n; |
373 | 0 | uint64_t len; |
374 | 0 | uint64_t seq_offset; |
375 | 0 | uint64_t qual_offset = 0; |
376 | |
|
377 | 0 | for (p = buf; *p && !isspace_c(*p); ++p); |
378 | |
|
379 | 0 | if (p - buf < l) { |
380 | 0 | *p = 0; ++p; |
381 | 0 | } |
382 | |
|
383 | 0 | if (format == FAI_FASTA) { |
384 | 0 | n = sscanf(p, "%"SCNu64"%"SCNu64"%"SCNu32"%"SCNu32, &len, &seq_offset, &line_blen, &line_len); |
385 | |
|
386 | 0 | if (n != 4) { |
387 | 0 | hts_log_error("Could not understand FASTA index %s line %zd", fname, lnum); |
388 | 0 | goto fail; |
389 | 0 | } |
390 | 0 | } else { |
391 | 0 | n = sscanf(p, "%"SCNu64"%"SCNu64"%"SCNu32"%"SCNu32"%"SCNu64, &len, &seq_offset, &line_blen, &line_len, &qual_offset); |
392 | |
|
393 | 0 | if (n != 5) { |
394 | 0 | if (n == 4) { |
395 | 0 | hts_log_error("Possibly this is a FASTA index, try using faidx. Problem in %s line %zd", fname, lnum); |
396 | 0 | } else { |
397 | 0 | hts_log_error("Could not understand FASTQ index %s line %zd", fname, lnum); |
398 | 0 | } |
399 | |
|
400 | 0 | goto fail; |
401 | 0 | } |
402 | 0 | } |
403 | | |
404 | 0 | if (fai_insert_index(fai, buf, len, line_len, line_blen, seq_offset, qual_offset) != 0) { |
405 | 0 | goto fail; |
406 | 0 | } |
407 | | |
408 | 0 | if (buf[l - 1] == '\n') ++lnum; |
409 | 0 | } |
410 | | |
411 | 0 | if (l < 0) { |
412 | 0 | hts_log_error("Error while reading %s: %s", fname, strerror(errno)); |
413 | 0 | goto fail; |
414 | 0 | } |
415 | 0 | free(buf); |
416 | 0 | return fai; |
417 | | |
418 | 0 | fail: |
419 | 0 | free(buf); |
420 | 0 | fai_destroy(fai); |
421 | 0 | return NULL; |
422 | 0 | } |
423 | | |
424 | | void fai_destroy(faidx_t *fai) |
425 | 0 | { |
426 | 0 | int i; |
427 | 0 | if (!fai) return; |
428 | 0 | for (i = 0; i < fai->n; ++i) free(fai->name[i]); |
429 | 0 | free(fai->name); |
430 | 0 | kh_destroy(s, fai->hash); |
431 | 0 | if (fai->bgzf) bgzf_close(fai->bgzf); |
432 | 0 | free(fai); |
433 | 0 | } |
434 | | |
435 | | |
436 | | static int fai_build3_core(const char *fn, const char *fnfai, const char *fngzi) |
437 | 0 | { |
438 | 0 | kstring_t fai_kstr = { 0, 0, NULL }; |
439 | 0 | kstring_t gzi_kstr = { 0, 0, NULL }; |
440 | 0 | BGZF *bgzf = NULL; |
441 | 0 | hFILE *fp = NULL; |
442 | 0 | faidx_t *fai = NULL; |
443 | 0 | int save_errno, res; |
444 | 0 | char *file_type; |
445 | |
|
446 | 0 | bgzf = bgzf_open(fn, "r"); |
447 | |
|
448 | 0 | if ( !bgzf ) { |
449 | 0 | hts_log_error("Failed to open the file %s", fn); |
450 | 0 | goto fail; |
451 | 0 | } |
452 | | |
453 | 0 | if ( bgzf->is_compressed ) { |
454 | 0 | if (bgzf_index_build_init(bgzf) != 0) { |
455 | 0 | hts_log_error("Failed to allocate bgzf index"); |
456 | 0 | goto fail; |
457 | 0 | } |
458 | 0 | } |
459 | | |
460 | 0 | fai = fai_build_core(bgzf); |
461 | |
|
462 | 0 | if ( !fai ) { |
463 | 0 | if (bgzf->is_compressed && bgzf->is_gzip) { |
464 | 0 | hts_log_error("Cannot index files compressed with gzip, please use bgzip"); |
465 | 0 | } |
466 | 0 | goto fail; |
467 | 0 | } |
468 | | |
469 | 0 | if (fai->format == FAI_FASTA) { |
470 | 0 | file_type = "FASTA"; |
471 | 0 | } else { |
472 | 0 | file_type = "FASTQ"; |
473 | 0 | } |
474 | |
|
475 | 0 | if (!fnfai) { |
476 | 0 | if (ksprintf(&fai_kstr, "%s.fai", fn) < 0) goto fail; |
477 | 0 | fnfai = fai_kstr.s; |
478 | 0 | } |
479 | | |
480 | 0 | if (!fngzi) { |
481 | 0 | if (ksprintf(&gzi_kstr, "%s.gzi", fn) < 0) goto fail; |
482 | 0 | fngzi = gzi_kstr.s; |
483 | 0 | } |
484 | | |
485 | 0 | if ( bgzf->is_compressed ) { |
486 | 0 | if (bgzf_index_dump(bgzf, fngzi, NULL) < 0) { |
487 | 0 | hts_log_error("Failed to make bgzf index %s", fngzi); |
488 | 0 | goto fail; |
489 | 0 | } |
490 | 0 | } |
491 | | |
492 | 0 | res = bgzf_close(bgzf); |
493 | 0 | bgzf = NULL; |
494 | |
|
495 | 0 | if (res < 0) { |
496 | 0 | hts_log_error("Error on closing %s : %s", fn, strerror(errno)); |
497 | 0 | goto fail; |
498 | 0 | } |
499 | | |
500 | 0 | fp = hopen(fnfai, "wb"); |
501 | |
|
502 | 0 | if ( !fp ) { |
503 | 0 | hts_log_error("Failed to open %s index %s : %s", file_type, fnfai, strerror(errno)); |
504 | 0 | goto fail; |
505 | 0 | } |
506 | | |
507 | 0 | if (fai_save(fai, fp) != 0) { |
508 | 0 | hts_log_error("Failed to write %s index %s : %s", file_type, fnfai, strerror(errno)); |
509 | 0 | goto fail; |
510 | 0 | } |
511 | | |
512 | 0 | if (hclose(fp) != 0) { |
513 | 0 | hts_log_error("Failed on closing %s index %s : %s", file_type, fnfai, strerror(errno)); |
514 | 0 | goto fail; |
515 | 0 | } |
516 | | |
517 | 0 | free(fai_kstr.s); |
518 | 0 | free(gzi_kstr.s); |
519 | 0 | fai_destroy(fai); |
520 | 0 | return 0; |
521 | | |
522 | 0 | fail: |
523 | 0 | save_errno = errno; |
524 | 0 | free(fai_kstr.s); |
525 | 0 | free(gzi_kstr.s); |
526 | 0 | bgzf_close(bgzf); |
527 | 0 | fai_destroy(fai); |
528 | 0 | errno = save_errno; |
529 | 0 | return -1; |
530 | 0 | } |
531 | | |
532 | | |
533 | 0 | int fai_build3(const char *fn, const char *fnfai, const char *fngzi) { |
534 | 0 | return fai_build3_core(fn, fnfai, fngzi); |
535 | 0 | } |
536 | | |
537 | | |
538 | 0 | int fai_build(const char *fn) { |
539 | 0 | return fai_build3(fn, NULL, NULL); |
540 | 0 | } |
541 | | |
542 | | |
543 | | static faidx_t *fai_load3_core(const char *fn, const char *fnfai, const char *fngzi, |
544 | | int flags, int format) |
545 | 0 | { |
546 | 0 | kstring_t fai_kstr = { 0, 0, NULL }; |
547 | 0 | kstring_t gzi_kstr = { 0, 0, NULL }; |
548 | 0 | hFILE *fp = NULL; |
549 | 0 | faidx_t *fai = NULL; |
550 | 0 | int res, gzi_index_needed = 0; |
551 | 0 | char *file_type; |
552 | |
|
553 | 0 | if (format == FAI_FASTA) { |
554 | 0 | file_type = "FASTA"; |
555 | 0 | } else { |
556 | 0 | file_type = "FASTQ"; |
557 | 0 | } |
558 | |
|
559 | 0 | if (fn == NULL) |
560 | 0 | return NULL; |
561 | | |
562 | 0 | if (fnfai == NULL) { |
563 | 0 | if (ksprintf(&fai_kstr, "%s.fai", fn) < 0) goto fail; |
564 | 0 | fnfai = fai_kstr.s; |
565 | 0 | } |
566 | 0 | if (fngzi == NULL) { |
567 | 0 | if (ksprintf(&gzi_kstr, "%s.gzi", fn) < 0) goto fail; |
568 | 0 | fngzi = gzi_kstr.s; |
569 | 0 | } |
570 | | |
571 | 0 | fp = hopen(fnfai, "rb"); |
572 | |
|
573 | 0 | if (fp) { |
574 | | // index file present, check if a compressed index is needed |
575 | 0 | hFILE *gz = NULL; |
576 | 0 | BGZF *bgzf = bgzf_open(fn, "rb"); |
577 | |
|
578 | 0 | if (bgzf == 0) { |
579 | 0 | hts_log_error("Failed to open %s file %s", file_type, fn); |
580 | 0 | goto fail; |
581 | 0 | } |
582 | | |
583 | 0 | if (bgzf_compression(bgzf) == 2) { // BGZF compression |
584 | 0 | if ((gz = hopen(fngzi, "rb")) == 0) { |
585 | |
|
586 | 0 | if (!(flags & FAI_CREATE) || errno != ENOENT) { |
587 | 0 | hts_log_error("Failed to open %s index %s: %s", file_type, fngzi, strerror(errno)); |
588 | 0 | bgzf_close(bgzf); |
589 | 0 | goto fail; |
590 | 0 | } |
591 | | |
592 | 0 | gzi_index_needed = 1; |
593 | 0 | res = hclose(fp); // closed as going to be re-indexed |
594 | |
|
595 | 0 | if (res < 0) { |
596 | 0 | hts_log_error("Failed on closing %s index %s : %s", file_type, fnfai, strerror(errno)); |
597 | 0 | goto fail; |
598 | 0 | } |
599 | 0 | } else { |
600 | 0 | res = hclose(gz); |
601 | |
|
602 | 0 | if (res < 0) { |
603 | 0 | hts_log_error("Failed on closing %s index %s : %s", file_type, fngzi, strerror(errno)); |
604 | 0 | goto fail; |
605 | 0 | } |
606 | 0 | } |
607 | 0 | } |
608 | | |
609 | 0 | bgzf_close(bgzf); |
610 | 0 | } |
611 | | |
612 | 0 | if (fp == 0 || gzi_index_needed) { |
613 | 0 | if (!(flags & FAI_CREATE) || errno != ENOENT) { |
614 | 0 | hts_log_error("Failed to open %s index %s: %s", file_type, fnfai, strerror(errno)); |
615 | 0 | goto fail; |
616 | 0 | } |
617 | | |
618 | 0 | hts_log_info("Build %s index", file_type); |
619 | |
|
620 | 0 | if (fai_build3_core(fn, fnfai, fngzi) < 0) { |
621 | 0 | goto fail; |
622 | 0 | } |
623 | | |
624 | 0 | fp = hopen(fnfai, "rb"); |
625 | 0 | if (fp == 0) { |
626 | 0 | hts_log_error("Failed to open %s index %s: %s", file_type, fnfai, strerror(errno)); |
627 | 0 | goto fail; |
628 | 0 | } |
629 | 0 | } |
630 | | |
631 | 0 | fai = fai_read(fp, fnfai, format); |
632 | 0 | if (fai == NULL) { |
633 | 0 | hts_log_error("Failed to read %s index %s", file_type, fnfai); |
634 | 0 | goto fail; |
635 | 0 | } |
636 | | |
637 | 0 | res = hclose(fp); |
638 | 0 | fp = NULL; |
639 | 0 | if (res < 0) { |
640 | 0 | hts_log_error("Failed on closing %s index %s : %s", file_type, fnfai, strerror(errno)); |
641 | 0 | goto fail; |
642 | 0 | } |
643 | | |
644 | 0 | fai->bgzf = bgzf_open(fn, "rb"); |
645 | 0 | if (fai->bgzf == 0) { |
646 | 0 | hts_log_error("Failed to open %s file %s", file_type, fn); |
647 | 0 | goto fail; |
648 | 0 | } |
649 | | |
650 | 0 | if ( fai->bgzf->is_compressed==1 ) { |
651 | 0 | if ( bgzf_index_load(fai->bgzf, fngzi, NULL) < 0 ) { |
652 | 0 | hts_log_error("Failed to load .gzi index: %s", fngzi); |
653 | 0 | goto fail; |
654 | 0 | } |
655 | 0 | } |
656 | 0 | free(fai_kstr.s); |
657 | 0 | free(gzi_kstr.s); |
658 | 0 | return fai; |
659 | | |
660 | 0 | fail: |
661 | 0 | if (fai) fai_destroy(fai); |
662 | 0 | if (fp) hclose_abruptly(fp); |
663 | 0 | free(fai_kstr.s); |
664 | 0 | free(gzi_kstr.s); |
665 | 0 | return NULL; |
666 | 0 | } |
667 | | |
668 | | |
669 | | faidx_t *fai_load3(const char *fn, const char *fnfai, const char *fngzi, |
670 | 0 | int flags) { |
671 | 0 | return fai_load3_core(fn, fnfai, fngzi, flags, FAI_FASTA); |
672 | 0 | } |
673 | | |
674 | | |
675 | | faidx_t *fai_load(const char *fn) |
676 | 0 | { |
677 | 0 | return fai_load3(fn, NULL, NULL, FAI_CREATE); |
678 | 0 | } |
679 | | |
680 | | |
681 | | faidx_t *fai_load3_format(const char *fn, const char *fnfai, const char *fngzi, |
682 | 0 | int flags, enum fai_format_options format) { |
683 | 0 | return fai_load3_core(fn, fnfai, fngzi, flags, format); |
684 | 0 | } |
685 | | |
686 | | |
687 | 0 | faidx_t *fai_load_format(const char *fn, enum fai_format_options format) { |
688 | 0 | return fai_load3_format(fn, NULL, NULL, FAI_CREATE, format); |
689 | 0 | } |
690 | | |
691 | | |
692 | | static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val, |
693 | 0 | uint64_t offset, hts_pos_t beg, hts_pos_t end, hts_pos_t *len) { |
694 | 0 | char *s; |
695 | 0 | size_t l; |
696 | 0 | int c = 0; |
697 | 0 | int ret; |
698 | |
|
699 | 0 | if ((uint64_t) end - (uint64_t) beg >= SIZE_MAX - 2) { |
700 | 0 | hts_log_error("Range %"PRId64"..%"PRId64" too big", beg, end); |
701 | 0 | *len = -1; |
702 | 0 | return NULL; |
703 | 0 | } |
704 | | |
705 | 0 | if (val->line_blen <= 0) { |
706 | 0 | hts_log_error("Invalid line length in index: %d", val->line_blen); |
707 | 0 | *len = -1; |
708 | 0 | return NULL; |
709 | 0 | } |
710 | | |
711 | 0 | ret = bgzf_useek(fai->bgzf, |
712 | 0 | offset |
713 | 0 | + beg / val->line_blen * val->line_len |
714 | 0 | + beg % val->line_blen, SEEK_SET); |
715 | |
|
716 | 0 | if (ret < 0) { |
717 | 0 | *len = -1; |
718 | 0 | hts_log_error("Failed to retrieve block. (Seeking in a compressed, .gzi unindexed, file?)"); |
719 | 0 | return NULL; |
720 | 0 | } |
721 | | |
722 | 0 | l = 0; |
723 | 0 | s = (char*)malloc((size_t) end - beg + 2); |
724 | 0 | if (!s) { |
725 | 0 | *len = -1; |
726 | 0 | return NULL; |
727 | 0 | } |
728 | | |
729 | 0 | while ( l < end - beg && (c=bgzf_getc(fai->bgzf))>=0 ) |
730 | 0 | if (isgraph(c)) s[l++] = c; |
731 | 0 | if (c < 0) { |
732 | 0 | hts_log_error("Failed to retrieve block: %s", |
733 | 0 | c == -1 ? "unexpected end of file" : "error reading file"); |
734 | 0 | free(s); |
735 | 0 | *len = -1; |
736 | 0 | return NULL; |
737 | 0 | } |
738 | | |
739 | 0 | s[l] = '\0'; |
740 | 0 | *len = l; |
741 | 0 | return s; |
742 | 0 | } |
743 | | |
744 | | static int fai_get_val(const faidx_t *fai, const char *str, |
745 | 0 | hts_pos_t *len, faidx1_t *val, hts_pos_t *fbeg, hts_pos_t *fend) { |
746 | 0 | khiter_t iter; |
747 | 0 | khash_t(s) *h; |
748 | 0 | int id; |
749 | 0 | hts_pos_t beg, end; |
750 | |
|
751 | 0 | if (!fai_parse_region(fai, str, &id, &beg, &end, 0)) { |
752 | 0 | hts_log_warning("Reference %s not found in FASTA file, returning empty sequence", str); |
753 | 0 | *len = -2; |
754 | 0 | return 1; |
755 | 0 | } |
756 | | |
757 | 0 | h = fai->hash; |
758 | 0 | iter = kh_get(s, h, faidx_iseq(fai, id)); |
759 | 0 | if (iter >= kh_end(h)) { |
760 | | // should have already been caught above |
761 | 0 | abort(); |
762 | 0 | } |
763 | 0 | *val = kh_value(h, iter); |
764 | |
|
765 | 0 | if (beg >= val->len) beg = val->len; |
766 | 0 | if (end >= val->len) end = val->len; |
767 | 0 | if (beg > end) beg = end; |
768 | |
|
769 | 0 | *fbeg = beg; |
770 | 0 | *fend = end; |
771 | |
|
772 | 0 | return 0; |
773 | 0 | } |
774 | | |
775 | | /* |
776 | | * The internal still has line_blen as uint32_t, but our references |
777 | | * can be longer, so for future proofing we use hts_pos_t. We also needed |
778 | | * a signed value so we can return negatives as an error. |
779 | | */ |
780 | | hts_pos_t fai_line_length(const faidx_t *fai, const char *str) |
781 | 0 | { |
782 | 0 | faidx1_t val; |
783 | 0 | int64_t beg, end; |
784 | 0 | hts_pos_t len; |
785 | |
|
786 | 0 | if (fai_get_val(fai, str, &len, &val, &beg, &end)) |
787 | 0 | return -1; |
788 | 0 | else |
789 | 0 | return val.line_blen; |
790 | 0 | } |
791 | | |
792 | | char *fai_fetch64(const faidx_t *fai, const char *str, hts_pos_t *len) |
793 | 0 | { |
794 | 0 | faidx1_t val; |
795 | 0 | int64_t beg, end; |
796 | |
|
797 | 0 | if (fai_get_val(fai, str, len, &val, &beg, &end)) { |
798 | 0 | return NULL; |
799 | 0 | } |
800 | | |
801 | | // now retrieve the sequence |
802 | 0 | return fai_retrieve(fai, &val, val.seq_offset, beg, end, len); |
803 | 0 | } |
804 | | |
805 | | char *fai_fetch(const faidx_t *fai, const char *str, int *len) |
806 | 0 | { |
807 | 0 | hts_pos_t len64; |
808 | 0 | char *ret = fai_fetch64(fai, str, &len64); |
809 | 0 | *len = len64 < INT_MAX ? len64 : INT_MAX; // trunc |
810 | 0 | return ret; |
811 | 0 | } |
812 | | |
813 | 0 | char *fai_fetchqual64(const faidx_t *fai, const char *str, hts_pos_t *len) { |
814 | 0 | faidx1_t val; |
815 | 0 | int64_t beg, end; |
816 | |
|
817 | 0 | if (fai_get_val(fai, str, len, &val, &beg, &end)) { |
818 | 0 | return NULL; |
819 | 0 | } |
820 | | |
821 | | // now retrieve the sequence |
822 | 0 | return fai_retrieve(fai, &val, val.qual_offset, beg, end, len); |
823 | 0 | } |
824 | | |
825 | 0 | char *fai_fetchqual(const faidx_t *fai, const char *str, int *len) { |
826 | 0 | hts_pos_t len64; |
827 | 0 | char *ret = fai_fetchqual64(fai, str, &len64); |
828 | 0 | *len = len64 < INT_MAX ? len64 : INT_MAX; // trunc |
829 | 0 | return ret; |
830 | 0 | } |
831 | | |
832 | | int faidx_fetch_nseq(const faidx_t *fai) |
833 | 0 | { |
834 | 0 | return fai->n; |
835 | 0 | } |
836 | | |
837 | | int faidx_nseq(const faidx_t *fai) |
838 | 0 | { |
839 | 0 | return fai->n; |
840 | 0 | } |
841 | | |
842 | | const char *faidx_iseq(const faidx_t *fai, int i) |
843 | 0 | { |
844 | 0 | return fai->name[i]; |
845 | 0 | } |
846 | | |
847 | | hts_pos_t faidx_seq_len64(const faidx_t *fai, const char *seq) |
848 | 0 | { |
849 | 0 | khint_t k = kh_get(s, fai->hash, seq); |
850 | 0 | if ( k == kh_end(fai->hash) ) return -1; |
851 | 0 | return kh_val(fai->hash, k).len; |
852 | 0 | } |
853 | | |
854 | | int faidx_seq_len(const faidx_t *fai, const char *seq) |
855 | 0 | { |
856 | 0 | hts_pos_t len = faidx_seq_len64(fai, seq); |
857 | 0 | return len < INT_MAX ? len : INT_MAX; |
858 | 0 | } |
859 | | |
860 | | static int faidx_adjust_position(const faidx_t *fai, int end_adjust, |
861 | | faidx1_t *val_out, const char *c_name, |
862 | | hts_pos_t *p_beg_i, hts_pos_t *p_end_i, |
863 | 0 | hts_pos_t *len) { |
864 | 0 | khiter_t iter; |
865 | 0 | faidx1_t *val; |
866 | | |
867 | | // Adjust position |
868 | 0 | iter = kh_get(s, fai->hash, c_name); |
869 | |
|
870 | 0 | if (iter == kh_end(fai->hash)) { |
871 | 0 | if (len) |
872 | 0 | *len = -2; |
873 | 0 | hts_log_error("The sequence \"%s\" was not found", c_name); |
874 | 0 | return 1; |
875 | 0 | } |
876 | | |
877 | 0 | val = &kh_value(fai->hash, iter); |
878 | |
|
879 | 0 | if (val_out) |
880 | 0 | *val_out = *val; |
881 | |
|
882 | 0 | if(*p_end_i < *p_beg_i) |
883 | 0 | *p_beg_i = *p_end_i; |
884 | |
|
885 | 0 | if(*p_beg_i < 0) |
886 | 0 | *p_beg_i = 0; |
887 | 0 | else if(val->len <= *p_beg_i) |
888 | 0 | *p_beg_i = val->len; |
889 | |
|
890 | 0 | if(*p_end_i < 0) |
891 | 0 | *p_end_i = 0; |
892 | 0 | else if(val->len <= *p_end_i) |
893 | 0 | *p_end_i = val->len - end_adjust; |
894 | |
|
895 | 0 | return 0; |
896 | 0 | } |
897 | | |
898 | | int fai_adjust_region(const faidx_t *fai, int tid, |
899 | | hts_pos_t *beg, hts_pos_t *end) |
900 | 0 | { |
901 | 0 | hts_pos_t orig_beg, orig_end; |
902 | |
|
903 | 0 | if (!fai || !beg || !end || tid < 0 || tid >= fai->n) |
904 | 0 | return -1; |
905 | | |
906 | 0 | orig_beg = *beg; |
907 | 0 | orig_end = *end; |
908 | 0 | if (faidx_adjust_position(fai, 0, NULL, fai->name[tid], beg, end, NULL) != 0) { |
909 | 0 | hts_log_error("Inconsistent faidx internal state - couldn't find \"%s\"", |
910 | 0 | fai->name[tid]); |
911 | 0 | return -1; |
912 | 0 | } |
913 | | |
914 | 0 | return ((orig_beg != *beg ? 1 : 0) | |
915 | 0 | (orig_end != *end && orig_end < HTS_POS_MAX ? 2 : 0)); |
916 | 0 | } |
917 | | |
918 | | char *faidx_fetch_seq64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len) |
919 | 0 | { |
920 | 0 | faidx1_t val; |
921 | | |
922 | | // Adjust position |
923 | 0 | if (faidx_adjust_position(fai, 1, &val, c_name, &p_beg_i, &p_end_i, len)) { |
924 | 0 | return NULL; |
925 | 0 | } |
926 | | |
927 | | // Now retrieve the sequence |
928 | 0 | return fai_retrieve(fai, &val, val.seq_offset, p_beg_i, p_end_i + 1, len); |
929 | 0 | } |
930 | | |
931 | | char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len) |
932 | 0 | { |
933 | 0 | hts_pos_t len64; |
934 | 0 | char *ret = faidx_fetch_seq64(fai, c_name, p_beg_i, p_end_i, &len64); |
935 | 0 | *len = len64 < INT_MAX ? len64 : INT_MAX; // trunc |
936 | 0 | return ret; |
937 | 0 | } |
938 | | |
939 | | char *faidx_fetch_qual64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len) |
940 | 0 | { |
941 | 0 | faidx1_t val; |
942 | | |
943 | | // Adjust position |
944 | 0 | if (faidx_adjust_position(fai, 1, &val, c_name, &p_beg_i, &p_end_i, len)) { |
945 | 0 | return NULL; |
946 | 0 | } |
947 | | |
948 | | // Now retrieve the sequence |
949 | 0 | return fai_retrieve(fai, &val, val.qual_offset, p_beg_i, p_end_i + 1, len); |
950 | 0 | } |
951 | | |
952 | | char *faidx_fetch_qual(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len) |
953 | 0 | { |
954 | 0 | hts_pos_t len64; |
955 | 0 | char *ret = faidx_fetch_qual64(fai, c_name, p_beg_i, p_end_i, &len64); |
956 | 0 | *len = len64 < INT_MAX ? len64 : INT_MAX; // trunc |
957 | 0 | return ret; |
958 | 0 | } |
959 | | |
960 | | int faidx_has_seq(const faidx_t *fai, const char *seq) |
961 | 0 | { |
962 | 0 | khiter_t iter = kh_get(s, fai->hash, seq); |
963 | 0 | if (iter == kh_end(fai->hash)) return 0; |
964 | 0 | return 1; |
965 | 0 | } |
966 | | |
967 | | const char *fai_parse_region(const faidx_t *fai, const char *s, |
968 | | int *tid, hts_pos_t *beg, hts_pos_t *end, |
969 | | int flags) |
970 | 0 | { |
971 | 0 | return hts_parse_region(s, tid, beg, end, (hts_name2id_f)fai_name2id, (void *)fai, flags); |
972 | 0 | } |
973 | | |
974 | 0 | void fai_set_cache_size(faidx_t *fai, int cache_size) { |
975 | 0 | bgzf_set_cache_size(fai->bgzf, cache_size); |
976 | 0 | } |
977 | | |
978 | | // Adds a thread pool to the underlying BGZF layer. |
979 | 0 | int fai_thread_pool(faidx_t *fai, struct hts_tpool *pool, int qsize) { |
980 | 0 | return bgzf_thread_pool(fai->bgzf, pool, qsize); |
981 | 0 | } |
982 | | |
983 | 0 | char *fai_path(const char *fa) { |
984 | 0 | char *fai = NULL; |
985 | 0 | if (!fa) { |
986 | 0 | hts_log_error("No reference file specified"); |
987 | 0 | } else { |
988 | 0 | char *fai_tmp = strstr(fa, HTS_IDX_DELIM); |
989 | 0 | if (fai_tmp) { |
990 | 0 | fai_tmp += strlen(HTS_IDX_DELIM); |
991 | 0 | fai = strdup(fai_tmp); |
992 | 0 | if (!fai) |
993 | 0 | hts_log_error("Failed to allocate memory"); |
994 | 0 | } else { |
995 | 0 | if (hisremote(fa)) { |
996 | 0 | fai = hts_idx_locatefn(fa, ".fai"); // get the remote fai file name, if any, but do not download the file |
997 | 0 | if (!fai) |
998 | 0 | hts_log_error("Failed to locate index file for remote reference file '%s'", fa); |
999 | 0 | } else{ |
1000 | 0 | if (hts_idx_check_local(fa, HTS_FMT_FAI, &fai) == 0 && fai) { |
1001 | 0 | if (fai_build3(fa, fai, NULL) == -1) { // create local fai file by indexing local fasta |
1002 | 0 | hts_log_error("Failed to build index file for reference file '%s'", fa); |
1003 | 0 | free(fai); |
1004 | 0 | fai = NULL; |
1005 | 0 | } |
1006 | 0 | } |
1007 | 0 | } |
1008 | 0 | } |
1009 | 0 | } |
1010 | |
|
1011 | 0 | return fai; |
1012 | 0 | } |