Line | Count | Source (jump to first uncovered line) |
1 | | /* hts.c -- format-neutral I/O, indexing, and iterator API functions. |
2 | | |
3 | | Copyright (C) 2008, 2009, 2012-2023 Genome Research Ltd. |
4 | | Copyright (C) 2012, 2013 Broad Institute. |
5 | | |
6 | | Author: Heng Li <lh3@sanger.ac.uk> |
7 | | |
8 | | Permission is hereby granted, free of charge, to any person obtaining a copy |
9 | | of this software and associated documentation files (the "Software"), to deal |
10 | | in the Software without restriction, including without limitation the rights |
11 | | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
12 | | copies of the Software, and to permit persons to whom the Software is |
13 | | furnished to do so, subject to the following conditions: |
14 | | |
15 | | The above copyright notice and this permission notice shall be included in |
16 | | all copies or substantial portions of the Software. |
17 | | |
18 | | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
19 | | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
20 | | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
21 | | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
22 | | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
23 | | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
24 | | DEALINGS IN THE SOFTWARE. */ |
25 | | |
26 | | #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h |
27 | | #include <config.h> |
28 | | |
29 | | #include <zlib.h> |
30 | | #include <stdio.h> |
31 | | #include <string.h> |
32 | | #include <strings.h> |
33 | | #include <stdlib.h> |
34 | | #include <unistd.h> |
35 | | #include <inttypes.h> |
36 | | #include <limits.h> |
37 | | #include <stdint.h> |
38 | | #include <fcntl.h> |
39 | | #include <errno.h> |
40 | | #include <time.h> |
41 | | #include <sys/stat.h> |
42 | | #include <assert.h> |
43 | | |
44 | | #ifdef HAVE_LIBLZMA |
45 | | #ifdef HAVE_LZMA_H |
46 | | #include <lzma.h> |
47 | | #else |
48 | | #include "os/lzma_stub.h" |
49 | | #endif |
50 | | #endif |
51 | | |
52 | | #include "htslib/hts.h" |
53 | | #include "htslib/bgzf.h" |
54 | | #include "cram/cram.h" |
55 | | #include "htslib/hfile.h" |
56 | | #include "htslib/hts_endian.h" |
57 | | #include "version.h" |
58 | | #include "config_vars.h" |
59 | | #include "hts_internal.h" |
60 | | #include "hfile_internal.h" |
61 | | #include "sam_internal.h" |
62 | | #include "htslib/hts_expr.h" |
63 | | #include "htslib/hts_os.h" // drand48 |
64 | | |
65 | | #include "htslib/khash.h" |
66 | | #include "htslib/kseq.h" |
67 | | #include "htslib/ksort.h" |
68 | | #include "htslib/tbx.h" |
69 | | #if defined(HAVE_EXTERNAL_LIBHTSCODECS) |
70 | | #include <htscodecs/htscodecs.h> |
71 | | #else |
72 | | #include "htscodecs/htscodecs/htscodecs.h" |
73 | | #endif |
74 | | |
75 | | #ifndef EFTYPE |
76 | 4 | #define EFTYPE ENOEXEC |
77 | | #endif |
78 | | |
79 | 49.7k | KHASH_INIT2(s2i,, kh_cstr_t, int64_t, 1, kh_str_hash_func, kh_str_hash_equal) |
80 | | |
81 | | HTSLIB_EXPORT |
82 | | int hts_verbose = HTS_LOG_WARNING; |
83 | | |
84 | | const char *hts_version() |
85 | 2 | { |
86 | 2 | return HTS_VERSION_TEXT; |
87 | 2 | } |
88 | | |
89 | 0 | unsigned int hts_features(void) { |
90 | 0 | unsigned int feat = HTS_FEATURE_HTSCODECS; // Always present |
91 | |
|
92 | 0 | #ifdef PACKAGE_URL |
93 | 0 | feat |= HTS_FEATURE_CONFIGURE; |
94 | 0 | #endif |
95 | |
|
96 | | #ifdef ENABLE_PLUGINS |
97 | | feat |= HTS_FEATURE_PLUGINS; |
98 | | #endif |
99 | |
|
100 | 0 | #ifdef HAVE_LIBCURL |
101 | 0 | feat |= HTS_FEATURE_LIBCURL; |
102 | 0 | #endif |
103 | |
|
104 | 0 | #ifdef ENABLE_S3 |
105 | 0 | feat |= HTS_FEATURE_S3; |
106 | 0 | #endif |
107 | |
|
108 | 0 | #ifdef ENABLE_GCS |
109 | 0 | feat |= HTS_FEATURE_GCS; |
110 | 0 | #endif |
111 | |
|
112 | | #ifdef HAVE_LIBDEFLATE |
113 | | feat |= HTS_FEATURE_LIBDEFLATE; |
114 | | #endif |
115 | |
|
116 | 0 | #ifdef HAVE_LIBLZMA |
117 | 0 | feat |= HTS_FEATURE_LZMA; |
118 | 0 | #endif |
119 | |
|
120 | 0 | #ifdef HAVE_LIBBZ2 |
121 | 0 | feat |= HTS_FEATURE_BZIP2; |
122 | 0 | #endif |
123 | |
|
124 | 0 | return feat; |
125 | 0 | } |
126 | | |
127 | 0 | const char *hts_test_feature(unsigned int id) { |
128 | 0 | unsigned int feat = hts_features(); |
129 | |
|
130 | 0 | switch (id) { |
131 | 0 | case HTS_FEATURE_CONFIGURE: |
132 | 0 | return feat & HTS_FEATURE_CONFIGURE ? "yes" : NULL; |
133 | 0 | case HTS_FEATURE_PLUGINS: |
134 | 0 | return feat & HTS_FEATURE_PLUGINS ? "yes" : NULL; |
135 | 0 | case HTS_FEATURE_LIBCURL: |
136 | 0 | return feat & HTS_FEATURE_LIBCURL ? "yes" : NULL; |
137 | 0 | case HTS_FEATURE_S3: |
138 | 0 | return feat & HTS_FEATURE_S3 ? "yes" : NULL; |
139 | 0 | case HTS_FEATURE_GCS: |
140 | 0 | return feat & HTS_FEATURE_GCS ? "yes" : NULL; |
141 | 0 | case HTS_FEATURE_LIBDEFLATE: |
142 | 0 | return feat & HTS_FEATURE_LIBDEFLATE ? "yes" : NULL; |
143 | 0 | case HTS_FEATURE_BZIP2: |
144 | 0 | return feat & HTS_FEATURE_BZIP2 ? "yes" : NULL; |
145 | 0 | case HTS_FEATURE_LZMA: |
146 | 0 | return feat & HTS_FEATURE_LZMA ? "yes" : NULL; |
147 | | |
148 | 0 | case HTS_FEATURE_HTSCODECS: |
149 | 0 | return htscodecs_version(); |
150 | | |
151 | 0 | case HTS_FEATURE_CC: |
152 | 0 | return HTS_CC; |
153 | 0 | case HTS_FEATURE_CFLAGS: |
154 | 0 | return HTS_CFLAGS; |
155 | 0 | case HTS_FEATURE_LDFLAGS: |
156 | 0 | return HTS_LDFLAGS; |
157 | 0 | case HTS_FEATURE_CPPFLAGS: |
158 | 0 | return HTS_CPPFLAGS; |
159 | | |
160 | 0 | default: |
161 | 0 | fprintf(stderr, "Unknown feature code: %u\n", id); |
162 | 0 | } |
163 | | |
164 | 0 | return NULL; |
165 | 0 | } |
166 | | |
167 | | // Note this implementation also means we can just "strings" the library |
168 | | // to find the configuration parameters. |
169 | 0 | const char *hts_feature_string(void) { |
170 | 0 | static char config[1200]; |
171 | 0 | const char *flags= |
172 | |
|
173 | 0 | #ifdef PACKAGE_URL |
174 | 0 | "build=configure " |
175 | | #else |
176 | | "build=Makefile " |
177 | | #endif |
178 | |
|
179 | 0 | #ifdef HAVE_LIBCURL |
180 | 0 | "libcurl=yes " |
181 | | #else |
182 | | "libcurl=no " |
183 | | #endif |
184 | |
|
185 | 0 | #ifdef ENABLE_S3 |
186 | 0 | "S3=yes " |
187 | | #else |
188 | | "S3=no " |
189 | | #endif |
190 | |
|
191 | 0 | #ifdef ENABLE_GCS |
192 | 0 | "GCS=yes " |
193 | | #else |
194 | | "GCS=no " |
195 | | #endif |
196 | |
|
197 | | #ifdef HAVE_LIBDEFLATE |
198 | | "libdeflate=yes " |
199 | | #else |
200 | 0 | "libdeflate=no " |
201 | 0 | #endif |
202 | |
|
203 | 0 | #ifdef HAVE_LIBLZMA |
204 | 0 | "lzma=yes " |
205 | | #else |
206 | | "lzma=no " |
207 | | #endif |
208 | |
|
209 | 0 | #ifdef HAVE_LIBBZ2 |
210 | 0 | "bzip2=yes " |
211 | | #else |
212 | | "bzip2=no " |
213 | | #endif |
214 | | |
215 | | // "plugins=" must stay at the end as it is followed by "plugin-path=" |
216 | | #ifdef ENABLE_PLUGINS |
217 | | "plugins=yes"; |
218 | | #else |
219 | 0 | "plugins=no"; |
220 | 0 | #endif |
221 | |
|
222 | | #ifdef ENABLE_PLUGINS |
223 | | snprintf(config, sizeof(config), |
224 | | "%s plugin-path=%.1000s htscodecs=%.40s", |
225 | | flags, hts_plugin_path(), htscodecs_version()); |
226 | | #else |
227 | 0 | snprintf(config, sizeof(config), |
228 | 0 | "%s htscodecs=%.40s", |
229 | 0 | flags, htscodecs_version()); |
230 | 0 | #endif |
231 | 0 | return config; |
232 | 0 | } |
233 | | |
234 | | |
235 | | HTSLIB_EXPORT |
236 | | const unsigned char seq_nt16_table[256] = { |
237 | | 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, |
238 | | 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, |
239 | | 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, |
240 | | 1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15, |
241 | | 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, |
242 | | 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15, |
243 | | 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, |
244 | | 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15, |
245 | | |
246 | | 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, |
247 | | 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, |
248 | | 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, |
249 | | 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, |
250 | | 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, |
251 | | 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, |
252 | | 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, |
253 | | 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15 |
254 | | }; |
255 | | |
256 | | HTSLIB_EXPORT |
257 | | const char seq_nt16_str[] = "=ACMGRSVTWYHKDBN"; |
258 | | |
259 | | HTSLIB_EXPORT |
260 | | const int seq_nt16_int[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; |
261 | | |
262 | | /********************** |
263 | | *** Basic file I/O *** |
264 | | **********************/ |
265 | | |
266 | | static enum htsFormatCategory format_category(enum htsExactFormat fmt) |
267 | 13.6k | { |
268 | 13.6k | switch (fmt) { |
269 | 0 | case bam: |
270 | 0 | case sam: |
271 | 0 | case cram: |
272 | 0 | case fastq_format: |
273 | 0 | case fasta_format: |
274 | 0 | return sequence_data; |
275 | | |
276 | 0 | case vcf: |
277 | 0 | case bcf: |
278 | 0 | return variant_data; |
279 | | |
280 | 0 | case bai: |
281 | 0 | case crai: |
282 | 0 | case csi: |
283 | 0 | case fai_format: |
284 | 0 | case fqi_format: |
285 | 0 | case gzi: |
286 | 0 | case tbi: |
287 | 0 | return index_file; |
288 | | |
289 | 0 | case bed: |
290 | 0 | case d4_format: |
291 | 0 | return region_list; |
292 | | |
293 | 0 | case htsget: |
294 | 0 | case hts_crypt4gh_format: |
295 | 0 | return unknown_category; |
296 | | |
297 | 0 | case unknown_format: |
298 | 0 | case binary_format: |
299 | 13.6k | case text_format: |
300 | 13.6k | case empty_format: |
301 | 13.6k | case format_maximum: |
302 | 13.6k | break; |
303 | 13.6k | } |
304 | | |
305 | 13.6k | return unknown_category; |
306 | 13.6k | } |
307 | | |
308 | | // Decompress several hundred bytes by peeking at the file, which must be |
309 | | // positioned at the start of a GZIP block. |
310 | | static ssize_t |
311 | | decompress_peek_gz(hFILE *fp, unsigned char *dest, size_t destsize) |
312 | 1.12k | { |
313 | 1.12k | unsigned char buffer[2048]; |
314 | 1.12k | z_stream zs; |
315 | 1.12k | ssize_t npeek = hpeek(fp, buffer, sizeof buffer); |
316 | | |
317 | 1.12k | if (npeek < 0) return -1; |
318 | | |
319 | 1.12k | zs.zalloc = NULL; |
320 | 1.12k | zs.zfree = NULL; |
321 | 1.12k | zs.next_in = buffer; |
322 | 1.12k | zs.avail_in = npeek; |
323 | 1.12k | zs.next_out = dest; |
324 | 1.12k | zs.avail_out = destsize; |
325 | 1.12k | if (inflateInit2(&zs, 31) != Z_OK) return -1; |
326 | | |
327 | 1.12k | int ret; |
328 | 1.12k | const unsigned char *last_in = buffer; |
329 | 2.28k | while (zs.avail_out > 0) { |
330 | 1.22k | ret = inflate(&zs, Z_SYNC_FLUSH); |
331 | 1.22k | if (ret == Z_STREAM_END) { |
332 | 82 | if (last_in == zs.next_in) |
333 | 0 | break; // Paranoia to avoid potential looping. Shouldn't happen |
334 | 82 | else |
335 | 82 | last_in = zs.next_in; |
336 | 82 | inflateReset(&zs); |
337 | 1.13k | } else if (ret != Z_OK) { |
338 | | // eg Z_BUF_ERROR due to avail_in/out becoming zero |
339 | 58 | break; |
340 | 58 | } |
341 | 1.22k | } |
342 | | |
343 | | // NB: zs.total_out is changed by inflateReset, so use pointer diff instead |
344 | 1.12k | destsize = zs.next_out - dest; |
345 | 1.12k | inflateEnd(&zs); |
346 | | |
347 | 1.12k | return destsize; |
348 | 1.12k | } |
349 | | |
350 | | #ifdef HAVE_LIBLZMA |
351 | | // Similarly decompress a portion by peeking at the file, which must be |
352 | | // positioned at the start of the file. |
353 | | static ssize_t |
354 | | decompress_peek_xz(hFILE *fp, unsigned char *dest, size_t destsize) |
355 | 0 | { |
356 | 0 | unsigned char buffer[2048]; |
357 | 0 | ssize_t npeek = hpeek(fp, buffer, sizeof buffer); |
358 | 0 | if (npeek < 0) return -1; |
359 | | |
360 | 0 | lzma_stream ls = LZMA_STREAM_INIT; |
361 | 0 | if (lzma_stream_decoder(&ls, lzma_easy_decoder_memusage(9), 0) != LZMA_OK) |
362 | 0 | return -1; |
363 | | |
364 | 0 | ls.next_in = buffer; |
365 | 0 | ls.avail_in = npeek; |
366 | 0 | ls.next_out = dest; |
367 | 0 | ls.avail_out = destsize; |
368 | |
|
369 | 0 | int r = lzma_code(&ls, LZMA_RUN); |
370 | 0 | if (! (r == LZMA_OK || r == LZMA_STREAM_END)) { |
371 | 0 | lzma_end(&ls); |
372 | 0 | return -1; |
373 | 0 | } |
374 | | |
375 | 0 | destsize = ls.total_out; |
376 | 0 | lzma_end(&ls); |
377 | |
|
378 | 0 | return destsize; |
379 | 0 | } |
380 | | #endif |
381 | | |
382 | | // Parse "x.y" text, taking care because the string is not NUL-terminated |
383 | | // and filling in major/minor only when the digits are followed by a delimiter, |
384 | | // so we don't misread "1.10" as "1.1" due to reaching the end of the buffer. |
385 | | static void |
386 | | parse_version(htsFormat *fmt, const unsigned char *u, const unsigned char *ulim) |
387 | 30 | { |
388 | 30 | const char *s = (const char *) u; |
389 | 30 | const char *slim = (const char *) ulim; |
390 | 30 | short v; |
391 | | |
392 | 30 | fmt->version.major = fmt->version.minor = -1; |
393 | | |
394 | 194 | for (v = 0; s < slim && isdigit_c(*s); s++) |
395 | 164 | v = 10 * v + *s - '0'; |
396 | | |
397 | 30 | if (s < slim) { |
398 | 29 | fmt->version.major = v; |
399 | 29 | if (*s == '.') { |
400 | 2 | s++; |
401 | 151 | for (v = 0; s < slim && isdigit_c(*s); s++) |
402 | 149 | v = 10 * v + *s - '0'; |
403 | 2 | if (s < slim) |
404 | 2 | fmt->version.minor = v; |
405 | 2 | } |
406 | 27 | else |
407 | 27 | fmt->version.minor = 0; |
408 | 29 | } |
409 | 30 | } |
410 | | |
411 | | static int |
412 | | cmp_nonblank(const char *key, const unsigned char *u, const unsigned char *ulim) |
413 | 352 | { |
414 | 352 | const unsigned char *ukey = (const unsigned char *) key; |
415 | | |
416 | 1.37k | while (*ukey) |
417 | 1.37k | if (u >= ulim) return +1; |
418 | 1.37k | else if (isspace_c(*u)) u++; |
419 | 351 | else if (*u != *ukey) return (*ukey < *u)? -1 : +1; |
420 | 0 | else u++, ukey++; |
421 | | |
422 | 0 | return 0; |
423 | 352 | } |
424 | | |
425 | | static int is_text_only(const unsigned char *u, const unsigned char *ulim) |
426 | 352 | { |
427 | 19.4k | for (; u < ulim; u++) |
428 | 19.0k | if (! (*u >= ' ' || *u == '\t' || *u == '\r' || *u == '\n')) |
429 | 4 | return 0; |
430 | | |
431 | 348 | return 1; |
432 | 352 | } |
433 | | |
434 | | static int is_fastaq(const unsigned char *u, const unsigned char *ulim) |
435 | 347 | { |
436 | 347 | const unsigned char *eol = memchr(u, '\n', ulim - u); |
437 | | |
438 | | // Check that the first line is entirely textual |
439 | 347 | if (! is_text_only(u, eol? eol : ulim)) return 0; |
440 | | |
441 | | // If the first line is very long, consider the file to indeed be FASTA/Q |
442 | 347 | if (eol == NULL) return 1; |
443 | | |
444 | 289 | u = eol+1; // Now points to the first character of the second line |
445 | | |
446 | | // Scan over all base-encoding letters (including 'N' but not SEQ's '=') |
447 | 700 | while (u < ulim && (seq_nt16_table[*u] != 15 || toupper(*u) == 'N')) { |
448 | 411 | if (*u == '=') return 0; |
449 | 411 | u++; |
450 | 411 | } |
451 | | |
452 | 289 | return (u == ulim || *u == '\r' || *u == '\n')? 1 : 0; |
453 | 289 | } |
454 | | |
455 | | // Parse tab-delimited text, filling in a string of column types and returning |
456 | | // the number of columns spotted (within [u,ulim), and up to column_len) or -1 |
457 | | // if non-printable characters were seen. Column types: |
458 | | // i: integer, s: strand sign, C: CIGAR, O: SAM optional field, Z: anything |
459 | | static int |
460 | | parse_tabbed_text(char *columns, int column_len, |
461 | | const unsigned char *u, const unsigned char *ulim, |
462 | | int *complete) |
463 | 5 | { |
464 | 5 | const char *str = (const char *) u; |
465 | 5 | const char *slim = (const char *) ulim; |
466 | 5 | const char *s; |
467 | 5 | int ncolumns = 0; |
468 | | |
469 | 5 | enum { digit = 1, leading_sign = 2, cigar_operator = 4, other = 8 }; |
470 | 5 | unsigned seen = 0; |
471 | 5 | *complete = 0; |
472 | | |
473 | 1.21k | for (s = str; s < slim; s++) |
474 | 1.21k | if (*s >= ' ') { |
475 | 1.20k | if (isdigit_c(*s)) |
476 | 154 | seen |= digit; |
477 | 1.04k | else if ((*s == '+' || *s == '-') && s == str) |
478 | 1 | seen |= leading_sign; |
479 | 1.04k | else if (strchr(BAM_CIGAR_STR, *s) && s > str && isdigit_c(s[-1])) |
480 | 0 | seen |= cigar_operator; |
481 | 1.04k | else |
482 | 1.04k | seen |= other; |
483 | 1.20k | } |
484 | 13 | else if (*s == '\t' || *s == '\r' || *s == '\n') { |
485 | 10 | size_t len = s - str; |
486 | 10 | char type; |
487 | | |
488 | 10 | if (seen == digit || seen == (leading_sign|digit)) type = 'i'; |
489 | 5 | else if (seen == (digit|cigar_operator)) type = 'C'; |
490 | 5 | else if (len == 1) |
491 | 2 | switch (str[0]) { |
492 | 2 | case '*': type = 'C'; break; |
493 | 0 | case '+': case '-': case '.': type = 's'; break; |
494 | 0 | default: type = 'Z'; break; |
495 | 2 | } |
496 | 3 | else if (len >= 5 && str[2] == ':' && str[4] == ':') type = 'O'; |
497 | 3 | else type = 'Z'; |
498 | | |
499 | 10 | columns[ncolumns++] = type; |
500 | 10 | if (*s != '\t' || ncolumns >= column_len - 1) { |
501 | 0 | *complete = 1; // finished the line or more columns than needed |
502 | 0 | break; |
503 | 0 | } |
504 | | |
505 | 10 | str = s + 1; |
506 | 10 | seen = 0; |
507 | 10 | } |
508 | 3 | else return -1; |
509 | | |
510 | 2 | columns[ncolumns] = '\0'; |
511 | 2 | return ncolumns; |
512 | 5 | } |
513 | | |
514 | | // Match COLUMNS as a prefix against PATTERN (so COLUMNS may run out first). |
515 | | // Returns len(COLUMNS) (modulo '+'), or 0 if there is a mismatched entry. |
516 | | static int colmatch(const char *columns, const char *pattern) |
517 | 1 | { |
518 | 1 | int i; |
519 | 11 | for (i = 0; columns[i] != '\0'; i++) { |
520 | 10 | if (pattern[i] == '+') return i; |
521 | 10 | if (! (columns[i] == pattern[i] || pattern[i] == 'Z')) return 0; |
522 | 10 | } |
523 | | |
524 | 1 | return i; |
525 | 1 | } |
526 | | |
527 | | int hts_detect_format(hFILE *hfile, htsFormat *fmt) |
528 | 0 | { |
529 | 0 | return hts_detect_format2(hfile, NULL, fmt); |
530 | 0 | } |
531 | | |
532 | | int hts_detect_format2(hFILE *hfile, const char *fname, htsFormat *fmt) |
533 | 14.8k | { |
534 | 14.8k | char extension[HTS_MAX_EXT_LEN], columns[24]; |
535 | 14.8k | unsigned char s[1024]; |
536 | 14.8k | int complete = 0; |
537 | 14.8k | ssize_t len = hpeek(hfile, s, 18); |
538 | 14.8k | if (len < 0) return -1; |
539 | | |
540 | 14.8k | fmt->category = unknown_category; |
541 | 14.8k | fmt->format = unknown_format; |
542 | 14.8k | fmt->version.major = fmt->version.minor = -1; |
543 | 14.8k | fmt->compression = no_compression; |
544 | 14.8k | fmt->compression_level = -1; |
545 | 14.8k | fmt->specific = NULL; |
546 | | |
547 | 14.8k | if (len >= 2 && s[0] == 0x1f && s[1] == 0x8b) { |
548 | | // The stream is either gzip-compressed or BGZF-compressed. |
549 | | // Determine which, and decompress the first few records or lines. |
550 | 1.12k | fmt->compression = gzip; |
551 | 1.12k | if (len >= 18 && (s[3] & 4)) { |
552 | 216 | if (memcmp(&s[12], "BC\2\0", 4) == 0) |
553 | 38 | fmt->compression = bgzf; |
554 | 178 | else if (memcmp(&s[12], "RAZF", 4) == 0) |
555 | 10 | fmt->compression = razf_compression; |
556 | 216 | } |
557 | 1.12k | if (len >= 9 && s[2] == 8) |
558 | 1.12k | fmt->compression_level = (s[8] == 2)? 9 : (s[8] == 4)? 1 : -1; |
559 | | |
560 | 1.12k | len = decompress_peek_gz(hfile, s, sizeof s); |
561 | 1.12k | } |
562 | 13.7k | else if (len >= 10 && memcmp(s, "BZh", 3) == 0 && |
563 | 13.7k | (memcmp(&s[4], "\x31\x41\x59\x26\x53\x59", 6) == 0 || |
564 | 0 | memcmp(&s[4], "\x17\x72\x45\x38\x50\x90", 6) == 0)) { |
565 | 0 | fmt->compression = bzip2_compression; |
566 | 0 | fmt->compression_level = s[3] - '0'; |
567 | | // Decompressing via libbz2 produces no output until it has a whole |
568 | | // block (of size 100Kb x level), which is too large for peeking. |
569 | | // So unfortunately we can recognise bzip2 but not the contents, |
570 | | // except that \x1772... magic indicates the stream is empty. |
571 | 0 | if (s[4] == '\x31') return 0; |
572 | 0 | else len = 0; |
573 | 0 | } |
574 | 13.7k | else if (len >= 6 && memcmp(s, "\xfd""7zXZ\0", 6) == 0) { |
575 | 0 | fmt->compression = xz_compression; |
576 | 0 | #ifdef HAVE_LIBLZMA |
577 | 0 | len = decompress_peek_xz(hfile, s, sizeof s); |
578 | | #else |
579 | | // Without liblzma, we can't recognise the decompressed contents. |
580 | | return 0; |
581 | | #endif |
582 | 0 | } |
583 | 13.7k | else if (len >= 4 && memcmp(s, "\x28\xb5\x2f\xfd", 4) == 0) { |
584 | 0 | fmt->compression = zstd_compression; |
585 | 0 | return 0; |
586 | 0 | } |
587 | 13.7k | else { |
588 | 13.7k | len = hpeek(hfile, s, sizeof s); |
589 | 13.7k | } |
590 | 14.8k | if (len < 0) return -1; |
591 | | |
592 | 14.8k | if (len == 0) { |
593 | 10 | fmt->format = empty_format; |
594 | 10 | return 0; |
595 | 10 | } |
596 | | |
597 | | // We avoid using filename extensions wherever possible (as filenames are |
598 | | // not always available), but in a few cases they must be considered: |
599 | | // - FASTA/Q indexes are simply tab-separated text; files that match these |
600 | | // patterns but not the fai/fqi extension are usually generic BED files |
601 | | // - GZI indexes have no magic numbers so can only be detected by filename |
602 | 14.8k | if (fname && strcmp(fname, "-") != 0) { |
603 | 14.8k | char *s; |
604 | 14.8k | if (find_file_extension(fname, extension) < 0) extension[0] = '\0'; |
605 | 14.8k | for (s = extension; *s; s++) *s = tolower_c(*s); |
606 | 14.8k | } |
607 | 0 | else extension[0] = '\0'; |
608 | | |
609 | 14.8k | if (len >= 6 && memcmp(s,"CRAM",4) == 0 && s[4]>=1 && s[4]<=7 && s[5]<=7) { |
610 | 5.09k | fmt->category = sequence_data; |
611 | 5.09k | fmt->format = cram; |
612 | 5.09k | fmt->version.major = s[4], fmt->version.minor = s[5]; |
613 | 5.09k | fmt->compression = custom; |
614 | 5.09k | return 0; |
615 | 5.09k | } |
616 | 9.79k | else if (len >= 4 && s[3] <= '\4') { |
617 | 1.11k | if (memcmp(s, "BAM\1", 4) == 0) { |
618 | 468 | fmt->category = sequence_data; |
619 | 468 | fmt->format = bam; |
620 | | // TODO Decompress enough to pick version from @HD-VN header |
621 | 468 | fmt->version.major = 1, fmt->version.minor = -1; |
622 | 468 | return 0; |
623 | 468 | } |
624 | 650 | else if (memcmp(s, "BAI\1", 4) == 0) { |
625 | 0 | fmt->category = index_file; |
626 | 0 | fmt->format = bai; |
627 | 0 | fmt->version.major = -1, fmt->version.minor = -1; |
628 | 0 | return 0; |
629 | 0 | } |
630 | 650 | else if (memcmp(s, "BCF\4", 4) == 0) { |
631 | 0 | fmt->category = variant_data; |
632 | 0 | fmt->format = bcf; |
633 | 0 | fmt->version.major = 1, fmt->version.minor = -1; |
634 | 0 | return 0; |
635 | 0 | } |
636 | 650 | else if (memcmp(s, "BCF\2", 4) == 0) { |
637 | 649 | fmt->category = variant_data; |
638 | 649 | fmt->format = bcf; |
639 | 649 | fmt->version.major = s[3]; |
640 | 649 | fmt->version.minor = (len >= 5 && s[4] <= 2)? s[4] : 0; |
641 | 649 | return 0; |
642 | 649 | } |
643 | 1 | else if (memcmp(s, "CSI\1", 4) == 0) { |
644 | 0 | fmt->category = index_file; |
645 | 0 | fmt->format = csi; |
646 | 0 | fmt->version.major = 1, fmt->version.minor = -1; |
647 | 0 | return 0; |
648 | 0 | } |
649 | 1 | else if (memcmp(s, "TBI\1", 4) == 0) { |
650 | 0 | fmt->category = index_file; |
651 | 0 | fmt->format = tbi; |
652 | 0 | return 0; |
653 | 0 | } |
654 | | // GZI indexes have no magic numbers, so must be recognised solely by |
655 | | // filename extension. |
656 | 1 | else if (strcmp(extension, "gzi") == 0) { |
657 | 0 | fmt->category = index_file; |
658 | 0 | fmt->format = gzi; |
659 | 0 | return 0; |
660 | 0 | } |
661 | 1.11k | } |
662 | 8.67k | else if (len >= 16 && memcmp(s, "##fileformat=VCF", 16) == 0) { |
663 | 5.38k | fmt->category = variant_data; |
664 | 5.38k | fmt->format = vcf; |
665 | 5.38k | if (len >= 21 && s[16] == 'v') |
666 | 0 | parse_version(fmt, &s[17], &s[len]); |
667 | 5.38k | return 0; |
668 | 5.38k | } |
669 | 3.28k | else if (len >= 4 && s[0] == '@' && |
670 | 3.28k | (memcmp(s, "@HD\t", 4) == 0 || memcmp(s, "@SQ\t", 4) == 0 || |
671 | 2.96k | memcmp(s, "@RG\t", 4) == 0 || memcmp(s, "@PG\t", 4) == 0 || |
672 | 2.96k | memcmp(s, "@CO\t", 4) == 0)) { |
673 | 2.93k | fmt->category = sequence_data; |
674 | 2.93k | fmt->format = sam; |
675 | | // @HD-VN is not guaranteed to be the first tag, but then @HD is |
676 | | // not guaranteed to be present at all... |
677 | 2.93k | if (len >= 9 && memcmp(s, "@HD\tVN:", 7) == 0) |
678 | 30 | parse_version(fmt, &s[7], &s[len]); |
679 | 2.90k | else |
680 | 2.90k | fmt->version.major = 1, fmt->version.minor = -1; |
681 | 2.93k | return 0; |
682 | 2.93k | } |
683 | 352 | else if (len >= 8 && memcmp(s, "d4\xdd\xdd", 4) == 0) { |
684 | 0 | fmt->category = region_list; |
685 | 0 | fmt->format = d4_format; |
686 | | // How to decode the D4 Format Version bytes is not yet specified |
687 | | // so we don't try to set fmt->version.{major,minor}. |
688 | 0 | return 0; |
689 | 0 | } |
690 | 352 | else if (cmp_nonblank("{\"htsget\":", s, &s[len]) == 0) { |
691 | 0 | fmt->category = unknown_category; |
692 | 0 | fmt->format = htsget; |
693 | 0 | return 0; |
694 | 0 | } |
695 | 352 | else if (len > 8 && memcmp(s, "crypt4gh", 8) == 0) { |
696 | 0 | fmt->category = unknown_category; |
697 | 0 | fmt->format = hts_crypt4gh_format; |
698 | 0 | return 0; |
699 | 0 | } |
700 | 352 | else if (len >= 1 && s[0] == '>' && is_fastaq(s, &s[len])) { |
701 | 317 | fmt->category = sequence_data; |
702 | 317 | fmt->format = fasta_format; |
703 | 317 | return 0; |
704 | 317 | } |
705 | 35 | else if (len >= 1 && s[0] == '@' && is_fastaq(s, &s[len])) { |
706 | 30 | fmt->category = sequence_data; |
707 | 30 | fmt->format = fastq_format; |
708 | 30 | return 0; |
709 | 30 | } |
710 | 5 | else if (parse_tabbed_text(columns, sizeof columns, s, |
711 | 5 | &s[len], &complete) > 0) { |
712 | | // A complete SAM line is at least 11 columns. On unmapped long reads may |
713 | | // be missing two. (On mapped long reads we must have an @ header so long |
714 | | // CIGAR is irrelevant.) |
715 | 1 | if (colmatch(columns, "ZiZiiCZiiZZOOOOOOOOOOOOOOOOOOOO+") |
716 | 1 | >= 9 + 2*complete) { |
717 | 1 | fmt->category = sequence_data; |
718 | 1 | fmt->format = sam; |
719 | 1 | fmt->version.major = 1, fmt->version.minor = -1; |
720 | 1 | return 0; |
721 | 1 | } |
722 | 0 | else if (fmt->compression == gzip && colmatch(columns, "iiiiii") == 6) { |
723 | 0 | fmt->category = index_file; |
724 | 0 | fmt->format = crai; |
725 | 0 | return 0; |
726 | 0 | } |
727 | 0 | else if (strstr(extension, "fqi") && colmatch(columns, "Ziiiii") == 6) { |
728 | 0 | fmt->category = index_file; |
729 | 0 | fmt->format = fqi_format; |
730 | 0 | return 0; |
731 | 0 | } |
732 | 0 | else if (strstr(extension, "fai") && colmatch(columns, "Ziiii") == 5) { |
733 | 0 | fmt->category = index_file; |
734 | 0 | fmt->format = fai_format; |
735 | 0 | return 0; |
736 | 0 | } |
737 | 0 | else if (colmatch(columns, "Zii+") >= 3) { |
738 | 0 | fmt->category = region_list; |
739 | 0 | fmt->format = bed; |
740 | 0 | return 0; |
741 | 0 | } |
742 | 1 | } |
743 | | |
744 | | // Arbitrary text files can be read using hts_getline(). |
745 | 5 | if (is_text_only(s, &s[len])) fmt->format = text_format; |
746 | | |
747 | | // Nothing recognised: leave unset fmt-> fields as unknown. |
748 | 5 | return 0; |
749 | 14.8k | } |
750 | | |
751 | | char *hts_format_description(const htsFormat *format) |
752 | 0 | { |
753 | 0 | kstring_t str = { 0, 0, NULL }; |
754 | |
|
755 | 0 | switch (format->format) { |
756 | 0 | case sam: kputs("SAM", &str); break; |
757 | 0 | case bam: kputs("BAM", &str); break; |
758 | 0 | case cram: kputs("CRAM", &str); break; |
759 | 0 | case fasta_format: kputs("FASTA", &str); break; |
760 | 0 | case fastq_format: kputs("FASTQ", &str); break; |
761 | 0 | case vcf: kputs("VCF", &str); break; |
762 | 0 | case bcf: |
763 | 0 | if (format->version.major == 1) kputs("Legacy BCF", &str); |
764 | 0 | else kputs("BCF", &str); |
765 | 0 | break; |
766 | 0 | case bai: kputs("BAI", &str); break; |
767 | 0 | case crai: kputs("CRAI", &str); break; |
768 | 0 | case csi: kputs("CSI", &str); break; |
769 | 0 | case fai_format: kputs("FASTA-IDX", &str); break; |
770 | 0 | case fqi_format: kputs("FASTQ-IDX", &str); break; |
771 | 0 | case gzi: kputs("GZI", &str); break; |
772 | 0 | case tbi: kputs("Tabix", &str); break; |
773 | 0 | case bed: kputs("BED", &str); break; |
774 | 0 | case d4_format: kputs("D4", &str); break; |
775 | 0 | case htsget: kputs("htsget", &str); break; |
776 | 0 | case hts_crypt4gh_format: kputs("crypt4gh", &str); break; |
777 | 0 | case empty_format: kputs("empty", &str); break; |
778 | 0 | default: kputs("unknown", &str); break; |
779 | 0 | } |
780 | | |
781 | 0 | if (format->version.major >= 0) { |
782 | 0 | kputs(" version ", &str); |
783 | 0 | kputw(format->version.major, &str); |
784 | 0 | if (format->version.minor >= 0) { |
785 | 0 | kputc('.', &str); |
786 | 0 | kputw(format->version.minor, &str); |
787 | 0 | } |
788 | 0 | } |
789 | |
|
790 | 0 | switch (format->compression) { |
791 | 0 | case bzip2_compression: kputs(" bzip2-compressed", &str); break; |
792 | 0 | case razf_compression: kputs(" legacy-RAZF-compressed", &str); break; |
793 | 0 | case xz_compression: kputs(" XZ-compressed", &str); break; |
794 | 0 | case zstd_compression: kputs(" Zstandard-compressed", &str); break; |
795 | 0 | case custom: kputs(" compressed", &str); break; |
796 | 0 | case gzip: kputs(" gzip-compressed", &str); break; |
797 | | |
798 | 0 | case bgzf: |
799 | 0 | switch (format->format) { |
800 | 0 | case bam: |
801 | 0 | case bcf: |
802 | 0 | case csi: |
803 | 0 | case tbi: |
804 | | // These are by definition BGZF, so just use the generic term |
805 | 0 | kputs(" compressed", &str); |
806 | 0 | break; |
807 | 0 | default: |
808 | 0 | kputs(" BGZF-compressed", &str); |
809 | 0 | break; |
810 | 0 | } |
811 | 0 | break; |
812 | | |
813 | 0 | case no_compression: |
814 | 0 | switch (format->format) { |
815 | 0 | case bam: |
816 | 0 | case bcf: |
817 | 0 | case cram: |
818 | 0 | case csi: |
819 | 0 | case tbi: |
820 | | // These are normally compressed, so emphasise that this one isn't |
821 | 0 | kputs(" uncompressed", &str); |
822 | 0 | break; |
823 | 0 | default: |
824 | 0 | break; |
825 | 0 | } |
826 | 0 | break; |
827 | | |
828 | 0 | default: break; |
829 | 0 | } |
830 | | |
831 | 0 | switch (format->category) { |
832 | 0 | case sequence_data: kputs(" sequence", &str); break; |
833 | 0 | case variant_data: kputs(" variant calling", &str); break; |
834 | 0 | case index_file: kputs(" index", &str); break; |
835 | 0 | case region_list: kputs(" genomic region", &str); break; |
836 | 0 | default: break; |
837 | 0 | } |
838 | | |
839 | 0 | if (format->compression == no_compression) |
840 | 0 | switch (format->format) { |
841 | 0 | case text_format: |
842 | 0 | case sam: |
843 | 0 | case crai: |
844 | 0 | case vcf: |
845 | 0 | case bed: |
846 | 0 | case fai_format: |
847 | 0 | case fqi_format: |
848 | 0 | case fasta_format: |
849 | 0 | case fastq_format: |
850 | 0 | case htsget: |
851 | 0 | kputs(" text", &str); |
852 | 0 | break; |
853 | | |
854 | 0 | case empty_format: |
855 | 0 | break; |
856 | | |
857 | 0 | default: |
858 | 0 | kputs(" data", &str); |
859 | 0 | break; |
860 | 0 | } |
861 | 0 | else |
862 | 0 | kputs(" data", &str); |
863 | | |
864 | 0 | return ks_release(&str); |
865 | 0 | } |
866 | | |
867 | | htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt) |
868 | 13.6k | { |
869 | 13.6k | char smode[101], *cp, *cp2, *mode_c, *uncomp = NULL; |
870 | 13.6k | htsFile *fp = NULL; |
871 | 13.6k | hFILE *hfile = NULL; |
872 | 13.6k | char fmt_code = '\0'; |
873 | | // see enum htsExactFormat in htslib/hts.h |
874 | 13.6k | const char format_to_mode[] = "\0g\0\0b\0c\0\0b\0g\0\0\0\0\0Ff\0\0"; |
875 | | |
876 | 13.6k | strncpy(smode, mode, 99); |
877 | 13.6k | smode[99]=0; |
878 | 13.6k | if ((cp = strchr(smode, ','))) |
879 | 0 | *cp = '\0'; |
880 | | |
881 | | // Migrate format code (b or c) to the end of the smode buffer. |
882 | 27.3k | for (cp2 = cp = smode; *cp; cp++) { |
883 | 13.6k | if (*cp == 'b') |
884 | 0 | fmt_code = 'b'; |
885 | 13.6k | else if (*cp == 'c') |
886 | 0 | fmt_code = 'c'; |
887 | 13.6k | else { |
888 | 13.6k | *cp2++ = *cp; |
889 | | // Cache the uncompress flag 'u' pos if present |
890 | 13.6k | if (!uncomp && (*cp == 'u')) { |
891 | 0 | uncomp = cp2 - 1; |
892 | 0 | } |
893 | 13.6k | } |
894 | 13.6k | } |
895 | 13.6k | mode_c = cp2; |
896 | 13.6k | *cp2++ = fmt_code; |
897 | 13.6k | *cp2++ = 0; |
898 | | |
899 | | // Set or reset the format code if opts->format is used |
900 | 13.6k | if (fmt && fmt->format > unknown_format |
901 | 13.6k | && fmt->format < sizeof(format_to_mode)) { |
902 | 0 | *mode_c = format_to_mode[fmt->format]; |
903 | 0 | } |
904 | | |
905 | | // Uncompressed bam/bcf is not supported, change 'u' to '0' on write |
906 | 13.6k | if (uncomp && *mode_c == 'b' && (strchr(smode, 'w') || strchr(smode, 'a'))) { |
907 | 0 | *uncomp = '0'; |
908 | 0 | } |
909 | | |
910 | | // If we really asked for a compressed text format then mode_c above will |
911 | | // point to nul. We set to 'z' to enable bgzf. |
912 | 13.6k | if (strchr(mode, 'w') && fmt && fmt->compression == bgzf) { |
913 | 0 | if (fmt->format == sam || fmt->format == vcf || fmt->format == text_format) |
914 | 0 | *mode_c = 'z'; |
915 | 0 | } |
916 | | |
917 | 13.6k | char *rmme = NULL, *fnidx = strstr(fn, HTS_IDX_DELIM); |
918 | 13.6k | if ( fnidx ) { |
919 | 0 | rmme = strdup(fn); |
920 | 0 | if ( !rmme ) goto error; |
921 | 0 | rmme[fnidx-fn] = 0; |
922 | 0 | fn = rmme; |
923 | 0 | } |
924 | | |
925 | 13.6k | hfile = hopen(fn, smode); |
926 | 13.6k | if (hfile == NULL) goto error; |
927 | | |
928 | 13.6k | fp = hts_hopen(hfile, fn, smode); |
929 | 13.6k | if (fp == NULL) goto error; |
930 | | |
931 | | // Compensate for the loss of exactness in htsExactFormat. |
932 | | // hts_hopen returns generics such as binary or text, but we |
933 | | // have been given something explicit here so use that instead. |
934 | 13.6k | if (fp->is_write && fmt && |
935 | 13.6k | (fmt->format == bam || fmt->format == sam || |
936 | 0 | fmt->format == vcf || fmt->format == bcf || |
937 | 0 | fmt->format == bed || fmt->format == fasta_format || |
938 | 0 | fmt->format == fastq_format)) |
939 | 0 | fp->format.format = fmt->format; |
940 | | |
941 | 13.6k | if (fmt && fmt->specific) |
942 | 0 | if (hts_opt_apply(fp, fmt->specific) != 0) |
943 | 0 | goto error; |
944 | | |
945 | 13.6k | if ( rmme ) free(rmme); |
946 | 13.6k | return fp; |
947 | | |
948 | 0 | error: |
949 | 0 | hts_log_error("Failed to open file \"%s\"%s%s", fn, |
950 | 0 | errno ? " : " : "", errno ? strerror(errno) : ""); |
951 | 0 | if ( rmme ) free(rmme); |
952 | |
|
953 | 0 | if (hfile) |
954 | 0 | hclose_abruptly(hfile); |
955 | |
|
956 | 0 | return NULL; |
957 | 13.6k | } |
958 | | |
959 | 13.6k | htsFile *hts_open(const char *fn, const char *mode) { |
960 | 13.6k | return hts_open_format(fn, mode, NULL); |
961 | 13.6k | } |
962 | | |
963 | | /* |
964 | | * Splits str into a prefix, delimiter ('\0' or delim), and suffix, writing |
965 | | * the prefix in lowercase into buf and returning a pointer to the suffix. |
966 | | * On return, buf is always NUL-terminated; thus assumes that the "keyword" |
967 | | * prefix should be one of several known values of maximum length buflen-2. |
968 | | * (If delim is not found, returns a pointer to the '\0'.) |
969 | | */ |
970 | | static const char * |
971 | | scan_keyword(const char *str, char delim, char *buf, size_t buflen) |
972 | 0 | { |
973 | 0 | size_t i = 0; |
974 | 0 | while (*str && *str != delim) { |
975 | 0 | if (i < buflen-1) buf[i++] = tolower_c(*str); |
976 | 0 | str++; |
977 | 0 | } |
978 | |
|
979 | 0 | buf[i] = '\0'; |
980 | 0 | return *str? str+1 : str; |
981 | 0 | } |
982 | | |
983 | | /* |
984 | | * Parses arg and appends it to the option list. |
985 | | * |
986 | | * Returns 0 on success; |
987 | | * -1 on failure. |
988 | | */ |
989 | 0 | int hts_opt_add(hts_opt **opts, const char *c_arg) { |
990 | 0 | hts_opt *o, *t; |
991 | 0 | char *val; |
992 | | |
993 | | /* |
994 | | * IMPORTANT!!! |
995 | | * If you add another string option here, don't forget to also add |
996 | | * it to the case statement in hts_opt_apply. |
997 | | */ |
998 | |
|
999 | 0 | if (!c_arg) |
1000 | 0 | return -1; |
1001 | | |
1002 | 0 | if (!(o = malloc(sizeof(*o)))) |
1003 | 0 | return -1; |
1004 | | |
1005 | 0 | if (!(o->arg = strdup(c_arg))) { |
1006 | 0 | free(o); |
1007 | 0 | return -1; |
1008 | 0 | } |
1009 | | |
1010 | 0 | if (!(val = strchr(o->arg, '='))) |
1011 | 0 | val = "1"; // assume boolean |
1012 | 0 | else |
1013 | 0 | *val++ = '\0'; |
1014 | |
|
1015 | 0 | if (strcmp(o->arg, "decode_md") == 0 || |
1016 | 0 | strcmp(o->arg, "DECODE_MD") == 0) |
1017 | 0 | o->opt = CRAM_OPT_DECODE_MD, o->val.i = atoi(val); |
1018 | | |
1019 | 0 | else if (strcmp(o->arg, "verbosity") == 0 || |
1020 | 0 | strcmp(o->arg, "VERBOSITY") == 0) |
1021 | 0 | o->opt = CRAM_OPT_VERBOSITY, o->val.i = atoi(val); |
1022 | | |
1023 | 0 | else if (strcmp(o->arg, "seqs_per_slice") == 0 || |
1024 | 0 | strcmp(o->arg, "SEQS_PER_SLICE") == 0) |
1025 | 0 | o->opt = CRAM_OPT_SEQS_PER_SLICE, o->val.i = atoi(val); |
1026 | | |
1027 | 0 | else if (strcmp(o->arg, "bases_per_slice") == 0 || |
1028 | 0 | strcmp(o->arg, "BASES_PER_SLICE") == 0) |
1029 | 0 | o->opt = CRAM_OPT_BASES_PER_SLICE, o->val.i = atoi(val); |
1030 | | |
1031 | 0 | else if (strcmp(o->arg, "slices_per_container") == 0 || |
1032 | 0 | strcmp(o->arg, "SLICES_PER_CONTAINER") == 0) |
1033 | 0 | o->opt = CRAM_OPT_SLICES_PER_CONTAINER, o->val.i = atoi(val); |
1034 | | |
1035 | 0 | else if (strcmp(o->arg, "embed_ref") == 0 || |
1036 | 0 | strcmp(o->arg, "EMBED_REF") == 0) |
1037 | 0 | o->opt = CRAM_OPT_EMBED_REF, o->val.i = atoi(val); |
1038 | | |
1039 | 0 | else if (strcmp(o->arg, "no_ref") == 0 || |
1040 | 0 | strcmp(o->arg, "NO_REF") == 0) |
1041 | 0 | o->opt = CRAM_OPT_NO_REF, o->val.i = atoi(val); |
1042 | | |
1043 | 0 | else if (strcmp(o->arg, "pos_delta") == 0 || |
1044 | 0 | strcmp(o->arg, "POS_DELTA") == 0) |
1045 | 0 | o->opt = CRAM_OPT_POS_DELTA, o->val.i = atoi(val); |
1046 | | |
1047 | 0 | else if (strcmp(o->arg, "ignore_md5") == 0 || |
1048 | 0 | strcmp(o->arg, "IGNORE_MD5") == 0) |
1049 | 0 | o->opt = CRAM_OPT_IGNORE_MD5, o->val.i = atoi(val); |
1050 | | |
1051 | 0 | else if (strcmp(o->arg, "use_bzip2") == 0 || |
1052 | 0 | strcmp(o->arg, "USE_BZIP2") == 0) |
1053 | 0 | o->opt = CRAM_OPT_USE_BZIP2, o->val.i = atoi(val); |
1054 | | |
1055 | 0 | else if (strcmp(o->arg, "use_rans") == 0 || |
1056 | 0 | strcmp(o->arg, "USE_RANS") == 0) |
1057 | 0 | o->opt = CRAM_OPT_USE_RANS, o->val.i = atoi(val); |
1058 | | |
1059 | 0 | else if (strcmp(o->arg, "use_lzma") == 0 || |
1060 | 0 | strcmp(o->arg, "USE_LZMA") == 0) |
1061 | 0 | o->opt = CRAM_OPT_USE_LZMA, o->val.i = atoi(val); |
1062 | | |
1063 | 0 | else if (strcmp(o->arg, "use_tok") == 0 || |
1064 | 0 | strcmp(o->arg, "USE_TOK") == 0) |
1065 | 0 | o->opt = CRAM_OPT_USE_TOK, o->val.i = atoi(val); |
1066 | | |
1067 | 0 | else if (strcmp(o->arg, "use_fqz") == 0 || |
1068 | 0 | strcmp(o->arg, "USE_FQZ") == 0) |
1069 | 0 | o->opt = CRAM_OPT_USE_FQZ, o->val.i = atoi(val); |
1070 | | |
1071 | 0 | else if (strcmp(o->arg, "use_arith") == 0 || |
1072 | 0 | strcmp(o->arg, "USE_ARITH") == 0) |
1073 | 0 | o->opt = CRAM_OPT_USE_ARITH, o->val.i = atoi(val); |
1074 | | |
1075 | 0 | else if (strcmp(o->arg, "fast") == 0 || |
1076 | 0 | strcmp(o->arg, "FAST") == 0) |
1077 | 0 | o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_FAST; |
1078 | | |
1079 | 0 | else if (strcmp(o->arg, "normal") == 0 || |
1080 | 0 | strcmp(o->arg, "NORMAL") == 0) |
1081 | 0 | o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_NORMAL; |
1082 | | |
1083 | 0 | else if (strcmp(o->arg, "small") == 0 || |
1084 | 0 | strcmp(o->arg, "SMALL") == 0) |
1085 | 0 | o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_SMALL; |
1086 | | |
1087 | 0 | else if (strcmp(o->arg, "archive") == 0 || |
1088 | 0 | strcmp(o->arg, "ARCHIVE") == 0) |
1089 | 0 | o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_ARCHIVE; |
1090 | | |
1091 | 0 | else if (strcmp(o->arg, "reference") == 0 || |
1092 | 0 | strcmp(o->arg, "REFERENCE") == 0) |
1093 | 0 | o->opt = CRAM_OPT_REFERENCE, o->val.s = val; |
1094 | | |
1095 | 0 | else if (strcmp(o->arg, "version") == 0 || |
1096 | 0 | strcmp(o->arg, "VERSION") == 0) |
1097 | 0 | o->opt = CRAM_OPT_VERSION, o->val.s =val; |
1098 | | |
1099 | 0 | else if (strcmp(o->arg, "multi_seq_per_slice") == 0 || |
1100 | 0 | strcmp(o->arg, "MULTI_SEQ_PER_SLICE") == 0) |
1101 | 0 | o->opt = CRAM_OPT_MULTI_SEQ_PER_SLICE, o->val.i = atoi(val); |
1102 | | |
1103 | 0 | else if (strcmp(o->arg, "nthreads") == 0 || |
1104 | 0 | strcmp(o->arg, "NTHREADS") == 0) |
1105 | 0 | o->opt = HTS_OPT_NTHREADS, o->val.i = atoi(val); |
1106 | | |
1107 | 0 | else if (strcmp(o->arg, "cache_size") == 0 || |
1108 | 0 | strcmp(o->arg, "CACHE_SIZE") == 0) { |
1109 | 0 | char *endp; |
1110 | 0 | o->opt = HTS_OPT_CACHE_SIZE; |
1111 | 0 | o->val.i = strtol(val, &endp, 0); |
1112 | | // NB: Doesn't support floats, eg 1.5g |
1113 | | // TODO: extend hts_parse_decimal? See also samtools sort. |
1114 | 0 | switch (*endp) { |
1115 | 0 | case 'g': case 'G': o->val.i *= 1024; // fall through |
1116 | 0 | case 'm': case 'M': o->val.i *= 1024; // fall through |
1117 | 0 | case 'k': case 'K': o->val.i *= 1024; break; |
1118 | 0 | case '\0': break; |
1119 | 0 | default: |
1120 | 0 | hts_log_error("Unrecognised cache size suffix '%c'", *endp); |
1121 | 0 | free(o->arg); |
1122 | 0 | free(o); |
1123 | 0 | return -1; |
1124 | 0 | } |
1125 | 0 | } |
1126 | | |
1127 | 0 | else if (strcmp(o->arg, "required_fields") == 0 || |
1128 | 0 | strcmp(o->arg, "REQUIRED_FIELDS") == 0) |
1129 | 0 | o->opt = CRAM_OPT_REQUIRED_FIELDS, o->val.i = strtol(val, NULL, 0); |
1130 | | |
1131 | 0 | else if (strcmp(o->arg, "lossy_names") == 0 || |
1132 | 0 | strcmp(o->arg, "LOSSY_NAMES") == 0) |
1133 | 0 | o->opt = CRAM_OPT_LOSSY_NAMES, o->val.i = strtol(val, NULL, 0); |
1134 | | |
1135 | 0 | else if (strcmp(o->arg, "name_prefix") == 0 || |
1136 | 0 | strcmp(o->arg, "NAME_PREFIX") == 0) |
1137 | 0 | o->opt = CRAM_OPT_PREFIX, o->val.s = val; |
1138 | | |
1139 | 0 | else if (strcmp(o->arg, "store_md") == 0 || |
1140 | 0 | strcmp(o->arg, "store_md") == 0) |
1141 | 0 | o->opt = CRAM_OPT_STORE_MD, o->val.i = atoi(val); |
1142 | | |
1143 | 0 | else if (strcmp(o->arg, "store_nm") == 0 || |
1144 | 0 | strcmp(o->arg, "store_nm") == 0) |
1145 | 0 | o->opt = CRAM_OPT_STORE_NM, o->val.i = atoi(val); |
1146 | | |
1147 | 0 | else if (strcmp(o->arg, "block_size") == 0 || |
1148 | 0 | strcmp(o->arg, "BLOCK_SIZE") == 0) |
1149 | 0 | o->opt = HTS_OPT_BLOCK_SIZE, o->val.i = strtol(val, NULL, 0); |
1150 | | |
1151 | 0 | else if (strcmp(o->arg, "level") == 0 || |
1152 | 0 | strcmp(o->arg, "LEVEL") == 0) |
1153 | 0 | o->opt = HTS_OPT_COMPRESSION_LEVEL, o->val.i = strtol(val, NULL, 0); |
1154 | | |
1155 | 0 | else if (strcmp(o->arg, "filter") == 0 || |
1156 | 0 | strcmp(o->arg, "FILTER") == 0) |
1157 | 0 | o->opt = HTS_OPT_FILTER, o->val.s = val; |
1158 | | |
1159 | 0 | else if (strcmp(o->arg, "fastq_aux") == 0 || |
1160 | 0 | strcmp(o->arg, "FASTQ_AUX") == 0) |
1161 | 0 | o->opt = FASTQ_OPT_AUX, o->val.s = val; |
1162 | | |
1163 | 0 | else if (strcmp(o->arg, "fastq_barcode") == 0 || |
1164 | 0 | strcmp(o->arg, "FASTQ_BARCODE") == 0) |
1165 | 0 | o->opt = FASTQ_OPT_BARCODE, o->val.s = val; |
1166 | | |
1167 | 0 | else if (strcmp(o->arg, "fastq_rnum") == 0 || |
1168 | 0 | strcmp(o->arg, "FASTQ_RNUM") == 0) |
1169 | 0 | o->opt = FASTQ_OPT_RNUM, o->val.i = 1; |
1170 | | |
1171 | 0 | else if (strcmp(o->arg, "fastq_casava") == 0 || |
1172 | 0 | strcmp(o->arg, "FASTQ_CASAVA") == 0) |
1173 | 0 | o->opt = FASTQ_OPT_CASAVA, o->val.i = 1; |
1174 | | |
1175 | 0 | else if (strcmp(o->arg, "fastq_name2") == 0 || |
1176 | 0 | strcmp(o->arg, "FASTQ_NAME2") == 0) |
1177 | 0 | o->opt = FASTQ_OPT_NAME2, o->val.i = 1; |
1178 | | |
1179 | 0 | else { |
1180 | 0 | hts_log_error("Unknown option '%s'", o->arg); |
1181 | 0 | free(o->arg); |
1182 | 0 | free(o); |
1183 | 0 | return -1; |
1184 | 0 | } |
1185 | | |
1186 | 0 | o->next = NULL; |
1187 | | |
1188 | | // Append; assumes small list. |
1189 | 0 | if (*opts) { |
1190 | 0 | t = *opts; |
1191 | 0 | while (t->next) |
1192 | 0 | t = t->next; |
1193 | 0 | t->next = o; |
1194 | 0 | } else { |
1195 | 0 | *opts = o; |
1196 | 0 | } |
1197 | |
|
1198 | 0 | return 0; |
1199 | 0 | } |
1200 | | |
1201 | | /* |
1202 | | * Applies an hts_opt option list to a given htsFile. |
1203 | | * |
1204 | | * Returns 0 on success |
1205 | | * -1 on failure |
1206 | | */ |
1207 | 0 | int hts_opt_apply(htsFile *fp, hts_opt *opts) { |
1208 | 0 | hts_opt *last = NULL; |
1209 | |
|
1210 | 0 | for (; opts; opts = (last=opts)->next) { |
1211 | 0 | switch (opts->opt) { |
1212 | 0 | case CRAM_OPT_REFERENCE: |
1213 | 0 | if (!(fp->fn_aux = strdup(opts->val.s))) |
1214 | 0 | return -1; |
1215 | | // fall through |
1216 | 0 | case CRAM_OPT_VERSION: |
1217 | 0 | case CRAM_OPT_PREFIX: |
1218 | 0 | case HTS_OPT_FILTER: |
1219 | 0 | case FASTQ_OPT_AUX: |
1220 | 0 | case FASTQ_OPT_BARCODE: |
1221 | 0 | if (hts_set_opt(fp, opts->opt, opts->val.s) != 0) |
1222 | 0 | return -1; |
1223 | 0 | break; |
1224 | 0 | default: |
1225 | 0 | if (hts_set_opt(fp, opts->opt, opts->val.i) != 0) |
1226 | 0 | return -1; |
1227 | 0 | break; |
1228 | 0 | } |
1229 | 0 | } |
1230 | | |
1231 | 0 | return 0; |
1232 | 0 | } |
1233 | | |
1234 | | /* |
1235 | | * Frees an hts_opt list. |
1236 | | */ |
1237 | 0 | void hts_opt_free(hts_opt *opts) { |
1238 | 0 | hts_opt *last = NULL; |
1239 | 0 | while (opts) { |
1240 | 0 | opts = (last=opts)->next; |
1241 | 0 | free(last->arg); |
1242 | 0 | free(last); |
1243 | 0 | } |
1244 | 0 | } |
1245 | | |
1246 | | |
1247 | | /* |
1248 | | * Tokenise options as (key(=value)?,)*(key(=value)?)? |
1249 | | * NB: No provision for ',' appearing in the value! |
1250 | | * Add backslashing rules? |
1251 | | * |
1252 | | * This could be used as part of a general command line option parser or |
1253 | | * as a string concatenated onto the file open mode. |
1254 | | * |
1255 | | * Returns 0 on success |
1256 | | * -1 on failure. |
1257 | | */ |
1258 | 0 | int hts_parse_opt_list(htsFormat *fmt, const char *str) { |
1259 | 0 | while (str && *str) { |
1260 | 0 | const char *str_start; |
1261 | 0 | int len; |
1262 | 0 | char arg[8001]; |
1263 | |
|
1264 | 0 | while (*str && *str == ',') |
1265 | 0 | str++; |
1266 | |
|
1267 | 0 | for (str_start = str; *str && *str != ','; str++); |
1268 | 0 | len = str - str_start; |
1269 | | |
1270 | | // Produce a nul terminated copy of the option |
1271 | 0 | strncpy(arg, str_start, len < 8000 ? len : 8000); |
1272 | 0 | arg[len < 8000 ? len : 8000] = '\0'; |
1273 | |
|
1274 | 0 | if (hts_opt_add((hts_opt **)&fmt->specific, arg) != 0) |
1275 | 0 | return -1; |
1276 | | |
1277 | 0 | if (*str) |
1278 | 0 | str++; |
1279 | 0 | } |
1280 | | |
1281 | 0 | return 0; |
1282 | 0 | } |
1283 | | |
1284 | | /* |
1285 | | * Accepts a string file format (sam, bam, cram, vcf, bam) optionally |
1286 | | * followed by a comma separated list of key=value options and splits |
1287 | | * these up into the fields of htsFormat struct. |
1288 | | * |
1289 | | * format is assumed to be already initialised, either to blank |
1290 | | * "unknown" values or via previous hts_opt_add calls. |
1291 | | * |
1292 | | * Returns 0 on success |
1293 | | * -1 on failure. |
1294 | | */ |
1295 | 0 | int hts_parse_format(htsFormat *format, const char *str) { |
1296 | 0 | char fmt[8]; |
1297 | 0 | const char *cp = scan_keyword(str, ',', fmt, sizeof fmt); |
1298 | |
|
1299 | 0 | format->version.minor = 0; // unknown |
1300 | 0 | format->version.major = 0; // unknown |
1301 | |
|
1302 | 0 | if (strcmp(fmt, "sam") == 0) { |
1303 | 0 | format->category = sequence_data; |
1304 | 0 | format->format = sam; |
1305 | 0 | format->compression = no_compression; |
1306 | 0 | format->compression_level = 0; |
1307 | 0 | } else if (strcmp(fmt, "sam.gz") == 0) { |
1308 | 0 | format->category = sequence_data; |
1309 | 0 | format->format = sam; |
1310 | 0 | format->compression = bgzf; |
1311 | 0 | format->compression_level = -1; |
1312 | 0 | } else if (strcmp(fmt, "bam") == 0) { |
1313 | 0 | format->category = sequence_data; |
1314 | 0 | format->format = bam; |
1315 | 0 | format->compression = bgzf; |
1316 | 0 | format->compression_level = -1; |
1317 | 0 | } else if (strcmp(fmt, "cram") == 0) { |
1318 | 0 | format->category = sequence_data; |
1319 | 0 | format->format = cram; |
1320 | 0 | format->compression = custom; |
1321 | 0 | format->compression_level = -1; |
1322 | 0 | } else if (strcmp(fmt, "vcf") == 0) { |
1323 | 0 | format->category = variant_data; |
1324 | 0 | format->format = vcf; |
1325 | 0 | format->compression = no_compression; |
1326 | 0 | format->compression_level = 0; |
1327 | 0 | } else if (strcmp(fmt, "bcf") == 0) { |
1328 | 0 | format->category = variant_data; |
1329 | 0 | format->format = bcf; |
1330 | 0 | format->compression = bgzf; |
1331 | 0 | format->compression_level = -1; |
1332 | 0 | } else if (strcmp(fmt, "fastq") == 0 || strcmp(fmt, "fq") == 0) { |
1333 | 0 | format->category = sequence_data; |
1334 | 0 | format->format = fastq_format; |
1335 | 0 | format->compression = no_compression; |
1336 | 0 | format->compression_level = 0; |
1337 | 0 | } else if (strcmp(fmt, "fastq.gz") == 0 || strcmp(fmt, "fq.gz") == 0) { |
1338 | 0 | format->category = sequence_data; |
1339 | 0 | format->format = fastq_format; |
1340 | 0 | format->compression = bgzf; |
1341 | 0 | format->compression_level = 0; |
1342 | 0 | } else if (strcmp(fmt, "fasta") == 0 || strcmp(fmt, "fa") == 0) { |
1343 | 0 | format->category = sequence_data; |
1344 | 0 | format->format = fasta_format; |
1345 | 0 | format->compression = no_compression; |
1346 | 0 | format->compression_level = 0; |
1347 | 0 | } else if (strcmp(fmt, "fasta.gz") == 0 || strcmp(fmt, "fa.gz") == 0) { |
1348 | 0 | format->category = sequence_data; |
1349 | 0 | format->format = fasta_format; |
1350 | 0 | format->compression = bgzf; |
1351 | 0 | format->compression_level = 0; |
1352 | 0 | } else { |
1353 | 0 | return -1; |
1354 | 0 | } |
1355 | | |
1356 | 0 | return hts_parse_opt_list(format, cp); |
1357 | 0 | } |
1358 | | |
1359 | | |
1360 | | /* |
1361 | | * Tokenise options as (key(=value)?,)*(key(=value)?)? |
1362 | | * NB: No provision for ',' appearing in the value! |
1363 | | * Add backslashing rules? |
1364 | | * |
1365 | | * This could be used as part of a general command line option parser or |
1366 | | * as a string concatenated onto the file open mode. |
1367 | | * |
1368 | | * Returns 0 on success |
1369 | | * -1 on failure. |
1370 | | */ |
1371 | 0 | static int hts_process_opts(htsFile *fp, const char *opts) { |
1372 | 0 | htsFormat fmt; |
1373 | |
|
1374 | 0 | fmt.specific = NULL; |
1375 | 0 | if (hts_parse_opt_list(&fmt, opts) != 0) |
1376 | 0 | return -1; |
1377 | | |
1378 | 0 | if (hts_opt_apply(fp, fmt.specific) != 0) { |
1379 | 0 | hts_opt_free(fmt.specific); |
1380 | 0 | return -1; |
1381 | 0 | } |
1382 | | |
1383 | 0 | hts_opt_free(fmt.specific); |
1384 | |
|
1385 | 0 | return 0; |
1386 | 0 | } |
1387 | | |
1388 | | static int hts_crypt4gh_redirect(const char *fn, const char *mode, |
1389 | 0 | hFILE **hfile_ptr, htsFile *fp) { |
1390 | 0 | hFILE *hfile1 = *hfile_ptr; |
1391 | 0 | hFILE *hfile2 = NULL; |
1392 | 0 | char fn_buf[512], *fn2 = fn_buf; |
1393 | 0 | char mode2[102]; // Size set by sizeof(simple_mode) in hts_hopen() |
1394 | 0 | const char *prefix = "crypt4gh:"; |
1395 | 0 | size_t fn2_len = strlen(prefix) + strlen(fn) + 1; |
1396 | 0 | int ret = -1; |
1397 | |
|
1398 | 0 | if (fn2_len > sizeof(fn_buf)) { |
1399 | 0 | if (fn2_len >= INT_MAX) // Silence gcc format-truncation warning |
1400 | 0 | return -1; |
1401 | 0 | fn2 = malloc(fn2_len); |
1402 | 0 | if (!fn2) return -1; |
1403 | 0 | } |
1404 | | |
1405 | | // Reopen fn using the crypt4gh plug-in (if available) |
1406 | 0 | snprintf(fn2, fn2_len, "%s%s", prefix, fn); |
1407 | 0 | snprintf(mode2, sizeof(mode2), "%s%s", mode, strchr(mode, ':') ? "" : ":"); |
1408 | 0 | hfile2 = hopen(fn2, mode2, "parent", hfile1, NULL); |
1409 | 0 | if (hfile2) { |
1410 | | // Replace original hfile with the new one. The original is now |
1411 | | // enclosed within hfile2 |
1412 | 0 | *hfile_ptr = hfile2; |
1413 | 0 | ret = 0; |
1414 | 0 | } |
1415 | |
|
1416 | 0 | if (fn2 != fn_buf) |
1417 | 0 | free(fn2); |
1418 | 0 | return ret; |
1419 | 0 | } |
1420 | | |
1421 | | htsFile *hts_hopen(hFILE *hfile, const char *fn, const char *mode) |
1422 | 28.5k | { |
1423 | 28.5k | hFILE *hfile_orig = hfile; |
1424 | 28.5k | hFILE *hfile_cleanup = hfile; |
1425 | 28.5k | htsFile *fp = (htsFile*)calloc(1, sizeof(htsFile)); |
1426 | 28.5k | char simple_mode[101], *cp, *opts; |
1427 | 28.5k | simple_mode[100] = '\0'; |
1428 | | |
1429 | 28.5k | if (fp == NULL) goto error; |
1430 | | |
1431 | 28.5k | fp->fn = strdup(fn); |
1432 | 28.5k | fp->is_be = ed_is_big(); |
1433 | | |
1434 | | // Split mode into simple_mode,opts strings |
1435 | 28.5k | if ((cp = strchr(mode, ','))) { |
1436 | 0 | strncpy(simple_mode, mode, cp-mode <= 100 ? cp-mode : 100); |
1437 | 0 | simple_mode[cp-mode] = '\0'; |
1438 | 0 | opts = cp+1; |
1439 | 28.5k | } else { |
1440 | 28.5k | strncpy(simple_mode, mode, 100); |
1441 | 28.5k | opts = NULL; |
1442 | 28.5k | } |
1443 | | |
1444 | 28.5k | if (strchr(simple_mode, 'r')) { |
1445 | 14.8k | const int max_loops = 5; // Should be plenty |
1446 | 14.8k | int loops = 0; |
1447 | 14.8k | if (hts_detect_format2(hfile, fn, &fp->format) < 0) goto error; |
1448 | | |
1449 | | // Deal with formats that re-direct an underlying file via a plug-in. |
1450 | | // Loops as we may have crypt4gh served via htsget, or |
1451 | | // crypt4gh-in-crypt4gh. |
1452 | | |
1453 | 14.8k | while (fp->format.format == htsget || |
1454 | 14.8k | fp->format.format == hts_crypt4gh_format) { |
1455 | | // Ensure we don't get stuck in an endless redirect loop |
1456 | 0 | if (++loops > max_loops) { |
1457 | 0 | errno = ELOOP; |
1458 | 0 | goto error; |
1459 | 0 | } |
1460 | | |
1461 | 0 | if (fp->format.format == htsget) { |
1462 | 0 | hFILE *hfile2 = hopen_htsget_redirect(hfile, simple_mode); |
1463 | 0 | if (hfile2 == NULL) goto error; |
1464 | | |
1465 | 0 | if (hfile != hfile_cleanup) { |
1466 | | // Close the result of an earlier redirection |
1467 | 0 | hclose_abruptly(hfile); |
1468 | 0 | } |
1469 | |
|
1470 | 0 | hfile = hfile2; |
1471 | 0 | } |
1472 | 0 | else if (fp->format.format == hts_crypt4gh_format) { |
1473 | 0 | int should_preserve = (hfile == hfile_orig); |
1474 | 0 | int update_cleanup = (hfile == hfile_cleanup); |
1475 | 0 | if (hts_crypt4gh_redirect(fn, simple_mode, &hfile, fp) < 0) |
1476 | 0 | goto error; |
1477 | 0 | if (should_preserve) { |
1478 | | // The original hFILE is now contained in a crypt4gh |
1479 | | // wrapper. Should we need to close the wrapper due |
1480 | | // to a later error, we need to prevent the wrapped |
1481 | | // handle from being closed as the caller will see |
1482 | | // this function return NULL and try to clean up itself. |
1483 | 0 | hfile_orig->preserve = 1; |
1484 | 0 | } |
1485 | 0 | if (update_cleanup) { |
1486 | | // Update handle to close at the end if redirected by htsget |
1487 | 0 | hfile_cleanup = hfile; |
1488 | 0 | } |
1489 | 0 | } |
1490 | | |
1491 | | // Re-detect format against the result of the redirection |
1492 | 0 | if (hts_detect_format2(hfile, fn, &fp->format) < 0) goto error; |
1493 | 0 | } |
1494 | 14.8k | } |
1495 | 13.6k | else if (strchr(simple_mode, 'w') || strchr(simple_mode, 'a')) { |
1496 | 13.6k | htsFormat *fmt = &fp->format; |
1497 | 13.6k | fp->is_write = 1; |
1498 | | |
1499 | 13.6k | if (strchr(simple_mode, 'b')) fmt->format = binary_format; |
1500 | 13.6k | else if (strchr(simple_mode, 'c')) fmt->format = cram; |
1501 | 13.6k | else if (strchr(simple_mode, 'f')) fmt->format = fastq_format; |
1502 | 13.6k | else if (strchr(simple_mode, 'F')) fmt->format = fasta_format; |
1503 | 13.6k | else fmt->format = text_format; |
1504 | | |
1505 | 13.6k | if (strchr(simple_mode, 'z')) fmt->compression = bgzf; |
1506 | 13.6k | else if (strchr(simple_mode, 'g')) fmt->compression = gzip; |
1507 | 13.6k | else if (strchr(simple_mode, 'u')) fmt->compression = no_compression; |
1508 | 13.6k | else { |
1509 | | // No compression mode specified, set to the default for the format |
1510 | 13.6k | switch (fmt->format) { |
1511 | 0 | case binary_format: fmt->compression = bgzf; break; |
1512 | 0 | case cram: fmt->compression = custom; break; |
1513 | 0 | case fastq_format: fmt->compression = no_compression; break; |
1514 | 0 | case fasta_format: fmt->compression = no_compression; break; |
1515 | 13.6k | case text_format: fmt->compression = no_compression; break; |
1516 | 0 | default: abort(); |
1517 | 13.6k | } |
1518 | 13.6k | } |
1519 | | |
1520 | | // Fill in category (if determinable; e.g. 'b' could be BAM or BCF) |
1521 | 13.6k | fmt->category = format_category(fmt->format); |
1522 | | |
1523 | 13.6k | fmt->version.major = fmt->version.minor = -1; |
1524 | 13.6k | fmt->compression_level = -1; |
1525 | 13.6k | fmt->specific = NULL; |
1526 | 13.6k | } |
1527 | 0 | else { errno = EINVAL; goto error; } |
1528 | | |
1529 | 28.5k | switch (fp->format.format) { |
1530 | 0 | case binary_format: |
1531 | 468 | case bam: |
1532 | 1.11k | case bcf: |
1533 | 1.11k | fp->fp.bgzf = bgzf_hopen(hfile, simple_mode); |
1534 | 1.11k | if (fp->fp.bgzf == NULL) goto error; |
1535 | 1.11k | fp->is_bin = fp->is_bgzf = 1; |
1536 | 1.11k | break; |
1537 | | |
1538 | 5.09k | case cram: |
1539 | 5.09k | fp->fp.cram = cram_dopen(hfile, fn, simple_mode); |
1540 | 5.09k | if (fp->fp.cram == NULL) goto error; |
1541 | 3.90k | if (!fp->is_write) |
1542 | 3.90k | cram_set_option(fp->fp.cram, CRAM_OPT_DECODE_MD, -1); // auto |
1543 | 3.90k | fp->is_cram = 1; |
1544 | 3.90k | break; |
1545 | | |
1546 | 10 | case empty_format: |
1547 | 13.7k | case text_format: |
1548 | 13.7k | case bed: |
1549 | 14.0k | case fasta_format: |
1550 | 14.0k | case fastq_format: |
1551 | 16.9k | case sam: |
1552 | 22.3k | case vcf: |
1553 | 22.3k | if (fp->format.compression != no_compression) { |
1554 | 1.07k | fp->fp.bgzf = bgzf_hopen(hfile, simple_mode); |
1555 | 1.07k | if (fp->fp.bgzf == NULL) goto error; |
1556 | 1.06k | fp->is_bgzf = 1; |
1557 | 1.06k | } |
1558 | 21.2k | else |
1559 | 21.2k | fp->fp.hfile = hfile; |
1560 | 22.3k | break; |
1561 | | |
1562 | 22.3k | default: |
1563 | 4 | errno = EFTYPE; |
1564 | 4 | goto error; |
1565 | 28.5k | } |
1566 | | |
1567 | 27.3k | if (opts) |
1568 | 0 | hts_process_opts(fp, opts); |
1569 | | |
1570 | | // Allow original file to close if it was preserved earlier by crypt4gh |
1571 | 27.3k | hfile_orig->preserve = 0; |
1572 | | |
1573 | | // If redirecting via htsget, close the original hFILE now (pedantically |
1574 | | // we would instead close it in hts_close(), but this a simplifying |
1575 | | // optimisation) |
1576 | 27.3k | if (hfile != hfile_cleanup) hclose_abruptly(hfile_cleanup); |
1577 | | |
1578 | 27.3k | return fp; |
1579 | | |
1580 | 1.19k | error: |
1581 | 1.19k | hts_log_error("Failed to open file %s", fn); |
1582 | | |
1583 | | // If redirecting, close the failed redirection hFILE that we have opened |
1584 | 1.19k | if (hfile != hfile_orig) hclose_abruptly(hfile); |
1585 | 1.19k | hfile_orig->preserve = 0; // Allow caller to close the original hfile |
1586 | | |
1587 | 1.19k | if (fp) { |
1588 | 1.19k | free(fp->fn); |
1589 | 1.19k | free(fp->fn_aux); |
1590 | 1.19k | free(fp); |
1591 | 1.19k | } |
1592 | 1.19k | return NULL; |
1593 | 28.5k | } |
1594 | | |
1595 | | int hts_close(htsFile *fp) |
1596 | 27.3k | { |
1597 | 27.3k | int ret = 0, save; |
1598 | | |
1599 | 27.3k | switch (fp->format.format) { |
1600 | 0 | case binary_format: |
1601 | 468 | case bam: |
1602 | 1.11k | case bcf: |
1603 | 1.11k | ret = bgzf_close(fp->fp.bgzf); |
1604 | 1.11k | break; |
1605 | | |
1606 | 3.90k | case cram: |
1607 | 3.90k | if (!fp->is_write) { |
1608 | 3.90k | switch (cram_eof(fp->fp.cram)) { |
1609 | 51 | case 2: |
1610 | 51 | hts_log_warning("EOF marker is absent. The input is probably truncated"); |
1611 | 51 | break; |
1612 | 3.85k | case 0: /* not at EOF, but may not have wanted all seqs */ |
1613 | 3.85k | default: /* case 1, expected EOF */ |
1614 | 3.85k | break; |
1615 | 3.90k | } |
1616 | 3.90k | } |
1617 | 3.90k | ret = cram_close(fp->fp.cram); |
1618 | 3.90k | break; |
1619 | | |
1620 | 1 | case empty_format: |
1621 | 1.06k | case text_format: |
1622 | 1.06k | case bed: |
1623 | 1.38k | case fasta_format: |
1624 | 1.41k | case fastq_format: |
1625 | 11.8k | case sam: |
1626 | 22.3k | case vcf: |
1627 | 22.3k | if (fp->format.format == sam) |
1628 | 10.4k | ret = sam_state_destroy(fp); |
1629 | 11.8k | else if (fp->format.format == fastq_format || |
1630 | 11.8k | fp->format.format == fasta_format) |
1631 | 346 | fastq_state_destroy(fp); |
1632 | | |
1633 | 22.3k | if (fp->format.compression != no_compression) |
1634 | 1.06k | ret |= bgzf_close(fp->fp.bgzf); |
1635 | 21.2k | else |
1636 | 21.2k | ret |= hclose(fp->fp.hfile); |
1637 | 22.3k | break; |
1638 | | |
1639 | 0 | default: |
1640 | 0 | ret = -1; |
1641 | 0 | break; |
1642 | 27.3k | } |
1643 | | |
1644 | 27.3k | save = errno; |
1645 | 27.3k | sam_hdr_destroy(fp->bam_header); |
1646 | 27.3k | hts_idx_destroy(fp->idx); |
1647 | 27.3k | hts_filter_free(fp->filter); |
1648 | 27.3k | free(fp->fn); |
1649 | 27.3k | free(fp->fn_aux); |
1650 | 27.3k | free(fp->line.s); |
1651 | 27.3k | free(fp); |
1652 | 27.3k | errno = save; |
1653 | 27.3k | return ret; |
1654 | 27.3k | } |
1655 | | |
1656 | | int hts_flush(htsFile *fp) |
1657 | 0 | { |
1658 | 0 | if (fp == NULL) return 0; |
1659 | | |
1660 | 0 | switch (fp->format.format) { |
1661 | 0 | case binary_format: |
1662 | 0 | case bam: |
1663 | 0 | case bcf: |
1664 | 0 | return bgzf_flush(fp->fp.bgzf); |
1665 | | |
1666 | 0 | case cram: |
1667 | 0 | return cram_flush(fp->fp.cram); |
1668 | | |
1669 | 0 | case empty_format: |
1670 | 0 | case text_format: |
1671 | 0 | case bed: |
1672 | 0 | case fasta_format: |
1673 | 0 | case fastq_format: |
1674 | 0 | case sam: |
1675 | 0 | case vcf: |
1676 | 0 | if (fp->format.compression != no_compression) |
1677 | 0 | return bgzf_flush(fp->fp.bgzf); |
1678 | 0 | else |
1679 | 0 | return hflush(fp->fp.hfile); |
1680 | | |
1681 | 0 | default: |
1682 | 0 | break; |
1683 | 0 | } |
1684 | | |
1685 | 0 | return 0; |
1686 | 0 | } |
1687 | | |
1688 | | const htsFormat *hts_get_format(htsFile *fp) |
1689 | 0 | { |
1690 | 0 | return fp? &fp->format : NULL; |
1691 | 0 | } |
1692 | | |
1693 | 0 | const char *hts_format_file_extension(const htsFormat *format) { |
1694 | 0 | if (!format) |
1695 | 0 | return "?"; |
1696 | | |
1697 | 0 | switch (format->format) { |
1698 | 0 | case sam: return "sam"; |
1699 | 0 | case bam: return "bam"; |
1700 | 0 | case bai: return "bai"; |
1701 | 0 | case cram: return "cram"; |
1702 | 0 | case crai: return "crai"; |
1703 | 0 | case vcf: return "vcf"; |
1704 | 0 | case bcf: return "bcf"; |
1705 | 0 | case csi: return "csi"; |
1706 | 0 | case fai_format: return "fai"; |
1707 | 0 | case fqi_format: return "fqi"; |
1708 | 0 | case gzi: return "gzi"; |
1709 | 0 | case tbi: return "tbi"; |
1710 | 0 | case bed: return "bed"; |
1711 | 0 | case d4_format: return "d4"; |
1712 | 0 | case fasta_format: return "fa"; |
1713 | 0 | case fastq_format: return "fq"; |
1714 | 0 | default: return "?"; |
1715 | 0 | } |
1716 | 0 | } |
1717 | | |
1718 | 0 | static hFILE *hts_hfile(htsFile *fp) { |
1719 | 0 | switch (fp->format.format) { |
1720 | 0 | case binary_format:// fall through |
1721 | 0 | case bcf: // fall through |
1722 | 0 | case bam: return bgzf_hfile(fp->fp.bgzf); |
1723 | 0 | case cram: return cram_hfile(fp->fp.cram); |
1724 | 0 | case text_format: return fp->fp.hfile; |
1725 | 0 | case vcf: // fall through |
1726 | 0 | case fastq_format: // fall through |
1727 | 0 | case fasta_format: // fall through |
1728 | 0 | case sam: return fp->format.compression != no_compression |
1729 | 0 | ? bgzf_hfile(fp->fp.bgzf) |
1730 | 0 | : fp->fp.hfile; |
1731 | 0 | default: return NULL; |
1732 | 0 | } |
1733 | 0 | } |
1734 | | |
1735 | 0 | int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) { |
1736 | 0 | int r; |
1737 | 0 | va_list args; |
1738 | |
|
1739 | 0 | switch (opt) { |
1740 | 0 | case HTS_OPT_NTHREADS: { |
1741 | 0 | va_start(args, opt); |
1742 | 0 | int nthreads = va_arg(args, int); |
1743 | 0 | va_end(args); |
1744 | 0 | return hts_set_threads(fp, nthreads); |
1745 | 0 | } |
1746 | | |
1747 | 0 | case HTS_OPT_BLOCK_SIZE: { |
1748 | 0 | hFILE *hf = hts_hfile(fp); |
1749 | |
|
1750 | 0 | if (hf) { |
1751 | 0 | va_start(args, opt); |
1752 | 0 | if (hfile_set_blksize(hf, va_arg(args, int)) != 0) |
1753 | 0 | hts_log_warning("Failed to change block size"); |
1754 | 0 | va_end(args); |
1755 | 0 | } |
1756 | 0 | else { |
1757 | | // To do - implement for vcf/bcf. |
1758 | 0 | hts_log_warning("Cannot change block size for this format"); |
1759 | 0 | } |
1760 | |
|
1761 | 0 | return 0; |
1762 | 0 | } |
1763 | | |
1764 | 0 | case HTS_OPT_THREAD_POOL: { |
1765 | 0 | va_start(args, opt); |
1766 | 0 | htsThreadPool *p = va_arg(args, htsThreadPool *); |
1767 | 0 | va_end(args); |
1768 | 0 | return hts_set_thread_pool(fp, p); |
1769 | 0 | } |
1770 | | |
1771 | 0 | case HTS_OPT_CACHE_SIZE: { |
1772 | 0 | va_start(args, opt); |
1773 | 0 | int cache_size = va_arg(args, int); |
1774 | 0 | va_end(args); |
1775 | 0 | hts_set_cache_size(fp, cache_size); |
1776 | 0 | return 0; |
1777 | 0 | } |
1778 | | |
1779 | 0 | case FASTQ_OPT_CASAVA: |
1780 | 0 | case FASTQ_OPT_RNUM: |
1781 | 0 | case FASTQ_OPT_NAME2: |
1782 | 0 | if (fp->format.format == fastq_format || |
1783 | 0 | fp->format.format == fasta_format) |
1784 | 0 | return fastq_state_set(fp, opt); |
1785 | 0 | return 0; |
1786 | | |
1787 | 0 | case FASTQ_OPT_AUX: |
1788 | 0 | if (fp->format.format == fastq_format || |
1789 | 0 | fp->format.format == fasta_format) { |
1790 | 0 | va_start(args, opt); |
1791 | 0 | char *list = va_arg(args, char *); |
1792 | 0 | va_end(args); |
1793 | 0 | return fastq_state_set(fp, opt, list); |
1794 | 0 | } |
1795 | 0 | return 0; |
1796 | | |
1797 | 0 | case FASTQ_OPT_BARCODE: |
1798 | 0 | if (fp->format.format == fastq_format || |
1799 | 0 | fp->format.format == fasta_format) { |
1800 | 0 | va_start(args, opt); |
1801 | 0 | char *bc = va_arg(args, char *); |
1802 | 0 | va_end(args); |
1803 | 0 | return fastq_state_set(fp, opt, bc); |
1804 | 0 | } |
1805 | 0 | return 0; |
1806 | | |
1807 | | // Options below here flow through to cram_set_voption |
1808 | 0 | case HTS_OPT_COMPRESSION_LEVEL: { |
1809 | 0 | va_start(args, opt); |
1810 | 0 | int level = va_arg(args, int); |
1811 | 0 | va_end(args); |
1812 | 0 | if (fp->is_bgzf) |
1813 | 0 | fp->fp.bgzf->compress_level = level; |
1814 | 0 | else if (fp->format.format == cram) |
1815 | 0 | return cram_set_option(fp->fp.cram, opt, level); |
1816 | 0 | return 0; |
1817 | 0 | } |
1818 | | |
1819 | 0 | case HTS_OPT_FILTER: { |
1820 | 0 | va_start(args, opt); |
1821 | 0 | char *expr = va_arg(args, char *); |
1822 | 0 | va_end(args); |
1823 | 0 | return hts_set_filter_expression(fp, expr); |
1824 | 0 | } |
1825 | | |
1826 | 0 | case HTS_OPT_PROFILE: { |
1827 | 0 | va_start(args, opt); |
1828 | 0 | enum hts_profile_option prof = va_arg(args, int); |
1829 | 0 | va_end(args); |
1830 | 0 | if (fp->is_bgzf) { |
1831 | 0 | switch (prof) { |
1832 | | #ifdef HAVE_LIBDEFLATE |
1833 | | case HTS_PROFILE_FAST: fp->fp.bgzf->compress_level = 2; break; |
1834 | | case HTS_PROFILE_NORMAL: fp->fp.bgzf->compress_level = -1; break; |
1835 | | case HTS_PROFILE_SMALL: fp->fp.bgzf->compress_level = 10; break; |
1836 | | case HTS_PROFILE_ARCHIVE: fp->fp.bgzf->compress_level = 12; break; |
1837 | | #else |
1838 | 0 | case HTS_PROFILE_FAST: fp->fp.bgzf->compress_level = 1; break; |
1839 | 0 | case HTS_PROFILE_NORMAL: fp->fp.bgzf->compress_level = -1; break; |
1840 | 0 | case HTS_PROFILE_SMALL: fp->fp.bgzf->compress_level = 8; break; |
1841 | 0 | case HTS_PROFILE_ARCHIVE: fp->fp.bgzf->compress_level = 9; break; |
1842 | 0 | #endif |
1843 | 0 | } |
1844 | 0 | } // else CRAM manages this in its own way |
1845 | 0 | break; |
1846 | 0 | } |
1847 | | |
1848 | 0 | default: |
1849 | 0 | break; |
1850 | 0 | } |
1851 | | |
1852 | 0 | if (fp->format.format != cram) |
1853 | 0 | return 0; |
1854 | | |
1855 | 0 | va_start(args, opt); |
1856 | 0 | r = cram_set_voption(fp->fp.cram, opt, args); |
1857 | 0 | va_end(args); |
1858 | |
|
1859 | 0 | return r; |
1860 | 0 | } |
1861 | | |
1862 | | BGZF *hts_get_bgzfp(htsFile *fp); |
1863 | | |
1864 | | int hts_set_threads(htsFile *fp, int n) |
1865 | 0 | { |
1866 | 0 | if (fp->format.format == sam) { |
1867 | 0 | return sam_set_threads(fp, n); |
1868 | 0 | } else if (fp->format.compression == bgzf) { |
1869 | 0 | return bgzf_mt(hts_get_bgzfp(fp), n, 256/*unused*/); |
1870 | 0 | } else if (fp->format.format == cram) { |
1871 | 0 | return hts_set_opt(fp, CRAM_OPT_NTHREADS, n); |
1872 | 0 | } |
1873 | 0 | else return 0; |
1874 | 0 | } |
1875 | | |
1876 | 0 | int hts_set_thread_pool(htsFile *fp, htsThreadPool *p) { |
1877 | 0 | if (fp->format.format == sam || fp->format.format == text_format) { |
1878 | 0 | return sam_set_thread_pool(fp, p); |
1879 | 0 | } else if (fp->format.compression == bgzf) { |
1880 | 0 | return bgzf_thread_pool(hts_get_bgzfp(fp), p->pool, p->qsize); |
1881 | 0 | } else if (fp->format.format == cram) { |
1882 | 0 | return hts_set_opt(fp, CRAM_OPT_THREAD_POOL, p); |
1883 | 0 | } |
1884 | 0 | else return 0; |
1885 | 0 | } |
1886 | | |
1887 | | void hts_set_cache_size(htsFile *fp, int n) |
1888 | 0 | { |
1889 | 0 | if (fp->format.compression == bgzf) |
1890 | 0 | bgzf_set_cache_size(hts_get_bgzfp(fp), n); |
1891 | 0 | } |
1892 | | |
1893 | | int hts_set_fai_filename(htsFile *fp, const char *fn_aux) |
1894 | 0 | { |
1895 | 0 | free(fp->fn_aux); |
1896 | 0 | if (fn_aux) { |
1897 | 0 | fp->fn_aux = strdup(fn_aux); |
1898 | 0 | if (fp->fn_aux == NULL) return -1; |
1899 | 0 | } |
1900 | 0 | else fp->fn_aux = NULL; |
1901 | | |
1902 | 0 | if (fp->format.format == cram) |
1903 | 0 | if (cram_set_option(fp->fp.cram, CRAM_OPT_REFERENCE, fp->fn_aux)) |
1904 | 0 | return -1; |
1905 | | |
1906 | 0 | return 0; |
1907 | 0 | } |
1908 | | |
1909 | | int hts_set_filter_expression(htsFile *fp, const char *expr) |
1910 | 0 | { |
1911 | 0 | if (fp->filter) |
1912 | 0 | hts_filter_free(fp->filter); |
1913 | |
|
1914 | 0 | if (!expr) |
1915 | 0 | return 0; |
1916 | | |
1917 | 0 | return (fp->filter = hts_filter_init(expr)) |
1918 | 0 | ? 0 : -1; |
1919 | 0 | } |
1920 | | |
1921 | | hFILE *hts_open_tmpfile(const char *fname, const char *mode, kstring_t *tmpname) |
1922 | 0 | { |
1923 | 0 | int pid = (int) getpid(); |
1924 | 0 | unsigned ptr = (uintptr_t) tmpname; |
1925 | 0 | int n = 0; |
1926 | 0 | hFILE *fp = NULL; |
1927 | |
|
1928 | 0 | do { |
1929 | | // Attempt to further uniquify the temporary filename |
1930 | 0 | unsigned t = ((unsigned) time(NULL)) ^ ((unsigned) clock()) ^ ptr; |
1931 | 0 | n++; |
1932 | |
|
1933 | 0 | ks_clear(tmpname); |
1934 | 0 | if (ksprintf(tmpname, "%s.tmp_%d_%d_%u", fname, pid, n, t) < 0) break; |
1935 | | |
1936 | 0 | fp = hopen(tmpname->s, mode); |
1937 | 0 | } while (fp == NULL && errno == EEXIST && n < 100); |
1938 | | |
1939 | 0 | return fp; |
1940 | 0 | } |
1941 | | |
1942 | | // For VCF/BCF backward sweeper. Not exposing these functions because their |
1943 | | // future is uncertain. Things will probably have to change with hFILE... |
1944 | | BGZF *hts_get_bgzfp(htsFile *fp) |
1945 | 0 | { |
1946 | 0 | if (fp->is_bgzf) |
1947 | 0 | return fp->fp.bgzf; |
1948 | 0 | else |
1949 | 0 | return NULL; |
1950 | 0 | } |
1951 | | int hts_useek(htsFile *fp, off_t uoffset, int where) |
1952 | 0 | { |
1953 | 0 | if (fp->is_bgzf) |
1954 | 0 | return bgzf_useek(fp->fp.bgzf, uoffset, where); |
1955 | 0 | else |
1956 | 0 | return (hseek(fp->fp.hfile, uoffset, SEEK_SET) >= 0)? 0 : -1; |
1957 | 0 | } |
1958 | | off_t hts_utell(htsFile *fp) |
1959 | 0 | { |
1960 | 0 | if (fp->is_bgzf) |
1961 | 0 | return bgzf_utell(fp->fp.bgzf); |
1962 | 0 | else |
1963 | 0 | return htell(fp->fp.hfile); |
1964 | 0 | } |
1965 | | |
1966 | | int hts_getline(htsFile *fp, int delimiter, kstring_t *str) |
1967 | 27.0M | { |
1968 | 27.0M | int ret; |
1969 | 27.0M | if (! (delimiter == KS_SEP_LINE || delimiter == '\n')) { |
1970 | 0 | hts_log_error("Unexpected delimiter %d", delimiter); |
1971 | 0 | abort(); |
1972 | 0 | } |
1973 | | |
1974 | 27.0M | switch (fp->format.compression) { |
1975 | 124k | case no_compression: |
1976 | 124k | str->l = 0; |
1977 | 124k | ret = kgetline2(str, (kgets_func2 *) hgetln, fp->fp.hfile); |
1978 | 124k | if (ret >= 0) ret = (str->l <= INT_MAX)? (int) str->l : INT_MAX; |
1979 | 4.78k | else if (herrno(fp->fp.hfile)) ret = -2, errno = herrno(fp->fp.hfile); |
1980 | 4.78k | else ret = -1; |
1981 | 124k | break; |
1982 | | |
1983 | 26.9M | case gzip: |
1984 | 26.9M | case bgzf: |
1985 | 26.9M | ret = bgzf_getline(fp->fp.bgzf, '\n', str); |
1986 | 26.9M | break; |
1987 | | |
1988 | 0 | default: |
1989 | 0 | abort(); |
1990 | 27.0M | } |
1991 | | |
1992 | 27.0M | ++fp->lineno; |
1993 | 27.0M | return ret; |
1994 | 27.0M | } |
1995 | | |
1996 | | char **hts_readlist(const char *string, int is_file, int *_n) |
1997 | 0 | { |
1998 | 0 | unsigned int m = 0, n = 0; |
1999 | 0 | char **s = 0, **s_new; |
2000 | 0 | if ( is_file ) |
2001 | 0 | { |
2002 | 0 | BGZF *fp = bgzf_open(string, "r"); |
2003 | 0 | if ( !fp ) return NULL; |
2004 | | |
2005 | 0 | kstring_t str; |
2006 | 0 | int ret; |
2007 | 0 | str.s = 0; str.l = str.m = 0; |
2008 | 0 | while ((ret = bgzf_getline(fp, '\n', &str)) >= 0) |
2009 | 0 | { |
2010 | 0 | if (str.l == 0) continue; |
2011 | 0 | if (hts_resize(char*, n + 1, &m, &s, 0) < 0) |
2012 | 0 | goto err; |
2013 | 0 | s[n] = strdup(str.s); |
2014 | 0 | if (!s[n]) |
2015 | 0 | goto err; |
2016 | 0 | n++; |
2017 | 0 | } |
2018 | 0 | if (ret < -1) // Read error |
2019 | 0 | goto err; |
2020 | 0 | bgzf_close(fp); |
2021 | 0 | free(str.s); |
2022 | 0 | } |
2023 | 0 | else |
2024 | 0 | { |
2025 | 0 | const char *q = string, *p = string; |
2026 | 0 | while ( 1 ) |
2027 | 0 | { |
2028 | 0 | if (*p == ',' || *p == 0) |
2029 | 0 | { |
2030 | 0 | if (hts_resize(char*, n + 1, &m, &s, 0) < 0) |
2031 | 0 | goto err; |
2032 | 0 | s[n] = (char*)calloc(p - q + 1, 1); |
2033 | 0 | if (!s[n]) |
2034 | 0 | goto err; |
2035 | 0 | strncpy(s[n++], q, p - q); |
2036 | 0 | q = p + 1; |
2037 | 0 | } |
2038 | 0 | if ( !*p ) break; |
2039 | 0 | p++; |
2040 | 0 | } |
2041 | 0 | } |
2042 | | // Try to shrink s to the minimum size needed |
2043 | 0 | s_new = (char**)realloc(s, n * sizeof(char*)); |
2044 | 0 | if (!s_new) |
2045 | 0 | goto err; |
2046 | | |
2047 | 0 | s = s_new; |
2048 | 0 | assert(n < INT_MAX); // hts_resize() should ensure this |
2049 | 0 | *_n = n; |
2050 | 0 | return s; |
2051 | | |
2052 | 0 | err: |
2053 | 0 | for (m = 0; m < n; m++) |
2054 | 0 | free(s[m]); |
2055 | 0 | free(s); |
2056 | 0 | return NULL; |
2057 | 0 | } |
2058 | | |
2059 | | char **hts_readlines(const char *fn, int *_n) |
2060 | 0 | { |
2061 | 0 | unsigned int m = 0, n = 0; |
2062 | 0 | char **s = 0, **s_new; |
2063 | 0 | BGZF *fp = bgzf_open(fn, "r"); |
2064 | 0 | if ( fp ) { // read from file |
2065 | 0 | kstring_t str; |
2066 | 0 | int ret; |
2067 | 0 | str.s = 0; str.l = str.m = 0; |
2068 | 0 | while ((ret = bgzf_getline(fp, '\n', &str)) >= 0) { |
2069 | 0 | if (str.l == 0) continue; |
2070 | 0 | if (hts_resize(char *, n + 1, &m, &s, 0) < 0) |
2071 | 0 | goto err; |
2072 | 0 | s[n] = strdup(str.s); |
2073 | 0 | if (!s[n]) |
2074 | 0 | goto err; |
2075 | 0 | n++; |
2076 | 0 | } |
2077 | 0 | if (ret < -1) // Read error |
2078 | 0 | goto err; |
2079 | 0 | bgzf_close(fp); |
2080 | 0 | free(str.s); |
2081 | 0 | } else if (*fn == ':') { // read from string |
2082 | 0 | const char *q, *p; |
2083 | 0 | for (q = p = fn + 1;; ++p) |
2084 | 0 | if (*p == ',' || *p == 0) { |
2085 | 0 | if (hts_resize(char *, n + 1, &m, &s, 0) < 0) |
2086 | 0 | goto err; |
2087 | 0 | s[n] = (char*)calloc(p - q + 1, 1); |
2088 | 0 | if (!s[n]) |
2089 | 0 | goto err; |
2090 | 0 | strncpy(s[n++], q, p - q); |
2091 | 0 | q = p + 1; |
2092 | 0 | if (*p == 0) break; |
2093 | 0 | } |
2094 | 0 | } else return 0; |
2095 | | // Try to shrink s to the minimum size needed |
2096 | 0 | s_new = (char**)realloc(s, n * sizeof(char*)); |
2097 | 0 | if (!s_new) |
2098 | 0 | goto err; |
2099 | | |
2100 | 0 | s = s_new; |
2101 | 0 | assert(n < INT_MAX); // hts_resize() should ensure this |
2102 | 0 | *_n = n; |
2103 | 0 | return s; |
2104 | | |
2105 | 0 | err: |
2106 | 0 | for (m = 0; m < n; m++) |
2107 | 0 | free(s[m]); |
2108 | 0 | free(s); |
2109 | 0 | return NULL; |
2110 | 0 | } |
2111 | | |
2112 | | // DEPRECATED: To be removed in a future HTSlib release |
2113 | | int hts_file_type(const char *fname) |
2114 | 0 | { |
2115 | 0 | int len = strlen(fname); |
2116 | 0 | if ( !strcasecmp(".vcf.gz",fname+len-7) ) return FT_VCF_GZ; |
2117 | 0 | if ( !strcasecmp(".vcf",fname+len-4) ) return FT_VCF; |
2118 | 0 | if ( !strcasecmp(".bcf",fname+len-4) ) return FT_BCF_GZ; |
2119 | 0 | if ( !strcmp("-",fname) ) return FT_STDIN; |
2120 | | |
2121 | 0 | hFILE *f = hopen(fname, "r"); |
2122 | 0 | if (f == NULL) return 0; |
2123 | | |
2124 | 0 | htsFormat fmt; |
2125 | 0 | if (hts_detect_format2(f, fname, &fmt) < 0) { hclose_abruptly(f); return 0; } |
2126 | 0 | if (hclose(f) < 0) return 0; |
2127 | | |
2128 | 0 | switch (fmt.format) { |
2129 | 0 | case vcf: return (fmt.compression == no_compression)? FT_VCF : FT_VCF_GZ; |
2130 | 0 | case bcf: return (fmt.compression == no_compression)? FT_BCF : FT_BCF_GZ; |
2131 | 0 | default: return 0; |
2132 | 0 | } |
2133 | 0 | } |
2134 | | |
2135 | | int hts_check_EOF(htsFile *fp) |
2136 | 0 | { |
2137 | 0 | if (fp->format.compression == bgzf) |
2138 | 0 | return bgzf_check_EOF(hts_get_bgzfp(fp)); |
2139 | 0 | else if (fp->format.format == cram) |
2140 | 0 | return cram_check_EOF(fp->fp.cram); |
2141 | 0 | else |
2142 | 0 | return 3; |
2143 | 0 | } |
2144 | | |
2145 | | |
2146 | | /**************** |
2147 | | *** Indexing *** |
2148 | | ****************/ |
2149 | | |
2150 | 0 | #define HTS_MIN_MARKER_DIST 0x10000 |
2151 | | |
2152 | | // Finds the special meta bin |
2153 | | // ((1<<(3 * n_lvls + 3)) - 1) / 7 + 1 |
2154 | 0 | #define META_BIN(idx) ((idx)->n_bins + 1) |
2155 | | |
2156 | 0 | #define pair64_lt(a,b) ((a).u < (b).u) |
2157 | 0 | #define pair64max_lt(a,b) ((a).u < (b).u || \ |
2158 | 0 | ((a).u == (b).u && (a).max < (b).max)) |
2159 | | |
2160 | 0 | KSORT_INIT_STATIC(_off, hts_pair64_t, pair64_lt) |
2161 | 0 | KSORT_INIT_STATIC(_off_max, hts_pair64_max_t, pair64max_lt) |
2162 | | |
2163 | | typedef struct { |
2164 | | int32_t m, n; |
2165 | | uint64_t loff; |
2166 | | hts_pair64_t *list; |
2167 | | } bins_t; |
2168 | | |
2169 | | KHASH_MAP_INIT_INT(bin, bins_t) |
2170 | | typedef khash_t(bin) bidx_t; |
2171 | | |
2172 | | typedef struct { |
2173 | | hts_pos_t n, m; |
2174 | | uint64_t *offset; |
2175 | | } lidx_t; |
2176 | | |
2177 | | struct hts_idx_t { |
2178 | | int fmt, min_shift, n_lvls, n_bins; |
2179 | | uint32_t l_meta; |
2180 | | int32_t n, m; |
2181 | | uint64_t n_no_coor; |
2182 | | bidx_t **bidx; |
2183 | | lidx_t *lidx; |
2184 | | uint8_t *meta; // MUST have a terminating NUL on the end |
2185 | | int tbi_n, last_tbi_tid; |
2186 | | struct { |
2187 | | uint32_t last_bin, save_bin; |
2188 | | hts_pos_t last_coor; |
2189 | | int last_tid, save_tid, finished; |
2190 | | uint64_t last_off, save_off; |
2191 | | uint64_t off_beg, off_end; |
2192 | | uint64_t n_mapped, n_unmapped; |
2193 | | } z; // keep internal states |
2194 | | }; |
2195 | | |
2196 | 0 | static char * idx_format_name(int fmt) { |
2197 | 0 | switch (fmt) { |
2198 | 0 | case HTS_FMT_CSI: return "csi"; |
2199 | 0 | case HTS_FMT_BAI: return "bai"; |
2200 | 0 | case HTS_FMT_TBI: return "tbi"; |
2201 | 0 | case HTS_FMT_CRAI: return "crai"; |
2202 | 0 | default: return "unknown"; |
2203 | 0 | } |
2204 | 0 | } |
2205 | | |
2206 | | #ifdef DEBUG_INDEX |
2207 | | static void idx_dump(const hts_idx_t *idx) { |
2208 | | int i; |
2209 | | int64_t j; |
2210 | | |
2211 | | if (!idx) fprintf(stderr, "Null index\n"); |
2212 | | |
2213 | | fprintf(stderr, "format='%s', min_shift=%d, n_lvls=%d, n_bins=%d, l_meta=%u ", |
2214 | | idx_format_name(idx->fmt), idx->min_shift, idx->n_lvls, idx->n_bins, idx->l_meta); |
2215 | | fprintf(stderr, "n=%d, m=%d, n_no_coor=%"PRIu64"\n", idx->n, idx->m, idx->n_no_coor); |
2216 | | for (i = 0; i < idx->n; i++) { |
2217 | | bidx_t *bidx = idx->bidx[i]; |
2218 | | lidx_t *lidx = &idx->lidx[i]; |
2219 | | if (bidx) { |
2220 | | fprintf(stderr, "======== BIN Index - tid=%d, n_buckets=%d, size=%d\n", i, bidx->n_buckets, bidx->size); |
2221 | | int b; |
2222 | | for (b = 0; b < META_BIN(idx); b++) { |
2223 | | khint_t k; |
2224 | | if ((k = kh_get(bin, bidx, b)) != kh_end(bidx)) { |
2225 | | bins_t *entries = &kh_value(bidx, k); |
2226 | | int l = hts_bin_level(b); |
2227 | | int64_t bin_width = 1LL << ((idx->n_lvls - l) * 3 + idx->min_shift); |
2228 | | fprintf(stderr, "\tbin=%d, level=%d, parent=%d, n_chunks=%d, loff=%"PRIu64", interval=[%"PRId64" - %"PRId64"]\n", |
2229 | | b, l, hts_bin_parent(b), entries->n, entries->loff, (b-hts_bin_first(l))*bin_width+1, (b+1-hts_bin_first(l))*bin_width); |
2230 | | for (j = 0; j < entries->n; j++) |
2231 | | fprintf(stderr, "\t\tchunk=%"PRId64", u=%"PRIu64", v=%"PRIu64"\n", j, entries->list[j].u, entries->list[j].v); |
2232 | | } |
2233 | | } |
2234 | | } |
2235 | | if (lidx) { |
2236 | | fprintf(stderr, "======== LINEAR Index - tid=%d, n_values=%"PRId64"\n", i, lidx->n); |
2237 | | for (j = 0; j < lidx->n; j++) { |
2238 | | fprintf(stderr, "\t\tentry=%"PRId64", offset=%"PRIu64", interval=[%"PRId64" - %"PRId64"]\n", |
2239 | | j, lidx->offset[j], j*(1<<idx->min_shift)+1, (j+1)*(1<<idx->min_shift)); |
2240 | | } |
2241 | | } |
2242 | | } |
2243 | | } |
2244 | | #endif |
2245 | | |
2246 | | static inline int insert_to_b(bidx_t *b, int bin, uint64_t beg, uint64_t end) |
2247 | 0 | { |
2248 | 0 | khint_t k; |
2249 | 0 | bins_t *l; |
2250 | 0 | int absent; |
2251 | 0 | k = kh_put(bin, b, bin, &absent); |
2252 | 0 | if (absent < 0) return -1; // Out of memory |
2253 | 0 | l = &kh_value(b, k); |
2254 | 0 | if (absent) { |
2255 | 0 | l->m = 1; l->n = 0; |
2256 | 0 | l->list = (hts_pair64_t*)calloc(l->m, sizeof(hts_pair64_t)); |
2257 | 0 | if (!l->list) { |
2258 | 0 | kh_del(bin, b, k); |
2259 | 0 | return -1; |
2260 | 0 | } |
2261 | 0 | } else if (l->n == l->m) { |
2262 | 0 | uint32_t new_m = l->m ? l->m << 1 : 1; |
2263 | 0 | hts_pair64_t *new_list = realloc(l->list, new_m * sizeof(hts_pair64_t)); |
2264 | 0 | if (!new_list) return -1; |
2265 | 0 | l->list = new_list; |
2266 | 0 | l->m = new_m; |
2267 | 0 | } |
2268 | 0 | l->list[l->n].u = beg; |
2269 | 0 | l->list[l->n++].v = end; |
2270 | 0 | return 0; |
2271 | 0 | } |
2272 | | |
2273 | | static inline int insert_to_l(lidx_t *l, int64_t _beg, int64_t _end, uint64_t offset, int min_shift) |
2274 | 0 | { |
2275 | 0 | int i; |
2276 | 0 | hts_pos_t beg, end; |
2277 | 0 | beg = _beg >> min_shift; |
2278 | 0 | end = (_end - 1) >> min_shift; |
2279 | 0 | if (l->m < end + 1) { |
2280 | 0 | size_t new_m = l->m * 2 > end + 1 ? l->m * 2 : end + 1; |
2281 | 0 | uint64_t *new_offset; |
2282 | |
|
2283 | 0 | new_offset = (uint64_t*)realloc(l->offset, new_m * sizeof(uint64_t)); |
2284 | 0 | if (!new_offset) return -1; |
2285 | | |
2286 | | // fill unused memory with (uint64_t)-1 |
2287 | 0 | memset(new_offset + l->m, 0xff, sizeof(uint64_t) * (new_m - l->m)); |
2288 | 0 | l->m = new_m; |
2289 | 0 | l->offset = new_offset; |
2290 | 0 | } |
2291 | 0 | for (i = beg; i <= end; ++i) { |
2292 | 0 | if (l->offset[i] == (uint64_t)-1) l->offset[i] = offset; |
2293 | 0 | } |
2294 | 0 | if (l->n < end + 1) l->n = end + 1; |
2295 | 0 | return 0; |
2296 | 0 | } |
2297 | | |
2298 | | hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls) |
2299 | 0 | { |
2300 | 0 | hts_idx_t *idx; |
2301 | 0 | idx = (hts_idx_t*)calloc(1, sizeof(hts_idx_t)); |
2302 | 0 | if (idx == NULL) return NULL; |
2303 | 0 | idx->fmt = fmt; |
2304 | 0 | idx->min_shift = min_shift; |
2305 | 0 | idx->n_lvls = n_lvls; |
2306 | 0 | idx->n_bins = ((1<<(3 * n_lvls + 3)) - 1) / 7; |
2307 | 0 | idx->z.save_tid = idx->z.last_tid = -1; |
2308 | 0 | idx->z.save_bin = idx->z.last_bin = 0xffffffffu; |
2309 | 0 | idx->z.save_off = idx->z.last_off = idx->z.off_beg = idx->z.off_end = offset0; |
2310 | 0 | idx->z.last_coor = 0xffffffffu; |
2311 | 0 | if (n) { |
2312 | 0 | idx->n = idx->m = n; |
2313 | 0 | idx->bidx = (bidx_t**)calloc(n, sizeof(bidx_t*)); |
2314 | 0 | if (idx->bidx == NULL) { free(idx); return NULL; } |
2315 | 0 | idx->lidx = (lidx_t*) calloc(n, sizeof(lidx_t)); |
2316 | 0 | if (idx->lidx == NULL) { free(idx->bidx); free(idx); return NULL; } |
2317 | 0 | } |
2318 | 0 | idx->tbi_n = -1; |
2319 | 0 | idx->last_tbi_tid = -1; |
2320 | 0 | return idx; |
2321 | 0 | } |
2322 | | |
2323 | | static void update_loff(hts_idx_t *idx, int i, int free_lidx) |
2324 | 0 | { |
2325 | 0 | bidx_t *bidx = idx->bidx[i]; |
2326 | 0 | lidx_t *lidx = &idx->lidx[i]; |
2327 | 0 | khint_t k; |
2328 | 0 | int l; |
2329 | | // the last entry is always valid |
2330 | 0 | for (l=lidx->n-2; l >= 0; l--) { |
2331 | 0 | if (lidx->offset[l] == (uint64_t)-1) |
2332 | 0 | lidx->offset[l] = lidx->offset[l+1]; |
2333 | 0 | } |
2334 | 0 | if (bidx == 0) return; |
2335 | 0 | for (k = kh_begin(bidx); k != kh_end(bidx); ++k) // set loff |
2336 | 0 | if (kh_exist(bidx, k)) |
2337 | 0 | { |
2338 | 0 | if ( kh_key(bidx, k) < idx->n_bins ) |
2339 | 0 | { |
2340 | 0 | int bot_bin = hts_bin_bot(kh_key(bidx, k), idx->n_lvls); |
2341 | | // disable linear index if bot_bin out of bounds |
2342 | 0 | kh_val(bidx, k).loff = bot_bin < lidx->n ? lidx->offset[bot_bin] : 0; |
2343 | 0 | } |
2344 | 0 | else |
2345 | 0 | kh_val(bidx, k).loff = 0; |
2346 | 0 | } |
2347 | 0 | if (free_lidx) { |
2348 | 0 | free(lidx->offset); |
2349 | 0 | lidx->m = lidx->n = 0; |
2350 | 0 | lidx->offset = 0; |
2351 | 0 | } |
2352 | 0 | } |
2353 | | |
2354 | | static int compress_binning(hts_idx_t *idx, int i) |
2355 | 0 | { |
2356 | 0 | bidx_t *bidx = idx->bidx[i]; |
2357 | 0 | khint_t k; |
2358 | 0 | int l, m; |
2359 | 0 | if (bidx == 0) return 0; |
2360 | | // merge a bin to its parent if the bin is too small |
2361 | 0 | for (l = idx->n_lvls; l > 0; --l) { |
2362 | 0 | unsigned start = hts_bin_first(l); |
2363 | 0 | for (k = kh_begin(bidx); k != kh_end(bidx); ++k) { |
2364 | 0 | bins_t *p, *q; |
2365 | 0 | if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins || kh_key(bidx, k) < start) continue; |
2366 | 0 | p = &kh_value(bidx, k); |
2367 | 0 | if (l < idx->n_lvls && p->n > 1) ks_introsort(_off, p->n, p->list); |
2368 | 0 | if ((p->list[p->n - 1].v>>16) - (p->list[0].u>>16) < HTS_MIN_MARKER_DIST) { |
2369 | 0 | khint_t kp; |
2370 | 0 | kp = kh_get(bin, bidx, hts_bin_parent(kh_key(bidx, k))); |
2371 | 0 | if (kp == kh_end(bidx)) continue; |
2372 | 0 | q = &kh_val(bidx, kp); |
2373 | 0 | if (q->n + p->n > q->m) { |
2374 | 0 | uint32_t new_m = q->n + p->n; |
2375 | 0 | hts_pair64_t *new_list; |
2376 | 0 | kroundup32(new_m); |
2377 | 0 | if (new_m > INT32_MAX) return -1; // Limited by index format |
2378 | 0 | new_list = realloc(q->list, new_m * sizeof(*new_list)); |
2379 | 0 | if (!new_list) return -1; |
2380 | 0 | q->m = new_m; |
2381 | 0 | q->list = new_list; |
2382 | 0 | } |
2383 | 0 | memcpy(q->list + q->n, p->list, p->n * sizeof(hts_pair64_t)); |
2384 | 0 | q->n += p->n; |
2385 | 0 | free(p->list); |
2386 | 0 | kh_del(bin, bidx, k); |
2387 | 0 | } |
2388 | 0 | } |
2389 | 0 | } |
2390 | 0 | k = kh_get(bin, bidx, 0); |
2391 | 0 | if (k != kh_end(bidx)) ks_introsort(_off, kh_val(bidx, k).n, kh_val(bidx, k).list); |
2392 | | // merge adjacent chunks that start from the same BGZF block |
2393 | 0 | for (k = kh_begin(bidx); k != kh_end(bidx); ++k) { |
2394 | 0 | bins_t *p; |
2395 | 0 | if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins) continue; |
2396 | 0 | p = &kh_value(bidx, k); |
2397 | 0 | for (l = 1, m = 0; l < p->n; ++l) { |
2398 | 0 | if (p->list[m].v>>16 >= p->list[l].u>>16) { |
2399 | 0 | if (p->list[m].v < p->list[l].v) p->list[m].v = p->list[l].v; |
2400 | 0 | } else p->list[++m] = p->list[l]; |
2401 | 0 | } |
2402 | 0 | p->n = m + 1; |
2403 | 0 | } |
2404 | 0 | return 0; |
2405 | 0 | } |
2406 | | |
2407 | | int hts_idx_finish(hts_idx_t *idx, uint64_t final_offset) |
2408 | 0 | { |
2409 | 0 | int i, ret = 0; |
2410 | 0 | if (idx == NULL || idx->z.finished) return 0; // do not run this function on an empty index or multiple times |
2411 | 0 | if (idx->z.save_tid >= 0) { |
2412 | 0 | ret |= insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin, idx->z.save_off, final_offset); |
2413 | 0 | ret |= insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.off_beg, final_offset); |
2414 | 0 | ret |= insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.n_mapped, idx->z.n_unmapped); |
2415 | 0 | } |
2416 | 0 | for (i = 0; i < idx->n; ++i) { |
2417 | 0 | update_loff(idx, i, (idx->fmt == HTS_FMT_CSI)); |
2418 | 0 | ret |= compress_binning(idx, i); |
2419 | 0 | } |
2420 | 0 | idx->z.finished = 1; |
2421 | |
|
2422 | 0 | return ret; |
2423 | 0 | } |
2424 | | |
2425 | | int hts_idx_check_range(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end) |
2426 | 0 | { |
2427 | 0 | int64_t maxpos = (int64_t) 1 << (idx->min_shift + idx->n_lvls * 3); |
2428 | 0 | if (tid < 0 || (beg <= maxpos && end <= maxpos)) |
2429 | 0 | return 0; |
2430 | | |
2431 | 0 | if (idx->fmt == HTS_FMT_CSI) { |
2432 | 0 | hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos" " |
2433 | 0 | "cannot be stored in a csi index with these parameters. " |
2434 | 0 | "Please use a larger min_shift or depth", |
2435 | 0 | beg, end); |
2436 | 0 | } else { |
2437 | 0 | hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos |
2438 | 0 | " cannot be stored in a %s index. Try using a csi index", |
2439 | 0 | beg, end, idx_format_name(idx->fmt)); |
2440 | 0 | } |
2441 | 0 | errno = ERANGE; |
2442 | 0 | return -1; |
2443 | 0 | } |
2444 | | |
2445 | | int hts_idx_push(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, uint64_t offset, int is_mapped) |
2446 | 0 | { |
2447 | 0 | int bin; |
2448 | 0 | if (tid<0) beg = -1, end = 0; |
2449 | 0 | if (hts_idx_check_range(idx, tid, beg, end) < 0) |
2450 | 0 | return -1; |
2451 | 0 | if (tid >= idx->m) { // enlarge the index |
2452 | 0 | uint32_t new_m = idx->m * 2 > tid + 1 ? idx->m * 2 : tid + 1; |
2453 | 0 | bidx_t **new_bidx; |
2454 | 0 | lidx_t *new_lidx; |
2455 | |
|
2456 | 0 | new_bidx = (bidx_t**)realloc(idx->bidx, new_m * sizeof(bidx_t*)); |
2457 | 0 | if (!new_bidx) return -1; |
2458 | 0 | idx->bidx = new_bidx; |
2459 | |
|
2460 | 0 | new_lidx = (lidx_t*) realloc(idx->lidx, new_m * sizeof(lidx_t)); |
2461 | 0 | if (!new_lidx) return -1; |
2462 | 0 | idx->lidx = new_lidx; |
2463 | |
|
2464 | 0 | memset(&idx->bidx[idx->m], 0, (new_m - idx->m) * sizeof(bidx_t*)); |
2465 | 0 | memset(&idx->lidx[idx->m], 0, (new_m - idx->m) * sizeof(lidx_t)); |
2466 | 0 | idx->m = new_m; |
2467 | 0 | } |
2468 | 0 | if (idx->n < tid + 1) idx->n = tid + 1; |
2469 | 0 | if (idx->z.finished) return 0; |
2470 | 0 | if (idx->z.last_tid != tid || (idx->z.last_tid >= 0 && tid < 0)) { // change of chromosome |
2471 | 0 | if ( tid>=0 && idx->n_no_coor ) |
2472 | 0 | { |
2473 | 0 | hts_log_error("NO_COOR reads not in a single block at the end %d %d", tid, idx->z.last_tid); |
2474 | 0 | return -1; |
2475 | 0 | } |
2476 | 0 | if (tid>=0 && idx->bidx[tid] != 0) |
2477 | 0 | { |
2478 | 0 | hts_log_error("Chromosome blocks not continuous"); |
2479 | 0 | return -1; |
2480 | 0 | } |
2481 | 0 | idx->z.last_tid = tid; |
2482 | 0 | idx->z.last_bin = 0xffffffffu; |
2483 | 0 | } else if (tid >= 0 && idx->z.last_coor > beg) { // test if positions are out of order |
2484 | 0 | hts_log_error("Unsorted positions on sequence #%d: %"PRIhts_pos" followed by %"PRIhts_pos, tid+1, idx->z.last_coor+1, beg+1); |
2485 | 0 | return -1; |
2486 | 0 | } |
2487 | 0 | if (end < beg) { |
2488 | | // Malformed ranges are errors. (Empty ranges (beg==end) are unusual but acceptable.) |
2489 | 0 | hts_log_error("Invalid record on sequence #%d: end %"PRId64" < begin %"PRId64, tid+1, end, beg+1); |
2490 | 0 | return -1; |
2491 | 0 | } |
2492 | 0 | if ( tid>=0 ) |
2493 | 0 | { |
2494 | 0 | if (idx->bidx[tid] == 0) idx->bidx[tid] = kh_init(bin); |
2495 | | // shoehorn [-1,0) (VCF POS=0) into the leftmost bottom-level bin |
2496 | 0 | if (beg < 0) beg = 0; |
2497 | 0 | if (end <= 0) end = 1; |
2498 | | // idx->z.last_off points to the start of the current record |
2499 | 0 | if (insert_to_l(&idx->lidx[tid], beg, end, |
2500 | 0 | idx->z.last_off, idx->min_shift) < 0) return -1; |
2501 | 0 | } |
2502 | 0 | else idx->n_no_coor++; |
2503 | 0 | bin = hts_reg2bin(beg, end, idx->min_shift, idx->n_lvls); |
2504 | 0 | if ((int)idx->z.last_bin != bin) { // then possibly write the binning index |
2505 | 0 | if (idx->z.save_bin != 0xffffffffu) { // save_bin==0xffffffffu only happens to the first record |
2506 | 0 | if (insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin, |
2507 | 0 | idx->z.save_off, idx->z.last_off) < 0) return -1; |
2508 | 0 | } |
2509 | 0 | if (idx->z.last_bin == 0xffffffffu && idx->z.save_bin != 0xffffffffu) { // change of chr; keep meta information |
2510 | 0 | idx->z.off_end = idx->z.last_off; |
2511 | 0 | if (insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), |
2512 | 0 | idx->z.off_beg, idx->z.off_end) < 0) return -1; |
2513 | 0 | if (insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), |
2514 | 0 | idx->z.n_mapped, idx->z.n_unmapped) < 0) return -1; |
2515 | 0 | idx->z.n_mapped = idx->z.n_unmapped = 0; |
2516 | 0 | idx->z.off_beg = idx->z.off_end; |
2517 | 0 | } |
2518 | 0 | idx->z.save_off = idx->z.last_off; |
2519 | 0 | idx->z.save_bin = idx->z.last_bin = bin; |
2520 | 0 | idx->z.save_tid = tid; |
2521 | 0 | } |
2522 | 0 | if (is_mapped) ++idx->z.n_mapped; |
2523 | 0 | else ++idx->z.n_unmapped; |
2524 | 0 | idx->z.last_off = offset; |
2525 | 0 | idx->z.last_coor = beg; |
2526 | 0 | return 0; |
2527 | 0 | } |
2528 | | |
2529 | | // Needed for TBI only. Ensure 'tid' with 'name' is in the index meta data. |
2530 | | // idx->meta needs to have been initialised first with an appropriate Tabix |
2531 | | // configuration via hts_idx_set_meta. |
2532 | | // |
2533 | | // NB number of references (first 4 bytes of tabix header) aren't in |
2534 | | // idx->meta, but held in idx->n instead. |
2535 | 0 | int hts_idx_tbi_name(hts_idx_t *idx, int tid, const char *name) { |
2536 | | // Horrid - we have to map incoming tid to a tbi alternative tid. |
2537 | | // This is because TBI counts tids by "covered" refs while everything |
2538 | | // else counts by Nth SQ/contig record in header. |
2539 | 0 | if (tid == idx->last_tbi_tid || tid < 0 || !name) |
2540 | 0 | return idx->tbi_n; |
2541 | | |
2542 | 0 | uint32_t len = strlen(name)+1; |
2543 | 0 | uint8_t *tmp = (uint8_t *)realloc(idx->meta, idx->l_meta + len); |
2544 | 0 | if (!tmp) |
2545 | 0 | return -1; |
2546 | | |
2547 | | // Append name |
2548 | 0 | idx->meta = tmp; |
2549 | 0 | strcpy((char *)idx->meta + idx->l_meta, name); |
2550 | 0 | idx->l_meta += len; |
2551 | | |
2552 | | // Update seq length |
2553 | 0 | u32_to_le(le_to_u32(idx->meta+24)+len, idx->meta+24); |
2554 | |
|
2555 | 0 | idx->last_tbi_tid = tid; |
2556 | 0 | return ++idx->tbi_n; |
2557 | 0 | } |
2558 | | |
2559 | | // When doing samtools index we have a read_bam / hts_idx_push(bgzf_tell()) |
2560 | | // loop. idx->z.last_off is the previous bzgf_tell location, so we know |
2561 | | // the location the current bam record started at as well as where it ends. |
2562 | | // |
2563 | | // When building an index on the fly via a write_bam / hts_idx_push loop, |
2564 | | // this isn't quite identical as we may amend the virtual coord returned |
2565 | | // by bgzf_tell to the start of a new block if the next bam struct doesn't |
2566 | | // fit. It's essentially the same thing, but for bit-identical indices |
2567 | | // we need to amend the idx->z.last_off when we know we're starting a new |
2568 | | // block. |
2569 | | void hts_idx_amend_last(hts_idx_t *idx, uint64_t offset) |
2570 | 0 | { |
2571 | 0 | idx->z.last_off = offset; |
2572 | 0 | } |
2573 | | |
2574 | | void hts_idx_destroy(hts_idx_t *idx) |
2575 | 27.3k | { |
2576 | 27.3k | khint_t k; |
2577 | 27.3k | int i; |
2578 | 27.3k | if (idx == 0) return; |
2579 | | |
2580 | | // For HTS_FMT_CRAI, idx actually points to a different type -- see sam.c |
2581 | 0 | if (idx->fmt == HTS_FMT_CRAI) { |
2582 | 0 | hts_cram_idx_t *cidx = (hts_cram_idx_t *) idx; |
2583 | 0 | cram_index_free(cidx->cram); |
2584 | 0 | free(cidx); |
2585 | 0 | return; |
2586 | 0 | } |
2587 | | |
2588 | 0 | for (i = 0; i < idx->m; ++i) { |
2589 | 0 | bidx_t *bidx = idx->bidx[i]; |
2590 | 0 | free(idx->lidx[i].offset); |
2591 | 0 | if (bidx == 0) continue; |
2592 | 0 | for (k = kh_begin(bidx); k != kh_end(bidx); ++k) |
2593 | 0 | if (kh_exist(bidx, k)) |
2594 | 0 | free(kh_value(bidx, k).list); |
2595 | 0 | kh_destroy(bin, bidx); |
2596 | 0 | } |
2597 | 0 | free(idx->bidx); free(idx->lidx); free(idx->meta); |
2598 | 0 | free(idx); |
2599 | 0 | } |
2600 | | |
2601 | 0 | int hts_idx_fmt(hts_idx_t *idx) { |
2602 | 0 | return idx->fmt; |
2603 | 0 | } |
2604 | | |
2605 | | // The optimizer eliminates these ed_is_big() calls; still it would be good to |
2606 | | // TODO Determine endianness at configure- or compile-time |
2607 | | |
2608 | | static inline ssize_t HTS_RESULT_USED idx_write_int32(BGZF *fp, int32_t x) |
2609 | 0 | { |
2610 | 0 | if (ed_is_big()) x = ed_swap_4(x); |
2611 | 0 | return bgzf_write(fp, &x, sizeof x); |
2612 | 0 | } |
2613 | | |
2614 | | static inline ssize_t HTS_RESULT_USED idx_write_uint32(BGZF *fp, uint32_t x) |
2615 | 0 | { |
2616 | 0 | if (ed_is_big()) x = ed_swap_4(x); |
2617 | 0 | return bgzf_write(fp, &x, sizeof x); |
2618 | 0 | } |
2619 | | |
2620 | | static inline ssize_t HTS_RESULT_USED idx_write_uint64(BGZF *fp, uint64_t x) |
2621 | 0 | { |
2622 | 0 | if (ed_is_big()) x = ed_swap_8(x); |
2623 | 0 | return bgzf_write(fp, &x, sizeof x); |
2624 | 0 | } |
2625 | | |
2626 | | static inline void swap_bins(bins_t *p) |
2627 | 0 | { |
2628 | 0 | int i; |
2629 | 0 | for (i = 0; i < p->n; ++i) { |
2630 | 0 | ed_swap_8p(&p->list[i].u); |
2631 | 0 | ed_swap_8p(&p->list[i].v); |
2632 | 0 | } |
2633 | 0 | } |
2634 | | |
2635 | | static int idx_save_core(const hts_idx_t *idx, BGZF *fp, int fmt) |
2636 | 0 | { |
2637 | 0 | int32_t i, j; |
2638 | |
|
2639 | 0 | #define check(ret) if ((ret) < 0) return -1 |
2640 | | |
2641 | | // VCF TBI/CSI only writes IDs for non-empty bins (ie covered references) |
2642 | | // |
2643 | | // NOTE: CSI meta is undefined in spec, so this code has an assumption |
2644 | | // that we're only using it for Tabix data. |
2645 | 0 | int nids = idx->n; |
2646 | 0 | if (idx->meta && idx->l_meta >= 4 && le_to_u32(idx->meta) == TBX_VCF) { |
2647 | 0 | for (i = nids = 0; i < idx->n; ++i) { |
2648 | 0 | if (idx->bidx[i]) |
2649 | 0 | nids++; |
2650 | 0 | } |
2651 | 0 | } |
2652 | 0 | check(idx_write_int32(fp, nids)); |
2653 | 0 | if (fmt == HTS_FMT_TBI && idx->l_meta) |
2654 | 0 | check(bgzf_write(fp, idx->meta, idx->l_meta)); |
2655 | | |
2656 | 0 | for (i = 0; i < idx->n; ++i) { |
2657 | 0 | khint_t k; |
2658 | 0 | bidx_t *bidx = idx->bidx[i]; |
2659 | 0 | lidx_t *lidx = &idx->lidx[i]; |
2660 | | |
2661 | | // write binning index |
2662 | 0 | if (nids == idx->n || bidx) |
2663 | 0 | check(idx_write_int32(fp, bidx? kh_size(bidx) : 0)); |
2664 | 0 | if (bidx) |
2665 | 0 | for (k = kh_begin(bidx); k != kh_end(bidx); ++k) |
2666 | 0 | if (kh_exist(bidx, k)) { |
2667 | 0 | bins_t *p = &kh_value(bidx, k); |
2668 | 0 | check(idx_write_uint32(fp, kh_key(bidx, k))); |
2669 | 0 | if (fmt == HTS_FMT_CSI) check(idx_write_uint64(fp, p->loff)); |
2670 | | //int j;for(j=0;j<p->n;++j)fprintf(stderr,"%d,%llx,%d,%llx:%llx\n",kh_key(bidx,k),kh_val(bidx, k).loff,j,p->list[j].u,p->list[j].v); |
2671 | 0 | check(idx_write_int32(fp, p->n)); |
2672 | 0 | for (j = 0; j < p->n; ++j) { |
2673 | | //fprintf(stderr, "\t%ld\t%ld\n", p->list[j].u, p->list[j].v); |
2674 | 0 | check(idx_write_uint64(fp, p->list[j].u)); |
2675 | 0 | check(idx_write_uint64(fp, p->list[j].v)); |
2676 | 0 | } |
2677 | 0 | } |
2678 | | |
2679 | | // write linear index |
2680 | 0 | if (fmt != HTS_FMT_CSI) { |
2681 | 0 | check(idx_write_int32(fp, lidx->n)); |
2682 | 0 | for (j = 0; j < lidx->n; ++j) |
2683 | 0 | check(idx_write_uint64(fp, lidx->offset[j])); |
2684 | 0 | } |
2685 | 0 | } |
2686 | | |
2687 | 0 | check(idx_write_uint64(fp, idx->n_no_coor)); |
2688 | | #ifdef DEBUG_INDEX |
2689 | | idx_dump(idx); |
2690 | | #endif |
2691 | | |
2692 | 0 | return 0; |
2693 | 0 | #undef check |
2694 | 0 | } |
2695 | | |
2696 | | int hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt) |
2697 | 0 | { |
2698 | 0 | int ret, save; |
2699 | 0 | if (idx == NULL || fn == NULL) { errno = EINVAL; return -1; } |
2700 | 0 | char *fnidx = (char*)calloc(1, strlen(fn) + 5); |
2701 | 0 | if (fnidx == NULL) return -1; |
2702 | | |
2703 | 0 | strcpy(fnidx, fn); |
2704 | 0 | switch (fmt) { |
2705 | 0 | case HTS_FMT_BAI: strcat(fnidx, ".bai"); break; |
2706 | 0 | case HTS_FMT_CSI: strcat(fnidx, ".csi"); break; |
2707 | 0 | case HTS_FMT_TBI: strcat(fnidx, ".tbi"); break; |
2708 | 0 | default: abort(); |
2709 | 0 | } |
2710 | | |
2711 | 0 | ret = hts_idx_save_as(idx, fn, fnidx, fmt); |
2712 | 0 | save = errno; |
2713 | 0 | free(fnidx); |
2714 | 0 | errno = save; |
2715 | 0 | return ret; |
2716 | 0 | } |
2717 | | |
2718 | | int hts_idx_save_as(const hts_idx_t *idx, const char *fn, const char *fnidx, int fmt) |
2719 | 0 | { |
2720 | 0 | BGZF *fp; |
2721 | |
|
2722 | 0 | #define check(ret) if ((ret) < 0) goto fail |
2723 | |
|
2724 | 0 | if (fnidx == NULL) return hts_idx_save(idx, fn, fmt); |
2725 | | |
2726 | 0 | fp = bgzf_open(fnidx, (fmt == HTS_FMT_BAI)? "wu" : "w"); |
2727 | 0 | if (fp == NULL) return -1; |
2728 | | |
2729 | 0 | if (fmt == HTS_FMT_CSI) { |
2730 | 0 | check(bgzf_write(fp, "CSI\1", 4)); |
2731 | 0 | check(idx_write_int32(fp, idx->min_shift)); |
2732 | 0 | check(idx_write_int32(fp, idx->n_lvls)); |
2733 | 0 | check(idx_write_uint32(fp, idx->l_meta)); |
2734 | 0 | if (idx->l_meta) check(bgzf_write(fp, idx->meta, idx->l_meta)); |
2735 | 0 | } else if (fmt == HTS_FMT_TBI) { |
2736 | 0 | check(bgzf_write(fp, "TBI\1", 4)); |
2737 | 0 | } else if (fmt == HTS_FMT_BAI) { |
2738 | 0 | check(bgzf_write(fp, "BAI\1", 4)); |
2739 | 0 | } else abort(); |
2740 | | |
2741 | 0 | check(idx_save_core(idx, fp, fmt)); |
2742 | | |
2743 | 0 | return bgzf_close(fp); |
2744 | 0 | #undef check |
2745 | | |
2746 | 0 | fail: |
2747 | 0 | bgzf_close(fp); |
2748 | 0 | return -1; |
2749 | 0 | } |
2750 | | |
2751 | | static int idx_read_core(hts_idx_t *idx, BGZF *fp, int fmt) |
2752 | 0 | { |
2753 | 0 | int32_t i, n, is_be; |
2754 | 0 | is_be = ed_is_big(); |
2755 | 0 | if (idx == NULL) return -4; |
2756 | 0 | for (i = 0; i < idx->n; ++i) { |
2757 | 0 | bidx_t *h; |
2758 | 0 | lidx_t *l = &idx->lidx[i]; |
2759 | 0 | uint32_t key; |
2760 | 0 | int j, absent; |
2761 | 0 | bins_t *p; |
2762 | 0 | h = idx->bidx[i] = kh_init(bin); |
2763 | 0 | if (bgzf_read(fp, &n, 4) != 4) return -1; |
2764 | 0 | if (is_be) ed_swap_4p(&n); |
2765 | 0 | if (n < 0) return -3; |
2766 | 0 | for (j = 0; j < n; ++j) { |
2767 | 0 | khint_t k; |
2768 | 0 | if (bgzf_read(fp, &key, 4) != 4) return -1; |
2769 | 0 | if (is_be) ed_swap_4p(&key); |
2770 | 0 | k = kh_put(bin, h, key, &absent); |
2771 | 0 | if (absent < 0) return -2; // No memory |
2772 | 0 | if (absent == 0) return -3; // Duplicate bin number |
2773 | 0 | p = &kh_val(h, k); |
2774 | 0 | if (fmt == HTS_FMT_CSI) { |
2775 | 0 | if (bgzf_read(fp, &p->loff, 8) != 8) return -1; |
2776 | 0 | if (is_be) ed_swap_8p(&p->loff); |
2777 | 0 | } else p->loff = 0; |
2778 | 0 | if (bgzf_read(fp, &p->n, 4) != 4) return -1; |
2779 | 0 | if (is_be) ed_swap_4p(&p->n); |
2780 | 0 | if (p->n < 0) return -3; |
2781 | 0 | if ((size_t) p->n > SIZE_MAX / sizeof(hts_pair64_t)) return -2; |
2782 | 0 | p->m = p->n; |
2783 | 0 | p->list = (hts_pair64_t*)malloc(p->m * sizeof(hts_pair64_t)); |
2784 | 0 | if (p->list == NULL) return -2; |
2785 | 0 | if (bgzf_read(fp, p->list, ((size_t) p->n)<<4) != ((size_t) p->n)<<4) return -1; |
2786 | 0 | if (is_be) swap_bins(p); |
2787 | 0 | } |
2788 | 0 | if (fmt != HTS_FMT_CSI) { // load linear index |
2789 | 0 | int j, k; |
2790 | 0 | uint32_t x; |
2791 | 0 | if (bgzf_read(fp, &x, 4) != 4) return -1; |
2792 | 0 | if (is_be) ed_swap_4p(&x); |
2793 | 0 | l->n = x; |
2794 | 0 | if (l->n < 0) return -3; |
2795 | 0 | if ((size_t) l->n > SIZE_MAX / sizeof(uint64_t)) return -2; |
2796 | 0 | l->m = l->n; |
2797 | 0 | l->offset = (uint64_t*)malloc(l->n * sizeof(uint64_t)); |
2798 | 0 | if (l->offset == NULL) return -2; |
2799 | 0 | if (bgzf_read(fp, l->offset, l->n << 3) != l->n << 3) return -1; |
2800 | 0 | if (is_be) for (j = 0; j < l->n; ++j) ed_swap_8p(&l->offset[j]); |
2801 | 0 | for (k = j = 0; j < l->n && l->offset[j] == 0; k = ++j); // stop at the first non-zero entry |
2802 | 0 | for (j = l->n-1; j > k; j--) // fill missing values; may happen given older samtools and tabix |
2803 | 0 | if (l->offset[j-1] == 0) l->offset[j-1] = l->offset[j]; |
2804 | 0 | update_loff(idx, i, 0); |
2805 | 0 | } |
2806 | 0 | } |
2807 | 0 | if (bgzf_read(fp, &idx->n_no_coor, 8) != 8) idx->n_no_coor = 0; |
2808 | 0 | if (is_be) ed_swap_8p(&idx->n_no_coor); |
2809 | | #ifdef DEBUG_INDEX |
2810 | | idx_dump(idx); |
2811 | | #endif |
2812 | |
|
2813 | 0 | return 0; |
2814 | 0 | } |
2815 | | |
2816 | | static hts_idx_t *idx_read(const char *fn) |
2817 | 0 | { |
2818 | 0 | uint8_t magic[4]; |
2819 | 0 | int i, is_be; |
2820 | 0 | hts_idx_t *idx = NULL; |
2821 | 0 | uint8_t *meta = NULL; |
2822 | 0 | BGZF *fp = bgzf_open(fn, "r"); |
2823 | 0 | if (fp == NULL) return NULL; |
2824 | 0 | is_be = ed_is_big(); |
2825 | 0 | if (bgzf_read(fp, magic, 4) != 4) goto fail; |
2826 | | |
2827 | 0 | if (memcmp(magic, "CSI\1", 4) == 0) { |
2828 | 0 | uint32_t x[3], n; |
2829 | 0 | if (bgzf_read(fp, x, 12) != 12) goto fail; |
2830 | 0 | if (is_be) for (i = 0; i < 3; ++i) ed_swap_4p(&x[i]); |
2831 | 0 | if (x[2]) { |
2832 | 0 | if (SIZE_MAX - x[2] < 1) goto fail; // Prevent possible overflow |
2833 | 0 | if ((meta = (uint8_t*)malloc((size_t) x[2] + 1)) == NULL) goto fail; |
2834 | 0 | if (bgzf_read(fp, meta, x[2]) != x[2]) goto fail; |
2835 | | // Prevent possible strlen past the end in tbx_index_load2 |
2836 | 0 | meta[x[2]] = '\0'; |
2837 | 0 | } |
2838 | 0 | if (bgzf_read(fp, &n, 4) != 4) goto fail; |
2839 | 0 | if (is_be) ed_swap_4p(&n); |
2840 | 0 | if (n > INT32_MAX) goto fail; |
2841 | 0 | if ((idx = hts_idx_init(n, HTS_FMT_CSI, 0, x[0], x[1])) == NULL) goto fail; |
2842 | 0 | idx->l_meta = x[2]; |
2843 | 0 | idx->meta = meta; |
2844 | 0 | meta = NULL; |
2845 | 0 | if (idx_read_core(idx, fp, HTS_FMT_CSI) < 0) goto fail; |
2846 | 0 | } |
2847 | 0 | else if (memcmp(magic, "TBI\1", 4) == 0) { |
2848 | 0 | uint8_t x[8 * 4]; |
2849 | 0 | uint32_t n; |
2850 | | // Read file header |
2851 | 0 | if (bgzf_read(fp, x, sizeof(x)) != sizeof(x)) goto fail; |
2852 | 0 | n = le_to_u32(&x[0]); // location of n_ref |
2853 | 0 | if (n > INT32_MAX) goto fail; |
2854 | 0 | if ((idx = hts_idx_init(n, HTS_FMT_TBI, 0, 14, 5)) == NULL) goto fail; |
2855 | 0 | n = le_to_u32(&x[7*4]); // location of l_nm |
2856 | 0 | if (n > UINT32_MAX - 29) goto fail; // Prevent possible overflow |
2857 | 0 | idx->l_meta = 28 + n; |
2858 | 0 | if ((idx->meta = (uint8_t*)malloc(idx->l_meta + 1)) == NULL) goto fail; |
2859 | | // copy format, col_seq, col_beg, col_end, meta, skip, l_nm |
2860 | | // N.B. left in little-endian byte order. |
2861 | 0 | memcpy(idx->meta, &x[1*4], 28); |
2862 | | // Read in sequence names. |
2863 | 0 | if (bgzf_read(fp, idx->meta + 28, n) != n) goto fail; |
2864 | | // Prevent possible strlen past the end in tbx_index_load2 |
2865 | 0 | idx->meta[idx->l_meta] = '\0'; |
2866 | 0 | if (idx_read_core(idx, fp, HTS_FMT_TBI) < 0) goto fail; |
2867 | 0 | } |
2868 | 0 | else if (memcmp(magic, "BAI\1", 4) == 0) { |
2869 | 0 | uint32_t n; |
2870 | 0 | if (bgzf_read(fp, &n, 4) != 4) goto fail; |
2871 | 0 | if (is_be) ed_swap_4p(&n); |
2872 | 0 | if (n > INT32_MAX) goto fail; |
2873 | 0 | if ((idx = hts_idx_init(n, HTS_FMT_BAI, 0, 14, 5)) == NULL) goto fail; |
2874 | 0 | if (idx_read_core(idx, fp, HTS_FMT_BAI) < 0) goto fail; |
2875 | 0 | } |
2876 | 0 | else { errno = EINVAL; goto fail; } |
2877 | | |
2878 | 0 | bgzf_close(fp); |
2879 | 0 | return idx; |
2880 | | |
2881 | 0 | fail: |
2882 | 0 | bgzf_close(fp); |
2883 | 0 | hts_idx_destroy(idx); |
2884 | 0 | free(meta); |
2885 | 0 | return NULL; |
2886 | 0 | } |
2887 | | |
2888 | | int hts_idx_set_meta(hts_idx_t *idx, uint32_t l_meta, uint8_t *meta, |
2889 | | int is_copy) |
2890 | 0 | { |
2891 | 0 | uint8_t *new_meta = meta; |
2892 | 0 | if (is_copy) { |
2893 | 0 | size_t l = l_meta; |
2894 | 0 | if (l > SIZE_MAX - 1) { |
2895 | 0 | errno = ENOMEM; |
2896 | 0 | return -1; |
2897 | 0 | } |
2898 | 0 | new_meta = malloc(l + 1); |
2899 | 0 | if (!new_meta) return -1; |
2900 | 0 | memcpy(new_meta, meta, l); |
2901 | | // Prevent possible strlen past the end in tbx_index_load2 |
2902 | 0 | new_meta[l] = '\0'; |
2903 | 0 | } |
2904 | 0 | if (idx->meta) free(idx->meta); |
2905 | 0 | idx->l_meta = l_meta; |
2906 | 0 | idx->meta = new_meta; |
2907 | 0 | return 0; |
2908 | 0 | } |
2909 | | |
2910 | | uint8_t *hts_idx_get_meta(hts_idx_t *idx, uint32_t *l_meta) |
2911 | 0 | { |
2912 | 0 | *l_meta = idx->l_meta; |
2913 | 0 | return idx->meta; |
2914 | 0 | } |
2915 | | |
2916 | | const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr) |
2917 | 0 | { |
2918 | 0 | if ( !idx || !idx->n ) |
2919 | 0 | { |
2920 | 0 | *n = 0; |
2921 | 0 | return NULL; |
2922 | 0 | } |
2923 | | |
2924 | 0 | int tid = 0, i; |
2925 | 0 | const char **names = (const char**) calloc(idx->n,sizeof(const char*)); |
2926 | 0 | for (i=0; i<idx->n; i++) |
2927 | 0 | { |
2928 | 0 | bidx_t *bidx = idx->bidx[i]; |
2929 | 0 | if ( !bidx ) continue; |
2930 | 0 | names[tid++] = getid(hdr,i); |
2931 | 0 | } |
2932 | 0 | *n = tid; |
2933 | 0 | return names; |
2934 | 0 | } |
2935 | | |
2936 | 0 | int hts_idx_nseq(const hts_idx_t *idx) { |
2937 | 0 | if (!idx) return -1; |
2938 | 0 | return idx->n; |
2939 | 0 | } |
2940 | | |
2941 | | int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped, uint64_t* unmapped) |
2942 | 0 | { |
2943 | 0 | if (!idx) return -1; |
2944 | 0 | if ( idx->fmt == HTS_FMT_CRAI ) { |
2945 | 0 | *mapped = 0; *unmapped = 0; |
2946 | 0 | return -1; |
2947 | 0 | } |
2948 | | |
2949 | 0 | bidx_t *h = idx->bidx[tid]; |
2950 | 0 | if (!h) return -1; |
2951 | 0 | khint_t k = kh_get(bin, h, META_BIN(idx)); |
2952 | 0 | if (k != kh_end(h)) { |
2953 | 0 | *mapped = kh_val(h, k).list[1].u; |
2954 | 0 | *unmapped = kh_val(h, k).list[1].v; |
2955 | 0 | return 0; |
2956 | 0 | } else { |
2957 | 0 | *mapped = 0; *unmapped = 0; |
2958 | 0 | return -1; |
2959 | 0 | } |
2960 | 0 | } |
2961 | | |
2962 | | uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx) |
2963 | 0 | { |
2964 | 0 | if (idx->fmt == HTS_FMT_CRAI) return 0; |
2965 | 0 | return idx->n_no_coor; |
2966 | 0 | } |
2967 | | |
2968 | | /**************** |
2969 | | *** Iterator *** |
2970 | | ****************/ |
2971 | | |
2972 | | // Note: even with 32-bit hts_pos_t, end needs to be 64-bit here due to 1LL<<s. |
2973 | | static inline int reg2bins_narrow(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls, bidx_t *bidx) |
2974 | 0 | { |
2975 | 0 | int l, t, s = min_shift + (n_lvls<<1) + n_lvls; |
2976 | 0 | for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) { |
2977 | 0 | hts_pos_t b, e; |
2978 | 0 | int i; |
2979 | 0 | b = t + (beg>>s); e = t + (end>>s); |
2980 | 0 | for (i = b; i <= e; ++i) { |
2981 | 0 | if (kh_get(bin, bidx, i) != kh_end(bidx)) { |
2982 | 0 | assert(itr->bins.n < itr->bins.m); |
2983 | 0 | itr->bins.a[itr->bins.n++] = i; |
2984 | 0 | } |
2985 | 0 | } |
2986 | 0 | } |
2987 | 0 | return itr->bins.n; |
2988 | 0 | } |
2989 | | |
2990 | | static inline int reg2bins_wide(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls, bidx_t *bidx) |
2991 | 0 | { |
2992 | 0 | khint_t i; |
2993 | 0 | hts_pos_t max_shift = 3 * n_lvls + min_shift; |
2994 | 0 | --end; |
2995 | 0 | if (beg < 0) beg = 0; |
2996 | 0 | for (i = kh_begin(bidx); i != kh_end(bidx); i++) { |
2997 | 0 | if (!kh_exist(bidx, i)) continue; |
2998 | 0 | hts_pos_t bin = (hts_pos_t) kh_key(bidx, i); |
2999 | 0 | int level = hts_bin_level(bin); |
3000 | 0 | if (level > n_lvls) continue; // Dodgy index? |
3001 | 0 | hts_pos_t first = hts_bin_first(level); |
3002 | 0 | hts_pos_t beg_at_level = first + (beg >> (max_shift - 3 * level)); |
3003 | 0 | hts_pos_t end_at_level = first + (end >> (max_shift - 3 * level)); |
3004 | 0 | if (beg_at_level <= bin && bin <= end_at_level) { |
3005 | 0 | assert(itr->bins.n < itr->bins.m); |
3006 | 0 | itr->bins.a[itr->bins.n++] = bin; |
3007 | 0 | } |
3008 | 0 | } |
3009 | 0 | return itr->bins.n; |
3010 | 0 | } |
3011 | | |
3012 | | static inline int reg2bins(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls, bidx_t *bidx) |
3013 | 0 | { |
3014 | 0 | int l, t, s = min_shift + (n_lvls<<1) + n_lvls; |
3015 | 0 | size_t reg_bin_count = 0, hash_bin_count = kh_n_buckets(bidx), max_bins; |
3016 | 0 | hts_pos_t end1; |
3017 | 0 | if (end >= 1LL<<s) end = 1LL<<s; |
3018 | 0 | if (beg >= end) return 0; |
3019 | 0 | end1 = end - 1; |
3020 | | |
3021 | | // Count bins to see if it's faster to iterate through the hash table |
3022 | | // or the set of bins covering the region |
3023 | 0 | for (l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) { |
3024 | 0 | reg_bin_count += (end1 >> s) - (beg >> s) + 1; |
3025 | 0 | } |
3026 | 0 | max_bins = reg_bin_count < kh_size(bidx) ? reg_bin_count : kh_size(bidx); |
3027 | 0 | if (itr->bins.m - itr->bins.n < max_bins) { |
3028 | | // Worst-case memory usage. May be wasteful on very sparse |
3029 | | // data, but the bin list usually won't be too big anyway. |
3030 | 0 | size_t new_m = max_bins + itr->bins.n; |
3031 | 0 | if (new_m > INT_MAX || new_m > SIZE_MAX / sizeof(int)) { |
3032 | 0 | errno = ENOMEM; |
3033 | 0 | return -1; |
3034 | 0 | } |
3035 | 0 | int *new_a = realloc(itr->bins.a, new_m * sizeof(*new_a)); |
3036 | 0 | if (!new_a) return -1; |
3037 | 0 | itr->bins.a = new_a; |
3038 | 0 | itr->bins.m = new_m; |
3039 | 0 | } |
3040 | 0 | if (reg_bin_count < hash_bin_count) { |
3041 | 0 | return reg2bins_narrow(beg, end, itr, min_shift, n_lvls, bidx); |
3042 | 0 | } else { |
3043 | 0 | return reg2bins_wide(beg, end, itr, min_shift, n_lvls, bidx); |
3044 | 0 | } |
3045 | 0 | } |
3046 | | |
3047 | | static inline int add_to_interval(hts_itr_t *iter, bins_t *bin, |
3048 | | int tid, uint32_t interval, |
3049 | | uint64_t min_off, uint64_t max_off) |
3050 | 0 | { |
3051 | 0 | hts_pair64_max_t *off; |
3052 | 0 | int j; |
3053 | |
|
3054 | 0 | if (!bin->n) |
3055 | 0 | return 0; |
3056 | 0 | off = realloc(iter->off, (iter->n_off + bin->n) * sizeof(*off)); |
3057 | 0 | if (!off) |
3058 | 0 | return -2; |
3059 | | |
3060 | 0 | iter->off = off; |
3061 | 0 | for (j = 0; j < bin->n; ++j) { |
3062 | 0 | if (bin->list[j].v > min_off && bin->list[j].u < max_off) { |
3063 | 0 | iter->off[iter->n_off].u = min_off > bin->list[j].u |
3064 | 0 | ? min_off : bin->list[j].u; |
3065 | 0 | iter->off[iter->n_off].v = max_off < bin->list[j].v |
3066 | 0 | ? max_off : bin->list[j].v; |
3067 | | // hts_pair64_max_t::max is now used to link |
3068 | | // file offsets to region list entries. |
3069 | | // The iterator can use this to decide if it |
3070 | | // can skip some file regions. |
3071 | 0 | iter->off[iter->n_off].max = ((uint64_t) tid << 32) | interval; |
3072 | 0 | iter->n_off++; |
3073 | 0 | } |
3074 | 0 | } |
3075 | 0 | return 0; |
3076 | 0 | } |
3077 | | |
3078 | | static inline int reg2intervals_narrow(hts_itr_t *iter, const bidx_t *bidx, |
3079 | | int tid, int64_t beg, int64_t end, |
3080 | | uint32_t interval, |
3081 | | uint64_t min_off, uint64_t max_off, |
3082 | | int min_shift, int n_lvls) |
3083 | 0 | { |
3084 | 0 | int l, t, s = min_shift + n_lvls * 3; |
3085 | 0 | hts_pos_t b, e, i; |
3086 | |
|
3087 | 0 | for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) { |
3088 | 0 | b = t + (beg>>s); e = t + (end>>s); |
3089 | 0 | for (i = b; i <= e; ++i) { |
3090 | 0 | khint_t k = kh_get(bin, bidx, i); |
3091 | 0 | if (k != kh_end(bidx)) { |
3092 | 0 | bins_t *bin = &kh_value(bidx, k); |
3093 | 0 | int res = add_to_interval(iter, bin, tid, interval, min_off, max_off); |
3094 | 0 | if (res < 0) |
3095 | 0 | return res; |
3096 | 0 | } |
3097 | 0 | } |
3098 | 0 | } |
3099 | 0 | return 0; |
3100 | 0 | } |
3101 | | |
3102 | | static inline int reg2intervals_wide(hts_itr_t *iter, const bidx_t *bidx, |
3103 | | int tid, int64_t beg, int64_t end, |
3104 | | uint32_t interval, |
3105 | | uint64_t min_off, uint64_t max_off, |
3106 | | int min_shift, int n_lvls) |
3107 | 0 | { |
3108 | 0 | khint_t i; |
3109 | 0 | hts_pos_t max_shift = 3 * n_lvls + min_shift; |
3110 | 0 | --end; |
3111 | 0 | if (beg < 0) beg = 0; |
3112 | 0 | for (i = kh_begin(bidx); i != kh_end(bidx); i++) { |
3113 | 0 | if (!kh_exist(bidx, i)) continue; |
3114 | 0 | hts_pos_t bin = (hts_pos_t) kh_key(bidx, i); |
3115 | 0 | int level = hts_bin_level(bin); |
3116 | 0 | if (level > n_lvls) continue; // Dodgy index? |
3117 | 0 | hts_pos_t first = hts_bin_first(level); |
3118 | 0 | hts_pos_t beg_at_level = first + (beg >> (max_shift - 3 * level)); |
3119 | 0 | hts_pos_t end_at_level = first + (end >> (max_shift - 3 * level)); |
3120 | 0 | if (beg_at_level <= bin && bin <= end_at_level) { |
3121 | 0 | bins_t *bin = &kh_value(bidx, i); |
3122 | 0 | int res = add_to_interval(iter, bin, tid, interval, min_off, max_off); |
3123 | 0 | if (res < 0) |
3124 | 0 | return res; |
3125 | 0 | } |
3126 | 0 | } |
3127 | 0 | return 0; |
3128 | 0 | } |
3129 | | |
3130 | | static inline int reg2intervals(hts_itr_t *iter, const hts_idx_t *idx, int tid, int64_t beg, int64_t end, uint32_t interval, uint64_t min_off, uint64_t max_off, int min_shift, int n_lvls) |
3131 | 0 | { |
3132 | 0 | int l, t, s; |
3133 | 0 | int i, j; |
3134 | 0 | hts_pos_t end1; |
3135 | 0 | bidx_t *bidx; |
3136 | 0 | int start_n_off; |
3137 | 0 | size_t reg_bin_count = 0, hash_bin_count; |
3138 | 0 | int res; |
3139 | |
|
3140 | 0 | if (!iter || !idx || (bidx = idx->bidx[tid]) == NULL || beg >= end) |
3141 | 0 | return -1; |
3142 | | |
3143 | 0 | hash_bin_count = kh_n_buckets(bidx); |
3144 | |
|
3145 | 0 | s = min_shift + (n_lvls<<1) + n_lvls; |
3146 | 0 | if (end >= 1LL<<s) |
3147 | 0 | end = 1LL<<s; |
3148 | |
|
3149 | 0 | end1 = end - 1; |
3150 | | // Count bins to see if it's faster to iterate through the hash table |
3151 | | // or the set of bins covering the region |
3152 | 0 | for (l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) { |
3153 | 0 | reg_bin_count += (end1 >> s) - (beg >> s) + 1; |
3154 | 0 | } |
3155 | |
|
3156 | 0 | start_n_off = iter->n_off; |
3157 | | |
3158 | | // Populate iter->off with the intervals for this region |
3159 | 0 | if (reg_bin_count < hash_bin_count) { |
3160 | 0 | res = reg2intervals_narrow(iter, bidx, tid, beg, end, interval, |
3161 | 0 | min_off, max_off, min_shift, n_lvls); |
3162 | 0 | } else { |
3163 | 0 | res = reg2intervals_wide(iter, bidx, tid, beg, end, interval, |
3164 | 0 | min_off, max_off, min_shift, n_lvls); |
3165 | 0 | } |
3166 | 0 | if (res < 0) |
3167 | 0 | return res; |
3168 | | |
3169 | 0 | if (iter->n_off - start_n_off > 1) { |
3170 | 0 | ks_introsort(_off_max, iter->n_off - start_n_off, iter->off + start_n_off); |
3171 | 0 | for (i = start_n_off, j = start_n_off + 1; j < iter->n_off; j++) { |
3172 | 0 | if (iter->off[i].v >= iter->off[j].u) { |
3173 | 0 | if (iter->off[i].v < iter->off[j].v) |
3174 | 0 | iter->off[i].v = iter->off[j].v; |
3175 | 0 | } else { |
3176 | 0 | i++; |
3177 | 0 | if (i < j) |
3178 | 0 | iter->off[i] = iter->off[j]; |
3179 | 0 | } |
3180 | 0 | } |
3181 | 0 | iter->n_off = i + 1; |
3182 | 0 | } |
3183 | |
|
3184 | 0 | return iter->n_off; |
3185 | 0 | } |
3186 | | |
3187 | 0 | static int compare_regions(const void *r1, const void *r2) { |
3188 | 0 | hts_reglist_t *reg1 = (hts_reglist_t *)r1; |
3189 | 0 | hts_reglist_t *reg2 = (hts_reglist_t *)r2; |
3190 | |
|
3191 | 0 | if (reg1->tid < 0 && reg2->tid >= 0) |
3192 | 0 | return 1; |
3193 | 0 | else if (reg1->tid >= 0 && reg2->tid < 0) |
3194 | 0 | return -1; |
3195 | 0 | else |
3196 | 0 | return reg1->tid - reg2->tid; |
3197 | 0 | } |
3198 | | |
3199 | 0 | uint64_t hts_itr_off(const hts_idx_t* idx, int tid) { |
3200 | |
|
3201 | 0 | int i; |
3202 | 0 | bidx_t* bidx; |
3203 | 0 | uint64_t off0 = (uint64_t) -1; |
3204 | 0 | khint_t k; |
3205 | 0 | switch (tid) { |
3206 | 0 | case HTS_IDX_START: |
3207 | | // Find the smallest offset, note that sequence ids may not be ordered sequentially |
3208 | 0 | for (i = 0; i < idx->n; i++) { |
3209 | 0 | bidx = idx->bidx[i]; |
3210 | 0 | k = kh_get(bin, bidx, META_BIN(idx)); |
3211 | 0 | if (k == kh_end(bidx)) |
3212 | 0 | continue; |
3213 | | |
3214 | 0 | if (off0 > kh_val(bidx, k).list[0].u) |
3215 | 0 | off0 = kh_val(bidx, k).list[0].u; |
3216 | 0 | } |
3217 | 0 | if (off0 == (uint64_t) -1 && idx->n_no_coor) |
3218 | 0 | off0 = 0; |
3219 | | // only no-coor reads in this bam |
3220 | 0 | break; |
3221 | 0 | case HTS_IDX_NOCOOR: |
3222 | | /* No-coor reads sort after all of the mapped reads. The position |
3223 | | is not stored in the index itself, so need to find the end |
3224 | | offset for the last mapped read. A loop is needed here in |
3225 | | case references at the end of the file have no mapped reads, |
3226 | | or sequence ids are not ordered sequentially. |
3227 | | See issue samtools#568 and commits b2aab8, 60c22d and cc207d. */ |
3228 | 0 | for (i = 0; i < idx->n; i++) { |
3229 | 0 | bidx = idx->bidx[i]; |
3230 | 0 | k = kh_get(bin, bidx, META_BIN(idx)); |
3231 | 0 | if (k != kh_end(bidx)) { |
3232 | 0 | if (off0 == (uint64_t) -1 || off0 < kh_val(bidx, k).list[0].v) { |
3233 | 0 | off0 = kh_val(bidx, k).list[0].v; |
3234 | 0 | } |
3235 | 0 | } |
3236 | 0 | } |
3237 | 0 | if (off0 == (uint64_t) -1 && idx->n_no_coor) |
3238 | 0 | off0 = 0; |
3239 | | // only no-coor reads in this bam |
3240 | 0 | break; |
3241 | 0 | case HTS_IDX_REST: |
3242 | 0 | off0 = 0; |
3243 | 0 | break; |
3244 | 0 | case HTS_IDX_NONE: |
3245 | 0 | off0 = 0; |
3246 | 0 | break; |
3247 | 0 | } |
3248 | | |
3249 | 0 | return off0; |
3250 | 0 | } |
3251 | | |
3252 | | hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec) |
3253 | 0 | { |
3254 | 0 | int i, n_off, l, bin; |
3255 | 0 | hts_pair64_max_t *off; |
3256 | 0 | khint_t k; |
3257 | 0 | bidx_t *bidx; |
3258 | 0 | uint64_t min_off, max_off; |
3259 | 0 | hts_itr_t *iter; |
3260 | 0 | uint32_t unmapped = 0, rel_off; |
3261 | | |
3262 | | // It's possible to call this function with NULL idx iff |
3263 | | // tid is one of the special values HTS_IDX_REST or HTS_IDX_NONE |
3264 | 0 | if (!idx && !(tid == HTS_IDX_REST || tid == HTS_IDX_NONE)) { |
3265 | 0 | errno = EINVAL; |
3266 | 0 | return NULL; |
3267 | 0 | } |
3268 | | |
3269 | 0 | iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t)); |
3270 | 0 | if (iter) { |
3271 | 0 | if (tid < 0) { |
3272 | 0 | uint64_t off = hts_itr_off(idx, tid); |
3273 | 0 | if (off != (uint64_t) -1) { |
3274 | 0 | iter->read_rest = 1; |
3275 | 0 | iter->curr_off = off; |
3276 | 0 | iter->readrec = readrec; |
3277 | 0 | if (tid == HTS_IDX_NONE) |
3278 | 0 | iter->finished = 1; |
3279 | 0 | } else { |
3280 | 0 | free(iter); |
3281 | 0 | iter = NULL; |
3282 | 0 | } |
3283 | 0 | } else if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL) { |
3284 | 0 | iter->finished = 1; |
3285 | 0 | } else { |
3286 | 0 | if (beg < 0) beg = 0; |
3287 | 0 | if (end < beg) { |
3288 | 0 | free(iter); |
3289 | 0 | return NULL; |
3290 | 0 | } |
3291 | | |
3292 | 0 | k = kh_get(bin, bidx, META_BIN(idx)); |
3293 | 0 | if (k != kh_end(bidx)) |
3294 | 0 | unmapped = kh_val(bidx, k).list[1].v; |
3295 | 0 | else |
3296 | 0 | unmapped = 1; |
3297 | |
|
3298 | 0 | iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1; |
3299 | 0 | iter->readrec = readrec; |
3300 | |
|
3301 | 0 | if ( !kh_size(bidx) ) { iter->finished = 1; return iter; } |
3302 | | |
3303 | 0 | rel_off = beg>>idx->min_shift; |
3304 | | // compute min_off |
3305 | 0 | bin = hts_bin_first(idx->n_lvls) + rel_off; |
3306 | 0 | do { |
3307 | 0 | int first; |
3308 | 0 | k = kh_get(bin, bidx, bin); |
3309 | 0 | if (k != kh_end(bidx)) break; |
3310 | 0 | first = (hts_bin_parent(bin)<<3) + 1; |
3311 | 0 | if (bin > first) --bin; |
3312 | 0 | else bin = hts_bin_parent(bin); |
3313 | 0 | } while (bin); |
3314 | 0 | if (bin == 0) k = kh_get(bin, bidx, bin); |
3315 | 0 | min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0; |
3316 | | // min_off can be calculated more accurately if the |
3317 | | // linear index is available |
3318 | 0 | if (idx->lidx[tid].offset |
3319 | 0 | && rel_off < idx->lidx[tid].n) { |
3320 | 0 | if (min_off < idx->lidx[tid].offset[rel_off]) |
3321 | 0 | min_off = idx->lidx[tid].offset[rel_off]; |
3322 | 0 | if (unmapped) { |
3323 | | // unmapped reads are not covered by the linear index, |
3324 | | // so search backwards for a smaller offset |
3325 | 0 | int tmp_off; |
3326 | 0 | for (tmp_off = rel_off-1; tmp_off >= 0; tmp_off--) { |
3327 | 0 | if (idx->lidx[tid].offset[tmp_off] < min_off) { |
3328 | 0 | min_off = idx->lidx[tid].offset[tmp_off]; |
3329 | 0 | break; |
3330 | 0 | } |
3331 | 0 | } |
3332 | | // if the search went too far back or no satisfactory entry |
3333 | | // was found, revert to the bin index loff value |
3334 | 0 | if (k != kh_end(bidx) && (min_off < kh_val(bidx, k).loff || tmp_off < 0)) |
3335 | 0 | min_off = kh_val(bidx, k).loff; |
3336 | 0 | } |
3337 | 0 | } else if (unmapped) { //CSI index |
3338 | 0 | if (k != kh_end(bidx)) |
3339 | 0 | min_off = kh_val(bidx, k).loff; |
3340 | 0 | } |
3341 | | |
3342 | | // compute max_off: a virtual offset from a bin to the right of end |
3343 | | // First check if end lies within the range of the index (it won't |
3344 | | // if it's HTS_POS_MAX) |
3345 | 0 | if (end < 1LL << (idx->min_shift + 3 * idx->n_lvls)) { |
3346 | 0 | bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1; |
3347 | 0 | if (bin >= idx->n_bins) bin = 0; |
3348 | 0 | while (1) { |
3349 | | // search for an extant bin by moving right, but moving up to the |
3350 | | // parent whenever we get to a first child (which also covers falling |
3351 | | // off the RHS, which wraps around and immediately goes up to bin 0) |
3352 | 0 | while (bin % 8 == 1) bin = hts_bin_parent(bin); |
3353 | 0 | if (bin == 0) { max_off = UINT64_MAX; break; } |
3354 | 0 | k = kh_get(bin, bidx, bin); |
3355 | 0 | if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) { max_off = kh_val(bidx, k).list[0].u; break; } |
3356 | 0 | bin++; |
3357 | 0 | } |
3358 | 0 | } else { |
3359 | | // Searching to end of reference |
3360 | 0 | max_off = UINT64_MAX; |
3361 | 0 | } |
3362 | | |
3363 | | // retrieve bins |
3364 | 0 | if (reg2bins(beg, end, iter, idx->min_shift, idx->n_lvls, bidx) < 0) { |
3365 | 0 | hts_itr_destroy(iter); |
3366 | 0 | return NULL; |
3367 | 0 | } |
3368 | | |
3369 | 0 | for (i = n_off = 0; i < iter->bins.n; ++i) |
3370 | 0 | if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx)) |
3371 | 0 | n_off += kh_value(bidx, k).n; |
3372 | 0 | if (n_off == 0) { |
3373 | | // No overlapping bins means the iterator has already finished. |
3374 | 0 | iter->finished = 1; |
3375 | 0 | return iter; |
3376 | 0 | } |
3377 | 0 | off = calloc(n_off, sizeof(*off)); |
3378 | 0 | for (i = n_off = 0; i < iter->bins.n; ++i) { |
3379 | 0 | if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx)) { |
3380 | 0 | int j; |
3381 | 0 | bins_t *p = &kh_value(bidx, k); |
3382 | 0 | for (j = 0; j < p->n; ++j) |
3383 | 0 | if (p->list[j].v > min_off && p->list[j].u < max_off) { |
3384 | 0 | off[n_off].u = min_off > p->list[j].u |
3385 | 0 | ? min_off : p->list[j].u; |
3386 | 0 | off[n_off].v = max_off < p->list[j].v |
3387 | 0 | ? max_off : p->list[j].v; |
3388 | | // hts_pair64_max_t::max is now used to link |
3389 | | // file offsets to region list entries. |
3390 | | // The iterator can use this to decide if it |
3391 | | // can skip some file regions. |
3392 | 0 | off[n_off].max = ((uint64_t) tid << 32) | j; |
3393 | 0 | n_off++; |
3394 | 0 | } |
3395 | 0 | } |
3396 | 0 | } |
3397 | |
|
3398 | 0 | if (n_off == 0) { |
3399 | 0 | free(off); |
3400 | 0 | iter->finished = 1; |
3401 | 0 | return iter; |
3402 | 0 | } |
3403 | 0 | ks_introsort(_off_max, n_off, off); |
3404 | | // resolve completely contained adjacent blocks |
3405 | 0 | for (i = 1, l = 0; i < n_off; ++i) |
3406 | 0 | if (off[l].v < off[i].v) off[++l] = off[i]; |
3407 | 0 | n_off = l + 1; |
3408 | | // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing |
3409 | 0 | for (i = 1; i < n_off; ++i) |
3410 | 0 | if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u; |
3411 | | // merge adjacent blocks |
3412 | 0 | for (i = 1, l = 0; i < n_off; ++i) { |
3413 | 0 | if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v; |
3414 | 0 | else off[++l] = off[i]; |
3415 | 0 | } |
3416 | 0 | n_off = l + 1; |
3417 | 0 | iter->n_off = n_off; iter->off = off; |
3418 | 0 | } |
3419 | 0 | } |
3420 | | |
3421 | 0 | return iter; |
3422 | 0 | } |
3423 | | |
3424 | | int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter) |
3425 | 0 | { |
3426 | 0 | int i, j, bin; |
3427 | 0 | khint_t k; |
3428 | 0 | bidx_t *bidx; |
3429 | 0 | uint64_t min_off, max_off, t_off = (uint64_t)-1; |
3430 | 0 | int tid; |
3431 | 0 | hts_pos_t beg, end; |
3432 | 0 | hts_reglist_t *curr_reg; |
3433 | 0 | uint32_t unmapped = 0, rel_off; |
3434 | |
|
3435 | 0 | if (!idx || !iter || !iter->multi) |
3436 | 0 | return -1; |
3437 | | |
3438 | 0 | iter->i = -1; |
3439 | 0 | for (i=0; i<iter->n_reg; i++) { |
3440 | |
|
3441 | 0 | curr_reg = &iter->reg_list[i]; |
3442 | 0 | tid = curr_reg->tid; |
3443 | |
|
3444 | 0 | if (tid < 0) { |
3445 | 0 | t_off = hts_itr_off(idx, tid); |
3446 | 0 | if (t_off != (uint64_t)-1) { |
3447 | 0 | switch (tid) { |
3448 | 0 | case HTS_IDX_NONE: |
3449 | 0 | iter->finished = 1; |
3450 | | // fall through |
3451 | 0 | case HTS_IDX_START: |
3452 | 0 | case HTS_IDX_REST: |
3453 | 0 | iter->curr_off = t_off; |
3454 | 0 | iter->n_reg = 0; |
3455 | 0 | iter->reg_list = NULL; |
3456 | 0 | iter->read_rest = 1; |
3457 | 0 | return 0; |
3458 | 0 | case HTS_IDX_NOCOOR: |
3459 | 0 | iter->nocoor = 1; |
3460 | 0 | iter->nocoor_off = t_off; |
3461 | 0 | } |
3462 | 0 | } |
3463 | 0 | } else { |
3464 | 0 | if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL || !kh_size(bidx)) |
3465 | 0 | continue; |
3466 | | |
3467 | 0 | k = kh_get(bin, bidx, META_BIN(idx)); |
3468 | 0 | if (k != kh_end(bidx)) |
3469 | 0 | unmapped = kh_val(bidx, k).list[1].v; |
3470 | 0 | else |
3471 | 0 | unmapped = 1; |
3472 | |
|
3473 | 0 | for(j=0; j<curr_reg->count; j++) { |
3474 | 0 | hts_pair32_t *curr_intv = &curr_reg->intervals[j]; |
3475 | 0 | if (curr_intv->end < curr_intv->beg) |
3476 | 0 | continue; |
3477 | | |
3478 | 0 | beg = curr_intv->beg; |
3479 | 0 | end = curr_intv->end; |
3480 | 0 | rel_off = beg>>idx->min_shift; |
3481 | | |
3482 | | /* Compute 'min_off' by searching the lowest level bin containing 'beg'. |
3483 | | If the computed bin is not in the index, try the next bin to the |
3484 | | left, belonging to the same parent. If it is the first sibling bin, |
3485 | | try the parent bin. */ |
3486 | 0 | bin = hts_bin_first(idx->n_lvls) + rel_off; |
3487 | 0 | do { |
3488 | 0 | int first; |
3489 | 0 | k = kh_get(bin, bidx, bin); |
3490 | 0 | if (k != kh_end(bidx)) break; |
3491 | 0 | first = (hts_bin_parent(bin)<<3) + 1; |
3492 | 0 | if (bin > first) --bin; |
3493 | 0 | else bin = hts_bin_parent(bin); |
3494 | 0 | } while (bin); |
3495 | 0 | if (bin == 0) |
3496 | 0 | k = kh_get(bin, bidx, bin); |
3497 | 0 | min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0; |
3498 | | // min_off can be calculated more accurately if the |
3499 | | // linear index is available |
3500 | 0 | if (idx->lidx[tid].offset |
3501 | 0 | && rel_off < idx->lidx[tid].n) { |
3502 | 0 | if (min_off < idx->lidx[tid].offset[rel_off]) |
3503 | 0 | min_off = idx->lidx[tid].offset[rel_off]; |
3504 | 0 | if (unmapped) { |
3505 | 0 | int tmp_off; |
3506 | 0 | for (tmp_off = rel_off-1; tmp_off >= 0; tmp_off--) { |
3507 | 0 | if (idx->lidx[tid].offset[tmp_off] < min_off) { |
3508 | 0 | min_off = idx->lidx[tid].offset[tmp_off]; |
3509 | 0 | break; |
3510 | 0 | } |
3511 | 0 | } |
3512 | |
|
3513 | 0 | if (k != kh_end(bidx) && (min_off < kh_val(bidx, k).loff || tmp_off < 0)) |
3514 | 0 | min_off = kh_val(bidx, k).loff; |
3515 | 0 | } |
3516 | 0 | } else if (unmapped) { //CSI index |
3517 | 0 | if (k != kh_end(bidx)) |
3518 | 0 | min_off = kh_val(bidx, k).loff; |
3519 | 0 | } |
3520 | | |
3521 | | // compute max_off: a virtual offset from a bin to the right of end |
3522 | | // First check if end lies within the range of the index (it |
3523 | | // won't if it's HTS_POS_MAX) |
3524 | 0 | if (end < 1LL << (idx->min_shift + 3 * idx->n_lvls)) { |
3525 | 0 | bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1; |
3526 | 0 | if (bin >= idx->n_bins) bin = 0; |
3527 | 0 | while (1) { |
3528 | | // search for an extant bin by moving right, but moving up to the |
3529 | | // parent whenever we get to a first child (which also covers falling |
3530 | | // off the RHS, which wraps around and immediately goes up to bin 0) |
3531 | 0 | while (bin % 8 == 1) bin = hts_bin_parent(bin); |
3532 | 0 | if (bin == 0) { max_off = UINT64_MAX; break; } |
3533 | 0 | k = kh_get(bin, bidx, bin); |
3534 | 0 | if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) { |
3535 | 0 | max_off = kh_val(bidx, k).list[0].u; |
3536 | 0 | break; |
3537 | 0 | } |
3538 | 0 | bin++; |
3539 | 0 | } |
3540 | 0 | } else { |
3541 | | // Searching to end of reference |
3542 | 0 | max_off = UINT64_MAX; |
3543 | 0 | } |
3544 | | |
3545 | | //convert coordinates to file offsets |
3546 | 0 | if (reg2intervals(iter, idx, tid, beg, end, j, |
3547 | 0 | min_off, max_off, |
3548 | 0 | idx->min_shift, idx->n_lvls) < 0) { |
3549 | 0 | return -1; |
3550 | 0 | } |
3551 | 0 | } |
3552 | 0 | } |
3553 | 0 | } |
3554 | | |
3555 | 0 | if (iter->n_off > 1) |
3556 | 0 | ks_introsort(_off_max, iter->n_off, iter->off); |
3557 | |
|
3558 | 0 | if(!iter->n_off && !iter->nocoor) |
3559 | 0 | iter->finished = 1; |
3560 | |
|
3561 | 0 | return 0; |
3562 | 0 | } |
3563 | | |
3564 | | int hts_itr_multi_cram(const hts_idx_t *idx, hts_itr_t *iter) |
3565 | 0 | { |
3566 | 0 | const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; |
3567 | 0 | int tid, i, n_off = 0; |
3568 | 0 | uint32_t j; |
3569 | 0 | hts_pos_t beg, end; |
3570 | 0 | hts_reglist_t *curr_reg; |
3571 | 0 | hts_pair32_t *curr_intv; |
3572 | 0 | hts_pair64_max_t *off = NULL, *tmp; |
3573 | 0 | cram_index *e = NULL; |
3574 | |
|
3575 | 0 | if (!cidx || !iter || !iter->multi) |
3576 | 0 | return -1; |
3577 | | |
3578 | 0 | iter->is_cram = 1; |
3579 | 0 | iter->read_rest = 0; |
3580 | 0 | iter->off = NULL; |
3581 | 0 | iter->n_off = 0; |
3582 | 0 | iter->curr_off = 0; |
3583 | 0 | iter->i = -1; |
3584 | |
|
3585 | 0 | for (i=0; i<iter->n_reg; i++) { |
3586 | |
|
3587 | 0 | curr_reg = &iter->reg_list[i]; |
3588 | 0 | tid = curr_reg->tid; |
3589 | |
|
3590 | 0 | if (tid >= 0) { |
3591 | 0 | tmp = realloc(off, (n_off + curr_reg->count) * sizeof(*off)); |
3592 | 0 | if (!tmp) |
3593 | 0 | goto err; |
3594 | 0 | off = tmp; |
3595 | |
|
3596 | 0 | for (j=0; j < curr_reg->count; j++) { |
3597 | 0 | curr_intv = &curr_reg->intervals[j]; |
3598 | 0 | if (curr_intv->end < curr_intv->beg) |
3599 | 0 | continue; |
3600 | | |
3601 | 0 | beg = curr_intv->beg; |
3602 | 0 | end = curr_intv->end; |
3603 | | |
3604 | | /* First, fetch the container overlapping 'beg' and assign its file offset to u, then |
3605 | | * find the container overlapping 'end' and assign the relative end of the slice to v. |
3606 | | * The cram_ptell function will adjust with the container offset, which is not stored |
3607 | | * in the index. |
3608 | | */ |
3609 | 0 | e = cram_index_query(cidx->cram, tid, beg+1, NULL); |
3610 | 0 | if (e) { |
3611 | 0 | off[n_off].u = e->offset; |
3612 | | // hts_pair64_max_t::max is now used to link |
3613 | | // file offsets to region list entries. |
3614 | | // The iterator can use this to decide if it |
3615 | | // can skip some file regions. |
3616 | 0 | off[n_off].max = ((uint64_t) tid << 32) | j; |
3617 | |
|
3618 | 0 | if (end >= HTS_POS_MAX) { |
3619 | 0 | e = cram_index_last(cidx->cram, tid, NULL); |
3620 | 0 | } else { |
3621 | 0 | e = cram_index_query_last(cidx->cram, tid, end+1); |
3622 | 0 | } |
3623 | |
|
3624 | 0 | if (e) { |
3625 | 0 | off[n_off++].v = e->e_next |
3626 | 0 | ? e->e_next->offset |
3627 | 0 | : e->offset + e->slice + e->len; |
3628 | 0 | } else { |
3629 | 0 | hts_log_warning("Could not set offset end for region %d:%"PRIhts_pos"-%"PRIhts_pos". Skipping", tid, beg, end); |
3630 | 0 | } |
3631 | 0 | } |
3632 | 0 | } |
3633 | 0 | } else { |
3634 | 0 | switch (tid) { |
3635 | 0 | case HTS_IDX_NOCOOR: |
3636 | 0 | e = cram_index_query(cidx->cram, tid, 1, NULL); |
3637 | 0 | if (e) { |
3638 | 0 | iter->nocoor = 1; |
3639 | 0 | iter->nocoor_off = e->offset; |
3640 | 0 | } else { |
3641 | 0 | hts_log_warning("No index entry for NOCOOR region"); |
3642 | 0 | } |
3643 | 0 | break; |
3644 | 0 | case HTS_IDX_START: |
3645 | 0 | e = cram_index_query(cidx->cram, tid, 1, NULL); |
3646 | 0 | if (e) { |
3647 | 0 | iter->read_rest = 1; |
3648 | 0 | tmp = realloc(off, sizeof(*off)); |
3649 | 0 | if (!tmp) |
3650 | 0 | goto err; |
3651 | 0 | off = tmp; |
3652 | 0 | off[0].u = e->offset; |
3653 | 0 | off[0].v = 0; |
3654 | 0 | n_off=1; |
3655 | 0 | } else { |
3656 | 0 | hts_log_warning("No index entries"); |
3657 | 0 | } |
3658 | 0 | break; |
3659 | 0 | case HTS_IDX_REST: |
3660 | 0 | break; |
3661 | 0 | case HTS_IDX_NONE: |
3662 | 0 | iter->finished = 1; |
3663 | 0 | break; |
3664 | 0 | default: |
3665 | 0 | hts_log_error("Query with tid=%d not implemented for CRAM files", tid); |
3666 | 0 | } |
3667 | 0 | } |
3668 | 0 | } |
3669 | | |
3670 | 0 | if (n_off) { |
3671 | 0 | ks_introsort(_off_max, n_off, off); |
3672 | 0 | iter->n_off = n_off; iter->off = off; |
3673 | 0 | } |
3674 | |
|
3675 | 0 | if(!n_off && !iter->nocoor) |
3676 | 0 | iter->finished = 1; |
3677 | |
|
3678 | 0 | return 0; |
3679 | | |
3680 | 0 | err: |
3681 | 0 | free(off); |
3682 | 0 | return -1; |
3683 | 0 | } |
3684 | | |
3685 | | void hts_itr_destroy(hts_itr_t *iter) |
3686 | 0 | { |
3687 | 0 | if (iter) { |
3688 | 0 | if (iter->multi) { |
3689 | 0 | hts_reglist_free(iter->reg_list, iter->n_reg); |
3690 | 0 | } else { |
3691 | 0 | free(iter->bins.a); |
3692 | 0 | } |
3693 | |
|
3694 | 0 | if (iter->off) |
3695 | 0 | free(iter->off); |
3696 | 0 | free(iter); |
3697 | 0 | } |
3698 | 0 | } |
3699 | | |
3700 | | static inline long long push_digit(long long i, char c) |
3701 | 0 | { |
3702 | | // ensure subtraction occurs first, avoiding overflow for >= MAX-48 or so |
3703 | 0 | int digit = c - '0'; |
3704 | 0 | return 10 * i + digit; |
3705 | 0 | } |
3706 | | |
3707 | | long long hts_parse_decimal(const char *str, char **strend, int flags) |
3708 | 0 | { |
3709 | 0 | long long n = 0; |
3710 | 0 | int digits = 0, decimals = 0, e = 0, lost = 0; |
3711 | 0 | char sign = '+', esign = '+'; |
3712 | 0 | const char *s, *str_orig = str; |
3713 | |
|
3714 | 0 | while (isspace_c(*str)) str++; |
3715 | 0 | s = str; |
3716 | |
|
3717 | 0 | if (*s == '+' || *s == '-') sign = *s++; |
3718 | 0 | while (*s) |
3719 | 0 | if (isdigit_c(*s)) digits++, n = push_digit(n, *s++); |
3720 | 0 | else if (*s == ',' && (flags & HTS_PARSE_THOUSANDS_SEP)) s++; |
3721 | 0 | else break; |
3722 | |
|
3723 | 0 | if (*s == '.') { |
3724 | 0 | s++; |
3725 | 0 | while (isdigit_c(*s)) decimals++, digits++, n = push_digit(n, *s++); |
3726 | 0 | } |
3727 | |
|
3728 | 0 | switch (*s) { |
3729 | 0 | case 'e': case 'E': |
3730 | 0 | s++; |
3731 | 0 | if (*s == '+' || *s == '-') esign = *s++; |
3732 | 0 | while (isdigit_c(*s)) e = push_digit(e, *s++); |
3733 | 0 | if (esign == '-') e = -e; |
3734 | 0 | break; |
3735 | | |
3736 | 0 | case 'k': case 'K': e += 3; s++; break; |
3737 | 0 | case 'm': case 'M': e += 6; s++; break; |
3738 | 0 | case 'g': case 'G': e += 9; s++; break; |
3739 | 0 | } |
3740 | | |
3741 | 0 | e -= decimals; |
3742 | 0 | while (e > 0) n *= 10, e--; |
3743 | 0 | while (e < 0) lost += n % 10, n /= 10, e++; |
3744 | |
|
3745 | 0 | if (lost > 0) { |
3746 | 0 | hts_log_warning("Discarding fractional part of %.*s", (int)(s - str), str); |
3747 | 0 | } |
3748 | |
|
3749 | 0 | if (strend) { |
3750 | | // Set to the original input str pointer if not valid number syntax |
3751 | 0 | *strend = (digits > 0)? (char *)s : (char *)str_orig; |
3752 | 0 | } else if (digits == 0) { |
3753 | 0 | hts_log_warning("Invalid numeric value %.8s[truncated]", str); |
3754 | 0 | } else if (*s) { |
3755 | 0 | if ((flags & HTS_PARSE_THOUSANDS_SEP) || (!(flags & HTS_PARSE_THOUSANDS_SEP) && *s != ',')) |
3756 | 0 | hts_log_warning("Ignoring unknown characters after %.*s[%s]", (int)(s - str), str, s); |
3757 | 0 | } |
3758 | |
|
3759 | 0 | return (sign == '+')? n : -n; |
3760 | 0 | } |
3761 | | |
3762 | 0 | static void *hts_memrchr(const void *s, int c, size_t n) { |
3763 | 0 | size_t i; |
3764 | 0 | unsigned char *u = (unsigned char *)s; |
3765 | 0 | for (i = n; i > 0; i--) { |
3766 | 0 | if (u[i-1] == c) |
3767 | 0 | return u+i-1; |
3768 | 0 | } |
3769 | | |
3770 | 0 | return NULL; |
3771 | 0 | } |
3772 | | |
3773 | | /* |
3774 | | * A variant of hts_parse_reg which is reference-id aware. It uses |
3775 | | * the iterator name2id callbacks to validate the region tokenisation works. |
3776 | | * |
3777 | | * This is necessary due to GRCh38 HLA additions which have reference names |
3778 | | * like "HLA-DRB1*12:17". |
3779 | | * |
3780 | | * All parameters are mandatory. |
3781 | | * |
3782 | | * To work around ambiguous parsing issues, eg both "chr1" and "chr1:100-200" |
3783 | | * are reference names, we may quote using curly braces. |
3784 | | * Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example. |
3785 | | * |
3786 | | * Flags are used to control how parsing works, and can be one of the below. |
3787 | | * |
3788 | | * HTS_PARSE_LIST: |
3789 | | * If present, the region is assmed to be a comma separated list and |
3790 | | * position parsing will not contain commas (this implicitly |
3791 | | * clears HTS_PARSE_THOUSANDS_SEP in the call to hts_parse_decimal). |
3792 | | * On success the return pointer will be the start of the next region, ie |
3793 | | * the character after the comma. (If *ret != '\0' then the caller can |
3794 | | * assume another region is present in the list.) |
3795 | | * |
3796 | | * If not set then positions may contain commas. In this case the return |
3797 | | * value should point to the end of the string, or NULL on failure. |
3798 | | * |
3799 | | * HTS_PARSE_ONE_COORD: |
3800 | | * If present, X:100 is treated as the single base pair region X:100-100. |
3801 | | * In this case X:-100 is shorthand for X:1-100 and X:100- is X:100-<end>. |
3802 | | * (This is the standard bcftools region convention.) |
3803 | | * |
3804 | | * When not set X:100 is considered to be X:100-<end> where <end> is |
3805 | | * the end of chromosome X (set to HTS_POS_MAX here). X:100- and X:-100 |
3806 | | * are invalid. |
3807 | | * (This is the standard samtools region convention.) |
3808 | | * |
3809 | | * Note the supplied string expects 1 based inclusive coordinates, but the |
3810 | | * returned coordinates start from 0 and are half open, so pos0 is valid |
3811 | | * for use in e.g. "for (pos0 = beg; pos0 < end; pos0++) {...}" |
3812 | | * |
3813 | | * On success a pointer to the byte after the end of the entire region |
3814 | | * specifier is returned (plus any trailing comma), and tid, |
3815 | | * beg & end will be set. |
3816 | | * On failure NULL is returned. |
3817 | | */ |
3818 | | const char *hts_parse_region(const char *s, int *tid, hts_pos_t *beg, |
3819 | | hts_pos_t *end, hts_name2id_f getid, void *hdr, |
3820 | | int flags) |
3821 | 0 | { |
3822 | 0 | if (!s || !tid || !beg || !end || !getid) |
3823 | 0 | return NULL; |
3824 | | |
3825 | 0 | size_t s_len = strlen(s); |
3826 | 0 | kstring_t ks = { 0, 0, NULL }; |
3827 | |
|
3828 | 0 | const char *colon = NULL, *comma = NULL; |
3829 | 0 | int quoted = 0; |
3830 | |
|
3831 | 0 | if (flags & HTS_PARSE_LIST) |
3832 | 0 | flags &= ~HTS_PARSE_THOUSANDS_SEP; |
3833 | 0 | else |
3834 | 0 | flags |= HTS_PARSE_THOUSANDS_SEP; |
3835 | |
|
3836 | 0 | const char *s_end = s + s_len; |
3837 | | |
3838 | | // Braced quoting of references is permitted to resolve ambiguities. |
3839 | 0 | if (*s == '{') { |
3840 | 0 | const char *close = memchr(s, '}', s_len); |
3841 | 0 | if (!close) { |
3842 | 0 | hts_log_error("Mismatching braces in \"%s\"", s); |
3843 | 0 | *tid = -1; |
3844 | 0 | return NULL; |
3845 | 0 | } |
3846 | 0 | s++; |
3847 | 0 | s_len--; |
3848 | 0 | if (close[1] == ':') |
3849 | 0 | colon = close+1; |
3850 | 0 | quoted = 1; // number of trailing characters to trim |
3851 | | |
3852 | | // Truncate to this item only, if appropriate. |
3853 | 0 | if (flags & HTS_PARSE_LIST) { |
3854 | 0 | comma = strchr(close, ','); |
3855 | 0 | if (comma) { |
3856 | 0 | s_len = comma-s; |
3857 | 0 | s_end = comma+1; |
3858 | 0 | } |
3859 | 0 | } |
3860 | 0 | } else { |
3861 | | // Truncate to this item only, if appropriate. |
3862 | 0 | if (flags & HTS_PARSE_LIST) { |
3863 | 0 | comma = strchr(s, ','); |
3864 | 0 | if (comma) { |
3865 | 0 | s_len = comma-s; |
3866 | 0 | s_end = comma+1; |
3867 | 0 | } |
3868 | 0 | } |
3869 | |
|
3870 | 0 | colon = hts_memrchr(s, ':', s_len); |
3871 | 0 | } |
3872 | | |
3873 | | // No colon is simplest case; just check and return. |
3874 | 0 | if (colon == NULL) { |
3875 | 0 | *beg = 0; *end = HTS_POS_MAX; |
3876 | 0 | kputsn(s, s_len-quoted, &ks); // convert to nul terminated string |
3877 | 0 | if (!ks.s) { |
3878 | 0 | *tid = -2; |
3879 | 0 | return NULL; |
3880 | 0 | } |
3881 | | |
3882 | 0 | *tid = getid(hdr, ks.s); |
3883 | 0 | free(ks.s); |
3884 | |
|
3885 | 0 | return *tid >= 0 ? s_end : NULL; |
3886 | 0 | } |
3887 | | |
3888 | | // Has a colon, but check whole name first. |
3889 | 0 | if (!quoted) { |
3890 | 0 | *beg = 0; *end = HTS_POS_MAX; |
3891 | 0 | kputsn(s, s_len, &ks); // convert to nul terminated string |
3892 | 0 | if (!ks.s) { |
3893 | 0 | *tid = -2; |
3894 | 0 | return NULL; |
3895 | 0 | } |
3896 | 0 | if ((*tid = getid(hdr, ks.s)) >= 0) { |
3897 | | // Entire name matches, but also check this isn't |
3898 | | // ambiguous. eg we have ref chr1 and ref chr1:100-200 |
3899 | | // both present. |
3900 | 0 | ks.l = 0; |
3901 | 0 | kputsn(s, colon-s, &ks); // convert to nul terminated string |
3902 | 0 | if (!ks.s) { |
3903 | 0 | *tid = -2; |
3904 | 0 | return NULL; |
3905 | 0 | } |
3906 | 0 | if (getid(hdr, ks.s) >= 0) { |
3907 | 0 | free(ks.s); |
3908 | 0 | *tid = -1; |
3909 | 0 | hts_log_error("Range is ambiguous. " |
3910 | 0 | "Use {%s} or {%.*s}%s instead", |
3911 | 0 | s, (int)(colon-s), s, colon); |
3912 | 0 | return NULL; |
3913 | 0 | } |
3914 | 0 | free(ks.s); |
3915 | |
|
3916 | 0 | return s_end; |
3917 | 0 | } |
3918 | 0 | if (*tid < -1) // Failed to parse header |
3919 | 0 | return NULL; |
3920 | 0 | } |
3921 | | |
3922 | | // Quoted, or unquoted and whole string isn't a name. |
3923 | | // Check the pre-colon part is valid. |
3924 | 0 | ks.l = 0; |
3925 | 0 | kputsn(s, colon-s-quoted, &ks); // convert to nul terminated string |
3926 | 0 | if (!ks.s) { |
3927 | 0 | *tid = -2; |
3928 | 0 | return NULL; |
3929 | 0 | } |
3930 | 0 | *tid = getid(hdr, ks.s); |
3931 | 0 | free(ks.s); |
3932 | 0 | if (*tid < 0) |
3933 | 0 | return NULL; |
3934 | | |
3935 | | // Finally parse the post-colon coordinates |
3936 | 0 | char *hyphen; |
3937 | 0 | *beg = hts_parse_decimal(colon+1, &hyphen, flags) - 1; |
3938 | 0 | if (*beg < 0) { |
3939 | 0 | if (*beg != -1 && *hyphen == '-' && colon[1] != '\0') { |
3940 | | // User specified zero, but we're 1-based. |
3941 | 0 | hts_log_error("Coordinates must be > 0"); |
3942 | 0 | return NULL; |
3943 | 0 | } |
3944 | 0 | if (isdigit_c(*hyphen) || *hyphen == '\0' || *hyphen == ',') { |
3945 | | // interpret chr:-100 as chr:1-100 |
3946 | 0 | *end = *beg==-1 ? HTS_POS_MAX : -(*beg+1); |
3947 | 0 | *beg = 0; |
3948 | 0 | return s_end; |
3949 | 0 | } else if (*beg < -1) { |
3950 | 0 | hts_log_error("Unexpected string \"%s\" after region", hyphen); |
3951 | 0 | return NULL; |
3952 | 0 | } |
3953 | 0 | } |
3954 | | |
3955 | 0 | if (*hyphen == '\0' || ((flags & HTS_PARSE_LIST) && *hyphen == ',')) { |
3956 | 0 | *end = flags & HTS_PARSE_ONE_COORD ? *beg+1 : HTS_POS_MAX; |
3957 | 0 | } else if (*hyphen == '-') { |
3958 | 0 | *end = hts_parse_decimal(hyphen+1, &hyphen, flags); |
3959 | 0 | if (*hyphen != '\0' && *hyphen != ',') { |
3960 | 0 | hts_log_error("Unexpected string \"%s\" after region", hyphen); |
3961 | 0 | return NULL; |
3962 | 0 | } |
3963 | 0 | } else { |
3964 | 0 | hts_log_error("Unexpected string \"%s\" after region", hyphen); |
3965 | 0 | return NULL; |
3966 | 0 | } |
3967 | | |
3968 | 0 | if (*end == 0) |
3969 | 0 | *end = HTS_POS_MAX; // interpret chr:100- as chr:100-<end> |
3970 | |
|
3971 | 0 | if (*beg >= *end) return NULL; |
3972 | | |
3973 | 0 | return s_end; |
3974 | 0 | } |
3975 | | |
3976 | | // Next release we should mark this as deprecated? |
3977 | | // Use hts_parse_region above instead. |
3978 | | const char *hts_parse_reg64(const char *s, hts_pos_t *beg, hts_pos_t *end) |
3979 | 0 | { |
3980 | 0 | char *hyphen; |
3981 | 0 | const char *colon = strrchr(s, ':'); |
3982 | 0 | if (colon == NULL) { |
3983 | 0 | *beg = 0; *end = HTS_POS_MAX; |
3984 | 0 | return s + strlen(s); |
3985 | 0 | } |
3986 | | |
3987 | 0 | *beg = hts_parse_decimal(colon+1, &hyphen, HTS_PARSE_THOUSANDS_SEP) - 1; |
3988 | 0 | if (*beg < 0) *beg = 0; |
3989 | |
|
3990 | 0 | if (*hyphen == '\0') *end = HTS_POS_MAX; |
3991 | 0 | else if (*hyphen == '-') *end = hts_parse_decimal(hyphen+1, NULL, HTS_PARSE_THOUSANDS_SEP); |
3992 | 0 | else return NULL; |
3993 | | |
3994 | 0 | if (*beg >= *end) return NULL; |
3995 | 0 | return colon; |
3996 | 0 | } |
3997 | | |
3998 | | const char *hts_parse_reg(const char *s, int *beg, int *end) |
3999 | 0 | { |
4000 | 0 | hts_pos_t beg64 = 0, end64 = 0; |
4001 | 0 | const char *colon = hts_parse_reg64(s, &beg64, &end64); |
4002 | 0 | if (beg64 > INT_MAX) { |
4003 | 0 | hts_log_error("Position %"PRId64" too large", beg64); |
4004 | 0 | return NULL; |
4005 | 0 | } |
4006 | 0 | if (end64 > INT_MAX) { |
4007 | 0 | if (end64 == HTS_POS_MAX) { |
4008 | 0 | end64 = INT_MAX; |
4009 | 0 | } else { |
4010 | 0 | hts_log_error("Position %"PRId64" too large", end64); |
4011 | 0 | return NULL; |
4012 | 0 | } |
4013 | 0 | } |
4014 | 0 | *beg = beg64; |
4015 | 0 | *end = end64; |
4016 | 0 | return colon; |
4017 | 0 | } |
4018 | | |
4019 | | hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec) |
4020 | 0 | { |
4021 | 0 | int tid; |
4022 | 0 | hts_pos_t beg, end; |
4023 | |
|
4024 | 0 | if (strcmp(reg, ".") == 0) |
4025 | 0 | return itr_query(idx, HTS_IDX_START, 0, 0, readrec); |
4026 | 0 | else if (strcmp(reg, "*") == 0) |
4027 | 0 | return itr_query(idx, HTS_IDX_NOCOOR, 0, 0, readrec); |
4028 | | |
4029 | 0 | if (!hts_parse_region(reg, &tid, &beg, &end, getid, hdr, HTS_PARSE_THOUSANDS_SEP)) |
4030 | 0 | return NULL; |
4031 | | |
4032 | 0 | return itr_query(idx, tid, beg, end, readrec); |
4033 | 0 | } |
4034 | | |
4035 | 0 | hts_itr_t *hts_itr_regions(const hts_idx_t *idx, hts_reglist_t *reglist, int count, hts_name2id_f getid, void *hdr, hts_itr_multi_query_func *itr_specific, hts_readrec_func *readrec, hts_seek_func *seek, hts_tell_func *tell) { |
4036 | |
|
4037 | 0 | int i; |
4038 | |
|
4039 | 0 | if (!reglist) |
4040 | 0 | return NULL; |
4041 | | |
4042 | 0 | hts_itr_t *itr = (hts_itr_t*)calloc(1, sizeof(hts_itr_t)); |
4043 | 0 | if (itr) { |
4044 | 0 | itr->n_reg = count; |
4045 | 0 | itr->readrec = readrec; |
4046 | 0 | itr->seek = seek; |
4047 | 0 | itr->tell = tell; |
4048 | 0 | itr->reg_list = reglist; |
4049 | 0 | itr->finished = 0; |
4050 | 0 | itr->nocoor = 0; |
4051 | 0 | itr->multi = 1; |
4052 | |
|
4053 | 0 | for (i = 0; i < itr->n_reg; i++) { |
4054 | 0 | if (itr->reg_list[i].reg) { |
4055 | 0 | if (!strcmp(itr->reg_list[i].reg, ".")) { |
4056 | 0 | itr->reg_list[i].tid = HTS_IDX_START; |
4057 | 0 | continue; |
4058 | 0 | } |
4059 | | |
4060 | 0 | if (!strcmp(itr->reg_list[i].reg, "*")) { |
4061 | 0 | itr->reg_list[i].tid = HTS_IDX_NOCOOR; |
4062 | 0 | continue; |
4063 | 0 | } |
4064 | | |
4065 | 0 | itr->reg_list[i].tid = getid(hdr, reglist[i].reg); |
4066 | 0 | if (itr->reg_list[i].tid < 0) { |
4067 | 0 | if (itr->reg_list[i].tid < -1) { |
4068 | 0 | hts_log_error("Failed to parse header"); |
4069 | 0 | hts_itr_destroy(itr); |
4070 | 0 | return NULL; |
4071 | 0 | } else { |
4072 | 0 | hts_log_warning("Region '%s' specifies an unknown reference name. Continue anyway", reglist[i].reg); |
4073 | 0 | } |
4074 | 0 | } |
4075 | 0 | } |
4076 | 0 | } |
4077 | | |
4078 | 0 | qsort(itr->reg_list, itr->n_reg, sizeof(hts_reglist_t), compare_regions); |
4079 | 0 | if (itr_specific(idx, itr) != 0) { |
4080 | 0 | hts_log_error("Failed to create the multi-region iterator!"); |
4081 | 0 | hts_itr_destroy(itr); |
4082 | 0 | itr = NULL; |
4083 | 0 | } |
4084 | 0 | } |
4085 | | |
4086 | 0 | return itr; |
4087 | 0 | } |
4088 | | |
4089 | | int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data) |
4090 | 0 | { |
4091 | 0 | int ret, tid; |
4092 | 0 | hts_pos_t beg, end; |
4093 | 0 | if (iter == NULL || iter->finished) return -1; |
4094 | 0 | if (iter->read_rest) { |
4095 | 0 | if (iter->curr_off) { // seek to the start |
4096 | 0 | if (bgzf_seek(fp, iter->curr_off, SEEK_SET) < 0) { |
4097 | 0 | hts_log_error("Failed to seek to offset %"PRIu64"%s%s", |
4098 | 0 | iter->curr_off, |
4099 | 0 | errno ? ": " : "", strerror(errno)); |
4100 | 0 | return -2; |
4101 | 0 | } |
4102 | 0 | iter->curr_off = 0; // only seek once |
4103 | 0 | } |
4104 | 0 | ret = iter->readrec(fp, data, r, &tid, &beg, &end); |
4105 | 0 | if (ret < 0) iter->finished = 1; |
4106 | 0 | iter->curr_tid = tid; |
4107 | 0 | iter->curr_beg = beg; |
4108 | 0 | iter->curr_end = end; |
4109 | 0 | return ret; |
4110 | 0 | } |
4111 | | // A NULL iter->off should always be accompanied by iter->finished. |
4112 | 0 | assert(iter->off != NULL); |
4113 | 0 | for (;;) { |
4114 | 0 | if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk |
4115 | 0 | if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks |
4116 | 0 | if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek |
4117 | 0 | if (bgzf_seek(fp, iter->off[iter->i+1].u, SEEK_SET) < 0) { |
4118 | 0 | hts_log_error("Failed to seek to offset %"PRIu64"%s%s", |
4119 | 0 | iter->off[iter->i+1].u, |
4120 | 0 | errno ? ": " : "", strerror(errno)); |
4121 | 0 | return -2; |
4122 | 0 | } |
4123 | 0 | iter->curr_off = bgzf_tell(fp); |
4124 | 0 | } |
4125 | 0 | ++iter->i; |
4126 | 0 | } |
4127 | 0 | if ((ret = iter->readrec(fp, data, r, &tid, &beg, &end)) >= 0) { |
4128 | 0 | iter->curr_off = bgzf_tell(fp); |
4129 | 0 | if (tid != iter->tid || beg >= iter->end) { // no need to proceed |
4130 | 0 | ret = -1; break; |
4131 | 0 | } else if (end > iter->beg && iter->end > beg) { |
4132 | 0 | iter->curr_tid = tid; |
4133 | 0 | iter->curr_beg = beg; |
4134 | 0 | iter->curr_end = end; |
4135 | 0 | return ret; |
4136 | 0 | } |
4137 | 0 | } else break; // end of file or error |
4138 | 0 | } |
4139 | 0 | iter->finished = 1; |
4140 | 0 | return ret; |
4141 | 0 | } |
4142 | | |
4143 | | int hts_itr_multi_next(htsFile *fd, hts_itr_t *iter, void *r) |
4144 | 0 | { |
4145 | 0 | void *fp; |
4146 | 0 | int ret, tid, i, cr, ci; |
4147 | 0 | hts_pos_t beg, end; |
4148 | 0 | hts_reglist_t *found_reg; |
4149 | |
|
4150 | 0 | if (iter == NULL || iter->finished) return -1; |
4151 | | |
4152 | 0 | if (iter->is_cram) { |
4153 | 0 | fp = fd->fp.cram; |
4154 | 0 | } else { |
4155 | 0 | fp = fd->fp.bgzf; |
4156 | 0 | } |
4157 | |
|
4158 | 0 | if (iter->read_rest) { |
4159 | 0 | if (iter->curr_off) { // seek to the start |
4160 | 0 | if (iter->seek(fp, iter->curr_off, SEEK_SET) < 0) { |
4161 | 0 | hts_log_error("Seek at offset %" PRIu64 " failed.", iter->curr_off); |
4162 | 0 | return -1; |
4163 | 0 | } |
4164 | 0 | iter->curr_off = 0; // only seek once |
4165 | 0 | } |
4166 | | |
4167 | 0 | ret = iter->readrec(fp, fd, r, &tid, &beg, &end); |
4168 | 0 | if (ret < 0) |
4169 | 0 | iter->finished = 1; |
4170 | |
|
4171 | 0 | iter->curr_tid = tid; |
4172 | 0 | iter->curr_beg = beg; |
4173 | 0 | iter->curr_end = end; |
4174 | |
|
4175 | 0 | return ret; |
4176 | 0 | } |
4177 | | // A NULL iter->off should always be accompanied by iter->finished. |
4178 | 0 | assert(iter->off != NULL || iter->nocoor != 0); |
4179 | | |
4180 | 0 | int next_range = 0; |
4181 | 0 | for (;;) { |
4182 | | // Note that due to the way bam indexing works, iter->off may contain |
4183 | | // file chunks that are not actually needed as they contain data |
4184 | | // beyond the end of the requested region. These are filtered out |
4185 | | // by comparing the tid and index into hts_reglist_t::intervals |
4186 | | // (packed for reasons of convenience into iter->off[iter->i].max) |
4187 | | // associated with the file region with iter->curr_tid and |
4188 | | // iter->curr_intv. |
4189 | |
|
4190 | 0 | if (next_range |
4191 | 0 | || iter->curr_off == 0 |
4192 | 0 | || iter->i >= iter->n_off |
4193 | 0 | || iter->curr_off >= iter->off[iter->i].v |
4194 | 0 | || (iter->off[iter->i].max >> 32 == iter->curr_tid |
4195 | 0 | && (iter->off[iter->i].max & 0xffffffff) < iter->curr_intv)) { |
4196 | | |
4197 | | // Jump to the next chunk. It may be necessary to skip more |
4198 | | // than one as the iter->off list can include overlapping entries. |
4199 | 0 | do { |
4200 | 0 | iter->i++; |
4201 | 0 | } while (iter->i < iter->n_off |
4202 | 0 | && (iter->curr_off >= iter->off[iter->i].v |
4203 | 0 | || (iter->off[iter->i].max >> 32 == iter->curr_tid |
4204 | 0 | && (iter->off[iter->i].max & 0xffffffff) < iter->curr_intv))); |
4205 | |
|
4206 | 0 | if (iter->is_cram && iter->i < iter->n_off) { |
4207 | | // Ensure iter->curr_reg is correct. |
4208 | | // |
4209 | | // We need this for CRAM as we shortcut some of the later |
4210 | | // logic by getting an end-of-range and continuing to the |
4211 | | // next offset. |
4212 | | // |
4213 | | // We cannot do this for BAM (and fortunately do not need to |
4214 | | // either) because in BAM world a query to genomic positions |
4215 | | // GX and GY leading to a seek offsets PX and PY may have |
4216 | | // GX > GY and PX < PY. (This is due to the R-tree and falling |
4217 | | // between intervals, bumping up to a higher bin.) |
4218 | | // CRAM strictly follows PX >= PY if GX >= GY, so this logic |
4219 | | // works. |
4220 | 0 | int want_tid = iter->off[iter->i].max >> 32; |
4221 | 0 | if (!(iter->curr_reg < iter->n_reg && |
4222 | 0 | iter->reg_list[iter->curr_reg].tid == want_tid)) { |
4223 | 0 | int j; |
4224 | 0 | for (j = 0; j < iter->n_reg; j++) |
4225 | 0 | if (iter->reg_list[j].tid == want_tid) |
4226 | 0 | break; |
4227 | 0 | if (j == iter->n_reg) |
4228 | 0 | return -1; |
4229 | 0 | iter->curr_reg = j; |
4230 | 0 | iter->curr_tid = iter->reg_list[iter->curr_reg].tid; |
4231 | 0 | }; |
4232 | 0 | iter->curr_intv = iter->off[iter->i].max & 0xffffffff; |
4233 | 0 | } |
4234 | | |
4235 | 0 | if (iter->i >= iter->n_off) { // no more chunks, except NOCOORs |
4236 | 0 | if (iter->nocoor) { |
4237 | 0 | next_range = 0; |
4238 | 0 | if (iter->seek(fp, iter->nocoor_off, SEEK_SET) < 0) { |
4239 | 0 | hts_log_error("Seek at offset %" PRIu64 " failed.", iter->nocoor_off); |
4240 | 0 | return -1; |
4241 | 0 | } |
4242 | 0 | if (iter->is_cram) { |
4243 | 0 | cram_range r = { HTS_IDX_NOCOOR }; |
4244 | 0 | cram_set_option(fp, CRAM_OPT_RANGE_NOSEEK, &r); |
4245 | 0 | } |
4246 | | |
4247 | | // The first slice covering the unmapped reads might |
4248 | | // contain a few mapped reads, so scroll |
4249 | | // forward until finding the first unmapped read. |
4250 | 0 | do { |
4251 | 0 | ret = iter->readrec(fp, fd, r, &tid, &beg, &end); |
4252 | 0 | } while (tid >= 0 && ret >=0); |
4253 | |
|
4254 | 0 | if (ret < 0) |
4255 | 0 | iter->finished = 1; |
4256 | 0 | else |
4257 | 0 | iter->read_rest = 1; |
4258 | |
|
4259 | 0 | iter->curr_off = 0; // don't seek any more |
4260 | 0 | iter->curr_tid = tid; |
4261 | 0 | iter->curr_beg = beg; |
4262 | 0 | iter->curr_end = end; |
4263 | |
|
4264 | 0 | return ret; |
4265 | 0 | } else { |
4266 | 0 | ret = -1; break; |
4267 | 0 | } |
4268 | 0 | } else if (iter->i < iter->n_off) { |
4269 | | // New chunk may overlap the last one, so ensure we |
4270 | | // only seek forwards. |
4271 | 0 | if (iter->curr_off < iter->off[iter->i].u || next_range) { |
4272 | 0 | iter->curr_off = iter->off[iter->i].u; |
4273 | | |
4274 | | // CRAM has the capability of setting an end location. |
4275 | | // This means multi-threaded decodes can stop once they |
4276 | | // reach that point, rather than pointlessly decoding |
4277 | | // more slices than we'll be using. |
4278 | | // |
4279 | | // We have to be careful here. Whenever we set the cram |
4280 | | // range we need a corresponding seek in order to ensure |
4281 | | // we can safely decode at that offset. We use next_range |
4282 | | // var to ensure this is always true; this is set on |
4283 | | // end-of-range condition. It's never modified for BAM. |
4284 | 0 | if (iter->is_cram) { |
4285 | | // Next offset.[uv] tuple, but it's already been |
4286 | | // included in our cram range, so don't seek and don't |
4287 | | // reset range so we can efficiently multi-thread. |
4288 | 0 | if (next_range || iter->curr_off >= iter->end) { |
4289 | 0 | if (iter->seek(fp, iter->curr_off, SEEK_SET) < 0) { |
4290 | 0 | hts_log_error("Seek at offset %" PRIu64 |
4291 | 0 | " failed.", iter->curr_off); |
4292 | 0 | return -1; |
4293 | 0 | } |
4294 | | |
4295 | | // Find the genomic range matching this interval. |
4296 | 0 | int j; |
4297 | 0 | hts_reglist_t *rl = &iter->reg_list[iter->curr_reg]; |
4298 | 0 | cram_range r = { |
4299 | 0 | rl->tid, |
4300 | 0 | rl->intervals[iter->curr_intv].beg, |
4301 | 0 | rl->intervals[iter->curr_intv].end |
4302 | 0 | }; |
4303 | | |
4304 | | // Expand it up to cover neighbouring intervals. |
4305 | | // Note we can only have a single chromosome in a |
4306 | | // range, so if we detect our blocks span chromosomes |
4307 | | // or we have a multi-ref mode slice, we just use |
4308 | | // HTS_IDX_START refid instead. This doesn't actually |
4309 | | // seek (due to CRAM_OPT_RANGE_NOSEEK) and is simply |
4310 | | // and indicator of decoding with no end limit. |
4311 | | // |
4312 | | // That isn't as efficient as it could be, but it's |
4313 | | // no poorer than before and it works. |
4314 | 0 | int tid = r.refid; |
4315 | 0 | int64_t end = r.end; |
4316 | 0 | int64_t v = iter->off[iter->i].v; |
4317 | 0 | j = iter->i+1; |
4318 | 0 | while (j < iter->n_off) { |
4319 | 0 | if (iter->off[j].u > v) |
4320 | 0 | break; |
4321 | | |
4322 | 0 | uint64_t max = iter->off[j].max; |
4323 | 0 | if ((max>>32) != tid) |
4324 | 0 | tid = HTS_IDX_START; // => no range limit |
4325 | |
|
4326 | 0 | if (end < rl->intervals[max & 0xffffffff].end) |
4327 | 0 | end = rl->intervals[max & 0xffffffff].end; |
4328 | 0 | if (v < iter->off[j].v) |
4329 | 0 | v = iter->off[j].v; |
4330 | 0 | j++; |
4331 | 0 | } |
4332 | 0 | r.refid = tid; |
4333 | 0 | r.end = end; |
4334 | | |
4335 | | // Remember maximum 'v' here so we don't do |
4336 | | // unnecessary subsequent seeks for the next |
4337 | | // regions. We can't change curr_off, but |
4338 | | // beg/end are used only by single region iterator so |
4339 | | // we cache it there to avoid changing the struct. |
4340 | 0 | iter->end = v; |
4341 | |
|
4342 | 0 | cram_set_option(fp, CRAM_OPT_RANGE_NOSEEK, &r); |
4343 | 0 | next_range = 0; |
4344 | 0 | } |
4345 | 0 | } else { // Not CRAM |
4346 | 0 | if (iter->seek(fp, iter->curr_off, SEEK_SET) < 0) { |
4347 | 0 | hts_log_error("Seek at offset %" PRIu64 " failed.", |
4348 | 0 | iter->curr_off); |
4349 | 0 | return -1; |
4350 | 0 | } |
4351 | 0 | } |
4352 | 0 | } |
4353 | 0 | } |
4354 | 0 | } |
4355 | | |
4356 | 0 | ret = iter->readrec(fp, fd, r, &tid, &beg, &end); |
4357 | 0 | if (ret < 0) { |
4358 | 0 | if (iter->is_cram && cram_eof(fp)) { |
4359 | | // Skip to end of range |
4360 | | // |
4361 | | // We should never be adjusting curr_off manually unless |
4362 | | // we also can guarantee we'll be doing a seek after to |
4363 | | // a new location. Otherwise we'll be reading wrong offset |
4364 | | // for the next container. |
4365 | | // |
4366 | | // We ensure this by adjusting our CRAM_OPT_RANGE |
4367 | | // accordingly above, but to double check we also |
4368 | | // set the skipped_block flag to enforce a seek also. |
4369 | 0 | iter->curr_off = iter->off[iter->i].v; |
4370 | 0 | next_range = 1; |
4371 | | |
4372 | | // Next region |
4373 | 0 | if (++iter->curr_intv >= iter->reg_list[iter->curr_reg].count){ |
4374 | 0 | if (++iter->curr_reg >= iter->n_reg) |
4375 | 0 | break; |
4376 | 0 | iter->curr_intv = 0; |
4377 | 0 | iter->curr_tid = iter->reg_list[iter->curr_reg].tid; |
4378 | 0 | } |
4379 | 0 | continue; |
4380 | 0 | } else { |
4381 | 0 | break; |
4382 | 0 | } |
4383 | 0 | } |
4384 | | |
4385 | 0 | iter->curr_off = iter->tell(fp); |
4386 | |
|
4387 | 0 | if (tid != iter->curr_tid) { |
4388 | 0 | hts_reglist_t key; |
4389 | 0 | key.tid = tid; |
4390 | |
|
4391 | 0 | found_reg = (hts_reglist_t *)bsearch(&key, iter->reg_list, |
4392 | 0 | iter->n_reg, |
4393 | 0 | sizeof(hts_reglist_t), |
4394 | 0 | compare_regions); |
4395 | 0 | if (!found_reg) |
4396 | 0 | continue; |
4397 | | |
4398 | 0 | iter->curr_reg = (found_reg - iter->reg_list); |
4399 | 0 | iter->curr_tid = tid; |
4400 | 0 | iter->curr_intv = 0; |
4401 | 0 | } |
4402 | | |
4403 | 0 | cr = iter->curr_reg; |
4404 | 0 | ci = iter->curr_intv; |
4405 | |
|
4406 | 0 | for (i = ci; i < iter->reg_list[cr].count; i++) { |
4407 | 0 | if (end > iter->reg_list[cr].intervals[i].beg && |
4408 | 0 | iter->reg_list[cr].intervals[i].end > beg) { |
4409 | 0 | iter->curr_beg = beg; |
4410 | 0 | iter->curr_end = end; |
4411 | 0 | iter->curr_intv = i; |
4412 | |
|
4413 | 0 | return ret; |
4414 | 0 | } |
4415 | | |
4416 | | // Check if the read starts beyond intervals[i].end |
4417 | | // If so, the interval is finished so move on to the next. |
4418 | 0 | if (beg > iter->reg_list[cr].intervals[i].end) |
4419 | 0 | iter->curr_intv = i + 1; |
4420 | | |
4421 | | // No need to keep searching if the read ends before intervals[i].beg |
4422 | 0 | if (end < iter->reg_list[cr].intervals[i].beg) |
4423 | 0 | break; |
4424 | 0 | } |
4425 | 0 | } |
4426 | 0 | iter->finished = 1; |
4427 | |
|
4428 | 0 | return ret; |
4429 | 0 | } |
4430 | | |
4431 | | /********************** |
4432 | | *** Retrieve index *** |
4433 | | **********************/ |
4434 | | // Local_fn and local_len will return a sub-region of 'fn'. |
4435 | | // Eg http://elsewhere/dir/foo.bam.bai?a=b may return |
4436 | | // foo.bam.bai via local_fn and local_len. |
4437 | | // |
4438 | | // Returns -1 if index couldn't be opened. |
4439 | | // -2 on other errors |
4440 | | static int idx_test_and_fetch(const char *fn, const char **local_fn, int *local_len, int download) |
4441 | 0 | { |
4442 | 0 | hFILE *remote_hfp = NULL; |
4443 | 0 | hFILE *local_fp = NULL; |
4444 | 0 | int save_errno; |
4445 | 0 | htsFormat fmt; |
4446 | 0 | kstring_t s = KS_INITIALIZE; |
4447 | 0 | kstring_t tmps = KS_INITIALIZE; |
4448 | |
|
4449 | 0 | if (hisremote(fn)) { |
4450 | 0 | const int buf_size = 1 * 1024 * 1024; |
4451 | 0 | int l; |
4452 | 0 | const char *p, *e; |
4453 | | // Ignore ?# params: eg any file.fmt?param=val, except for S3 URLs |
4454 | 0 | e = fn + ((strncmp(fn, "s3://", 5) && strncmp(fn, "s3+http://", 10) && strncmp(fn, "s3+https://", 11)) ? strcspn(fn, "?#") : strcspn(fn, "?")); |
4455 | | // Find the previous slash from there. |
4456 | 0 | p = e; |
4457 | 0 | while (p > fn && *p != '/') p--; |
4458 | 0 | if (*p == '/') p++; |
4459 | | |
4460 | | // Attempt to open local file first |
4461 | 0 | kputsn(p, e-p, &s); |
4462 | 0 | if (access(s.s, R_OK) == 0) |
4463 | 0 | { |
4464 | 0 | free(s.s); |
4465 | 0 | *local_fn = p; |
4466 | 0 | *local_len = e-p; |
4467 | 0 | return 0; |
4468 | 0 | } |
4469 | | |
4470 | | // Attempt to open remote file. Stay quiet on failure, it is OK to fail when trying first .csi then .bai or .tbi index. |
4471 | 0 | if ((remote_hfp = hopen(fn, "r")) == 0) { |
4472 | 0 | hts_log_info("Failed to open index file '%s'", fn); |
4473 | 0 | free(s.s); |
4474 | 0 | return -1; |
4475 | 0 | } |
4476 | 0 | if (hts_detect_format2(remote_hfp, fn, &fmt)) { |
4477 | 0 | hts_log_error("Failed to detect format of index file '%s'", fn); |
4478 | 0 | goto fail; |
4479 | 0 | } |
4480 | 0 | if (fmt.category != index_file || (fmt.format != bai && fmt.format != csi && fmt.format != tbi |
4481 | 0 | && fmt.format != crai && fmt.format != fai_format)) { |
4482 | 0 | hts_log_error("Format of index file '%s' is not supported", fn); |
4483 | 0 | goto fail; |
4484 | 0 | } |
4485 | | |
4486 | 0 | if (download) { |
4487 | 0 | if ((local_fp = hts_open_tmpfile(s.s, "wx", &tmps))
|