Coverage Report

Created: 2023-11-19 06:13

/src/htslib/hts.c
Line
Count
Source (jump to first uncovered line)
1
/*  hts.c -- format-neutral I/O, indexing, and iterator API functions.
2
3
    Copyright (C) 2008, 2009, 2012-2023 Genome Research Ltd.
4
    Copyright (C) 2012, 2013 Broad Institute.
5
6
    Author: Heng Li <lh3@sanger.ac.uk>
7
8
Permission is hereby granted, free of charge, to any person obtaining a copy
9
of this software and associated documentation files (the "Software"), to deal
10
in the Software without restriction, including without limitation the rights
11
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
copies of the Software, and to permit persons to whom the Software is
13
furnished to do so, subject to the following conditions:
14
15
The above copyright notice and this permission notice shall be included in
16
all copies or substantial portions of the Software.
17
18
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24
DEALINGS IN THE SOFTWARE.  */
25
26
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
27
#include <config.h>
28
29
#include <zlib.h>
30
#include <stdio.h>
31
#include <string.h>
32
#include <strings.h>
33
#include <stdlib.h>
34
#include <unistd.h>
35
#include <inttypes.h>
36
#include <limits.h>
37
#include <stdint.h>
38
#include <fcntl.h>
39
#include <errno.h>
40
#include <time.h>
41
#include <sys/stat.h>
42
#include <assert.h>
43
44
#ifdef HAVE_LIBLZMA
45
#ifdef HAVE_LZMA_H
46
#include <lzma.h>
47
#else
48
#include "os/lzma_stub.h"
49
#endif
50
#endif
51
52
#include "htslib/hts.h"
53
#include "htslib/bgzf.h"
54
#include "cram/cram.h"
55
#include "htslib/hfile.h"
56
#include "htslib/hts_endian.h"
57
#include "version.h"
58
#include "config_vars.h"
59
#include "hts_internal.h"
60
#include "hfile_internal.h"
61
#include "sam_internal.h"
62
#include "htslib/hts_expr.h"
63
#include "htslib/hts_os.h" // drand48
64
65
#include "htslib/khash.h"
66
#include "htslib/kseq.h"
67
#include "htslib/ksort.h"
68
#include "htslib/tbx.h"
69
#if defined(HAVE_EXTERNAL_LIBHTSCODECS)
70
#include <htscodecs/htscodecs.h>
71
#else
72
#include "htscodecs/htscodecs/htscodecs.h"
73
#endif
74
75
#ifndef EFTYPE
76
4
#define EFTYPE ENOEXEC
77
#endif
78
79
49.7k
KHASH_INIT2(s2i,, kh_cstr_t, int64_t, 1, kh_str_hash_func, kh_str_hash_equal)
80
81
HTSLIB_EXPORT
82
int hts_verbose = HTS_LOG_WARNING;
83
84
const char *hts_version()
85
2
{
86
2
    return HTS_VERSION_TEXT;
87
2
}
88
89
0
unsigned int hts_features(void) {
90
0
    unsigned int feat = HTS_FEATURE_HTSCODECS; // Always present
91
92
0
#ifdef PACKAGE_URL
93
0
    feat |= HTS_FEATURE_CONFIGURE;
94
0
#endif
95
96
#ifdef ENABLE_PLUGINS
97
    feat |= HTS_FEATURE_PLUGINS;
98
#endif
99
100
0
#ifdef HAVE_LIBCURL
101
0
    feat |= HTS_FEATURE_LIBCURL;
102
0
#endif
103
104
0
#ifdef ENABLE_S3
105
0
    feat |= HTS_FEATURE_S3;
106
0
#endif
107
108
0
#ifdef ENABLE_GCS
109
0
    feat |= HTS_FEATURE_GCS;
110
0
#endif
111
112
#ifdef HAVE_LIBDEFLATE
113
    feat |= HTS_FEATURE_LIBDEFLATE;
114
#endif
115
116
0
#ifdef HAVE_LIBLZMA
117
0
    feat |= HTS_FEATURE_LZMA;
118
0
#endif
119
120
0
#ifdef HAVE_LIBBZ2
121
0
    feat |= HTS_FEATURE_BZIP2;
122
0
#endif
123
124
0
    return feat;
125
0
}
126
127
0
const char *hts_test_feature(unsigned int id) {
128
0
    unsigned int feat = hts_features();
129
130
0
    switch (id) {
131
0
    case HTS_FEATURE_CONFIGURE:
132
0
        return feat & HTS_FEATURE_CONFIGURE ? "yes" : NULL;
133
0
    case HTS_FEATURE_PLUGINS:
134
0
        return feat & HTS_FEATURE_PLUGINS ? "yes" : NULL;
135
0
    case HTS_FEATURE_LIBCURL:
136
0
        return feat & HTS_FEATURE_LIBCURL ? "yes" : NULL;
137
0
    case HTS_FEATURE_S3:
138
0
        return feat & HTS_FEATURE_S3 ? "yes" : NULL;
139
0
    case HTS_FEATURE_GCS:
140
0
        return feat & HTS_FEATURE_GCS ? "yes" : NULL;
141
0
    case HTS_FEATURE_LIBDEFLATE:
142
0
        return feat & HTS_FEATURE_LIBDEFLATE ? "yes" : NULL;
143
0
    case HTS_FEATURE_BZIP2:
144
0
        return feat & HTS_FEATURE_BZIP2 ? "yes" : NULL;
145
0
    case HTS_FEATURE_LZMA:
146
0
        return feat & HTS_FEATURE_LZMA ? "yes" : NULL;
147
148
0
    case HTS_FEATURE_HTSCODECS:
149
0
        return htscodecs_version();
150
151
0
    case HTS_FEATURE_CC:
152
0
        return HTS_CC;
153
0
    case HTS_FEATURE_CFLAGS:
154
0
        return HTS_CFLAGS;
155
0
    case HTS_FEATURE_LDFLAGS:
156
0
        return HTS_LDFLAGS;
157
0
    case HTS_FEATURE_CPPFLAGS:
158
0
        return HTS_CPPFLAGS;
159
160
0
    default:
161
0
        fprintf(stderr, "Unknown feature code: %u\n", id);
162
0
    }
163
164
0
    return NULL;
165
0
}
166
167
// Note this implementation also means we can just "strings" the library
168
// to find the configuration parameters.
169
0
const char *hts_feature_string(void) {
170
0
    static char config[1200];
171
0
    const char *flags=
172
173
0
#ifdef PACKAGE_URL
174
0
    "build=configure "
175
#else
176
    "build=Makefile "
177
#endif
178
179
0
#ifdef HAVE_LIBCURL
180
0
    "libcurl=yes "
181
#else
182
    "libcurl=no "
183
#endif
184
185
0
#ifdef ENABLE_S3
186
0
    "S3=yes "
187
#else
188
    "S3=no "
189
#endif
190
191
0
#ifdef ENABLE_GCS
192
0
    "GCS=yes "
193
#else
194
    "GCS=no "
195
#endif
196
197
#ifdef HAVE_LIBDEFLATE
198
    "libdeflate=yes "
199
#else
200
0
    "libdeflate=no "
201
0
#endif
202
203
0
#ifdef HAVE_LIBLZMA
204
0
    "lzma=yes "
205
#else
206
    "lzma=no "
207
#endif
208
209
0
#ifdef HAVE_LIBBZ2
210
0
    "bzip2=yes "
211
#else
212
    "bzip2=no "
213
#endif
214
215
// "plugins=" must stay at the end as it is followed by "plugin-path="
216
#ifdef ENABLE_PLUGINS
217
    "plugins=yes";
218
#else
219
0
    "plugins=no";
220
0
#endif
221
222
#ifdef ENABLE_PLUGINS
223
    snprintf(config, sizeof(config),
224
             "%s plugin-path=%.1000s htscodecs=%.40s",
225
             flags, hts_plugin_path(), htscodecs_version());
226
#else
227
0
    snprintf(config, sizeof(config),
228
0
             "%s htscodecs=%.40s",
229
0
             flags, htscodecs_version());
230
0
#endif
231
0
    return config;
232
0
}
233
234
235
HTSLIB_EXPORT
236
const unsigned char seq_nt16_table[256] = {
237
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
238
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
239
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
240
     1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15,
241
    15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
242
    15,15, 5, 6,  8,15, 7, 9, 15,10,15,15, 15,15,15,15,
243
    15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
244
    15,15, 5, 6,  8,15, 7, 9, 15,10,15,15, 15,15,15,15,
245
246
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
247
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
248
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
249
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
250
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
251
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
252
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
253
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
254
};
255
256
HTSLIB_EXPORT
257
const char seq_nt16_str[] = "=ACMGRSVTWYHKDBN";
258
259
HTSLIB_EXPORT
260
const int seq_nt16_int[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
261
262
/**********************
263
 *** Basic file I/O ***
264
 **********************/
265
266
static enum htsFormatCategory format_category(enum htsExactFormat fmt)
267
13.6k
{
268
13.6k
    switch (fmt) {
269
0
    case bam:
270
0
    case sam:
271
0
    case cram:
272
0
    case fastq_format:
273
0
    case fasta_format:
274
0
        return sequence_data;
275
276
0
    case vcf:
277
0
    case bcf:
278
0
        return variant_data;
279
280
0
    case bai:
281
0
    case crai:
282
0
    case csi:
283
0
    case fai_format:
284
0
    case fqi_format:
285
0
    case gzi:
286
0
    case tbi:
287
0
        return index_file;
288
289
0
    case bed:
290
0
    case d4_format:
291
0
        return region_list;
292
293
0
    case htsget:
294
0
    case hts_crypt4gh_format:
295
0
        return unknown_category;
296
297
0
    case unknown_format:
298
0
    case binary_format:
299
13.6k
    case text_format:
300
13.6k
    case empty_format:
301
13.6k
    case format_maximum:
302
13.6k
        break;
303
13.6k
    }
304
305
13.6k
    return unknown_category;
306
13.6k
}
307
308
// Decompress several hundred bytes by peeking at the file, which must be
309
// positioned at the start of a GZIP block.
310
static ssize_t
311
decompress_peek_gz(hFILE *fp, unsigned char *dest, size_t destsize)
312
1.12k
{
313
1.12k
    unsigned char buffer[2048];
314
1.12k
    z_stream zs;
315
1.12k
    ssize_t npeek = hpeek(fp, buffer, sizeof buffer);
316
317
1.12k
    if (npeek < 0) return -1;
318
319
1.12k
    zs.zalloc = NULL;
320
1.12k
    zs.zfree = NULL;
321
1.12k
    zs.next_in = buffer;
322
1.12k
    zs.avail_in = npeek;
323
1.12k
    zs.next_out = dest;
324
1.12k
    zs.avail_out = destsize;
325
1.12k
    if (inflateInit2(&zs, 31) != Z_OK) return -1;
326
327
1.12k
    int ret;
328
1.12k
    const unsigned char *last_in = buffer;
329
2.28k
    while (zs.avail_out > 0) {
330
1.22k
        ret = inflate(&zs, Z_SYNC_FLUSH);
331
1.22k
        if (ret == Z_STREAM_END) {
332
82
            if (last_in == zs.next_in)
333
0
                break; // Paranoia to avoid potential looping. Shouldn't happen
334
82
            else
335
82
                last_in = zs.next_in;
336
82
            inflateReset(&zs);
337
1.13k
        } else if (ret != Z_OK) {
338
            // eg Z_BUF_ERROR due to avail_in/out becoming zero
339
58
            break;
340
58
        }
341
1.22k
    }
342
343
    // NB: zs.total_out is changed by inflateReset, so use pointer diff instead
344
1.12k
    destsize = zs.next_out - dest;
345
1.12k
    inflateEnd(&zs);
346
347
1.12k
    return destsize;
348
1.12k
}
349
350
#ifdef HAVE_LIBLZMA
351
// Similarly decompress a portion by peeking at the file, which must be
352
// positioned at the start of the file.
353
static ssize_t
354
decompress_peek_xz(hFILE *fp, unsigned char *dest, size_t destsize)
355
0
{
356
0
    unsigned char buffer[2048];
357
0
    ssize_t npeek = hpeek(fp, buffer, sizeof buffer);
358
0
    if (npeek < 0) return -1;
359
360
0
    lzma_stream ls = LZMA_STREAM_INIT;
361
0
    if (lzma_stream_decoder(&ls, lzma_easy_decoder_memusage(9), 0) != LZMA_OK)
362
0
        return -1;
363
364
0
    ls.next_in = buffer;
365
0
    ls.avail_in = npeek;
366
0
    ls.next_out = dest;
367
0
    ls.avail_out = destsize;
368
369
0
    int r = lzma_code(&ls, LZMA_RUN);
370
0
    if (! (r == LZMA_OK || r == LZMA_STREAM_END)) {
371
0
        lzma_end(&ls);
372
0
        return -1;
373
0
    }
374
375
0
    destsize = ls.total_out;
376
0
    lzma_end(&ls);
377
378
0
    return destsize;
379
0
}
380
#endif
381
382
// Parse "x.y" text, taking care because the string is not NUL-terminated
383
// and filling in major/minor only when the digits are followed by a delimiter,
384
// so we don't misread "1.10" as "1.1" due to reaching the end of the buffer.
385
static void
386
parse_version(htsFormat *fmt, const unsigned char *u, const unsigned char *ulim)
387
30
{
388
30
    const char *s    = (const char *) u;
389
30
    const char *slim = (const char *) ulim;
390
30
    short v;
391
392
30
    fmt->version.major = fmt->version.minor = -1;
393
394
194
    for (v = 0; s < slim && isdigit_c(*s); s++)
395
164
        v = 10 * v + *s - '0';
396
397
30
    if (s < slim) {
398
29
        fmt->version.major = v;
399
29
        if (*s == '.') {
400
2
            s++;
401
151
            for (v = 0; s < slim && isdigit_c(*s); s++)
402
149
                v = 10 * v + *s - '0';
403
2
            if (s < slim)
404
2
                fmt->version.minor = v;
405
2
        }
406
27
        else
407
27
            fmt->version.minor = 0;
408
29
    }
409
30
}
410
411
static int
412
cmp_nonblank(const char *key, const unsigned char *u, const unsigned char *ulim)
413
352
{
414
352
    const unsigned char *ukey = (const unsigned char *) key;
415
416
1.37k
    while (*ukey)
417
1.37k
        if (u >= ulim) return +1;
418
1.37k
        else if (isspace_c(*u)) u++;
419
351
        else if (*u != *ukey) return (*ukey < *u)? -1 : +1;
420
0
        else u++, ukey++;
421
422
0
    return 0;
423
352
}
424
425
static int is_text_only(const unsigned char *u, const unsigned char *ulim)
426
352
{
427
19.4k
    for (; u < ulim; u++)
428
19.0k
        if (! (*u >= ' ' || *u == '\t' || *u == '\r' || *u == '\n'))
429
4
            return 0;
430
431
348
    return 1;
432
352
}
433
434
static int is_fastaq(const unsigned char *u, const unsigned char *ulim)
435
347
{
436
347
    const unsigned char *eol = memchr(u, '\n', ulim - u);
437
438
    // Check that the first line is entirely textual
439
347
    if (! is_text_only(u, eol? eol : ulim)) return 0;
440
441
    // If the first line is very long, consider the file to indeed be FASTA/Q
442
347
    if (eol == NULL) return 1;
443
444
289
    u = eol+1; // Now points to the first character of the second line
445
446
    // Scan over all base-encoding letters (including 'N' but not SEQ's '=')
447
700
    while (u < ulim && (seq_nt16_table[*u] != 15 || toupper(*u) == 'N')) {
448
411
        if (*u == '=') return 0;
449
411
        u++;
450
411
    }
451
452
289
    return (u == ulim || *u == '\r' || *u == '\n')? 1 : 0;
453
289
}
454
455
// Parse tab-delimited text, filling in a string of column types and returning
456
// the number of columns spotted (within [u,ulim), and up to column_len) or -1
457
// if non-printable characters were seen.  Column types:
458
//     i: integer, s: strand sign, C: CIGAR, O: SAM optional field, Z: anything
459
static int
460
parse_tabbed_text(char *columns, int column_len,
461
                  const unsigned char *u, const unsigned char *ulim,
462
                  int *complete)
463
5
{
464
5
    const char *str  = (const char *) u;
465
5
    const char *slim = (const char *) ulim;
466
5
    const char *s;
467
5
    int ncolumns = 0;
468
469
5
    enum { digit = 1, leading_sign = 2, cigar_operator = 4, other = 8 };
470
5
    unsigned seen = 0;
471
5
    *complete = 0;
472
473
1.21k
    for (s = str; s < slim; s++)
474
1.21k
        if (*s >= ' ') {
475
1.20k
            if (isdigit_c(*s))
476
154
                seen |= digit;
477
1.04k
            else if ((*s == '+' || *s == '-') && s == str)
478
1
                seen |= leading_sign;
479
1.04k
            else if (strchr(BAM_CIGAR_STR, *s) && s > str && isdigit_c(s[-1]))
480
0
                seen |= cigar_operator;
481
1.04k
            else
482
1.04k
                seen |= other;
483
1.20k
        }
484
13
        else if (*s == '\t' || *s == '\r' || *s == '\n') {
485
10
            size_t len = s - str;
486
10
            char type;
487
488
10
            if (seen == digit || seen == (leading_sign|digit)) type = 'i';
489
5
            else if (seen == (digit|cigar_operator)) type = 'C';
490
5
            else if (len == 1)
491
2
                switch (str[0]) {
492
2
                case '*': type = 'C'; break;
493
0
                case '+': case '-': case '.': type = 's'; break;
494
0
                default: type = 'Z'; break;
495
2
                }
496
3
            else if (len >= 5 && str[2] == ':' && str[4] == ':') type = 'O';
497
3
            else type = 'Z';
498
499
10
            columns[ncolumns++] = type;
500
10
            if (*s != '\t' || ncolumns >= column_len - 1) {
501
0
                *complete = 1; // finished the line or more columns than needed
502
0
                break;
503
0
            }
504
505
10
            str = s + 1;
506
10
            seen = 0;
507
10
        }
508
3
        else return -1;
509
510
2
    columns[ncolumns] = '\0';
511
2
    return ncolumns;
512
5
}
513
514
// Match COLUMNS as a prefix against PATTERN (so COLUMNS may run out first).
515
// Returns len(COLUMNS) (modulo '+'), or 0 if there is a mismatched entry.
516
static int colmatch(const char *columns, const char *pattern)
517
1
{
518
1
    int i;
519
11
    for (i = 0; columns[i] != '\0'; i++) {
520
10
        if (pattern[i] == '+') return i;
521
10
        if (! (columns[i] == pattern[i] || pattern[i] == 'Z')) return 0;
522
10
    }
523
524
1
    return i;
525
1
}
526
527
int hts_detect_format(hFILE *hfile, htsFormat *fmt)
528
0
{
529
0
    return hts_detect_format2(hfile, NULL, fmt);
530
0
}
531
532
int hts_detect_format2(hFILE *hfile, const char *fname, htsFormat *fmt)
533
14.8k
{
534
14.8k
    char extension[HTS_MAX_EXT_LEN], columns[24];
535
14.8k
    unsigned char s[1024];
536
14.8k
    int complete = 0;
537
14.8k
    ssize_t len = hpeek(hfile, s, 18);
538
14.8k
    if (len < 0) return -1;
539
540
14.8k
    fmt->category = unknown_category;
541
14.8k
    fmt->format = unknown_format;
542
14.8k
    fmt->version.major = fmt->version.minor = -1;
543
14.8k
    fmt->compression = no_compression;
544
14.8k
    fmt->compression_level = -1;
545
14.8k
    fmt->specific = NULL;
546
547
14.8k
    if (len >= 2 && s[0] == 0x1f && s[1] == 0x8b) {
548
        // The stream is either gzip-compressed or BGZF-compressed.
549
        // Determine which, and decompress the first few records or lines.
550
1.12k
        fmt->compression = gzip;
551
1.12k
        if (len >= 18 && (s[3] & 4)) {
552
216
            if (memcmp(&s[12], "BC\2\0", 4) == 0)
553
38
                fmt->compression = bgzf;
554
178
            else if (memcmp(&s[12], "RAZF", 4) == 0)
555
10
                fmt->compression = razf_compression;
556
216
        }
557
1.12k
        if (len >= 9 && s[2] == 8)
558
1.12k
            fmt->compression_level = (s[8] == 2)? 9 : (s[8] == 4)? 1 : -1;
559
560
1.12k
        len = decompress_peek_gz(hfile, s, sizeof s);
561
1.12k
    }
562
13.7k
    else if (len >= 10 && memcmp(s, "BZh", 3) == 0 &&
563
13.7k
             (memcmp(&s[4], "\x31\x41\x59\x26\x53\x59", 6) == 0 ||
564
0
              memcmp(&s[4], "\x17\x72\x45\x38\x50\x90", 6) == 0)) {
565
0
        fmt->compression = bzip2_compression;
566
0
        fmt->compression_level = s[3] - '0';
567
        // Decompressing via libbz2 produces no output until it has a whole
568
        // block (of size 100Kb x level), which is too large for peeking.
569
        // So unfortunately we can recognise bzip2 but not the contents,
570
        // except that \x1772... magic indicates the stream is empty.
571
0
        if (s[4] == '\x31') return 0;
572
0
        else len = 0;
573
0
    }
574
13.7k
    else if (len >= 6 && memcmp(s, "\xfd""7zXZ\0", 6) == 0) {
575
0
        fmt->compression = xz_compression;
576
0
#ifdef HAVE_LIBLZMA
577
0
        len = decompress_peek_xz(hfile, s, sizeof s);
578
#else
579
        // Without liblzma, we can't recognise the decompressed contents.
580
        return 0;
581
#endif
582
0
    }
583
13.7k
    else if (len >= 4 && memcmp(s, "\x28\xb5\x2f\xfd", 4) == 0) {
584
0
        fmt->compression = zstd_compression;
585
0
        return 0;
586
0
    }
587
13.7k
    else {
588
13.7k
        len = hpeek(hfile, s, sizeof s);
589
13.7k
    }
590
14.8k
    if (len < 0) return -1;
591
592
14.8k
    if (len == 0) {
593
10
        fmt->format = empty_format;
594
10
        return 0;
595
10
    }
596
597
    // We avoid using filename extensions wherever possible (as filenames are
598
    // not always available), but in a few cases they must be considered:
599
    //  - FASTA/Q indexes are simply tab-separated text; files that match these
600
    //    patterns but not the fai/fqi extension are usually generic BED files
601
    //  - GZI indexes have no magic numbers so can only be detected by filename
602
14.8k
    if (fname && strcmp(fname, "-") != 0) {
603
14.8k
        char *s;
604
14.8k
        if (find_file_extension(fname, extension) < 0) extension[0] = '\0';
605
14.8k
        for (s = extension; *s; s++) *s = tolower_c(*s);
606
14.8k
    }
607
0
    else extension[0] = '\0';
608
609
14.8k
    if (len >= 6 && memcmp(s,"CRAM",4) == 0 && s[4]>=1 && s[4]<=7 && s[5]<=7) {
610
5.09k
        fmt->category = sequence_data;
611
5.09k
        fmt->format = cram;
612
5.09k
        fmt->version.major = s[4], fmt->version.minor = s[5];
613
5.09k
        fmt->compression = custom;
614
5.09k
        return 0;
615
5.09k
    }
616
9.79k
    else if (len >= 4 && s[3] <= '\4') {
617
1.11k
        if (memcmp(s, "BAM\1", 4) == 0) {
618
468
            fmt->category = sequence_data;
619
468
            fmt->format = bam;
620
            // TODO Decompress enough to pick version from @HD-VN header
621
468
            fmt->version.major = 1, fmt->version.minor = -1;
622
468
            return 0;
623
468
        }
624
650
        else if (memcmp(s, "BAI\1", 4) == 0) {
625
0
            fmt->category = index_file;
626
0
            fmt->format = bai;
627
0
            fmt->version.major = -1, fmt->version.minor = -1;
628
0
            return 0;
629
0
        }
630
650
        else if (memcmp(s, "BCF\4", 4) == 0) {
631
0
            fmt->category = variant_data;
632
0
            fmt->format = bcf;
633
0
            fmt->version.major = 1, fmt->version.minor = -1;
634
0
            return 0;
635
0
        }
636
650
        else if (memcmp(s, "BCF\2", 4) == 0) {
637
649
            fmt->category = variant_data;
638
649
            fmt->format = bcf;
639
649
            fmt->version.major = s[3];
640
649
            fmt->version.minor = (len >= 5 && s[4] <= 2)? s[4] : 0;
641
649
            return 0;
642
649
        }
643
1
        else if (memcmp(s, "CSI\1", 4) == 0) {
644
0
            fmt->category = index_file;
645
0
            fmt->format = csi;
646
0
            fmt->version.major = 1, fmt->version.minor = -1;
647
0
            return 0;
648
0
        }
649
1
        else if (memcmp(s, "TBI\1", 4) == 0) {
650
0
            fmt->category = index_file;
651
0
            fmt->format = tbi;
652
0
            return 0;
653
0
        }
654
        // GZI indexes have no magic numbers, so must be recognised solely by
655
        // filename extension.
656
1
        else if (strcmp(extension, "gzi") == 0) {
657
0
            fmt->category = index_file;
658
0
            fmt->format = gzi;
659
0
            return 0;
660
0
        }
661
1.11k
    }
662
8.67k
    else if (len >= 16 && memcmp(s, "##fileformat=VCF", 16) == 0) {
663
5.38k
        fmt->category = variant_data;
664
5.38k
        fmt->format = vcf;
665
5.38k
        if (len >= 21 && s[16] == 'v')
666
0
            parse_version(fmt, &s[17], &s[len]);
667
5.38k
        return 0;
668
5.38k
    }
669
3.28k
    else if (len >= 4 && s[0] == '@' &&
670
3.28k
             (memcmp(s, "@HD\t", 4) == 0 || memcmp(s, "@SQ\t", 4) == 0 ||
671
2.96k
              memcmp(s, "@RG\t", 4) == 0 || memcmp(s, "@PG\t", 4) == 0 ||
672
2.96k
              memcmp(s, "@CO\t", 4) == 0)) {
673
2.93k
        fmt->category = sequence_data;
674
2.93k
        fmt->format = sam;
675
        // @HD-VN is not guaranteed to be the first tag, but then @HD is
676
        // not guaranteed to be present at all...
677
2.93k
        if (len >= 9 && memcmp(s, "@HD\tVN:", 7) == 0)
678
30
            parse_version(fmt, &s[7], &s[len]);
679
2.90k
        else
680
2.90k
            fmt->version.major = 1, fmt->version.minor = -1;
681
2.93k
        return 0;
682
2.93k
    }
683
352
    else if (len >= 8 && memcmp(s, "d4\xdd\xdd", 4) == 0) {
684
0
        fmt->category = region_list;
685
0
        fmt->format = d4_format;
686
        // How to decode the D4 Format Version bytes is not yet specified
687
        // so we don't try to set fmt->version.{major,minor}.
688
0
        return 0;
689
0
    }
690
352
    else if (cmp_nonblank("{\"htsget\":", s, &s[len]) == 0) {
691
0
        fmt->category = unknown_category;
692
0
        fmt->format = htsget;
693
0
        return 0;
694
0
    }
695
352
    else if (len > 8 && memcmp(s, "crypt4gh", 8) == 0) {
696
0
        fmt->category = unknown_category;
697
0
        fmt->format = hts_crypt4gh_format;
698
0
        return 0;
699
0
    }
700
352
    else if (len >= 1 && s[0] == '>' && is_fastaq(s, &s[len])) {
701
317
        fmt->category = sequence_data;
702
317
        fmt->format = fasta_format;
703
317
        return 0;
704
317
    }
705
35
    else if (len >= 1 && s[0] == '@' && is_fastaq(s, &s[len])) {
706
30
        fmt->category = sequence_data;
707
30
        fmt->format = fastq_format;
708
30
        return 0;
709
30
    }
710
5
    else if (parse_tabbed_text(columns, sizeof columns, s,
711
5
                               &s[len], &complete) > 0) {
712
        // A complete SAM line is at least 11 columns.  On unmapped long reads may
713
        // be missing two.  (On mapped long reads we must have an @ header so long
714
        // CIGAR is irrelevant.)
715
1
        if (colmatch(columns, "ZiZiiCZiiZZOOOOOOOOOOOOOOOOOOOO+")
716
1
            >= 9 + 2*complete) {
717
1
            fmt->category = sequence_data;
718
1
            fmt->format = sam;
719
1
            fmt->version.major = 1, fmt->version.minor = -1;
720
1
            return 0;
721
1
        }
722
0
        else if (fmt->compression == gzip && colmatch(columns, "iiiiii") == 6) {
723
0
            fmt->category = index_file;
724
0
            fmt->format = crai;
725
0
            return 0;
726
0
        }
727
0
        else if (strstr(extension, "fqi") && colmatch(columns, "Ziiiii") == 6) {
728
0
            fmt->category = index_file;
729
0
            fmt->format = fqi_format;
730
0
            return 0;
731
0
        }
732
0
        else if (strstr(extension, "fai") && colmatch(columns, "Ziiii") == 5) {
733
0
            fmt->category = index_file;
734
0
            fmt->format = fai_format;
735
0
            return 0;
736
0
        }
737
0
        else if (colmatch(columns, "Zii+") >= 3) {
738
0
            fmt->category = region_list;
739
0
            fmt->format = bed;
740
0
            return 0;
741
0
        }
742
1
    }
743
744
    // Arbitrary text files can be read using hts_getline().
745
5
    if (is_text_only(s, &s[len])) fmt->format = text_format;
746
747
    // Nothing recognised: leave unset fmt-> fields as unknown.
748
5
    return 0;
749
14.8k
}
750
751
char *hts_format_description(const htsFormat *format)
752
0
{
753
0
    kstring_t str = { 0, 0, NULL };
754
755
0
    switch (format->format) {
756
0
    case sam:   kputs("SAM", &str); break;
757
0
    case bam:   kputs("BAM", &str); break;
758
0
    case cram:  kputs("CRAM", &str); break;
759
0
    case fasta_format:  kputs("FASTA", &str); break;
760
0
    case fastq_format:  kputs("FASTQ", &str); break;
761
0
    case vcf:   kputs("VCF", &str); break;
762
0
    case bcf:
763
0
        if (format->version.major == 1) kputs("Legacy BCF", &str);
764
0
        else kputs("BCF", &str);
765
0
        break;
766
0
    case bai:   kputs("BAI", &str); break;
767
0
    case crai:  kputs("CRAI", &str); break;
768
0
    case csi:   kputs("CSI", &str); break;
769
0
    case fai_format:    kputs("FASTA-IDX", &str); break;
770
0
    case fqi_format:    kputs("FASTQ-IDX", &str); break;
771
0
    case gzi:   kputs("GZI", &str); break;
772
0
    case tbi:   kputs("Tabix", &str); break;
773
0
    case bed:   kputs("BED", &str); break;
774
0
    case d4_format:     kputs("D4", &str); break;
775
0
    case htsget: kputs("htsget", &str); break;
776
0
    case hts_crypt4gh_format: kputs("crypt4gh", &str); break;
777
0
    case empty_format:  kputs("empty", &str); break;
778
0
    default:    kputs("unknown", &str); break;
779
0
    }
780
781
0
    if (format->version.major >= 0) {
782
0
        kputs(" version ", &str);
783
0
        kputw(format->version.major, &str);
784
0
        if (format->version.minor >= 0) {
785
0
            kputc('.', &str);
786
0
            kputw(format->version.minor, &str);
787
0
        }
788
0
    }
789
790
0
    switch (format->compression) {
791
0
    case bzip2_compression:  kputs(" bzip2-compressed", &str); break;
792
0
    case razf_compression:   kputs(" legacy-RAZF-compressed", &str); break;
793
0
    case xz_compression:     kputs(" XZ-compressed", &str); break;
794
0
    case zstd_compression:   kputs(" Zstandard-compressed", &str); break;
795
0
    case custom: kputs(" compressed", &str); break;
796
0
    case gzip:   kputs(" gzip-compressed", &str); break;
797
798
0
    case bgzf:
799
0
        switch (format->format) {
800
0
        case bam:
801
0
        case bcf:
802
0
        case csi:
803
0
        case tbi:
804
            // These are by definition BGZF, so just use the generic term
805
0
            kputs(" compressed", &str);
806
0
            break;
807
0
        default:
808
0
            kputs(" BGZF-compressed", &str);
809
0
            break;
810
0
        }
811
0
        break;
812
813
0
    case no_compression:
814
0
        switch (format->format) {
815
0
        case bam:
816
0
        case bcf:
817
0
        case cram:
818
0
        case csi:
819
0
        case tbi:
820
            // These are normally compressed, so emphasise that this one isn't
821
0
            kputs(" uncompressed", &str);
822
0
            break;
823
0
        default:
824
0
            break;
825
0
        }
826
0
        break;
827
828
0
    default: break;
829
0
    }
830
831
0
    switch (format->category) {
832
0
    case sequence_data: kputs(" sequence", &str); break;
833
0
    case variant_data:  kputs(" variant calling", &str); break;
834
0
    case index_file:    kputs(" index", &str); break;
835
0
    case region_list:   kputs(" genomic region", &str); break;
836
0
    default: break;
837
0
    }
838
839
0
    if (format->compression == no_compression)
840
0
        switch (format->format) {
841
0
        case text_format:
842
0
        case sam:
843
0
        case crai:
844
0
        case vcf:
845
0
        case bed:
846
0
        case fai_format:
847
0
        case fqi_format:
848
0
        case fasta_format:
849
0
        case fastq_format:
850
0
        case htsget:
851
0
            kputs(" text", &str);
852
0
            break;
853
854
0
        case empty_format:
855
0
            break;
856
857
0
        default:
858
0
            kputs(" data", &str);
859
0
            break;
860
0
        }
861
0
    else
862
0
        kputs(" data", &str);
863
864
0
    return ks_release(&str);
865
0
}
866
867
htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt)
868
13.6k
{
869
13.6k
    char smode[101], *cp, *cp2, *mode_c, *uncomp = NULL;
870
13.6k
    htsFile *fp = NULL;
871
13.6k
    hFILE *hfile = NULL;
872
13.6k
    char fmt_code = '\0';
873
    // see enum htsExactFormat in htslib/hts.h
874
13.6k
    const char format_to_mode[] = "\0g\0\0b\0c\0\0b\0g\0\0\0\0\0Ff\0\0";
875
876
13.6k
    strncpy(smode, mode, 99);
877
13.6k
    smode[99]=0;
878
13.6k
    if ((cp = strchr(smode, ',')))
879
0
        *cp = '\0';
880
881
    // Migrate format code (b or c) to the end of the smode buffer.
882
27.3k
    for (cp2 = cp = smode; *cp; cp++) {
883
13.6k
        if (*cp == 'b')
884
0
            fmt_code = 'b';
885
13.6k
        else if (*cp == 'c')
886
0
            fmt_code = 'c';
887
13.6k
        else {
888
13.6k
            *cp2++ = *cp;
889
            // Cache the uncompress flag 'u' pos if present
890
13.6k
            if (!uncomp && (*cp == 'u')) {
891
0
                uncomp = cp2 - 1;
892
0
            }
893
13.6k
        }
894
13.6k
    }
895
13.6k
    mode_c = cp2;
896
13.6k
    *cp2++ = fmt_code;
897
13.6k
    *cp2++ = 0;
898
899
    // Set or reset the format code if opts->format is used
900
13.6k
    if (fmt && fmt->format > unknown_format
901
13.6k
        && fmt->format < sizeof(format_to_mode)) {
902
0
        *mode_c = format_to_mode[fmt->format];
903
0
    }
904
905
    // Uncompressed bam/bcf is not supported, change 'u' to '0' on write
906
13.6k
    if (uncomp && *mode_c == 'b' && (strchr(smode, 'w') || strchr(smode, 'a'))) {
907
0
        *uncomp = '0';
908
0
    }
909
910
    // If we really asked for a compressed text format then mode_c above will
911
    // point to nul.  We set to 'z' to enable bgzf.
912
13.6k
    if (strchr(mode, 'w') && fmt && fmt->compression == bgzf) {
913
0
        if (fmt->format == sam || fmt->format == vcf || fmt->format == text_format)
914
0
            *mode_c = 'z';
915
0
    }
916
917
13.6k
    char *rmme = NULL, *fnidx = strstr(fn, HTS_IDX_DELIM);
918
13.6k
    if ( fnidx ) {
919
0
        rmme = strdup(fn);
920
0
        if ( !rmme ) goto error;
921
0
        rmme[fnidx-fn] = 0;
922
0
        fn = rmme;
923
0
    }
924
925
13.6k
    hfile = hopen(fn, smode);
926
13.6k
    if (hfile == NULL) goto error;
927
928
13.6k
    fp = hts_hopen(hfile, fn, smode);
929
13.6k
    if (fp == NULL) goto error;
930
931
    // Compensate for the loss of exactness in htsExactFormat.
932
    // hts_hopen returns generics such as binary or text, but we
933
    // have been given something explicit here so use that instead.
934
13.6k
    if (fp->is_write && fmt &&
935
13.6k
        (fmt->format == bam || fmt->format == sam ||
936
0
         fmt->format == vcf || fmt->format == bcf ||
937
0
         fmt->format == bed || fmt->format == fasta_format ||
938
0
         fmt->format == fastq_format))
939
0
        fp->format.format = fmt->format;
940
941
13.6k
    if (fmt && fmt->specific)
942
0
        if (hts_opt_apply(fp, fmt->specific) != 0)
943
0
            goto error;
944
945
13.6k
    if ( rmme ) free(rmme);
946
13.6k
    return fp;
947
948
0
error:
949
0
    hts_log_error("Failed to open file \"%s\"%s%s", fn,
950
0
                  errno ? " : " : "", errno ? strerror(errno) : "");
951
0
    if ( rmme ) free(rmme);
952
953
0
    if (hfile)
954
0
        hclose_abruptly(hfile);
955
956
0
    return NULL;
957
13.6k
}
958
959
13.6k
htsFile *hts_open(const char *fn, const char *mode) {
960
13.6k
    return hts_open_format(fn, mode, NULL);
961
13.6k
}
962
963
/*
964
 * Splits str into a prefix, delimiter ('\0' or delim), and suffix, writing
965
 * the prefix in lowercase into buf and returning a pointer to the suffix.
966
 * On return, buf is always NUL-terminated; thus assumes that the "keyword"
967
 * prefix should be one of several known values of maximum length buflen-2.
968
 * (If delim is not found, returns a pointer to the '\0'.)
969
 */
970
static const char *
971
scan_keyword(const char *str, char delim, char *buf, size_t buflen)
972
0
{
973
0
    size_t i = 0;
974
0
    while (*str && *str != delim) {
975
0
        if (i < buflen-1) buf[i++] = tolower_c(*str);
976
0
        str++;
977
0
    }
978
979
0
    buf[i] = '\0';
980
0
    return *str? str+1 : str;
981
0
}
982
983
/*
984
 * Parses arg and appends it to the option list.
985
 *
986
 * Returns 0 on success;
987
 *        -1 on failure.
988
 */
989
0
int hts_opt_add(hts_opt **opts, const char *c_arg) {
990
0
    hts_opt *o, *t;
991
0
    char *val;
992
993
    /*
994
     * IMPORTANT!!!
995
     * If you add another string option here, don't forget to also add
996
     * it to the case statement in hts_opt_apply.
997
     */
998
999
0
    if (!c_arg)
1000
0
        return -1;
1001
1002
0
    if (!(o =  malloc(sizeof(*o))))
1003
0
        return -1;
1004
1005
0
    if (!(o->arg = strdup(c_arg))) {
1006
0
        free(o);
1007
0
        return -1;
1008
0
    }
1009
1010
0
    if (!(val = strchr(o->arg, '=')))
1011
0
        val = "1"; // assume boolean
1012
0
    else
1013
0
        *val++ = '\0';
1014
1015
0
    if (strcmp(o->arg, "decode_md") == 0 ||
1016
0
        strcmp(o->arg, "DECODE_MD") == 0)
1017
0
        o->opt = CRAM_OPT_DECODE_MD, o->val.i = atoi(val);
1018
1019
0
    else if (strcmp(o->arg, "verbosity") == 0 ||
1020
0
             strcmp(o->arg, "VERBOSITY") == 0)
1021
0
        o->opt = CRAM_OPT_VERBOSITY, o->val.i = atoi(val);
1022
1023
0
    else if (strcmp(o->arg, "seqs_per_slice") == 0 ||
1024
0
             strcmp(o->arg, "SEQS_PER_SLICE") == 0)
1025
0
        o->opt = CRAM_OPT_SEQS_PER_SLICE, o->val.i = atoi(val);
1026
1027
0
    else if (strcmp(o->arg, "bases_per_slice") == 0 ||
1028
0
             strcmp(o->arg, "BASES_PER_SLICE") == 0)
1029
0
        o->opt = CRAM_OPT_BASES_PER_SLICE, o->val.i = atoi(val);
1030
1031
0
    else if (strcmp(o->arg, "slices_per_container") == 0 ||
1032
0
             strcmp(o->arg, "SLICES_PER_CONTAINER") == 0)
1033
0
        o->opt = CRAM_OPT_SLICES_PER_CONTAINER, o->val.i = atoi(val);
1034
1035
0
    else if (strcmp(o->arg, "embed_ref") == 0 ||
1036
0
             strcmp(o->arg, "EMBED_REF") == 0)
1037
0
        o->opt = CRAM_OPT_EMBED_REF, o->val.i = atoi(val);
1038
1039
0
    else if (strcmp(o->arg, "no_ref") == 0 ||
1040
0
             strcmp(o->arg, "NO_REF") == 0)
1041
0
        o->opt = CRAM_OPT_NO_REF, o->val.i = atoi(val);
1042
1043
0
    else if (strcmp(o->arg, "pos_delta") == 0 ||
1044
0
             strcmp(o->arg, "POS_DELTA") == 0)
1045
0
        o->opt = CRAM_OPT_POS_DELTA, o->val.i = atoi(val);
1046
1047
0
    else if (strcmp(o->arg, "ignore_md5") == 0 ||
1048
0
             strcmp(o->arg, "IGNORE_MD5") == 0)
1049
0
        o->opt = CRAM_OPT_IGNORE_MD5, o->val.i = atoi(val);
1050
1051
0
    else if (strcmp(o->arg, "use_bzip2") == 0 ||
1052
0
             strcmp(o->arg, "USE_BZIP2") == 0)
1053
0
        o->opt = CRAM_OPT_USE_BZIP2, o->val.i = atoi(val);
1054
1055
0
    else if (strcmp(o->arg, "use_rans") == 0 ||
1056
0
             strcmp(o->arg, "USE_RANS") == 0)
1057
0
        o->opt = CRAM_OPT_USE_RANS, o->val.i = atoi(val);
1058
1059
0
    else if (strcmp(o->arg, "use_lzma") == 0 ||
1060
0
             strcmp(o->arg, "USE_LZMA") == 0)
1061
0
        o->opt = CRAM_OPT_USE_LZMA, o->val.i = atoi(val);
1062
1063
0
    else if (strcmp(o->arg, "use_tok") == 0 ||
1064
0
             strcmp(o->arg, "USE_TOK") == 0)
1065
0
        o->opt = CRAM_OPT_USE_TOK, o->val.i = atoi(val);
1066
1067
0
    else if (strcmp(o->arg, "use_fqz") == 0 ||
1068
0
             strcmp(o->arg, "USE_FQZ") == 0)
1069
0
        o->opt = CRAM_OPT_USE_FQZ, o->val.i = atoi(val);
1070
1071
0
    else if (strcmp(o->arg, "use_arith") == 0 ||
1072
0
             strcmp(o->arg, "USE_ARITH") == 0)
1073
0
        o->opt = CRAM_OPT_USE_ARITH, o->val.i = atoi(val);
1074
1075
0
    else if (strcmp(o->arg, "fast") == 0 ||
1076
0
             strcmp(o->arg, "FAST") == 0)
1077
0
        o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_FAST;
1078
1079
0
    else if (strcmp(o->arg, "normal") == 0 ||
1080
0
             strcmp(o->arg, "NORMAL") == 0)
1081
0
        o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_NORMAL;
1082
1083
0
    else if (strcmp(o->arg, "small") == 0 ||
1084
0
             strcmp(o->arg, "SMALL") == 0)
1085
0
        o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_SMALL;
1086
1087
0
    else if (strcmp(o->arg, "archive") == 0 ||
1088
0
             strcmp(o->arg, "ARCHIVE") == 0)
1089
0
        o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_ARCHIVE;
1090
1091
0
    else if (strcmp(o->arg, "reference") == 0 ||
1092
0
             strcmp(o->arg, "REFERENCE") == 0)
1093
0
        o->opt = CRAM_OPT_REFERENCE, o->val.s = val;
1094
1095
0
    else if (strcmp(o->arg, "version") == 0 ||
1096
0
             strcmp(o->arg, "VERSION") == 0)
1097
0
        o->opt = CRAM_OPT_VERSION, o->val.s =val;
1098
1099
0
    else if (strcmp(o->arg, "multi_seq_per_slice") == 0 ||
1100
0
             strcmp(o->arg, "MULTI_SEQ_PER_SLICE") == 0)
1101
0
        o->opt = CRAM_OPT_MULTI_SEQ_PER_SLICE, o->val.i = atoi(val);
1102
1103
0
    else if (strcmp(o->arg, "nthreads") == 0 ||
1104
0
             strcmp(o->arg, "NTHREADS") == 0)
1105
0
        o->opt = HTS_OPT_NTHREADS, o->val.i = atoi(val);
1106
1107
0
    else if (strcmp(o->arg, "cache_size") == 0 ||
1108
0
             strcmp(o->arg, "CACHE_SIZE") == 0) {
1109
0
        char *endp;
1110
0
        o->opt = HTS_OPT_CACHE_SIZE;
1111
0
        o->val.i = strtol(val, &endp, 0);
1112
        // NB: Doesn't support floats, eg 1.5g
1113
        // TODO: extend hts_parse_decimal? See also samtools sort.
1114
0
        switch (*endp) {
1115
0
        case 'g': case 'G': o->val.i *= 1024; // fall through
1116
0
        case 'm': case 'M': o->val.i *= 1024; // fall through
1117
0
        case 'k': case 'K': o->val.i *= 1024; break;
1118
0
        case '\0': break;
1119
0
        default:
1120
0
            hts_log_error("Unrecognised cache size suffix '%c'", *endp);
1121
0
            free(o->arg);
1122
0
            free(o);
1123
0
            return -1;
1124
0
        }
1125
0
    }
1126
1127
0
    else if (strcmp(o->arg, "required_fields") == 0 ||
1128
0
             strcmp(o->arg, "REQUIRED_FIELDS") == 0)
1129
0
        o->opt = CRAM_OPT_REQUIRED_FIELDS, o->val.i = strtol(val, NULL, 0);
1130
1131
0
    else if (strcmp(o->arg, "lossy_names") == 0 ||
1132
0
             strcmp(o->arg, "LOSSY_NAMES") == 0)
1133
0
        o->opt = CRAM_OPT_LOSSY_NAMES, o->val.i = strtol(val, NULL, 0);
1134
1135
0
    else if (strcmp(o->arg, "name_prefix") == 0 ||
1136
0
             strcmp(o->arg, "NAME_PREFIX") == 0)
1137
0
        o->opt = CRAM_OPT_PREFIX, o->val.s = val;
1138
1139
0
    else if (strcmp(o->arg, "store_md") == 0 ||
1140
0
             strcmp(o->arg, "store_md") == 0)
1141
0
        o->opt = CRAM_OPT_STORE_MD, o->val.i = atoi(val);
1142
1143
0
    else if (strcmp(o->arg, "store_nm") == 0 ||
1144
0
             strcmp(o->arg, "store_nm") == 0)
1145
0
        o->opt = CRAM_OPT_STORE_NM, o->val.i = atoi(val);
1146
1147
0
    else if (strcmp(o->arg, "block_size") == 0 ||
1148
0
             strcmp(o->arg, "BLOCK_SIZE") == 0)
1149
0
        o->opt = HTS_OPT_BLOCK_SIZE, o->val.i = strtol(val, NULL, 0);
1150
1151
0
    else if (strcmp(o->arg, "level") == 0 ||
1152
0
             strcmp(o->arg, "LEVEL") == 0)
1153
0
        o->opt = HTS_OPT_COMPRESSION_LEVEL, o->val.i = strtol(val, NULL, 0);
1154
1155
0
    else if (strcmp(o->arg, "filter") == 0 ||
1156
0
             strcmp(o->arg, "FILTER") == 0)
1157
0
        o->opt = HTS_OPT_FILTER, o->val.s = val;
1158
1159
0
    else if (strcmp(o->arg, "fastq_aux") == 0 ||
1160
0
        strcmp(o->arg, "FASTQ_AUX") == 0)
1161
0
        o->opt = FASTQ_OPT_AUX, o->val.s = val;
1162
1163
0
    else if (strcmp(o->arg, "fastq_barcode") == 0 ||
1164
0
        strcmp(o->arg, "FASTQ_BARCODE") == 0)
1165
0
        o->opt = FASTQ_OPT_BARCODE, o->val.s = val;
1166
1167
0
    else if (strcmp(o->arg, "fastq_rnum") == 0 ||
1168
0
        strcmp(o->arg, "FASTQ_RNUM") == 0)
1169
0
        o->opt = FASTQ_OPT_RNUM, o->val.i = 1;
1170
1171
0
    else if (strcmp(o->arg, "fastq_casava") == 0 ||
1172
0
        strcmp(o->arg, "FASTQ_CASAVA") == 0)
1173
0
        o->opt = FASTQ_OPT_CASAVA, o->val.i = 1;
1174
1175
0
    else if (strcmp(o->arg, "fastq_name2") == 0 ||
1176
0
        strcmp(o->arg, "FASTQ_NAME2") == 0)
1177
0
        o->opt = FASTQ_OPT_NAME2, o->val.i = 1;
1178
1179
0
    else {
1180
0
        hts_log_error("Unknown option '%s'", o->arg);
1181
0
        free(o->arg);
1182
0
        free(o);
1183
0
        return -1;
1184
0
    }
1185
1186
0
    o->next = NULL;
1187
1188
    // Append; assumes small list.
1189
0
    if (*opts) {
1190
0
        t = *opts;
1191
0
        while (t->next)
1192
0
            t = t->next;
1193
0
        t->next = o;
1194
0
    } else {
1195
0
        *opts = o;
1196
0
    }
1197
1198
0
    return 0;
1199
0
}
1200
1201
/*
1202
 * Applies an hts_opt option list to a given htsFile.
1203
 *
1204
 * Returns 0 on success
1205
 *        -1 on failure
1206
 */
1207
0
int hts_opt_apply(htsFile *fp, hts_opt *opts) {
1208
0
    hts_opt *last = NULL;
1209
1210
0
    for (; opts;  opts = (last=opts)->next) {
1211
0
        switch (opts->opt) {
1212
0
            case CRAM_OPT_REFERENCE:
1213
0
                if (!(fp->fn_aux = strdup(opts->val.s)))
1214
0
                    return -1;
1215
                // fall through
1216
0
            case CRAM_OPT_VERSION:
1217
0
            case CRAM_OPT_PREFIX:
1218
0
            case HTS_OPT_FILTER:
1219
0
            case FASTQ_OPT_AUX:
1220
0
            case FASTQ_OPT_BARCODE:
1221
0
                if (hts_set_opt(fp,  opts->opt,  opts->val.s) != 0)
1222
0
                    return -1;
1223
0
                break;
1224
0
            default:
1225
0
                if (hts_set_opt(fp,  opts->opt,  opts->val.i) != 0)
1226
0
                    return -1;
1227
0
                break;
1228
0
        }
1229
0
    }
1230
1231
0
    return 0;
1232
0
}
1233
1234
/*
1235
 * Frees an hts_opt list.
1236
 */
1237
0
void hts_opt_free(hts_opt *opts) {
1238
0
    hts_opt *last = NULL;
1239
0
    while (opts) {
1240
0
        opts = (last=opts)->next;
1241
0
        free(last->arg);
1242
0
        free(last);
1243
0
    }
1244
0
}
1245
1246
1247
/*
1248
 * Tokenise options as (key(=value)?,)*(key(=value)?)?
1249
 * NB: No provision for ',' appearing in the value!
1250
 * Add backslashing rules?
1251
 *
1252
 * This could be used as part of a general command line option parser or
1253
 * as a string concatenated onto the file open mode.
1254
 *
1255
 * Returns 0 on success
1256
 *        -1 on failure.
1257
 */
1258
0
int hts_parse_opt_list(htsFormat *fmt, const char *str) {
1259
0
    while (str && *str) {
1260
0
        const char *str_start;
1261
0
        int len;
1262
0
        char arg[8001];
1263
1264
0
        while (*str && *str == ',')
1265
0
            str++;
1266
1267
0
        for (str_start = str; *str && *str != ','; str++);
1268
0
        len = str - str_start;
1269
1270
        // Produce a nul terminated copy of the option
1271
0
        strncpy(arg, str_start, len < 8000 ? len : 8000);
1272
0
        arg[len < 8000 ? len : 8000] = '\0';
1273
1274
0
        if (hts_opt_add((hts_opt **)&fmt->specific, arg) != 0)
1275
0
            return -1;
1276
1277
0
        if (*str)
1278
0
            str++;
1279
0
    }
1280
1281
0
    return 0;
1282
0
}
1283
1284
/*
1285
 * Accepts a string file format (sam, bam, cram, vcf, bam) optionally
1286
 * followed by a comma separated list of key=value options and splits
1287
 * these up into the fields of htsFormat struct.
1288
 *
1289
 * format is assumed to be already initialised, either to blank
1290
 * "unknown" values or via previous hts_opt_add calls.
1291
 *
1292
 * Returns 0 on success
1293
 *        -1 on failure.
1294
 */
1295
0
int hts_parse_format(htsFormat *format, const char *str) {
1296
0
    char fmt[8];
1297
0
    const char *cp = scan_keyword(str, ',', fmt, sizeof fmt);
1298
1299
0
    format->version.minor = 0; // unknown
1300
0
    format->version.major = 0; // unknown
1301
1302
0
    if (strcmp(fmt, "sam") == 0) {
1303
0
        format->category          = sequence_data;
1304
0
        format->format            = sam;
1305
0
        format->compression       = no_compression;
1306
0
        format->compression_level = 0;
1307
0
    } else if (strcmp(fmt, "sam.gz") == 0) {
1308
0
        format->category          = sequence_data;
1309
0
        format->format            = sam;
1310
0
        format->compression       = bgzf;
1311
0
        format->compression_level = -1;
1312
0
    } else if (strcmp(fmt, "bam") == 0) {
1313
0
        format->category          = sequence_data;
1314
0
        format->format            = bam;
1315
0
        format->compression       = bgzf;
1316
0
        format->compression_level = -1;
1317
0
    } else if (strcmp(fmt, "cram") == 0) {
1318
0
        format->category          = sequence_data;
1319
0
        format->format            = cram;
1320
0
        format->compression       = custom;
1321
0
        format->compression_level = -1;
1322
0
    } else if (strcmp(fmt, "vcf") == 0) {
1323
0
        format->category          = variant_data;
1324
0
        format->format            = vcf;
1325
0
        format->compression       = no_compression;
1326
0
        format->compression_level = 0;
1327
0
    } else if (strcmp(fmt, "bcf") == 0) {
1328
0
        format->category          = variant_data;
1329
0
        format->format            = bcf;
1330
0
        format->compression       = bgzf;
1331
0
        format->compression_level = -1;
1332
0
    } else if (strcmp(fmt, "fastq") == 0 || strcmp(fmt, "fq") == 0) {
1333
0
        format->category          = sequence_data;
1334
0
        format->format            = fastq_format;
1335
0
        format->compression       = no_compression;
1336
0
        format->compression_level = 0;
1337
0
    } else if (strcmp(fmt, "fastq.gz") == 0 || strcmp(fmt, "fq.gz") == 0) {
1338
0
        format->category          = sequence_data;
1339
0
        format->format            = fastq_format;
1340
0
        format->compression       = bgzf;
1341
0
        format->compression_level = 0;
1342
0
    } else if (strcmp(fmt, "fasta") == 0 || strcmp(fmt, "fa") == 0) {
1343
0
        format->category          = sequence_data;
1344
0
        format->format            = fasta_format;
1345
0
        format->compression       = no_compression;
1346
0
        format->compression_level = 0;
1347
0
    } else if (strcmp(fmt, "fasta.gz") == 0 || strcmp(fmt, "fa.gz") == 0) {
1348
0
        format->category          = sequence_data;
1349
0
        format->format            = fasta_format;
1350
0
        format->compression       = bgzf;
1351
0
        format->compression_level = 0;
1352
0
    } else {
1353
0
        return -1;
1354
0
    }
1355
1356
0
    return hts_parse_opt_list(format, cp);
1357
0
}
1358
1359
1360
/*
1361
 * Tokenise options as (key(=value)?,)*(key(=value)?)?
1362
 * NB: No provision for ',' appearing in the value!
1363
 * Add backslashing rules?
1364
 *
1365
 * This could be used as part of a general command line option parser or
1366
 * as a string concatenated onto the file open mode.
1367
 *
1368
 * Returns 0 on success
1369
 *        -1 on failure.
1370
 */
1371
0
static int hts_process_opts(htsFile *fp, const char *opts) {
1372
0
    htsFormat fmt;
1373
1374
0
    fmt.specific = NULL;
1375
0
    if (hts_parse_opt_list(&fmt, opts) != 0)
1376
0
        return -1;
1377
1378
0
    if (hts_opt_apply(fp, fmt.specific) != 0) {
1379
0
        hts_opt_free(fmt.specific);
1380
0
        return -1;
1381
0
    }
1382
1383
0
    hts_opt_free(fmt.specific);
1384
1385
0
    return 0;
1386
0
}
1387
1388
static int hts_crypt4gh_redirect(const char *fn, const char *mode,
1389
0
                                 hFILE **hfile_ptr, htsFile *fp) {
1390
0
    hFILE *hfile1 = *hfile_ptr;
1391
0
    hFILE *hfile2 = NULL;
1392
0
    char fn_buf[512], *fn2 = fn_buf;
1393
0
    char mode2[102]; // Size set by sizeof(simple_mode) in hts_hopen()
1394
0
    const char *prefix = "crypt4gh:";
1395
0
    size_t fn2_len = strlen(prefix) + strlen(fn) + 1;
1396
0
    int ret = -1;
1397
1398
0
    if (fn2_len > sizeof(fn_buf)) {
1399
0
        if (fn2_len >= INT_MAX) // Silence gcc format-truncation warning
1400
0
            return -1;
1401
0
        fn2 = malloc(fn2_len);
1402
0
        if (!fn2) return -1;
1403
0
    }
1404
1405
    // Reopen fn using the crypt4gh plug-in (if available)
1406
0
    snprintf(fn2, fn2_len, "%s%s", prefix, fn);
1407
0
    snprintf(mode2, sizeof(mode2), "%s%s", mode, strchr(mode, ':') ? "" : ":");
1408
0
    hfile2 = hopen(fn2, mode2, "parent", hfile1, NULL);
1409
0
    if (hfile2) {
1410
        // Replace original hfile with the new one.  The original is now
1411
        // enclosed within hfile2
1412
0
        *hfile_ptr = hfile2;
1413
0
        ret = 0;
1414
0
    }
1415
1416
0
    if (fn2 != fn_buf)
1417
0
        free(fn2);
1418
0
    return ret;
1419
0
}
1420
1421
htsFile *hts_hopen(hFILE *hfile, const char *fn, const char *mode)
1422
28.5k
{
1423
28.5k
    hFILE *hfile_orig = hfile;
1424
28.5k
    hFILE *hfile_cleanup = hfile;
1425
28.5k
    htsFile *fp = (htsFile*)calloc(1, sizeof(htsFile));
1426
28.5k
    char simple_mode[101], *cp, *opts;
1427
28.5k
    simple_mode[100] = '\0';
1428
1429
28.5k
    if (fp == NULL) goto error;
1430
1431
28.5k
    fp->fn = strdup(fn);
1432
28.5k
    fp->is_be = ed_is_big();
1433
1434
    // Split mode into simple_mode,opts strings
1435
28.5k
    if ((cp = strchr(mode, ','))) {
1436
0
        strncpy(simple_mode, mode, cp-mode <= 100 ? cp-mode : 100);
1437
0
        simple_mode[cp-mode] = '\0';
1438
0
        opts = cp+1;
1439
28.5k
    } else {
1440
28.5k
        strncpy(simple_mode, mode, 100);
1441
28.5k
        opts = NULL;
1442
28.5k
    }
1443
1444
28.5k
    if (strchr(simple_mode, 'r')) {
1445
14.8k
        const int max_loops = 5; // Should be plenty
1446
14.8k
        int loops = 0;
1447
14.8k
        if (hts_detect_format2(hfile, fn, &fp->format) < 0) goto error;
1448
1449
        // Deal with formats that re-direct an underlying file via a plug-in.
1450
        // Loops as we may have crypt4gh served via htsget, or
1451
        // crypt4gh-in-crypt4gh.
1452
1453
14.8k
        while (fp->format.format == htsget ||
1454
14.8k
               fp->format.format == hts_crypt4gh_format) {
1455
            // Ensure we don't get stuck in an endless redirect loop
1456
0
            if (++loops > max_loops) {
1457
0
                errno = ELOOP;
1458
0
                goto error;
1459
0
            }
1460
1461
0
            if (fp->format.format == htsget) {
1462
0
                hFILE *hfile2 = hopen_htsget_redirect(hfile, simple_mode);
1463
0
                if (hfile2 == NULL) goto error;
1464
1465
0
                if (hfile != hfile_cleanup) {
1466
                    // Close the result of an earlier redirection
1467
0
                    hclose_abruptly(hfile);
1468
0
                }
1469
1470
0
                hfile = hfile2;
1471
0
            }
1472
0
            else if (fp->format.format == hts_crypt4gh_format) {
1473
0
                int should_preserve = (hfile == hfile_orig);
1474
0
                int update_cleanup = (hfile == hfile_cleanup);
1475
0
                if (hts_crypt4gh_redirect(fn, simple_mode, &hfile, fp) < 0)
1476
0
                    goto error;
1477
0
                if (should_preserve) {
1478
                    // The original hFILE is now contained in a crypt4gh
1479
                    // wrapper.  Should we need to close the wrapper due
1480
                    // to a later error, we need to prevent the wrapped
1481
                    // handle from being closed as the caller will see
1482
                    // this function return NULL and try to clean up itself.
1483
0
                    hfile_orig->preserve = 1;
1484
0
                }
1485
0
                if (update_cleanup) {
1486
                    // Update handle to close at the end if redirected by htsget
1487
0
                    hfile_cleanup = hfile;
1488
0
                }
1489
0
            }
1490
1491
            // Re-detect format against the result of the redirection
1492
0
            if (hts_detect_format2(hfile, fn, &fp->format) < 0) goto error;
1493
0
        }
1494
14.8k
    }
1495
13.6k
    else if (strchr(simple_mode, 'w') || strchr(simple_mode, 'a')) {
1496
13.6k
        htsFormat *fmt = &fp->format;
1497
13.6k
        fp->is_write = 1;
1498
1499
13.6k
        if (strchr(simple_mode, 'b')) fmt->format = binary_format;
1500
13.6k
        else if (strchr(simple_mode, 'c')) fmt->format = cram;
1501
13.6k
        else if (strchr(simple_mode, 'f')) fmt->format = fastq_format;
1502
13.6k
        else if (strchr(simple_mode, 'F')) fmt->format = fasta_format;
1503
13.6k
        else fmt->format = text_format;
1504
1505
13.6k
        if (strchr(simple_mode, 'z')) fmt->compression = bgzf;
1506
13.6k
        else if (strchr(simple_mode, 'g')) fmt->compression = gzip;
1507
13.6k
        else if (strchr(simple_mode, 'u')) fmt->compression = no_compression;
1508
13.6k
        else {
1509
            // No compression mode specified, set to the default for the format
1510
13.6k
            switch (fmt->format) {
1511
0
            case binary_format: fmt->compression = bgzf; break;
1512
0
            case cram: fmt->compression = custom; break;
1513
0
            case fastq_format: fmt->compression = no_compression; break;
1514
0
            case fasta_format: fmt->compression = no_compression; break;
1515
13.6k
            case text_format: fmt->compression = no_compression; break;
1516
0
            default: abort();
1517
13.6k
            }
1518
13.6k
        }
1519
1520
        // Fill in category (if determinable; e.g. 'b' could be BAM or BCF)
1521
13.6k
        fmt->category = format_category(fmt->format);
1522
1523
13.6k
        fmt->version.major = fmt->version.minor = -1;
1524
13.6k
        fmt->compression_level = -1;
1525
13.6k
        fmt->specific = NULL;
1526
13.6k
    }
1527
0
    else { errno = EINVAL; goto error; }
1528
1529
28.5k
    switch (fp->format.format) {
1530
0
    case binary_format:
1531
468
    case bam:
1532
1.11k
    case bcf:
1533
1.11k
        fp->fp.bgzf = bgzf_hopen(hfile, simple_mode);
1534
1.11k
        if (fp->fp.bgzf == NULL) goto error;
1535
1.11k
        fp->is_bin = fp->is_bgzf = 1;
1536
1.11k
        break;
1537
1538
5.09k
    case cram:
1539
5.09k
        fp->fp.cram = cram_dopen(hfile, fn, simple_mode);
1540
5.09k
        if (fp->fp.cram == NULL) goto error;
1541
3.90k
        if (!fp->is_write)
1542
3.90k
            cram_set_option(fp->fp.cram, CRAM_OPT_DECODE_MD, -1); // auto
1543
3.90k
        fp->is_cram = 1;
1544
3.90k
        break;
1545
1546
10
    case empty_format:
1547
13.7k
    case text_format:
1548
13.7k
    case bed:
1549
14.0k
    case fasta_format:
1550
14.0k
    case fastq_format:
1551
16.9k
    case sam:
1552
22.3k
    case vcf:
1553
22.3k
        if (fp->format.compression != no_compression) {
1554
1.07k
            fp->fp.bgzf = bgzf_hopen(hfile, simple_mode);
1555
1.07k
            if (fp->fp.bgzf == NULL) goto error;
1556
1.06k
            fp->is_bgzf = 1;
1557
1.06k
        }
1558
21.2k
        else
1559
21.2k
            fp->fp.hfile = hfile;
1560
22.3k
        break;
1561
1562
22.3k
    default:
1563
4
        errno = EFTYPE;
1564
4
        goto error;
1565
28.5k
    }
1566
1567
27.3k
    if (opts)
1568
0
        hts_process_opts(fp, opts);
1569
1570
    // Allow original file to close if it was preserved earlier by crypt4gh
1571
27.3k
    hfile_orig->preserve = 0;
1572
1573
    // If redirecting via htsget, close the original hFILE now (pedantically
1574
    // we would instead close it in hts_close(), but this a simplifying
1575
    // optimisation)
1576
27.3k
    if (hfile != hfile_cleanup) hclose_abruptly(hfile_cleanup);
1577
1578
27.3k
    return fp;
1579
1580
1.19k
error:
1581
1.19k
    hts_log_error("Failed to open file %s", fn);
1582
1583
    // If redirecting, close the failed redirection hFILE that we have opened
1584
1.19k
    if (hfile != hfile_orig) hclose_abruptly(hfile);
1585
1.19k
    hfile_orig->preserve = 0; // Allow caller to close the original hfile
1586
1587
1.19k
    if (fp) {
1588
1.19k
        free(fp->fn);
1589
1.19k
        free(fp->fn_aux);
1590
1.19k
        free(fp);
1591
1.19k
    }
1592
1.19k
    return NULL;
1593
28.5k
}
1594
1595
int hts_close(htsFile *fp)
1596
27.3k
{
1597
27.3k
    int ret = 0, save;
1598
1599
27.3k
    switch (fp->format.format) {
1600
0
    case binary_format:
1601
468
    case bam:
1602
1.11k
    case bcf:
1603
1.11k
        ret = bgzf_close(fp->fp.bgzf);
1604
1.11k
        break;
1605
1606
3.90k
    case cram:
1607
3.90k
        if (!fp->is_write) {
1608
3.90k
            switch (cram_eof(fp->fp.cram)) {
1609
51
            case 2:
1610
51
                hts_log_warning("EOF marker is absent. The input is probably truncated");
1611
51
                break;
1612
3.85k
            case 0:  /* not at EOF, but may not have wanted all seqs */
1613
3.85k
            default: /* case 1, expected EOF */
1614
3.85k
                break;
1615
3.90k
            }
1616
3.90k
        }
1617
3.90k
        ret = cram_close(fp->fp.cram);
1618
3.90k
        break;
1619
1620
1
    case empty_format:
1621
1.06k
    case text_format:
1622
1.06k
    case bed:
1623
1.38k
    case fasta_format:
1624
1.41k
    case fastq_format:
1625
11.8k
    case sam:
1626
22.3k
    case vcf:
1627
22.3k
        if (fp->format.format == sam)
1628
10.4k
            ret = sam_state_destroy(fp);
1629
11.8k
        else if (fp->format.format == fastq_format ||
1630
11.8k
                 fp->format.format == fasta_format)
1631
346
            fastq_state_destroy(fp);
1632
1633
22.3k
        if (fp->format.compression != no_compression)
1634
1.06k
            ret |= bgzf_close(fp->fp.bgzf);
1635
21.2k
        else
1636
21.2k
            ret |= hclose(fp->fp.hfile);
1637
22.3k
        break;
1638
1639
0
    default:
1640
0
        ret = -1;
1641
0
        break;
1642
27.3k
    }
1643
1644
27.3k
    save = errno;
1645
27.3k
    sam_hdr_destroy(fp->bam_header);
1646
27.3k
    hts_idx_destroy(fp->idx);
1647
27.3k
    hts_filter_free(fp->filter);
1648
27.3k
    free(fp->fn);
1649
27.3k
    free(fp->fn_aux);
1650
27.3k
    free(fp->line.s);
1651
27.3k
    free(fp);
1652
27.3k
    errno = save;
1653
27.3k
    return ret;
1654
27.3k
}
1655
1656
int hts_flush(htsFile *fp)
1657
0
{
1658
0
    if (fp == NULL) return 0;
1659
1660
0
    switch (fp->format.format) {
1661
0
    case binary_format:
1662
0
    case bam:
1663
0
    case bcf:
1664
0
        return bgzf_flush(fp->fp.bgzf);
1665
1666
0
    case cram:
1667
0
        return cram_flush(fp->fp.cram);
1668
1669
0
    case empty_format:
1670
0
    case text_format:
1671
0
    case bed:
1672
0
    case fasta_format:
1673
0
    case fastq_format:
1674
0
    case sam:
1675
0
    case vcf:
1676
0
        if (fp->format.compression != no_compression)
1677
0
            return bgzf_flush(fp->fp.bgzf);
1678
0
        else
1679
0
            return hflush(fp->fp.hfile);
1680
1681
0
    default:
1682
0
        break;
1683
0
    }
1684
1685
0
    return 0;
1686
0
}
1687
1688
const htsFormat *hts_get_format(htsFile *fp)
1689
0
{
1690
0
    return fp? &fp->format : NULL;
1691
0
}
1692
1693
0
const char *hts_format_file_extension(const htsFormat *format) {
1694
0
    if (!format)
1695
0
        return "?";
1696
1697
0
    switch (format->format) {
1698
0
    case sam:  return "sam";
1699
0
    case bam:  return "bam";
1700
0
    case bai:  return "bai";
1701
0
    case cram: return "cram";
1702
0
    case crai: return "crai";
1703
0
    case vcf:  return "vcf";
1704
0
    case bcf:  return "bcf";
1705
0
    case csi:  return "csi";
1706
0
    case fai_format:   return "fai";
1707
0
    case fqi_format:   return "fqi";
1708
0
    case gzi:  return "gzi";
1709
0
    case tbi:  return "tbi";
1710
0
    case bed:  return "bed";
1711
0
    case d4_format:    return "d4";
1712
0
    case fasta_format: return "fa";
1713
0
    case fastq_format: return "fq";
1714
0
    default:   return "?";
1715
0
    }
1716
0
}
1717
1718
0
static hFILE *hts_hfile(htsFile *fp) {
1719
0
    switch (fp->format.format) {
1720
0
    case binary_format:// fall through
1721
0
    case bcf:          // fall through
1722
0
    case bam:          return bgzf_hfile(fp->fp.bgzf);
1723
0
    case cram:         return cram_hfile(fp->fp.cram);
1724
0
    case text_format:  return fp->fp.hfile;
1725
0
    case vcf:          // fall through
1726
0
    case fastq_format: // fall through
1727
0
    case fasta_format: // fall through
1728
0
    case sam:          return fp->format.compression != no_compression
1729
0
                              ? bgzf_hfile(fp->fp.bgzf)
1730
0
                              : fp->fp.hfile;
1731
0
    default:           return NULL;
1732
0
    }
1733
0
}
1734
1735
0
int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) {
1736
0
    int r;
1737
0
    va_list args;
1738
1739
0
    switch (opt) {
1740
0
    case HTS_OPT_NTHREADS: {
1741
0
        va_start(args, opt);
1742
0
        int nthreads = va_arg(args, int);
1743
0
        va_end(args);
1744
0
        return hts_set_threads(fp, nthreads);
1745
0
    }
1746
1747
0
    case HTS_OPT_BLOCK_SIZE: {
1748
0
        hFILE *hf = hts_hfile(fp);
1749
1750
0
        if (hf) {
1751
0
            va_start(args, opt);
1752
0
            if (hfile_set_blksize(hf, va_arg(args, int)) != 0)
1753
0
                hts_log_warning("Failed to change block size");
1754
0
            va_end(args);
1755
0
        }
1756
0
        else {
1757
            // To do - implement for vcf/bcf.
1758
0
            hts_log_warning("Cannot change block size for this format");
1759
0
        }
1760
1761
0
        return 0;
1762
0
    }
1763
1764
0
    case HTS_OPT_THREAD_POOL: {
1765
0
        va_start(args, opt);
1766
0
        htsThreadPool *p = va_arg(args, htsThreadPool *);
1767
0
        va_end(args);
1768
0
        return hts_set_thread_pool(fp, p);
1769
0
    }
1770
1771
0
    case HTS_OPT_CACHE_SIZE: {
1772
0
        va_start(args, opt);
1773
0
        int cache_size = va_arg(args, int);
1774
0
        va_end(args);
1775
0
        hts_set_cache_size(fp, cache_size);
1776
0
        return 0;
1777
0
    }
1778
1779
0
    case FASTQ_OPT_CASAVA:
1780
0
    case FASTQ_OPT_RNUM:
1781
0
    case FASTQ_OPT_NAME2:
1782
0
        if (fp->format.format == fastq_format ||
1783
0
            fp->format.format == fasta_format)
1784
0
            return fastq_state_set(fp, opt);
1785
0
        return 0;
1786
1787
0
    case FASTQ_OPT_AUX:
1788
0
        if (fp->format.format == fastq_format ||
1789
0
            fp->format.format == fasta_format) {
1790
0
            va_start(args, opt);
1791
0
            char *list = va_arg(args, char *);
1792
0
            va_end(args);
1793
0
            return fastq_state_set(fp, opt, list);
1794
0
        }
1795
0
        return 0;
1796
1797
0
    case FASTQ_OPT_BARCODE:
1798
0
        if (fp->format.format == fastq_format ||
1799
0
            fp->format.format == fasta_format) {
1800
0
            va_start(args, opt);
1801
0
            char *bc = va_arg(args, char *);
1802
0
            va_end(args);
1803
0
            return fastq_state_set(fp, opt, bc);
1804
0
        }
1805
0
        return 0;
1806
1807
    // Options below here flow through to cram_set_voption
1808
0
    case HTS_OPT_COMPRESSION_LEVEL: {
1809
0
        va_start(args, opt);
1810
0
        int level = va_arg(args, int);
1811
0
        va_end(args);
1812
0
        if (fp->is_bgzf)
1813
0
            fp->fp.bgzf->compress_level = level;
1814
0
        else if (fp->format.format == cram)
1815
0
            return cram_set_option(fp->fp.cram, opt, level);
1816
0
        return 0;
1817
0
    }
1818
1819
0
    case HTS_OPT_FILTER: {
1820
0
        va_start(args, opt);
1821
0
        char *expr = va_arg(args, char *);
1822
0
        va_end(args);
1823
0
        return hts_set_filter_expression(fp, expr);
1824
0
    }
1825
1826
0
    case HTS_OPT_PROFILE: {
1827
0
        va_start(args, opt);
1828
0
        enum hts_profile_option prof = va_arg(args, int);
1829
0
        va_end(args);
1830
0
        if (fp->is_bgzf) {
1831
0
            switch (prof) {
1832
#ifdef HAVE_LIBDEFLATE
1833
            case HTS_PROFILE_FAST:    fp->fp.bgzf->compress_level =  2; break;
1834
            case HTS_PROFILE_NORMAL:  fp->fp.bgzf->compress_level = -1; break;
1835
            case HTS_PROFILE_SMALL:   fp->fp.bgzf->compress_level = 10; break;
1836
            case HTS_PROFILE_ARCHIVE: fp->fp.bgzf->compress_level = 12; break;
1837
#else
1838
0
            case HTS_PROFILE_FAST:    fp->fp.bgzf->compress_level =  1; break;
1839
0
            case HTS_PROFILE_NORMAL:  fp->fp.bgzf->compress_level = -1; break;
1840
0
            case HTS_PROFILE_SMALL:   fp->fp.bgzf->compress_level =  8; break;
1841
0
            case HTS_PROFILE_ARCHIVE: fp->fp.bgzf->compress_level =  9; break;
1842
0
#endif
1843
0
            }
1844
0
        } // else CRAM manages this in its own way
1845
0
        break;
1846
0
    }
1847
1848
0
    default:
1849
0
        break;
1850
0
    }
1851
1852
0
    if (fp->format.format != cram)
1853
0
        return 0;
1854
1855
0
    va_start(args, opt);
1856
0
    r = cram_set_voption(fp->fp.cram, opt, args);
1857
0
    va_end(args);
1858
1859
0
    return r;
1860
0
}
1861
1862
BGZF *hts_get_bgzfp(htsFile *fp);
1863
1864
int hts_set_threads(htsFile *fp, int n)
1865
0
{
1866
0
    if (fp->format.format == sam) {
1867
0
        return sam_set_threads(fp, n);
1868
0
    } else if (fp->format.compression == bgzf) {
1869
0
        return bgzf_mt(hts_get_bgzfp(fp), n, 256/*unused*/);
1870
0
    } else if (fp->format.format == cram) {
1871
0
        return hts_set_opt(fp, CRAM_OPT_NTHREADS, n);
1872
0
    }
1873
0
    else return 0;
1874
0
}
1875
1876
0
int hts_set_thread_pool(htsFile *fp, htsThreadPool *p) {
1877
0
    if (fp->format.format == sam || fp->format.format == text_format) {
1878
0
        return sam_set_thread_pool(fp, p);
1879
0
    } else if (fp->format.compression == bgzf) {
1880
0
        return bgzf_thread_pool(hts_get_bgzfp(fp), p->pool, p->qsize);
1881
0
    } else if (fp->format.format == cram) {
1882
0
        return hts_set_opt(fp, CRAM_OPT_THREAD_POOL, p);
1883
0
    }
1884
0
    else return 0;
1885
0
}
1886
1887
void hts_set_cache_size(htsFile *fp, int n)
1888
0
{
1889
0
    if (fp->format.compression == bgzf)
1890
0
        bgzf_set_cache_size(hts_get_bgzfp(fp), n);
1891
0
}
1892
1893
int hts_set_fai_filename(htsFile *fp, const char *fn_aux)
1894
0
{
1895
0
    free(fp->fn_aux);
1896
0
    if (fn_aux) {
1897
0
        fp->fn_aux = strdup(fn_aux);
1898
0
        if (fp->fn_aux == NULL) return -1;
1899
0
    }
1900
0
    else fp->fn_aux = NULL;
1901
1902
0
    if (fp->format.format == cram)
1903
0
        if (cram_set_option(fp->fp.cram, CRAM_OPT_REFERENCE, fp->fn_aux))
1904
0
            return -1;
1905
1906
0
    return 0;
1907
0
}
1908
1909
int hts_set_filter_expression(htsFile *fp, const char *expr)
1910
0
{
1911
0
    if (fp->filter)
1912
0
        hts_filter_free(fp->filter);
1913
1914
0
    if (!expr)
1915
0
        return 0;
1916
1917
0
    return (fp->filter = hts_filter_init(expr))
1918
0
        ? 0 : -1;
1919
0
}
1920
1921
hFILE *hts_open_tmpfile(const char *fname, const char *mode, kstring_t *tmpname)
1922
0
{
1923
0
    int pid = (int) getpid();
1924
0
    unsigned ptr = (uintptr_t) tmpname;
1925
0
    int n = 0;
1926
0
    hFILE *fp = NULL;
1927
1928
0
    do {
1929
        // Attempt to further uniquify the temporary filename
1930
0
        unsigned t = ((unsigned) time(NULL)) ^ ((unsigned) clock()) ^ ptr;
1931
0
        n++;
1932
1933
0
        ks_clear(tmpname);
1934
0
        if (ksprintf(tmpname, "%s.tmp_%d_%d_%u", fname, pid, n, t) < 0) break;
1935
1936
0
        fp = hopen(tmpname->s, mode);
1937
0
    } while (fp == NULL && errno == EEXIST && n < 100);
1938
1939
0
    return fp;
1940
0
}
1941
1942
// For VCF/BCF backward sweeper. Not exposing these functions because their
1943
// future is uncertain. Things will probably have to change with hFILE...
1944
BGZF *hts_get_bgzfp(htsFile *fp)
1945
0
{
1946
0
    if (fp->is_bgzf)
1947
0
        return fp->fp.bgzf;
1948
0
    else
1949
0
        return NULL;
1950
0
}
1951
int hts_useek(htsFile *fp, off_t uoffset, int where)
1952
0
{
1953
0
    if (fp->is_bgzf)
1954
0
        return bgzf_useek(fp->fp.bgzf, uoffset, where);
1955
0
    else
1956
0
        return (hseek(fp->fp.hfile, uoffset, SEEK_SET) >= 0)? 0 : -1;
1957
0
}
1958
off_t hts_utell(htsFile *fp)
1959
0
{
1960
0
    if (fp->is_bgzf)
1961
0
        return bgzf_utell(fp->fp.bgzf);
1962
0
    else
1963
0
        return htell(fp->fp.hfile);
1964
0
}
1965
1966
int hts_getline(htsFile *fp, int delimiter, kstring_t *str)
1967
27.0M
{
1968
27.0M
    int ret;
1969
27.0M
    if (! (delimiter == KS_SEP_LINE || delimiter == '\n')) {
1970
0
        hts_log_error("Unexpected delimiter %d", delimiter);
1971
0
        abort();
1972
0
    }
1973
1974
27.0M
    switch (fp->format.compression) {
1975
124k
    case no_compression:
1976
124k
        str->l = 0;
1977
124k
        ret = kgetline2(str, (kgets_func2 *) hgetln, fp->fp.hfile);
1978
124k
        if (ret >= 0) ret = (str->l <= INT_MAX)? (int) str->l : INT_MAX;
1979
4.78k
        else if (herrno(fp->fp.hfile)) ret = -2, errno = herrno(fp->fp.hfile);
1980
4.78k
        else ret = -1;
1981
124k
        break;
1982
1983
26.9M
    case gzip:
1984
26.9M
    case bgzf:
1985
26.9M
        ret = bgzf_getline(fp->fp.bgzf, '\n', str);
1986
26.9M
        break;
1987
1988
0
    default:
1989
0
        abort();
1990
27.0M
    }
1991
1992
27.0M
    ++fp->lineno;
1993
27.0M
    return ret;
1994
27.0M
}
1995
1996
char **hts_readlist(const char *string, int is_file, int *_n)
1997
0
{
1998
0
    unsigned int m = 0, n = 0;
1999
0
    char **s = 0, **s_new;
2000
0
    if ( is_file )
2001
0
    {
2002
0
        BGZF *fp = bgzf_open(string, "r");
2003
0
        if ( !fp ) return NULL;
2004
2005
0
        kstring_t str;
2006
0
        int ret;
2007
0
        str.s = 0; str.l = str.m = 0;
2008
0
        while ((ret = bgzf_getline(fp, '\n', &str)) >= 0)
2009
0
        {
2010
0
            if (str.l == 0) continue;
2011
0
            if (hts_resize(char*, n + 1, &m, &s, 0) < 0)
2012
0
                goto err;
2013
0
            s[n] = strdup(str.s);
2014
0
            if (!s[n])
2015
0
                goto err;
2016
0
            n++;
2017
0
        }
2018
0
        if (ret < -1) // Read error
2019
0
            goto err;
2020
0
        bgzf_close(fp);
2021
0
        free(str.s);
2022
0
    }
2023
0
    else
2024
0
    {
2025
0
        const char *q = string, *p = string;
2026
0
        while ( 1 )
2027
0
        {
2028
0
            if (*p == ',' || *p == 0)
2029
0
            {
2030
0
                if (hts_resize(char*, n + 1, &m, &s, 0) < 0)
2031
0
                    goto err;
2032
0
                s[n] = (char*)calloc(p - q + 1, 1);
2033
0
                if (!s[n])
2034
0
                    goto err;
2035
0
                strncpy(s[n++], q, p - q);
2036
0
                q = p + 1;
2037
0
            }
2038
0
            if ( !*p ) break;
2039
0
            p++;
2040
0
        }
2041
0
    }
2042
    // Try to shrink s to the minimum size needed
2043
0
    s_new = (char**)realloc(s, n * sizeof(char*));
2044
0
    if (!s_new)
2045
0
        goto err;
2046
2047
0
    s = s_new;
2048
0
    assert(n < INT_MAX); // hts_resize() should ensure this
2049
0
    *_n = n;
2050
0
    return s;
2051
2052
0
 err:
2053
0
    for (m = 0; m < n; m++)
2054
0
        free(s[m]);
2055
0
    free(s);
2056
0
    return NULL;
2057
0
}
2058
2059
char **hts_readlines(const char *fn, int *_n)
2060
0
{
2061
0
    unsigned int m = 0, n = 0;
2062
0
    char **s = 0, **s_new;
2063
0
    BGZF *fp = bgzf_open(fn, "r");
2064
0
    if ( fp ) { // read from file
2065
0
        kstring_t str;
2066
0
        int ret;
2067
0
        str.s = 0; str.l = str.m = 0;
2068
0
        while ((ret = bgzf_getline(fp, '\n', &str)) >= 0) {
2069
0
            if (str.l == 0) continue;
2070
0
            if (hts_resize(char *, n + 1, &m, &s, 0) < 0)
2071
0
                goto err;
2072
0
            s[n] = strdup(str.s);
2073
0
            if (!s[n])
2074
0
                goto err;
2075
0
            n++;
2076
0
        }
2077
0
        if (ret < -1) // Read error
2078
0
            goto err;
2079
0
        bgzf_close(fp);
2080
0
        free(str.s);
2081
0
    } else if (*fn == ':') { // read from string
2082
0
        const char *q, *p;
2083
0
        for (q = p = fn + 1;; ++p)
2084
0
            if (*p == ',' || *p == 0) {
2085
0
                if (hts_resize(char *, n + 1, &m, &s, 0) < 0)
2086
0
                    goto err;
2087
0
                s[n] = (char*)calloc(p - q + 1, 1);
2088
0
                if (!s[n])
2089
0
                    goto err;
2090
0
                strncpy(s[n++], q, p - q);
2091
0
                q = p + 1;
2092
0
                if (*p == 0) break;
2093
0
            }
2094
0
    } else return 0;
2095
    // Try to shrink s to the minimum size needed
2096
0
    s_new = (char**)realloc(s, n * sizeof(char*));
2097
0
    if (!s_new)
2098
0
        goto err;
2099
2100
0
    s = s_new;
2101
0
    assert(n < INT_MAX); // hts_resize() should ensure this
2102
0
    *_n = n;
2103
0
    return s;
2104
2105
0
 err:
2106
0
    for (m = 0; m < n; m++)
2107
0
        free(s[m]);
2108
0
    free(s);
2109
0
    return NULL;
2110
0
}
2111
2112
// DEPRECATED: To be removed in a future HTSlib release
2113
int hts_file_type(const char *fname)
2114
0
{
2115
0
    int len = strlen(fname);
2116
0
    if ( !strcasecmp(".vcf.gz",fname+len-7) ) return FT_VCF_GZ;
2117
0
    if ( !strcasecmp(".vcf",fname+len-4) ) return FT_VCF;
2118
0
    if ( !strcasecmp(".bcf",fname+len-4) ) return FT_BCF_GZ;
2119
0
    if ( !strcmp("-",fname) ) return FT_STDIN;
2120
2121
0
    hFILE *f = hopen(fname, "r");
2122
0
    if (f == NULL) return 0;
2123
2124
0
    htsFormat fmt;
2125
0
    if (hts_detect_format2(f, fname, &fmt) < 0) { hclose_abruptly(f); return 0; }
2126
0
    if (hclose(f) < 0) return 0;
2127
2128
0
    switch (fmt.format) {
2129
0
    case vcf: return (fmt.compression == no_compression)? FT_VCF : FT_VCF_GZ;
2130
0
    case bcf: return (fmt.compression == no_compression)? FT_BCF : FT_BCF_GZ;
2131
0
    default:  return 0;
2132
0
    }
2133
0
}
2134
2135
int hts_check_EOF(htsFile *fp)
2136
0
{
2137
0
    if (fp->format.compression == bgzf)
2138
0
        return bgzf_check_EOF(hts_get_bgzfp(fp));
2139
0
    else if (fp->format.format == cram)
2140
0
        return cram_check_EOF(fp->fp.cram);
2141
0
    else
2142
0
        return 3;
2143
0
}
2144
2145
2146
/****************
2147
 *** Indexing ***
2148
 ****************/
2149
2150
0
#define HTS_MIN_MARKER_DIST 0x10000
2151
2152
// Finds the special meta bin
2153
//  ((1<<(3 * n_lvls + 3)) - 1) / 7 + 1
2154
0
#define META_BIN(idx) ((idx)->n_bins + 1)
2155
2156
0
#define pair64_lt(a,b) ((a).u < (b).u)
2157
0
#define pair64max_lt(a,b) ((a).u < (b).u || \
2158
0
                           ((a).u == (b).u && (a).max < (b).max))
2159
2160
0
KSORT_INIT_STATIC(_off, hts_pair64_t, pair64_lt)
2161
0
KSORT_INIT_STATIC(_off_max, hts_pair64_max_t, pair64max_lt)
2162
2163
typedef struct {
2164
    int32_t m, n;
2165
    uint64_t loff;
2166
    hts_pair64_t *list;
2167
} bins_t;
2168
2169
KHASH_MAP_INIT_INT(bin, bins_t)
2170
typedef khash_t(bin) bidx_t;
2171
2172
typedef struct {
2173
    hts_pos_t n, m;
2174
    uint64_t *offset;
2175
} lidx_t;
2176
2177
struct hts_idx_t {
2178
    int fmt, min_shift, n_lvls, n_bins;
2179
    uint32_t l_meta;
2180
    int32_t n, m;
2181
    uint64_t n_no_coor;
2182
    bidx_t **bidx;
2183
    lidx_t *lidx;
2184
    uint8_t *meta; // MUST have a terminating NUL on the end
2185
    int tbi_n, last_tbi_tid;
2186
    struct {
2187
        uint32_t last_bin, save_bin;
2188
        hts_pos_t last_coor;
2189
        int last_tid, save_tid, finished;
2190
        uint64_t last_off, save_off;
2191
        uint64_t off_beg, off_end;
2192
        uint64_t n_mapped, n_unmapped;
2193
    } z; // keep internal states
2194
};
2195
2196
0
static char * idx_format_name(int fmt) {
2197
0
    switch (fmt) {
2198
0
        case HTS_FMT_CSI: return "csi";
2199
0
        case HTS_FMT_BAI: return "bai";
2200
0
        case HTS_FMT_TBI: return "tbi";
2201
0
        case HTS_FMT_CRAI: return "crai";
2202
0
        default: return "unknown";
2203
0
    }
2204
0
}
2205
2206
#ifdef DEBUG_INDEX
2207
static void idx_dump(const hts_idx_t *idx) {
2208
    int i;
2209
    int64_t j;
2210
2211
    if (!idx) fprintf(stderr, "Null index\n");
2212
2213
    fprintf(stderr, "format='%s', min_shift=%d, n_lvls=%d, n_bins=%d, l_meta=%u ",
2214
            idx_format_name(idx->fmt), idx->min_shift, idx->n_lvls, idx->n_bins, idx->l_meta);
2215
    fprintf(stderr, "n=%d, m=%d, n_no_coor=%"PRIu64"\n", idx->n, idx->m, idx->n_no_coor);
2216
    for (i = 0; i < idx->n; i++) {
2217
        bidx_t *bidx = idx->bidx[i];
2218
        lidx_t *lidx = &idx->lidx[i];
2219
        if (bidx) {
2220
            fprintf(stderr, "======== BIN Index - tid=%d, n_buckets=%d, size=%d\n", i, bidx->n_buckets, bidx->size);
2221
            int b;
2222
            for (b = 0; b < META_BIN(idx); b++) {
2223
                khint_t k;
2224
                if ((k = kh_get(bin, bidx, b)) != kh_end(bidx)) {
2225
                    bins_t *entries = &kh_value(bidx, k);
2226
                    int l = hts_bin_level(b);
2227
                    int64_t bin_width = 1LL << ((idx->n_lvls - l) * 3 + idx->min_shift);
2228
                    fprintf(stderr, "\tbin=%d, level=%d, parent=%d, n_chunks=%d, loff=%"PRIu64", interval=[%"PRId64" - %"PRId64"]\n",
2229
                        b, l, hts_bin_parent(b), entries->n, entries->loff, (b-hts_bin_first(l))*bin_width+1, (b+1-hts_bin_first(l))*bin_width);
2230
                    for (j = 0; j < entries->n; j++)
2231
                        fprintf(stderr, "\t\tchunk=%"PRId64", u=%"PRIu64", v=%"PRIu64"\n", j, entries->list[j].u, entries->list[j].v);
2232
                }
2233
            }
2234
        }
2235
        if (lidx) {
2236
            fprintf(stderr, "======== LINEAR Index - tid=%d, n_values=%"PRId64"\n", i, lidx->n);
2237
            for (j = 0; j < lidx->n; j++) {
2238
                fprintf(stderr, "\t\tentry=%"PRId64", offset=%"PRIu64", interval=[%"PRId64" - %"PRId64"]\n",
2239
                    j, lidx->offset[j], j*(1<<idx->min_shift)+1, (j+1)*(1<<idx->min_shift));
2240
            }
2241
        }
2242
    }
2243
}
2244
#endif
2245
2246
static inline int insert_to_b(bidx_t *b, int bin, uint64_t beg, uint64_t end)
2247
0
{
2248
0
    khint_t k;
2249
0
    bins_t *l;
2250
0
    int absent;
2251
0
    k = kh_put(bin, b, bin, &absent);
2252
0
    if (absent < 0) return -1; // Out of memory
2253
0
    l = &kh_value(b, k);
2254
0
    if (absent) {
2255
0
        l->m = 1; l->n = 0;
2256
0
        l->list = (hts_pair64_t*)calloc(l->m, sizeof(hts_pair64_t));
2257
0
        if (!l->list) {
2258
0
            kh_del(bin, b, k);
2259
0
            return -1;
2260
0
        }
2261
0
    } else if (l->n == l->m) {
2262
0
        uint32_t new_m = l->m ? l->m << 1 : 1;
2263
0
        hts_pair64_t *new_list = realloc(l->list, new_m * sizeof(hts_pair64_t));
2264
0
        if (!new_list) return -1;
2265
0
        l->list = new_list;
2266
0
        l->m = new_m;
2267
0
    }
2268
0
    l->list[l->n].u = beg;
2269
0
    l->list[l->n++].v = end;
2270
0
    return 0;
2271
0
}
2272
2273
static inline int insert_to_l(lidx_t *l, int64_t _beg, int64_t _end, uint64_t offset, int min_shift)
2274
0
{
2275
0
    int i;
2276
0
    hts_pos_t beg, end;
2277
0
    beg = _beg >> min_shift;
2278
0
    end = (_end - 1) >> min_shift;
2279
0
    if (l->m < end + 1) {
2280
0
        size_t new_m = l->m * 2 > end + 1 ? l->m * 2 : end + 1;
2281
0
        uint64_t *new_offset;
2282
2283
0
        new_offset = (uint64_t*)realloc(l->offset, new_m * sizeof(uint64_t));
2284
0
        if (!new_offset) return -1;
2285
2286
        // fill unused memory with (uint64_t)-1
2287
0
        memset(new_offset + l->m, 0xff, sizeof(uint64_t) * (new_m - l->m));
2288
0
        l->m = new_m;
2289
0
        l->offset = new_offset;
2290
0
    }
2291
0
    for (i = beg; i <= end; ++i) {
2292
0
        if (l->offset[i] == (uint64_t)-1) l->offset[i] = offset;
2293
0
    }
2294
0
    if (l->n < end + 1) l->n = end + 1;
2295
0
    return 0;
2296
0
}
2297
2298
hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls)
2299
0
{
2300
0
    hts_idx_t *idx;
2301
0
    idx = (hts_idx_t*)calloc(1, sizeof(hts_idx_t));
2302
0
    if (idx == NULL) return NULL;
2303
0
    idx->fmt = fmt;
2304
0
    idx->min_shift = min_shift;
2305
0
    idx->n_lvls = n_lvls;
2306
0
    idx->n_bins = ((1<<(3 * n_lvls + 3)) - 1) / 7;
2307
0
    idx->z.save_tid = idx->z.last_tid = -1;
2308
0
    idx->z.save_bin = idx->z.last_bin = 0xffffffffu;
2309
0
    idx->z.save_off = idx->z.last_off = idx->z.off_beg = idx->z.off_end = offset0;
2310
0
    idx->z.last_coor = 0xffffffffu;
2311
0
    if (n) {
2312
0
        idx->n = idx->m = n;
2313
0
        idx->bidx = (bidx_t**)calloc(n, sizeof(bidx_t*));
2314
0
        if (idx->bidx == NULL) { free(idx); return NULL; }
2315
0
        idx->lidx = (lidx_t*) calloc(n, sizeof(lidx_t));
2316
0
        if (idx->lidx == NULL) { free(idx->bidx); free(idx); return NULL; }
2317
0
    }
2318
0
    idx->tbi_n = -1;
2319
0
    idx->last_tbi_tid = -1;
2320
0
    return idx;
2321
0
}
2322
2323
static void update_loff(hts_idx_t *idx, int i, int free_lidx)
2324
0
{
2325
0
    bidx_t *bidx = idx->bidx[i];
2326
0
    lidx_t *lidx = &idx->lidx[i];
2327
0
    khint_t k;
2328
0
    int l;
2329
    // the last entry is always valid
2330
0
    for (l=lidx->n-2; l >= 0; l--) {
2331
0
        if (lidx->offset[l] == (uint64_t)-1)
2332
0
            lidx->offset[l] = lidx->offset[l+1];
2333
0
    }
2334
0
    if (bidx == 0) return;
2335
0
    for (k = kh_begin(bidx); k != kh_end(bidx); ++k) // set loff
2336
0
        if (kh_exist(bidx, k))
2337
0
        {
2338
0
            if ( kh_key(bidx, k) < idx->n_bins )
2339
0
            {
2340
0
                int bot_bin = hts_bin_bot(kh_key(bidx, k), idx->n_lvls);
2341
                // disable linear index if bot_bin out of bounds
2342
0
                kh_val(bidx, k).loff = bot_bin < lidx->n ? lidx->offset[bot_bin] : 0;
2343
0
            }
2344
0
            else
2345
0
                kh_val(bidx, k).loff = 0;
2346
0
        }
2347
0
    if (free_lidx) {
2348
0
        free(lidx->offset);
2349
0
        lidx->m = lidx->n = 0;
2350
0
        lidx->offset = 0;
2351
0
    }
2352
0
}
2353
2354
static int compress_binning(hts_idx_t *idx, int i)
2355
0
{
2356
0
    bidx_t *bidx = idx->bidx[i];
2357
0
    khint_t k;
2358
0
    int l, m;
2359
0
    if (bidx == 0) return 0;
2360
    // merge a bin to its parent if the bin is too small
2361
0
    for (l = idx->n_lvls; l > 0; --l) {
2362
0
        unsigned start = hts_bin_first(l);
2363
0
        for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
2364
0
            bins_t *p, *q;
2365
0
            if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins || kh_key(bidx, k) < start) continue;
2366
0
            p = &kh_value(bidx, k);
2367
0
            if (l < idx->n_lvls && p->n > 1) ks_introsort(_off, p->n, p->list);
2368
0
            if ((p->list[p->n - 1].v>>16) - (p->list[0].u>>16) < HTS_MIN_MARKER_DIST) {
2369
0
                khint_t kp;
2370
0
                kp = kh_get(bin, bidx, hts_bin_parent(kh_key(bidx, k)));
2371
0
                if (kp == kh_end(bidx)) continue;
2372
0
                q = &kh_val(bidx, kp);
2373
0
                if (q->n + p->n > q->m) {
2374
0
                    uint32_t new_m = q->n + p->n;
2375
0
                    hts_pair64_t *new_list;
2376
0
                    kroundup32(new_m);
2377
0
                    if (new_m > INT32_MAX) return -1; // Limited by index format
2378
0
                    new_list = realloc(q->list, new_m * sizeof(*new_list));
2379
0
                    if (!new_list) return -1;
2380
0
                    q->m = new_m;
2381
0
                    q->list = new_list;
2382
0
                }
2383
0
                memcpy(q->list + q->n, p->list, p->n * sizeof(hts_pair64_t));
2384
0
                q->n += p->n;
2385
0
                free(p->list);
2386
0
                kh_del(bin, bidx, k);
2387
0
            }
2388
0
        }
2389
0
    }
2390
0
    k = kh_get(bin, bidx, 0);
2391
0
    if (k != kh_end(bidx)) ks_introsort(_off, kh_val(bidx, k).n, kh_val(bidx, k).list);
2392
    // merge adjacent chunks that start from the same BGZF block
2393
0
    for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
2394
0
        bins_t *p;
2395
0
        if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins) continue;
2396
0
        p = &kh_value(bidx, k);
2397
0
        for (l = 1, m = 0; l < p->n; ++l) {
2398
0
            if (p->list[m].v>>16 >= p->list[l].u>>16) {
2399
0
                if (p->list[m].v < p->list[l].v) p->list[m].v = p->list[l].v;
2400
0
            } else p->list[++m] = p->list[l];
2401
0
        }
2402
0
        p->n = m + 1;
2403
0
    }
2404
0
    return 0;
2405
0
}
2406
2407
int hts_idx_finish(hts_idx_t *idx, uint64_t final_offset)
2408
0
{
2409
0
    int i, ret = 0;
2410
0
    if (idx == NULL || idx->z.finished) return 0; // do not run this function on an empty index or multiple times
2411
0
    if (idx->z.save_tid >= 0) {
2412
0
        ret |= insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin, idx->z.save_off, final_offset);
2413
0
        ret |= insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.off_beg, final_offset);
2414
0
        ret |= insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.n_mapped, idx->z.n_unmapped);
2415
0
    }
2416
0
    for (i = 0; i < idx->n; ++i) {
2417
0
        update_loff(idx, i, (idx->fmt == HTS_FMT_CSI));
2418
0
        ret |= compress_binning(idx, i);
2419
0
    }
2420
0
    idx->z.finished = 1;
2421
2422
0
    return ret;
2423
0
}
2424
2425
int hts_idx_check_range(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end)
2426
0
{
2427
0
    int64_t maxpos = (int64_t) 1 << (idx->min_shift + idx->n_lvls * 3);
2428
0
    if (tid < 0 || (beg <= maxpos && end <= maxpos))
2429
0
        return 0;
2430
2431
0
    if (idx->fmt == HTS_FMT_CSI) {
2432
0
        hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos" "
2433
0
                      "cannot be stored in a csi index with these parameters. "
2434
0
                      "Please use a larger min_shift or depth",
2435
0
                      beg, end);
2436
0
    } else {
2437
0
        hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos
2438
0
                      " cannot be stored in a %s index. Try using a csi index",
2439
0
                      beg, end, idx_format_name(idx->fmt));
2440
0
    }
2441
0
    errno = ERANGE;
2442
0
    return -1;
2443
0
}
2444
2445
int hts_idx_push(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, uint64_t offset, int is_mapped)
2446
0
{
2447
0
    int bin;
2448
0
    if (tid<0) beg = -1, end = 0;
2449
0
    if (hts_idx_check_range(idx, tid, beg, end) < 0)
2450
0
        return -1;
2451
0
    if (tid >= idx->m) { // enlarge the index
2452
0
        uint32_t new_m = idx->m * 2 > tid + 1 ? idx->m * 2 : tid + 1;
2453
0
        bidx_t **new_bidx;
2454
0
        lidx_t *new_lidx;
2455
2456
0
        new_bidx = (bidx_t**)realloc(idx->bidx, new_m * sizeof(bidx_t*));
2457
0
        if (!new_bidx) return -1;
2458
0
        idx->bidx = new_bidx;
2459
2460
0
        new_lidx = (lidx_t*) realloc(idx->lidx, new_m * sizeof(lidx_t));
2461
0
        if (!new_lidx) return -1;
2462
0
        idx->lidx = new_lidx;
2463
2464
0
        memset(&idx->bidx[idx->m], 0, (new_m - idx->m) * sizeof(bidx_t*));
2465
0
        memset(&idx->lidx[idx->m], 0, (new_m - idx->m) * sizeof(lidx_t));
2466
0
        idx->m = new_m;
2467
0
    }
2468
0
    if (idx->n < tid + 1) idx->n = tid + 1;
2469
0
    if (idx->z.finished) return 0;
2470
0
    if (idx->z.last_tid != tid || (idx->z.last_tid >= 0 && tid < 0)) { // change of chromosome
2471
0
        if ( tid>=0 && idx->n_no_coor )
2472
0
        {
2473
0
            hts_log_error("NO_COOR reads not in a single block at the end %d %d", tid, idx->z.last_tid);
2474
0
            return -1;
2475
0
        }
2476
0
        if (tid>=0 && idx->bidx[tid] != 0)
2477
0
        {
2478
0
            hts_log_error("Chromosome blocks not continuous");
2479
0
            return -1;
2480
0
        }
2481
0
        idx->z.last_tid = tid;
2482
0
        idx->z.last_bin = 0xffffffffu;
2483
0
    } else if (tid >= 0 && idx->z.last_coor > beg) { // test if positions are out of order
2484
0
        hts_log_error("Unsorted positions on sequence #%d: %"PRIhts_pos" followed by %"PRIhts_pos, tid+1, idx->z.last_coor+1, beg+1);
2485
0
        return -1;
2486
0
    }
2487
0
    if (end < beg) {
2488
        // Malformed ranges are errors. (Empty ranges (beg==end) are unusual but acceptable.)
2489
0
        hts_log_error("Invalid record on sequence #%d: end %"PRId64" < begin %"PRId64, tid+1, end, beg+1);
2490
0
        return -1;
2491
0
    }
2492
0
    if ( tid>=0 )
2493
0
    {
2494
0
        if (idx->bidx[tid] == 0) idx->bidx[tid] = kh_init(bin);
2495
        // shoehorn [-1,0) (VCF POS=0) into the leftmost bottom-level bin
2496
0
        if (beg < 0)  beg = 0;
2497
0
        if (end <= 0) end = 1;
2498
        // idx->z.last_off points to the start of the current record
2499
0
        if (insert_to_l(&idx->lidx[tid], beg, end,
2500
0
                        idx->z.last_off, idx->min_shift) < 0) return -1;
2501
0
    }
2502
0
    else idx->n_no_coor++;
2503
0
    bin = hts_reg2bin(beg, end, idx->min_shift, idx->n_lvls);
2504
0
    if ((int)idx->z.last_bin != bin) { // then possibly write the binning index
2505
0
        if (idx->z.save_bin != 0xffffffffu) { // save_bin==0xffffffffu only happens to the first record
2506
0
            if (insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin,
2507
0
                            idx->z.save_off, idx->z.last_off) < 0) return -1;
2508
0
        }
2509
0
        if (idx->z.last_bin == 0xffffffffu && idx->z.save_bin != 0xffffffffu) { // change of chr; keep meta information
2510
0
            idx->z.off_end = idx->z.last_off;
2511
0
            if (insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx),
2512
0
                            idx->z.off_beg, idx->z.off_end) < 0) return -1;
2513
0
            if (insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx),
2514
0
                            idx->z.n_mapped, idx->z.n_unmapped) < 0) return -1;
2515
0
            idx->z.n_mapped = idx->z.n_unmapped = 0;
2516
0
            idx->z.off_beg = idx->z.off_end;
2517
0
        }
2518
0
        idx->z.save_off = idx->z.last_off;
2519
0
        idx->z.save_bin = idx->z.last_bin = bin;
2520
0
        idx->z.save_tid = tid;
2521
0
    }
2522
0
    if (is_mapped) ++idx->z.n_mapped;
2523
0
    else ++idx->z.n_unmapped;
2524
0
    idx->z.last_off = offset;
2525
0
    idx->z.last_coor = beg;
2526
0
    return 0;
2527
0
}
2528
2529
// Needed for TBI only.  Ensure 'tid' with 'name' is in the index meta data.
2530
// idx->meta needs to have been initialised first with an appropriate Tabix
2531
// configuration via hts_idx_set_meta.
2532
//
2533
// NB number of references (first 4 bytes of tabix header) aren't in
2534
// idx->meta, but held in idx->n instead.
2535
0
int hts_idx_tbi_name(hts_idx_t *idx, int tid, const char *name) {
2536
    // Horrid - we have to map incoming tid to a tbi alternative tid.
2537
    // This is because TBI counts tids by "covered" refs while everything
2538
    // else counts by Nth SQ/contig record in header.
2539
0
    if (tid == idx->last_tbi_tid || tid < 0 || !name)
2540
0
        return idx->tbi_n;
2541
2542
0
    uint32_t len = strlen(name)+1;
2543
0
    uint8_t *tmp = (uint8_t *)realloc(idx->meta, idx->l_meta + len);
2544
0
    if (!tmp)
2545
0
        return -1;
2546
2547
    // Append name
2548
0
    idx->meta = tmp;
2549
0
    strcpy((char *)idx->meta + idx->l_meta, name);
2550
0
    idx->l_meta += len;
2551
2552
    // Update seq length
2553
0
    u32_to_le(le_to_u32(idx->meta+24)+len, idx->meta+24);
2554
2555
0
    idx->last_tbi_tid = tid;
2556
0
    return ++idx->tbi_n;
2557
0
}
2558
2559
// When doing samtools index we have a read_bam / hts_idx_push(bgzf_tell())
2560
// loop.  idx->z.last_off is the previous bzgf_tell location, so we know
2561
// the location the current bam record started at as well as where it ends.
2562
//
2563
// When building an index on the fly via a write_bam / hts_idx_push loop,
2564
// this isn't quite identical as we may amend the virtual coord returned
2565
// by bgzf_tell to the start of a new block if the next bam struct doesn't
2566
// fit.  It's essentially the same thing, but for bit-identical indices
2567
// we need to amend the idx->z.last_off when we know we're starting a new
2568
// block.
2569
void hts_idx_amend_last(hts_idx_t *idx, uint64_t offset)
2570
0
{
2571
0
    idx->z.last_off = offset;
2572
0
}
2573
2574
void hts_idx_destroy(hts_idx_t *idx)
2575
27.3k
{
2576
27.3k
    khint_t k;
2577
27.3k
    int i;
2578
27.3k
    if (idx == 0) return;
2579
2580
    // For HTS_FMT_CRAI, idx actually points to a different type -- see sam.c
2581
0
    if (idx->fmt == HTS_FMT_CRAI) {
2582
0
        hts_cram_idx_t *cidx = (hts_cram_idx_t *) idx;
2583
0
        cram_index_free(cidx->cram);
2584
0
        free(cidx);
2585
0
        return;
2586
0
    }
2587
2588
0
    for (i = 0; i < idx->m; ++i) {
2589
0
        bidx_t *bidx = idx->bidx[i];
2590
0
        free(idx->lidx[i].offset);
2591
0
        if (bidx == 0) continue;
2592
0
        for (k = kh_begin(bidx); k != kh_end(bidx); ++k)
2593
0
            if (kh_exist(bidx, k))
2594
0
                free(kh_value(bidx, k).list);
2595
0
        kh_destroy(bin, bidx);
2596
0
    }
2597
0
    free(idx->bidx); free(idx->lidx); free(idx->meta);
2598
0
    free(idx);
2599
0
}
2600
2601
0
int hts_idx_fmt(hts_idx_t *idx) {
2602
0
    return idx->fmt;
2603
0
}
2604
2605
// The optimizer eliminates these ed_is_big() calls; still it would be good to
2606
// TODO Determine endianness at configure- or compile-time
2607
2608
static inline ssize_t HTS_RESULT_USED idx_write_int32(BGZF *fp, int32_t x)
2609
0
{
2610
0
    if (ed_is_big()) x = ed_swap_4(x);
2611
0
    return bgzf_write(fp, &x, sizeof x);
2612
0
}
2613
2614
static inline ssize_t HTS_RESULT_USED idx_write_uint32(BGZF *fp, uint32_t x)
2615
0
{
2616
0
    if (ed_is_big()) x = ed_swap_4(x);
2617
0
    return bgzf_write(fp, &x, sizeof x);
2618
0
}
2619
2620
static inline ssize_t HTS_RESULT_USED idx_write_uint64(BGZF *fp, uint64_t x)
2621
0
{
2622
0
    if (ed_is_big()) x = ed_swap_8(x);
2623
0
    return bgzf_write(fp, &x, sizeof x);
2624
0
}
2625
2626
static inline void swap_bins(bins_t *p)
2627
0
{
2628
0
    int i;
2629
0
    for (i = 0; i < p->n; ++i) {
2630
0
        ed_swap_8p(&p->list[i].u);
2631
0
        ed_swap_8p(&p->list[i].v);
2632
0
    }
2633
0
}
2634
2635
static int idx_save_core(const hts_idx_t *idx, BGZF *fp, int fmt)
2636
0
{
2637
0
    int32_t i, j;
2638
2639
0
    #define check(ret) if ((ret) < 0) return -1
2640
2641
    // VCF TBI/CSI only writes IDs for non-empty bins (ie covered references)
2642
    //
2643
    // NOTE: CSI meta is undefined in spec, so this code has an assumption
2644
    // that we're only using it for Tabix data.
2645
0
    int nids = idx->n;
2646
0
    if (idx->meta && idx->l_meta >= 4 && le_to_u32(idx->meta) == TBX_VCF) {
2647
0
        for (i = nids = 0; i < idx->n; ++i) {
2648
0
            if (idx->bidx[i])
2649
0
                nids++;
2650
0
        }
2651
0
    }
2652
0
    check(idx_write_int32(fp, nids));
2653
0
    if (fmt == HTS_FMT_TBI && idx->l_meta)
2654
0
        check(bgzf_write(fp, idx->meta, idx->l_meta));
2655
2656
0
    for (i = 0; i < idx->n; ++i) {
2657
0
        khint_t k;
2658
0
        bidx_t *bidx = idx->bidx[i];
2659
0
        lidx_t *lidx = &idx->lidx[i];
2660
2661
        // write binning index
2662
0
        if (nids == idx->n || bidx)
2663
0
            check(idx_write_int32(fp, bidx? kh_size(bidx) : 0));
2664
0
        if (bidx)
2665
0
            for (k = kh_begin(bidx); k != kh_end(bidx); ++k)
2666
0
                if (kh_exist(bidx, k)) {
2667
0
                    bins_t *p = &kh_value(bidx, k);
2668
0
                    check(idx_write_uint32(fp, kh_key(bidx, k)));
2669
0
                    if (fmt == HTS_FMT_CSI) check(idx_write_uint64(fp, p->loff));
2670
                    //int j;for(j=0;j<p->n;++j)fprintf(stderr,"%d,%llx,%d,%llx:%llx\n",kh_key(bidx,k),kh_val(bidx, k).loff,j,p->list[j].u,p->list[j].v);
2671
0
                    check(idx_write_int32(fp, p->n));
2672
0
                    for (j = 0; j < p->n; ++j) {
2673
                        //fprintf(stderr, "\t%ld\t%ld\n", p->list[j].u, p->list[j].v);
2674
0
                        check(idx_write_uint64(fp, p->list[j].u));
2675
0
                        check(idx_write_uint64(fp, p->list[j].v));
2676
0
                    }
2677
0
                }
2678
2679
        // write linear index
2680
0
        if (fmt != HTS_FMT_CSI) {
2681
0
            check(idx_write_int32(fp, lidx->n));
2682
0
            for (j = 0; j < lidx->n; ++j)
2683
0
                check(idx_write_uint64(fp, lidx->offset[j]));
2684
0
        }
2685
0
    }
2686
2687
0
    check(idx_write_uint64(fp, idx->n_no_coor));
2688
#ifdef DEBUG_INDEX
2689
    idx_dump(idx);
2690
#endif
2691
2692
0
    return 0;
2693
0
    #undef check
2694
0
}
2695
2696
int hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt)
2697
0
{
2698
0
    int ret, save;
2699
0
    if (idx == NULL || fn == NULL) { errno = EINVAL; return -1; }
2700
0
    char *fnidx = (char*)calloc(1, strlen(fn) + 5);
2701
0
    if (fnidx == NULL) return -1;
2702
2703
0
    strcpy(fnidx, fn);
2704
0
    switch (fmt) {
2705
0
    case HTS_FMT_BAI: strcat(fnidx, ".bai"); break;
2706
0
    case HTS_FMT_CSI: strcat(fnidx, ".csi"); break;
2707
0
    case HTS_FMT_TBI: strcat(fnidx, ".tbi"); break;
2708
0
    default: abort();
2709
0
    }
2710
2711
0
    ret = hts_idx_save_as(idx, fn, fnidx, fmt);
2712
0
    save = errno;
2713
0
    free(fnidx);
2714
0
    errno = save;
2715
0
    return ret;
2716
0
}
2717
2718
int hts_idx_save_as(const hts_idx_t *idx, const char *fn, const char *fnidx, int fmt)
2719
0
{
2720
0
    BGZF *fp;
2721
2722
0
    #define check(ret) if ((ret) < 0) goto fail
2723
2724
0
    if (fnidx == NULL) return hts_idx_save(idx, fn, fmt);
2725
2726
0
    fp = bgzf_open(fnidx, (fmt == HTS_FMT_BAI)? "wu" : "w");
2727
0
    if (fp == NULL) return -1;
2728
2729
0
    if (fmt == HTS_FMT_CSI) {
2730
0
        check(bgzf_write(fp, "CSI\1", 4));
2731
0
        check(idx_write_int32(fp, idx->min_shift));
2732
0
        check(idx_write_int32(fp, idx->n_lvls));
2733
0
        check(idx_write_uint32(fp, idx->l_meta));
2734
0
        if (idx->l_meta) check(bgzf_write(fp, idx->meta, idx->l_meta));
2735
0
    } else if (fmt == HTS_FMT_TBI) {
2736
0
        check(bgzf_write(fp, "TBI\1", 4));
2737
0
    } else if (fmt == HTS_FMT_BAI) {
2738
0
        check(bgzf_write(fp, "BAI\1", 4));
2739
0
    } else abort();
2740
2741
0
    check(idx_save_core(idx, fp, fmt));
2742
2743
0
    return bgzf_close(fp);
2744
0
    #undef check
2745
2746
0
fail:
2747
0
    bgzf_close(fp);
2748
0
    return -1;
2749
0
}
2750
2751
static int idx_read_core(hts_idx_t *idx, BGZF *fp, int fmt)
2752
0
{
2753
0
    int32_t i, n, is_be;
2754
0
    is_be = ed_is_big();
2755
0
    if (idx == NULL) return -4;
2756
0
    for (i = 0; i < idx->n; ++i) {
2757
0
        bidx_t *h;
2758
0
        lidx_t *l = &idx->lidx[i];
2759
0
        uint32_t key;
2760
0
        int j, absent;
2761
0
        bins_t *p;
2762
0
        h = idx->bidx[i] = kh_init(bin);
2763
0
        if (bgzf_read(fp, &n, 4) != 4) return -1;
2764
0
        if (is_be) ed_swap_4p(&n);
2765
0
        if (n < 0) return -3;
2766
0
        for (j = 0; j < n; ++j) {
2767
0
            khint_t k;
2768
0
            if (bgzf_read(fp, &key, 4) != 4) return -1;
2769
0
            if (is_be) ed_swap_4p(&key);
2770
0
            k = kh_put(bin, h, key, &absent);
2771
0
            if (absent <  0) return -2; // No memory
2772
0
            if (absent == 0) return -3; // Duplicate bin number
2773
0
            p = &kh_val(h, k);
2774
0
            if (fmt == HTS_FMT_CSI) {
2775
0
                if (bgzf_read(fp, &p->loff, 8) != 8) return -1;
2776
0
                if (is_be) ed_swap_8p(&p->loff);
2777
0
            } else p->loff = 0;
2778
0
            if (bgzf_read(fp, &p->n, 4) != 4) return -1;
2779
0
            if (is_be) ed_swap_4p(&p->n);
2780
0
            if (p->n < 0) return -3;
2781
0
            if ((size_t) p->n > SIZE_MAX / sizeof(hts_pair64_t)) return -2;
2782
0
            p->m = p->n;
2783
0
            p->list = (hts_pair64_t*)malloc(p->m * sizeof(hts_pair64_t));
2784
0
            if (p->list == NULL) return -2;
2785
0
            if (bgzf_read(fp, p->list, ((size_t) p->n)<<4) != ((size_t) p->n)<<4) return -1;
2786
0
            if (is_be) swap_bins(p);
2787
0
        }
2788
0
        if (fmt != HTS_FMT_CSI) { // load linear index
2789
0
            int j, k;
2790
0
            uint32_t x;
2791
0
            if (bgzf_read(fp, &x, 4) != 4) return -1;
2792
0
            if (is_be) ed_swap_4p(&x);
2793
0
            l->n = x;
2794
0
            if (l->n < 0) return -3;
2795
0
            if ((size_t) l->n > SIZE_MAX / sizeof(uint64_t)) return -2;
2796
0
            l->m = l->n;
2797
0
            l->offset = (uint64_t*)malloc(l->n * sizeof(uint64_t));
2798
0
            if (l->offset == NULL) return -2;
2799
0
            if (bgzf_read(fp, l->offset, l->n << 3) != l->n << 3) return -1;
2800
0
            if (is_be) for (j = 0; j < l->n; ++j) ed_swap_8p(&l->offset[j]);
2801
0
            for (k = j = 0; j < l->n && l->offset[j] == 0; k = ++j); // stop at the first non-zero entry
2802
0
            for (j = l->n-1; j > k; j--) // fill missing values; may happen given older samtools and tabix
2803
0
                if (l->offset[j-1] == 0) l->offset[j-1] = l->offset[j];
2804
0
            update_loff(idx, i, 0);
2805
0
        }
2806
0
    }
2807
0
    if (bgzf_read(fp, &idx->n_no_coor, 8) != 8) idx->n_no_coor = 0;
2808
0
    if (is_be) ed_swap_8p(&idx->n_no_coor);
2809
#ifdef DEBUG_INDEX
2810
    idx_dump(idx);
2811
#endif
2812
2813
0
    return 0;
2814
0
}
2815
2816
static hts_idx_t *idx_read(const char *fn)
2817
0
{
2818
0
    uint8_t magic[4];
2819
0
    int i, is_be;
2820
0
    hts_idx_t *idx = NULL;
2821
0
    uint8_t *meta = NULL;
2822
0
    BGZF *fp = bgzf_open(fn, "r");
2823
0
    if (fp == NULL) return NULL;
2824
0
    is_be = ed_is_big();
2825
0
    if (bgzf_read(fp, magic, 4) != 4) goto fail;
2826
2827
0
    if (memcmp(magic, "CSI\1", 4) == 0) {
2828
0
        uint32_t x[3], n;
2829
0
        if (bgzf_read(fp, x, 12) != 12) goto fail;
2830
0
        if (is_be) for (i = 0; i < 3; ++i) ed_swap_4p(&x[i]);
2831
0
        if (x[2]) {
2832
0
            if (SIZE_MAX - x[2] < 1) goto fail; // Prevent possible overflow
2833
0
            if ((meta = (uint8_t*)malloc((size_t) x[2] + 1)) == NULL) goto fail;
2834
0
            if (bgzf_read(fp, meta, x[2]) != x[2]) goto fail;
2835
            // Prevent possible strlen past the end in tbx_index_load2
2836
0
            meta[x[2]] = '\0';
2837
0
        }
2838
0
        if (bgzf_read(fp, &n, 4) != 4) goto fail;
2839
0
        if (is_be) ed_swap_4p(&n);
2840
0
        if (n > INT32_MAX) goto fail;
2841
0
        if ((idx = hts_idx_init(n, HTS_FMT_CSI, 0, x[0], x[1])) == NULL) goto fail;
2842
0
        idx->l_meta = x[2];
2843
0
        idx->meta = meta;
2844
0
        meta = NULL;
2845
0
        if (idx_read_core(idx, fp, HTS_FMT_CSI) < 0) goto fail;
2846
0
    }
2847
0
    else if (memcmp(magic, "TBI\1", 4) == 0) {
2848
0
        uint8_t x[8 * 4];
2849
0
        uint32_t n;
2850
        // Read file header
2851
0
        if (bgzf_read(fp, x, sizeof(x)) != sizeof(x)) goto fail;
2852
0
        n = le_to_u32(&x[0]); // location of n_ref
2853
0
        if (n > INT32_MAX) goto fail;
2854
0
        if ((idx = hts_idx_init(n, HTS_FMT_TBI, 0, 14, 5)) == NULL) goto fail;
2855
0
        n = le_to_u32(&x[7*4]); // location of l_nm
2856
0
        if (n > UINT32_MAX - 29) goto fail; // Prevent possible overflow
2857
0
        idx->l_meta = 28 + n;
2858
0
        if ((idx->meta = (uint8_t*)malloc(idx->l_meta + 1)) == NULL) goto fail;
2859
        // copy format, col_seq, col_beg, col_end, meta, skip, l_nm
2860
        // N.B. left in little-endian byte order.
2861
0
        memcpy(idx->meta, &x[1*4], 28);
2862
        // Read in sequence names.
2863
0
        if (bgzf_read(fp, idx->meta + 28, n) != n) goto fail;
2864
        // Prevent possible strlen past the end in tbx_index_load2
2865
0
        idx->meta[idx->l_meta] = '\0';
2866
0
        if (idx_read_core(idx, fp, HTS_FMT_TBI) < 0) goto fail;
2867
0
    }
2868
0
    else if (memcmp(magic, "BAI\1", 4) == 0) {
2869
0
        uint32_t n;
2870
0
        if (bgzf_read(fp, &n, 4) != 4) goto fail;
2871
0
        if (is_be) ed_swap_4p(&n);
2872
0
        if (n > INT32_MAX) goto fail;
2873
0
        if ((idx = hts_idx_init(n, HTS_FMT_BAI, 0, 14, 5)) == NULL) goto fail;
2874
0
        if (idx_read_core(idx, fp, HTS_FMT_BAI) < 0) goto fail;
2875
0
    }
2876
0
    else { errno = EINVAL; goto fail; }
2877
2878
0
    bgzf_close(fp);
2879
0
    return idx;
2880
2881
0
fail:
2882
0
    bgzf_close(fp);
2883
0
    hts_idx_destroy(idx);
2884
0
    free(meta);
2885
0
    return NULL;
2886
0
}
2887
2888
int hts_idx_set_meta(hts_idx_t *idx, uint32_t l_meta, uint8_t *meta,
2889
                      int is_copy)
2890
0
{
2891
0
    uint8_t *new_meta = meta;
2892
0
    if (is_copy) {
2893
0
        size_t l = l_meta;
2894
0
        if (l > SIZE_MAX - 1) {
2895
0
            errno = ENOMEM;
2896
0
            return -1;
2897
0
        }
2898
0
        new_meta = malloc(l + 1);
2899
0
        if (!new_meta) return -1;
2900
0
        memcpy(new_meta, meta, l);
2901
        // Prevent possible strlen past the end in tbx_index_load2
2902
0
        new_meta[l] = '\0';
2903
0
    }
2904
0
    if (idx->meta) free(idx->meta);
2905
0
    idx->l_meta = l_meta;
2906
0
    idx->meta = new_meta;
2907
0
    return 0;
2908
0
}
2909
2910
uint8_t *hts_idx_get_meta(hts_idx_t *idx, uint32_t *l_meta)
2911
0
{
2912
0
    *l_meta = idx->l_meta;
2913
0
    return idx->meta;
2914
0
}
2915
2916
const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr)
2917
0
{
2918
0
    if ( !idx || !idx->n )
2919
0
    {
2920
0
        *n = 0;
2921
0
        return NULL;
2922
0
    }
2923
2924
0
    int tid = 0, i;
2925
0
    const char **names = (const char**) calloc(idx->n,sizeof(const char*));
2926
0
    for (i=0; i<idx->n; i++)
2927
0
    {
2928
0
        bidx_t *bidx = idx->bidx[i];
2929
0
        if ( !bidx ) continue;
2930
0
        names[tid++] = getid(hdr,i);
2931
0
    }
2932
0
    *n = tid;
2933
0
    return names;
2934
0
}
2935
2936
0
int hts_idx_nseq(const hts_idx_t *idx) {
2937
0
    if (!idx) return -1;
2938
0
    return idx->n;
2939
0
}
2940
2941
int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped, uint64_t* unmapped)
2942
0
{
2943
0
    if (!idx) return -1;
2944
0
    if ( idx->fmt == HTS_FMT_CRAI ) {
2945
0
        *mapped = 0; *unmapped = 0;
2946
0
        return -1;
2947
0
    }
2948
2949
0
    bidx_t *h = idx->bidx[tid];
2950
0
    if (!h) return -1;
2951
0
    khint_t k = kh_get(bin, h, META_BIN(idx));
2952
0
    if (k != kh_end(h)) {
2953
0
        *mapped = kh_val(h, k).list[1].u;
2954
0
        *unmapped = kh_val(h, k).list[1].v;
2955
0
        return 0;
2956
0
    } else {
2957
0
        *mapped = 0; *unmapped = 0;
2958
0
        return -1;
2959
0
    }
2960
0
}
2961
2962
uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx)
2963
0
{
2964
0
    if (idx->fmt == HTS_FMT_CRAI) return 0;
2965
0
    return idx->n_no_coor;
2966
0
}
2967
2968
/****************
2969
 *** Iterator ***
2970
 ****************/
2971
2972
// Note: even with 32-bit hts_pos_t, end needs to be 64-bit here due to 1LL<<s.
2973
static inline int reg2bins_narrow(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls, bidx_t *bidx)
2974
0
{
2975
0
    int l, t, s = min_shift + (n_lvls<<1) + n_lvls;
2976
0
    for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
2977
0
        hts_pos_t b, e;
2978
0
        int i;
2979
0
        b = t + (beg>>s); e = t + (end>>s);
2980
0
        for (i = b; i <= e; ++i) {
2981
0
            if (kh_get(bin, bidx, i) != kh_end(bidx)) {
2982
0
                assert(itr->bins.n < itr->bins.m);
2983
0
                itr->bins.a[itr->bins.n++] = i;
2984
0
            }
2985
0
        }
2986
0
    }
2987
0
    return itr->bins.n;
2988
0
}
2989
2990
static inline int reg2bins_wide(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls, bidx_t *bidx)
2991
0
{
2992
0
    khint_t i;
2993
0
    hts_pos_t max_shift = 3 * n_lvls + min_shift;
2994
0
    --end;
2995
0
    if (beg < 0) beg = 0;
2996
0
    for (i = kh_begin(bidx); i != kh_end(bidx); i++) {
2997
0
        if (!kh_exist(bidx, i)) continue;
2998
0
        hts_pos_t bin = (hts_pos_t) kh_key(bidx, i);
2999
0
        int level = hts_bin_level(bin);
3000
0
        if (level > n_lvls) continue; // Dodgy index?
3001
0
        hts_pos_t first = hts_bin_first(level);
3002
0
        hts_pos_t beg_at_level = first + (beg >> (max_shift - 3 * level));
3003
0
        hts_pos_t end_at_level = first + (end >> (max_shift - 3 * level));
3004
0
        if (beg_at_level <= bin && bin <= end_at_level) {
3005
0
            assert(itr->bins.n < itr->bins.m);
3006
0
            itr->bins.a[itr->bins.n++] = bin;
3007
0
        }
3008
0
    }
3009
0
    return itr->bins.n;
3010
0
}
3011
3012
static inline int reg2bins(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls, bidx_t *bidx)
3013
0
{
3014
0
    int l, t, s = min_shift + (n_lvls<<1) + n_lvls;
3015
0
    size_t reg_bin_count = 0, hash_bin_count = kh_n_buckets(bidx), max_bins;
3016
0
    hts_pos_t end1;
3017
0
    if (end >= 1LL<<s) end = 1LL<<s;
3018
0
    if (beg >= end) return 0;
3019
0
    end1 = end - 1;
3020
3021
    // Count bins to see if it's faster to iterate through the hash table
3022
    // or the set of bins covering the region
3023
0
    for (l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
3024
0
        reg_bin_count += (end1 >> s) - (beg >> s) + 1;
3025
0
    }
3026
0
    max_bins = reg_bin_count < kh_size(bidx) ? reg_bin_count : kh_size(bidx);
3027
0
    if (itr->bins.m - itr->bins.n < max_bins) {
3028
        // Worst-case memory usage.  May be wasteful on very sparse
3029
        // data, but the bin list usually won't be too big anyway.
3030
0
        size_t new_m = max_bins + itr->bins.n;
3031
0
        if (new_m > INT_MAX || new_m > SIZE_MAX / sizeof(int)) {
3032
0
            errno = ENOMEM;
3033
0
            return -1;
3034
0
        }
3035
0
        int *new_a = realloc(itr->bins.a, new_m * sizeof(*new_a));
3036
0
        if (!new_a) return -1;
3037
0
        itr->bins.a = new_a;
3038
0
        itr->bins.m = new_m;
3039
0
    }
3040
0
    if (reg_bin_count < hash_bin_count) {
3041
0
        return reg2bins_narrow(beg, end, itr, min_shift, n_lvls, bidx);
3042
0
    } else {
3043
0
        return reg2bins_wide(beg, end, itr, min_shift, n_lvls, bidx);
3044
0
    }
3045
0
}
3046
3047
static inline int add_to_interval(hts_itr_t *iter, bins_t *bin,
3048
                                  int tid, uint32_t interval,
3049
                                  uint64_t min_off, uint64_t max_off)
3050
0
{
3051
0
    hts_pair64_max_t *off;
3052
0
    int j;
3053
3054
0
    if (!bin->n)
3055
0
        return 0;
3056
0
    off = realloc(iter->off, (iter->n_off + bin->n) * sizeof(*off));
3057
0
    if (!off)
3058
0
        return -2;
3059
3060
0
    iter->off = off;
3061
0
    for (j = 0; j < bin->n; ++j) {
3062
0
        if (bin->list[j].v > min_off && bin->list[j].u < max_off) {
3063
0
            iter->off[iter->n_off].u = min_off > bin->list[j].u
3064
0
                ? min_off : bin->list[j].u;
3065
0
            iter->off[iter->n_off].v = max_off < bin->list[j].v
3066
0
                ? max_off : bin->list[j].v;
3067
            // hts_pair64_max_t::max is now used to link
3068
            // file offsets to region list entries.
3069
            // The iterator can use this to decide if it
3070
            // can skip some file regions.
3071
0
            iter->off[iter->n_off].max = ((uint64_t) tid << 32) | interval;
3072
0
            iter->n_off++;
3073
0
        }
3074
0
    }
3075
0
    return 0;
3076
0
}
3077
3078
static inline int reg2intervals_narrow(hts_itr_t *iter, const bidx_t *bidx,
3079
                                       int tid, int64_t beg, int64_t end,
3080
                                       uint32_t interval,
3081
                                       uint64_t min_off, uint64_t max_off,
3082
                                       int min_shift, int n_lvls)
3083
0
{
3084
0
    int l, t, s = min_shift + n_lvls * 3;
3085
0
    hts_pos_t b, e, i;
3086
3087
0
    for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
3088
0
        b = t + (beg>>s); e = t + (end>>s);
3089
0
        for (i = b; i <= e; ++i) {
3090
0
            khint_t k = kh_get(bin, bidx, i);
3091
0
            if (k != kh_end(bidx)) {
3092
0
                bins_t *bin = &kh_value(bidx, k);
3093
0
                int res = add_to_interval(iter, bin, tid, interval, min_off, max_off);
3094
0
                if (res < 0)
3095
0
                    return res;
3096
0
            }
3097
0
        }
3098
0
    }
3099
0
    return 0;
3100
0
}
3101
3102
static inline int reg2intervals_wide(hts_itr_t *iter, const bidx_t *bidx,
3103
                                       int tid, int64_t beg, int64_t end,
3104
                                       uint32_t interval,
3105
                                       uint64_t min_off, uint64_t max_off,
3106
                                       int min_shift, int n_lvls)
3107
0
{
3108
0
    khint_t i;
3109
0
    hts_pos_t max_shift = 3 * n_lvls + min_shift;
3110
0
    --end;
3111
0
    if (beg < 0) beg = 0;
3112
0
    for (i = kh_begin(bidx); i != kh_end(bidx); i++) {
3113
0
        if (!kh_exist(bidx, i)) continue;
3114
0
        hts_pos_t bin = (hts_pos_t) kh_key(bidx, i);
3115
0
        int level = hts_bin_level(bin);
3116
0
        if (level > n_lvls) continue; // Dodgy index?
3117
0
        hts_pos_t first = hts_bin_first(level);
3118
0
        hts_pos_t beg_at_level = first + (beg >> (max_shift - 3 * level));
3119
0
        hts_pos_t end_at_level = first + (end >> (max_shift - 3 * level));
3120
0
        if (beg_at_level <= bin && bin <= end_at_level) {
3121
0
            bins_t *bin = &kh_value(bidx, i);
3122
0
            int res = add_to_interval(iter, bin, tid, interval, min_off, max_off);
3123
0
            if (res < 0)
3124
0
                return res;
3125
0
        }
3126
0
    }
3127
0
    return 0;
3128
0
}
3129
3130
static inline int reg2intervals(hts_itr_t *iter, const hts_idx_t *idx, int tid, int64_t beg, int64_t end, uint32_t interval, uint64_t min_off, uint64_t max_off, int min_shift, int n_lvls)
3131
0
{
3132
0
    int l, t, s;
3133
0
    int i, j;
3134
0
    hts_pos_t end1;
3135
0
    bidx_t *bidx;
3136
0
    int start_n_off;
3137
0
    size_t reg_bin_count = 0, hash_bin_count;
3138
0
    int res;
3139
3140
0
    if (!iter || !idx || (bidx = idx->bidx[tid]) == NULL || beg >= end)
3141
0
        return -1;
3142
3143
0
    hash_bin_count = kh_n_buckets(bidx);
3144
3145
0
    s = min_shift + (n_lvls<<1) + n_lvls;
3146
0
    if (end >= 1LL<<s)
3147
0
        end = 1LL<<s;
3148
3149
0
    end1 = end - 1;
3150
    // Count bins to see if it's faster to iterate through the hash table
3151
    // or the set of bins covering the region
3152
0
    for (l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
3153
0
        reg_bin_count += (end1 >> s) - (beg >> s) + 1;
3154
0
    }
3155
3156
0
    start_n_off = iter->n_off;
3157
3158
    // Populate iter->off with the intervals for this region
3159
0
    if (reg_bin_count < hash_bin_count) {
3160
0
        res = reg2intervals_narrow(iter, bidx, tid, beg, end, interval,
3161
0
                                   min_off, max_off, min_shift, n_lvls);
3162
0
    } else {
3163
0
        res = reg2intervals_wide(iter, bidx, tid, beg, end, interval,
3164
0
                                 min_off, max_off, min_shift, n_lvls);
3165
0
    }
3166
0
    if (res < 0)
3167
0
        return res;
3168
3169
0
    if (iter->n_off - start_n_off > 1) {
3170
0
        ks_introsort(_off_max, iter->n_off - start_n_off, iter->off + start_n_off);
3171
0
        for (i = start_n_off, j = start_n_off + 1; j < iter->n_off; j++) {
3172
0
            if (iter->off[i].v >= iter->off[j].u) {
3173
0
                if (iter->off[i].v < iter->off[j].v)
3174
0
                    iter->off[i].v = iter->off[j].v;
3175
0
            } else {
3176
0
                i++;
3177
0
                if (i < j)
3178
0
                    iter->off[i] = iter->off[j];
3179
0
            }
3180
0
        }
3181
0
        iter->n_off = i + 1;
3182
0
    }
3183
3184
0
    return iter->n_off;
3185
0
}
3186
3187
0
static int compare_regions(const void *r1, const void *r2) {
3188
0
    hts_reglist_t *reg1 = (hts_reglist_t *)r1;
3189
0
    hts_reglist_t *reg2 = (hts_reglist_t *)r2;
3190
3191
0
    if (reg1->tid < 0 && reg2->tid >= 0)
3192
0
        return 1;
3193
0
    else if (reg1->tid >= 0 && reg2->tid < 0)
3194
0
        return -1;
3195
0
    else
3196
0
        return reg1->tid - reg2->tid;
3197
0
}
3198
3199
0
uint64_t hts_itr_off(const hts_idx_t* idx, int tid) {
3200
3201
0
    int i;
3202
0
    bidx_t* bidx;
3203
0
    uint64_t off0 = (uint64_t) -1;
3204
0
    khint_t k;
3205
0
    switch (tid) {
3206
0
    case HTS_IDX_START:
3207
        // Find the smallest offset, note that sequence ids may not be ordered sequentially
3208
0
        for (i = 0; i < idx->n; i++) {
3209
0
            bidx = idx->bidx[i];
3210
0
            k = kh_get(bin, bidx, META_BIN(idx));
3211
0
            if (k == kh_end(bidx))
3212
0
                continue;
3213
3214
0
            if (off0 > kh_val(bidx, k).list[0].u)
3215
0
                off0 = kh_val(bidx, k).list[0].u;
3216
0
        }
3217
0
        if (off0 == (uint64_t) -1 && idx->n_no_coor)
3218
0
            off0 = 0;
3219
        // only no-coor reads in this bam
3220
0
        break;
3221
0
    case HTS_IDX_NOCOOR:
3222
        /* No-coor reads sort after all of the mapped reads.  The position
3223
           is not stored in the index itself, so need to find the end
3224
           offset for the last mapped read.  A loop is needed here in
3225
           case references at the end of the file have no mapped reads,
3226
           or sequence ids are not ordered sequentially.
3227
           See issue samtools#568 and commits b2aab8, 60c22d and cc207d. */
3228
0
        for (i = 0; i < idx->n; i++) {
3229
0
            bidx = idx->bidx[i];
3230
0
            k = kh_get(bin, bidx, META_BIN(idx));
3231
0
            if (k != kh_end(bidx)) {
3232
0
                if (off0 == (uint64_t) -1 || off0 < kh_val(bidx, k).list[0].v) {
3233
0
                    off0 = kh_val(bidx, k).list[0].v;
3234
0
                }
3235
0
            }
3236
0
        }
3237
0
        if (off0 == (uint64_t) -1 && idx->n_no_coor)
3238
0
            off0 = 0;
3239
        // only no-coor reads in this bam
3240
0
        break;
3241
0
    case HTS_IDX_REST:
3242
0
        off0 = 0;
3243
0
        break;
3244
0
    case HTS_IDX_NONE:
3245
0
        off0 = 0;
3246
0
        break;
3247
0
    }
3248
3249
0
    return off0;
3250
0
}
3251
3252
hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec)
3253
0
{
3254
0
    int i, n_off, l, bin;
3255
0
    hts_pair64_max_t *off;
3256
0
    khint_t k;
3257
0
    bidx_t *bidx;
3258
0
    uint64_t min_off, max_off;
3259
0
    hts_itr_t *iter;
3260
0
    uint32_t unmapped = 0, rel_off;
3261
3262
    // It's possible to call this function with NULL idx iff
3263
    // tid is one of the special values HTS_IDX_REST or HTS_IDX_NONE
3264
0
    if (!idx && !(tid == HTS_IDX_REST || tid == HTS_IDX_NONE)) {
3265
0
        errno = EINVAL;
3266
0
        return NULL;
3267
0
    }
3268
3269
0
    iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t));
3270
0
    if (iter) {
3271
0
        if (tid < 0) {
3272
0
            uint64_t off = hts_itr_off(idx, tid);
3273
0
            if (off != (uint64_t) -1) {
3274
0
                iter->read_rest = 1;
3275
0
                iter->curr_off = off;
3276
0
                iter->readrec = readrec;
3277
0
                if (tid == HTS_IDX_NONE)
3278
0
                    iter->finished = 1;
3279
0
            } else {
3280
0
                free(iter);
3281
0
                iter = NULL;
3282
0
            }
3283
0
        } else if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL) {
3284
0
            iter->finished = 1;
3285
0
        } else {
3286
0
            if (beg < 0) beg = 0;
3287
0
            if (end < beg) {
3288
0
              free(iter);
3289
0
              return NULL;
3290
0
            }
3291
3292
0
            k = kh_get(bin, bidx, META_BIN(idx));
3293
0
            if (k != kh_end(bidx))
3294
0
                unmapped = kh_val(bidx, k).list[1].v;
3295
0
            else
3296
0
                unmapped = 1;
3297
3298
0
            iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
3299
0
            iter->readrec = readrec;
3300
3301
0
            if ( !kh_size(bidx) ) { iter->finished = 1; return iter; }
3302
3303
0
            rel_off = beg>>idx->min_shift;
3304
            // compute min_off
3305
0
            bin = hts_bin_first(idx->n_lvls) + rel_off;
3306
0
            do {
3307
0
                int first;
3308
0
                k = kh_get(bin, bidx, bin);
3309
0
                if (k != kh_end(bidx)) break;
3310
0
                first = (hts_bin_parent(bin)<<3) + 1;
3311
0
                if (bin > first) --bin;
3312
0
                else bin = hts_bin_parent(bin);
3313
0
            } while (bin);
3314
0
            if (bin == 0) k = kh_get(bin, bidx, bin);
3315
0
            min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0;
3316
            // min_off can be calculated more accurately if the
3317
            // linear index is available
3318
0
            if (idx->lidx[tid].offset
3319
0
                && rel_off < idx->lidx[tid].n) {
3320
0
                if (min_off < idx->lidx[tid].offset[rel_off])
3321
0
                    min_off = idx->lidx[tid].offset[rel_off];
3322
0
                if (unmapped) {
3323
                    // unmapped reads are not covered by the linear index,
3324
                    // so search backwards for a smaller offset
3325
0
                    int tmp_off;
3326
0
                    for (tmp_off = rel_off-1; tmp_off >= 0; tmp_off--) {
3327
0
                        if (idx->lidx[tid].offset[tmp_off] < min_off) {
3328
0
                            min_off = idx->lidx[tid].offset[tmp_off];
3329
0
                            break;
3330
0
                        }
3331
0
                    }
3332
                    // if the search went too far back or no satisfactory entry
3333
                    // was found, revert to the bin index loff value
3334
0
                    if (k != kh_end(bidx) && (min_off < kh_val(bidx, k).loff || tmp_off < 0))
3335
0
                        min_off = kh_val(bidx, k).loff;
3336
0
                }
3337
0
            } else if (unmapped) { //CSI index
3338
0
                if (k != kh_end(bidx))
3339
0
                    min_off = kh_val(bidx, k).loff;
3340
0
            }
3341
3342
            // compute max_off: a virtual offset from a bin to the right of end
3343
            // First check if end lies within the range of the index (it won't
3344
            // if it's HTS_POS_MAX)
3345
0
            if (end < 1LL << (idx->min_shift + 3 * idx->n_lvls)) {
3346
0
                bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1;
3347
0
                if (bin >= idx->n_bins) bin = 0;
3348
0
                while (1) {
3349
                    // search for an extant bin by moving right, but moving up to the
3350
                    // parent whenever we get to a first child (which also covers falling
3351
                    // off the RHS, which wraps around and immediately goes up to bin 0)
3352
0
                    while (bin % 8 == 1) bin = hts_bin_parent(bin);
3353
0
                    if (bin == 0) { max_off = UINT64_MAX; break; }
3354
0
                    k = kh_get(bin, bidx, bin);
3355
0
                    if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) { max_off = kh_val(bidx, k).list[0].u; break; }
3356
0
                    bin++;
3357
0
                }
3358
0
            } else {
3359
                // Searching to end of reference
3360
0
                max_off = UINT64_MAX;
3361
0
            }
3362
3363
            // retrieve bins
3364
0
            if (reg2bins(beg, end, iter, idx->min_shift, idx->n_lvls, bidx) < 0) {
3365
0
                hts_itr_destroy(iter);
3366
0
                return NULL;
3367
0
            }
3368
3369
0
            for (i = n_off = 0; i < iter->bins.n; ++i)
3370
0
                if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx))
3371
0
                    n_off += kh_value(bidx, k).n;
3372
0
            if (n_off == 0) {
3373
                // No overlapping bins means the iterator has already finished.
3374
0
                iter->finished = 1;
3375
0
                return iter;
3376
0
            }
3377
0
            off = calloc(n_off, sizeof(*off));
3378
0
            for (i = n_off = 0; i < iter->bins.n; ++i) {
3379
0
                if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx)) {
3380
0
                    int j;
3381
0
                    bins_t *p = &kh_value(bidx, k);
3382
0
                    for (j = 0; j < p->n; ++j)
3383
0
                        if (p->list[j].v > min_off && p->list[j].u < max_off) {
3384
0
                            off[n_off].u = min_off > p->list[j].u
3385
0
                                ? min_off : p->list[j].u;
3386
0
                            off[n_off].v = max_off < p->list[j].v
3387
0
                                ? max_off : p->list[j].v;
3388
                            // hts_pair64_max_t::max is now used to link
3389
                            // file offsets to region list entries.
3390
                            // The iterator can use this to decide if it
3391
                            // can skip some file regions.
3392
0
                            off[n_off].max = ((uint64_t) tid << 32) | j;
3393
0
                            n_off++;
3394
0
                        }
3395
0
                }
3396
0
            }
3397
3398
0
            if (n_off == 0) {
3399
0
                free(off);
3400
0
                iter->finished = 1;
3401
0
                return iter;
3402
0
            }
3403
0
            ks_introsort(_off_max, n_off, off);
3404
            // resolve completely contained adjacent blocks
3405
0
            for (i = 1, l = 0; i < n_off; ++i)
3406
0
                if (off[l].v < off[i].v) off[++l] = off[i];
3407
0
            n_off = l + 1;
3408
            // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
3409
0
            for (i = 1; i < n_off; ++i)
3410
0
                if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
3411
            // merge adjacent blocks
3412
0
            for (i = 1, l = 0; i < n_off; ++i) {
3413
0
                if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
3414
0
                else off[++l] = off[i];
3415
0
            }
3416
0
            n_off = l + 1;
3417
0
            iter->n_off = n_off; iter->off = off;
3418
0
        }
3419
0
    }
3420
3421
0
    return iter;
3422
0
}
3423
3424
int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter)
3425
0
{
3426
0
    int i, j, bin;
3427
0
    khint_t k;
3428
0
    bidx_t *bidx;
3429
0
    uint64_t min_off, max_off, t_off = (uint64_t)-1;
3430
0
    int tid;
3431
0
    hts_pos_t beg, end;
3432
0
    hts_reglist_t *curr_reg;
3433
0
    uint32_t unmapped = 0, rel_off;
3434
3435
0
    if (!idx || !iter || !iter->multi)
3436
0
        return -1;
3437
3438
0
    iter->i = -1;
3439
0
    for (i=0; i<iter->n_reg; i++) {
3440
3441
0
        curr_reg = &iter->reg_list[i];
3442
0
        tid = curr_reg->tid;
3443
3444
0
        if (tid < 0) {
3445
0
            t_off = hts_itr_off(idx, tid);
3446
0
            if (t_off != (uint64_t)-1) {
3447
0
                switch (tid) {
3448
0
                case HTS_IDX_NONE:
3449
0
                    iter->finished = 1;
3450
                    // fall through
3451
0
                case HTS_IDX_START:
3452
0
                case HTS_IDX_REST:
3453
0
                    iter->curr_off = t_off;
3454
0
                    iter->n_reg = 0;
3455
0
                    iter->reg_list = NULL;
3456
0
                    iter->read_rest = 1;
3457
0
                    return 0;
3458
0
                case HTS_IDX_NOCOOR:
3459
0
                    iter->nocoor = 1;
3460
0
                    iter->nocoor_off = t_off;
3461
0
                }
3462
0
            }
3463
0
        } else {
3464
0
            if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL || !kh_size(bidx))
3465
0
                continue;
3466
3467
0
            k = kh_get(bin, bidx, META_BIN(idx));
3468
0
            if (k != kh_end(bidx))
3469
0
                unmapped = kh_val(bidx, k).list[1].v;
3470
0
            else
3471
0
                unmapped = 1;
3472
3473
0
            for(j=0; j<curr_reg->count; j++) {
3474
0
                hts_pair32_t *curr_intv = &curr_reg->intervals[j];
3475
0
                if (curr_intv->end < curr_intv->beg)
3476
0
                    continue;
3477
3478
0
                beg = curr_intv->beg;
3479
0
                end = curr_intv->end;
3480
0
                rel_off = beg>>idx->min_shift;
3481
3482
                /* Compute 'min_off' by searching the lowest level bin containing 'beg'.
3483
                       If the computed bin is not in the index, try the next bin to the
3484
                       left, belonging to the same parent. If it is the first sibling bin,
3485
                       try the parent bin. */
3486
0
                bin = hts_bin_first(idx->n_lvls) + rel_off;
3487
0
                do {
3488
0
                    int first;
3489
0
                    k = kh_get(bin, bidx, bin);
3490
0
                    if (k != kh_end(bidx)) break;
3491
0
                    first = (hts_bin_parent(bin)<<3) + 1;
3492
0
                    if (bin > first) --bin;
3493
0
                    else bin = hts_bin_parent(bin);
3494
0
                } while (bin);
3495
0
                if (bin == 0)
3496
0
                    k = kh_get(bin, bidx, bin);
3497
0
                min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0;
3498
                // min_off can be calculated more accurately if the
3499
                // linear index is available
3500
0
                if (idx->lidx[tid].offset
3501
0
                    && rel_off < idx->lidx[tid].n) {
3502
0
                    if (min_off < idx->lidx[tid].offset[rel_off])
3503
0
                        min_off = idx->lidx[tid].offset[rel_off];
3504
0
                    if (unmapped) {
3505
0
                        int tmp_off;
3506
0
                        for (tmp_off = rel_off-1; tmp_off >= 0; tmp_off--) {
3507
0
                            if (idx->lidx[tid].offset[tmp_off] < min_off) {
3508
0
                                min_off = idx->lidx[tid].offset[tmp_off];
3509
0
                                break;
3510
0
                            }
3511
0
                        }
3512
3513
0
                        if (k != kh_end(bidx) && (min_off < kh_val(bidx, k).loff || tmp_off < 0))
3514
0
                            min_off = kh_val(bidx, k).loff;
3515
0
                    }
3516
0
                } else if (unmapped) { //CSI index
3517
0
                    if (k != kh_end(bidx))
3518
0
                        min_off = kh_val(bidx, k).loff;
3519
0
                }
3520
3521
                // compute max_off: a virtual offset from a bin to the right of end
3522
                // First check if end lies within the range of the index (it
3523
                // won't if it's HTS_POS_MAX)
3524
0
                if (end < 1LL << (idx->min_shift + 3 * idx->n_lvls)) {
3525
0
                    bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1;
3526
0
                    if (bin >= idx->n_bins) bin = 0;
3527
0
                    while (1) {
3528
                        // search for an extant bin by moving right, but moving up to the
3529
                        // parent whenever we get to a first child (which also covers falling
3530
                        // off the RHS, which wraps around and immediately goes up to bin 0)
3531
0
                        while (bin % 8 == 1) bin = hts_bin_parent(bin);
3532
0
                        if (bin == 0) { max_off = UINT64_MAX; break; }
3533
0
                        k = kh_get(bin, bidx, bin);
3534
0
                        if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) {
3535
0
                            max_off = kh_val(bidx, k).list[0].u;
3536
0
                            break;
3537
0
                        }
3538
0
                        bin++;
3539
0
                    }
3540
0
                } else {
3541
                    // Searching to end of reference
3542
0
                    max_off = UINT64_MAX;
3543
0
                }
3544
3545
                //convert coordinates to file offsets
3546
0
                if (reg2intervals(iter, idx, tid, beg, end, j,
3547
0
                                  min_off, max_off,
3548
0
                                  idx->min_shift, idx->n_lvls) < 0) {
3549
0
                    return -1;
3550
0
                }
3551
0
            }
3552
0
        }
3553
0
    }
3554
3555
0
    if (iter->n_off > 1)
3556
0
        ks_introsort(_off_max, iter->n_off, iter->off);
3557
3558
0
    if(!iter->n_off && !iter->nocoor)
3559
0
        iter->finished = 1;
3560
3561
0
    return 0;
3562
0
}
3563
3564
int hts_itr_multi_cram(const hts_idx_t *idx, hts_itr_t *iter)
3565
0
{
3566
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
3567
0
    int tid, i, n_off = 0;
3568
0
    uint32_t j;
3569
0
    hts_pos_t beg, end;
3570
0
    hts_reglist_t *curr_reg;
3571
0
    hts_pair32_t *curr_intv;
3572
0
    hts_pair64_max_t *off = NULL, *tmp;
3573
0
    cram_index *e = NULL;
3574
3575
0
    if (!cidx || !iter || !iter->multi)
3576
0
        return -1;
3577
3578
0
    iter->is_cram = 1;
3579
0
    iter->read_rest = 0;
3580
0
    iter->off = NULL;
3581
0
    iter->n_off = 0;
3582
0
    iter->curr_off = 0;
3583
0
    iter->i = -1;
3584
3585
0
    for (i=0; i<iter->n_reg; i++) {
3586
3587
0
        curr_reg = &iter->reg_list[i];
3588
0
        tid = curr_reg->tid;
3589
3590
0
        if (tid >= 0) {
3591
0
            tmp = realloc(off, (n_off + curr_reg->count) * sizeof(*off));
3592
0
            if (!tmp)
3593
0
                goto err;
3594
0
            off = tmp;
3595
3596
0
            for (j=0; j < curr_reg->count; j++) {
3597
0
                curr_intv = &curr_reg->intervals[j];
3598
0
                if (curr_intv->end < curr_intv->beg)
3599
0
                    continue;
3600
3601
0
                beg = curr_intv->beg;
3602
0
                end = curr_intv->end;
3603
3604
/* First, fetch the container overlapping 'beg' and assign its file offset to u, then
3605
 * find the container overlapping 'end' and assign the relative end of the slice to v.
3606
 * The cram_ptell function will adjust with the container offset, which is not stored
3607
 * in the index.
3608
 */
3609
0
                e = cram_index_query(cidx->cram, tid, beg+1, NULL);
3610
0
                if (e) {
3611
0
                    off[n_off].u = e->offset;
3612
                    // hts_pair64_max_t::max is now used to link
3613
                    // file offsets to region list entries.
3614
                    // The iterator can use this to decide if it
3615
                    // can skip some file regions.
3616
0
                    off[n_off].max = ((uint64_t) tid << 32) | j;
3617
3618
0
                    if (end >= HTS_POS_MAX) {
3619
0
                       e = cram_index_last(cidx->cram, tid, NULL);
3620
0
                    } else {
3621
0
                       e = cram_index_query_last(cidx->cram, tid, end+1);
3622
0
                    }
3623
3624
0
                    if (e) {
3625
0
                        off[n_off++].v = e->e_next
3626
0
                            ? e->e_next->offset
3627
0
                            : e->offset + e->slice + e->len;
3628
0
                    } else {
3629
0
                        hts_log_warning("Could not set offset end for region %d:%"PRIhts_pos"-%"PRIhts_pos". Skipping", tid, beg, end);
3630
0
                    }
3631
0
                }
3632
0
            }
3633
0
        } else {
3634
0
            switch (tid) {
3635
0
                case HTS_IDX_NOCOOR:
3636
0
                    e = cram_index_query(cidx->cram, tid, 1, NULL);
3637
0
                    if (e) {
3638
0
                        iter->nocoor = 1;
3639
0
                        iter->nocoor_off = e->offset;
3640
0
                    } else {
3641
0
                        hts_log_warning("No index entry for NOCOOR region");
3642
0
                    }
3643
0
                    break;
3644
0
                case HTS_IDX_START:
3645
0
                    e = cram_index_query(cidx->cram, tid, 1, NULL);
3646
0
                    if (e) {
3647
0
                        iter->read_rest = 1;
3648
0
                        tmp = realloc(off, sizeof(*off));
3649
0
                        if (!tmp)
3650
0
                            goto err;
3651
0
                        off = tmp;
3652
0
                        off[0].u = e->offset;
3653
0
                        off[0].v = 0;
3654
0
                        n_off=1;
3655
0
                    } else {
3656
0
                        hts_log_warning("No index entries");
3657
0
                    }
3658
0
                    break;
3659
0
                case HTS_IDX_REST:
3660
0
                    break;
3661
0
                case HTS_IDX_NONE:
3662
0
                    iter->finished = 1;
3663
0
                    break;
3664
0
                default:
3665
0
                    hts_log_error("Query with tid=%d not implemented for CRAM files", tid);
3666
0
            }
3667
0
        }
3668
0
    }
3669
3670
0
    if (n_off) {
3671
0
        ks_introsort(_off_max, n_off, off);
3672
0
        iter->n_off = n_off; iter->off = off;
3673
0
    }
3674
3675
0
    if(!n_off && !iter->nocoor)
3676
0
        iter->finished = 1;
3677
3678
0
    return 0;
3679
3680
0
 err:
3681
0
    free(off);
3682
0
    return -1;
3683
0
}
3684
3685
void hts_itr_destroy(hts_itr_t *iter)
3686
0
{
3687
0
    if (iter) {
3688
0
        if (iter->multi) {
3689
0
            hts_reglist_free(iter->reg_list, iter->n_reg);
3690
0
        } else {
3691
0
            free(iter->bins.a);
3692
0
        }
3693
3694
0
        if (iter->off)
3695
0
            free(iter->off);
3696
0
        free(iter);
3697
0
    }
3698
0
}
3699
3700
static inline long long push_digit(long long i, char c)
3701
0
{
3702
    // ensure subtraction occurs first, avoiding overflow for >= MAX-48 or so
3703
0
    int digit = c - '0';
3704
0
    return 10 * i + digit;
3705
0
}
3706
3707
long long hts_parse_decimal(const char *str, char **strend, int flags)
3708
0
{
3709
0
    long long n = 0;
3710
0
    int digits = 0, decimals = 0, e = 0, lost = 0;
3711
0
    char sign = '+', esign = '+';
3712
0
    const char *s, *str_orig = str;
3713
3714
0
    while (isspace_c(*str)) str++;
3715
0
    s = str;
3716
3717
0
    if (*s == '+' || *s == '-') sign = *s++;
3718
0
    while (*s)
3719
0
        if (isdigit_c(*s)) digits++, n = push_digit(n, *s++);
3720
0
        else if (*s == ',' && (flags & HTS_PARSE_THOUSANDS_SEP)) s++;
3721
0
        else break;
3722
3723
0
    if (*s == '.') {
3724
0
        s++;
3725
0
        while (isdigit_c(*s)) decimals++, digits++, n = push_digit(n, *s++);
3726
0
    }
3727
3728
0
    switch (*s) {
3729
0
    case 'e': case 'E':
3730
0
        s++;
3731
0
        if (*s == '+' || *s == '-') esign = *s++;
3732
0
        while (isdigit_c(*s)) e = push_digit(e, *s++);
3733
0
        if (esign == '-') e = -e;
3734
0
        break;
3735
3736
0
    case 'k': case 'K': e += 3; s++; break;
3737
0
    case 'm': case 'M': e += 6; s++; break;
3738
0
    case 'g': case 'G': e += 9; s++; break;
3739
0
    }
3740
3741
0
    e -= decimals;
3742
0
    while (e > 0) n *= 10, e--;
3743
0
    while (e < 0) lost += n % 10, n /= 10, e++;
3744
3745
0
    if (lost > 0) {
3746
0
        hts_log_warning("Discarding fractional part of %.*s", (int)(s - str), str);
3747
0
    }
3748
3749
0
    if (strend) {
3750
        // Set to the original input str pointer if not valid number syntax
3751
0
        *strend = (digits > 0)? (char *)s : (char *)str_orig;
3752
0
    } else if (digits == 0) {
3753
0
        hts_log_warning("Invalid numeric value %.8s[truncated]", str);
3754
0
    } else if (*s) {
3755
0
        if ((flags & HTS_PARSE_THOUSANDS_SEP) || (!(flags & HTS_PARSE_THOUSANDS_SEP) && *s != ','))
3756
0
            hts_log_warning("Ignoring unknown characters after %.*s[%s]", (int)(s - str), str, s);
3757
0
    }
3758
3759
0
    return (sign == '+')? n : -n;
3760
0
}
3761
3762
0
static void *hts_memrchr(const void *s, int c, size_t n) {
3763
0
    size_t i;
3764
0
    unsigned char *u = (unsigned char *)s;
3765
0
    for (i = n; i > 0; i--) {
3766
0
        if (u[i-1] == c)
3767
0
            return u+i-1;
3768
0
    }
3769
3770
0
    return NULL;
3771
0
}
3772
3773
/*
3774
 * A variant of hts_parse_reg which is reference-id aware.  It uses
3775
 * the iterator name2id callbacks to validate the region tokenisation works.
3776
 *
3777
 * This is necessary due to GRCh38 HLA additions which have reference names
3778
 * like "HLA-DRB1*12:17".
3779
 *
3780
 * All parameters are mandatory.
3781
 *
3782
 * To work around ambiguous parsing issues, eg both "chr1" and "chr1:100-200"
3783
 * are reference names, we may quote using curly braces.
3784
 * Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example.
3785
 *
3786
 * Flags are used to control how parsing works, and can be one of the below.
3787
 *
3788
 * HTS_PARSE_LIST:
3789
 *     If present, the region is assmed to be a comma separated list and
3790
 *     position parsing will not contain commas (this implicitly
3791
 *     clears HTS_PARSE_THOUSANDS_SEP in the call to hts_parse_decimal).
3792
 *     On success the return pointer will be the start of the next region, ie
3793
 *     the character after the comma.  (If *ret != '\0' then the caller can
3794
 *     assume another region is present in the list.)
3795
 *
3796
 *     If not set then positions may contain commas.  In this case the return
3797
 *     value should point to the end of the string, or NULL on failure.
3798
 *
3799
 * HTS_PARSE_ONE_COORD:
3800
 *     If present, X:100 is treated as the single base pair region X:100-100.
3801
 *     In this case X:-100 is shorthand for X:1-100 and X:100- is X:100-<end>.
3802
 *     (This is the standard bcftools region convention.)
3803
 *
3804
 *     When not set X:100 is considered to be X:100-<end> where <end> is
3805
 *     the end of chromosome X (set to HTS_POS_MAX here).  X:100- and X:-100
3806
 *     are invalid.
3807
 *     (This is the standard samtools region convention.)
3808
 *
3809
 * Note the supplied string expects 1 based inclusive coordinates, but the
3810
 * returned coordinates start from 0 and are half open, so pos0 is valid
3811
 * for use in e.g. "for (pos0 = beg; pos0 < end; pos0++) {...}"
3812
 *
3813
 * On success a pointer to the byte after the end of the entire region
3814
 *            specifier is returned (plus any trailing comma), and tid,
3815
 *            beg & end will be set.
3816
 * On failure NULL is returned.
3817
 */
3818
const char *hts_parse_region(const char *s, int *tid, hts_pos_t *beg,
3819
                             hts_pos_t *end, hts_name2id_f getid, void *hdr,
3820
                             int flags)
3821
0
{
3822
0
    if (!s || !tid || !beg || !end || !getid)
3823
0
        return NULL;
3824
3825
0
    size_t s_len = strlen(s);
3826
0
    kstring_t ks = { 0, 0, NULL };
3827
3828
0
    const char *colon = NULL, *comma = NULL;
3829
0
    int quoted = 0;
3830
3831
0
    if (flags & HTS_PARSE_LIST)
3832
0
        flags &= ~HTS_PARSE_THOUSANDS_SEP;
3833
0
    else
3834
0
        flags |= HTS_PARSE_THOUSANDS_SEP;
3835
3836
0
    const char *s_end = s + s_len;
3837
3838
    // Braced quoting of references is permitted to resolve ambiguities.
3839
0
    if (*s == '{') {
3840
0
        const char *close = memchr(s, '}', s_len);
3841
0
        if (!close) {
3842
0
            hts_log_error("Mismatching braces in \"%s\"", s);
3843
0
            *tid = -1;
3844
0
            return NULL;
3845
0
        }
3846
0
        s++;
3847
0
        s_len--;
3848
0
        if (close[1] == ':')
3849
0
            colon = close+1;
3850
0
        quoted = 1; // number of trailing characters to trim
3851
3852
        // Truncate to this item only, if appropriate.
3853
0
        if (flags & HTS_PARSE_LIST) {
3854
0
            comma = strchr(close, ',');
3855
0
            if (comma) {
3856
0
                s_len = comma-s;
3857
0
                s_end = comma+1;
3858
0
            }
3859
0
        }
3860
0
    } else {
3861
        // Truncate to this item only, if appropriate.
3862
0
        if (flags & HTS_PARSE_LIST) {
3863
0
            comma = strchr(s, ',');
3864
0
            if (comma) {
3865
0
                s_len = comma-s;
3866
0
                s_end = comma+1;
3867
0
            }
3868
0
        }
3869
3870
0
        colon = hts_memrchr(s, ':', s_len);
3871
0
    }
3872
3873
    // No colon is simplest case; just check and return.
3874
0
    if (colon == NULL) {
3875
0
        *beg = 0; *end = HTS_POS_MAX;
3876
0
        kputsn(s, s_len-quoted, &ks); // convert to nul terminated string
3877
0
        if (!ks.s) {
3878
0
            *tid = -2;
3879
0
            return NULL;
3880
0
        }
3881
3882
0
        *tid = getid(hdr, ks.s);
3883
0
        free(ks.s);
3884
3885
0
        return *tid >= 0 ? s_end : NULL;
3886
0
    }
3887
3888
    // Has a colon, but check whole name first.
3889
0
    if (!quoted) {
3890
0
        *beg = 0; *end = HTS_POS_MAX;
3891
0
        kputsn(s, s_len, &ks); // convert to nul terminated string
3892
0
        if (!ks.s) {
3893
0
            *tid = -2;
3894
0
            return NULL;
3895
0
        }
3896
0
        if ((*tid = getid(hdr, ks.s)) >= 0) {
3897
            // Entire name matches, but also check this isn't
3898
            // ambiguous.  eg we have ref chr1 and ref chr1:100-200
3899
            // both present.
3900
0
            ks.l = 0;
3901
0
            kputsn(s, colon-s, &ks); // convert to nul terminated string
3902
0
            if (!ks.s) {
3903
0
                *tid = -2;
3904
0
                return NULL;
3905
0
            }
3906
0
            if (getid(hdr, ks.s) >= 0) {
3907
0
                free(ks.s);
3908
0
                *tid = -1;
3909
0
                hts_log_error("Range is ambiguous. "
3910
0
                              "Use {%s} or {%.*s}%s instead",
3911
0
                              s, (int)(colon-s), s, colon);
3912
0
                return NULL;
3913
0
            }
3914
0
            free(ks.s);
3915
3916
0
            return s_end;
3917
0
        }
3918
0
        if (*tid < -1) // Failed to parse header
3919
0
            return NULL;
3920
0
    }
3921
3922
    // Quoted, or unquoted and whole string isn't a name.
3923
    // Check the pre-colon part is valid.
3924
0
    ks.l = 0;
3925
0
    kputsn(s, colon-s-quoted, &ks); // convert to nul terminated string
3926
0
    if (!ks.s) {
3927
0
        *tid = -2;
3928
0
        return NULL;
3929
0
    }
3930
0
    *tid = getid(hdr, ks.s);
3931
0
    free(ks.s);
3932
0
    if (*tid < 0)
3933
0
        return NULL;
3934
3935
    // Finally parse the post-colon coordinates
3936
0
    char *hyphen;
3937
0
    *beg = hts_parse_decimal(colon+1, &hyphen, flags) - 1;
3938
0
    if (*beg < 0) {
3939
0
        if (*beg != -1 && *hyphen == '-' && colon[1] != '\0') {
3940
            // User specified zero, but we're 1-based.
3941
0
            hts_log_error("Coordinates must be > 0");
3942
0
            return NULL;
3943
0
        }
3944
0
        if (isdigit_c(*hyphen) || *hyphen == '\0' || *hyphen == ',') {
3945
            // interpret chr:-100 as chr:1-100
3946
0
            *end = *beg==-1 ? HTS_POS_MAX : -(*beg+1);
3947
0
            *beg = 0;
3948
0
            return s_end;
3949
0
        } else if (*beg < -1) {
3950
0
            hts_log_error("Unexpected string \"%s\" after region", hyphen);
3951
0
            return NULL;
3952
0
        }
3953
0
    }
3954
3955
0
    if (*hyphen == '\0' || ((flags & HTS_PARSE_LIST) && *hyphen == ',')) {
3956
0
        *end = flags & HTS_PARSE_ONE_COORD ? *beg+1 : HTS_POS_MAX;
3957
0
    } else if (*hyphen == '-') {
3958
0
        *end = hts_parse_decimal(hyphen+1, &hyphen, flags);
3959
0
        if (*hyphen != '\0' && *hyphen != ',') {
3960
0
            hts_log_error("Unexpected string \"%s\" after region", hyphen);
3961
0
            return NULL;
3962
0
        }
3963
0
    } else {
3964
0
        hts_log_error("Unexpected string \"%s\" after region", hyphen);
3965
0
        return NULL;
3966
0
    }
3967
3968
0
    if (*end == 0)
3969
0
        *end = HTS_POS_MAX; // interpret chr:100- as chr:100-<end>
3970
3971
0
    if (*beg >= *end) return NULL;
3972
3973
0
    return s_end;
3974
0
}
3975
3976
// Next release we should mark this as deprecated?
3977
// Use hts_parse_region above instead.
3978
const char *hts_parse_reg64(const char *s, hts_pos_t *beg, hts_pos_t *end)
3979
0
{
3980
0
    char *hyphen;
3981
0
    const char *colon = strrchr(s, ':');
3982
0
    if (colon == NULL) {
3983
0
        *beg = 0; *end = HTS_POS_MAX;
3984
0
        return s + strlen(s);
3985
0
    }
3986
3987
0
    *beg = hts_parse_decimal(colon+1, &hyphen, HTS_PARSE_THOUSANDS_SEP) - 1;
3988
0
    if (*beg < 0) *beg = 0;
3989
3990
0
    if (*hyphen == '\0') *end = HTS_POS_MAX;
3991
0
    else if (*hyphen == '-') *end = hts_parse_decimal(hyphen+1, NULL, HTS_PARSE_THOUSANDS_SEP);
3992
0
    else return NULL;
3993
3994
0
    if (*beg >= *end) return NULL;
3995
0
    return colon;
3996
0
}
3997
3998
const char *hts_parse_reg(const char *s, int *beg, int *end)
3999
0
{
4000
0
    hts_pos_t beg64 = 0, end64 = 0;
4001
0
    const char *colon = hts_parse_reg64(s, &beg64, &end64);
4002
0
    if (beg64 > INT_MAX) {
4003
0
        hts_log_error("Position %"PRId64" too large", beg64);
4004
0
        return NULL;
4005
0
    }
4006
0
    if (end64 > INT_MAX) {
4007
0
        if (end64 == HTS_POS_MAX) {
4008
0
            end64 = INT_MAX;
4009
0
        } else {
4010
0
            hts_log_error("Position %"PRId64" too large", end64);
4011
0
            return NULL;
4012
0
        }
4013
0
    }
4014
0
    *beg = beg64;
4015
0
    *end = end64;
4016
0
    return colon;
4017
0
}
4018
4019
hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec)
4020
0
{
4021
0
    int tid;
4022
0
    hts_pos_t beg, end;
4023
4024
0
    if (strcmp(reg, ".") == 0)
4025
0
        return itr_query(idx, HTS_IDX_START, 0, 0, readrec);
4026
0
    else if (strcmp(reg, "*") == 0)
4027
0
        return itr_query(idx, HTS_IDX_NOCOOR, 0, 0, readrec);
4028
4029
0
    if (!hts_parse_region(reg, &tid, &beg, &end, getid, hdr, HTS_PARSE_THOUSANDS_SEP))
4030
0
        return NULL;
4031
4032
0
    return itr_query(idx, tid, beg, end, readrec);
4033
0
}
4034
4035
0
hts_itr_t *hts_itr_regions(const hts_idx_t *idx, hts_reglist_t *reglist, int count, hts_name2id_f getid, void *hdr, hts_itr_multi_query_func *itr_specific, hts_readrec_func *readrec, hts_seek_func *seek, hts_tell_func *tell) {
4036
4037
0
    int i;
4038
4039
0
    if (!reglist)
4040
0
        return NULL;
4041
4042
0
    hts_itr_t *itr = (hts_itr_t*)calloc(1, sizeof(hts_itr_t));
4043
0
    if (itr) {
4044
0
        itr->n_reg = count;
4045
0
        itr->readrec = readrec;
4046
0
        itr->seek = seek;
4047
0
        itr->tell = tell;
4048
0
        itr->reg_list = reglist;
4049
0
        itr->finished = 0;
4050
0
        itr->nocoor = 0;
4051
0
        itr->multi = 1;
4052
4053
0
        for (i = 0; i < itr->n_reg; i++) {
4054
0
            if (itr->reg_list[i].reg) {
4055
0
                if (!strcmp(itr->reg_list[i].reg, ".")) {
4056
0
                    itr->reg_list[i].tid = HTS_IDX_START;
4057
0
                    continue;
4058
0
                }
4059
4060
0
                if (!strcmp(itr->reg_list[i].reg, "*")) {
4061
0
                    itr->reg_list[i].tid = HTS_IDX_NOCOOR;
4062
0
                    continue;
4063
0
                }
4064
4065
0
                itr->reg_list[i].tid = getid(hdr, reglist[i].reg);
4066
0
                if (itr->reg_list[i].tid < 0) {
4067
0
                    if (itr->reg_list[i].tid < -1) {
4068
0
                        hts_log_error("Failed to parse header");
4069
0
                        hts_itr_destroy(itr);
4070
0
                        return NULL;
4071
0
                    } else {
4072
0
                        hts_log_warning("Region '%s' specifies an unknown reference name. Continue anyway", reglist[i].reg);
4073
0
                    }
4074
0
                }
4075
0
            }
4076
0
        }
4077
4078
0
        qsort(itr->reg_list, itr->n_reg, sizeof(hts_reglist_t), compare_regions);
4079
0
        if (itr_specific(idx, itr) != 0) {
4080
0
            hts_log_error("Failed to create the multi-region iterator!");
4081
0
            hts_itr_destroy(itr);
4082
0
            itr = NULL;
4083
0
        }
4084
0
    }
4085
4086
0
    return itr;
4087
0
}
4088
4089
int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data)
4090
0
{
4091
0
    int ret, tid;
4092
0
    hts_pos_t beg, end;
4093
0
    if (iter == NULL || iter->finished) return -1;
4094
0
    if (iter->read_rest) {
4095
0
        if (iter->curr_off) { // seek to the start
4096
0
            if (bgzf_seek(fp, iter->curr_off, SEEK_SET) < 0) {
4097
0
                hts_log_error("Failed to seek to offset %"PRIu64"%s%s",
4098
0
                              iter->curr_off,
4099
0
                              errno ? ": " : "", strerror(errno));
4100
0
                return -2;
4101
0
            }
4102
0
            iter->curr_off = 0; // only seek once
4103
0
        }
4104
0
        ret = iter->readrec(fp, data, r, &tid, &beg, &end);
4105
0
        if (ret < 0) iter->finished = 1;
4106
0
        iter->curr_tid = tid;
4107
0
        iter->curr_beg = beg;
4108
0
        iter->curr_end = end;
4109
0
        return ret;
4110
0
    }
4111
    // A NULL iter->off should always be accompanied by iter->finished.
4112
0
    assert(iter->off != NULL);
4113
0
    for (;;) {
4114
0
        if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
4115
0
            if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks
4116
0
            if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
4117
0
                if (bgzf_seek(fp, iter->off[iter->i+1].u, SEEK_SET) < 0) {
4118
0
                    hts_log_error("Failed to seek to offset %"PRIu64"%s%s",
4119
0
                                  iter->off[iter->i+1].u,
4120
0
                                  errno ? ": " : "", strerror(errno));
4121
0
                    return -2;
4122
0
                }
4123
0
                iter->curr_off = bgzf_tell(fp);
4124
0
            }
4125
0
            ++iter->i;
4126
0
        }
4127
0
        if ((ret = iter->readrec(fp, data, r, &tid, &beg, &end)) >= 0) {
4128
0
            iter->curr_off = bgzf_tell(fp);
4129
0
            if (tid != iter->tid || beg >= iter->end) { // no need to proceed
4130
0
                ret = -1; break;
4131
0
            } else if (end > iter->beg && iter->end > beg) {
4132
0
                iter->curr_tid = tid;
4133
0
                iter->curr_beg = beg;
4134
0
                iter->curr_end = end;
4135
0
                return ret;
4136
0
            }
4137
0
        } else break; // end of file or error
4138
0
    }
4139
0
    iter->finished = 1;
4140
0
    return ret;
4141
0
}
4142
4143
int hts_itr_multi_next(htsFile *fd, hts_itr_t *iter, void *r)
4144
0
{
4145
0
    void *fp;
4146
0
    int ret, tid, i, cr, ci;
4147
0
    hts_pos_t beg, end;
4148
0
    hts_reglist_t *found_reg;
4149
4150
0
    if (iter == NULL || iter->finished) return -1;
4151
4152
0
    if (iter->is_cram) {
4153
0
        fp = fd->fp.cram;
4154
0
    } else {
4155
0
        fp = fd->fp.bgzf;
4156
0
    }
4157
4158
0
    if (iter->read_rest) {
4159
0
        if (iter->curr_off) { // seek to the start
4160
0
            if (iter->seek(fp, iter->curr_off, SEEK_SET) < 0) {
4161
0
                hts_log_error("Seek at offset %" PRIu64 " failed.", iter->curr_off);
4162
0
                return -1;
4163
0
            }
4164
0
            iter->curr_off = 0; // only seek once
4165
0
        }
4166
4167
0
        ret = iter->readrec(fp, fd, r, &tid, &beg, &end);
4168
0
        if (ret < 0)
4169
0
            iter->finished = 1;
4170
4171
0
        iter->curr_tid = tid;
4172
0
        iter->curr_beg = beg;
4173
0
        iter->curr_end = end;
4174
4175
0
        return ret;
4176
0
    }
4177
    // A NULL iter->off should always be accompanied by iter->finished.
4178
0
    assert(iter->off != NULL || iter->nocoor != 0);
4179
4180
0
    int next_range = 0;
4181
0
    for (;;) {
4182
        // Note that due to the way bam indexing works, iter->off may contain
4183
        // file chunks that are not actually needed as they contain data
4184
        // beyond the end of the requested region.  These are filtered out
4185
        // by comparing the tid and index into hts_reglist_t::intervals
4186
        // (packed for reasons of convenience into iter->off[iter->i].max)
4187
        // associated with the file region with iter->curr_tid and
4188
        // iter->curr_intv.
4189
4190
0
        if (next_range
4191
0
            || iter->curr_off == 0
4192
0
            || iter->i >= iter->n_off
4193
0
            || iter->curr_off >= iter->off[iter->i].v
4194
0
            || (iter->off[iter->i].max >> 32 == iter->curr_tid
4195
0
                && (iter->off[iter->i].max & 0xffffffff) < iter->curr_intv)) {
4196
4197
            // Jump to the next chunk.  It may be necessary to skip more
4198
            // than one as the iter->off list can include overlapping entries.
4199
0
            do {
4200
0
                iter->i++;
4201
0
            } while (iter->i < iter->n_off
4202
0
                     && (iter->curr_off >= iter->off[iter->i].v
4203
0
                         || (iter->off[iter->i].max >> 32 == iter->curr_tid
4204
0
                             && (iter->off[iter->i].max & 0xffffffff) < iter->curr_intv)));
4205
4206
0
            if (iter->is_cram && iter->i < iter->n_off) {
4207
                // Ensure iter->curr_reg is correct.
4208
                //
4209
                // We need this for CRAM as we shortcut some of the later
4210
                // logic by getting an end-of-range and continuing to the
4211
                // next offset.
4212
                //
4213
                // We cannot do this for BAM (and fortunately do not need to
4214
                // either) because in BAM world a query to genomic positions
4215
                // GX and GY leading to a seek offsets PX and PY may have
4216
                // GX > GY and PX < PY.  (This is due to the R-tree and falling
4217
                // between intervals, bumping up to a higher bin.)
4218
                // CRAM strictly follows PX >= PY if GX >= GY, so this logic
4219
                // works.
4220
0
                int want_tid = iter->off[iter->i].max >> 32;
4221
0
                if (!(iter->curr_reg < iter->n_reg &&
4222
0
                      iter->reg_list[iter->curr_reg].tid == want_tid)) {
4223
0
                    int j;
4224
0
                    for (j = 0; j < iter->n_reg; j++)
4225
0
                        if (iter->reg_list[j].tid == want_tid)
4226
0
                            break;
4227
0
                    if (j == iter->n_reg)
4228
0
                        return -1;
4229
0
                    iter->curr_reg = j;
4230
0
                    iter->curr_tid = iter->reg_list[iter->curr_reg].tid;
4231
0
                };
4232
0
                iter->curr_intv = iter->off[iter->i].max & 0xffffffff;
4233
0
            }
4234
4235
0
            if (iter->i >= iter->n_off) { // no more chunks, except NOCOORs
4236
0
                if (iter->nocoor) {
4237
0
                    next_range = 0;
4238
0
                    if (iter->seek(fp, iter->nocoor_off, SEEK_SET) < 0) {
4239
0
                        hts_log_error("Seek at offset %" PRIu64 " failed.", iter->nocoor_off);
4240
0
                        return -1;
4241
0
                    }
4242
0
                    if (iter->is_cram) {
4243
0
                        cram_range r = { HTS_IDX_NOCOOR };
4244
0
                        cram_set_option(fp, CRAM_OPT_RANGE_NOSEEK, &r);
4245
0
                    }
4246
4247
                    // The first slice covering the unmapped reads might
4248
                    // contain a few mapped reads, so scroll
4249
                    // forward until finding the first unmapped read.
4250
0
                    do {
4251
0
                        ret = iter->readrec(fp, fd, r, &tid, &beg, &end);
4252
0
                    } while (tid >= 0 && ret >=0);
4253
4254
0
                    if (ret < 0)
4255
0
                        iter->finished = 1;
4256
0
                    else
4257
0
                        iter->read_rest = 1;
4258
4259
0
                    iter->curr_off = 0; // don't seek any more
4260
0
                    iter->curr_tid = tid;
4261
0
                    iter->curr_beg = beg;
4262
0
                    iter->curr_end = end;
4263
4264
0
                    return ret;
4265
0
                } else {
4266
0
                    ret = -1; break;
4267
0
                }
4268
0
            } else if (iter->i < iter->n_off) {
4269
                // New chunk may overlap the last one, so ensure we
4270
                // only seek forwards.
4271
0
                if (iter->curr_off < iter->off[iter->i].u || next_range) {
4272
0
                    iter->curr_off = iter->off[iter->i].u;
4273
4274
                    // CRAM has the capability of setting an end location.
4275
                    // This means multi-threaded decodes can stop once they
4276
                    // reach that point, rather than pointlessly decoding
4277
                    // more slices than we'll be using.
4278
                    //
4279
                    // We have to be careful here.  Whenever we set the cram
4280
                    // range we need a corresponding seek in order to ensure
4281
                    // we can safely decode at that offset.  We use next_range
4282
                    // var to ensure this is always true; this is set on
4283
                    // end-of-range condition. It's never modified for BAM.
4284
0
                    if (iter->is_cram) {
4285
                        // Next offset.[uv] tuple, but it's already been
4286
                        // included in our cram range, so don't seek and don't
4287
                        // reset range so we can efficiently multi-thread.
4288
0
                        if (next_range || iter->curr_off >= iter->end) {
4289
0
                            if (iter->seek(fp, iter->curr_off, SEEK_SET) < 0) {
4290
0
                                hts_log_error("Seek at offset %" PRIu64
4291
0
                                        " failed.", iter->curr_off);
4292
0
                                return -1;
4293
0
                            }
4294
4295
                            // Find the genomic range matching this interval.
4296
0
                            int j;
4297
0
                            hts_reglist_t *rl = &iter->reg_list[iter->curr_reg];
4298
0
                            cram_range r = {
4299
0
                                    rl->tid,
4300
0
                                    rl->intervals[iter->curr_intv].beg,
4301
0
                                    rl->intervals[iter->curr_intv].end
4302
0
                            };
4303
4304
                            // Expand it up to cover neighbouring intervals.
4305
                            // Note we can only have a single chromosome in a
4306
                            // range, so if we detect our blocks span chromosomes
4307
                            // or we have a multi-ref mode slice, we just use
4308
                            // HTS_IDX_START refid instead.  This doesn't actually
4309
                            // seek (due to CRAM_OPT_RANGE_NOSEEK) and is simply
4310
                            // and indicator of decoding with no end limit.
4311
                            //
4312
                            // That isn't as efficient as it could be, but it's
4313
                            // no poorer than before and it works.
4314
0
                            int tid = r.refid;
4315
0
                            int64_t end = r.end;
4316
0
                            int64_t v = iter->off[iter->i].v;
4317
0
                            j = iter->i+1;
4318
0
                            while (j < iter->n_off) {
4319
0
                                if (iter->off[j].u > v)
4320
0
                                    break;
4321
4322
0
                                uint64_t max = iter->off[j].max;
4323
0
                                if ((max>>32) != tid)
4324
0
                                    tid = HTS_IDX_START; // => no range limit
4325
4326
0
                                if (end < rl->intervals[max & 0xffffffff].end)
4327
0
                                    end = rl->intervals[max & 0xffffffff].end;
4328
0
                                if (v < iter->off[j].v)
4329
0
                                    v = iter->off[j].v;
4330
0
                                j++;
4331
0
                            }
4332
0
                            r.refid = tid;
4333
0
                            r.end = end;
4334
4335
                            // Remember maximum 'v' here so we don't do
4336
                            // unnecessary subsequent seeks for the next
4337
                            // regions.  We can't change curr_off, but
4338
                            // beg/end are used only by single region iterator so
4339
                            // we cache it there to avoid changing the struct.
4340
0
                            iter->end = v;
4341
4342
0
                            cram_set_option(fp, CRAM_OPT_RANGE_NOSEEK, &r);
4343
0
                            next_range = 0;
4344
0
                        }
4345
0
                    } else { // Not CRAM
4346
0
                        if (iter->seek(fp, iter->curr_off, SEEK_SET) < 0) {
4347
0
                            hts_log_error("Seek at offset %" PRIu64 " failed.",
4348
0
                                          iter->curr_off);
4349
0
                            return -1;
4350
0
                        }
4351
0
                    }
4352
0
                }
4353
0
            }
4354
0
        }
4355
4356
0
        ret = iter->readrec(fp, fd, r, &tid, &beg, &end);
4357
0
        if (ret < 0) {
4358
0
            if (iter->is_cram && cram_eof(fp)) {
4359
                // Skip to end of range
4360
                //
4361
                // We should never be adjusting curr_off manually unless
4362
                // we also can guarantee we'll be doing a seek after to
4363
                // a new location.  Otherwise we'll be reading wrong offset
4364
                // for the next container.
4365
                //
4366
                // We ensure this by adjusting our CRAM_OPT_RANGE
4367
                // accordingly above, but to double check we also
4368
                // set the skipped_block flag to enforce a seek also.
4369
0
                iter->curr_off = iter->off[iter->i].v;
4370
0
                next_range = 1;
4371
4372
                // Next region
4373
0
                if (++iter->curr_intv >= iter->reg_list[iter->curr_reg].count){
4374
0
                    if (++iter->curr_reg >= iter->n_reg)
4375
0
                        break;
4376
0
                    iter->curr_intv = 0;
4377
0
                    iter->curr_tid = iter->reg_list[iter->curr_reg].tid;
4378
0
                }
4379
0
                continue;
4380
0
            } else {
4381
0
                break;
4382
0
            }
4383
0
        }
4384
4385
0
        iter->curr_off = iter->tell(fp);
4386
4387
0
        if (tid != iter->curr_tid) {
4388
0
            hts_reglist_t key;
4389
0
            key.tid = tid;
4390
4391
0
            found_reg = (hts_reglist_t *)bsearch(&key, iter->reg_list,
4392
0
                                                 iter->n_reg,
4393
0
                                                 sizeof(hts_reglist_t),
4394
0
                                                 compare_regions);
4395
0
            if (!found_reg)
4396
0
                continue;
4397
4398
0
            iter->curr_reg = (found_reg - iter->reg_list);
4399
0
            iter->curr_tid = tid;
4400
0
            iter->curr_intv = 0;
4401
0
        }
4402
4403
0
        cr = iter->curr_reg;
4404
0
        ci = iter->curr_intv;
4405
4406
0
        for (i = ci; i < iter->reg_list[cr].count; i++) {
4407
0
            if (end > iter->reg_list[cr].intervals[i].beg &&
4408
0
                iter->reg_list[cr].intervals[i].end > beg) {
4409
0
                iter->curr_beg = beg;
4410
0
                iter->curr_end = end;
4411
0
                iter->curr_intv = i;
4412
4413
0
                return ret;
4414
0
            }
4415
4416
            // Check if the read starts beyond intervals[i].end
4417
            // If so, the interval is finished so move on to the next.
4418
0
            if (beg > iter->reg_list[cr].intervals[i].end)
4419
0
                iter->curr_intv = i + 1;
4420
4421
            // No need to keep searching if the read ends before intervals[i].beg
4422
0
            if (end < iter->reg_list[cr].intervals[i].beg)
4423
0
                break;
4424
0
        }
4425
0
    }
4426
0
    iter->finished = 1;
4427
4428
0
    return ret;
4429
0
}
4430
4431
/**********************
4432
 *** Retrieve index ***
4433
 **********************/
4434
// Local_fn and local_len will return a sub-region of 'fn'.
4435
// Eg http://elsewhere/dir/foo.bam.bai?a=b may return
4436
// foo.bam.bai via local_fn and local_len.
4437
//
4438
// Returns -1 if index couldn't be opened.
4439
//         -2 on other errors
4440
static int idx_test_and_fetch(const char *fn, const char **local_fn, int *local_len, int download)
4441
0
{
4442
0
    hFILE *remote_hfp = NULL;
4443
0
    hFILE *local_fp = NULL;
4444
0
    int save_errno;
4445
0
    htsFormat fmt;
4446
0
    kstring_t s = KS_INITIALIZE;
4447
0
    kstring_t tmps = KS_INITIALIZE;
4448
4449
0
    if (hisremote(fn)) {
4450
0
        const int buf_size = 1 * 1024 * 1024;
4451
0
        int l;
4452
0
        const char *p, *e;
4453
        // Ignore ?# params: eg any file.fmt?param=val, except for S3 URLs
4454
0
        e = fn + ((strncmp(fn, "s3://", 5) && strncmp(fn, "s3+http://", 10) && strncmp(fn, "s3+https://", 11)) ? strcspn(fn, "?#") : strcspn(fn, "?"));
4455
        // Find the previous slash from there.
4456
0
        p = e;
4457
0
        while (p > fn && *p != '/') p--;
4458
0
        if (*p == '/') p++;
4459
4460
        // Attempt to open local file first
4461
0
        kputsn(p, e-p, &s);
4462
0
        if (access(s.s, R_OK) == 0)
4463
0
        {
4464
0
            free(s.s);
4465
0
            *local_fn = p;
4466
0
            *local_len = e-p;
4467
0
            return 0;
4468
0
        }
4469
4470
        // Attempt to open remote file. Stay quiet on failure, it is OK to fail when trying first .csi then .bai or .tbi index.
4471
0
        if ((remote_hfp = hopen(fn, "r")) == 0) {
4472
0
            hts_log_info("Failed to open index file '%s'", fn);
4473
0
            free(s.s);
4474
0
            return -1;
4475
0
        }
4476
0
        if (hts_detect_format2(remote_hfp, fn, &fmt)) {
4477
0
            hts_log_error("Failed to detect format of index file '%s'", fn);
4478
0
            goto fail;
4479
0
        }
4480
0
        if (fmt.category != index_file || (fmt.format != bai &&  fmt.format != csi && fmt.format != tbi
4481
0
                && fmt.format != crai && fmt.format != fai_format)) {
4482
0
            hts_log_error("Format of index file '%s' is not supported", fn);
4483
0
            goto fail;
4484
0
        }
4485
4486
0
        if (download) {
4487
0
            if ((local_fp = hts_open_tmpfile(s.s, "wx", &tmps))