Coverage Report

Created: 2024-02-11 06:32

/src/htslib/hts.c
Line
Count
Source (jump to first uncovered line)
1
/*  hts.c -- format-neutral I/O, indexing, and iterator API functions.
2
3
    Copyright (C) 2008, 2009, 2012-2024 Genome Research Ltd.
4
    Copyright (C) 2012, 2013 Broad Institute.
5
6
    Author: Heng Li <lh3@sanger.ac.uk>
7
8
Permission is hereby granted, free of charge, to any person obtaining a copy
9
of this software and associated documentation files (the "Software"), to deal
10
in the Software without restriction, including without limitation the rights
11
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
copies of the Software, and to permit persons to whom the Software is
13
furnished to do so, subject to the following conditions:
14
15
The above copyright notice and this permission notice shall be included in
16
all copies or substantial portions of the Software.
17
18
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24
DEALINGS IN THE SOFTWARE.  */
25
26
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
27
#include <config.h>
28
29
#include <zlib.h>
30
#include <stdio.h>
31
#include <string.h>
32
#include <strings.h>
33
#include <stdlib.h>
34
#include <unistd.h>
35
#include <inttypes.h>
36
#include <limits.h>
37
#include <stdint.h>
38
#include <fcntl.h>
39
#include <errno.h>
40
#include <time.h>
41
#include <sys/stat.h>
42
#include <assert.h>
43
44
#ifdef HAVE_LIBLZMA
45
#ifdef HAVE_LZMA_H
46
#include <lzma.h>
47
#else
48
#include "os/lzma_stub.h"
49
#endif
50
#endif
51
52
#include "htslib/hts.h"
53
#include "htslib/bgzf.h"
54
#include "cram/cram.h"
55
#include "htslib/hfile.h"
56
#include "htslib/hts_endian.h"
57
#include "version.h"
58
#include "config_vars.h"
59
#include "hts_internal.h"
60
#include "hfile_internal.h"
61
#include "sam_internal.h"
62
#include "htslib/hts_expr.h"
63
#include "htslib/hts_os.h" // drand48
64
65
#include "htslib/khash.h"
66
#include "htslib/kseq.h"
67
#include "htslib/ksort.h"
68
#include "htslib/tbx.h"
69
#if defined(HAVE_EXTERNAL_LIBHTSCODECS)
70
#include <htscodecs/htscodecs.h>
71
#else
72
#include "htscodecs/htscodecs/htscodecs.h"
73
#endif
74
75
#ifndef EFTYPE
76
9
#define EFTYPE ENOEXEC
77
#endif
78
79
262k
KHASH_INIT2(s2i,, kh_cstr_t, int64_t, 1, kh_str_hash_func, kh_str_hash_equal)
80
81
HTSLIB_EXPORT
82
int hts_verbose = HTS_LOG_WARNING;
83
84
const char *hts_version()
85
2
{
86
2
    return HTS_VERSION_TEXT;
87
2
}
88
89
0
unsigned int hts_features(void) {
90
0
    unsigned int feat = HTS_FEATURE_HTSCODECS; // Always present
91
92
0
#ifdef PACKAGE_URL
93
0
    feat |= HTS_FEATURE_CONFIGURE;
94
0
#endif
95
96
#ifdef ENABLE_PLUGINS
97
    feat |= HTS_FEATURE_PLUGINS;
98
#endif
99
100
0
#ifdef HAVE_LIBCURL
101
0
    feat |= HTS_FEATURE_LIBCURL;
102
0
#endif
103
104
0
#ifdef ENABLE_S3
105
0
    feat |= HTS_FEATURE_S3;
106
0
#endif
107
108
0
#ifdef ENABLE_GCS
109
0
    feat |= HTS_FEATURE_GCS;
110
0
#endif
111
112
#ifdef HAVE_LIBDEFLATE
113
    feat |= HTS_FEATURE_LIBDEFLATE;
114
#endif
115
116
0
#ifdef HAVE_LIBLZMA
117
0
    feat |= HTS_FEATURE_LZMA;
118
0
#endif
119
120
0
#ifdef HAVE_LIBBZ2
121
0
    feat |= HTS_FEATURE_BZIP2;
122
0
#endif
123
124
0
    return feat;
125
0
}
126
127
0
const char *hts_test_feature(unsigned int id) {
128
0
    unsigned int feat = hts_features();
129
130
0
    switch (id) {
131
0
    case HTS_FEATURE_CONFIGURE:
132
0
        return feat & HTS_FEATURE_CONFIGURE ? "yes" : NULL;
133
0
    case HTS_FEATURE_PLUGINS:
134
0
        return feat & HTS_FEATURE_PLUGINS ? "yes" : NULL;
135
0
    case HTS_FEATURE_LIBCURL:
136
0
        return feat & HTS_FEATURE_LIBCURL ? "yes" : NULL;
137
0
    case HTS_FEATURE_S3:
138
0
        return feat & HTS_FEATURE_S3 ? "yes" : NULL;
139
0
    case HTS_FEATURE_GCS:
140
0
        return feat & HTS_FEATURE_GCS ? "yes" : NULL;
141
0
    case HTS_FEATURE_LIBDEFLATE:
142
0
        return feat & HTS_FEATURE_LIBDEFLATE ? "yes" : NULL;
143
0
    case HTS_FEATURE_BZIP2:
144
0
        return feat & HTS_FEATURE_BZIP2 ? "yes" : NULL;
145
0
    case HTS_FEATURE_LZMA:
146
0
        return feat & HTS_FEATURE_LZMA ? "yes" : NULL;
147
148
0
    case HTS_FEATURE_HTSCODECS:
149
0
        return htscodecs_version();
150
151
0
    case HTS_FEATURE_CC:
152
0
        return HTS_CC;
153
0
    case HTS_FEATURE_CFLAGS:
154
0
        return HTS_CFLAGS;
155
0
    case HTS_FEATURE_LDFLAGS:
156
0
        return HTS_LDFLAGS;
157
0
    case HTS_FEATURE_CPPFLAGS:
158
0
        return HTS_CPPFLAGS;
159
160
0
    default:
161
0
        fprintf(stderr, "Unknown feature code: %u\n", id);
162
0
    }
163
164
0
    return NULL;
165
0
}
166
167
// Note this implementation also means we can just "strings" the library
168
// to find the configuration parameters.
169
0
const char *hts_feature_string(void) {
170
0
    static char config[1200];
171
0
    const char *flags=
172
173
0
#ifdef PACKAGE_URL
174
0
    "build=configure "
175
#else
176
    "build=Makefile "
177
#endif
178
179
0
#ifdef HAVE_LIBCURL
180
0
    "libcurl=yes "
181
#else
182
    "libcurl=no "
183
#endif
184
185
0
#ifdef ENABLE_S3
186
0
    "S3=yes "
187
#else
188
    "S3=no "
189
#endif
190
191
0
#ifdef ENABLE_GCS
192
0
    "GCS=yes "
193
#else
194
    "GCS=no "
195
#endif
196
197
#ifdef HAVE_LIBDEFLATE
198
    "libdeflate=yes "
199
#else
200
0
    "libdeflate=no "
201
0
#endif
202
203
0
#ifdef HAVE_LIBLZMA
204
0
    "lzma=yes "
205
#else
206
    "lzma=no "
207
#endif
208
209
0
#ifdef HAVE_LIBBZ2
210
0
    "bzip2=yes "
211
#else
212
    "bzip2=no "
213
#endif
214
215
// "plugins=" must stay at the end as it is followed by "plugin-path="
216
#ifdef ENABLE_PLUGINS
217
    "plugins=yes";
218
#else
219
0
    "plugins=no";
220
0
#endif
221
222
#ifdef ENABLE_PLUGINS
223
    snprintf(config, sizeof(config),
224
             "%s plugin-path=%.1000s htscodecs=%.40s",
225
             flags, hts_plugin_path(), htscodecs_version());
226
#else
227
0
    snprintf(config, sizeof(config),
228
0
             "%s htscodecs=%.40s",
229
0
             flags, htscodecs_version());
230
0
#endif
231
0
    return config;
232
0
}
233
234
235
HTSLIB_EXPORT
236
const unsigned char seq_nt16_table[256] = {
237
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
238
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
239
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
240
     1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15,
241
    15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
242
    15,15, 5, 6,  8,15, 7, 9, 15,10,15,15, 15,15,15,15,
243
    15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
244
    15,15, 5, 6,  8,15, 7, 9, 15,10,15,15, 15,15,15,15,
245
246
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
247
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
248
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
249
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
250
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
251
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
252
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
253
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
254
};
255
256
HTSLIB_EXPORT
257
const char seq_nt16_str[] = "=ACMGRSVTWYHKDBN";
258
259
HTSLIB_EXPORT
260
const int seq_nt16_int[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
261
262
/**********************
263
 *** Basic file I/O ***
264
 **********************/
265
266
static enum htsFormatCategory format_category(enum htsExactFormat fmt)
267
18.8k
{
268
18.8k
    switch (fmt) {
269
0
    case bam:
270
0
    case sam:
271
4.54k
    case cram:
272
4.54k
    case fastq_format:
273
4.54k
    case fasta_format:
274
4.54k
        return sequence_data;
275
276
0
    case vcf:
277
0
    case bcf:
278
0
        return variant_data;
279
280
0
    case bai:
281
0
    case crai:
282
0
    case csi:
283
0
    case fai_format:
284
0
    case fqi_format:
285
0
    case gzi:
286
0
    case tbi:
287
0
        return index_file;
288
289
0
    case bed:
290
0
    case d4_format:
291
0
        return region_list;
292
293
0
    case htsget:
294
0
    case hts_crypt4gh_format:
295
0
        return unknown_category;
296
297
0
    case unknown_format:
298
7.15k
    case binary_format:
299
14.3k
    case text_format:
300
14.3k
    case empty_format:
301
14.3k
    case format_maximum:
302
14.3k
        break;
303
18.8k
    }
304
305
14.3k
    return unknown_category;
306
18.8k
}
307
308
// Decompress several hundred bytes by peeking at the file, which must be
309
// positioned at the start of a GZIP block.
310
static ssize_t
311
decompress_peek_gz(hFILE *fp, unsigned char *dest, size_t destsize)
312
2.82k
{
313
2.82k
    unsigned char buffer[2048];
314
2.82k
    z_stream zs;
315
2.82k
    ssize_t npeek = hpeek(fp, buffer, sizeof buffer);
316
317
2.82k
    if (npeek < 0) return -1;
318
319
2.82k
    zs.zalloc = NULL;
320
2.82k
    zs.zfree = NULL;
321
2.82k
    zs.next_in = buffer;
322
2.82k
    zs.avail_in = npeek;
323
2.82k
    zs.next_out = dest;
324
2.82k
    zs.avail_out = destsize;
325
2.82k
    if (inflateInit2(&zs, 31) != Z_OK) return -1;
326
327
2.82k
    int ret;
328
2.82k
    const unsigned char *last_in = buffer;
329
5.63k
    while (zs.avail_out > 0) {
330
2.82k
        ret = inflate(&zs, Z_SYNC_FLUSH);
331
2.82k
        if (ret == Z_STREAM_END) {
332
0
            if (last_in == zs.next_in)
333
0
                break; // Paranoia to avoid potential looping. Shouldn't happen
334
0
            else
335
0
                last_in = zs.next_in;
336
0
            inflateReset(&zs);
337
2.82k
        } else if (ret != Z_OK) {
338
            // eg Z_BUF_ERROR due to avail_in/out becoming zero
339
18
            break;
340
18
        }
341
2.82k
    }
342
343
    // NB: zs.total_out is changed by inflateReset, so use pointer diff instead
344
2.82k
    destsize = zs.next_out - dest;
345
2.82k
    inflateEnd(&zs);
346
347
2.82k
    return destsize;
348
2.82k
}
349
350
#ifdef HAVE_LIBLZMA
351
// Similarly decompress a portion by peeking at the file, which must be
352
// positioned at the start of the file.
353
static ssize_t
354
decompress_peek_xz(hFILE *fp, unsigned char *dest, size_t destsize)
355
0
{
356
0
    unsigned char buffer[2048];
357
0
    ssize_t npeek = hpeek(fp, buffer, sizeof buffer);
358
0
    if (npeek < 0) return -1;
359
360
0
    lzma_stream ls = LZMA_STREAM_INIT;
361
0
    if (lzma_stream_decoder(&ls, lzma_easy_decoder_memusage(9), 0) != LZMA_OK)
362
0
        return -1;
363
364
0
    ls.next_in = buffer;
365
0
    ls.avail_in = npeek;
366
0
    ls.next_out = dest;
367
0
    ls.avail_out = destsize;
368
369
0
    int r = lzma_code(&ls, LZMA_RUN);
370
0
    if (! (r == LZMA_OK || r == LZMA_STREAM_END)) {
371
0
        lzma_end(&ls);
372
0
        return -1;
373
0
    }
374
375
0
    destsize = ls.total_out;
376
0
    lzma_end(&ls);
377
378
0
    return destsize;
379
0
}
380
#endif
381
382
// Parse "x.y" text, taking care because the string is not NUL-terminated
383
// and filling in major/minor only when the digits are followed by a delimiter,
384
// so we don't misread "1.10" as "1.1" due to reaching the end of the buffer.
385
static void
386
parse_version(htsFormat *fmt, const unsigned char *u, const unsigned char *ulim)
387
247
{
388
247
    const char *s    = (const char *) u;
389
247
    const char *slim = (const char *) ulim;
390
247
    short v;
391
392
247
    fmt->version.major = fmt->version.minor = -1;
393
394
394
    for (v = 0; s < slim && isdigit_c(*s); s++)
395
147
        v = 10 * v + *s - '0';
396
397
247
    if (s < slim) {
398
247
        fmt->version.major = v;
399
247
        if (*s == '.') {
400
0
            s++;
401
0
            for (v = 0; s < slim && isdigit_c(*s); s++)
402
0
                v = 10 * v + *s - '0';
403
0
            if (s < slim)
404
0
                fmt->version.minor = v;
405
0
        }
406
247
        else
407
247
            fmt->version.minor = 0;
408
247
    }
409
247
}
410
411
static int
412
cmp_nonblank(const char *key, const unsigned char *u, const unsigned char *ulim)
413
1.87k
{
414
1.87k
    const unsigned char *ukey = (const unsigned char *) key;
415
416
1.87k
    while (*ukey)
417
1.87k
        if (u >= ulim) return +1;
418
1.87k
        else if (isspace_c(*u)) u++;
419
1.87k
        else if (*u != *ukey) return (*ukey < *u)? -1 : +1;
420
0
        else u++, ukey++;
421
422
0
    return 0;
423
1.87k
}
424
425
static int is_text_only(const unsigned char *u, const unsigned char *ulim)
426
1.87k
{
427
51.1k
    for (; u < ulim; u++)
428
49.2k
        if (! (*u >= ' ' || *u == '\t' || *u == '\r' || *u == '\n'))
429
9
            return 0;
430
431
1.86k
    return 1;
432
1.87k
}
433
434
static int is_fastaq(const unsigned char *u, const unsigned char *ulim)
435
1.86k
{
436
1.86k
    const unsigned char *eol = memchr(u, '\n', ulim - u);
437
438
    // Check that the first line is entirely textual
439
1.86k
    if (! is_text_only(u, eol? eol : ulim)) return 0;
440
441
    // If the first line is very long, consider the file to indeed be FASTA/Q
442
1.86k
    if (eol == NULL) return 1;
443
444
1.82k
    u = eol+1; // Now points to the first character of the second line
445
446
    // Scan over all base-encoding letters (including 'N' but not SEQ's '=')
447
2.08k
    while (u < ulim && (seq_nt16_table[*u] != 15 || toupper(*u) == 'N')) {
448
264
        if (*u == '=') return 0;
449
264
        u++;
450
264
    }
451
452
1.82k
    return (u == ulim || *u == '\r' || *u == '\n')? 1 : 0;
453
1.82k
}
454
455
// Parse tab-delimited text, filling in a string of column types and returning
456
// the number of columns spotted (within [u,ulim), and up to column_len) or -1
457
// if non-printable characters were seen.  Column types:
458
//     i: integer, s: strand sign, C: CIGAR, O: SAM optional field, Z: anything
459
static int
460
parse_tabbed_text(char *columns, int column_len,
461
                  const unsigned char *u, const unsigned char *ulim,
462
                  int *complete)
463
13
{
464
13
    const char *str  = (const char *) u;
465
13
    const char *slim = (const char *) ulim;
466
13
    const char *s;
467
13
    int ncolumns = 0;
468
469
13
    enum { digit = 1, leading_sign = 2, cigar_operator = 4, other = 8 };
470
13
    unsigned seen = 0;
471
13
    *complete = 0;
472
473
1.96k
    for (s = str; s < slim; s++)
474
1.96k
        if (*s >= ' ') {
475
1.93k
            if (isdigit_c(*s))
476
168
                seen |= digit;
477
1.76k
            else if ((*s == '+' || *s == '-') && s == str)
478
2
                seen |= leading_sign;
479
1.76k
            else if (strchr(BAM_CIGAR_STR, *s) && s > str && isdigit_c(s[-1]))
480
4
                seen |= cigar_operator;
481
1.76k
            else
482
1.76k
                seen |= other;
483
1.93k
        }
484
27
        else if (*s == '\t' || *s == '\r' || *s == '\n') {
485
18
            size_t len = s - str;
486
18
            char type;
487
488
18
            if (seen == digit || seen == (leading_sign|digit)) type = 'i';
489
16
            else if (seen == (digit|cigar_operator)) type = 'C';
490
14
            else if (len == 1)
491
3
                switch (str[0]) {
492
1
                case '*': type = 'C'; break;
493
2
                case '+': case '-': case '.': type = 's'; break;
494
0
                default: type = 'Z'; break;
495
3
                }
496
11
            else if (len >= 5 && str[2] == ':' && str[4] == ':') type = 'O';
497
11
            else type = 'Z';
498
499
18
            columns[ncolumns++] = type;
500
18
            if (*s != '\t' || ncolumns >= column_len - 1) {
501
2
                *complete = 1; // finished the line or more columns than needed
502
2
                break;
503
2
            }
504
505
16
            str = s + 1;
506
16
            seen = 0;
507
16
        }
508
9
        else return -1;
509
510
4
    columns[ncolumns] = '\0';
511
4
    return ncolumns;
512
13
}
513
514
// Match COLUMNS as a prefix against PATTERN (so COLUMNS may run out first).
515
// Returns len(COLUMNS) (modulo '+'), or 0 if there is a mismatched entry.
516
static int colmatch(const char *columns, const char *pattern)
517
4
{
518
4
    int i;
519
8
    for (i = 0; columns[i] != '\0'; i++) {
520
4
        if (pattern[i] == '+') return i;
521
4
        if (! (columns[i] == pattern[i] || pattern[i] == 'Z')) return 0;
522
4
    }
523
524
4
    return i;
525
4
}
526
527
int hts_detect_format(hFILE *hfile, htsFormat *fmt)
528
0
{
529
0
    return hts_detect_format2(hfile, NULL, fmt);
530
0
}
531
532
int hts_detect_format2(hFILE *hfile, const char *fname, htsFormat *fmt)
533
26.4k
{
534
26.4k
    char extension[HTS_MAX_EXT_LEN], columns[24];
535
26.4k
    unsigned char s[1024];
536
26.4k
    int complete = 0;
537
26.4k
    ssize_t len = hpeek(hfile, s, 18);
538
26.4k
    if (len < 0) return -1;
539
540
26.4k
    fmt->category = unknown_category;
541
26.4k
    fmt->format = unknown_format;
542
26.4k
    fmt->version.major = fmt->version.minor = -1;
543
26.4k
    fmt->compression = no_compression;
544
26.4k
    fmt->compression_level = -1;
545
26.4k
    fmt->specific = NULL;
546
547
26.4k
    if (len >= 2 && s[0] == 0x1f && s[1] == 0x8b) {
548
        // The stream is either gzip-compressed or BGZF-compressed.
549
        // Determine which, and decompress the first few records or lines.
550
2.82k
        fmt->compression = gzip;
551
2.82k
        if (len >= 18 && (s[3] & 4)) {
552
1.08k
            if (memcmp(&s[12], "BC\2\0", 4) == 0)
553
9
                fmt->compression = bgzf;
554
1.07k
            else if (memcmp(&s[12], "RAZF", 4) == 0)
555
14
                fmt->compression = razf_compression;
556
1.08k
        }
557
2.82k
        if (len >= 9 && s[2] == 8)
558
2.82k
            fmt->compression_level = (s[8] == 2)? 9 : (s[8] == 4)? 1 : -1;
559
560
2.82k
        len = decompress_peek_gz(hfile, s, sizeof s);
561
2.82k
    }
562
23.6k
    else if (len >= 10 && memcmp(s, "BZh", 3) == 0 &&
563
23.6k
             (memcmp(&s[4], "\x31\x41\x59\x26\x53\x59", 6) == 0 ||
564
0
              memcmp(&s[4], "\x17\x72\x45\x38\x50\x90", 6) == 0)) {
565
0
        fmt->compression = bzip2_compression;
566
0
        fmt->compression_level = s[3] - '0';
567
        // Decompressing via libbz2 produces no output until it has a whole
568
        // block (of size 100Kb x level), which is too large for peeking.
569
        // So unfortunately we can recognise bzip2 but not the contents,
570
        // except that \x1772... magic indicates the stream is empty.
571
0
        if (s[4] == '\x31') return 0;
572
0
        else len = 0;
573
0
    }
574
23.6k
    else if (len >= 6 && memcmp(s, "\xfd""7zXZ\0", 6) == 0) {
575
0
        fmt->compression = xz_compression;
576
0
#ifdef HAVE_LIBLZMA
577
0
        len = decompress_peek_xz(hfile, s, sizeof s);
578
#else
579
        // Without liblzma, we can't recognise the decompressed contents.
580
        return 0;
581
#endif
582
0
    }
583
23.6k
    else if (len >= 4 && memcmp(s, "\x28\xb5\x2f\xfd", 4) == 0) {
584
0
        fmt->compression = zstd_compression;
585
0
        return 0;
586
0
    }
587
23.6k
    else {
588
23.6k
        len = hpeek(hfile, s, sizeof s);
589
23.6k
    }
590
26.4k
    if (len < 0) return -1;
591
592
26.4k
    if (len == 0) {
593
9
        fmt->format = empty_format;
594
9
        return 0;
595
9
    }
596
597
    // We avoid using filename extensions wherever possible (as filenames are
598
    // not always available), but in a few cases they must be considered:
599
    //  - FASTA/Q indexes are simply tab-separated text; files that match these
600
    //    patterns but not the fai/fqi extension are usually generic BED files
601
    //  - GZI indexes have no magic numbers so can only be detected by filename
602
26.4k
    if (fname && strcmp(fname, "-") != 0) {
603
26.4k
        char *s;
604
26.4k
        if (find_file_extension(fname, extension) < 0) extension[0] = '\0';
605
26.4k
        for (s = extension; *s; s++) *s = tolower_c(*s);
606
26.4k
    }
607
0
    else extension[0] = '\0';
608
609
26.4k
    if (len >= 6 && memcmp(s,"CRAM",4) == 0 && s[4]>=1 && s[4]<=7 && s[5]<=7) {
610
5.83k
        fmt->category = sequence_data;
611
5.83k
        fmt->format = cram;
612
5.83k
        fmt->version.major = s[4], fmt->version.minor = s[5];
613
5.83k
        fmt->compression = custom;
614
5.83k
        return 0;
615
5.83k
    }
616
20.6k
    else if (len >= 4 && s[3] <= '\4') {
617
1.67k
        if (memcmp(s, "BAM\1", 4) == 0) {
618
1.00k
            fmt->category = sequence_data;
619
1.00k
            fmt->format = bam;
620
            // TODO Decompress enough to pick version from @HD-VN header
621
1.00k
            fmt->version.major = 1, fmt->version.minor = -1;
622
1.00k
            return 0;
623
1.00k
        }
624
672
        else if (memcmp(s, "BAI\1", 4) == 0) {
625
0
            fmt->category = index_file;
626
0
            fmt->format = bai;
627
0
            fmt->version.major = -1, fmt->version.minor = -1;
628
0
            return 0;
629
0
        }
630
672
        else if (memcmp(s, "BCF\4", 4) == 0) {
631
0
            fmt->category = variant_data;
632
0
            fmt->format = bcf;
633
0
            fmt->version.major = 1, fmt->version.minor = -1;
634
0
            return 0;
635
0
        }
636
672
        else if (memcmp(s, "BCF\2", 4) == 0) {
637
672
            fmt->category = variant_data;
638
672
            fmt->format = bcf;
639
672
            fmt->version.major = s[3];
640
672
            fmt->version.minor = (len >= 5 && s[4] <= 2)? s[4] : 0;
641
672
            return 0;
642
672
        }
643
0
        else if (memcmp(s, "CSI\1", 4) == 0) {
644
0
            fmt->category = index_file;
645
0
            fmt->format = csi;
646
0
            fmt->version.major = 1, fmt->version.minor = -1;
647
0
            return 0;
648
0
        }
649
0
        else if (memcmp(s, "TBI\1", 4) == 0) {
650
0
            fmt->category = index_file;
651
0
            fmt->format = tbi;
652
0
            return 0;
653
0
        }
654
        // GZI indexes have no magic numbers, so must be recognised solely by
655
        // filename extension.
656
0
        else if (strcmp(extension, "gzi") == 0) {
657
0
            fmt->category = index_file;
658
0
            fmt->format = gzi;
659
0
            return 0;
660
0
        }
661
1.67k
    }
662
18.9k
    else if (len >= 16 && memcmp(s, "##fileformat=VCF", 16) == 0) {
663
7.14k
        fmt->category = variant_data;
664
7.14k
        fmt->format = vcf;
665
7.14k
        if (len >= 21 && s[16] == 'v')
666
3
            parse_version(fmt, &s[17], &s[len]);
667
7.14k
        return 0;
668
7.14k
    }
669
11.8k
    else if (len >= 4 && s[0] == '@' &&
670
11.8k
             (memcmp(s, "@HD\t", 4) == 0 || memcmp(s, "@SQ\t", 4) == 0 ||
671
10.0k
              memcmp(s, "@RG\t", 4) == 0 || memcmp(s, "@PG\t", 4) == 0 ||
672
10.0k
              memcmp(s, "@CO\t", 4) == 0)) {
673
9.93k
        fmt->category = sequence_data;
674
9.93k
        fmt->format = sam;
675
        // @HD-VN is not guaranteed to be the first tag, but then @HD is
676
        // not guaranteed to be present at all...
677
9.93k
        if (len >= 9 && memcmp(s, "@HD\tVN:", 7) == 0)
678
244
            parse_version(fmt, &s[7], &s[len]);
679
9.69k
        else
680
9.69k
            fmt->version.major = 1, fmt->version.minor = -1;
681
9.93k
        return 0;
682
9.93k
    }
683
1.87k
    else if (len >= 8 && memcmp(s, "d4\xdd\xdd", 4) == 0) {
684
0
        fmt->category = region_list;
685
0
        fmt->format = d4_format;
686
        // How to decode the D4 Format Version bytes is not yet specified
687
        // so we don't try to set fmt->version.{major,minor}.
688
0
        return 0;
689
0
    }
690
1.87k
    else if (cmp_nonblank("{\"htsget\":", s, &s[len]) == 0) {
691
0
        fmt->category = unknown_category;
692
0
        fmt->format = htsget;
693
0
        return 0;
694
0
    }
695
1.87k
    else if (len > 8 && memcmp(s, "crypt4gh", 8) == 0) {
696
0
        fmt->category = unknown_category;
697
0
        fmt->format = hts_crypt4gh_format;
698
0
        return 0;
699
0
    }
700
1.87k
    else if (len >= 1 && s[0] == '>' && is_fastaq(s, &s[len])) {
701
1.74k
        fmt->category = sequence_data;
702
1.74k
        fmt->format = fasta_format;
703
1.74k
        return 0;
704
1.74k
    }
705
133
    else if (len >= 1 && s[0] == '@' && is_fastaq(s, &s[len])) {
706
120
        fmt->category = sequence_data;
707
120
        fmt->format = fastq_format;
708
120
        return 0;
709
120
    }
710
13
    else if (parse_tabbed_text(columns, sizeof columns, s,
711
13
                               &s[len], &complete) > 0) {
712
        // A complete SAM line is at least 11 columns.  On unmapped long reads may
713
        // be missing two.  (On mapped long reads we must have an @ header so long
714
        // CIGAR is irrelevant.)
715
2
        if (colmatch(columns, "ZiZiiCZiiZZOOOOOOOOOOOOOOOOOOOO+")
716
2
            >= 9 + 2*complete) {
717
0
            fmt->category = sequence_data;
718
0
            fmt->format = sam;
719
0
            fmt->version.major = 1, fmt->version.minor = -1;
720
0
            return 0;
721
0
        }
722
2
        else if (fmt->compression == gzip && colmatch(columns, "iiiiii") == 6) {
723
0
            fmt->category = index_file;
724
0
            fmt->format = crai;
725
0
            return 0;
726
0
        }
727
2
        else if (strstr(extension, "fqi") && colmatch(columns, "Ziiiii") == 6) {
728
0
            fmt->category = index_file;
729
0
            fmt->format = fqi_format;
730
0
            return 0;
731
0
        }
732
2
        else if (strstr(extension, "fai") && colmatch(columns, "Ziiii") == 5) {
733
0
            fmt->category = index_file;
734
0
            fmt->format = fai_format;
735
0
            return 0;
736
0
        }
737
2
        else if (colmatch(columns, "Zii+") >= 3) {
738
0
            fmt->category = region_list;
739
0
            fmt->format = bed;
740
0
            return 0;
741
0
        }
742
2
    }
743
744
    // Arbitrary text files can be read using hts_getline().
745
13
    if (is_text_only(s, &s[len])) fmt->format = text_format;
746
747
    // Nothing recognised: leave unset fmt-> fields as unknown.
748
13
    return 0;
749
26.4k
}
750
751
char *hts_format_description(const htsFormat *format)
752
0
{
753
0
    kstring_t str = { 0, 0, NULL };
754
755
0
    switch (format->format) {
756
0
    case sam:   kputs("SAM", &str); break;
757
0
    case bam:   kputs("BAM", &str); break;
758
0
    case cram:  kputs("CRAM", &str); break;
759
0
    case fasta_format:  kputs("FASTA", &str); break;
760
0
    case fastq_format:  kputs("FASTQ", &str); break;
761
0
    case vcf:   kputs("VCF", &str); break;
762
0
    case bcf:
763
0
        if (format->version.major == 1) kputs("Legacy BCF", &str);
764
0
        else kputs("BCF", &str);
765
0
        break;
766
0
    case bai:   kputs("BAI", &str); break;
767
0
    case crai:  kputs("CRAI", &str); break;
768
0
    case csi:   kputs("CSI", &str); break;
769
0
    case fai_format:    kputs("FASTA-IDX", &str); break;
770
0
    case fqi_format:    kputs("FASTQ-IDX", &str); break;
771
0
    case gzi:   kputs("GZI", &str); break;
772
0
    case tbi:   kputs("Tabix", &str); break;
773
0
    case bed:   kputs("BED", &str); break;
774
0
    case d4_format:     kputs("D4", &str); break;
775
0
    case htsget: kputs("htsget", &str); break;
776
0
    case hts_crypt4gh_format: kputs("crypt4gh", &str); break;
777
0
    case empty_format:  kputs("empty", &str); break;
778
0
    default:    kputs("unknown", &str); break;
779
0
    }
780
781
0
    if (format->version.major >= 0) {
782
0
        kputs(" version ", &str);
783
0
        kputw(format->version.major, &str);
784
0
        if (format->version.minor >= 0) {
785
0
            kputc('.', &str);
786
0
            kputw(format->version.minor, &str);
787
0
        }
788
0
    }
789
790
0
    switch (format->compression) {
791
0
    case bzip2_compression:  kputs(" bzip2-compressed", &str); break;
792
0
    case razf_compression:   kputs(" legacy-RAZF-compressed", &str); break;
793
0
    case xz_compression:     kputs(" XZ-compressed", &str); break;
794
0
    case zstd_compression:   kputs(" Zstandard-compressed", &str); break;
795
0
    case custom: kputs(" compressed", &str); break;
796
0
    case gzip:   kputs(" gzip-compressed", &str); break;
797
798
0
    case bgzf:
799
0
        switch (format->format) {
800
0
        case bam:
801
0
        case bcf:
802
0
        case csi:
803
0
        case tbi:
804
            // These are by definition BGZF, so just use the generic term
805
0
            kputs(" compressed", &str);
806
0
            break;
807
0
        default:
808
0
            kputs(" BGZF-compressed", &str);
809
0
            break;
810
0
        }
811
0
        break;
812
813
0
    case no_compression:
814
0
        switch (format->format) {
815
0
        case bam:
816
0
        case bcf:
817
0
        case cram:
818
0
        case csi:
819
0
        case tbi:
820
            // These are normally compressed, so emphasise that this one isn't
821
0
            kputs(" uncompressed", &str);
822
0
            break;
823
0
        default:
824
0
            break;
825
0
        }
826
0
        break;
827
828
0
    default: break;
829
0
    }
830
831
0
    switch (format->category) {
832
0
    case sequence_data: kputs(" sequence", &str); break;
833
0
    case variant_data:  kputs(" variant calling", &str); break;
834
0
    case index_file:    kputs(" index", &str); break;
835
0
    case region_list:   kputs(" genomic region", &str); break;
836
0
    default: break;
837
0
    }
838
839
0
    if (format->compression == no_compression)
840
0
        switch (format->format) {
841
0
        case text_format:
842
0
        case sam:
843
0
        case crai:
844
0
        case vcf:
845
0
        case bed:
846
0
        case fai_format:
847
0
        case fqi_format:
848
0
        case fasta_format:
849
0
        case fastq_format:
850
0
        case htsget:
851
0
            kputs(" text", &str);
852
0
            break;
853
854
0
        case empty_format:
855
0
            break;
856
857
0
        default:
858
0
            kputs(" data", &str);
859
0
            break;
860
0
        }
861
0
    else
862
0
        kputs(" data", &str);
863
864
0
    return ks_release(&str);
865
0
}
866
867
htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt)
868
18.8k
{
869
18.8k
    char smode[101], *cp, *cp2, *mode_c, *uncomp = NULL;
870
18.8k
    htsFile *fp = NULL;
871
18.8k
    hFILE *hfile = NULL;
872
18.8k
    char fmt_code = '\0';
873
    // see enum htsExactFormat in htslib/hts.h
874
18.8k
    const char format_to_mode[] = "\0g\0\0b\0c\0\0b\0g\0\0\0\0\0Ff\0\0";
875
876
18.8k
    strncpy(smode, mode, 99);
877
18.8k
    smode[99]=0;
878
18.8k
    if ((cp = strchr(smode, ',')))
879
0
        *cp = '\0';
880
881
    // Migrate format code (b or c) to the end of the smode buffer.
882
49.3k
    for (cp2 = cp = smode; *cp; cp++) {
883
30.5k
        if (*cp == 'b')
884
7.15k
            fmt_code = 'b';
885
23.3k
        else if (*cp == 'c')
886
4.54k
            fmt_code = 'c';
887
18.8k
        else {
888
18.8k
            *cp2++ = *cp;
889
            // Cache the uncompress flag 'u' pos if present
890
18.8k
            if (!uncomp && (*cp == 'u')) {
891
0
                uncomp = cp2 - 1;
892
0
            }
893
18.8k
        }
894
30.5k
    }
895
18.8k
    mode_c = cp2;
896
18.8k
    *cp2++ = fmt_code;
897
18.8k
    *cp2++ = 0;
898
899
    // Set or reset the format code if opts->format is used
900
18.8k
    if (fmt && fmt->format > unknown_format
901
18.8k
        && fmt->format < sizeof(format_to_mode)) {
902
0
        *mode_c = format_to_mode[fmt->format];
903
0
    }
904
905
    // Uncompressed bam/bcf is not supported, change 'u' to '0' on write
906
18.8k
    if (uncomp && *mode_c == 'b' && (strchr(smode, 'w') || strchr(smode, 'a'))) {
907
0
        *uncomp = '0';
908
0
    }
909
910
    // If we really asked for a compressed text format then mode_c above will
911
    // point to nul.  We set to 'z' to enable bgzf.
912
18.8k
    if (strchr(mode, 'w') && fmt && fmt->compression == bgzf) {
913
0
        if (fmt->format == sam || fmt->format == vcf || fmt->format == text_format)
914
0
            *mode_c = 'z';
915
0
    }
916
917
18.8k
    char *rmme = NULL, *fnidx = strstr(fn, HTS_IDX_DELIM);
918
18.8k
    if ( fnidx ) {
919
0
        rmme = strdup(fn);
920
0
        if ( !rmme ) goto error;
921
0
        rmme[fnidx-fn] = 0;
922
0
        fn = rmme;
923
0
    }
924
925
18.8k
    hfile = hopen(fn, smode);
926
18.8k
    if (hfile == NULL) goto error;
927
928
18.8k
    fp = hts_hopen(hfile, fn, smode);
929
18.8k
    if (fp == NULL) goto error;
930
931
    // Compensate for the loss of exactness in htsExactFormat.
932
    // hts_hopen returns generics such as binary or text, but we
933
    // have been given something explicit here so use that instead.
934
18.8k
    if (fp->is_write && fmt &&
935
18.8k
        (fmt->format == bam || fmt->format == sam ||
936
0
         fmt->format == vcf || fmt->format == bcf ||
937
0
         fmt->format == bed || fmt->format == fasta_format ||
938
0
         fmt->format == fastq_format))
939
0
        fp->format.format = fmt->format;
940
941
18.8k
    if (fmt && fmt->specific) {
942
0
        if (hts_opt_apply(fp, fmt->specific) != 0) {
943
0
            if (((hts_opt*)fmt->specific)->opt == CRAM_OPT_REFERENCE &&
944
0
                (errno == ENOENT || errno == EIO || errno == EBADF ||
945
0
                  errno == EACCES || errno == EISDIR)) {
946
                /* error during reference file operation
947
                 for these specific errors, set the error as EINVAL */
948
0
                errno = EINVAL;
949
0
            }
950
0
            goto error;
951
0
        }
952
0
    }
953
18.8k
    if ( rmme ) free(rmme);
954
18.8k
    return fp;
955
956
0
error:
957
0
    hts_log_error("Failed to open file \"%s\"%s%s", fn,
958
0
                  errno ? " : " : "", errno ? strerror(errno) : "");
959
0
    if ( rmme ) free(rmme);
960
961
0
    if (hfile)
962
0
        hclose_abruptly(hfile);
963
964
0
    return NULL;
965
18.8k
}
966
967
18.8k
htsFile *hts_open(const char *fn, const char *mode) {
968
18.8k
    return hts_open_format(fn, mode, NULL);
969
18.8k
}
970
971
/*
972
 * Splits str into a prefix, delimiter ('\0' or delim), and suffix, writing
973
 * the prefix in lowercase into buf and returning a pointer to the suffix.
974
 * On return, buf is always NUL-terminated; thus assumes that the "keyword"
975
 * prefix should be one of several known values of maximum length buflen-2.
976
 * (If delim is not found, returns a pointer to the '\0'.)
977
 */
978
static const char *
979
scan_keyword(const char *str, char delim, char *buf, size_t buflen)
980
0
{
981
0
    size_t i = 0;
982
0
    while (*str && *str != delim) {
983
0
        if (i < buflen-1) buf[i++] = tolower_c(*str);
984
0
        str++;
985
0
    }
986
987
0
    buf[i] = '\0';
988
0
    return *str? str+1 : str;
989
0
}
990
991
/*
992
 * Parses arg and appends it to the option list.
993
 *
994
 * Returns 0 on success;
995
 *        -1 on failure.
996
 */
997
0
int hts_opt_add(hts_opt **opts, const char *c_arg) {
998
0
    hts_opt *o, *t;
999
0
    char *val;
1000
1001
    /*
1002
     * IMPORTANT!!!
1003
     * If you add another string option here, don't forget to also add
1004
     * it to the case statement in hts_opt_apply.
1005
     */
1006
1007
0
    if (!c_arg)
1008
0
        return -1;
1009
1010
0
    if (!(o =  malloc(sizeof(*o))))
1011
0
        return -1;
1012
1013
0
    if (!(o->arg = strdup(c_arg))) {
1014
0
        free(o);
1015
0
        return -1;
1016
0
    }
1017
1018
0
    if (!(val = strchr(o->arg, '=')))
1019
0
        val = "1"; // assume boolean
1020
0
    else
1021
0
        *val++ = '\0';
1022
1023
0
    if (strcmp(o->arg, "decode_md") == 0 ||
1024
0
        strcmp(o->arg, "DECODE_MD") == 0)
1025
0
        o->opt = CRAM_OPT_DECODE_MD, o->val.i = atoi(val);
1026
1027
0
    else if (strcmp(o->arg, "verbosity") == 0 ||
1028
0
             strcmp(o->arg, "VERBOSITY") == 0)
1029
0
        o->opt = CRAM_OPT_VERBOSITY, o->val.i = atoi(val);
1030
1031
0
    else if (strcmp(o->arg, "seqs_per_slice") == 0 ||
1032
0
             strcmp(o->arg, "SEQS_PER_SLICE") == 0)
1033
0
        o->opt = CRAM_OPT_SEQS_PER_SLICE, o->val.i = atoi(val);
1034
1035
0
    else if (strcmp(o->arg, "bases_per_slice") == 0 ||
1036
0
             strcmp(o->arg, "BASES_PER_SLICE") == 0)
1037
0
        o->opt = CRAM_OPT_BASES_PER_SLICE, o->val.i = atoi(val);
1038
1039
0
    else if (strcmp(o->arg, "slices_per_container") == 0 ||
1040
0
             strcmp(o->arg, "SLICES_PER_CONTAINER") == 0)
1041
0
        o->opt = CRAM_OPT_SLICES_PER_CONTAINER, o->val.i = atoi(val);
1042
1043
0
    else if (strcmp(o->arg, "embed_ref") == 0 ||
1044
0
             strcmp(o->arg, "EMBED_REF") == 0)
1045
0
        o->opt = CRAM_OPT_EMBED_REF, o->val.i = atoi(val);
1046
1047
0
    else if (strcmp(o->arg, "no_ref") == 0 ||
1048
0
             strcmp(o->arg, "NO_REF") == 0)
1049
0
        o->opt = CRAM_OPT_NO_REF, o->val.i = atoi(val);
1050
1051
0
    else if (strcmp(o->arg, "pos_delta") == 0 ||
1052
0
             strcmp(o->arg, "POS_DELTA") == 0)
1053
0
        o->opt = CRAM_OPT_POS_DELTA, o->val.i = atoi(val);
1054
1055
0
    else if (strcmp(o->arg, "ignore_md5") == 0 ||
1056
0
             strcmp(o->arg, "IGNORE_MD5") == 0)
1057
0
        o->opt = CRAM_OPT_IGNORE_MD5, o->val.i = atoi(val);
1058
1059
0
    else if (strcmp(o->arg, "use_bzip2") == 0 ||
1060
0
             strcmp(o->arg, "USE_BZIP2") == 0)
1061
0
        o->opt = CRAM_OPT_USE_BZIP2, o->val.i = atoi(val);
1062
1063
0
    else if (strcmp(o->arg, "use_rans") == 0 ||
1064
0
             strcmp(o->arg, "USE_RANS") == 0)
1065
0
        o->opt = CRAM_OPT_USE_RANS, o->val.i = atoi(val);
1066
1067
0
    else if (strcmp(o->arg, "use_lzma") == 0 ||
1068
0
             strcmp(o->arg, "USE_LZMA") == 0)
1069
0
        o->opt = CRAM_OPT_USE_LZMA, o->val.i = atoi(val);
1070
1071
0
    else if (strcmp(o->arg, "use_tok") == 0 ||
1072
0
             strcmp(o->arg, "USE_TOK") == 0)
1073
0
        o->opt = CRAM_OPT_USE_TOK, o->val.i = atoi(val);
1074
1075
0
    else if (strcmp(o->arg, "use_fqz") == 0 ||
1076
0
             strcmp(o->arg, "USE_FQZ") == 0)
1077
0
        o->opt = CRAM_OPT_USE_FQZ, o->val.i = atoi(val);
1078
1079
0
    else if (strcmp(o->arg, "use_arith") == 0 ||
1080
0
             strcmp(o->arg, "USE_ARITH") == 0)
1081
0
        o->opt = CRAM_OPT_USE_ARITH, o->val.i = atoi(val);
1082
1083
0
    else if (strcmp(o->arg, "fast") == 0 ||
1084
0
             strcmp(o->arg, "FAST") == 0)
1085
0
        o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_FAST;
1086
1087
0
    else if (strcmp(o->arg, "normal") == 0 ||
1088
0
             strcmp(o->arg, "NORMAL") == 0)
1089
0
        o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_NORMAL;
1090
1091
0
    else if (strcmp(o->arg, "small") == 0 ||
1092
0
             strcmp(o->arg, "SMALL") == 0)
1093
0
        o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_SMALL;
1094
1095
0
    else if (strcmp(o->arg, "archive") == 0 ||
1096
0
             strcmp(o->arg, "ARCHIVE") == 0)
1097
0
        o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_ARCHIVE;
1098
1099
0
    else if (strcmp(o->arg, "reference") == 0 ||
1100
0
             strcmp(o->arg, "REFERENCE") == 0)
1101
0
        o->opt = CRAM_OPT_REFERENCE, o->val.s = val;
1102
1103
0
    else if (strcmp(o->arg, "version") == 0 ||
1104
0
             strcmp(o->arg, "VERSION") == 0)
1105
0
        o->opt = CRAM_OPT_VERSION, o->val.s =val;
1106
1107
0
    else if (strcmp(o->arg, "multi_seq_per_slice") == 0 ||
1108
0
             strcmp(o->arg, "MULTI_SEQ_PER_SLICE") == 0)
1109
0
        o->opt = CRAM_OPT_MULTI_SEQ_PER_SLICE, o->val.i = atoi(val);
1110
1111
0
    else if (strcmp(o->arg, "nthreads") == 0 ||
1112
0
             strcmp(o->arg, "NTHREADS") == 0)
1113
0
        o->opt = HTS_OPT_NTHREADS, o->val.i = atoi(val);
1114
1115
0
    else if (strcmp(o->arg, "cache_size") == 0 ||
1116
0
             strcmp(o->arg, "CACHE_SIZE") == 0) {
1117
0
        char *endp;
1118
0
        o->opt = HTS_OPT_CACHE_SIZE;
1119
0
        o->val.i = strtol(val, &endp, 0);
1120
        // NB: Doesn't support floats, eg 1.5g
1121
        // TODO: extend hts_parse_decimal? See also samtools sort.
1122
0
        switch (*endp) {
1123
0
        case 'g': case 'G': o->val.i *= 1024; // fall through
1124
0
        case 'm': case 'M': o->val.i *= 1024; // fall through
1125
0
        case 'k': case 'K': o->val.i *= 1024; break;
1126
0
        case '\0': break;
1127
0
        default:
1128
0
            hts_log_error("Unrecognised cache size suffix '%c'", *endp);
1129
0
            free(o->arg);
1130
0
            free(o);
1131
0
            return -1;
1132
0
        }
1133
0
    }
1134
1135
0
    else if (strcmp(o->arg, "required_fields") == 0 ||
1136
0
             strcmp(o->arg, "REQUIRED_FIELDS") == 0)
1137
0
        o->opt = CRAM_OPT_REQUIRED_FIELDS, o->val.i = strtol(val, NULL, 0);
1138
1139
0
    else if (strcmp(o->arg, "lossy_names") == 0 ||
1140
0
             strcmp(o->arg, "LOSSY_NAMES") == 0)
1141
0
        o->opt = CRAM_OPT_LOSSY_NAMES, o->val.i = strtol(val, NULL, 0);
1142
1143
0
    else if (strcmp(o->arg, "name_prefix") == 0 ||
1144
0
             strcmp(o->arg, "NAME_PREFIX") == 0)
1145
0
        o->opt = CRAM_OPT_PREFIX, o->val.s = val;
1146
1147
0
    else if (strcmp(o->arg, "store_md") == 0 ||
1148
0
             strcmp(o->arg, "store_md") == 0)
1149
0
        o->opt = CRAM_OPT_STORE_MD, o->val.i = atoi(val);
1150
1151
0
    else if (strcmp(o->arg, "store_nm") == 0 ||
1152
0
             strcmp(o->arg, "store_nm") == 0)
1153
0
        o->opt = CRAM_OPT_STORE_NM, o->val.i = atoi(val);
1154
1155
0
    else if (strcmp(o->arg, "block_size") == 0 ||
1156
0
             strcmp(o->arg, "BLOCK_SIZE") == 0)
1157
0
        o->opt = HTS_OPT_BLOCK_SIZE, o->val.i = strtol(val, NULL, 0);
1158
1159
0
    else if (strcmp(o->arg, "level") == 0 ||
1160
0
             strcmp(o->arg, "LEVEL") == 0)
1161
0
        o->opt = HTS_OPT_COMPRESSION_LEVEL, o->val.i = strtol(val, NULL, 0);
1162
1163
0
    else if (strcmp(o->arg, "filter") == 0 ||
1164
0
             strcmp(o->arg, "FILTER") == 0)
1165
0
        o->opt = HTS_OPT_FILTER, o->val.s = val;
1166
1167
0
    else if (strcmp(o->arg, "fastq_aux") == 0 ||
1168
0
        strcmp(o->arg, "FASTQ_AUX") == 0)
1169
0
        o->opt = FASTQ_OPT_AUX, o->val.s = val;
1170
1171
0
    else if (strcmp(o->arg, "fastq_barcode") == 0 ||
1172
0
        strcmp(o->arg, "FASTQ_BARCODE") == 0)
1173
0
        o->opt = FASTQ_OPT_BARCODE, o->val.s = val;
1174
1175
0
    else if (strcmp(o->arg, "fastq_rnum") == 0 ||
1176
0
        strcmp(o->arg, "FASTQ_RNUM") == 0)
1177
0
        o->opt = FASTQ_OPT_RNUM, o->val.i = 1;
1178
1179
0
    else if (strcmp(o->arg, "fastq_casava") == 0 ||
1180
0
        strcmp(o->arg, "FASTQ_CASAVA") == 0)
1181
0
        o->opt = FASTQ_OPT_CASAVA, o->val.i = 1;
1182
1183
0
    else if (strcmp(o->arg, "fastq_name2") == 0 ||
1184
0
        strcmp(o->arg, "FASTQ_NAME2") == 0)
1185
0
        o->opt = FASTQ_OPT_NAME2, o->val.i = 1;
1186
1187
0
    else {
1188
0
        hts_log_error("Unknown option '%s'", o->arg);
1189
0
        free(o->arg);
1190
0
        free(o);
1191
0
        return -1;
1192
0
    }
1193
1194
0
    o->next = NULL;
1195
1196
    // Append; assumes small list.
1197
0
    if (*opts) {
1198
0
        t = *opts;
1199
0
        while (t->next)
1200
0
            t = t->next;
1201
0
        t->next = o;
1202
0
    } else {
1203
0
        *opts = o;
1204
0
    }
1205
1206
0
    return 0;
1207
0
}
1208
1209
/*
1210
 * Applies an hts_opt option list to a given htsFile.
1211
 *
1212
 * Returns 0 on success
1213
 *        -1 on failure
1214
 */
1215
0
int hts_opt_apply(htsFile *fp, hts_opt *opts) {
1216
0
    hts_opt *last = NULL;
1217
1218
0
    for (; opts;  opts = (last=opts)->next) {
1219
0
        switch (opts->opt) {
1220
0
            case CRAM_OPT_REFERENCE:
1221
0
                if (!(fp->fn_aux = strdup(opts->val.s)))
1222
0
                    return -1;
1223
                // fall through
1224
0
            case CRAM_OPT_VERSION:
1225
0
            case CRAM_OPT_PREFIX:
1226
0
            case HTS_OPT_FILTER:
1227
0
            case FASTQ_OPT_AUX:
1228
0
            case FASTQ_OPT_BARCODE:
1229
0
                if (hts_set_opt(fp,  opts->opt,  opts->val.s) != 0)
1230
0
                    return -1;
1231
0
                break;
1232
0
            default:
1233
0
                if (hts_set_opt(fp,  opts->opt,  opts->val.i) != 0)
1234
0
                    return -1;
1235
0
                break;
1236
0
        }
1237
0
    }
1238
1239
0
    return 0;
1240
0
}
1241
1242
/*
1243
 * Frees an hts_opt list.
1244
 */
1245
0
void hts_opt_free(hts_opt *opts) {
1246
0
    hts_opt *last = NULL;
1247
0
    while (opts) {
1248
0
        opts = (last=opts)->next;
1249
0
        free(last->arg);
1250
0
        free(last);
1251
0
    }
1252
0
}
1253
1254
1255
/*
1256
 * Tokenise options as (key(=value)?,)*(key(=value)?)?
1257
 * NB: No provision for ',' appearing in the value!
1258
 * Add backslashing rules?
1259
 *
1260
 * This could be used as part of a general command line option parser or
1261
 * as a string concatenated onto the file open mode.
1262
 *
1263
 * Returns 0 on success
1264
 *        -1 on failure.
1265
 */
1266
0
int hts_parse_opt_list(htsFormat *fmt, const char *str) {
1267
0
    while (str && *str) {
1268
0
        const char *str_start;
1269
0
        int len;
1270
0
        char arg[8001];
1271
1272
0
        while (*str && *str == ',')
1273
0
            str++;
1274
1275
0
        for (str_start = str; *str && *str != ','; str++);
1276
0
        len = str - str_start;
1277
1278
        // Produce a nul terminated copy of the option
1279
0
        strncpy(arg, str_start, len < 8000 ? len : 8000);
1280
0
        arg[len < 8000 ? len : 8000] = '\0';
1281
1282
0
        if (hts_opt_add((hts_opt **)&fmt->specific, arg) != 0)
1283
0
            return -1;
1284
1285
0
        if (*str)
1286
0
            str++;
1287
0
    }
1288
1289
0
    return 0;
1290
0
}
1291
1292
/*
1293
 * Accepts a string file format (sam, bam, cram, vcf, bam) optionally
1294
 * followed by a comma separated list of key=value options and splits
1295
 * these up into the fields of htsFormat struct.
1296
 *
1297
 * format is assumed to be already initialised, either to blank
1298
 * "unknown" values or via previous hts_opt_add calls.
1299
 *
1300
 * Returns 0 on success
1301
 *        -1 on failure.
1302
 */
1303
0
int hts_parse_format(htsFormat *format, const char *str) {
1304
0
    char fmt[8];
1305
0
    const char *cp = scan_keyword(str, ',', fmt, sizeof fmt);
1306
1307
0
    format->version.minor = 0; // unknown
1308
0
    format->version.major = 0; // unknown
1309
1310
0
    if (strcmp(fmt, "sam") == 0) {
1311
0
        format->category          = sequence_data;
1312
0
        format->format            = sam;
1313
0
        format->compression       = no_compression;
1314
0
        format->compression_level = 0;
1315
0
    } else if (strcmp(fmt, "sam.gz") == 0) {
1316
0
        format->category          = sequence_data;
1317
0
        format->format            = sam;
1318
0
        format->compression       = bgzf;
1319
0
        format->compression_level = -1;
1320
0
    } else if (strcmp(fmt, "bam") == 0) {
1321
0
        format->category          = sequence_data;
1322
0
        format->format            = bam;
1323
0
        format->compression       = bgzf;
1324
0
        format->compression_level = -1;
1325
0
    } else if (strcmp(fmt, "cram") == 0) {
1326
0
        format->category          = sequence_data;
1327
0
        format->format            = cram;
1328
0
        format->compression       = custom;
1329
0
        format->compression_level = -1;
1330
0
    } else if (strcmp(fmt, "vcf") == 0) {
1331
0
        format->category          = variant_data;
1332
0
        format->format            = vcf;
1333
0
        format->compression       = no_compression;
1334
0
        format->compression_level = 0;
1335
0
    } else if (strcmp(fmt, "bcf") == 0) {
1336
0
        format->category          = variant_data;
1337
0
        format->format            = bcf;
1338
0
        format->compression       = bgzf;
1339
0
        format->compression_level = -1;
1340
0
    } else if (strcmp(fmt, "fastq") == 0 || strcmp(fmt, "fq") == 0) {
1341
0
        format->category          = sequence_data;
1342
0
        format->format            = fastq_format;
1343
0
        format->compression       = no_compression;
1344
0
        format->compression_level = 0;
1345
0
    } else if (strcmp(fmt, "fastq.gz") == 0 || strcmp(fmt, "fq.gz") == 0) {
1346
0
        format->category          = sequence_data;
1347
0
        format->format            = fastq_format;
1348
0
        format->compression       = bgzf;
1349
0
        format->compression_level = 0;
1350
0
    } else if (strcmp(fmt, "fasta") == 0 || strcmp(fmt, "fa") == 0) {
1351
0
        format->category          = sequence_data;
1352
0
        format->format            = fasta_format;
1353
0
        format->compression       = no_compression;
1354
0
        format->compression_level = 0;
1355
0
    } else if (strcmp(fmt, "fasta.gz") == 0 || strcmp(fmt, "fa.gz") == 0) {
1356
0
        format->category          = sequence_data;
1357
0
        format->format            = fasta_format;
1358
0
        format->compression       = bgzf;
1359
0
        format->compression_level = 0;
1360
0
    } else {
1361
0
        return -1;
1362
0
    }
1363
1364
0
    return hts_parse_opt_list(format, cp);
1365
0
}
1366
1367
1368
/*
1369
 * Tokenise options as (key(=value)?,)*(key(=value)?)?
1370
 * NB: No provision for ',' appearing in the value!
1371
 * Add backslashing rules?
1372
 *
1373
 * This could be used as part of a general command line option parser or
1374
 * as a string concatenated onto the file open mode.
1375
 *
1376
 * Returns 0 on success
1377
 *        -1 on failure.
1378
 */
1379
0
static int hts_process_opts(htsFile *fp, const char *opts) {
1380
0
    htsFormat fmt;
1381
1382
0
    fmt.specific = NULL;
1383
0
    if (hts_parse_opt_list(&fmt, opts) != 0)
1384
0
        return -1;
1385
1386
0
    if (hts_opt_apply(fp, fmt.specific) != 0) {
1387
0
        hts_opt_free(fmt.specific);
1388
0
        return -1;
1389
0
    }
1390
1391
0
    hts_opt_free(fmt.specific);
1392
1393
0
    return 0;
1394
0
}
1395
1396
static int hts_crypt4gh_redirect(const char *fn, const char *mode,
1397
0
                                 hFILE **hfile_ptr, htsFile *fp) {
1398
0
    hFILE *hfile1 = *hfile_ptr;
1399
0
    hFILE *hfile2 = NULL;
1400
0
    char fn_buf[512], *fn2 = fn_buf;
1401
0
    char mode2[102]; // Size set by sizeof(simple_mode) in hts_hopen()
1402
0
    const char *prefix = "crypt4gh:";
1403
0
    size_t fn2_len = strlen(prefix) + strlen(fn) + 1;
1404
0
    int ret = -1;
1405
1406
0
    if (fn2_len > sizeof(fn_buf)) {
1407
0
        if (fn2_len >= INT_MAX) // Silence gcc format-truncation warning
1408
0
            return -1;
1409
0
        fn2 = malloc(fn2_len);
1410
0
        if (!fn2) return -1;
1411
0
    }
1412
1413
    // Reopen fn using the crypt4gh plug-in (if available)
1414
0
    snprintf(fn2, fn2_len, "%s%s", prefix, fn);
1415
0
    snprintf(mode2, sizeof(mode2), "%s%s", mode, strchr(mode, ':') ? "" : ":");
1416
0
    hfile2 = hopen(fn2, mode2, "parent", hfile1, NULL);
1417
0
    if (hfile2) {
1418
        // Replace original hfile with the new one.  The original is now
1419
        // enclosed within hfile2
1420
0
        *hfile_ptr = hfile2;
1421
0
        ret = 0;
1422
0
    }
1423
1424
0
    if (fn2 != fn_buf)
1425
0
        free(fn2);
1426
0
    return ret;
1427
0
}
1428
1429
htsFile *hts_hopen(hFILE *hfile, const char *fn, const char *mode)
1430
45.3k
{
1431
45.3k
    hFILE *hfile_orig = hfile;
1432
45.3k
    hFILE *hfile_cleanup = hfile;
1433
45.3k
    htsFile *fp = (htsFile*)calloc(1, sizeof(htsFile));
1434
45.3k
    char simple_mode[101], *cp, *opts;
1435
45.3k
    simple_mode[100] = '\0';
1436
1437
45.3k
    if (fp == NULL) goto error;
1438
1439
45.3k
    fp->fn = strdup(fn);
1440
45.3k
    fp->is_be = ed_is_big();
1441
1442
    // Split mode into simple_mode,opts strings
1443
45.3k
    if ((cp = strchr(mode, ','))) {
1444
0
        strncpy(simple_mode, mode, cp-mode <= 100 ? cp-mode : 100);
1445
0
        simple_mode[cp-mode] = '\0';
1446
0
        opts = cp+1;
1447
45.3k
    } else {
1448
45.3k
        strncpy(simple_mode, mode, 100);
1449
45.3k
        opts = NULL;
1450
45.3k
    }
1451
1452
45.3k
    if (strchr(simple_mode, 'r')) {
1453
26.4k
        const int max_loops = 5; // Should be plenty
1454
26.4k
        int loops = 0;
1455
26.4k
        if (hts_detect_format2(hfile, fn, &fp->format) < 0) goto error;
1456
1457
        // Deal with formats that re-direct an underlying file via a plug-in.
1458
        // Loops as we may have crypt4gh served via htsget, or
1459
        // crypt4gh-in-crypt4gh.
1460
1461
26.4k
        while (fp->format.format == htsget ||
1462
26.4k
               fp->format.format == hts_crypt4gh_format) {
1463
            // Ensure we don't get stuck in an endless redirect loop
1464
0
            if (++loops > max_loops) {
1465
0
                errno = ELOOP;
1466
0
                goto error;
1467
0
            }
1468
1469
0
            if (fp->format.format == htsget) {
1470
0
                hFILE *hfile2 = hopen_htsget_redirect(hfile, simple_mode);
1471
0
                if (hfile2 == NULL) goto error;
1472
1473
0
                if (hfile != hfile_cleanup) {
1474
                    // Close the result of an earlier redirection
1475
0
                    hclose_abruptly(hfile);
1476
0
                }
1477
1478
0
                hfile = hfile2;
1479
0
            }
1480
0
            else if (fp->format.format == hts_crypt4gh_format) {
1481
0
                int should_preserve = (hfile == hfile_orig);
1482
0
                int update_cleanup = (hfile == hfile_cleanup);
1483
0
                if (hts_crypt4gh_redirect(fn, simple_mode, &hfile, fp) < 0)
1484
0
                    goto error;
1485
0
                if (should_preserve) {
1486
                    // The original hFILE is now contained in a crypt4gh
1487
                    // wrapper.  Should we need to close the wrapper due
1488
                    // to a later error, we need to prevent the wrapped
1489
                    // handle from being closed as the caller will see
1490
                    // this function return NULL and try to clean up itself.
1491
0
                    hfile_orig->preserve = 1;
1492
0
                }
1493
0
                if (update_cleanup) {
1494
                    // Update handle to close at the end if redirected by htsget
1495
0
                    hfile_cleanup = hfile;
1496
0
                }
1497
0
            }
1498
1499
            // Re-detect format against the result of the redirection
1500
0
            if (hts_detect_format2(hfile, fn, &fp->format) < 0) goto error;
1501
0
        }
1502
26.4k
    }
1503
18.8k
    else if (strchr(simple_mode, 'w') || strchr(simple_mode, 'a')) {
1504
18.8k
        htsFormat *fmt = &fp->format;
1505
18.8k
        fp->is_write = 1;
1506
1507
18.8k
        if (strchr(simple_mode, 'b')) fmt->format = binary_format;
1508
11.6k
        else if (strchr(simple_mode, 'c')) fmt->format = cram;
1509
7.15k
        else if (strchr(simple_mode, 'f')) fmt->format = fastq_format;
1510
7.15k
        else if (strchr(simple_mode, 'F')) fmt->format = fasta_format;
1511
7.15k
        else fmt->format = text_format;
1512
1513
18.8k
        if (strchr(simple_mode, 'z')) fmt->compression = bgzf;
1514
18.8k
        else if (strchr(simple_mode, 'g')) fmt->compression = gzip;
1515
18.8k
        else if (strchr(simple_mode, 'u')) fmt->compression = no_compression;
1516
18.8k
        else {
1517
            // No compression mode specified, set to the default for the format
1518
18.8k
            switch (fmt->format) {
1519
7.15k
            case binary_format: fmt->compression = bgzf; break;
1520
4.54k
            case cram: fmt->compression = custom; break;
1521
0
            case fastq_format: fmt->compression = no_compression; break;
1522
0
            case fasta_format: fmt->compression = no_compression; break;
1523
7.15k
            case text_format: fmt->compression = no_compression; break;
1524
0
            default: abort();
1525
18.8k
            }
1526
18.8k
        }
1527
1528
        // Fill in category (if determinable; e.g. 'b' could be BAM or BCF)
1529
18.8k
        fmt->category = format_category(fmt->format);
1530
1531
18.8k
        fmt->version.major = fmt->version.minor = -1;
1532
18.8k
        fmt->compression_level = -1;
1533
18.8k
        fmt->specific = NULL;
1534
18.8k
    }
1535
0
    else { errno = EINVAL; goto error; }
1536
1537
45.3k
    switch (fp->format.format) {
1538
7.15k
    case binary_format:
1539
8.15k
    case bam:
1540
8.82k
    case bcf:
1541
8.82k
        fp->fp.bgzf = bgzf_hopen(hfile, simple_mode);
1542
8.82k
        if (fp->fp.bgzf == NULL) goto error;
1543
8.82k
        fp->is_bin = fp->is_bgzf = 1;
1544
8.82k
        break;
1545
1546
10.3k
    case cram:
1547
10.3k
        fp->fp.cram = cram_dopen(hfile, fn, simple_mode);
1548
10.3k
        if (fp->fp.cram == NULL) goto error;
1549
9.93k
        if (!fp->is_write)
1550
5.39k
            cram_set_option(fp->fp.cram, CRAM_OPT_DECODE_MD, -1); // auto
1551
9.93k
        fp->is_cram = 1;
1552
9.93k
        break;
1553
1554
9
    case empty_format:
1555
7.16k
    case text_format:
1556
7.16k
    case bed:
1557
8.90k
    case fasta_format:
1558
9.02k
    case fastq_format:
1559
18.9k
    case sam:
1560
26.1k
    case vcf:
1561
26.1k
        if (fp->format.compression != no_compression) {
1562
2.74k
            fp->fp.bgzf = bgzf_hopen(hfile, simple_mode);
1563
2.74k
            if (fp->fp.bgzf == NULL) goto error;
1564
2.73k
            fp->is_bgzf = 1;
1565
2.73k
        }
1566
23.3k
        else
1567
23.3k
            fp->fp.hfile = hfile;
1568
26.0k
        break;
1569
1570
26.0k
    default:
1571
9
        errno = EFTYPE;
1572
9
        goto error;
1573
45.3k
    }
1574
1575
44.8k
    if (opts)
1576
0
        hts_process_opts(fp, opts);
1577
1578
    // Allow original file to close if it was preserved earlier by crypt4gh
1579
44.8k
    hfile_orig->preserve = 0;
1580
1581
    // If redirecting via htsget, close the original hFILE now (pedantically
1582
    // we would instead close it in hts_close(), but this a simplifying
1583
    // optimisation)
1584
44.8k
    if (hfile != hfile_cleanup) hclose_abruptly(hfile_cleanup);
1585
1586
44.8k
    return fp;
1587
1588
469
error:
1589
469
    hts_log_error("Failed to open file %s", fn);
1590
1591
    // If redirecting, close the failed redirection hFILE that we have opened
1592
469
    if (hfile != hfile_orig) hclose_abruptly(hfile);
1593
469
    hfile_orig->preserve = 0; // Allow caller to close the original hfile
1594
1595
469
    if (fp) {
1596
469
        free(fp->fn);
1597
469
        free(fp->fn_aux);
1598
469
        free(fp);
1599
469
    }
1600
469
    return NULL;
1601
45.3k
}
1602
1603
int hts_close(htsFile *fp)
1604
44.8k
{
1605
44.8k
    int ret = 0, save;
1606
44.8k
    if (!fp) {
1607
0
        errno = EINVAL;
1608
0
        return -1;
1609
0
    }
1610
1611
44.8k
    switch (fp->format.format) {
1612
500
    case binary_format:
1613
5.96k
    case bam:
1614
8.82k
    case bcf:
1615
8.82k
        ret = bgzf_close(fp->fp.bgzf);
1616
8.82k
        break;
1617
1618
9.93k
    case cram:
1619
9.93k
        if (!fp->is_write) {
1620
5.39k
            switch (cram_eof(fp->fp.cram)) {
1621
21
            case 2:
1622
21
                hts_log_warning("EOF marker is absent. The input is probably truncated");
1623
21
                break;
1624
4.02k
            case 0:  /* not at EOF, but may not have wanted all seqs */
1625
5.37k
            default: /* case 1, expected EOF */
1626
5.37k
                break;
1627
5.39k
            }
1628
5.39k
        }
1629
9.93k
        ret = cram_close(fp->fp.cram);
1630
9.93k
        break;
1631
1632
0
    case empty_format:
1633
503
    case text_format:
1634
503
    case bed:
1635
2.23k
    case fasta_format:
1636
2.35k
    case fastq_format:
1637
16.7k
    case sam:
1638
26.0k
    case vcf:
1639
26.0k
        if (fp->format.format == sam)
1640
14.4k
            ret = sam_state_destroy(fp);
1641
11.6k
        else if (fp->format.format == fastq_format ||
1642
11.6k
                 fp->format.format == fasta_format)
1643
1.85k
            fastq_state_destroy(fp);
1644
1645
26.0k
        if (fp->format.compression != no_compression)
1646
2.73k
            ret |= bgzf_close(fp->fp.bgzf);
1647
23.3k
        else
1648
23.3k
            ret |= hclose(fp->fp.hfile);
1649
26.0k
        break;
1650
1651
0
    default:
1652
0
        ret = -1;
1653
0
        break;
1654
44.8k
    }
1655
1656
44.8k
    save = errno;
1657
44.8k
    sam_hdr_destroy(fp->bam_header);
1658
44.8k
    hts_idx_destroy(fp->idx);
1659
44.8k
    hts_filter_free(fp->filter);
1660
44.8k
    free(fp->fn);
1661
44.8k
    free(fp->fn_aux);
1662
44.8k
    free(fp->line.s);
1663
44.8k
    free(fp);
1664
44.8k
    errno = save;
1665
44.8k
    return ret;
1666
44.8k
}
1667
1668
int hts_flush(htsFile *fp)
1669
0
{
1670
0
    if (fp == NULL) return 0;
1671
1672
0
    switch (fp->format.format) {
1673
0
    case binary_format:
1674
0
    case bam:
1675
0
    case bcf:
1676
0
        return bgzf_flush(fp->fp.bgzf);
1677
1678
0
    case cram:
1679
0
        return cram_flush(fp->fp.cram);
1680
1681
0
    case empty_format:
1682
0
    case text_format:
1683
0
    case bed:
1684
0
    case fasta_format:
1685
0
    case fastq_format:
1686
0
    case sam:
1687
0
    case vcf:
1688
0
        if (fp->format.compression != no_compression)
1689
0
            return bgzf_flush(fp->fp.bgzf);
1690
0
        else
1691
0
            return hflush(fp->fp.hfile);
1692
1693
0
    default:
1694
0
        break;
1695
0
    }
1696
1697
0
    return 0;
1698
0
}
1699
1700
const htsFormat *hts_get_format(htsFile *fp)
1701
0
{
1702
0
    return fp? &fp->format : NULL;
1703
0
}
1704
1705
0
const char *hts_format_file_extension(const htsFormat *format) {
1706
0
    if (!format)
1707
0
        return "?";
1708
1709
0
    switch (format->format) {
1710
0
    case sam:  return "sam";
1711
0
    case bam:  return "bam";
1712
0
    case bai:  return "bai";
1713
0
    case cram: return "cram";
1714
0
    case crai: return "crai";
1715
0
    case vcf:  return "vcf";
1716
0
    case bcf:  return "bcf";
1717
0
    case csi:  return "csi";
1718
0
    case fai_format:   return "fai";
1719
0
    case fqi_format:   return "fqi";
1720
0
    case gzi:  return "gzi";
1721
0
    case tbi:  return "tbi";
1722
0
    case bed:  return "bed";
1723
0
    case d4_format:    return "d4";
1724
0
    case fasta_format: return "fa";
1725
0
    case fastq_format: return "fq";
1726
0
    default:   return "?";
1727
0
    }
1728
0
}
1729
1730
0
static hFILE *hts_hfile(htsFile *fp) {
1731
0
    switch (fp->format.format) {
1732
0
    case binary_format:// fall through
1733
0
    case bcf:          // fall through
1734
0
    case bam:          return bgzf_hfile(fp->fp.bgzf);
1735
0
    case cram:         return cram_hfile(fp->fp.cram);
1736
0
    case text_format:  return fp->fp.hfile;
1737
0
    case vcf:          // fall through
1738
0
    case fastq_format: // fall through
1739
0
    case fasta_format: // fall through
1740
0
    case sam:          return fp->format.compression != no_compression
1741
0
                              ? bgzf_hfile(fp->fp.bgzf)
1742
0
                              : fp->fp.hfile;
1743
0
    default:           return NULL;
1744
0
    }
1745
0
}
1746
1747
0
int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) {
1748
0
    int r;
1749
0
    va_list args;
1750
1751
0
    switch (opt) {
1752
0
    case HTS_OPT_NTHREADS: {
1753
0
        va_start(args, opt);
1754
0
        int nthreads = va_arg(args, int);
1755
0
        va_end(args);
1756
0
        return hts_set_threads(fp, nthreads);
1757
0
    }
1758
1759
0
    case HTS_OPT_BLOCK_SIZE: {
1760
0
        hFILE *hf = hts_hfile(fp);
1761
1762
0
        if (hf) {
1763
0
            va_start(args, opt);
1764
0
            if (hfile_set_blksize(hf, va_arg(args, int)) != 0)
1765
0
                hts_log_warning("Failed to change block size");
1766
0
            va_end(args);
1767
0
        }
1768
0
        else {
1769
            // To do - implement for vcf/bcf.
1770
0
            hts_log_warning("Cannot change block size for this format");
1771
0
        }
1772
1773
0
        return 0;
1774
0
    }
1775
1776
0
    case HTS_OPT_THREAD_POOL: {
1777
0
        va_start(args, opt);
1778
0
        htsThreadPool *p = va_arg(args, htsThreadPool *);
1779
0
        va_end(args);
1780
0
        return hts_set_thread_pool(fp, p);
1781
0
    }
1782
1783
0
    case HTS_OPT_CACHE_SIZE: {
1784
0
        va_start(args, opt);
1785
0
        int cache_size = va_arg(args, int);
1786
0
        va_end(args);
1787
0
        hts_set_cache_size(fp, cache_size);
1788
0
        return 0;
1789
0
    }
1790
1791
0
    case FASTQ_OPT_CASAVA:
1792
0
    case FASTQ_OPT_RNUM:
1793
0
    case FASTQ_OPT_NAME2:
1794
0
        if (fp->format.format == fastq_format ||
1795
0
            fp->format.format == fasta_format)
1796
0
            return fastq_state_set(fp, opt);
1797
0
        return 0;
1798
1799
0
    case FASTQ_OPT_AUX:
1800
0
        if (fp->format.format == fastq_format ||
1801
0
            fp->format.format == fasta_format) {
1802
0
            va_start(args, opt);
1803
0
            char *list = va_arg(args, char *);
1804
0
            va_end(args);
1805
0
            return fastq_state_set(fp, opt, list);
1806
0
        }
1807
0
        return 0;
1808
1809
0
    case FASTQ_OPT_BARCODE:
1810
0
        if (fp->format.format == fastq_format ||
1811
0
            fp->format.format == fasta_format) {
1812
0
            va_start(args, opt);
1813
0
            char *bc = va_arg(args, char *);
1814
0
            va_end(args);
1815
0
            return fastq_state_set(fp, opt, bc);
1816
0
        }
1817
0
        return 0;
1818
1819
    // Options below here flow through to cram_set_voption
1820
0
    case HTS_OPT_COMPRESSION_LEVEL: {
1821
0
        va_start(args, opt);
1822
0
        int level = va_arg(args, int);
1823
0
        va_end(args);
1824
0
        if (fp->is_bgzf)
1825
0
            fp->fp.bgzf->compress_level = level;
1826
0
        else if (fp->format.format == cram)
1827
0
            return cram_set_option(fp->fp.cram, opt, level);
1828
0
        return 0;
1829
0
    }
1830
1831
0
    case HTS_OPT_FILTER: {
1832
0
        va_start(args, opt);
1833
0
        char *expr = va_arg(args, char *);
1834
0
        va_end(args);
1835
0
        return hts_set_filter_expression(fp, expr);
1836
0
    }
1837
1838
0
    case HTS_OPT_PROFILE: {
1839
0
        va_start(args, opt);
1840
0
        enum hts_profile_option prof = va_arg(args, int);
1841
0
        va_end(args);
1842
0
        if (fp->is_bgzf) {
1843
0
            switch (prof) {
1844
#ifdef HAVE_LIBDEFLATE
1845
            case HTS_PROFILE_FAST:    fp->fp.bgzf->compress_level =  2; break;
1846
            case HTS_PROFILE_NORMAL:  fp->fp.bgzf->compress_level = -1; break;
1847
            case HTS_PROFILE_SMALL:   fp->fp.bgzf->compress_level = 10; break;
1848
            case HTS_PROFILE_ARCHIVE: fp->fp.bgzf->compress_level = 12; break;
1849
#else
1850
0
            case HTS_PROFILE_FAST:    fp->fp.bgzf->compress_level =  1; break;
1851
0
            case HTS_PROFILE_NORMAL:  fp->fp.bgzf->compress_level = -1; break;
1852
0
            case HTS_PROFILE_SMALL:   fp->fp.bgzf->compress_level =  8; break;
1853
0
            case HTS_PROFILE_ARCHIVE: fp->fp.bgzf->compress_level =  9; break;
1854
0
#endif
1855
0
            }
1856
0
        } // else CRAM manages this in its own way
1857
0
        break;
1858
0
    }
1859
1860
0
    default:
1861
0
        break;
1862
0
    }
1863
1864
0
    if (fp->format.format != cram)
1865
0
        return 0;
1866
1867
0
    va_start(args, opt);
1868
0
    r = cram_set_voption(fp->fp.cram, opt, args);
1869
0
    va_end(args);
1870
1871
0
    return r;
1872
0
}
1873
1874
BGZF *hts_get_bgzfp(htsFile *fp);
1875
1876
int hts_set_threads(htsFile *fp, int n)
1877
0
{
1878
0
    if (fp->format.format == sam) {
1879
0
        return sam_set_threads(fp, n);
1880
0
    } else if (fp->format.compression == bgzf) {
1881
0
        return bgzf_mt(hts_get_bgzfp(fp), n, 256/*unused*/);
1882
0
    } else if (fp->format.format == cram) {
1883
0
        return hts_set_opt(fp, CRAM_OPT_NTHREADS, n);
1884
0
    }
1885
0
    else return 0;
1886
0
}
1887
1888
0
int hts_set_thread_pool(htsFile *fp, htsThreadPool *p) {
1889
0
    if (fp->format.format == sam || fp->format.format == text_format) {
1890
0
        return sam_set_thread_pool(fp, p);
1891
0
    } else if (fp->format.compression == bgzf) {
1892
0
        return bgzf_thread_pool(hts_get_bgzfp(fp), p->pool, p->qsize);
1893
0
    } else if (fp->format.format == cram) {
1894
0
        return hts_set_opt(fp, CRAM_OPT_THREAD_POOL, p);
1895
0
    }
1896
0
    else return 0;
1897
0
}
1898
1899
void hts_set_cache_size(htsFile *fp, int n)
1900
0
{
1901
0
    if (fp->format.compression == bgzf)
1902
0
        bgzf_set_cache_size(hts_get_bgzfp(fp), n);
1903
0
}
1904
1905
int hts_set_fai_filename(htsFile *fp, const char *fn_aux)
1906
0
{
1907
0
    free(fp->fn_aux);
1908
0
    if (fn_aux) {
1909
0
        fp->fn_aux = strdup(fn_aux);
1910
0
        if (fp->fn_aux == NULL) return -1;
1911
0
    }
1912
0
    else fp->fn_aux = NULL;
1913
1914
0
    if (fp->format.format == cram)
1915
0
        if (cram_set_option(fp->fp.cram, CRAM_OPT_REFERENCE, fp->fn_aux))
1916
0
            return -1;
1917
1918
0
    return 0;
1919
0
}
1920
1921
int hts_set_filter_expression(htsFile *fp, const char *expr)
1922
0
{
1923
0
    if (fp->filter)
1924
0
        hts_filter_free(fp->filter);
1925
1926
0
    if (!expr)
1927
0
        return 0;
1928
1929
0
    return (fp->filter = hts_filter_init(expr))
1930
0
        ? 0 : -1;
1931
0
}
1932
1933
hFILE *hts_open_tmpfile(const char *fname, const char *mode, kstring_t *tmpname)
1934
0
{
1935
0
    int pid = (int) getpid();
1936
0
    unsigned ptr = (uintptr_t) tmpname;
1937
0
    int n = 0;
1938
0
    hFILE *fp = NULL;
1939
1940
0
    do {
1941
        // Attempt to further uniquify the temporary filename
1942
0
        unsigned t = ((unsigned) time(NULL)) ^ ((unsigned) clock()) ^ ptr;
1943
0
        n++;
1944
1945
0
        ks_clear(tmpname);
1946
0
        if (ksprintf(tmpname, "%s.tmp_%d_%d_%u", fname, pid, n, t) < 0) break;
1947
1948
0
        fp = hopen(tmpname->s, mode);
1949
0
    } while (fp == NULL && errno == EEXIST && n < 100);
1950
1951
0
    return fp;
1952
0
}
1953
1954
// For VCF/BCF backward sweeper. Not exposing these functions because their
1955
// future is uncertain. Things will probably have to change with hFILE...
1956
BGZF *hts_get_bgzfp(htsFile *fp)
1957
0
{
1958
0
    if (fp->is_bgzf)
1959
0
        return fp->fp.bgzf;
1960
0
    else
1961
0
        return NULL;
1962
0
}
1963
int hts_useek(htsFile *fp, off_t uoffset, int where)
1964
0
{
1965
0
    if (fp->is_bgzf)
1966
0
        return bgzf_useek(fp->fp.bgzf, uoffset, where);
1967
0
    else
1968
0
        return (hseek(fp->fp.hfile, uoffset, SEEK_SET) >= 0)? 0 : -1;
1969
0
}
1970
off_t hts_utell(htsFile *fp)
1971
0
{
1972
0
    if (fp->is_bgzf)
1973
0
        return bgzf_utell(fp->fp.bgzf);
1974
0
    else
1975
0
        return htell(fp->fp.hfile);
1976
0
}
1977
1978
int hts_getline(htsFile *fp, int delimiter, kstring_t *str)
1979
159M
{
1980
159M
    int ret;
1981
159M
    if (! (delimiter == KS_SEP_LINE || delimiter == '\n')) {
1982
0
        hts_log_error("Unexpected delimiter %d", delimiter);
1983
0
        abort();
1984
0
    }
1985
1986
159M
    switch (fp->format.compression) {
1987
567k
    case no_compression:
1988
567k
        str->l = 0;
1989
567k
        ret = kgetline2(str, (kgets_func2 *) hgetln, fp->fp.hfile);
1990
567k
        if (ret >= 0) ret = (str->l <= INT_MAX)? (int) str->l : INT_MAX;
1991
5.16k
        else if (herrno(fp->fp.hfile)) ret = -2, errno = herrno(fp->fp.hfile);
1992
5.16k
        else ret = -1;
1993
567k
        break;
1994
1995
158M
    case gzip:
1996
158M
    case bgzf:
1997
158M
        ret = bgzf_getline(fp->fp.bgzf, '\n', str);
1998
158M
        break;
1999
2000
0
    default:
2001
0
        abort();
2002
159M
    }
2003
2004
159M
    ++fp->lineno;
2005
159M
    return ret;
2006
159M
}
2007
2008
char **hts_readlist(const char *string, int is_file, int *_n)
2009
0
{
2010
0
    unsigned int m = 0, n = 0;
2011
0
    char **s = 0, **s_new;
2012
0
    if ( is_file )
2013
0
    {
2014
0
        BGZF *fp = bgzf_open(string, "r");
2015
0
        if ( !fp ) return NULL;
2016
2017
0
        kstring_t str;
2018
0
        int ret;
2019
0
        str.s = 0; str.l = str.m = 0;
2020
0
        while ((ret = bgzf_getline(fp, '\n', &str)) >= 0)
2021
0
        {
2022
0
            if (str.l == 0) continue;
2023
0
            if (hts_resize(char*, n + 1, &m, &s, 0) < 0)
2024
0
                goto err;
2025
0
            s[n] = strdup(str.s);
2026
0
            if (!s[n])
2027
0
                goto err;
2028
0
            n++;
2029
0
        }
2030
0
        if (ret < -1) // Read error
2031
0
            goto err;
2032
0
        bgzf_close(fp);
2033
0
        free(str.s);
2034
0
    }
2035
0
    else
2036
0
    {
2037
0
        const char *q = string, *p = string;
2038
0
        while ( 1 )
2039
0
        {
2040
0
            if (*p == ',' || *p == 0)
2041
0
            {
2042
0
                if (hts_resize(char*, n + 1, &m, &s, 0) < 0)
2043
0
                    goto err;
2044
0
                s[n] = (char*)calloc(p - q + 1, 1);
2045
0
                if (!s[n])
2046
0
                    goto err;
2047
0
                strncpy(s[n++], q, p - q);
2048
0
                q = p + 1;
2049
0
            }
2050
0
            if ( !*p ) break;
2051
0
            p++;
2052
0
        }
2053
0
    }
2054
    // Try to shrink s to the minimum size needed
2055
0
    s_new = (char**)realloc(s, n * sizeof(char*));
2056
0
    if (!s_new)
2057
0
        goto err;
2058
2059
0
    s = s_new;
2060
0
    assert(n < INT_MAX); // hts_resize() should ensure this
2061
0
    *_n = n;
2062
0
    return s;
2063
2064
0
 err:
2065
0
    for (m = 0; m < n; m++)
2066
0
        free(s[m]);
2067
0
    free(s);
2068
0
    return NULL;
2069
0
}
2070
2071
char **hts_readlines(const char *fn, int *_n)
2072
0
{
2073
0
    unsigned int m = 0, n = 0;
2074
0
    char **s = 0, **s_new;
2075
0
    BGZF *fp = bgzf_open(fn, "r");
2076
0
    if ( fp ) { // read from file
2077
0
        kstring_t str;
2078
0
        int ret;
2079
0
        str.s = 0; str.l = str.m = 0;
2080
0
        while ((ret = bgzf_getline(fp, '\n', &str)) >= 0) {
2081
0
            if (str.l == 0) continue;
2082
0
            if (hts_resize(char *, n + 1, &m, &s, 0) < 0)
2083
0
                goto err;
2084
0
            s[n] = strdup(str.s);
2085
0
            if (!s[n])
2086
0
                goto err;
2087
0
            n++;
2088
0
        }
2089
0
        if (ret < -1) // Read error
2090
0
            goto err;
2091
0
        bgzf_close(fp);
2092
0
        free(str.s);
2093
0
    } else if (*fn == ':') { // read from string
2094
0
        const char *q, *p;
2095
0
        for (q = p = fn + 1;; ++p)
2096
0
            if (*p == ',' || *p == 0) {
2097
0
                if (hts_resize(char *, n + 1, &m, &s, 0) < 0)
2098
0
                    goto err;
2099
0
                s[n] = (char*)calloc(p - q + 1, 1);
2100
0
                if (!s[n])
2101
0
                    goto err;
2102
0
                strncpy(s[n++], q, p - q);
2103
0
                q = p + 1;
2104
0
                if (*p == 0) break;
2105
0
            }
2106
0
    } else return 0;
2107
    // Try to shrink s to the minimum size needed
2108
0
    s_new = (char**)realloc(s, n * sizeof(char*));
2109
0
    if (!s_new)
2110
0
        goto err;
2111
2112
0
    s = s_new;
2113
0
    assert(n < INT_MAX); // hts_resize() should ensure this
2114
0
    *_n = n;
2115
0
    return s;
2116
2117
0
 err:
2118
0
    for (m = 0; m < n; m++)
2119
0
        free(s[m]);
2120
0
    free(s);
2121
0
    return NULL;
2122
0
}
2123
2124
// DEPRECATED: To be removed in a future HTSlib release
2125
int hts_file_type(const char *fname)
2126
0
{
2127
0
    int len = strlen(fname);
2128
0
    if ( !strcasecmp(".vcf.gz",fname+len-7) ) return FT_VCF_GZ;
2129
0
    if ( !strcasecmp(".vcf",fname+len-4) ) return FT_VCF;
2130
0
    if ( !strcasecmp(".bcf",fname+len-4) ) return FT_BCF_GZ;
2131
0
    if ( !strcmp("-",fname) ) return FT_STDIN;
2132
2133
0
    hFILE *f = hopen(fname, "r");
2134
0
    if (f == NULL) return 0;
2135
2136
0
    htsFormat fmt;
2137
0
    if (hts_detect_format2(f, fname, &fmt) < 0) { hclose_abruptly(f); return 0; }
2138
0
    if (hclose(f) < 0) return 0;
2139
2140
0
    switch (fmt.format) {
2141
0
    case vcf: return (fmt.compression == no_compression)? FT_VCF : FT_VCF_GZ;
2142
0
    case bcf: return (fmt.compression == no_compression)? FT_BCF : FT_BCF_GZ;
2143
0
    default:  return 0;
2144
0
    }
2145
0
}
2146
2147
int hts_check_EOF(htsFile *fp)
2148
0
{
2149
0
    if (fp->format.compression == bgzf)
2150
0
        return bgzf_check_EOF(hts_get_bgzfp(fp));
2151
0
    else if (fp->format.format == cram)
2152
0
        return cram_check_EOF(fp->fp.cram);
2153
0
    else
2154
0
        return 3;
2155
0
}
2156
2157
2158
/****************
2159
 *** Indexing ***
2160
 ****************/
2161
2162
0
#define HTS_MIN_MARKER_DIST 0x10000
2163
2164
// Finds the special meta bin
2165
//  ((1<<(3 * n_lvls + 3)) - 1) / 7 + 1
2166
0
#define META_BIN(idx) ((idx)->n_bins + 1)
2167
2168
0
#define pair64_lt(a,b) ((a).u < (b).u)
2169
0
#define pair64max_lt(a,b) ((a).u < (b).u || \
2170
0
                           ((a).u == (b).u && (a).max < (b).max))
2171
2172
0
KSORT_INIT_STATIC(_off, hts_pair64_t, pair64_lt)
2173
0
KSORT_INIT_STATIC(_off_max, hts_pair64_max_t, pair64max_lt)
2174
2175
typedef struct {
2176
    int32_t m, n;
2177
    uint64_t loff;
2178
    hts_pair64_t *list;
2179
} bins_t;
2180
2181
KHASH_MAP_INIT_INT(bin, bins_t)
2182
typedef khash_t(bin) bidx_t;
2183
2184
typedef struct {
2185
    hts_pos_t n, m;
2186
    uint64_t *offset;
2187
} lidx_t;
2188
2189
struct hts_idx_t {
2190
    int fmt, min_shift, n_lvls, n_bins;
2191
    uint32_t l_meta;
2192
    int32_t n, m;
2193
    uint64_t n_no_coor;
2194
    bidx_t **bidx;
2195
    lidx_t *lidx;
2196
    uint8_t *meta; // MUST have a terminating NUL on the end
2197
    int tbi_n, last_tbi_tid;
2198
    struct {
2199
        uint32_t last_bin, save_bin;
2200
        hts_pos_t last_coor;
2201
        int last_tid, save_tid, finished;
2202
        uint64_t last_off, save_off;
2203
        uint64_t off_beg, off_end;
2204
        uint64_t n_mapped, n_unmapped;
2205
    } z; // keep internal states
2206
};
2207
2208
0
static char * idx_format_name(int fmt) {
2209
0
    switch (fmt) {
2210
0
        case HTS_FMT_CSI: return "csi";
2211
0
        case HTS_FMT_BAI: return "bai";
2212
0
        case HTS_FMT_TBI: return "tbi";
2213
0
        case HTS_FMT_CRAI: return "crai";
2214
0
        default: return "unknown";
2215
0
    }
2216
0
}
2217
2218
#ifdef DEBUG_INDEX
2219
static void idx_dump(const hts_idx_t *idx) {
2220
    int i;
2221
    int64_t j;
2222
2223
    if (!idx) fprintf(stderr, "Null index\n");
2224
2225
    fprintf(stderr, "format='%s', min_shift=%d, n_lvls=%d, n_bins=%d, l_meta=%u ",
2226
            idx_format_name(idx->fmt), idx->min_shift, idx->n_lvls, idx->n_bins, idx->l_meta);
2227
    fprintf(stderr, "n=%d, m=%d, n_no_coor=%"PRIu64"\n", idx->n, idx->m, idx->n_no_coor);
2228
    for (i = 0; i < idx->n; i++) {
2229
        bidx_t *bidx = idx->bidx[i];
2230
        lidx_t *lidx = &idx->lidx[i];
2231
        if (bidx) {
2232
            fprintf(stderr, "======== BIN Index - tid=%d, n_buckets=%d, size=%d\n", i, bidx->n_buckets, bidx->size);
2233
            int b;
2234
            for (b = 0; b < META_BIN(idx); b++) {
2235
                khint_t k;
2236
                if ((k = kh_get(bin, bidx, b)) != kh_end(bidx)) {
2237
                    bins_t *entries = &kh_value(bidx, k);
2238
                    int l = hts_bin_level(b);
2239
                    int64_t bin_width = 1LL << ((idx->n_lvls - l) * 3 + idx->min_shift);
2240
                    fprintf(stderr, "\tbin=%d, level=%d, parent=%d, n_chunks=%d, loff=%"PRIu64", interval=[%"PRId64" - %"PRId64"]\n",
2241
                        b, l, hts_bin_parent(b), entries->n, entries->loff, (b-hts_bin_first(l))*bin_width+1, (b+1-hts_bin_first(l))*bin_width);
2242
                    for (j = 0; j < entries->n; j++)
2243
                        fprintf(stderr, "\t\tchunk=%"PRId64", u=%"PRIu64", v=%"PRIu64"\n", j, entries->list[j].u, entries->list[j].v);
2244
                }
2245
            }
2246
        }
2247
        if (lidx) {
2248
            fprintf(stderr, "======== LINEAR Index - tid=%d, n_values=%"PRId64"\n", i, lidx->n);
2249
            for (j = 0; j < lidx->n; j++) {
2250
                fprintf(stderr, "\t\tentry=%"PRId64", offset=%"PRIu64", interval=[%"PRId64" - %"PRId64"]\n",
2251
                    j, lidx->offset[j], j*(1<<idx->min_shift)+1, (j+1)*(1<<idx->min_shift));
2252
            }
2253
        }
2254
    }
2255
}
2256
#endif
2257
2258
static inline int insert_to_b(bidx_t *b, int bin, uint64_t beg, uint64_t end)
2259
0
{
2260
0
    khint_t k;
2261
0
    bins_t *l;
2262
0
    int absent;
2263
0
    k = kh_put(bin, b, bin, &absent);
2264
0
    if (absent < 0) return -1; // Out of memory
2265
0
    l = &kh_value(b, k);
2266
0
    if (absent) {
2267
0
        l->m = 1; l->n = 0;
2268
0
        l->list = (hts_pair64_t*)calloc(l->m, sizeof(hts_pair64_t));
2269
0
        if (!l->list) {
2270
0
            kh_del(bin, b, k);
2271
0
            return -1;
2272
0
        }
2273
0
    } else if (l->n == l->m) {
2274
0
        uint32_t new_m = l->m ? l->m << 1 : 1;
2275
0
        hts_pair64_t *new_list = realloc(l->list, new_m * sizeof(hts_pair64_t));
2276
0
        if (!new_list) return -1;
2277
0
        l->list = new_list;
2278
0
        l->m = new_m;
2279
0
    }
2280
0
    l->list[l->n].u = beg;
2281
0
    l->list[l->n++].v = end;
2282
0
    return 0;
2283
0
}
2284
2285
static inline int insert_to_l(lidx_t *l, int64_t _beg, int64_t _end, uint64_t offset, int min_shift)
2286
0
{
2287
0
    int i;
2288
0
    hts_pos_t beg, end;
2289
0
    beg = _beg >> min_shift;
2290
0
    end = (_end - 1) >> min_shift;
2291
0
    if (l->m < end + 1) {
2292
0
        size_t new_m = l->m * 2 > end + 1 ? l->m * 2 : end + 1;
2293
0
        uint64_t *new_offset;
2294
2295
0
        new_offset = (uint64_t*)realloc(l->offset, new_m * sizeof(uint64_t));
2296
0
        if (!new_offset) return -1;
2297
2298
        // fill unused memory with (uint64_t)-1
2299
0
        memset(new_offset + l->m, 0xff, sizeof(uint64_t) * (new_m - l->m));
2300
0
        l->m = new_m;
2301
0
        l->offset = new_offset;
2302
0
    }
2303
0
    for (i = beg; i <= end; ++i) {
2304
0
        if (l->offset[i] == (uint64_t)-1) l->offset[i] = offset;
2305
0
    }
2306
0
    if (l->n < end + 1) l->n = end + 1;
2307
0
    return 0;
2308
0
}
2309
2310
hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls)
2311
0
{
2312
0
    hts_idx_t *idx;
2313
0
    idx = (hts_idx_t*)calloc(1, sizeof(hts_idx_t));
2314
0
    if (idx == NULL) return NULL;
2315
0
    idx->fmt = fmt;
2316
0
    idx->min_shift = min_shift;
2317
0
    idx->n_lvls = n_lvls;
2318
0
    idx->n_bins = ((1<<(3 * n_lvls + 3)) - 1) / 7;
2319
0
    idx->z.save_tid = idx->z.last_tid = -1;
2320
0
    idx->z.save_bin = idx->z.last_bin = 0xffffffffu;
2321
0
    idx->z.save_off = idx->z.last_off = idx->z.off_beg = idx->z.off_end = offset0;
2322
0
    idx->z.last_coor = 0xffffffffu;
2323
0
    if (n) {
2324
0
        idx->n = idx->m = n;
2325
0
        idx->bidx = (bidx_t**)calloc(n, sizeof(bidx_t*));
2326
0
        if (idx->bidx == NULL) { free(idx); return NULL; }
2327
0
        idx->lidx = (lidx_t*) calloc(n, sizeof(lidx_t));
2328
0
        if (idx->lidx == NULL) { free(idx->bidx); free(idx); return NULL; }
2329
0
    }
2330
0
    idx->tbi_n = -1;
2331
0
    idx->last_tbi_tid = -1;
2332
0
    return idx;
2333
0
}
2334
2335
static void update_loff(hts_idx_t *idx, int i, int free_lidx)
2336
0
{
2337
0
    bidx_t *bidx = idx->bidx[i];
2338
0
    lidx_t *lidx = &idx->lidx[i];
2339
0
    khint_t k;
2340
0
    int l;
2341
    // the last entry is always valid
2342
0
    for (l=lidx->n-2; l >= 0; l--) {
2343
0
        if (lidx->offset[l] == (uint64_t)-1)
2344
0
            lidx->offset[l] = lidx->offset[l+1];
2345
0
    }
2346
0
    if (bidx == 0) return;
2347
0
    for (k = kh_begin(bidx); k != kh_end(bidx); ++k) // set loff
2348
0
        if (kh_exist(bidx, k))
2349
0
        {
2350
0
            if ( kh_key(bidx, k) < idx->n_bins )
2351
0
            {
2352
0
                int bot_bin = hts_bin_bot(kh_key(bidx, k), idx->n_lvls);
2353
                // disable linear index if bot_bin out of bounds
2354
0
                kh_val(bidx, k).loff = bot_bin < lidx->n ? lidx->offset[bot_bin] : 0;
2355
0
            }
2356
0
            else
2357
0
                kh_val(bidx, k).loff = 0;
2358
0
        }
2359
0
    if (free_lidx) {
2360
0
        free(lidx->offset);
2361
0
        lidx->m = lidx->n = 0;
2362
0
        lidx->offset = 0;
2363
0
    }
2364
0
}
2365
2366
static int compress_binning(hts_idx_t *idx, int i)
2367
0
{
2368
0
    bidx_t *bidx = idx->bidx[i];
2369
0
    khint_t k;
2370
0
    int l, m;
2371
0
    if (bidx == 0) return 0;
2372
    // merge a bin to its parent if the bin is too small
2373
0
    for (l = idx->n_lvls; l > 0; --l) {
2374
0
        unsigned start = hts_bin_first(l);
2375
0
        for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
2376
0
            bins_t *p, *q;
2377
0
            if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins || kh_key(bidx, k) < start) continue;
2378
0
            p = &kh_value(bidx, k);
2379
0
            if (l < idx->n_lvls && p->n > 1) ks_introsort(_off, p->n, p->list);
2380
0
            if ((p->list[p->n - 1].v>>16) - (p->list[0].u>>16) < HTS_MIN_MARKER_DIST) {
2381
0
                khint_t kp;
2382
0
                kp = kh_get(bin, bidx, hts_bin_parent(kh_key(bidx, k)));
2383
0
                if (kp == kh_end(bidx)) continue;
2384
0
                q = &kh_val(bidx, kp);
2385
0
                if (q->n + p->n > q->m) {
2386
0
                    uint32_t new_m = q->n + p->n;
2387
0
                    hts_pair64_t *new_list;
2388
0
                    kroundup32(new_m);
2389
0
                    if (new_m > INT32_MAX) return -1; // Limited by index format
2390
0
                    new_list = realloc(q->list, new_m * sizeof(*new_list));
2391
0
                    if (!new_list) return -1;
2392
0
                    q->m = new_m;
2393
0
                    q->list = new_list;
2394
0
                }
2395
0
                memcpy(q->list + q->n, p->list, p->n * sizeof(hts_pair64_t));
2396
0
                q->n += p->n;
2397
0
                free(p->list);
2398
0
                kh_del(bin, bidx, k);
2399
0
            }
2400
0
        }
2401
0
    }
2402
0
    k = kh_get(bin, bidx, 0);
2403
0
    if (k != kh_end(bidx)) ks_introsort(_off, kh_val(bidx, k).n, kh_val(bidx, k).list);
2404
    // merge adjacent chunks that start from the same BGZF block
2405
0
    for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
2406
0
        bins_t *p;
2407
0
        if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins) continue;
2408
0
        p = &kh_value(bidx, k);
2409
0
        for (l = 1, m = 0; l < p->n; ++l) {
2410
0
            if (p->list[m].v>>16 >= p->list[l].u>>16) {
2411
0
                if (p->list[m].v < p->list[l].v) p->list[m].v = p->list[l].v;
2412
0
            } else p->list[++m] = p->list[l];
2413
0
        }
2414
0
        p->n = m + 1;
2415
0
    }
2416
0
    return 0;
2417
0
}
2418
2419
int hts_idx_finish(hts_idx_t *idx, uint64_t final_offset)
2420
0
{
2421
0
    int i, ret = 0;
2422
0
    if (idx == NULL || idx->z.finished) return 0; // do not run this function on an empty index or multiple times
2423
0
    if (idx->z.save_tid >= 0) {
2424
0
        ret |= insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin, idx->z.save_off, final_offset);
2425
0
        ret |= insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.off_beg, final_offset);
2426
0
        ret |= insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.n_mapped, idx->z.n_unmapped);
2427
0
    }
2428
0
    for (i = 0; i < idx->n; ++i) {
2429
0
        update_loff(idx, i, (idx->fmt == HTS_FMT_CSI));
2430
0
        ret |= compress_binning(idx, i);
2431
0
    }
2432
0
    idx->z.finished = 1;
2433
2434
0
    return ret;
2435
0
}
2436
2437
int hts_idx_check_range(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end)
2438
0
{
2439
0
    int64_t maxpos = (int64_t) 1 << (idx->min_shift + idx->n_lvls * 3);
2440
0
    if (tid < 0 || (beg <= maxpos && end <= maxpos))
2441
0
        return 0;
2442
2443
0
    if (idx->fmt == HTS_FMT_CSI) {
2444
0
        hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos" "
2445
0
                      "cannot be stored in a csi index with these parameters. "
2446
0
                      "Please use a larger min_shift or depth",
2447
0
                      beg, end);
2448
0
    } else {
2449
0
        hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos
2450
0
                      " cannot be stored in a %s index. Try using a csi index",
2451
0
                      beg, end, idx_format_name(idx->fmt));
2452
0
    }
2453
0
    errno = ERANGE;
2454
0
    return -1;
2455
0
}
2456
2457
int hts_idx_push(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, uint64_t offset, int is_mapped)
2458
0
{
2459
0
    int bin;
2460
0
    if (tid<0) beg = -1, end = 0;
2461
0
    if (hts_idx_check_range(idx, tid, beg, end) < 0)
2462
0
        return -1;
2463
0
    if (tid >= idx->m) { // enlarge the index
2464
0
        uint32_t new_m = idx->m * 2 > tid + 1 ? idx->m * 2 : tid + 1;
2465
0
        bidx_t **new_bidx;
2466
0
        lidx_t *new_lidx;
2467
2468
0
        new_bidx = (bidx_t**)realloc(idx->bidx, new_m * sizeof(bidx_t*));
2469
0
        if (!new_bidx) return -1;
2470
0
        idx->bidx = new_bidx;
2471
2472
0
        new_lidx = (lidx_t*) realloc(idx->lidx, new_m * sizeof(lidx_t));
2473
0
        if (!new_lidx) return -1;
2474
0
        idx->lidx = new_lidx;
2475
2476
0
        memset(&idx->bidx[idx->m], 0, (new_m - idx->m) * sizeof(bidx_t*));
2477
0
        memset(&idx->lidx[idx->m], 0, (new_m - idx->m) * sizeof(lidx_t));
2478
0
        idx->m = new_m;
2479
0
    }
2480
0
    if (idx->n < tid + 1) idx->n = tid + 1;
2481
0
    if (idx->z.finished) return 0;
2482
0
    if (idx->z.last_tid != tid || (idx->z.last_tid >= 0 && tid < 0)) { // change of chromosome
2483
0
        if ( tid>=0 && idx->n_no_coor )
2484
0
        {
2485
0
            hts_log_error("NO_COOR reads not in a single block at the end %d %d", tid, idx->z.last_tid);
2486
0
            return -1;
2487
0
        }
2488
0
        if (tid>=0 && idx->bidx[tid] != 0)
2489
0
        {
2490
0
            hts_log_error("Chromosome blocks not continuous");
2491
0
            return -1;
2492
0
        }
2493
0
        idx->z.last_tid = tid;
2494
0
        idx->z.last_bin = 0xffffffffu;
2495
0
    } else if (tid >= 0 && idx->z.last_coor > beg) { // test if positions are out of order
2496
0
        hts_log_error("Unsorted positions on sequence #%d: %"PRIhts_pos" followed by %"PRIhts_pos, tid+1, idx->z.last_coor+1, beg+1);
2497
0
        return -1;
2498
0
    }
2499
0
    if (end < beg) {
2500
        // Malformed ranges are errors. (Empty ranges (beg==end) are unusual but acceptable.)
2501
0
        hts_log_error("Invalid record on sequence #%d: end %"PRId64" < begin %"PRId64, tid+1, end, beg+1);
2502
0
        return -1;
2503
0
    }
2504
0
    if ( tid>=0 )
2505
0
    {
2506
0
        if (idx->bidx[tid] == 0) idx->bidx[tid] = kh_init(bin);
2507
        // shoehorn [-1,0) (VCF POS=0) into the leftmost bottom-level bin
2508
0
        if (beg < 0)  beg = 0;
2509
0
        if (end <= 0) end = 1;
2510
        // idx->z.last_off points to the start of the current record
2511
0
        if (insert_to_l(&idx->lidx[tid], beg, end,
2512
0
                        idx->z.last_off, idx->min_shift) < 0) return -1;
2513
0
    }
2514
0
    else idx->n_no_coor++;
2515
0
    bin = hts_reg2bin(beg, end, idx->min_shift, idx->n_lvls);
2516
0
    if ((int)idx->z.last_bin != bin) { // then possibly write the binning index
2517
0
        if (idx->z.save_bin != 0xffffffffu) { // save_bin==0xffffffffu only happens to the first record
2518
0
            if (insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin,
2519
0
                            idx->z.save_off, idx->z.last_off) < 0) return -1;
2520
0
        }
2521
0
        if (idx->z.last_bin == 0xffffffffu && idx->z.save_bin != 0xffffffffu) { // change of chr; keep meta information
2522
0
            idx->z.off_end = idx->z.last_off;
2523
0
            if (insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx),
2524
0
                            idx->z.off_beg, idx->z.off_end) < 0) return -1;
2525
0
            if (insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx),
2526
0
                            idx->z.n_mapped, idx->z.n_unmapped) < 0) return -1;
2527
0
            idx->z.n_mapped = idx->z.n_unmapped = 0;
2528
0
            idx->z.off_beg = idx->z.off_end;
2529
0
        }
2530
0
        idx->z.save_off = idx->z.last_off;
2531
0
        idx->z.save_bin = idx->z.last_bin = bin;
2532
0
        idx->z.save_tid = tid;
2533
0
    }
2534
0
    if (is_mapped) ++idx->z.n_mapped;
2535
0
    else ++idx->z.n_unmapped;
2536
0
    idx->z.last_off = offset;
2537
0
    idx->z.last_coor = beg;
2538
0
    return 0;
2539
0
}
2540
2541
// Needed for TBI only.  Ensure 'tid' with 'name' is in the index meta data.
2542
// idx->meta needs to have been initialised first with an appropriate Tabix
2543
// configuration via hts_idx_set_meta.
2544
//
2545
// NB number of references (first 4 bytes of tabix header) aren't in
2546
// idx->meta, but held in idx->n instead.
2547
0
int hts_idx_tbi_name(hts_idx_t *idx, int tid, const char *name) {
2548
    // Horrid - we have to map incoming tid to a tbi alternative tid.
2549
    // This is because TBI counts tids by "covered" refs while everything
2550
    // else counts by Nth SQ/contig record in header.
2551
0
    if (tid == idx->last_tbi_tid || tid < 0 || !name)
2552
0
        return idx->tbi_n;
2553
2554
0
    uint32_t len = strlen(name)+1;
2555
0
    uint8_t *tmp = (uint8_t *)realloc(idx->meta, idx->l_meta + len);
2556
0
    if (!tmp)
2557
0
        return -1;
2558
2559
    // Append name
2560
0
    idx->meta = tmp;
2561
0
    strcpy((char *)idx->meta + idx->l_meta, name);
2562
0
    idx->l_meta += len;
2563
2564
    // Update seq length
2565
0
    u32_to_le(le_to_u32(idx->meta+24)+len, idx->meta+24);
2566
2567
0
    idx->last_tbi_tid = tid;
2568
0
    return ++idx->tbi_n;
2569
0
}
2570
2571
// When doing samtools index we have a read_bam / hts_idx_push(bgzf_tell())
2572
// loop.  idx->z.last_off is the previous bzgf_tell location, so we know
2573
// the location the current bam record started at as well as where it ends.
2574
//
2575
// When building an index on the fly via a write_bam / hts_idx_push loop,
2576
// this isn't quite identical as we may amend the virtual coord returned
2577
// by bgzf_tell to the start of a new block if the next bam struct doesn't
2578
// fit.  It's essentially the same thing, but for bit-identical indices
2579
// we need to amend the idx->z.last_off when we know we're starting a new
2580
// block.
2581
void hts_idx_amend_last(hts_idx_t *idx, uint64_t offset)
2582
0
{
2583
0
    idx->z.last_off = offset;
2584
0
}
2585
2586
void hts_idx_destroy(hts_idx_t *idx)
2587
44.8k
{
2588
44.8k
    khint_t k;
2589
44.8k
    int i;
2590
44.8k
    if (idx == 0) return;
2591
2592
    // For HTS_FMT_CRAI, idx actually points to a different type -- see sam.c
2593
0
    if (idx->fmt == HTS_FMT_CRAI) {
2594
0
        hts_cram_idx_t *cidx = (hts_cram_idx_t *) idx;
2595
0
        cram_index_free(cidx->cram);
2596
0
        free(cidx);
2597
0
        return;
2598
0
    }
2599
2600
0
    for (i = 0; i < idx->m; ++i) {
2601
0
        bidx_t *bidx = idx->bidx[i];
2602
0
        free(idx->lidx[i].offset);
2603
0
        if (bidx == 0) continue;
2604
0
        for (k = kh_begin(bidx); k != kh_end(bidx); ++k)
2605
0
            if (kh_exist(bidx, k))
2606
0
                free(kh_value(bidx, k).list);
2607
0
        kh_destroy(bin, bidx);
2608
0
    }
2609
0
    free(idx->bidx); free(idx->lidx); free(idx->meta);
2610
0
    free(idx);
2611
0
}
2612
2613
0
int hts_idx_fmt(hts_idx_t *idx) {
2614
0
    return idx->fmt;
2615
0
}
2616
2617
// The optimizer eliminates these ed_is_big() calls; still it would be good to
2618
// TODO Determine endianness at configure- or compile-time
2619
2620
static inline ssize_t HTS_RESULT_USED idx_write_int32(BGZF *fp, int32_t x)
2621
0
{
2622
0
    if (ed_is_big()) x = ed_swap_4(x);
2623
0
    return bgzf_write(fp, &x, sizeof x);
2624
0
}
2625
2626
static inline ssize_t HTS_RESULT_USED idx_write_uint32(BGZF *fp, uint32_t x)
2627
0
{
2628
0
    if (ed_is_big()) x = ed_swap_4(x);
2629
0
    return bgzf_write(fp, &x, sizeof x);
2630
0
}
2631
2632
static inline ssize_t HTS_RESULT_USED idx_write_uint64(BGZF *fp, uint64_t x)
2633
0
{
2634
0
    if (ed_is_big()) x = ed_swap_8(x);
2635
0
    return bgzf_write(fp, &x, sizeof x);
2636
0
}
2637
2638
static inline void swap_bins(bins_t *p)
2639
0
{
2640
0
    int i;
2641
0
    for (i = 0; i < p->n; ++i) {
2642
0
        ed_swap_8p(&p->list[i].u);
2643
0
        ed_swap_8p(&p->list[i].v);
2644
0
    }
2645
0
}
2646
2647
static int idx_save_core(const hts_idx_t *idx, BGZF *fp, int fmt)
2648
0
{
2649
0
    int32_t i, j;
2650
2651
0
    #define check(ret) if ((ret) < 0) return -1
2652
2653
    // VCF TBI/CSI only writes IDs for non-empty bins (ie covered references)
2654
    //
2655
    // NOTE: CSI meta is undefined in spec, so this code has an assumption
2656
    // that we're only using it for Tabix data.
2657
0
    int nids = idx->n;
2658
0
    if (idx->meta && idx->l_meta >= 4 && le_to_u32(idx->meta) == TBX_VCF) {
2659
0
        for (i = nids = 0; i < idx->n; ++i) {
2660
0
            if (idx->bidx[i])
2661
0
                nids++;
2662
0
        }
2663
0
    }
2664
0
    check(idx_write_int32(fp, nids));
2665
0
    if (fmt == HTS_FMT_TBI && idx->l_meta)
2666
0
        check(bgzf_write(fp, idx->meta, idx->l_meta));
2667
2668
0
    for (i = 0; i < idx->n; ++i) {
2669
0
        khint_t k;
2670
0
        bidx_t *bidx = idx->bidx[i];
2671
0
        lidx_t *lidx = &idx->lidx[i];
2672
2673
        // write binning index
2674
0
        if (nids == idx->n || bidx)
2675
0
            check(idx_write_int32(fp, bidx? kh_size(bidx) : 0));
2676
0
        if (bidx)
2677
0
            for (k = kh_begin(bidx); k != kh_end(bidx); ++k)
2678
0
                if (kh_exist(bidx, k)) {
2679
0
                    bins_t *p = &kh_value(bidx, k);
2680
0
                    check(idx_write_uint32(fp, kh_key(bidx, k)));
2681
0
                    if (fmt == HTS_FMT_CSI) check(idx_write_uint64(fp, p->loff));
2682
                    //int j;for(j=0;j<p->n;++j)fprintf(stderr,"%d,%llx,%d,%llx:%llx\n",kh_key(bidx,k),kh_val(bidx, k).loff,j,p->list[j].u,p->list[j].v);
2683
0
                    check(idx_write_int32(fp, p->n));
2684
0
                    for (j = 0; j < p->n; ++j) {
2685
                        //fprintf(stderr, "\t%ld\t%ld\n", p->list[j].u, p->list[j].v);
2686
0
                        check(idx_write_uint64(fp, p->list[j].u));
2687
0
                        check(idx_write_uint64(fp, p->list[j].v));
2688
0
                    }
2689
0
                }
2690
2691
        // write linear index
2692
0
        if (fmt != HTS_FMT_CSI) {
2693
0
            check(idx_write_int32(fp, lidx->n));
2694
0
            for (j = 0; j < lidx->n; ++j)
2695
0
                check(idx_write_uint64(fp, lidx->offset[j]));
2696
0
        }
2697
0
    }
2698
2699
0
    check(idx_write_uint64(fp, idx->n_no_coor));
2700
#ifdef DEBUG_INDEX
2701
    idx_dump(idx);
2702
#endif
2703
2704
0
    return 0;
2705
0
    #undef check
2706
0
}
2707
2708
int hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt)
2709
0
{
2710
0
    int ret, save;
2711
0
    if (idx == NULL || fn == NULL) { errno = EINVAL; return -1; }
2712
0
    char *fnidx = (char*)calloc(1, strlen(fn) + 5);
2713
0
    if (fnidx == NULL) return -1;
2714
2715
0
    strcpy(fnidx, fn);
2716
0
    switch (fmt) {
2717
0
    case HTS_FMT_BAI: strcat(fnidx, ".bai"); break;
2718
0
    case HTS_FMT_CSI: strcat(fnidx, ".csi"); break;
2719
0
    case HTS_FMT_TBI: strcat(fnidx, ".tbi"); break;
2720
0
    default: abort();
2721
0
    }
2722
2723
0
    ret = hts_idx_save_as(idx, fn, fnidx, fmt);
2724
0
    save = errno;
2725
0
    free(fnidx);
2726
0
    errno = save;
2727
0
    return ret;
2728
0
}
2729
2730
int hts_idx_save_as(const hts_idx_t *idx, const char *fn, const char *fnidx, int fmt)
2731
0
{
2732
0
    BGZF *fp;
2733
2734
0
    #define check(ret) if ((ret) < 0) goto fail
2735
2736
0
    if (fnidx == NULL) return hts_idx_save(idx, fn, fmt);
2737
2738
0
    fp = bgzf_open(fnidx, (fmt == HTS_FMT_BAI)? "wu" : "w");
2739
0
    if (fp == NULL) return -1;
2740
2741
0
    if (fmt == HTS_FMT_CSI) {
2742
0
        check(bgzf_write(fp, "CSI\1", 4));
2743
0
        check(idx_write_int32(fp, idx->min_shift));
2744
0
        check(idx_write_int32(fp, idx->n_lvls));
2745
0
        check(idx_write_uint32(fp, idx->l_meta));
2746
0
        if (idx->l_meta) check(bgzf_write(fp, idx->meta, idx->l_meta));
2747
0
    } else if (fmt == HTS_FMT_TBI) {
2748
0
        check(bgzf_write(fp, "TBI\1", 4));
2749
0
    } else if (fmt == HTS_FMT_BAI) {
2750
0
        check(bgzf_write(fp, "BAI\1", 4));
2751
0
    } else abort();
2752
2753
0
    check(idx_save_core(idx, fp, fmt));
2754
2755
0
    return bgzf_close(fp);
2756
0
    #undef check
2757
2758
0
fail:
2759
0
    bgzf_close(fp);
2760
0
    return -1;
2761
0
}
2762
2763
static int idx_read_core(hts_idx_t *idx, BGZF *fp, int fmt)
2764
0
{
2765
0
    int32_t i, n, is_be;
2766
0
    is_be = ed_is_big();
2767
0
    if (idx == NULL) return -4;
2768
0
    for (i = 0; i < idx->n; ++i) {
2769
0
        bidx_t *h;
2770
0
        lidx_t *l = &idx->lidx[i];
2771
0
        uint32_t key;
2772
0
        int j, absent;
2773
0
        bins_t *p;
2774
0
        h = idx->bidx[i] = kh_init(bin);
2775
0
        if (bgzf_read(fp, &n, 4) != 4) return -1;
2776
0
        if (is_be) ed_swap_4p(&n);
2777
0
        if (n < 0) return -3;
2778
0
        for (j = 0; j < n; ++j) {
2779
0
            khint_t k;
2780
0
            if (bgzf_read(fp, &key, 4) != 4) return -1;
2781
0
            if (is_be) ed_swap_4p(&key);
2782
0
            k = kh_put(bin, h, key, &absent);
2783
0
            if (absent <  0) return -2; // No memory
2784
0
            if (absent == 0) return -3; // Duplicate bin number
2785
0
            p = &kh_val(h, k);
2786
0
            if (fmt == HTS_FMT_CSI) {
2787
0
                if (bgzf_read(fp, &p->loff, 8) != 8) return -1;
2788
0
                if (is_be) ed_swap_8p(&p->loff);
2789
0
            } else p->loff = 0;
2790
0
            if (bgzf_read(fp, &p->n, 4) != 4) return -1;
2791
0
            if (is_be) ed_swap_4p(&p->n);
2792
0
            if (p->n < 0) return -3;
2793
0
            if ((size_t) p->n > SIZE_MAX / sizeof(hts_pair64_t)) return -2;
2794
0
            p->m = p->n;
2795
0
            p->list = (hts_pair64_t*)malloc(p->m * sizeof(hts_pair64_t));
2796
0
            if (p->list == NULL) return -2;
2797
0
            if (bgzf_read(fp, p->list, ((size_t) p->n)<<4) != ((size_t) p->n)<<4) return -1;
2798
0
            if (is_be) swap_bins(p);
2799
0
        }
2800
0
        if (fmt != HTS_FMT_CSI) { // load linear index
2801
0
            int j, k;
2802
0
            uint32_t x;
2803
0
            if (bgzf_read(fp, &x, 4) != 4) return -1;
2804
0
            if (is_be) ed_swap_4p(&x);
2805
0
            l->n = x;
2806
0
            if (l->n < 0) return -3;
2807
0
            if ((size_t) l->n > SIZE_MAX / sizeof(uint64_t)) return -2;
2808
0
            l->m = l->n;
2809
0
            l->offset = (uint64_t*)malloc(l->n * sizeof(uint64_t));
2810
0
            if (l->offset == NULL) return -2;
2811
0
            if (bgzf_read(fp, l->offset, l->n << 3) != l->n << 3) return -1;
2812
0
            if (is_be) for (j = 0; j < l->n; ++j) ed_swap_8p(&l->offset[j]);
2813
0
            for (k = j = 0; j < l->n && l->offset[j] == 0; k = ++j); // stop at the first non-zero entry
2814
0
            for (j = l->n-1; j > k; j--) // fill missing values; may happen given older samtools and tabix
2815
0
                if (l->offset[j-1] == 0) l->offset[j-1] = l->offset[j];
2816
0
            update_loff(idx, i, 0);
2817
0
        }
2818
0
    }
2819
0
    if (bgzf_read(fp, &idx->n_no_coor, 8) != 8) idx->n_no_coor = 0;
2820
0
    if (is_be) ed_swap_8p(&idx->n_no_coor);
2821
#ifdef DEBUG_INDEX
2822
    idx_dump(idx);
2823
#endif
2824
2825
0
    return 0;
2826
0
}
2827
2828
static hts_idx_t *idx_read(const char *fn)
2829
0
{
2830
0
    uint8_t magic[4];
2831
0
    int i, is_be;
2832
0
    hts_idx_t *idx = NULL;
2833
0
    uint8_t *meta = NULL;
2834
0
    BGZF *fp = bgzf_open(fn, "r");
2835
0
    if (fp == NULL) return NULL;
2836
0
    is_be = ed_is_big();
2837
0
    if (bgzf_read(fp, magic, 4) != 4) goto fail;
2838
2839
0
    if (memcmp(magic, "CSI\1", 4) == 0) {
2840
0
        uint32_t x[3], n;
2841
0
        if (bgzf_read(fp, x, 12) != 12) goto fail;
2842
0
        if (is_be) for (i = 0; i < 3; ++i) ed_swap_4p(&x[i]);
2843
0
        if (x[2]) {
2844
0
            if (SIZE_MAX - x[2] < 1) goto fail; // Prevent possible overflow
2845
0
            if ((meta = (uint8_t*)malloc((size_t) x[2] + 1)) == NULL) goto fail;
2846
0
            if (bgzf_read(fp, meta, x[2]) != x[2]) goto fail;
2847
            // Prevent possible strlen past the end in tbx_index_load2
2848
0
            meta[x[2]] = '\0';
2849
0
        }
2850
0
        if (bgzf_read(fp, &n, 4) != 4) goto fail;
2851
0
        if (is_be) ed_swap_4p(&n);
2852
0
        if (n > INT32_MAX) goto fail;
2853
0
        if ((idx = hts_idx_init(n, HTS_FMT_CSI, 0, x[0], x[1])) == NULL) goto fail;
2854
0
        idx->l_meta = x[2];
2855
0
        idx->meta = meta;
2856
0
        meta = NULL;
2857
0
        if (idx_read_core(idx, fp, HTS_FMT_CSI) < 0) goto fail;
2858
0
    }
2859
0
    else if (memcmp(magic, "TBI\1", 4) == 0) {
2860
0
        uint8_t x[8 * 4];
2861
0
        uint32_t n;
2862
        // Read file header
2863
0
        if (bgzf_read(fp, x, sizeof(x)) != sizeof(x)) goto fail;
2864
0
        n = le_to_u32(&x[0]); // location of n_ref
2865
0
        if (n > INT32_MAX) goto fail;
2866
0
        if ((idx = hts_idx_init(n, HTS_FMT_TBI, 0, 14, 5)) == NULL) goto fail;
2867
0
        n = le_to_u32(&x[7*4]); // location of l_nm
2868
0
        if (n > UINT32_MAX - 29) goto fail; // Prevent possible overflow
2869
0
        idx->l_meta = 28 + n;
2870
0
        if ((idx->meta = (uint8_t*)malloc(idx->l_meta + 1)) == NULL) goto fail;
2871
        // copy format, col_seq, col_beg, col_end, meta, skip, l_nm
2872
        // N.B. left in little-endian byte order.
2873
0
        memcpy(idx->meta, &x[1*4], 28);
2874
        // Read in sequence names.
2875
0
        if (bgzf_read(fp, idx->meta + 28, n) != n) goto fail;
2876
        // Prevent possible strlen past the end in tbx_index_load2
2877
0
        idx->meta[idx->l_meta] = '\0';
2878
0
        if (idx_read_core(idx, fp, HTS_FMT_TBI) < 0) goto fail;
2879
0
    }
2880
0
    else if (memcmp(magic, "BAI\1", 4) == 0) {
2881
0
        uint32_t n;
2882
0
        if (bgzf_read(fp, &n, 4) != 4) goto fail;
2883
0
        if (is_be) ed_swap_4p(&n);
2884
0
        if (n > INT32_MAX) goto fail;
2885
0
        if ((idx = hts_idx_init(n, HTS_FMT_BAI, 0, 14, 5)) == NULL) goto fail;
2886
0
        if (idx_read_core(idx, fp, HTS_FMT_BAI) < 0) goto fail;
2887
0
    }
2888
0
    else { errno = EINVAL; goto fail; }
2889
2890
0
    bgzf_close(fp);
2891
0
    return idx;
2892
2893
0
fail:
2894
0
    bgzf_close(fp);
2895
0
    hts_idx_destroy(idx);
2896
0
    free(meta);
2897
0
    return NULL;
2898
0
}
2899
2900
int hts_idx_set_meta(hts_idx_t *idx, uint32_t l_meta, uint8_t *meta,
2901
                      int is_copy)
2902
0
{
2903
0
    uint8_t *new_meta = meta;
2904
0
    if (is_copy) {
2905
0
        size_t l = l_meta;
2906
0
        if (l > SIZE_MAX - 1) {
2907
0
            errno = ENOMEM;
2908
0
            return -1;
2909
0
        }
2910
0
        new_meta = malloc(l + 1);
2911
0
        if (!new_meta) return -1;
2912
0
        memcpy(new_meta, meta, l);
2913
        // Prevent possible strlen past the end in tbx_index_load2
2914
0
        new_meta[l] = '\0';
2915
0
    }
2916
0
    if (idx->meta) free(idx->meta);
2917
0
    idx->l_meta = l_meta;
2918
0
    idx->meta = new_meta;
2919
0
    return 0;
2920
0
}
2921
2922
uint8_t *hts_idx_get_meta(hts_idx_t *idx, uint32_t *l_meta)
2923
0
{
2924
0
    *l_meta = idx->l_meta;
2925
0
    return idx->meta;
2926
0
}
2927
2928
const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr)
2929
0
{
2930
0
    if ( !idx || !idx->n )
2931
0
    {
2932
0
        *n = 0;
2933
0
        return NULL;
2934
0
    }
2935
2936
0
    int tid = 0, i;
2937
0
    const char **names = (const char**) calloc(idx->n,sizeof(const char*));
2938
0
    for (i=0; i<idx->n; i++)
2939
0
    {
2940
0
        bidx_t *bidx = idx->bidx[i];
2941
0
        if ( !bidx ) continue;
2942
0
        names[tid++] = getid(hdr,i);
2943
0
    }
2944
0
    *n = tid;
2945
0
    return names;
2946
0
}
2947
2948
0
int hts_idx_nseq(const hts_idx_t *idx) {
2949
0
    if (!idx) return -1;
2950
0
    return idx->n;
2951
0
}
2952
2953
int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped, uint64_t* unmapped)
2954
0
{
2955
0
    if (!idx) return -1;
2956
0
    if ( idx->fmt == HTS_FMT_CRAI ) {
2957
0
        *mapped = 0; *unmapped = 0;
2958
0
        return -1;
2959
0
    }
2960
2961
0
    bidx_t *h = idx->bidx[tid];
2962
0
    if (!h) return -1;
2963
0
    khint_t k = kh_get(bin, h, META_BIN(idx));
2964
0
    if (k != kh_end(h)) {
2965
0
        *mapped = kh_val(h, k).list[1].u;
2966
0
        *unmapped = kh_val(h, k).list[1].v;
2967
0
        return 0;
2968
0
    } else {
2969
0
        *mapped = 0; *unmapped = 0;
2970
0
        return -1;
2971
0
    }
2972
0
}
2973
2974
uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx)
2975
0
{
2976
0
    if (idx->fmt == HTS_FMT_CRAI) return 0;
2977
0
    return idx->n_no_coor;
2978
0
}
2979
2980
/****************
2981
 *** Iterator ***
2982
 ****************/
2983
2984
// Note: even with 32-bit hts_pos_t, end needs to be 64-bit here due to 1LL<<s.
2985
static inline int reg2bins_narrow(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls, bidx_t *bidx)
2986
0
{
2987
0
    int l, t, s = min_shift + (n_lvls<<1) + n_lvls;
2988
0
    for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
2989
0
        hts_pos_t b, e;
2990
0
        int i;
2991
0
        b = t + (beg>>s); e = t + (end>>s);
2992
0
        for (i = b; i <= e; ++i) {
2993
0
            if (kh_get(bin, bidx, i) != kh_end(bidx)) {
2994
0
                assert(itr->bins.n < itr->bins.m);
2995
0
                itr->bins.a[itr->bins.n++] = i;
2996
0
            }
2997
0
        }
2998
0
    }
2999
0
    return itr->bins.n;
3000
0
}
3001
3002
static inline int reg2bins_wide(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls, bidx_t *bidx)
3003
0
{
3004
0
    khint_t i;
3005
0
    hts_pos_t max_shift = 3 * n_lvls + min_shift;
3006
0
    --end;
3007
0
    if (beg < 0) beg = 0;
3008
0
    for (i = kh_begin(bidx); i != kh_end(bidx); i++) {
3009
0
        if (!kh_exist(bidx, i)) continue;
3010
0
        hts_pos_t bin = (hts_pos_t) kh_key(bidx, i);
3011
0
        int level = hts_bin_level(bin);
3012
0
        if (level > n_lvls) continue; // Dodgy index?
3013
0
        hts_pos_t first = hts_bin_first(level);
3014
0
        hts_pos_t beg_at_level = first + (beg >> (max_shift - 3 * level));
3015
0
        hts_pos_t end_at_level = first + (end >> (max_shift - 3 * level));
3016
0
        if (beg_at_level <= bin && bin <= end_at_level) {
3017
0
            assert(itr->bins.n < itr->bins.m);
3018
0
            itr->bins.a[itr->bins.n++] = bin;
3019
0
        }
3020
0
    }
3021
0
    return itr->bins.n;
3022
0
}
3023
3024
static inline int reg2bins(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls, bidx_t *bidx)
3025
0
{
3026
0
    int l, t, s = min_shift + (n_lvls<<1) + n_lvls;
3027
0
    size_t reg_bin_count = 0, hash_bin_count = kh_n_buckets(bidx), max_bins;
3028
0
    hts_pos_t end1;
3029
0
    if (end >= 1LL<<s) end = 1LL<<s;
3030
0
    if (beg >= end) return 0;
3031
0
    end1 = end - 1;
3032
3033
    // Count bins to see if it's faster to iterate through the hash table
3034
    // or the set of bins covering the region
3035
0
    for (l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
3036
0
        reg_bin_count += (end1 >> s) - (beg >> s) + 1;
3037
0
    }
3038
0
    max_bins = reg_bin_count < kh_size(bidx) ? reg_bin_count : kh_size(bidx);
3039
0
    if (itr->bins.m - itr->bins.n < max_bins) {
3040
        // Worst-case memory usage.  May be wasteful on very sparse
3041
        // data, but the bin list usually won't be too big anyway.
3042
0
        size_t new_m = max_bins + itr->bins.n;
3043
0
        if (new_m > INT_MAX || new_m > SIZE_MAX / sizeof(int)) {
3044
0
            errno = ENOMEM;
3045
0
            return -1;
3046
0
        }
3047
0
        int *new_a = realloc(itr->bins.a, new_m * sizeof(*new_a));
3048
0
        if (!new_a) return -1;
3049
0
        itr->bins.a = new_a;
3050
0
        itr->bins.m = new_m;
3051
0
    }
3052
0
    if (reg_bin_count < hash_bin_count) {
3053
0
        return reg2bins_narrow(beg, end, itr, min_shift, n_lvls, bidx);
3054
0
    } else {
3055
0
        return reg2bins_wide(beg, end, itr, min_shift, n_lvls, bidx);
3056
0
    }
3057
0
}
3058
3059
static inline int add_to_interval(hts_itr_t *iter, bins_t *bin,
3060
                                  int tid, uint32_t interval,
3061
                                  uint64_t min_off, uint64_t max_off)
3062
0
{
3063
0
    hts_pair64_max_t *off;
3064
0
    int j;
3065
3066
0
    if (!bin->n)
3067
0
        return 0;
3068
0
    off = realloc(iter->off, (iter->n_off + bin->n) * sizeof(*off));
3069
0
    if (!off)
3070
0
        return -2;
3071
3072
0
    iter->off = off;
3073
0
    for (j = 0; j < bin->n; ++j) {
3074
0
        if (bin->list[j].v > min_off && bin->list[j].u < max_off) {
3075
0
            iter->off[iter->n_off].u = min_off > bin->list[j].u
3076
0
                ? min_off : bin->list[j].u;
3077
0
            iter->off[iter->n_off].v = max_off < bin->list[j].v
3078
0
                ? max_off : bin->list[j].v;
3079
            // hts_pair64_max_t::max is now used to link
3080
            // file offsets to region list entries.
3081
            // The iterator can use this to decide if it
3082
            // can skip some file regions.
3083
0
            iter->off[iter->n_off].max = ((uint64_t) tid << 32) | interval;
3084
0
            iter->n_off++;
3085
0
        }
3086
0
    }
3087
0
    return 0;
3088
0
}
3089
3090
static inline int reg2intervals_narrow(hts_itr_t *iter, const bidx_t *bidx,
3091
                                       int tid, int64_t beg, int64_t end,
3092
                                       uint32_t interval,
3093
                                       uint64_t min_off, uint64_t max_off,
3094
                                       int min_shift, int n_lvls)
3095
0
{
3096
0
    int l, t, s = min_shift + n_lvls * 3;
3097
0
    hts_pos_t b, e, i;
3098
3099
0
    for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
3100
0
        b = t + (beg>>s); e = t + (end>>s);
3101
0
        for (i = b; i <= e; ++i) {
3102
0
            khint_t k = kh_get(bin, bidx, i);
3103
0
            if (k != kh_end(bidx)) {
3104
0
                bins_t *bin = &kh_value(bidx, k);
3105
0
                int res = add_to_interval(iter, bin, tid, interval, min_off, max_off);
3106
0
                if (res < 0)
3107
0
                    return res;
3108
0
            }
3109
0
        }
3110
0
    }
3111
0
    return 0;
3112
0
}
3113
3114
static inline int reg2intervals_wide(hts_itr_t *iter, const bidx_t *bidx,
3115
                                       int tid, int64_t beg, int64_t end,
3116
                                       uint32_t interval,
3117
                                       uint64_t min_off, uint64_t max_off,
3118
                                       int min_shift, int n_lvls)
3119
0
{
3120
0
    khint_t i;
3121
0
    hts_pos_t max_shift = 3 * n_lvls + min_shift;
3122
0
    --end;
3123
0
    if (beg < 0) beg = 0;
3124
0
    for (i = kh_begin(bidx); i != kh_end(bidx); i++) {
3125
0
        if (!kh_exist(bidx, i)) continue;
3126
0
        hts_pos_t bin = (hts_pos_t) kh_key(bidx, i);
3127
0
        int level = hts_bin_level(bin);
3128
0
        if (level > n_lvls) continue; // Dodgy index?
3129
0
        hts_pos_t first = hts_bin_first(level);
3130
0
        hts_pos_t beg_at_level = first + (beg >> (max_shift - 3 * level));
3131
0
        hts_pos_t end_at_level = first + (end >> (max_shift - 3 * level));
3132
0
        if (beg_at_level <= bin && bin <= end_at_level) {
3133
0
            bins_t *bin = &kh_value(bidx, i);
3134
0
            int res = add_to_interval(iter, bin, tid, interval, min_off, max_off);
3135
0
            if (res < 0)
3136
0
                return res;
3137
0
        }
3138
0
    }
3139
0
    return 0;
3140
0
}
3141
3142
static inline int reg2intervals(hts_itr_t *iter, const hts_idx_t *idx, int tid, int64_t beg, int64_t end, uint32_t interval, uint64_t min_off, uint64_t max_off, int min_shift, int n_lvls)
3143
0
{
3144
0
    int l, t, s;
3145
0
    int i, j;
3146
0
    hts_pos_t end1;
3147
0
    bidx_t *bidx;
3148
0
    int start_n_off;
3149
0
    size_t reg_bin_count = 0, hash_bin_count;
3150
0
    int res;
3151
3152
0
    if (!iter || !idx || (bidx = idx->bidx[tid]) == NULL || beg >= end)
3153
0
        return -1;
3154
3155
0
    hash_bin_count = kh_n_buckets(bidx);
3156
3157
0
    s = min_shift + (n_lvls<<1) + n_lvls;
3158
0
    if (end >= 1LL<<s)
3159
0
        end = 1LL<<s;
3160
3161
0
    end1 = end - 1;
3162
    // Count bins to see if it's faster to iterate through the hash table
3163
    // or the set of bins covering the region
3164
0
    for (l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
3165
0
        reg_bin_count += (end1 >> s) - (beg >> s) + 1;
3166
0
    }
3167
3168
0
    start_n_off = iter->n_off;
3169
3170
    // Populate iter->off with the intervals for this region
3171
0
    if (reg_bin_count < hash_bin_count) {
3172
0
        res = reg2intervals_narrow(iter, bidx, tid, beg, end, interval,
3173
0
                                   min_off, max_off, min_shift, n_lvls);
3174
0
    } else {
3175
0
        res = reg2intervals_wide(iter, bidx, tid, beg, end, interval,
3176
0
                                 min_off, max_off, min_shift, n_lvls);
3177
0
    }
3178
0
    if (res < 0)
3179
0
        return res;
3180
3181
0
    if (iter->n_off - start_n_off > 1) {
3182
0
        ks_introsort(_off_max, iter->n_off - start_n_off, iter->off + start_n_off);
3183
0
        for (i = start_n_off, j = start_n_off + 1; j < iter->n_off; j++) {
3184
0
            if (iter->off[i].v >= iter->off[j].u) {
3185
0
                if (iter->off[i].v < iter->off[j].v)
3186
0
                    iter->off[i].v = iter->off[j].v;
3187
0
            } else {
3188
0
                i++;
3189
0
                if (i < j)
3190
0
                    iter->off[i] = iter->off[j];
3191
0
            }
3192
0
        }
3193
0
        iter->n_off = i + 1;
3194
0
    }
3195
3196
0
    return iter->n_off;
3197
0
}
3198
3199
0
static int compare_regions(const void *r1, const void *r2) {
3200
0
    hts_reglist_t *reg1 = (hts_reglist_t *)r1;
3201
0
    hts_reglist_t *reg2 = (hts_reglist_t *)r2;
3202
3203
0
    if (reg1->tid < 0 && reg2->tid >= 0)
3204
0
        return 1;
3205
0
    else if (reg1->tid >= 0 && reg2->tid < 0)
3206
0
        return -1;
3207
0
    else
3208
0
        return reg1->tid - reg2->tid;
3209
0
}
3210
3211
0
uint64_t hts_itr_off(const hts_idx_t* idx, int tid) {
3212
3213
0
    int i;
3214
0
    bidx_t* bidx;
3215
0
    uint64_t off0 = (uint64_t) -1;
3216
0
    khint_t k;
3217
0
    switch (tid) {
3218
0
    case HTS_IDX_START:
3219
        // Find the smallest offset, note that sequence ids may not be ordered sequentially
3220
0
        for (i = 0; i < idx->n; i++) {
3221
0
            bidx = idx->bidx[i];
3222
0
            k = kh_get(bin, bidx, META_BIN(idx));
3223
0
            if (k == kh_end(bidx))
3224
0
                continue;
3225
3226
0
            if (off0 > kh_val(bidx, k).list[0].u)
3227
0
                off0 = kh_val(bidx, k).list[0].u;
3228
0
        }
3229
0
        if (off0 == (uint64_t) -1 && idx->n_no_coor)
3230
0
            off0 = 0;
3231
        // only no-coor reads in this bam
3232
0
        break;
3233
0
    case HTS_IDX_NOCOOR:
3234
        /* No-coor reads sort after all of the mapped reads.  The position
3235
           is not stored in the index itself, so need to find the end
3236
           offset for the last mapped read.  A loop is needed here in
3237
           case references at the end of the file have no mapped reads,
3238
           or sequence ids are not ordered sequentially.
3239
           See issue samtools#568 and commits b2aab8, 60c22d and cc207d. */
3240
0
        for (i = 0; i < idx->n; i++) {
3241
0
            bidx = idx->bidx[i];
3242
0
            k = kh_get(bin, bidx, META_BIN(idx));
3243
0
            if (k != kh_end(bidx)) {
3244
0
                if (off0 == (uint64_t) -1 || off0 < kh_val(bidx, k).list[0].v) {
3245
0
                    off0 = kh_val(bidx, k).list[0].v;
3246
0
                }
3247
0
            }
3248
0
        }
3249
0
        if (off0 == (uint64_t) -1 && idx->n_no_coor)
3250
0
            off0 = 0;
3251
        // only no-coor reads in this bam
3252
0
        break;
3253
0
    case HTS_IDX_REST:
3254
0
        off0 = 0;
3255
0
        break;
3256
0
    case HTS_IDX_NONE:
3257
0
        off0 = 0;
3258
0
        break;
3259
0
    }
3260
3261
0
    return off0;
3262
0
}
3263
3264
hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec)
3265
0
{
3266
0
    int i, n_off, l, bin;
3267
0
    hts_pair64_max_t *off;
3268
0
    khint_t k;
3269
0
    bidx_t *bidx;
3270
0
    uint64_t min_off, max_off;
3271
0
    hts_itr_t *iter;
3272
0
    uint32_t unmapped = 0, rel_off;
3273
3274
    // It's possible to call this function with NULL idx iff
3275
    // tid is one of the special values HTS_IDX_REST or HTS_IDX_NONE
3276
0
    if (!idx && !(tid == HTS_IDX_REST || tid == HTS_IDX_NONE)) {
3277
0
        errno = EINVAL;
3278
0
        return NULL;
3279
0
    }
3280
3281
0
    iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t));
3282
0
    if (iter) {
3283
0
        if (tid < 0) {
3284
0
            uint64_t off = hts_itr_off(idx, tid);
3285
0
            if (off != (uint64_t) -1) {
3286
0
                iter->read_rest = 1;
3287
0
                iter->curr_off = off;
3288
0
                iter->readrec = readrec;
3289
0
                if (tid == HTS_IDX_NONE)
3290
0
                    iter->finished = 1;
3291
0
            } else {
3292
0
                free(iter);
3293
0
                iter = NULL;
3294
0
            }
3295
0
        } else if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL) {
3296
0
            iter->finished = 1;
3297
0
        } else {
3298
0
            if (beg < 0) beg = 0;
3299
0
            if (end < beg) {
3300
0
              free(iter);
3301
0
              return NULL;
3302
0
            }
3303
3304
0
            k = kh_get(bin, bidx, META_BIN(idx));
3305
0
            if (k != kh_end(bidx))
3306
0
                unmapped = kh_val(bidx, k).list[1].v;
3307
0
            else
3308
0
                unmapped = 1;
3309
3310
0
            iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
3311
0
            iter->readrec = readrec;
3312
3313
0
            if ( !kh_size(bidx) ) { iter->finished = 1; return iter; }
3314
3315
0
            rel_off = beg>>idx->min_shift;
3316
            // compute min_off
3317
0
            bin = hts_bin_first(idx->n_lvls) + rel_off;
3318
0
            do {
3319
0
                int first;
3320
0
                k = kh_get(bin, bidx, bin);
3321
0
                if (k != kh_end(bidx)) break;
3322
0
                first = (hts_bin_parent(bin)<<3) + 1;
3323
0
                if (bin > first) --bin;
3324
0
                else bin = hts_bin_parent(bin);
3325
0
            } while (bin);
3326
0
            if (bin == 0) k = kh_get(bin, bidx, bin);
3327
0
            min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0;
3328
            // min_off can be calculated more accurately if the
3329
            // linear index is available
3330
0
            if (idx->lidx[tid].offset
3331
0
                && rel_off < idx->lidx[tid].n) {
3332
0
                if (min_off < idx->lidx[tid].offset[rel_off])
3333
0
                    min_off = idx->lidx[tid].offset[rel_off];
3334
0
                if (unmapped) {
3335
                    // unmapped reads are not covered by the linear index,
3336
                    // so search backwards for a smaller offset
3337
0
                    int tmp_off;
3338
0
                    for (tmp_off = rel_off-1; tmp_off >= 0; tmp_off--) {
3339
0
                        if (idx->lidx[tid].offset[tmp_off] < min_off) {
3340
0
                            min_off = idx->lidx[tid].offset[tmp_off];
3341
0
                            break;
3342
0
                        }
3343
0
                    }
3344
                    // if the search went too far back or no satisfactory entry
3345
                    // was found, revert to the bin index loff value
3346
0
                    if (k != kh_end(bidx) && (min_off < kh_val(bidx, k).loff || tmp_off < 0))
3347
0
                        min_off = kh_val(bidx, k).loff;
3348
0
                }
3349
0
            } else if (unmapped) { //CSI index
3350
0
                if (k != kh_end(bidx))
3351
0
                    min_off = kh_val(bidx, k).loff;
3352
0
            }
3353
3354
            // compute max_off: a virtual offset from a bin to the right of end
3355
            // First check if end lies within the range of the index (it won't
3356
            // if it's HTS_POS_MAX)
3357
0
            if (end < 1LL << (idx->min_shift + 3 * idx->n_lvls)) {
3358
0
                bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1;
3359
0
                if (bin >= idx->n_bins) bin = 0;
3360
0
                while (1) {
3361
                    // search for an extant bin by moving right, but moving up to the
3362
                    // parent whenever we get to a first child (which also covers falling
3363
                    // off the RHS, which wraps around and immediately goes up to bin 0)
3364
0
                    while (bin % 8 == 1) bin = hts_bin_parent(bin);
3365
0
                    if (bin == 0) { max_off = UINT64_MAX; break; }
3366
0
                    k = kh_get(bin, bidx, bin);
3367
0
                    if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) { max_off = kh_val(bidx, k).list[0].u; break; }
3368
0
                    bin++;
3369
0
                }
3370
0
            } else {
3371
                // Searching to end of reference
3372
0
                max_off = UINT64_MAX;
3373
0
            }
3374
3375
            // retrieve bins
3376
0
            if (reg2bins(beg, end, iter, idx->min_shift, idx->n_lvls, bidx) < 0) {
3377
0
                hts_itr_destroy(iter);
3378
0
                return NULL;
3379
0
            }
3380
3381
0
            for (i = n_off = 0; i < iter->bins.n; ++i)
3382
0
                if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx))
3383
0
                    n_off += kh_value(bidx, k).n;
3384
0
            if (n_off == 0) {
3385
                // No overlapping bins means the iterator has already finished.
3386
0
                iter->finished = 1;
3387
0
                return iter;
3388
0
            }
3389
0
            off = calloc(n_off, sizeof(*off));
3390
0
            for (i = n_off = 0; i < iter->bins.n; ++i) {
3391
0
                if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx)) {
3392
0
                    int j;
3393
0
                    bins_t *p = &kh_value(bidx, k);
3394
0
                    for (j = 0; j < p->n; ++j)
3395
0
                        if (p->list[j].v > min_off && p->list[j].u < max_off) {
3396
0
                            off[n_off].u = min_off > p->list[j].u
3397
0
                                ? min_off : p->list[j].u;
3398
0
                            off[n_off].v = max_off < p->list[j].v
3399
0
                                ? max_off : p->list[j].v;
3400
                            // hts_pair64_max_t::max is now used to link
3401
                            // file offsets to region list entries.
3402
                            // The iterator can use this to decide if it
3403
                            // can skip some file regions.
3404
0
                            off[n_off].max = ((uint64_t) tid << 32) | j;
3405
0
                            n_off++;
3406
0
                        }
3407
0
                }
3408
0
            }
3409
3410
0
            if (n_off == 0) {
3411
0
                free(off);
3412
0
                iter->finished = 1;
3413
0
                return iter;
3414
0
            }
3415
0
            ks_introsort(_off_max, n_off, off);
3416
            // resolve completely contained adjacent blocks
3417
0
            for (i = 1, l = 0; i < n_off; ++i)
3418
0
                if (off[l].v < off[i].v) off[++l] = off[i];
3419
0
            n_off = l + 1;
3420
            // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
3421
0
            for (i = 1; i < n_off; ++i)
3422
0
                if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
3423
            // merge adjacent blocks
3424
0
            for (i = 1, l = 0; i < n_off; ++i) {
3425
0
                if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
3426
0
                else off[++l] = off[i];
3427
0
            }
3428
0
            n_off = l + 1;
3429
0
            iter->n_off = n_off; iter->off = off;
3430
0
        }
3431
0
    }
3432
3433
0
    return iter;
3434
0
}
3435
3436
int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter)
3437
0
{
3438
0
    int i, j, bin;
3439
0
    khint_t k;
3440
0
    bidx_t *bidx;
3441
0
    uint64_t min_off, max_off, t_off = (uint64_t)-1;
3442
0
    int tid;
3443
0
    hts_pos_t beg, end;
3444
0
    hts_reglist_t *curr_reg;
3445
0
    uint32_t unmapped = 0, rel_off;
3446
3447
0
    if (!idx || !iter || !iter->multi)
3448
0
        return -1;
3449
3450
0
    iter->i = -1;
3451
0
    for (i=0; i<iter->n_reg; i++) {
3452
3453
0
        curr_reg = &iter->reg_list[i];
3454
0
        tid = curr_reg->tid;
3455
3456
0
        if (tid < 0) {
3457
0
            t_off = hts_itr_off(idx, tid);
3458
0
            if (t_off != (uint64_t)-1) {
3459
0
                switch (tid) {
3460
0
                case HTS_IDX_NONE:
3461
0
                    iter->finished = 1;
3462
                    // fall through
3463
0
                case HTS_IDX_START:
3464
0
                case HTS_IDX_REST:
3465
0
                    iter->curr_off = t_off;
3466
0
                    iter->n_reg = 0;
3467
0
                    iter->reg_list = NULL;
3468
0
                    iter->read_rest = 1;
3469
0
                    return 0;
3470
0
                case HTS_IDX_NOCOOR:
3471
0
                    iter->nocoor = 1;
3472
0
                    iter->nocoor_off = t_off;
3473
0
                }
3474
0
            }
3475
0
        } else {
3476
0
            if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL || !kh_size(bidx))
3477
0
                continue;
3478
3479
0
            k = kh_get(bin, bidx, META_BIN(idx));
3480
0
            if (k != kh_end(bidx))
3481
0
                unmapped = kh_val(bidx, k).list[1].v;
3482
0
            else
3483
0
                unmapped = 1;
3484
3485
0
            for(j=0; j<curr_reg->count; j++) {
3486
0
                hts_pair32_t *curr_intv = &curr_reg->intervals[j];
3487
0
                if (curr_intv->end < curr_intv->beg)
3488
0
                    continue;
3489
3490
0
                beg = curr_intv->beg;
3491
0
                end = curr_intv->end;
3492
0
                rel_off = beg>>idx->min_shift;
3493
3494
                /* Compute 'min_off' by searching the lowest level bin containing 'beg'.
3495
                       If the computed bin is not in the index, try the next bin to the
3496
                       left, belonging to the same parent. If it is the first sibling bin,
3497
                       try the parent bin. */
3498
0
                bin = hts_bin_first(idx->n_lvls) + rel_off;
3499
0
                do {
3500
0
                    int first;
3501
0
                    k = kh_get(bin, bidx, bin);
3502
0
                    if (k != kh_end(bidx)) break;
3503
0
                    first = (hts_bin_parent(bin)<<3) + 1;
3504
0
                    if (bin > first) --bin;
3505
0
                    else bin = hts_bin_parent(bin);
3506
0
                } while (bin);
3507
0
                if (bin == 0)
3508
0
                    k = kh_get(bin, bidx, bin);
3509
0
                min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0;
3510
                // min_off can be calculated more accurately if the
3511
                // linear index is available
3512
0
                if (idx->lidx[tid].offset
3513
0
                    && rel_off < idx->lidx[tid].n) {
3514
0
                    if (min_off < idx->lidx[tid].offset[rel_off])
3515
0
                        min_off = idx->lidx[tid].offset[rel_off];
3516
0
                    if (unmapped) {
3517
0
                        int tmp_off;
3518
0
                        for (tmp_off = rel_off-1; tmp_off >= 0; tmp_off--) {
3519
0
                            if (idx->lidx[tid].offset[tmp_off] < min_off) {
3520
0
                                min_off = idx->lidx[tid].offset[tmp_off];
3521
0
                                break;
3522
0
                            }
3523
0
                        }
3524
3525
0
                        if (k != kh_end(bidx) && (min_off < kh_val(bidx, k).loff || tmp_off < 0))
3526
0
                            min_off = kh_val(bidx, k).loff;
3527
0
                    }
3528
0
                } else if (unmapped) { //CSI index
3529
0
                    if (k != kh_end(bidx))
3530
0
                        min_off = kh_val(bidx, k).loff;
3531
0
                }
3532
3533
                // compute max_off: a virtual offset from a bin to the right of end
3534
                // First check if end lies within the range of the index (it
3535
                // won't if it's HTS_POS_MAX)
3536
0
                if (end < 1LL << (idx->min_shift + 3 * idx->n_lvls)) {
3537
0
                    bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1;
3538
0
                    if (bin >= idx->n_bins) bin = 0;
3539
0
                    while (1) {
3540
                        // search for an extant bin by moving right, but moving up to the
3541
                        // parent whenever we get to a first child (which also covers falling
3542
                        // off the RHS, which wraps around and immediately goes up to bin 0)
3543
0
                        while (bin % 8 == 1) bin = hts_bin_parent(bin);
3544
0
                        if (bin == 0) { max_off = UINT64_MAX; break; }
3545
0
                        k = kh_get(bin, bidx, bin);
3546
0
                        if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) {
3547
0
                            max_off = kh_val(bidx, k).list[0].u;
3548
0
                            break;
3549
0
                        }
3550
0
                        bin++;
3551
0
                    }
3552
0
                } else {
3553
                    // Searching to end of reference
3554
0
                    max_off = UINT64_MAX;
3555
0
                }
3556
3557
                //convert coordinates to file offsets
3558
0
                if (reg2intervals(iter, idx, tid, beg, end, j,
3559
0
                                  min_off, max_off,
3560
0
                                  idx->min_shift, idx->n_lvls) < 0) {
3561
0
                    return -1;
3562
0
                }
3563
0
            }
3564
0
        }
3565
0
    }
3566
3567
0
    if (iter->n_off > 1)
3568
0
        ks_introsort(_off_max, iter->n_off, iter->off);
3569
3570
0
    if(!iter->n_off && !iter->nocoor)
3571
0
        iter->finished = 1;
3572
3573
0
    return 0;
3574
0
}
3575
3576
int hts_itr_multi_cram(const hts_idx_t *idx, hts_itr_t *iter)
3577
0
{
3578
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
3579
0
    int tid, i, n_off = 0;
3580
0
    uint32_t j;
3581
0
    hts_pos_t beg, end;
3582
0
    hts_reglist_t *curr_reg;
3583
0
    hts_pair32_t *curr_intv;
3584
0
    hts_pair64_max_t *off = NULL, *tmp;
3585
0
    cram_index *e = NULL;
3586
3587
0
    if (!cidx || !iter || !iter->multi)
3588
0
        return -1;
3589
3590
0
    iter->is_cram = 1;
3591
0
    iter->read_rest = 0;
3592
0
    iter->off = NULL;
3593
0
    iter->n_off = 0;
3594
0
    iter->curr_off = 0;
3595
0
    iter->i = -1;
3596
3597
0
    for (i=0; i<iter->n_reg; i++) {
3598
3599
0
        curr_reg = &iter->reg_list[i];
3600
0
        tid = curr_reg->tid;
3601
3602
0
        if (tid >= 0) {
3603
0
            tmp = realloc(off, (n_off + curr_reg->count) * sizeof(*off));
3604
0
            if (!tmp)
3605
0
                goto err;
3606
0
            off = tmp;
3607
3608
0
            for (j=0; j < curr_reg->count; j++) {
3609
0
                curr_intv = &curr_reg->intervals[j];
3610
0
                if (curr_intv->end < curr_intv->beg)
3611
0
                    continue;
3612
3613
0
                beg = curr_intv->beg;
3614
0
                end = curr_intv->end;
3615
3616
/* First, fetch the container overlapping 'beg' and assign its file offset to u, then
3617
 * find the container overlapping 'end' and assign the relative end of the slice to v.
3618
 * The cram_ptell function will adjust with the container offset, which is not stored
3619
 * in the index.
3620
 */
3621
0
                e = cram_index_query(cidx->cram, tid, beg+1, NULL);
3622
0
                if (e) {
3623
0
                    off[n_off].u = e->offset;
3624
                    // hts_pair64_max_t::max is now used to link
3625
                    // file offsets to region list entries.
3626
                    // The iterator can use this to decide if it
3627
                    // can skip some file regions.
3628
0
                    off[n_off].max = ((uint64_t) tid << 32) | j;
3629
3630
0
                    if (end >= HTS_POS_MAX) {
3631
0
                       e = cram_index_last(cidx->cram, tid, NULL);
3632
0
                    } else {
3633
0
                       e = cram_index_query_last(cidx->cram, tid, end+1);
3634
0
                    }
3635
3636
0
                    if (e) {
3637
0
                        off[n_off++].v = e->e_next
3638
0
                            ? e->e_next->offset
3639
0
                            : e->offset + e->slice + e->len;
3640
0
                    } else {
3641
0
                        hts_log_warning("Could not set offset end for region %d:%"PRIhts_pos"-%"PRIhts_pos". Skipping", tid, beg, end);
3642
0
                    }
3643
0
                }
3644
0
            }
3645
0
        } else {
3646
0
            switch (tid) {
3647
0
                case HTS_IDX_NOCOOR:
3648
0
                    e = cram_index_query(cidx->cram, tid, 1, NULL);
3649
0
                    if (e) {
3650
0
                        iter->nocoor = 1;
3651
0
                        iter->nocoor_off = e->offset;
3652
0
                    } else {
3653
0
                        hts_log_warning("No index entry for NOCOOR region");
3654
0
                    }
3655
0
                    break;
3656
0
                case HTS_IDX_START:
3657
0
                    e = cram_index_query(cidx->cram, tid, 1, NULL);
3658
0
                    if (e) {
3659
0
                        iter->read_rest = 1;
3660
0
                        tmp = realloc(off, sizeof(*off));
3661
0
                        if (!tmp)
3662
0
                            goto err;
3663
0
                        off = tmp;
3664
0
                        off[0].u = e->offset;
3665
0
                        off[0].v = 0;
3666
0
                        n_off=1;
3667
0
                    } else {
3668
0
                        hts_log_warning("No index entries");
3669
0
                    }
3670
0
                    break;
3671
0
                case HTS_IDX_REST:
3672
0
                    break;
3673
0
                case HTS_IDX_NONE:
3674
0
                    iter->finished = 1;
3675
0
                    break;
3676
0
                default:
3677
0
                    hts_log_error("Query with tid=%d not implemented for CRAM files", tid);
3678
0
            }
3679
0
        }
3680
0
    }
3681
3682
0
    if (n_off) {
3683
0
        ks_introsort(_off_max, n_off, off);
3684
0
        iter->n_off = n_off; iter->off = off;
3685
0
    }
3686
3687
0
    if(!n_off && !iter->nocoor)
3688
0
        iter->finished = 1;
3689
3690
0
    return 0;
3691
3692
0
 err:
3693
0
    free(off);
3694
0
    return -1;
3695
0
}
3696
3697
void hts_itr_destroy(hts_itr_t *iter)
3698
0
{
3699
0
    if (iter) {
3700
0
        if (iter->multi) {
3701
0
            hts_reglist_free(iter->reg_list, iter->n_reg);
3702
0
        } else {
3703
0
            free(iter->bins.a);
3704
0
        }
3705
3706
0
        if (iter->off)
3707
0
            free(iter->off);
3708
0
        free(iter);
3709
0
    }
3710
0
}
3711
3712
static inline long long push_digit(long long i, char c)
3713
0
{
3714
    // ensure subtraction occurs first, avoiding overflow for >= MAX-48 or so
3715
0
    int digit = c - '0';
3716
0
    return 10 * i + digit;
3717
0
}
3718
3719
long long hts_parse_decimal(const char *str, char **strend, int flags)
3720
0
{
3721
0
    long long n = 0;
3722
0
    int digits = 0, decimals = 0, e = 0, lost = 0;
3723
0
    char sign = '+', esign = '+';
3724
0
    const char *s, *str_orig = str;
3725
3726
0
    while (isspace_c(*str)) str++;
3727
0
    s = str;
3728
3729
0
    if (*s == '+' || *s == '-') sign = *s++;
3730
0
    while (*s)
3731
0
        if (isdigit_c(*s)) digits++, n = push_digit(n, *s++);
3732
0
        else if (*s == ',' && (flags & HTS_PARSE_THOUSANDS_SEP)) s++;
3733
0
        else break;
3734
3735
0
    if (*s == '.') {
3736
0
        s++;
3737
0
        while (isdigit_c(*s)) decimals++, digits++, n = push_digit(n, *s++);
3738
0
    }
3739
3740
0
    switch (*s) {
3741
0
    case 'e': case 'E':
3742
0
        s++;
3743
0
        if (*s == '+' || *s == '-') esign = *s++;
3744
0
        while (isdigit_c(*s)) e = push_digit(e, *s++);
3745
0
        if (esign == '-') e = -e;
3746
0
        break;
3747
3748
0
    case 'k': case 'K': e += 3; s++; break;
3749
0
    case 'm': case 'M': e += 6; s++; break;
3750
0
    case 'g': case 'G': e += 9; s++; break;
3751
0
    }
3752
3753
0
    e -= decimals;
3754
0
    while (e > 0) n *= 10, e--;
3755
0
    while (e < 0) lost += n % 10, n /= 10, e++;
3756
3757
0
    if (lost > 0) {
3758
0
        hts_log_warning("Discarding fractional part of %.*s", (int)(s - str), str);
3759
0
    }
3760
3761
0
    if (strend) {
3762
        // Set to the original input str pointer if not valid number syntax
3763
0
        *strend = (digits > 0)? (char *)s : (char *)str_orig;
3764
0
    } else if (digits == 0) {
3765
0
        hts_log_warning("Invalid numeric value %.8s[truncated]", str);
3766
0
    } else if (*s) {
3767
0
        if ((flags & HTS_PARSE_THOUSANDS_SEP) || (!(flags & HTS_PARSE_THOUSANDS_SEP) && *s != ','))
3768
0
            hts_log_warning("Ignoring unknown characters after %.*s[%s]", (int)(s - str), str, s);
3769
0
    }
3770
3771
0
    return (sign == '+')? n : -n;
3772
0
}
3773
3774
0
static void *hts_memrchr(const void *s, int c, size_t n) {
3775
0
    size_t i;
3776
0
    unsigned char *u = (unsigned char *)s;
3777
0
    for (i = n; i > 0; i--) {
3778
0
        if (u[i-1] == c)
3779
0
            return u+i-1;
3780
0
    }
3781
3782
0
    return NULL;
3783
0
}
3784
3785
/*
3786
 * A variant of hts_parse_reg which is reference-id aware.  It uses
3787
 * the iterator name2id callbacks to validate the region tokenisation works.
3788
 *
3789
 * This is necessary due to GRCh38 HLA additions which have reference names
3790
 * like "HLA-DRB1*12:17".
3791
 *
3792
 * All parameters are mandatory.
3793
 *
3794
 * To work around ambiguous parsing issues, eg both "chr1" and "chr1:100-200"
3795
 * are reference names, we may quote using curly braces.
3796
 * Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example.
3797
 *
3798
 * Flags are used to control how parsing works, and can be one of the below.
3799
 *
3800
 * HTS_PARSE_LIST:
3801
 *     If present, the region is assmed to be a comma separated list and
3802
 *     position parsing will not contain commas (this implicitly
3803
 *     clears HTS_PARSE_THOUSANDS_SEP in the call to hts_parse_decimal).
3804
 *     On success the return pointer will be the start of the next region, ie
3805
 *     the character after the comma.  (If *ret != '\0' then the caller can
3806
 *     assume another region is present in the list.)
3807
 *
3808
 *     If not set then positions may contain commas.  In this case the return
3809
 *     value should point to the end of the string, or NULL on failure.
3810
 *
3811
 * HTS_PARSE_ONE_COORD:
3812
 *     If present, X:100 is treated as the single base pair region X:100-100.
3813
 *     In this case X:-100 is shorthand for X:1-100 and X:100- is X:100-<end>.
3814
 *     (This is the standard bcftools region convention.)
3815
 *
3816
 *     When not set X:100 is considered to be X:100-<end> where <end> is
3817
 *     the end of chromosome X (set to HTS_POS_MAX here).  X:100- and X:-100
3818
 *     are invalid.
3819
 *     (This is the standard samtools region convention.)
3820
 *
3821
 * Note the supplied string expects 1 based inclusive coordinates, but the
3822
 * returned coordinates start from 0 and are half open, so pos0 is valid
3823
 * for use in e.g. "for (pos0 = beg; pos0 < end; pos0++) {...}"
3824
 *
3825
 * On success a pointer to the byte after the end of the entire region
3826
 *            specifier is returned (plus any trailing comma), and tid,
3827
 *            beg & end will be set.
3828
 * On failure NULL is returned.
3829
 */
3830
const char *hts_parse_region(const char *s, int *tid, hts_pos_t *beg,
3831
                             hts_pos_t *end, hts_name2id_f getid, void *hdr,
3832
                             int flags)
3833
0
{
3834
0
    if (!s || !tid || !beg || !end || !getid)
3835
0
        return NULL;
3836
3837
0
    size_t s_len = strlen(s);
3838
0
    kstring_t ks = { 0, 0, NULL };
3839
3840
0
    const char *colon = NULL, *comma = NULL;
3841
0
    int quoted = 0;
3842
3843
0
    if (flags & HTS_PARSE_LIST)
3844
0
        flags &= ~HTS_PARSE_THOUSANDS_SEP;
3845
0
    else
3846
0
        flags |= HTS_PARSE_THOUSANDS_SEP;
3847
3848
0
    const char *s_end = s + s_len;
3849
3850
    // Braced quoting of references is permitted to resolve ambiguities.
3851
0
    if (*s == '{') {
3852
0
        const char *close = memchr(s, '}', s_len);
3853
0
        if (!close) {
3854
0
            hts_log_error("Mismatching braces in \"%s\"", s);
3855
0
            *tid = -1;
3856
0
            return NULL;
3857
0
        }
3858
0
        s++;
3859
0
        s_len--;
3860
0
        if (close[1] == ':')
3861
0
            colon = close+1;
3862
0
        quoted = 1; // number of trailing characters to trim
3863
3864
        // Truncate to this item only, if appropriate.
3865
0
        if (flags & HTS_PARSE_LIST) {
3866
0
            comma = strchr(close, ',');
3867
0
            if (comma) {
3868
0
                s_len = comma-s;
3869
0
                s_end = comma+1;
3870
0
            }
3871
0
        }
3872
0
    } else {
3873
        // Truncate to this item only, if appropriate.
3874
0
        if (flags & HTS_PARSE_LIST) {
3875
0
            comma = strchr(s, ',');
3876
0
            if (comma) {
3877
0
                s_len = comma-s;
3878
0
                s_end = comma+1;
3879
0
            }
3880
0
        }
3881
3882
0
        colon = hts_memrchr(s, ':', s_len);
3883
0
    }
3884
3885
    // No colon is simplest case; just check and return.
3886
0
    if (colon == NULL) {
3887
0
        *beg = 0; *end = HTS_POS_MAX;
3888
0
        kputsn(s, s_len-quoted, &ks); // convert to nul terminated string
3889
0
        if (!ks.s) {
3890
0
            *tid = -2;
3891
0
            return NULL;
3892
0
        }
3893
3894
0
        *tid = getid(hdr, ks.s);
3895
0
        free(ks.s);
3896
3897
0
        return *tid >= 0 ? s_end : NULL;
3898
0
    }
3899
3900
    // Has a colon, but check whole name first.
3901
0
    if (!quoted) {
3902
0
        *beg = 0; *end = HTS_POS_MAX;
3903
0
        kputsn(s, s_len, &ks); // convert to nul terminated string
3904
0
        if (!ks.s) {
3905
0
            *tid = -2;
3906
0
            return NULL;
3907
0
        }
3908
0
        if ((*tid = getid(hdr, ks.s)) >= 0) {
3909
            // Entire name matches, but also check this isn't
3910
            // ambiguous.  eg we have ref chr1 and ref chr1:100-200
3911
            // both present.
3912
0
            ks.l = 0;
3913
0
            kputsn(s, colon-s, &ks); // convert to nul terminated string
3914
0
            if (!ks.s) {
3915
0
                *tid = -2;
3916
0
                return NULL;
3917
0
            }
3918
0
            if (getid(hdr, ks.s) >= 0) {
3919
0
                free(ks.s);
3920
0
                *tid = -1;
3921
0
                hts_log_error("Range is ambiguous. "
3922
0
                              "Use {%s} or {%.*s}%s instead",
3923
0
                              s, (int)(colon-s), s, colon);
3924
0
                return NULL;
3925
0
            }
3926
0
            free(ks.s);
3927
3928
0
            return s_end;
3929
0
        }
3930
0
        if (*tid < -1) // Failed to parse header
3931
0
            return NULL;
3932
0
    }
3933
3934
    // Quoted, or unquoted and whole string isn't a name.
3935
    // Check the pre-colon part is valid.
3936
0
    ks.l = 0;
3937
0
    kputsn(s, colon-s-quoted, &ks); // convert to nul terminated string
3938
0
    if (!ks.s) {
3939
0
        *tid = -2;
3940
0
        return NULL;
3941
0
    }
3942
0
    *tid = getid(hdr, ks.s);
3943
0
    free(ks.s);
3944
0
    if (*tid < 0)
3945
0
        return NULL;
3946
3947
    // Finally parse the post-colon coordinates
3948
0
    char *hyphen;
3949
0
    *beg = hts_parse_decimal(colon+1, &hyphen, flags) - 1;
3950
0
    if (*beg < 0) {
3951
0
        if (*beg != -1 && *hyphen == '-' && colon[1] != '\0') {
3952
            // User specified zero, but we're 1-based.
3953
0
            hts_log_error("Coordinates must be > 0");
3954
0
            return NULL;
3955
0
        }
3956
0
        if (isdigit_c(*hyphen) || *hyphen == '\0' || *hyphen == ',') {
3957
            // interpret chr:-100 as chr:1-100
3958
0
            *end = *beg==-1 ? HTS_POS_MAX : -(*beg+1);
3959
0
            *beg = 0;
3960
0
            return s_end;
3961
0
        } else if (*beg < -1) {
3962
0
            hts_log_error("Unexpected string \"%s\" after region", hyphen);
3963
0
            return NULL;
3964
0
        }
3965
0
    }
3966
3967
0
    if (*hyphen == '\0' || ((flags & HTS_PARSE_LIST) && *hyphen == ',')) {
3968
0
        *end = flags & HTS_PARSE_ONE_COORD ? *beg+1 : HTS_POS_MAX;
3969
0
    } else if (*hyphen == '-') {
3970
0
        *end = hts_parse_decimal(hyphen+1, &hyphen, flags);
3971
0
        if (*hyphen != '\0' && *hyphen != ',') {
3972
0
            hts_log_error("Unexpected string \"%s\" after region", hyphen);
3973
0
            return NULL;
3974
0
        }
3975
0
    } else {
3976
0
        hts_log_error("Unexpected string \"%s\" after region", hyphen);
3977
0
        return NULL;
3978
0
    }
3979
3980
0
    if (*end == 0)
3981
0
        *end = HTS_POS_MAX; // interpret chr:100- as chr:100-<end>
3982
3983
0
    if (*beg >= *end) return NULL;
3984
3985
0
    return s_end;
3986
0
}
3987
3988
// Next release we should mark this as deprecated?
3989
// Use hts_parse_region above instead.
3990
const char *hts_parse_reg64(const char *s, hts_pos_t *beg, hts_pos_t *end)
3991
0
{
3992
0
    char *hyphen;
3993
0
    const char *colon = strrchr(s, ':');
3994
0
    if (colon == NULL) {
3995
0
        *beg = 0; *end = HTS_POS_MAX;
3996
0
        return s + strlen(s);
3997
0
    }
3998
3999
0
    *beg = hts_parse_decimal(colon+1, &hyphen, HTS_PARSE_THOUSANDS_SEP) - 1;
4000
0
    if (*beg < 0) *beg = 0;
4001
4002
0
    if (*hyphen == '\0') *end = HTS_POS_MAX;
4003
0
    else if (*hyphen == '-') *end = hts_parse_decimal(hyphen+1, NULL, HTS_PARSE_THOUSANDS_SEP);
4004
0
    else return NULL;
4005
4006
0
    if (*beg >= *end) return NULL;
4007
0
    return colon;
4008
0
}
4009
4010
const char *hts_parse_reg(const char *s, int *beg, int *end)
4011
0
{
4012
0
    hts_pos_t beg64 = 0, end64 = 0;
4013
0
    const char *colon = hts_parse_reg64(s, &beg64, &end64);
4014
0
    if (beg64 > INT_MAX) {
4015
0
        hts_log_error("Position %"PRId64" too large", beg64);
4016
0
        return NULL;
4017
0
    }
4018
0
    if (end64 > INT_MAX) {
4019
0
        if (end64 == HTS_POS_MAX) {
4020
0
            end64 = INT_MAX;
4021
0
        } else {
4022
0
            hts_log_error("Position %"PRId64" too large", end64);
4023
0
            return NULL;
4024
0
        }
4025
0
    }
4026
0
    *beg = beg64;
4027
0
    *end = end64;
4028
0
    return colon;
4029
0
}
4030
4031
hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec)
4032
0
{
4033
0
    int tid;
4034
0
    hts_pos_t beg, end;
4035
4036
0
    if (strcmp(reg, ".") == 0)
4037
0
        return itr_query(idx, HTS_IDX_START, 0, 0, readrec);
4038
0
    else if (strcmp(reg, "*") == 0)
4039
0
        return itr_query(idx, HTS_IDX_NOCOOR, 0, 0, readrec);
4040
4041
0
    if (!hts_parse_region(reg, &tid, &beg, &end, getid, hdr, HTS_PARSE_THOUSANDS_SEP))
4042
0
        return NULL;
4043
4044
0
    return itr_query(idx, tid, beg, end, readrec);
4045
0
}
4046
4047
0
hts_itr_t *hts_itr_regions(const hts_idx_t *idx, hts_reglist_t *reglist, int count, hts_name2id_f getid, void *hdr, hts_itr_multi_query_func *itr_specific, hts_readrec_func *readrec, hts_seek_func *seek, hts_tell_func *tell) {
4048
4049
0
    int i;
4050
4051
0
    if (!reglist)
4052
0
        return NULL;
4053
4054
0
    hts_itr_t *itr = (hts_itr_t*)calloc(1, sizeof(hts_itr_t));
4055
0
    if (itr) {
4056
0
        itr->n_reg = count;
4057
0
        itr->readrec = readrec;
4058
0
        itr->seek = seek;
4059
0
        itr->tell = tell;
4060
0
        itr->reg_list = reglist;
4061
0
        itr->finished = 0;
4062
0
        itr->nocoor = 0;
4063
0
        itr->multi = 1;
4064
4065
0
        for (i = 0; i < itr->n_reg; i++) {
4066
0
            if (itr->reg_list[i].reg) {
4067
0
                if (!strcmp(itr->reg_list[i].reg, ".")) {
4068
0
                    itr->reg_list[i].tid = HTS_IDX_START;
4069
0
                    continue;
4070
0
                }
4071
4072
0
                if (!strcmp(itr->reg_list[i].reg, "*")) {
4073
0
                    itr->reg_list[i].tid = HTS_IDX_NOCOOR;
4074
0
                    continue;
4075
0
                }
4076
4077
0
                itr->reg_list[i].tid = getid(hdr, reglist[i].reg);
4078
0
                if (itr->reg_list[i].tid < 0) {
4079
0
                    if (itr->reg_list[i].tid < -1) {
4080
0
                        hts_log_error("Failed to parse header");
4081
0
                        hts_itr_destroy(itr);
4082
0
                        return NULL;
4083
0
                    } else {
4084
0
                        hts_log_warning("Region '%s' specifies an unknown reference name. Continue anyway", reglist[i].reg);
4085
0
                    }
4086
0
                }
4087
0
            }
4088
0
        }
4089
4090
0
        qsort(itr->reg_list, itr->n_reg, sizeof(hts_reglist_t), compare_regions);
4091
0
        if (itr_specific(idx, itr) != 0) {
4092
0
            hts_log_error("Failed to create the multi-region iterator!");
4093
0
            hts_itr_destroy(itr);
4094
0
            itr = NULL;
4095
0
        }
4096
0
    }
4097
4098
0
    return itr;
4099
0
}
4100
4101
int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data)
4102
0
{
4103
0
    int ret, tid;
4104
0
    hts_pos_t beg, end;
4105
0
    if (iter == NULL || iter->finished) return -1;
4106
0
    if (iter->read_rest) {
4107
0
        if (iter->curr_off) { // seek to the start
4108
0
            if (bgzf_seek(fp, iter->curr_off, SEEK_SET) < 0) {
4109
0
                hts_log_error("Failed to seek to offset %"PRIu64"%s%s",
4110
0
                              iter->curr_off,
4111
0
                              errno ? ": " : "", strerror(errno));
4112
0
                return -2;
4113
0
            }
4114
0
            iter->curr_off = 0; // only seek once
4115
0
        }
4116
0
        ret = iter->readrec(fp, data, r, &tid, &beg, &end);
4117
0
        if (ret < 0) iter->finished = 1;
4118
0
        iter->curr_tid = tid;
4119
0
        iter->curr_beg = beg;
4120
0
        iter->curr_end = end;
4121
0
        return ret;
4122
0
    }
4123
    // A NULL iter->off should always be accompanied by iter->finished.
4124
0
    assert(iter->off != NULL);
4125
0
    for (;;) {
4126
0
        if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
4127
0
            if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks
4128
0
            if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
4129
0
                if (bgzf_seek(fp, iter->off[iter->i+1].u, SEEK_SET) < 0) {
4130
0
                    hts_log_error("Failed to seek to offset %"PRIu64"%s%s",
4131
0
                                  iter->off[iter->i+1].u,
4132
0
                                  errno ? ": " : "", strerror(errno));
4133
0
                    return -2;
4134
0
                }
4135
0
                iter->curr_off = bgzf_tell(fp);
4136
0
            }
4137
0
            ++iter->i;
4138
0
        }
4139
0
        if ((ret = iter->readrec(fp, data, r, &tid, &beg, &end)) >= 0) {
4140
0
            iter->curr_off = bgzf_tell(fp);
4141
0
            if (tid != iter->tid || beg >= iter->end) { // no need to proceed
4142
0
                ret = -1; break;
4143
0
            } else if (end > iter->beg && iter->end > beg) {
4144
0
                iter->curr_tid = tid;
4145
0
                iter->curr_beg = beg;
4146
0
                iter->curr_end = end;
4147
0
                return ret;
4148
0
            }
4149
0
        } else break; // end of file or error
4150
0
    }
4151
0
    iter->finished = 1;
4152
0
    return ret;
4153
0
}
4154
4155
int hts_itr_multi_next(htsFile *fd, hts_itr_t *iter, void *r)
4156
0
{
4157
0
    void *fp;
4158
0
    int ret, tid, i, cr, ci;
4159
0
    hts_pos_t beg, end;
4160
0
    hts_reglist_t *found_reg;
4161
4162
0
    if (iter == NULL || iter->finished) return -1;
4163
4164
0
    if (iter->is_cram) {
4165
0
        fp = fd->fp.cram;
4166
0
    } else {
4167
0
        fp = fd->fp.bgzf;
4168
0
    }
4169
4170
0
    if (iter->read_rest) {
4171
0
        if (iter->curr_off) { // seek to the start
4172
0
            if (iter->seek(fp, iter->curr_off, SEEK_SET) < 0) {
4173
0
                hts_log_error("Seek at offset %" PRIu64 " failed.", iter->curr_off);
4174
0
                return -1;
4175
0
            }
4176
0
            iter->curr_off = 0; // only seek once
4177
0
        }
4178
4179
0
        ret = iter->readrec(fp, fd, r, &tid, &beg, &end);
4180
0
        if (ret < 0)
4181
0
            iter->finished = 1;
4182
4183
0
        iter->curr_tid = tid;
4184
0
        iter->curr_beg = beg;
4185
0
        iter->curr_end = end;
4186
4187
0
        return ret;
4188
0
    }
4189
    // A NULL iter->off should always be accompanied by iter->finished.
4190
0
    assert(iter->off != NULL || iter->nocoor != 0);
4191
4192
0
    int next_range = 0;
4193
0
    for (;;) {
4194
        // Note that due to the way bam indexing works, iter->off may contain
4195
        // file chunks that are not actually needed as they contain data
4196
        // beyond the end of the requested region.  These are filtered out
4197
        // by comparing the tid and index into hts_reglist_t::intervals
4198
        // (packed for reasons of convenience into iter->off[iter->i].max)
4199
        // associated with the file region with iter->curr_tid and
4200
        // iter->curr_intv.
4201
4202
0
        if (next_range
4203
0
            || iter->curr_off == 0
4204
0
            || iter->i >= iter->n_off
4205
0
            || iter->curr_off >= iter->off[iter->i].v
4206
0
            || (iter->off[iter->i].max >> 32 == iter->curr_tid
4207
0
                && (iter->off[iter->i].max & 0xffffffff) < iter->curr_intv)) {
4208
4209
            // Jump to the next chunk.  It may be necessary to skip more
4210
            // than one as the iter->off list can include overlapping entries.
4211
0
            do {
4212
0
                iter->i++;
4213
0
            } while (iter->i < iter->n_off
4214
0
                     && (iter->curr_off >= iter->off[iter->i].v
4215
0
                         || (iter->off[iter->i].max >> 32 == iter->curr_tid
4216
0
                             && (iter->off[iter->i].max & 0xffffffff) < iter->curr_intv)));
4217
4218
0
            if (iter->is_cram && iter->i < iter->n_off) {
4219
                // Ensure iter->curr_reg is correct.
4220
                //
4221
                // We need this for CRAM as we shortcut some of the later
4222
                // logic by getting an end-of-range and continuing to the
4223
                // next offset.
4224
                //
4225
                // We cannot do this for BAM (and fortunately do not need to
4226
                // either) because in BAM world a query to genomic positions
4227
                // GX and GY leading to a seek offsets PX and PY may have
4228
                // GX > GY and PX < PY.  (This is due to the R-tree and falling
4229
                // between intervals, bumping up to a higher bin.)
4230
                // CRAM strictly follows PX >= PY if GX >= GY, so this logic
4231
                // works.
4232
0
                int want_tid = iter->off[iter->i].max >> 32;
4233
0
                if (!(iter->curr_reg < iter->n_reg &&
4234
0
                      iter->reg_list[iter->curr_reg].tid == want_tid)) {
4235
0
                    int j;
4236
0
                    for (j = 0; j < iter->n_reg; j++)
4237
0
                        if (iter->reg_list[j].tid == want_tid)
4238
0
                            break;
4239
0
                    if (j == iter->n_reg)
4240
0
                        return -1;
4241
0
                    iter->curr_reg = j;
4242
0
                    iter->curr_tid = iter->reg_list[iter->curr_reg].tid;
4243
0
                };
4244
0
                iter->curr_intv = iter->off[iter->i].max & 0xffffffff;
4245
0
            }
4246
4247
0
            if (iter->i >= iter->n_off) { // no more chunks, except NOCOORs
4248
0
                if (iter->nocoor) {
4249
0
                    next_range = 0;
4250
0
                    if (iter->seek(fp, iter->nocoor_off, SEEK_SET) < 0) {
4251
0
                        hts_log_error("Seek at offset %" PRIu64 " failed.", iter->nocoor_off);
4252
0
                        return -1;
4253
0
                    }
4254
0
                    if (iter->is_cram) {
4255
0
                        cram_range r = { HTS_IDX_NOCOOR };
4256
0
                        cram_set_option(fp, CRAM_OPT_RANGE_NOSEEK, &r);
4257
0
                    }
4258
4259
                    // The first slice covering the unmapped reads might
4260
                    // contain a few mapped reads, so scroll
4261
                    // forward until finding the first unmapped read.
4262
0
                    do {
4263
0
                        ret = iter->readrec(fp, fd, r, &tid, &beg, &end);
4264
0
                    } while (tid >= 0 && ret >=0);
4265
4266
0
                    if (ret < 0)
4267
0
                        iter->finished = 1;
4268
0
                    else
4269
0
                        iter->read_rest = 1;
4270
4271
0
                    iter->curr_off = 0; // don't seek any more
4272
0
                    iter->curr_tid = tid;
4273
0
                    iter->curr_beg = beg;
4274
0
                    iter->curr_end = end;
4275
4276
0
                    return ret;
4277
0
                } else {
4278
0
                    ret = -1; break;
4279
0
                }
4280
0
            } else if (iter->i < iter->n_off) {
4281
                // New chunk may overlap the last one, so ensure we
4282
                // only seek forwards.
4283
0
                if (iter->curr_off < iter->off[iter->i].u || next_range) {
4284
0
                    iter->curr_off = iter->off[iter->i].u;
4285
4286
                    // CRAM has the capability of setting an end location.
4287
                    // This means multi-threaded decodes can stop once they
4288
                    // reach that point, rather than pointlessly decoding
4289
                    // more slices than we'll be using.
4290
                    //
4291
                    // We have to be careful here.  Whenever we set the cram
4292
                    // range we need a corresponding seek in order to ensure
4293
                    // we can safely decode at that offset.  We use next_range
4294
                    // var to ensure this is always true; this is set on
4295
                    // end-of-range condition. It's never modified for BAM.
4296
0
                    if (iter->is_cram) {
4297
                        // Next offset.[uv] tuple, but it's already been
4298
                        // included in our cram range, so don't seek and don't
4299
                        // reset range so we can efficiently multi-thread.
4300
0
                        if (next_range || iter->curr_off >= iter->end) {
4301
0
                            if (iter->seek(fp, iter->curr_off, SEEK_SET) < 0) {
4302
0
                                hts_log_error("Seek at offset %" PRIu64
4303
0
                                        " failed.", iter->curr_off);
4304
0
                                return -1;
4305
0
                            }
4306
4307
                            // Find the genomic range matching this interval.
4308
0
                            int j;
4309
0
                            hts_reglist_t *rl = &iter->reg_list[iter->curr_reg];
4310
0
                            cram_range r = {
4311
0
                                    rl->tid,
4312
0
                                    rl->intervals[iter->curr_intv].beg,
4313
0
                                    rl->intervals[iter->curr_intv].end
4314
0
                            };
4315
4316
                            // Expand it up to cover neighbouring intervals.
4317
                            // Note we can only have a single chromosome in a
4318
                            // range, so if we detect our blocks span chromosomes
4319
                            // or we have a multi-ref mode slice, we just use
4320
                            // HTS_IDX_START refid instead.  This doesn't actually
4321
                            // seek (due to CRAM_OPT_RANGE_NOSEEK) and is simply
4322
                            // and indicator of decoding with no end limit.
4323
                            //
4324
                            // That isn't as efficient as it could be, but it's
4325
                            // no poorer than before and it works.
4326
0
                            int tid = r.refid;
4327
0
                            int64_t end = r.end;
4328
0
                            int64_t v = iter->off[iter->i].v;
4329
0
                            j = iter->i+1;
4330
0
                            while (j < iter->n_off) {
4331
0
                                if (iter->off[j].u > v)
4332
0
                                    break;
4333
4334
0
                                uint64_t max = iter->off[j].max;
4335
0
                                if ((max>>32) != tid)
4336
0
                                    tid = HTS_IDX_START; // => no range limit
4337
4338
0
                                if (end < rl->intervals[max & 0xffffffff].end)
4339
0
                                    end = rl->intervals[max & 0xffffffff].end;
4340
0
                                if (v < iter->off[j].v)
4341
0
                                    v = iter->off[j].v;
4342
0
                                j++;
4343
0
                            }
4344
0
                            r.refid = tid;
4345
0
                            r.end = end;
4346
4347
                            // Remember maximum 'v' here so we don't do
4348
                            // unnecessary subsequent seeks for the next
4349
                            // regions.  We can't change curr_off, but
4350
                            // beg/end are used only by single region iterator so
4351
                            // we cache it there to avoid changing the struct.
4352
0
                            iter->end = v;
4353
4354
0
                            cram_set_option(fp, CRAM_OPT_RANGE_NOSEEK, &r);
4355
0
                            next_range = 0;
4356
0
                        }
4357
0
                    } else { // Not CRAM
4358
0
                        if (iter->seek(fp, iter->curr_off, SEEK_SET) < 0) {
4359
0
                            hts_log_error("Seek at offset %" PRIu64 " failed.",
4360
0
                                          iter->curr_off);
4361
0
                            return -1;
4362
0
                        }
4363
0
                    }
4364
0
                }
4365
0
            }
4366
0
        }
4367
4368
0
        ret = iter->readrec(fp, fd, r, &tid, &beg, &end);
4369
0
        if (ret < 0) {
4370
0
            if (iter->is_cram && cram_eof(fp)) {
4371
                // Skip to end of range
4372
                //
4373
                // We should never be adjusting curr_off manually unless
4374
                // we also can guarantee we'll be doing a seek after to
4375
                // a new location.  Otherwise we'll be reading wrong offset
4376
                // for the next container.
4377
                //
4378
                // We ensure this by adjusting our CRAM_OPT_RANGE
4379
                // accordingly above, but to double check we also
4380
                // set the skipped_block flag to enforce a seek also.
4381
0
                iter->curr_off = iter->off[iter->i].v;
4382
0
                next_range = 1;
4383
4384
                // Next region
4385
0
                if (++iter->curr_intv >= iter->reg_list[iter->curr_reg].count){
4386
0
                    if (++iter->curr_reg >= iter->n_reg)
4387
0
                        break;
4388
0
                    iter->curr_intv = 0;
4389
0
                    iter->curr_tid = iter->reg_list[iter->curr_reg].tid;
4390
0
                }
4391
0
                continue;
4392
0
            } else {
4393
0
                break;
4394
0
            }
4395
0
        }
4396
4397
0
        iter->curr_off = iter->tell(fp);
4398
4399
0
        if (tid != iter->curr_tid) {
4400
0
            hts_reglist_t key;
4401
0
            key.tid = tid;
4402
4403
0
            found_reg = (hts_reglist_t *)bsearch(&key, iter->reg_list,
4404
0
                                                 iter->n_reg,
4405
0
                                                 sizeof(hts_reglist_t),
4406
0
                                                 compare_regions);
4407
0
            if (!found_reg)
4408
0
                continue;
4409
4410
0
            iter->curr_reg = (found_reg - iter->reg_list);
4411
0
            iter->curr_tid = tid;
4412
0
            iter->curr_intv = 0;
4413
0
        }
4414
4415
0
        cr = iter->curr_reg;
4416
0
        ci = iter->curr_intv;
4417
4418
0
        for (i = ci; i < iter->reg_list[cr].count; i++) {
4419
0
            if (end > iter->reg_list[cr].intervals[i].beg &&
4420
0
                iter->reg_list[cr].intervals[i].end > beg) {
4421
0
                iter->curr_beg = beg;
4422
0
                iter->curr_end = end;
4423
0
                iter->curr_intv = i;
4424
4425
0
                return ret;
4426
0
            }
4427
4428
            // Check if the read starts beyond intervals[i].end
4429
            // If so, the interval is finished so move on to the next.
4430
0
            if (beg > iter->reg_list[cr].intervals[i].end)
4431
0
                iter->curr_intv = i + 1;
4432
4433
            // No need to keep searching if the read ends before intervals[i].beg
4434
0
            if (end < iter->reg_list[cr].intervals[i].beg)
4435
0
                break;
4436
0
        }
4437
0
    }
4438
0
    iter->finished = 1;
4439
4440
0
    return ret;
4441
0
}
4442
4443
/**********************
4444
 *** Retrieve index ***
4445
 **********************/
4446
// Local_fn and local_len will return a sub-region of 'fn'.
4447
// Eg http://elsewhere/dir/foo.bam.bai?a=b may return
4448
// foo.bam.bai via local_fn and local_len.
4449
//
4450
// Returns -1 if index couldn't be opened.
4451
//         -2 on other errors
4452
static int idx_test_and_fetch(const char *fn, const char **local_fn, int *local_len, int download)
4453
0
{
4454
0
    hFILE *remote_hfp = NULL;
4455
0
    hFILE *local_fp = NULL;
4456
0
    int save_errno;
4457
0
    htsFormat fmt;
4458
0
    kstring_t s = KS_INITIALIZE;
4459
0
    kstring_t tmps = KS_INITIALIZE;
4460
4461
0
    if (hisremote(fn)) {
4462
0
        const int buf_size = 1 * 1024 * 1024;
4463
0
        int l;
4464
0
        const char *p, *e;
4465
        // Ignore ?# params: eg any file.fmt?param=val, except for S3 URLs
4466
0
        e = fn + ((strncmp(fn, "s3://", 5) && strncmp(fn, "s3+http://", 10) && strncmp(fn, "s3+https://", 11)) ? strcspn(fn, "?#") : strcspn(fn, "?"));
4467
        // Find the previous slash from there.
4468
0
        p = e;
4469
0
        while (p > fn && *p != '/') p--;
4470
0
        if (*p == '/') p++;
4471
4472
        // Attempt to open local file first
4473
0
        kputsn(p, e-p, &s);
4474
0
        if (access(s.s, R_OK) == 0)
4475
0
        {
4476
0
            free(s.s);
4477
0
            *local_fn = p;
4478
0
            *local_len = e-p;
4479
0
            return 0;
4480
0
        }
4481
4482
        // Attempt to open remote file. Stay quiet on failure, it is OK to fail when trying first .csi then .bai or .tbi index.
4483
0
        if ((remote_hfp = hopen(fn, "r")) == 0) {
4484
0
            hts_log_info("Failed to open index file '%s'", fn);
4485
0
            free(s.s);
4486
0
            return -1;
4487
0
        }
4488
0
        if (hts_detect_format2(remote_hfp, fn, &fmt))