Coverage Report

Created: 2025-11-11 06:39

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/hts.c
Line
Count
Source
1
/*  hts.c -- format-neutral I/O, indexing, and iterator API functions.
2
3
    Copyright (C) 2008, 2009, 2012-2024 Genome Research Ltd.
4
    Copyright (C) 2012, 2013 Broad Institute.
5
6
    Author: Heng Li <lh3@sanger.ac.uk>
7
8
Permission is hereby granted, free of charge, to any person obtaining a copy
9
of this software and associated documentation files (the "Software"), to deal
10
in the Software without restriction, including without limitation the rights
11
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
copies of the Software, and to permit persons to whom the Software is
13
furnished to do so, subject to the following conditions:
14
15
The above copyright notice and this permission notice shall be included in
16
all copies or substantial portions of the Software.
17
18
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24
DEALINGS IN THE SOFTWARE.  */
25
26
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
27
#include <config.h>
28
29
#include <zlib.h>
30
#include <stdio.h>
31
#include <string.h>
32
#include <strings.h>
33
#include <stdlib.h>
34
#include <unistd.h>
35
#include <inttypes.h>
36
#include <limits.h>
37
#include <stdint.h>
38
#include <fcntl.h>
39
#include <errno.h>
40
#include <time.h>
41
#include <sys/stat.h>
42
#include <assert.h>
43
44
#ifdef HAVE_LIBLZMA
45
#ifdef HAVE_LZMA_H
46
#include <lzma.h>
47
#else
48
#include "os/lzma_stub.h"
49
#endif
50
#endif
51
52
#include "htslib/hts.h"
53
#include "htslib/bgzf.h"
54
#include "cram/cram.h"
55
#include "htslib/hfile.h"
56
#include "htslib/hts_endian.h"
57
#include "version.h"
58
#include "config_vars.h"
59
#include "hts_internal.h"
60
#include "hfile_internal.h"
61
#include "sam_internal.h"
62
#include "htslib/hts_expr.h"
63
#include "htslib/hts_os.h" // drand48
64
65
#include "htslib/khash.h"
66
#include "htslib/kseq.h"
67
#include "htslib/ksort.h"
68
#include "htslib/tbx.h"
69
#if defined(HAVE_EXTERNAL_LIBHTSCODECS)
70
#include <htscodecs/htscodecs.h>
71
#else
72
#include "htscodecs/htscodecs/htscodecs.h"
73
#endif
74
75
#ifndef EFTYPE
76
110
#define EFTYPE ENOEXEC
77
#endif
78
79
5.94k
KHASH_INIT2(s2i,, kh_cstr_t, int64_t, 1, kh_str_hash_func, kh_str_hash_equal)
Unexecuted instantiation: kh_grow_to_fit_s2i
Unexecuted instantiation: kh_stats_s2i
80
81
HTSLIB_EXPORT
82
int hts_verbose = HTS_LOG_WARNING;
83
84
const char *hts_version(void)
85
2
{
86
2
    return HTS_VERSION_TEXT;
87
2
}
88
89
0
unsigned int hts_features(void) {
90
0
    unsigned int feat = HTS_FEATURE_HTSCODECS; // Always present
91
92
0
#ifdef PACKAGE_URL
93
0
    feat |= HTS_FEATURE_CONFIGURE;
94
0
#endif
95
96
#ifdef ENABLE_PLUGINS
97
    feat |= HTS_FEATURE_PLUGINS;
98
#endif
99
100
0
#ifdef HAVE_LIBCURL
101
0
    feat |= HTS_FEATURE_LIBCURL;
102
0
#endif
103
104
0
#ifdef ENABLE_S3
105
0
    feat |= HTS_FEATURE_S3;
106
0
#endif
107
108
0
#ifdef ENABLE_GCS
109
0
    feat |= HTS_FEATURE_GCS;
110
0
#endif
111
112
#ifdef HAVE_LIBDEFLATE
113
    feat |= HTS_FEATURE_LIBDEFLATE;
114
#endif
115
116
0
#ifdef HAVE_LIBLZMA
117
0
    feat |= HTS_FEATURE_LZMA;
118
0
#endif
119
120
0
#ifdef HAVE_LIBBZ2
121
0
    feat |= HTS_FEATURE_BZIP2;
122
0
#endif
123
124
0
    return feat;
125
0
}
126
127
0
const char *hts_test_feature(unsigned int id) {
128
0
    unsigned int feat = hts_features();
129
130
0
    switch (id) {
131
0
    case HTS_FEATURE_CONFIGURE:
132
0
        return feat & HTS_FEATURE_CONFIGURE ? "yes" : NULL;
133
0
    case HTS_FEATURE_PLUGINS:
134
0
        return feat & HTS_FEATURE_PLUGINS ? "yes" : NULL;
135
0
    case HTS_FEATURE_LIBCURL:
136
0
        return feat & HTS_FEATURE_LIBCURL ? "yes" : NULL;
137
0
    case HTS_FEATURE_S3:
138
0
        return feat & HTS_FEATURE_S3 ? "yes" : NULL;
139
0
    case HTS_FEATURE_GCS:
140
0
        return feat & HTS_FEATURE_GCS ? "yes" : NULL;
141
0
    case HTS_FEATURE_LIBDEFLATE:
142
0
        return feat & HTS_FEATURE_LIBDEFLATE ? "yes" : NULL;
143
0
    case HTS_FEATURE_BZIP2:
144
0
        return feat & HTS_FEATURE_BZIP2 ? "yes" : NULL;
145
0
    case HTS_FEATURE_LZMA:
146
0
        return feat & HTS_FEATURE_LZMA ? "yes" : NULL;
147
148
0
    case HTS_FEATURE_HTSCODECS:
149
0
        return htscodecs_version();
150
151
0
    case HTS_FEATURE_CC:
152
0
        return HTS_CC;
153
0
    case HTS_FEATURE_CFLAGS:
154
0
        return HTS_CFLAGS;
155
0
    case HTS_FEATURE_LDFLAGS:
156
0
        return HTS_LDFLAGS;
157
0
    case HTS_FEATURE_CPPFLAGS:
158
0
        return HTS_CPPFLAGS;
159
160
0
    default:
161
0
        fprintf(stderr, "Unknown feature code: %u\n", id);
162
0
    }
163
164
0
    return NULL;
165
0
}
166
167
// Note this implementation also means we can just "strings" the library
168
// to find the configuration parameters.
169
0
const char *hts_feature_string(void) {
170
0
    static char config[1200];
171
0
    const char *flags=
172
173
0
#ifdef PACKAGE_URL
174
0
    "build=configure "
175
#else
176
    "build=Makefile "
177
#endif
178
179
0
#ifdef HAVE_LIBCURL
180
0
    "libcurl=yes "
181
#else
182
    "libcurl=no "
183
#endif
184
185
0
#ifdef ENABLE_S3
186
0
    "S3=yes "
187
#else
188
    "S3=no "
189
#endif
190
191
0
#ifdef ENABLE_GCS
192
0
    "GCS=yes "
193
#else
194
    "GCS=no "
195
#endif
196
197
#ifdef HAVE_LIBDEFLATE
198
    "libdeflate=yes "
199
#else
200
0
    "libdeflate=no "
201
0
#endif
202
203
0
#ifdef HAVE_LIBLZMA
204
0
    "lzma=yes "
205
#else
206
    "lzma=no "
207
#endif
208
209
0
#ifdef HAVE_LIBBZ2
210
0
    "bzip2=yes "
211
#else
212
    "bzip2=no "
213
#endif
214
215
// "plugins=" must stay at the end as it is followed by "plugin-path="
216
#ifdef ENABLE_PLUGINS
217
    "plugins=yes";
218
#else
219
0
    "plugins=no";
220
0
#endif
221
222
#ifdef ENABLE_PLUGINS
223
    snprintf(config, sizeof(config),
224
             "%s plugin-path=%.1000s htscodecs=%.40s",
225
             flags, hts_plugin_path(), htscodecs_version());
226
#else
227
0
    snprintf(config, sizeof(config),
228
0
             "%s htscodecs=%.40s",
229
0
             flags, htscodecs_version());
230
0
#endif
231
0
    return config;
232
0
}
233
234
235
// Converts ASCII to BAM nibble encoding.
236
// Note 0123 is treated as ACGT (ABI colourspace encoding) and
237
// U is treated as T.
238
HTSLIB_EXPORT
239
const unsigned char seq_nt16_table[256] = {
240
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
241
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
242
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
243
     1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15,
244
    15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
245
    15,15, 5, 6,  8, 8, 7, 9, 15,10,15,15, 15,15,15,15,
246
    15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
247
    15,15, 5, 6,  8, 8, 7, 9, 15,10,15,15, 15,15,15,15,
248
249
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
250
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
251
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
252
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
253
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
254
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
255
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
256
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
257
};
258
259
HTSLIB_EXPORT
260
const char seq_nt16_str[] = "=ACMGRSVTWYHKDBN";
261
262
HTSLIB_EXPORT
263
const int seq_nt16_int[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
264
265
/**********************
266
 *** Basic file I/O ***
267
 **********************/
268
269
static enum htsFormatCategory format_category(enum htsExactFormat fmt)
270
39.3k
{
271
39.3k
    switch (fmt) {
272
0
    case bam:
273
0
    case sam:
274
10.4k
    case cram:
275
10.4k
    case fastq_format:
276
10.4k
    case fasta_format:
277
10.4k
        return sequence_data;
278
279
0
    case vcf:
280
0
    case bcf:
281
0
        return variant_data;
282
283
0
    case bai:
284
0
    case crai:
285
0
    case csi:
286
0
    case fai_format:
287
0
    case fqi_format:
288
0
    case gzi:
289
0
    case tbi:
290
0
        return index_file;
291
292
0
    case bed:
293
0
    case d4_format:
294
0
        return region_list;
295
296
0
    case htsget:
297
0
    case hts_crypt4gh_format:
298
0
        return unknown_category;
299
300
0
    case unknown_format:
301
14.4k
    case binary_format:
302
28.8k
    case text_format:
303
28.8k
    case empty_format:
304
28.8k
    case format_maximum:
305
28.8k
        break;
306
39.3k
    }
307
308
28.8k
    return unknown_category;
309
39.3k
}
310
311
// Decompress several hundred bytes by peeking at the file, which must be
312
// positioned at the start of a GZIP block.
313
static ssize_t
314
decompress_peek_gz(hFILE *fp, unsigned char *dest, size_t destsize)
315
2.89k
{
316
2.89k
    unsigned char buffer[2048];
317
2.89k
    z_stream zs;
318
2.89k
    ssize_t npeek = hpeek(fp, buffer, sizeof buffer);
319
320
2.89k
    if (npeek < 0) return -1;
321
322
2.89k
    zs.zalloc = NULL;
323
2.89k
    zs.zfree = NULL;
324
2.89k
    zs.next_in = buffer;
325
2.89k
    zs.avail_in = npeek;
326
2.89k
    zs.next_out = dest;
327
2.89k
    zs.avail_out = destsize;
328
2.89k
    if (inflateInit2(&zs, 31) != Z_OK) return -1;
329
330
2.89k
    int ret;
331
2.89k
    const unsigned char *last_in = buffer;
332
5.78k
    while (zs.avail_out > 0) {
333
3.17k
        ret = inflate(&zs, Z_SYNC_FLUSH);
334
3.17k
        if (ret == Z_STREAM_END) {
335
117
            if (last_in == zs.next_in)
336
0
                break; // Paranoia to avoid potential looping. Shouldn't happen
337
117
            else
338
117
                last_in = zs.next_in;
339
117
            inflateReset(&zs);
340
3.06k
        } else if (ret != Z_OK) {
341
            // eg Z_BUF_ERROR due to avail_in/out becoming zero
342
290
            break;
343
290
        }
344
3.17k
    }
345
346
    // NB: zs.total_out is changed by inflateReset, so use pointer diff instead
347
2.89k
    destsize = zs.next_out - dest;
348
2.89k
    inflateEnd(&zs);
349
350
2.89k
    return destsize;
351
2.89k
}
352
353
#ifdef HAVE_LIBLZMA
354
// Similarly decompress a portion by peeking at the file, which must be
355
// positioned at the start of the file.
356
static ssize_t
357
decompress_peek_xz(hFILE *fp, unsigned char *dest, size_t destsize)
358
2
{
359
2
    unsigned char buffer[2048];
360
2
    ssize_t npeek = hpeek(fp, buffer, sizeof buffer);
361
2
    if (npeek < 0) return -1;
362
363
2
    lzma_stream ls = LZMA_STREAM_INIT;
364
2
    if (lzma_stream_decoder(&ls, lzma_easy_decoder_memusage(9), 0) != LZMA_OK)
365
0
        return -1;
366
367
2
    ls.next_in = buffer;
368
2
    ls.avail_in = npeek;
369
2
    ls.next_out = dest;
370
2
    ls.avail_out = destsize;
371
372
2
    int r = lzma_code(&ls, LZMA_RUN);
373
2
    if (! (r == LZMA_OK || r == LZMA_STREAM_END)) {
374
1
        lzma_end(&ls);
375
1
        return -1;
376
1
    }
377
378
1
    destsize = ls.total_out;
379
1
    lzma_end(&ls);
380
381
1
    return destsize;
382
2
}
383
#endif
384
385
// Parse "x.y" text, taking care because the string is not NUL-terminated
386
// and filling in major/minor only when the digits are followed by a delimiter,
387
// so we don't misread "1.10" as "1.1" due to reaching the end of the buffer.
388
static void
389
parse_version(htsFormat *fmt, const unsigned char *u, const unsigned char *ulim)
390
296
{
391
296
    const char *s    = (const char *) u;
392
296
    const char *slim = (const char *) ulim;
393
296
    short v;
394
395
296
    fmt->version.major = fmt->version.minor = -1;
396
397
1.20k
    for (v = 0; s < slim && isdigit_c(*s); s++)
398
905
        v = 10 * v + *s - '0';
399
400
296
    if (s < slim) {
401
285
        fmt->version.major = v;
402
285
        if (*s == '.') {
403
77
            s++;
404
772
            for (v = 0; s < slim && isdigit_c(*s); s++)
405
695
                v = 10 * v + *s - '0';
406
77
            if (s < slim)
407
59
                fmt->version.minor = v;
408
77
        }
409
208
        else
410
208
            fmt->version.minor = 0;
411
285
    }
412
296
}
413
414
static int
415
cmp_nonblank(const char *key, const unsigned char *u, const unsigned char *ulim)
416
8.25k
{
417
8.25k
    const unsigned char *ukey = (const unsigned char *) key;
418
419
8.58k
    while (*ukey)
420
8.58k
        if (u >= ulim) return +1;
421
8.57k
        else if (isspace_c(*u)) u++;
422
8.26k
        else if (*u != *ukey) return (*ukey < *u)? -1 : +1;
423
19
        else u++, ukey++;
424
425
0
    return 0;
426
8.25k
}
427
428
static int is_text_only(const unsigned char *u, const unsigned char *ulim)
429
8.25k
{
430
191k
    for (; u < ulim; u++)
431
182k
        if (! (*u >= ' ' || *u == '\t' || *u == '\r' || *u == '\n'))
432
119
            return 0;
433
434
8.13k
    return 1;
435
8.25k
}
436
437
static inline int
438
alternate_zeros(const unsigned char *u, const unsigned char *ulim)
439
0
{
440
0
    for (; u < ulim; u += 2)
441
0
        if (*u != '\0') return 0;
442
0
    return 1;
443
0
}
444
445
static int is_utf16_text(const unsigned char *u, const unsigned char *ulim)
446
0
{
447
0
    if (ulim - u >= 6 &&
448
0
        ((u[0] == 0xfe && u[1] == 0xff && alternate_zeros(u+2, ulim)) ||
449
0
         (u[0] == 0xff && u[1] == 0xfe && alternate_zeros(u+3, ulim))))
450
0
        return 2;
451
0
    else if (ulim - u >= 8 &&
452
0
             (alternate_zeros(u, ulim) || alternate_zeros(u+1, ulim)))
453
0
        return 1;
454
0
    else
455
0
        return 0;
456
0
}
457
458
static int is_fastaq(const unsigned char *u, const unsigned char *ulim)
459
7.99k
{
460
7.99k
    const unsigned char *eol = memchr(u, '\n', ulim - u);
461
462
    // Check that the first line is entirely textual
463
7.99k
    if (! is_text_only(u, eol? eol : ulim)) return 0;
464
465
    // If the first line is very long, consider the file to indeed be FASTA/Q
466
7.97k
    if (eol == NULL) return 1;
467
468
6.65k
    u = eol+1; // Now points to the first character of the second line
469
470
    // Scan over all base-encoding letters (including 'N' but not SEQ's '=')
471
8.58k
    while (u < ulim && (seq_nt16_table[*u] != 15 || toupper(*u) == 'N')) {
472
1.93k
        if (*u == '=') return 0;
473
1.93k
        u++;
474
1.93k
    }
475
476
6.65k
    return (u == ulim || *u == '\r' || *u == '\n')? 1 : 0;
477
6.65k
}
478
479
// Parse tab-delimited text, filling in a string of column types and returning
480
// the number of columns spotted (within [u,ulim), and up to column_len) or -1
481
// if non-printable characters were seen.  Column types:
482
//     i: integer, s: strand sign, C: CIGAR, O: SAM optional field, Z: anything
483
static int
484
parse_tabbed_text(char *columns, int column_len,
485
                  const unsigned char *u, const unsigned char *ulim,
486
                  int *complete)
487
306
{
488
306
    const char *str  = (const char *) u;
489
306
    const char *slim = (const char *) ulim;
490
306
    const char *s;
491
306
    int ncolumns = 0;
492
493
306
    enum { digit = 1, leading_sign = 2, cigar_operator = 4, other = 8 };
494
306
    unsigned seen = 0;
495
306
    *complete = 0;
496
497
5.57k
    for (s = str; s < slim; s++)
498
5.45k
        if (*s >= ' ') {
499
4.32k
            if (isdigit_c(*s))
500
1.26k
                seen |= digit;
501
3.06k
            else if ((*s == '+' || *s == '-') && s == str)
502
114
                seen |= leading_sign;
503
2.94k
            else if (strchr(BAM_CIGAR_STR, *s) && s > str && isdigit_c(s[-1]))
504
162
                seen |= cigar_operator;
505
2.78k
            else
506
2.78k
                seen |= other;
507
4.32k
        }
508
1.13k
        else if (*s == '\t' || *s == '\r' || *s == '\n') {
509
1.02k
            size_t len = s - str;
510
1.02k
            char type;
511
512
1.02k
            if (seen == digit || seen == (leading_sign|digit)) type = 'i';
513
710
            else if (seen == (digit|cigar_operator)) type = 'C';
514
674
            else if (len == 1)
515
263
                switch (str[0]) {
516
85
                case '*': type = 'C'; break;
517
104
                case '+': case '-': case '.': type = 's'; break;
518
74
                default: type = 'Z'; break;
519
263
                }
520
411
            else if (len >= 5 && str[2] == ':' && str[4] == ':') type = 'O';
521
377
            else type = 'Z';
522
523
1.02k
            columns[ncolumns++] = type;
524
1.02k
            if (*s != '\t' || ncolumns >= column_len - 1) {
525
80
                *complete = 1; // finished the line or more columns than needed
526
80
                break;
527
80
            }
528
529
949
            str = s + 1;
530
949
            seen = 0;
531
949
        }
532
108
        else return -1;
533
534
198
    columns[ncolumns] = '\0';
535
198
    return ncolumns;
536
306
}
537
538
// Match COLUMNS as a prefix against PATTERN (so COLUMNS may run out first).
539
// Returns len(COLUMNS) (modulo '+'), or 0 if there is a mismatched entry.
540
static int colmatch(const char *columns, const char *pattern)
541
279
{
542
279
    int i;
543
1.03k
    for (i = 0; columns[i] != '\0'; i++) {
544
873
        if (pattern[i] == '+') return i;
545
869
        if (! (columns[i] == pattern[i] || pattern[i] == 'Z')) return 0;
546
869
    }
547
548
164
    return i;
549
279
}
550
551
int hts_detect_format(hFILE *hfile, htsFormat *fmt)
552
0
{
553
0
    return hts_detect_format2(hfile, NULL, fmt);
554
0
}
555
556
int hts_detect_format2(hFILE *hfile, const char *fname, htsFormat *fmt)
557
56.0k
{
558
56.0k
    char extension[HTS_MAX_EXT_LEN], columns[24];
559
56.0k
    unsigned char s[1024];
560
56.0k
    int complete = 0;
561
56.0k
    ssize_t len = hpeek(hfile, s, 18);
562
56.0k
    if (len < 0) return -1;
563
564
56.0k
    fmt->category = unknown_category;
565
56.0k
    fmt->format = unknown_format;
566
56.0k
    fmt->version.major = fmt->version.minor = -1;
567
56.0k
    fmt->compression = no_compression;
568
56.0k
    fmt->compression_level = -1;
569
56.0k
    fmt->specific = NULL;
570
571
56.0k
    if (len >= 2 && s[0] == 0x1f && s[1] == 0x8b) {
572
        // The stream is either gzip-compressed or BGZF-compressed.
573
        // Determine which, and decompress the first few records or lines.
574
2.89k
        fmt->compression = gzip;
575
2.89k
        if (len >= 18 && (s[3] & 4)) {
576
1.66k
            if (memcmp(&s[12], "BC\2\0", 4) == 0)
577
168
                fmt->compression = bgzf;
578
1.49k
            else if (memcmp(&s[12], "RAZF", 4) == 0)
579
60
                fmt->compression = razf_compression;
580
1.66k
        }
581
2.89k
        if (len >= 9 && s[2] == 8)
582
2.87k
            fmt->compression_level = (s[8] == 2)? 9 : (s[8] == 4)? 1 : -1;
583
584
2.89k
        len = decompress_peek_gz(hfile, s, sizeof s);
585
2.89k
    }
586
53.1k
    else if (len >= 10 && memcmp(s, "BZh", 3) == 0 &&
587
3
             (memcmp(&s[4], "\x31\x41\x59\x26\x53\x59", 6) == 0 ||
588
2
              memcmp(&s[4], "\x17\x72\x45\x38\x50\x90", 6) == 0)) {
589
2
        fmt->compression = bzip2_compression;
590
2
        fmt->compression_level = s[3] - '0';
591
        // Decompressing via libbz2 produces no output until it has a whole
592
        // block (of size 100Kb x level), which is too large for peeking.
593
        // So unfortunately we can recognise bzip2 but not the contents,
594
        // except that \x1772... magic indicates the stream is empty.
595
2
        if (s[4] == '\x31') return 0;
596
1
        else len = 0;
597
2
    }
598
53.1k
    else if (len >= 6 && memcmp(s, "\xfd""7zXZ\0", 6) == 0) {
599
2
        fmt->compression = xz_compression;
600
2
#ifdef HAVE_LIBLZMA
601
2
        len = decompress_peek_xz(hfile, s, sizeof s);
602
#else
603
        // Without liblzma, we can't recognise the decompressed contents.
604
        return 0;
605
#endif
606
2
    }
607
53.1k
    else if (len >= 4 && memcmp(s, "\x28\xb5\x2f\xfd", 4) == 0) {
608
1
        fmt->compression = zstd_compression;
609
1
        return 0;
610
1
    }
611
53.1k
    else {
612
53.1k
        len = hpeek(hfile, s, sizeof s);
613
53.1k
    }
614
56.0k
    if (len < 0) return -1;
615
616
56.0k
    if (len == 0) {
617
60
        fmt->format = empty_format;
618
60
        return 0;
619
60
    }
620
621
    // We avoid using filename extensions wherever possible (as filenames are
622
    // not always available), but in a few cases they must be considered:
623
    //  - FASTA/Q indexes are simply tab-separated text; files that match these
624
    //    patterns but not the fai/fqi extension are usually generic BED files
625
    //  - GZI indexes have no magic numbers so can only be detected by filename
626
56.0k
    if (fname && strcmp(fname, "-") != 0) {
627
56.0k
        char *s;
628
56.0k
        if (find_file_extension(fname, extension) < 0) extension[0] = '\0';
629
56.0k
        for (s = extension; *s; s++) *s = tolower_c(*s);
630
56.0k
    }
631
0
    else extension[0] = '\0';
632
633
56.0k
    if (len >= 6 && memcmp(s,"CRAM",4) == 0 && s[4]>=1 && s[4]<=7 && s[5]<=7) {
634
11.7k
        fmt->category = sequence_data;
635
11.7k
        fmt->format = cram;
636
11.7k
        fmt->version.major = s[4], fmt->version.minor = s[5];
637
11.7k
        fmt->compression = custom;
638
11.7k
        return 0;
639
11.7k
    }
640
44.2k
    else if (len >= 4 && s[3] <= '\4') {
641
4.70k
        if (memcmp(s, "BAM\1", 4) == 0) {
642
3.33k
            fmt->category = sequence_data;
643
3.33k
            fmt->format = bam;
644
            // TODO Decompress enough to pick version from @HD-VN header
645
3.33k
            fmt->version.major = 1, fmt->version.minor = -1;
646
3.33k
            return 0;
647
3.33k
        }
648
1.37k
        else if (memcmp(s, "BAI\1", 4) == 0) {
649
1
            fmt->category = index_file;
650
1
            fmt->format = bai;
651
1
            fmt->version.major = -1, fmt->version.minor = -1;
652
1
            return 0;
653
1
        }
654
1.37k
        else if (memcmp(s, "BCF\4", 4) == 0) {
655
9
            fmt->category = variant_data;
656
9
            fmt->format = bcf;
657
9
            fmt->version.major = 1, fmt->version.minor = -1;
658
9
            return 0;
659
9
        }
660
1.36k
        else if (memcmp(s, "BCF\2", 4) == 0) {
661
1.35k
            fmt->category = variant_data;
662
1.35k
            fmt->format = bcf;
663
1.35k
            fmt->version.major = s[3];
664
1.35k
            fmt->version.minor = (len >= 5 && s[4] <= 2)? s[4] : 0;
665
1.35k
            return 0;
666
1.35k
        }
667
13
        else if (memcmp(s, "CSI\1", 4) == 0) {
668
1
            fmt->category = index_file;
669
1
            fmt->format = csi;
670
1
            fmt->version.major = 1, fmt->version.minor = -1;
671
1
            return 0;
672
1
        }
673
12
        else if (memcmp(s, "TBI\1", 4) == 0) {
674
1
            fmt->category = index_file;
675
1
            fmt->format = tbi;
676
1
            return 0;
677
1
        }
678
        // GZI indexes have no magic numbers, so must be recognised solely by
679
        // filename extension.
680
11
        else if (strcmp(extension, "gzi") == 0) {
681
0
            fmt->category = index_file;
682
0
            fmt->format = gzi;
683
0
            return 0;
684
0
        }
685
4.70k
    }
686
39.5k
    else if (len >= 16 && memcmp(s, "##fileformat=VCF", 16) == 0) {
687
10.4k
        fmt->category = variant_data;
688
10.4k
        fmt->format = vcf;
689
10.4k
        if (len >= 21 && s[16] == 'v')
690
120
            parse_version(fmt, &s[17], &s[len]);
691
10.4k
        return 0;
692
10.4k
    }
693
29.0k
    else if (len >= 4 && s[0] == '@' &&
694
21.1k
             (memcmp(s, "@HD\t", 4) == 0 || memcmp(s, "@SQ\t", 4) == 0 ||
695
6.53k
              memcmp(s, "@RG\t", 4) == 0 || memcmp(s, "@PG\t", 4) == 0 ||
696
20.8k
              memcmp(s, "@CO\t", 4) == 0)) {
697
20.8k
        fmt->category = sequence_data;
698
20.8k
        fmt->format = sam;
699
        // @HD-VN is not guaranteed to be the first tag, but then @HD is
700
        // not guaranteed to be present at all...
701
20.8k
        if (len >= 9 && memcmp(s, "@HD\tVN:", 7) == 0)
702
176
            parse_version(fmt, &s[7], &s[len]);
703
20.6k
        else
704
20.6k
            fmt->version.major = 1, fmt->version.minor = -1;
705
20.8k
        return 0;
706
20.8k
    }
707
8.25k
    else if (len >= 8 && memcmp(s, "d4\xdd\xdd", 4) == 0) {
708
1
        fmt->category = region_list;
709
1
        fmt->format = d4_format;
710
        // How to decode the D4 Format Version bytes is not yet specified
711
        // so we don't try to set fmt->version.{major,minor}.
712
1
        return 0;
713
1
    }
714
8.25k
    else if (cmp_nonblank("{\"htsget\":", s, &s[len]) == 0) {
715
0
        fmt->category = unknown_category;
716
0
        fmt->format = htsget;
717
0
        return 0;
718
0
    }
719
8.25k
    else if (len > 8 && memcmp(s, "crypt4gh", 8) == 0) {
720
2
        fmt->category = unknown_category;
721
2
        fmt->format = hts_crypt4gh_format;
722
2
        return 0;
723
2
    }
724
8.25k
    else if (len >= 1 && s[0] == '>' && is_fastaq(s, &s[len])) {
725
7.60k
        fmt->category = sequence_data;
726
7.60k
        fmt->format = fasta_format;
727
7.60k
        return 0;
728
7.60k
    }
729
654
    else if (len >= 1 && s[0] == '@' && is_fastaq(s, &s[len])) {
730
348
        fmt->category = sequence_data;
731
348
        fmt->format = fastq_format;
732
348
        return 0;
733
348
    }
734
306
    else if (parse_tabbed_text(columns, sizeof columns, s,
735
306
                               &s[len], &complete) > 0) {
736
        // A complete SAM line is at least 11 columns.  On unmapped long reads may
737
        // be missing two.  (On mapped long reads we must have an @ header so long
738
        // CIGAR is irrelevant.)
739
159
        if (colmatch(columns, "ZiZiiCZiiZZOOOOOOOOOOOOOOOOOOOO+")
740
159
            >= 9 + 2*complete) {
741
48
            fmt->category = sequence_data;
742
48
            fmt->format = sam;
743
48
            fmt->version.major = 1, fmt->version.minor = -1;
744
48
            return 0;
745
48
        }
746
111
        else if (fmt->compression == gzip && colmatch(columns, "iiiiii") == 6) {
747
1
            fmt->category = index_file;
748
1
            fmt->format = crai;
749
1
            return 0;
750
1
        }
751
110
        else if (strstr(extension, "fqi") && colmatch(columns, "Ziiiii") == 6) {
752
0
            fmt->category = index_file;
753
0
            fmt->format = fqi_format;
754
0
            return 0;
755
0
        }
756
110
        else if (strstr(extension, "fai") && colmatch(columns, "Ziiii") == 5) {
757
0
            fmt->category = index_file;
758
0
            fmt->format = fai_format;
759
0
            return 0;
760
0
        }
761
110
        else if (colmatch(columns, "Zii+") >= 3) {
762
5
            fmt->category = region_list;
763
5
            fmt->format = bed;
764
5
            return 0;
765
5
        }
766
159
    }
767
768
    // Arbitrary text files can be read using hts_getline().
769
263
    if (is_text_only(s, &s[len])) fmt->format = text_format;
770
771
    // Nothing recognised: leave unset fmt-> fields as unknown.
772
263
    return 0;
773
56.0k
}
774
775
char *hts_format_description(const htsFormat *format)
776
0
{
777
0
    kstring_t str = { 0, 0, NULL };
778
779
0
    switch (format->format) {
780
0
    case sam:   kputs("SAM", &str); break;
781
0
    case bam:   kputs("BAM", &str); break;
782
0
    case cram:  kputs("CRAM", &str); break;
783
0
    case fasta_format:  kputs("FASTA", &str); break;
784
0
    case fastq_format:  kputs("FASTQ", &str); break;
785
0
    case vcf:   kputs("VCF", &str); break;
786
0
    case bcf:
787
0
        if (format->version.major == 1) kputs("Legacy BCF", &str);
788
0
        else kputs("BCF", &str);
789
0
        break;
790
0
    case bai:   kputs("BAI", &str); break;
791
0
    case crai:  kputs("CRAI", &str); break;
792
0
    case csi:   kputs("CSI", &str); break;
793
0
    case fai_format:    kputs("FASTA-IDX", &str); break;
794
0
    case fqi_format:    kputs("FASTQ-IDX", &str); break;
795
0
    case gzi:   kputs("GZI", &str); break;
796
0
    case tbi:   kputs("Tabix", &str); break;
797
0
    case bed:   kputs("BED", &str); break;
798
0
    case d4_format:     kputs("D4", &str); break;
799
0
    case htsget: kputs("htsget", &str); break;
800
0
    case hts_crypt4gh_format: kputs("crypt4gh", &str); break;
801
0
    case empty_format:  kputs("empty", &str); break;
802
0
    default:    kputs("unknown", &str); break;
803
0
    }
804
805
0
    if (format->version.major >= 0) {
806
0
        kputs(" version ", &str);
807
0
        kputw(format->version.major, &str);
808
0
        if (format->version.minor >= 0) {
809
0
            kputc('.', &str);
810
0
            kputw(format->version.minor, &str);
811
0
        }
812
0
    }
813
814
0
    switch (format->compression) {
815
0
    case bzip2_compression:  kputs(" bzip2-compressed", &str); break;
816
0
    case razf_compression:   kputs(" legacy-RAZF-compressed", &str); break;
817
0
    case xz_compression:     kputs(" XZ-compressed", &str); break;
818
0
    case zstd_compression:   kputs(" Zstandard-compressed", &str); break;
819
0
    case custom: kputs(" compressed", &str); break;
820
0
    case gzip:   kputs(" gzip-compressed", &str); break;
821
822
0
    case bgzf:
823
0
        switch (format->format) {
824
0
        case bam:
825
0
        case bcf:
826
0
        case csi:
827
0
        case tbi:
828
            // These are by definition BGZF, so just use the generic term
829
0
            kputs(" compressed", &str);
830
0
            break;
831
0
        default:
832
0
            kputs(" BGZF-compressed", &str);
833
0
            break;
834
0
        }
835
0
        break;
836
837
0
    case no_compression:
838
0
        switch (format->format) {
839
0
        case bam:
840
0
        case bcf:
841
0
        case cram:
842
0
        case csi:
843
0
        case tbi:
844
            // These are normally compressed, so emphasise that this one isn't
845
0
            kputs(" uncompressed", &str);
846
0
            break;
847
0
        default:
848
0
            break;
849
0
        }
850
0
        break;
851
852
0
    default: break;
853
0
    }
854
855
0
    switch (format->category) {
856
0
    case sequence_data: kputs(" sequence", &str); break;
857
0
    case variant_data:  kputs(" variant calling", &str); break;
858
0
    case index_file:    kputs(" index", &str); break;
859
0
    case region_list:   kputs(" genomic region", &str); break;
860
0
    default: break;
861
0
    }
862
863
0
    if (format->compression == no_compression)
864
0
        switch (format->format) {
865
0
        case text_format:
866
0
        case sam:
867
0
        case crai:
868
0
        case vcf:
869
0
        case bed:
870
0
        case fai_format:
871
0
        case fqi_format:
872
0
        case fasta_format:
873
0
        case fastq_format:
874
0
        case htsget:
875
0
            kputs(" text", &str);
876
0
            break;
877
878
0
        case empty_format:
879
0
            break;
880
881
0
        default:
882
0
            kputs(" data", &str);
883
0
            break;
884
0
        }
885
0
    else
886
0
        kputs(" data", &str);
887
888
0
    return ks_release(&str);
889
0
}
890
891
htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt)
892
39.3k
{
893
39.3k
    char smode[101], *cp, *cp2, *mode_c, *uncomp = NULL;
894
39.3k
    htsFile *fp = NULL;
895
39.3k
    hFILE *hfile = NULL;
896
39.3k
    char fmt_code = '\0';
897
    // see enum htsExactFormat in htslib/hts.h
898
39.3k
    const char format_to_mode[] = "\0g\0\0b\0c\0\0b\0g\0\0\0\0\0Ff\0\0";
899
900
39.3k
    strncpy(smode, mode, 99);
901
39.3k
    smode[99]=0;
902
39.3k
    if ((cp = strchr(smode, ',')))
903
0
        *cp = '\0';
904
905
    // Migrate format code (b or c) to the end of the smode buffer.
906
103k
    for (cp2 = cp = smode; *cp; cp++) {
907
64.2k
        if (*cp == 'b')
908
14.4k
            fmt_code = 'b';
909
49.8k
        else if (*cp == 'c')
910
10.4k
            fmt_code = 'c';
911
39.3k
        else {
912
39.3k
            *cp2++ = *cp;
913
            // Cache the uncompress flag 'u' pos if present
914
39.3k
            if (!uncomp && (*cp == 'u')) {
915
0
                uncomp = cp2 - 1;
916
0
            }
917
39.3k
        }
918
64.2k
    }
919
39.3k
    mode_c = cp2;
920
39.3k
    *cp2++ = fmt_code;
921
39.3k
    *cp2++ = 0;
922
923
    // Set or reset the format code if opts->format is used
924
39.3k
    if (fmt && fmt->format > unknown_format
925
0
        && fmt->format < sizeof(format_to_mode)) {
926
0
        *mode_c = format_to_mode[fmt->format];
927
0
    }
928
929
    // Uncompressed bam/bcf is not supported, change 'u' to '0' on write
930
39.3k
    if (uncomp && *mode_c == 'b' && (strchr(smode, 'w') || strchr(smode, 'a'))) {
931
0
        *uncomp = '0';
932
0
    }
933
934
    // If we really asked for a compressed text format then mode_c above will
935
    // point to nul.  We set to 'z' to enable bgzf.
936
39.3k
    if (strchr(mode, 'w') && fmt && fmt->compression == bgzf) {
937
0
        if (fmt->format == sam || fmt->format == vcf || fmt->format == text_format)
938
0
            *mode_c = 'z';
939
0
    }
940
941
39.3k
    char *rmme = NULL, *fnidx = strstr(fn, HTS_IDX_DELIM);
942
39.3k
    if ( fnidx ) {
943
0
        rmme = strdup(fn);
944
0
        if ( !rmme ) goto error;
945
0
        rmme[fnidx-fn] = 0;
946
0
        fn = rmme;
947
0
    }
948
949
39.3k
    hfile = hopen(fn, smode);
950
39.3k
    if (hfile == NULL) goto error;
951
952
39.3k
    fp = hts_hopen(hfile, fn, smode);
953
39.3k
    if (fp == NULL) goto error;
954
955
    // Compensate for the loss of exactness in htsExactFormat.
956
    // hts_hopen returns generics such as binary or text, but we
957
    // have been given something explicit here so use that instead.
958
39.3k
    if (fp->is_write && fmt &&
959
0
        (fmt->format == bam || fmt->format == sam ||
960
0
         fmt->format == vcf || fmt->format == bcf ||
961
0
         fmt->format == bed || fmt->format == fasta_format ||
962
0
         fmt->format == fastq_format))
963
0
        fp->format.format = fmt->format;
964
965
39.3k
    if (fmt && fmt->specific) {
966
0
        if (hts_opt_apply(fp, fmt->specific) != 0) {
967
0
            if (((hts_opt*)fmt->specific)->opt == CRAM_OPT_REFERENCE &&
968
0
                (errno == ENOENT || errno == EIO || errno == EBADF ||
969
0
                  errno == EACCES || errno == EISDIR)) {
970
                /* error during reference file operation
971
                 for these specific errors, set the error as EINVAL */
972
0
                errno = EINVAL;
973
0
            }
974
0
            goto error;
975
0
        }
976
0
    }
977
39.3k
    if ( rmme ) free(rmme);
978
39.3k
    return fp;
979
980
0
error:
981
0
    hts_log_error("Failed to open file \"%s\"%s%s", fn,
982
0
                  errno ? " : " : "", errno ? strerror(errno) : "");
983
0
    if ( rmme ) free(rmme);
984
985
0
    if (hfile)
986
0
        hclose_abruptly(hfile);
987
988
0
    return NULL;
989
39.3k
}
990
991
39.3k
htsFile *hts_open(const char *fn, const char *mode) {
992
39.3k
    return hts_open_format(fn, mode, NULL);
993
39.3k
}
994
995
/*
996
 * Splits str into a prefix, delimiter ('\0' or delim), and suffix, writing
997
 * the prefix in lowercase into buf and returning a pointer to the suffix.
998
 * On return, buf is always NUL-terminated; thus assumes that the "keyword"
999
 * prefix should be one of several known values of maximum length buflen-2.
1000
 * (If delim is not found, returns a pointer to the '\0'.)
1001
 */
1002
static const char *
1003
scan_keyword(const char *str, char delim, char *buf, size_t buflen)
1004
0
{
1005
0
    size_t i = 0;
1006
0
    while (*str && *str != delim) {
1007
0
        if (i < buflen-1) buf[i++] = tolower_c(*str);
1008
0
        str++;
1009
0
    }
1010
1011
0
    buf[i] = '\0';
1012
0
    return *str? str+1 : str;
1013
0
}
1014
1015
/*
1016
 * Parses arg and appends it to the option list.
1017
 *
1018
 * Returns 0 on success;
1019
 *        -1 on failure.
1020
 */
1021
0
int hts_opt_add(hts_opt **opts, const char *c_arg) {
1022
0
    hts_opt *o, *t;
1023
0
    char *val;
1024
1025
    /*
1026
     * IMPORTANT!!!
1027
     * If you add another string option here, don't forget to also add
1028
     * it to the case statement in hts_opt_apply.
1029
     */
1030
1031
0
    if (!c_arg)
1032
0
        return -1;
1033
1034
0
    if (!(o =  malloc(sizeof(*o))))
1035
0
        return -1;
1036
1037
0
    if (!(o->arg = strdup(c_arg))) {
1038
0
        free(o);
1039
0
        return -1;
1040
0
    }
1041
1042
0
    if (!(val = strchr(o->arg, '=')))
1043
0
        val = "1"; // assume boolean
1044
0
    else
1045
0
        *val++ = '\0';
1046
1047
0
    if (strcmp(o->arg, "decode_md") == 0 ||
1048
0
        strcmp(o->arg, "DECODE_MD") == 0)
1049
0
        o->opt = CRAM_OPT_DECODE_MD, o->val.i = atoi(val);
1050
1051
0
    else if (strcmp(o->arg, "verbosity") == 0 ||
1052
0
             strcmp(o->arg, "VERBOSITY") == 0)
1053
0
        o->opt = CRAM_OPT_VERBOSITY, o->val.i = atoi(val);
1054
1055
0
    else if (strcmp(o->arg, "seqs_per_slice") == 0 ||
1056
0
             strcmp(o->arg, "SEQS_PER_SLICE") == 0)
1057
0
        o->opt = CRAM_OPT_SEQS_PER_SLICE, o->val.i = atoi(val);
1058
1059
0
    else if (strcmp(o->arg, "bases_per_slice") == 0 ||
1060
0
             strcmp(o->arg, "BASES_PER_SLICE") == 0)
1061
0
        o->opt = CRAM_OPT_BASES_PER_SLICE, o->val.i = atoi(val);
1062
1063
0
    else if (strcmp(o->arg, "slices_per_container") == 0 ||
1064
0
             strcmp(o->arg, "SLICES_PER_CONTAINER") == 0)
1065
0
        o->opt = CRAM_OPT_SLICES_PER_CONTAINER, o->val.i = atoi(val);
1066
1067
0
    else if (strcmp(o->arg, "embed_ref") == 0 ||
1068
0
             strcmp(o->arg, "EMBED_REF") == 0)
1069
0
        o->opt = CRAM_OPT_EMBED_REF, o->val.i = atoi(val);
1070
1071
0
    else if (strcmp(o->arg, "no_ref") == 0 ||
1072
0
             strcmp(o->arg, "NO_REF") == 0)
1073
0
        o->opt = CRAM_OPT_NO_REF, o->val.i = atoi(val);
1074
1075
0
    else if (strcmp(o->arg, "pos_delta") == 0 ||
1076
0
             strcmp(o->arg, "POS_DELTA") == 0)
1077
0
        o->opt = CRAM_OPT_POS_DELTA, o->val.i = atoi(val);
1078
1079
0
    else if (strcmp(o->arg, "ignore_md5") == 0 ||
1080
0
             strcmp(o->arg, "IGNORE_MD5") == 0)
1081
0
        o->opt = CRAM_OPT_IGNORE_MD5, o->val.i = atoi(val);
1082
1083
0
    else if (strcmp(o->arg, "use_bzip2") == 0 ||
1084
0
             strcmp(o->arg, "USE_BZIP2") == 0)
1085
0
        o->opt = CRAM_OPT_USE_BZIP2, o->val.i = atoi(val);
1086
1087
0
    else if (strcmp(o->arg, "use_rans") == 0 ||
1088
0
             strcmp(o->arg, "USE_RANS") == 0)
1089
0
        o->opt = CRAM_OPT_USE_RANS, o->val.i = atoi(val);
1090
1091
0
    else if (strcmp(o->arg, "use_lzma") == 0 ||
1092
0
             strcmp(o->arg, "USE_LZMA") == 0)
1093
0
        o->opt = CRAM_OPT_USE_LZMA, o->val.i = atoi(val);
1094
1095
0
    else if (strcmp(o->arg, "use_tok") == 0 ||
1096
0
             strcmp(o->arg, "USE_TOK") == 0)
1097
0
        o->opt = CRAM_OPT_USE_TOK, o->val.i = atoi(val);
1098
1099
0
    else if (strcmp(o->arg, "use_fqz") == 0 ||
1100
0
             strcmp(o->arg, "USE_FQZ") == 0)
1101
0
        o->opt = CRAM_OPT_USE_FQZ, o->val.i = atoi(val);
1102
1103
0
    else if (strcmp(o->arg, "use_arith") == 0 ||
1104
0
             strcmp(o->arg, "USE_ARITH") == 0)
1105
0
        o->opt = CRAM_OPT_USE_ARITH, o->val.i = atoi(val);
1106
1107
0
    else if (strcmp(o->arg, "fast") == 0 ||
1108
0
             strcmp(o->arg, "FAST") == 0)
1109
0
        o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_FAST;
1110
1111
0
    else if (strcmp(o->arg, "normal") == 0 ||
1112
0
             strcmp(o->arg, "NORMAL") == 0)
1113
0
        o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_NORMAL;
1114
1115
0
    else if (strcmp(o->arg, "small") == 0 ||
1116
0
             strcmp(o->arg, "SMALL") == 0)
1117
0
        o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_SMALL;
1118
1119
0
    else if (strcmp(o->arg, "archive") == 0 ||
1120
0
             strcmp(o->arg, "ARCHIVE") == 0)
1121
0
        o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_ARCHIVE;
1122
1123
0
    else if (strcmp(o->arg, "reference") == 0 ||
1124
0
             strcmp(o->arg, "REFERENCE") == 0)
1125
0
        o->opt = CRAM_OPT_REFERENCE, o->val.s = val;
1126
1127
0
    else if (strcmp(o->arg, "version") == 0 ||
1128
0
             strcmp(o->arg, "VERSION") == 0)
1129
0
        o->opt = CRAM_OPT_VERSION, o->val.s =val;
1130
1131
0
    else if (strcmp(o->arg, "multi_seq_per_slice") == 0 ||
1132
0
             strcmp(o->arg, "MULTI_SEQ_PER_SLICE") == 0)
1133
0
        o->opt = CRAM_OPT_MULTI_SEQ_PER_SLICE, o->val.i = atoi(val);
1134
1135
0
    else if (strcmp(o->arg, "nthreads") == 0 ||
1136
0
             strcmp(o->arg, "NTHREADS") == 0)
1137
0
        o->opt = HTS_OPT_NTHREADS, o->val.i = atoi(val);
1138
1139
0
    else if (strcmp(o->arg, "cache_size") == 0 ||
1140
0
             strcmp(o->arg, "CACHE_SIZE") == 0) {
1141
0
        char *endp;
1142
0
        o->opt = HTS_OPT_CACHE_SIZE;
1143
0
        o->val.i = strtol(val, &endp, 0);
1144
        // NB: Doesn't support floats, eg 1.5g
1145
        // TODO: extend hts_parse_decimal? See also samtools sort.
1146
0
        switch (*endp) {
1147
0
        case 'g': case 'G': o->val.i *= 1024; // fall through
1148
0
        case 'm': case 'M': o->val.i *= 1024; // fall through
1149
0
        case 'k': case 'K': o->val.i *= 1024; break;
1150
0
        case '\0': break;
1151
0
        default:
1152
0
            hts_log_error("Unrecognised cache size suffix '%c'", *endp);
1153
0
            free(o->arg);
1154
0
            free(o);
1155
0
            return -1;
1156
0
        }
1157
0
    }
1158
1159
0
    else if (strcmp(o->arg, "required_fields") == 0 ||
1160
0
             strcmp(o->arg, "REQUIRED_FIELDS") == 0)
1161
0
        o->opt = CRAM_OPT_REQUIRED_FIELDS, o->val.i = strtol(val, NULL, 0);
1162
1163
0
    else if (strcmp(o->arg, "lossy_names") == 0 ||
1164
0
             strcmp(o->arg, "LOSSY_NAMES") == 0)
1165
0
        o->opt = CRAM_OPT_LOSSY_NAMES, o->val.i = strtol(val, NULL, 0);
1166
1167
0
    else if (strcmp(o->arg, "name_prefix") == 0 ||
1168
0
             strcmp(o->arg, "NAME_PREFIX") == 0)
1169
0
        o->opt = CRAM_OPT_PREFIX, o->val.s = val;
1170
1171
0
    else if (strcmp(o->arg, "store_md") == 0 ||
1172
0
             strcmp(o->arg, "store_md") == 0)
1173
0
        o->opt = CRAM_OPT_STORE_MD, o->val.i = atoi(val);
1174
1175
0
    else if (strcmp(o->arg, "store_nm") == 0 ||
1176
0
             strcmp(o->arg, "store_nm") == 0)
1177
0
        o->opt = CRAM_OPT_STORE_NM, o->val.i = atoi(val);
1178
1179
0
    else if (strcmp(o->arg, "block_size") == 0 ||
1180
0
             strcmp(o->arg, "BLOCK_SIZE") == 0)
1181
0
        o->opt = HTS_OPT_BLOCK_SIZE, o->val.i = strtol(val, NULL, 0);
1182
1183
0
    else if (strcmp(o->arg, "level") == 0 ||
1184
0
             strcmp(o->arg, "LEVEL") == 0)
1185
0
        o->opt = HTS_OPT_COMPRESSION_LEVEL, o->val.i = strtol(val, NULL, 0);
1186
1187
0
    else if (strcmp(o->arg, "filter") == 0 ||
1188
0
             strcmp(o->arg, "FILTER") == 0)
1189
0
        o->opt = HTS_OPT_FILTER, o->val.s = val;
1190
1191
0
    else if (strcmp(o->arg, "fastq_aux") == 0 ||
1192
0
        strcmp(o->arg, "FASTQ_AUX") == 0)
1193
0
        o->opt = FASTQ_OPT_AUX, o->val.s = val;
1194
1195
0
    else if (strcmp(o->arg, "fastq_barcode") == 0 ||
1196
0
        strcmp(o->arg, "FASTQ_BARCODE") == 0)
1197
0
        o->opt = FASTQ_OPT_BARCODE, o->val.s = val;
1198
1199
0
    else if (strcmp(o->arg, "fastq_rnum") == 0 ||
1200
0
        strcmp(o->arg, "FASTQ_RNUM") == 0)
1201
0
        o->opt = FASTQ_OPT_RNUM, o->val.i = 1;
1202
1203
0
    else if (strcmp(o->arg, "fastq_casava") == 0 ||
1204
0
        strcmp(o->arg, "FASTQ_CASAVA") == 0)
1205
0
        o->opt = FASTQ_OPT_CASAVA, o->val.i = 1;
1206
1207
0
    else if (strcmp(o->arg, "fastq_name2") == 0 ||
1208
0
        strcmp(o->arg, "FASTQ_NAME2") == 0)
1209
0
        o->opt = FASTQ_OPT_NAME2, o->val.i = 1;
1210
1211
0
    else if (strcmp(o->arg, "fastq_umi") == 0 ||
1212
0
        strcmp(o->arg, "FASTQ_UMI") == 0)
1213
0
        o->opt = FASTQ_OPT_UMI, o->val.s = val;
1214
1215
0
    else if (strcmp(o->arg, "fastq_umi_regex") == 0 ||
1216
0
        strcmp(o->arg, "FASTQ_UMI_REGEX") == 0)
1217
0
        o->opt = FASTQ_OPT_UMI_REGEX, o->val.s = val;
1218
1219
0
    else {
1220
0
        hts_log_error("Unknown option '%s'", o->arg);
1221
0
        free(o->arg);
1222
0
        free(o);
1223
0
        return -1;
1224
0
    }
1225
1226
0
    o->next = NULL;
1227
1228
    // Append; assumes small list.
1229
0
    if (*opts) {
1230
0
        t = *opts;
1231
0
        while (t->next)
1232
0
            t = t->next;
1233
0
        t->next = o;
1234
0
    } else {
1235
0
        *opts = o;
1236
0
    }
1237
1238
0
    return 0;
1239
0
}
1240
1241
/*
1242
 * Applies an hts_opt option list to a given htsFile.
1243
 *
1244
 * Returns 0 on success
1245
 *        -1 on failure
1246
 */
1247
0
int hts_opt_apply(htsFile *fp, hts_opt *opts) {
1248
0
    hts_opt *last = NULL;
1249
1250
0
    for (; opts;  opts = (last=opts)->next) {
1251
0
        switch (opts->opt) {
1252
0
            case CRAM_OPT_REFERENCE:
1253
0
                if (!(fp->fn_aux = strdup(opts->val.s)))
1254
0
                    return -1;
1255
                // fall through
1256
0
            case CRAM_OPT_VERSION:
1257
0
            case CRAM_OPT_PREFIX:
1258
0
            case HTS_OPT_FILTER:
1259
0
            case FASTQ_OPT_AUX:
1260
0
            case FASTQ_OPT_BARCODE:
1261
0
            case FASTQ_OPT_UMI:
1262
0
            case FASTQ_OPT_UMI_REGEX:
1263
0
                if (hts_set_opt(fp,  opts->opt,  opts->val.s) != 0)
1264
0
                    return -1;
1265
0
                break;
1266
0
            default:
1267
0
                if (hts_set_opt(fp,  opts->opt,  opts->val.i) != 0)
1268
0
                    return -1;
1269
0
                break;
1270
0
        }
1271
0
    }
1272
1273
0
    return 0;
1274
0
}
1275
1276
/*
1277
 * Frees an hts_opt list.
1278
 */
1279
0
void hts_opt_free(hts_opt *opts) {
1280
0
    hts_opt *last = NULL;
1281
0
    while (opts) {
1282
0
        opts = (last=opts)->next;
1283
0
        free(last->arg);
1284
0
        free(last);
1285
0
    }
1286
0
}
1287
1288
1289
/*
1290
 * Tokenise options as (key(=value)?,)*(key(=value)?)?
1291
 * NB: No provision for ',' appearing in the value!
1292
 * Add backslashing rules?
1293
 *
1294
 * This could be used as part of a general command line option parser or
1295
 * as a string concatenated onto the file open mode.
1296
 *
1297
 * Returns 0 on success
1298
 *        -1 on failure.
1299
 */
1300
0
int hts_parse_opt_list(htsFormat *fmt, const char *str) {
1301
0
    while (str && *str) {
1302
0
        const char *str_start;
1303
0
        int len;
1304
0
        char arg[8001];
1305
1306
0
        while (*str && *str == ',')
1307
0
            str++;
1308
1309
0
        for (str_start = str; *str && *str != ','; str++);
1310
0
        len = str - str_start;
1311
1312
        // Produce a nul terminated copy of the option
1313
0
        strncpy(arg, str_start, len < 8000 ? len : 8000);
1314
0
        arg[len < 8000 ? len : 8000] = '\0';
1315
1316
0
        if (hts_opt_add((hts_opt **)&fmt->specific, arg) != 0)
1317
0
            return -1;
1318
1319
0
        if (*str)
1320
0
            str++;
1321
0
    }
1322
1323
0
    return 0;
1324
0
}
1325
1326
/*
1327
 * Accepts a string file format (sam, bam, cram, vcf, bam) optionally
1328
 * followed by a comma separated list of key=value options and splits
1329
 * these up into the fields of htsFormat struct.
1330
 *
1331
 * format is assumed to be already initialised, either to blank
1332
 * "unknown" values or via previous hts_opt_add calls.
1333
 *
1334
 * Returns 0 on success
1335
 *        -1 on failure.
1336
 */
1337
0
int hts_parse_format(htsFormat *format, const char *str) {
1338
0
    char fmt[9];
1339
0
    const char *cp = scan_keyword(str, ',', fmt, sizeof fmt);
1340
1341
0
    format->version.minor = 0; // unknown
1342
0
    format->version.major = 0; // unknown
1343
1344
0
    if (strcmp(fmt, "sam") == 0) {
1345
0
        format->category          = sequence_data;
1346
0
        format->format            = sam;
1347
0
        format->compression       = no_compression;
1348
0
        format->compression_level = 0;
1349
0
    } else if (strcmp(fmt, "sam.gz") == 0) {
1350
0
        format->category          = sequence_data;
1351
0
        format->format            = sam;
1352
0
        format->compression       = bgzf;
1353
0
        format->compression_level = -1;
1354
0
    } else if (strcmp(fmt, "bam") == 0) {
1355
0
        format->category          = sequence_data;
1356
0
        format->format            = bam;
1357
0
        format->compression       = bgzf;
1358
0
        format->compression_level = -1;
1359
0
    } else if (strcmp(fmt, "cram") == 0) {
1360
0
        format->category          = sequence_data;
1361
0
        format->format            = cram;
1362
0
        format->compression       = custom;
1363
0
        format->compression_level = -1;
1364
0
    } else if (strcmp(fmt, "vcf") == 0) {
1365
0
        format->category          = variant_data;
1366
0
        format->format            = vcf;
1367
0
        format->compression       = no_compression;
1368
0
        format->compression_level = 0;
1369
0
    } else if (strcmp(fmt, "bcf") == 0) {
1370
0
        format->category          = variant_data;
1371
0
        format->format            = bcf;
1372
0
        format->compression       = bgzf;
1373
0
        format->compression_level = -1;
1374
0
    } else if (strcmp(fmt, "fastq") == 0 || strcmp(fmt, "fq") == 0) {
1375
0
        format->category          = sequence_data;
1376
0
        format->format            = fastq_format;
1377
0
        format->compression       = no_compression;
1378
0
        format->compression_level = 0;
1379
0
    } else if (strcmp(fmt, "fastq.gz") == 0 || strcmp(fmt, "fq.gz") == 0) {
1380
0
        format->category          = sequence_data;
1381
0
        format->format            = fastq_format;
1382
0
        format->compression       = bgzf;
1383
0
        format->compression_level = 0;
1384
0
    } else if (strcmp(fmt, "fasta") == 0 || strcmp(fmt, "fa") == 0) {
1385
0
        format->category          = sequence_data;
1386
0
        format->format            = fasta_format;
1387
0
        format->compression       = no_compression;
1388
0
        format->compression_level = 0;
1389
0
    } else if (strcmp(fmt, "fasta.gz") == 0 || strcmp(fmt, "fa.gz") == 0) {
1390
0
        format->category          = sequence_data;
1391
0
        format->format            = fasta_format;
1392
0
        format->compression       = bgzf;
1393
0
        format->compression_level = 0;
1394
0
    } else {
1395
0
        return -1;
1396
0
    }
1397
1398
0
    return hts_parse_opt_list(format, cp);
1399
0
}
1400
1401
1402
/*
1403
 * Tokenise options as (key(=value)?,)*(key(=value)?)?
1404
 * NB: No provision for ',' appearing in the value!
1405
 * Add backslashing rules?
1406
 *
1407
 * This could be used as part of a general command line option parser or
1408
 * as a string concatenated onto the file open mode.
1409
 *
1410
 * Returns 0 on success
1411
 *        -1 on failure.
1412
 */
1413
0
static int hts_process_opts(htsFile *fp, const char *opts) {
1414
0
    htsFormat fmt;
1415
1416
0
    fmt.specific = NULL;
1417
0
    if (hts_parse_opt_list(&fmt, opts) != 0)
1418
0
        return -1;
1419
1420
0
    if (hts_opt_apply(fp, fmt.specific) != 0) {
1421
0
        hts_opt_free(fmt.specific);
1422
0
        return -1;
1423
0
    }
1424
1425
0
    hts_opt_free(fmt.specific);
1426
1427
0
    return 0;
1428
0
}
1429
1430
static int hts_crypt4gh_redirect(const char *fn, const char *mode,
1431
2
                                 hFILE **hfile_ptr, htsFile *fp) {
1432
2
    hFILE *hfile1 = *hfile_ptr;
1433
2
    hFILE *hfile2 = NULL;
1434
2
    char fn_buf[512], *fn2 = fn_buf;
1435
2
    char mode2[102]; // Size set by sizeof(simple_mode) in hts_hopen()
1436
2
    const char *prefix = "crypt4gh:";
1437
2
    size_t fn2_len = strlen(prefix) + strlen(fn) + 1;
1438
2
    int ret = -1;
1439
1440
2
    if (fn2_len > sizeof(fn_buf)) {
1441
0
        if (fn2_len >= INT_MAX) // Silence gcc format-truncation warning
1442
0
            return -1;
1443
0
        fn2 = malloc(fn2_len);
1444
0
        if (!fn2) return -1;
1445
0
    }
1446
1447
    // Reopen fn using the crypt4gh plug-in (if available)
1448
2
    snprintf(fn2, fn2_len, "%s%s", prefix, fn);
1449
2
    snprintf(mode2, sizeof(mode2), "%s%s", mode, strchr(mode, ':') ? "" : ":");
1450
2
    hfile2 = hopen(fn2, mode2, "parent", hfile1, NULL);
1451
2
    if (hfile2) {
1452
        // Replace original hfile with the new one.  The original is now
1453
        // enclosed within hfile2
1454
0
        *hfile_ptr = hfile2;
1455
0
        ret = 0;
1456
0
    }
1457
1458
2
    if (fn2 != fn_buf)
1459
0
        free(fn2);
1460
2
    return ret;
1461
2
}
1462
1463
htsFile *hts_hopen(hFILE *hfile, const char *fn, const char *mode)
1464
95.3k
{
1465
95.3k
    hFILE *hfile_orig = hfile;
1466
95.3k
    hFILE *hfile_cleanup = hfile;
1467
95.3k
    htsFile *fp = (htsFile*)calloc(1, sizeof(htsFile));
1468
95.3k
    char simple_mode[101], *cp, *opts;
1469
95.3k
    simple_mode[100] = '\0';
1470
1471
95.3k
    if (fp == NULL) goto error;
1472
1473
95.3k
    fp->fn = strdup(fn);
1474
95.3k
    fp->is_be = ed_is_big();
1475
1476
    // Split mode into simple_mode,opts strings
1477
95.3k
    if ((cp = strchr(mode, ','))) {
1478
0
        strncpy(simple_mode, mode, cp-mode <= 100 ? cp-mode : 100);
1479
0
        simple_mode[cp-mode] = '\0';
1480
0
        opts = cp+1;
1481
95.3k
    } else {
1482
95.3k
        strncpy(simple_mode, mode, 100);
1483
95.3k
        opts = NULL;
1484
95.3k
    }
1485
1486
95.3k
    if (strchr(simple_mode, 'r')) {
1487
56.0k
        const int max_loops = 5; // Should be plenty
1488
56.0k
        int loops = 0;
1489
56.0k
        if (hts_detect_format2(hfile, fn, &fp->format) < 0) goto error;
1490
1491
        // Deal with formats that re-direct an underlying file via a plug-in.
1492
        // Loops as we may have crypt4gh served via htsget, or
1493
        // crypt4gh-in-crypt4gh.
1494
1495
56.0k
        while (fp->format.format == htsget ||
1496
56.0k
               fp->format.format == hts_crypt4gh_format) {
1497
            // Ensure we don't get stuck in an endless redirect loop
1498
2
            if (++loops > max_loops) {
1499
0
                errno = ELOOP;
1500
0
                goto error;
1501
0
            }
1502
1503
2
            if (fp->format.format == htsget) {
1504
0
                hFILE *hfile2 = hopen_htsget_redirect(hfile, simple_mode);
1505
0
                if (hfile2 == NULL) goto error;
1506
1507
0
                if (hfile != hfile_cleanup) {
1508
                    // Close the result of an earlier redirection
1509
0
                    hclose_abruptly(hfile);
1510
0
                }
1511
1512
0
                hfile = hfile2;
1513
0
            }
1514
2
            else if (fp->format.format == hts_crypt4gh_format) {
1515
2
                int should_preserve = (hfile == hfile_orig);
1516
2
                int update_cleanup = (hfile == hfile_cleanup);
1517
2
                if (hts_crypt4gh_redirect(fn, simple_mode, &hfile, fp) < 0)
1518
2
                    goto error;
1519
0
                if (should_preserve) {
1520
                    // The original hFILE is now contained in a crypt4gh
1521
                    // wrapper.  Should we need to close the wrapper due
1522
                    // to a later error, we need to prevent the wrapped
1523
                    // handle from being closed as the caller will see
1524
                    // this function return NULL and try to clean up itself.
1525
0
                    hfile_orig->preserve = 1;
1526
0
                }
1527
0
                if (update_cleanup) {
1528
                    // Update handle to close at the end if redirected by htsget
1529
0
                    hfile_cleanup = hfile;
1530
0
                }
1531
0
            }
1532
1533
            // Re-detect format against the result of the redirection
1534
0
            if (hts_detect_format2(hfile, fn, &fp->format) < 0) goto error;
1535
0
        }
1536
56.0k
    }
1537
39.3k
    else if (strchr(simple_mode, 'w') || strchr(simple_mode, 'a')) {
1538
39.3k
        htsFormat *fmt = &fp->format;
1539
39.3k
        fp->is_write = 1;
1540
1541
39.3k
        if (strchr(simple_mode, 'b')) fmt->format = binary_format;
1542
24.9k
        else if (strchr(simple_mode, 'c')) fmt->format = cram;
1543
14.4k
        else if (strchr(simple_mode, 'f')) fmt->format = fastq_format;
1544
14.4k
        else if (strchr(simple_mode, 'F')) fmt->format = fasta_format;
1545
14.4k
        else fmt->format = text_format;
1546
1547
39.3k
        if (strchr(simple_mode, 'z')) fmt->compression = bgzf;
1548
39.3k
        else if (strchr(simple_mode, 'g')) fmt->compression = gzip;
1549
39.3k
        else if (strchr(simple_mode, 'u')) fmt->compression = no_compression;
1550
39.3k
        else {
1551
            // No compression mode specified, set to the default for the format
1552
39.3k
            switch (fmt->format) {
1553
14.4k
            case binary_format: fmt->compression = bgzf; break;
1554
10.4k
            case cram: fmt->compression = custom; break;
1555
0
            case fastq_format: fmt->compression = no_compression; break;
1556
0
            case fasta_format: fmt->compression = no_compression; break;
1557
14.4k
            case text_format: fmt->compression = no_compression; break;
1558
0
            default: abort();
1559
39.3k
            }
1560
39.3k
        }
1561
1562
        // Fill in category (if determinable; e.g. 'b' could be BAM or BCF)
1563
39.3k
        fmt->category = format_category(fmt->format);
1564
1565
39.3k
        fmt->version.major = fmt->version.minor = -1;
1566
39.3k
        fmt->compression_level = -1;
1567
39.3k
        fmt->specific = NULL;
1568
39.3k
    }
1569
0
    else { errno = EINVAL; goto error; }
1570
1571
95.3k
    switch (fp->format.format) {
1572
14.4k
    case binary_format:
1573
17.7k
    case bam:
1574
19.1k
    case bcf:
1575
19.1k
        fp->fp.bgzf = bgzf_hopen(hfile, simple_mode);
1576
19.1k
        if (fp->fp.bgzf == NULL) goto error;
1577
19.1k
        fp->is_bin = fp->is_bgzf = 1;
1578
19.1k
        break;
1579
1580
22.2k
    case cram:
1581
22.2k
        fp->fp.cram = cram_dopen(hfile, fn, simple_mode);
1582
22.2k
        if (fp->fp.cram == NULL) goto error;
1583
20.2k
        if (!fp->is_write)
1584
9.75k
            cram_set_option(fp->fp.cram, CRAM_OPT_DECODE_MD, -1); // auto
1585
20.2k
        fp->is_cram = 1;
1586
20.2k
        break;
1587
1588
60
    case empty_format:
1589
14.6k
    case text_format:
1590
14.6k
    case bed:
1591
22.2k
    case fasta_format:
1592
22.5k
    case fastq_format:
1593
43.4k
    case sam:
1594
53.9k
    case vcf:
1595
53.9k
        if (fp->format.compression != no_compression) {
1596
2.87k
            fp->fp.bgzf = bgzf_hopen(hfile, simple_mode);
1597
2.87k
            if (fp->fp.bgzf == NULL) goto error;
1598
2.81k
            fp->is_bgzf = 1;
1599
2.81k
        }
1600
51.0k
        else
1601
51.0k
            fp->fp.hfile = hfile;
1602
53.8k
        break;
1603
1604
53.8k
    default:
1605
110
        errno = EFTYPE;
1606
110
        goto error;
1607
95.3k
    }
1608
1609
93.2k
    if (opts)
1610
0
        hts_process_opts(fp, opts);
1611
1612
    // Allow original file to close if it was preserved earlier by crypt4gh
1613
93.2k
    hfile_orig->preserve = 0;
1614
1615
    // If redirecting via htsget, close the original hFILE now (pedantically
1616
    // we would instead close it in hts_close(), but this a simplifying
1617
    // optimisation)
1618
93.2k
    if (hfile != hfile_cleanup) hclose_abruptly(hfile_cleanup);
1619
1620
93.2k
    return fp;
1621
1622
2.14k
error:
1623
2.14k
    hts_log_error("Failed to open file %s", fn);
1624
1625
    // If redirecting, close the failed redirection hFILE that we have opened
1626
2.14k
    if (hfile != hfile_orig) hclose_abruptly(hfile);
1627
2.14k
    hfile_orig->preserve = 0; // Allow caller to close the original hfile
1628
1629
2.14k
    if (fp) {
1630
2.14k
        free(fp->fn);
1631
2.14k
        free(fp->fn_aux);
1632
2.14k
        free(fp);
1633
2.14k
    }
1634
2.14k
    return NULL;
1635
95.3k
}
1636
1637
static int hts_idx_close_otf_fp(hts_idx_t *idx);
1638
1639
int hts_close(htsFile *fp)
1640
93.2k
{
1641
93.2k
    int ret = 0, save;
1642
93.2k
    if (!fp) {
1643
0
        errno = EINVAL;
1644
0
        return -1;
1645
0
    }
1646
1647
93.2k
    switch (fp->format.format) {
1648
1.66k
    case binary_format:
1649
14.9k
    case bam:
1650
19.1k
    case bcf:
1651
19.1k
        ret = bgzf_close(fp->fp.bgzf);
1652
19.1k
        break;
1653
1654
20.2k
    case cram:
1655
20.2k
        if (!fp->is_write) {
1656
9.75k
            switch (cram_eof(fp->fp.cram)) {
1657
75
            case 2:
1658
75
                hts_log_warning("EOF marker is absent. The input is probably truncated");
1659
75
                break;
1660
7.09k
            case 0:  /* not at EOF, but may not have wanted all seqs */
1661
9.68k
            default: /* case 1, expected EOF */
1662
9.68k
                break;
1663
9.75k
            }
1664
9.75k
        }
1665
20.2k
        ret = cram_close(fp->fp.cram);
1666
20.2k
        break;
1667
1668
14
    case empty_format:
1669
1.83k
    case text_format:
1670
1.84k
    case bed:
1671
9.43k
    case fasta_format:
1672
9.78k
    case fastq_format:
1673
40.5k
    case sam:
1674
53.8k
    case vcf:
1675
53.8k
        if (fp->format.format == sam)
1676
30.7k
            ret = sam_state_destroy(fp);
1677
23.1k
        else if (fp->format.format == fastq_format ||
1678
22.7k
                 fp->format.format == fasta_format)
1679
7.94k
            fastq_state_destroy(fp);
1680
1681
53.8k
        if (fp->format.compression != no_compression)
1682
2.81k
            ret |= bgzf_close(fp->fp.bgzf);
1683
51.0k
        else
1684
51.0k
            ret |= hclose(fp->fp.hfile);
1685
53.8k
        break;
1686
1687
0
    default:
1688
0
        ret = -1;
1689
0
        break;
1690
93.2k
    }
1691
1692
93.2k
    if (fp->idx) {
1693
        // Close deferred index file handle, if present.
1694
        // Unfortunately this means errors on the index will get mixed with
1695
        // those on the main file, but as we only have the EOF block left to
1696
        // write it hopefully won't happen that often.
1697
0
        ret |= hts_idx_close_otf_fp(fp->idx);
1698
0
    }
1699
1700
93.2k
    save = errno;
1701
93.2k
    sam_hdr_destroy(fp->bam_header);
1702
93.2k
    hts_idx_destroy(fp->idx);
1703
93.2k
    hts_filter_free(fp->filter);
1704
93.2k
    free(fp->fn);
1705
93.2k
    free(fp->fn_aux);
1706
93.2k
    free(fp->line.s);
1707
93.2k
    free(fp);
1708
93.2k
    errno = save;
1709
93.2k
    return ret;
1710
93.2k
}
1711
1712
int hts_flush(htsFile *fp)
1713
0
{
1714
0
    if (fp == NULL) return 0;
1715
1716
0
    switch (fp->format.format) {
1717
0
    case binary_format:
1718
0
    case bam:
1719
0
    case bcf:
1720
0
        return bgzf_flush(fp->fp.bgzf);
1721
1722
0
    case cram:
1723
0
        return cram_flush(fp->fp.cram);
1724
1725
0
    case empty_format:
1726
0
    case text_format:
1727
0
    case bed:
1728
0
    case fasta_format:
1729
0
    case fastq_format:
1730
0
    case sam:
1731
0
    case vcf:
1732
0
        if (fp->format.compression != no_compression)
1733
0
            return bgzf_flush(fp->fp.bgzf);
1734
0
        else
1735
0
            return hflush(fp->fp.hfile);
1736
1737
0
    default:
1738
0
        break;
1739
0
    }
1740
1741
0
    return 0;
1742
0
}
1743
1744
const htsFormat *hts_get_format(htsFile *fp)
1745
0
{
1746
0
    return fp? &fp->format : NULL;
1747
0
}
1748
1749
0
const char *hts_format_file_extension(const htsFormat *format) {
1750
0
    if (!format)
1751
0
        return "?";
1752
1753
0
    switch (format->format) {
1754
0
    case sam:  return "sam";
1755
0
    case bam:  return "bam";
1756
0
    case bai:  return "bai";
1757
0
    case cram: return "cram";
1758
0
    case crai: return "crai";
1759
0
    case vcf:  return "vcf";
1760
0
    case bcf:  return "bcf";
1761
0
    case csi:  return "csi";
1762
0
    case fai_format:   return "fai";
1763
0
    case fqi_format:   return "fqi";
1764
0
    case gzi:  return "gzi";
1765
0
    case tbi:  return "tbi";
1766
0
    case bed:  return "bed";
1767
0
    case d4_format:    return "d4";
1768
0
    case fasta_format: return "fa";
1769
0
    case fastq_format: return "fq";
1770
0
    default:   return "?";
1771
0
    }
1772
0
}
1773
1774
0
static hFILE *hts_hfile(htsFile *fp) {
1775
0
    switch (fp->format.format) {
1776
0
    case binary_format:// fall through
1777
0
    case bcf:          // fall through
1778
0
    case bam:          return bgzf_hfile(fp->fp.bgzf);
1779
0
    case cram:         return cram_hfile(fp->fp.cram);
1780
0
    case text_format:  // fall through
1781
0
    case vcf:          // fall through
1782
0
    case fastq_format: // fall through
1783
0
    case fasta_format: // fall through
1784
0
    case sam:          return fp->format.compression != no_compression
1785
0
                              ? bgzf_hfile(fp->fp.bgzf)
1786
0
                              : fp->fp.hfile;
1787
0
    default:           return NULL;
1788
0
    }
1789
0
}
1790
1791
0
int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) {
1792
0
    int r;
1793
0
    va_list args;
1794
1795
0
    switch (opt) {
1796
0
    case HTS_OPT_NTHREADS: {
1797
0
        va_start(args, opt);
1798
0
        int nthreads = va_arg(args, int);
1799
0
        va_end(args);
1800
0
        return hts_set_threads(fp, nthreads);
1801
0
    }
1802
1803
0
    case HTS_OPT_BLOCK_SIZE: {
1804
0
        hFILE *hf = hts_hfile(fp);
1805
1806
0
        if (hf) {
1807
0
            va_start(args, opt);
1808
0
            if (hfile_set_blksize(hf, va_arg(args, int)) != 0)
1809
0
                hts_log_warning("Failed to change block size");
1810
0
            va_end(args);
1811
0
        }
1812
0
        else {
1813
            // To do - implement for vcf/bcf.
1814
0
            hts_log_warning("Cannot change block size for this format");
1815
0
        }
1816
1817
0
        return 0;
1818
0
    }
1819
1820
0
    case HTS_OPT_THREAD_POOL: {
1821
0
        va_start(args, opt);
1822
0
        htsThreadPool *p = va_arg(args, htsThreadPool *);
1823
0
        va_end(args);
1824
0
        return hts_set_thread_pool(fp, p);
1825
0
    }
1826
1827
0
    case HTS_OPT_CACHE_SIZE: {
1828
0
        va_start(args, opt);
1829
0
        int cache_size = va_arg(args, int);
1830
0
        va_end(args);
1831
0
        hts_set_cache_size(fp, cache_size);
1832
0
        return 0;
1833
0
    }
1834
1835
0
    case FASTQ_OPT_CASAVA:
1836
0
    case FASTQ_OPT_RNUM:
1837
0
    case FASTQ_OPT_NAME2:
1838
0
        if (fp->format.format == fastq_format ||
1839
0
            fp->format.format == fasta_format)
1840
0
            return fastq_state_set(fp, opt);
1841
0
        return 0;
1842
1843
0
    case FASTQ_OPT_AUX:
1844
0
        if (fp->format.format == fastq_format ||
1845
0
            fp->format.format == fasta_format) {
1846
0
            va_start(args, opt);
1847
0
            char *list = va_arg(args, char *);
1848
0
            va_end(args);
1849
0
            return fastq_state_set(fp, opt, list);
1850
0
        }
1851
0
        return 0;
1852
1853
0
    case FASTQ_OPT_BARCODE:
1854
0
    case FASTQ_OPT_UMI:
1855
0
    case FASTQ_OPT_UMI_REGEX:
1856
0
        if (fp->format.format == fastq_format ||
1857
0
            fp->format.format == fasta_format) {
1858
0
            va_start(args, opt);
1859
0
            char *bc = va_arg(args, char *);
1860
0
            va_end(args);
1861
0
            return fastq_state_set(fp, opt, bc);
1862
0
        }
1863
0
        return 0;
1864
1865
    // Options below here flow through to cram_set_voption
1866
0
    case HTS_OPT_COMPRESSION_LEVEL: {
1867
0
        va_start(args, opt);
1868
0
        int level = va_arg(args, int);
1869
0
        va_end(args);
1870
0
        if (fp->is_bgzf)
1871
0
            fp->fp.bgzf->compress_level = level;
1872
0
        else if (fp->format.format == cram)
1873
0
            return cram_set_option(fp->fp.cram, opt, level);
1874
0
        return 0;
1875
0
    }
1876
1877
0
    case HTS_OPT_FILTER: {
1878
0
        va_start(args, opt);
1879
0
        char *expr = va_arg(args, char *);
1880
0
        va_end(args);
1881
0
        return hts_set_filter_expression(fp, expr);
1882
0
    }
1883
1884
0
    case HTS_OPT_PROFILE: {
1885
0
        va_start(args, opt);
1886
0
        enum hts_profile_option prof = va_arg(args, int);
1887
0
        va_end(args);
1888
0
        if (fp->is_bgzf) {
1889
0
            switch (prof) {
1890
#ifdef HAVE_LIBDEFLATE
1891
            case HTS_PROFILE_FAST:    fp->fp.bgzf->compress_level =  2; break;
1892
            case HTS_PROFILE_NORMAL:  fp->fp.bgzf->compress_level = -1; break;
1893
            case HTS_PROFILE_SMALL:   fp->fp.bgzf->compress_level = 10; break;
1894
            case HTS_PROFILE_ARCHIVE: fp->fp.bgzf->compress_level = 12; break;
1895
#else
1896
0
            case HTS_PROFILE_FAST:    fp->fp.bgzf->compress_level =  1; break;
1897
0
            case HTS_PROFILE_NORMAL:  fp->fp.bgzf->compress_level = -1; break;
1898
0
            case HTS_PROFILE_SMALL:   fp->fp.bgzf->compress_level =  8; break;
1899
0
            case HTS_PROFILE_ARCHIVE: fp->fp.bgzf->compress_level =  9; break;
1900
0
#endif
1901
0
            }
1902
0
        } // else CRAM manages this in its own way
1903
0
        break;
1904
0
    }
1905
1906
0
    default:
1907
0
        break;
1908
0
    }
1909
1910
0
    if (fp->format.format != cram)
1911
0
        return 0;
1912
1913
0
    va_start(args, opt);
1914
0
    r = cram_set_voption(fp->fp.cram, opt, args);
1915
0
    va_end(args);
1916
1917
0
    return r;
1918
0
}
1919
1920
BGZF *hts_get_bgzfp(htsFile *fp);
1921
1922
int hts_set_threads(htsFile *fp, int n)
1923
0
{
1924
0
    if (fp->format.format == sam) {
1925
0
        return sam_set_threads(fp, n);
1926
0
    } else if (fp->format.compression == bgzf) {
1927
0
        return bgzf_mt(hts_get_bgzfp(fp), n, 256/*unused*/);
1928
0
    } else if (fp->format.format == cram) {
1929
0
        return hts_set_opt(fp, CRAM_OPT_NTHREADS, n);
1930
0
    }
1931
0
    else return 0;
1932
0
}
1933
1934
0
int hts_set_thread_pool(htsFile *fp, htsThreadPool *p) {
1935
0
    if (fp->format.format == sam || fp->format.format == text_format) {
1936
0
        return sam_set_thread_pool(fp, p);
1937
0
    } else if (fp->format.compression == bgzf) {
1938
0
        return bgzf_thread_pool(hts_get_bgzfp(fp), p->pool, p->qsize);
1939
0
    } else if (fp->format.format == cram) {
1940
0
        return hts_set_opt(fp, CRAM_OPT_THREAD_POOL, p);
1941
0
    }
1942
0
    else return 0;
1943
0
}
1944
1945
void hts_set_cache_size(htsFile *fp, int n)
1946
0
{
1947
0
    if (fp->format.compression == bgzf)
1948
0
        bgzf_set_cache_size(hts_get_bgzfp(fp), n);
1949
0
}
1950
1951
int hts_set_fai_filename(htsFile *fp, const char *fn_aux)
1952
0
{
1953
0
    free(fp->fn_aux);
1954
0
    if (fn_aux) {
1955
0
        fp->fn_aux = strdup(fn_aux);
1956
0
        if (fp->fn_aux == NULL) return -1;
1957
0
    }
1958
0
    else fp->fn_aux = NULL;
1959
1960
0
    if (fp->format.format == cram)
1961
0
        if (cram_set_option(fp->fp.cram, CRAM_OPT_REFERENCE, fp->fn_aux))
1962
0
            return -1;
1963
1964
0
    return 0;
1965
0
}
1966
1967
int hts_set_filter_expression(htsFile *fp, const char *expr)
1968
0
{
1969
0
    if (fp->filter)
1970
0
        hts_filter_free(fp->filter);
1971
1972
0
    if (!expr)
1973
0
        return 0;
1974
1975
0
    return (fp->filter = hts_filter_init(expr))
1976
0
        ? 0 : -1;
1977
0
}
1978
1979
hFILE *hts_open_tmpfile(const char *fname, const char *mode, kstring_t *tmpname)
1980
0
{
1981
0
    int pid = (int) getpid();
1982
0
    unsigned ptr = (uintptr_t) tmpname;
1983
0
    int n = 0;
1984
0
    hFILE *fp = NULL;
1985
1986
0
    do {
1987
        // Attempt to further uniquify the temporary filename
1988
0
        unsigned t = ((unsigned) time(NULL)) ^ ((unsigned) clock()) ^ ptr;
1989
0
        n++;
1990
1991
0
        ks_clear(tmpname);
1992
0
        if (ksprintf(tmpname, "%s.tmp_%d_%d_%u", fname, pid, n, t) < 0) break;
1993
1994
0
        fp = hopen(tmpname->s, mode);
1995
0
    } while (fp == NULL && errno == EEXIST && n < 100);
1996
1997
0
    return fp;
1998
0
}
1999
2000
int hts_is_utf16_text(const kstring_t *str)
2001
0
{
2002
0
    const unsigned char *u = (const unsigned char *) (str->s);
2003
0
    return (str->l > 0 && str->s)? is_utf16_text(u, u + str->l) : 0;
2004
0
}
2005
2006
// For VCF/BCF backward sweeper. Not exposing these functions because their
2007
// future is uncertain. Things will probably have to change with hFILE...
2008
BGZF *hts_get_bgzfp(htsFile *fp)
2009
0
{
2010
0
    if (fp->is_bgzf)
2011
0
        return fp->fp.bgzf;
2012
0
    else
2013
0
        return NULL;
2014
0
}
2015
int hts_useek(htsFile *fp, off_t uoffset, int where)
2016
0
{
2017
0
    if (fp->is_bgzf)
2018
0
        return bgzf_useek(fp->fp.bgzf, uoffset, where);
2019
0
    else
2020
0
        return (hseek(fp->fp.hfile, uoffset, SEEK_SET) >= 0)? 0 : -1;
2021
0
}
2022
off_t hts_utell(htsFile *fp)
2023
0
{
2024
0
    if (fp->is_bgzf)
2025
0
        return bgzf_utell(fp->fp.bgzf);
2026
0
    else
2027
0
        return htell(fp->fp.hfile);
2028
0
}
2029
2030
int hts_getline(htsFile *fp, int delimiter, kstring_t *str)
2031
97.7M
{
2032
97.7M
    int ret;
2033
97.7M
    if (! (delimiter == KS_SEP_LINE || delimiter == '\n')) {
2034
0
        hts_log_error("Unexpected delimiter %d", delimiter);
2035
0
        abort();
2036
0
    }
2037
2038
97.7M
    switch (fp->format.compression) {
2039
1.89M
    case no_compression:
2040
1.89M
        str->l = 0;
2041
1.89M
        ret = kgetline2(str, (kgets_func2 *) hgetln, fp->fp.hfile);
2042
1.89M
        if (ret >= 0) ret = (str->l <= INT_MAX)? (int) str->l : INT_MAX;
2043
13.5k
        else if (herrno(fp->fp.hfile)) ret = -2, errno = herrno(fp->fp.hfile);
2044
13.5k
        else ret = -1;
2045
1.89M
        break;
2046
2047
95.8M
    case gzip:
2048
95.8M
    case bgzf:
2049
95.8M
        ret = bgzf_getline(fp->fp.bgzf, '\n', str);
2050
95.8M
        break;
2051
2052
0
    default:
2053
0
        abort();
2054
97.7M
    }
2055
2056
97.7M
    ++fp->lineno;
2057
97.7M
    return ret;
2058
97.7M
}
2059
2060
char **hts_readlist(const char *string, int is_file, int *_n)
2061
0
{
2062
0
    unsigned int m = 0, n = 0;
2063
0
    char **s = 0, **s_new;
2064
0
    if ( is_file )
2065
0
    {
2066
0
        BGZF *fp = bgzf_open(string, "r");
2067
0
        if ( !fp ) return NULL;
2068
2069
0
        kstring_t str;
2070
0
        int ret;
2071
0
        str.s = 0; str.l = str.m = 0;
2072
0
        while ((ret = bgzf_getline(fp, '\n', &str)) >= 0)
2073
0
        {
2074
0
            if (str.l == 0) continue;
2075
0
            if (n == 0 && hts_is_utf16_text(&str))
2076
0
                hts_log_warning("'%s' appears to be encoded as UTF-16", string);
2077
0
            if (hts_resize(char*, n + 1, &m, &s, 0) < 0)
2078
0
                goto err;
2079
0
            s[n] = strdup(str.s);
2080
0
            if (!s[n])
2081
0
                goto err;
2082
0
            n++;
2083
0
        }
2084
0
        if (ret < -1) // Read error
2085
0
            goto err;
2086
0
        bgzf_close(fp);
2087
0
        free(str.s);
2088
0
    }
2089
0
    else
2090
0
    {
2091
0
        const char *q = string, *p = string;
2092
0
        while ( 1 )
2093
0
        {
2094
0
            if (*p == ',' || *p == 0)
2095
0
            {
2096
0
                if (hts_resize(char*, n + 1, &m, &s, 0) < 0)
2097
0
                    goto err;
2098
0
                s[n] = (char*)calloc(p - q + 1, 1);
2099
0
                if (!s[n])
2100
0
                    goto err;
2101
0
                strncpy(s[n++], q, p - q);
2102
0
                q = p + 1;
2103
0
            }
2104
0
            if ( !*p ) break;
2105
0
            p++;
2106
0
        }
2107
0
    }
2108
    // Try to shrink s to the minimum size needed
2109
0
    s_new = (char**)realloc(s, n * sizeof(char*));
2110
0
    if (!s_new)
2111
0
        goto err;
2112
2113
0
    s = s_new;
2114
0
    assert(n < INT_MAX); // hts_resize() should ensure this
2115
0
    *_n = n;
2116
0
    return s;
2117
2118
0
 err:
2119
0
    for (m = 0; m < n; m++)
2120
0
        free(s[m]);
2121
0
    free(s);
2122
0
    return NULL;
2123
0
}
2124
2125
char **hts_readlines(const char *fn, int *_n)
2126
0
{
2127
0
    unsigned int m = 0, n = 0;
2128
0
    char **s = 0, **s_new;
2129
0
    BGZF *fp = bgzf_open(fn, "r");
2130
0
    if ( fp ) { // read from file
2131
0
        kstring_t str;
2132
0
        int ret;
2133
0
        str.s = 0; str.l = str.m = 0;
2134
0
        while ((ret = bgzf_getline(fp, '\n', &str)) >= 0) {
2135
0
            if (str.l == 0) continue;
2136
0
            if (n == 0 && hts_is_utf16_text(&str))
2137
0
                hts_log_warning("'%s' appears to be encoded as UTF-16", fn);
2138
0
            if (hts_resize(char *, n + 1, &m, &s, 0) < 0)
2139
0
                goto err;
2140
0
            s[n] = strdup(str.s);
2141
0
            if (!s[n])
2142
0
                goto err;
2143
0
            n++;
2144
0
        }
2145
0
        if (ret < -1) // Read error
2146
0
            goto err;
2147
0
        bgzf_close(fp);
2148
0
        free(str.s);
2149
0
    } else if (*fn == ':') { // read from string
2150
0
        const char *q, *p;
2151
0
        for (q = p = fn + 1;; ++p)
2152
0
            if (*p == ',' || *p == 0) {
2153
0
                if (hts_resize(char *, n + 1, &m, &s, 0) < 0)
2154
0
                    goto err;
2155
0
                s[n] = (char*)calloc(p - q + 1, 1);
2156
0
                if (!s[n])
2157
0
                    goto err;
2158
0
                strncpy(s[n++], q, p - q);
2159
0
                q = p + 1;
2160
0
                if (*p == 0) break;
2161
0
            }
2162
0
    } else return 0;
2163
    // Try to shrink s to the minimum size needed
2164
0
    s_new = (char**)realloc(s, n * sizeof(char*));
2165
0
    if (!s_new)
2166
0
        goto err;
2167
2168
0
    s = s_new;
2169
0
    assert(n < INT_MAX); // hts_resize() should ensure this
2170
0
    *_n = n;
2171
0
    return s;
2172
2173
0
 err:
2174
0
    for (m = 0; m < n; m++)
2175
0
        free(s[m]);
2176
0
    free(s);
2177
0
    return NULL;
2178
0
}
2179
2180
// DEPRECATED: To be removed in a future HTSlib release
2181
int hts_file_type(const char *fname)
2182
0
{
2183
0
    int len = strlen(fname);
2184
0
    if ( !strcasecmp(".vcf.gz",fname+len-7) ) return FT_VCF_GZ;
2185
0
    if ( !strcasecmp(".vcf",fname+len-4) ) return FT_VCF;
2186
0
    if ( !strcasecmp(".bcf",fname+len-4) ) return FT_BCF_GZ;
2187
0
    if ( !strcmp("-",fname) ) return FT_STDIN;
2188
2189
0
    hFILE *f = hopen(fname, "r");
2190
0
    if (f == NULL) return 0;
2191
2192
0
    htsFormat fmt;
2193
0
    if (hts_detect_format2(f, fname, &fmt) < 0) { hclose_abruptly(f); return 0; }
2194
0
    if (hclose(f) < 0) return 0;
2195
2196
0
    switch (fmt.format) {
2197
0
    case vcf: return (fmt.compression == no_compression)? FT_VCF : FT_VCF_GZ;
2198
0
    case bcf: return (fmt.compression == no_compression)? FT_BCF : FT_BCF_GZ;
2199
0
    default:  return 0;
2200
0
    }
2201
0
}
2202
2203
int hts_check_EOF(htsFile *fp)
2204
0
{
2205
0
    if (fp->format.compression == bgzf)
2206
0
        return bgzf_check_EOF(hts_get_bgzfp(fp));
2207
0
    else if (fp->format.format == cram)
2208
0
        return cram_check_EOF(fp->fp.cram);
2209
0
    else
2210
0
        return 3;
2211
0
}
2212
2213
2214
/****************
2215
 *** Indexing ***
2216
 ****************/
2217
2218
0
#define HTS_MIN_MARKER_DIST 0x10000
2219
2220
// Finds the special meta bin
2221
//  ((1<<(3 * n_lvls + 3)) - 1) / 7 + 1
2222
0
#define META_BIN(idx) ((idx)->n_bins + 1)
2223
2224
0
#define pair64_lt(a,b) ((a).u < (b).u)
2225
0
#define pair64max_lt(a,b) ((a).u < (b).u || \
2226
0
                           ((a).u == (b).u && (a).max < (b).max))
2227
2228
0
KSORT_INIT_STATIC(_off, hts_pair64_t, pair64_lt)
2229
0
KSORT_INIT_STATIC(_off_max, hts_pair64_max_t, pair64max_lt)
2230
2231
typedef struct {
2232
    int32_t m, n;
2233
    uint64_t loff;
2234
    hts_pair64_t *list;
2235
} bins_t;
2236
2237
KHASH_MAP_INIT_INT(bin, bins_t)
2238
typedef khash_t(bin) bidx_t;
2239
2240
typedef struct {
2241
    hts_pos_t n, m;
2242
    uint64_t *offset;
2243
} lidx_t;
2244
2245
struct hts_idx_t {
2246
    int fmt, min_shift, n_lvls, n_bins;
2247
    uint32_t l_meta;
2248
    int32_t n, m;
2249
    uint64_t n_no_coor;
2250
    bidx_t **bidx;
2251
    lidx_t *lidx;
2252
    uint8_t *meta; // MUST have a terminating NUL on the end
2253
    int tbi_n, last_tbi_tid;
2254
    struct {
2255
        uint32_t last_bin, save_bin;
2256
        hts_pos_t last_coor;
2257
        int last_tid, save_tid, finished;
2258
        uint64_t last_off, save_off;
2259
        uint64_t off_beg, off_end;
2260
        uint64_t n_mapped, n_unmapped;
2261
    } z; // keep internal states
2262
    BGZF *otf_fp;  // Index on-the-fly output file
2263
};
2264
2265
0
static char * idx_format_name(int fmt) {
2266
0
    switch (fmt) {
2267
0
        case HTS_FMT_CSI: return "csi";
2268
0
        case HTS_FMT_BAI: return "bai";
2269
0
        case HTS_FMT_TBI: return "tbi";
2270
0
        case HTS_FMT_CRAI: return "crai";
2271
0
        default: return "unknown";
2272
0
    }
2273
0
}
2274
2275
#ifdef DEBUG_INDEX
2276
static void idx_dump(const hts_idx_t *idx) {
2277
    int i;
2278
    int64_t j;
2279
2280
    if (!idx) fprintf(stderr, "Null index\n");
2281
2282
    fprintf(stderr, "format='%s', min_shift=%d, n_lvls=%d, n_bins=%d, l_meta=%u ",
2283
            idx_format_name(idx->fmt), idx->min_shift, idx->n_lvls, idx->n_bins, idx->l_meta);
2284
    fprintf(stderr, "n=%d, m=%d, n_no_coor=%"PRIu64"\n", idx->n, idx->m, idx->n_no_coor);
2285
    for (i = 0; i < idx->n; i++) {
2286
        bidx_t *bidx = idx->bidx[i];
2287
        lidx_t *lidx = &idx->lidx[i];
2288
        if (bidx) {
2289
            fprintf(stderr, "======== BIN Index - tid=%d, n_buckets=%d, size=%d\n", i, bidx->n_buckets, bidx->size);
2290
            int b;
2291
            for (b = 0; b < META_BIN(idx); b++) {
2292
                khint_t k;
2293
                if ((k = kh_get(bin, bidx, b)) != kh_end(bidx)) {
2294
                    bins_t *entries = &kh_value(bidx, k);
2295
                    int l = hts_bin_level(b);
2296
                    int64_t bin_width = 1LL << ((idx->n_lvls - l) * 3 + idx->min_shift);
2297
                    fprintf(stderr, "\tbin=%d, level=%d, parent=%d, n_chunks=%d, loff=%"PRIu64", interval=[%"PRId64" - %"PRId64"]\n",
2298
                        b, l, hts_bin_parent(b), entries->n, entries->loff, (b-hts_bin_first(l))*bin_width+1, (b+1-hts_bin_first(l))*bin_width);
2299
                    for (j = 0; j < entries->n; j++)
2300
                        fprintf(stderr, "\t\tchunk=%"PRId64", u=%"PRIu64", v=%"PRIu64"\n", j, entries->list[j].u, entries->list[j].v);
2301
                }
2302
            }
2303
        }
2304
        if (lidx) {
2305
            fprintf(stderr, "======== LINEAR Index - tid=%d, n_values=%"PRId64"\n", i, lidx->n);
2306
            for (j = 0; j < lidx->n; j++) {
2307
                fprintf(stderr, "\t\tentry=%"PRId64", offset=%"PRIu64", interval=[%"PRId64" - %"PRId64"]\n",
2308
                    j, lidx->offset[j], j*(1<<idx->min_shift)+1, (j+1)*(1<<idx->min_shift));
2309
            }
2310
        }
2311
    }
2312
}
2313
#endif
2314
2315
static inline int insert_to_b(bidx_t *b, int bin, uint64_t beg, uint64_t end)
2316
0
{
2317
0
    khint_t k;
2318
0
    bins_t *l;
2319
0
    int absent;
2320
0
    k = kh_put(bin, b, bin, &absent);
2321
0
    if (absent < 0) return -1; // Out of memory
2322
0
    l = &kh_value(b, k);
2323
0
    if (absent) {
2324
0
        l->m = 1; l->n = 0;
2325
0
        l->list = (hts_pair64_t*)calloc(l->m, sizeof(hts_pair64_t));
2326
0
        if (!l->list) {
2327
0
            kh_del(bin, b, k);
2328
0
            return -1;
2329
0
        }
2330
0
    } else if (l->n == l->m) {
2331
0
        uint32_t new_m = l->m ? l->m << 1 : 1;
2332
0
        hts_pair64_t *new_list = realloc(l->list, new_m * sizeof(hts_pair64_t));
2333
0
        if (!new_list) return -1;
2334
0
        l->list = new_list;
2335
0
        l->m = new_m;
2336
0
    }
2337
0
    l->list[l->n].u = beg;
2338
0
    l->list[l->n++].v = end;
2339
0
    return 0;
2340
0
}
2341
2342
static inline int insert_to_l(lidx_t *l, int64_t _beg, int64_t _end, uint64_t offset, int min_shift)
2343
0
{
2344
0
    int i;
2345
0
    hts_pos_t beg, end;
2346
0
    beg = _beg >> min_shift;
2347
0
    end = (_end - 1) >> min_shift;
2348
0
    if (l->m < end + 1) {
2349
0
        size_t new_m = l->m * 2 > end + 1 ? l->m * 2 : end + 1;
2350
0
        uint64_t *new_offset;
2351
2352
0
        new_offset = (uint64_t*)realloc(l->offset, new_m * sizeof(uint64_t));
2353
0
        if (!new_offset) return -1;
2354
2355
        // fill unused memory with (uint64_t)-1
2356
0
        memset(new_offset + l->m, 0xff, sizeof(uint64_t) * (new_m - l->m));
2357
0
        l->m = new_m;
2358
0
        l->offset = new_offset;
2359
0
    }
2360
0
    for (i = beg; i <= end; ++i) {
2361
0
        if (l->offset[i] == (uint64_t)-1) l->offset[i] = offset;
2362
0
    }
2363
0
    if (l->n < end + 1) l->n = end + 1;
2364
0
    return 0;
2365
0
}
2366
2367
hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls)
2368
0
{
2369
0
    hts_idx_t *idx;
2370
0
    idx = (hts_idx_t*)calloc(1, sizeof(hts_idx_t));
2371
0
    if (idx == NULL) return NULL;
2372
0
    idx->fmt = fmt;
2373
0
    idx->min_shift = min_shift;
2374
0
    idx->n_lvls = n_lvls;
2375
0
    idx->n_bins = ((1<<(3 * n_lvls + 3)) - 1) / 7;
2376
0
    idx->z.save_tid = idx->z.last_tid = -1;
2377
0
    idx->z.save_bin = idx->z.last_bin = 0xffffffffu;
2378
0
    idx->z.save_off = idx->z.last_off = idx->z.off_beg = idx->z.off_end = offset0;
2379
0
    idx->z.last_coor = 0xffffffffu;
2380
0
    if (n) {
2381
0
        idx->n = idx->m = n;
2382
0
        idx->bidx = (bidx_t**)calloc(n, sizeof(bidx_t*));
2383
0
        if (idx->bidx == NULL) { free(idx); return NULL; }
2384
0
        idx->lidx = (lidx_t*) calloc(n, sizeof(lidx_t));
2385
0
        if (idx->lidx == NULL) { free(idx->bidx); free(idx); return NULL; }
2386
0
    }
2387
0
    idx->tbi_n = -1;
2388
0
    idx->last_tbi_tid = -1;
2389
0
    idx->otf_fp = NULL;
2390
0
    return idx;
2391
0
}
2392
2393
static void update_loff(hts_idx_t *idx, int i, int free_lidx)
2394
0
{
2395
0
    bidx_t *bidx = idx->bidx[i];
2396
0
    lidx_t *lidx = &idx->lidx[i];
2397
0
    khint_t k;
2398
0
    int l;
2399
    // the last entry is always valid
2400
0
    for (l=lidx->n-2; l >= 0; l--) {
2401
0
        if (lidx->offset[l] == (uint64_t)-1)
2402
0
            lidx->offset[l] = lidx->offset[l+1];
2403
0
    }
2404
0
    if (bidx == 0) return;
2405
0
    for (k = kh_begin(bidx); k != kh_end(bidx); ++k) // set loff
2406
0
        if (kh_exist(bidx, k))
2407
0
        {
2408
0
            if ( kh_key(bidx, k) < idx->n_bins )
2409
0
            {
2410
0
                int bot_bin = hts_bin_bot(kh_key(bidx, k), idx->n_lvls);
2411
                // disable linear index if bot_bin out of bounds
2412
0
                kh_val(bidx, k).loff = bot_bin < lidx->n ? lidx->offset[bot_bin] : 0;
2413
0
            }
2414
0
            else
2415
0
                kh_val(bidx, k).loff = 0;
2416
0
        }
2417
0
    if (free_lidx) {
2418
0
        free(lidx->offset);
2419
0
        lidx->m = lidx->n = 0;
2420
0
        lidx->offset = 0;
2421
0
    }
2422
0
}
2423
2424
static int compress_binning(hts_idx_t *idx, int i)
2425
0
{
2426
0
    bidx_t *bidx = idx->bidx[i];
2427
0
    khint_t k;
2428
0
    int l, m;
2429
0
    if (bidx == 0) return 0;
2430
    // merge a bin to its parent if the bin is too small
2431
0
    for (l = idx->n_lvls; l > 0; --l) {
2432
0
        unsigned start = hts_bin_first(l);
2433
0
        for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
2434
0
            bins_t *p, *q;
2435
0
            if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins || kh_key(bidx, k) < start) continue;
2436
0
            p = &kh_value(bidx, k);
2437
0
            if (l < idx->n_lvls && p->n > 1) ks_introsort(_off, p->n, p->list);
2438
0
            if ((p->list[p->n - 1].v>>16) - (p->list[0].u>>16) < HTS_MIN_MARKER_DIST) {
2439
0
                khint_t kp;
2440
0
                kp = kh_get(bin, bidx, hts_bin_parent(kh_key(bidx, k)));
2441
0
                if (kp == kh_end(bidx)) continue;
2442
0
                q = &kh_val(bidx, kp);
2443
0
                if (q->n + p->n > q->m) {
2444
0
                    uint32_t new_m = q->n + p->n;
2445
0
                    hts_pair64_t *new_list;
2446
0
                    kroundup32(new_m);
2447
0
                    if (new_m > INT32_MAX) return -1; // Limited by index format
2448
0
                    new_list = realloc(q->list, new_m * sizeof(*new_list));
2449
0
                    if (!new_list) return -1;
2450
0
                    q->m = new_m;
2451
0
                    q->list = new_list;
2452
0
                }
2453
0
                memcpy(q->list + q->n, p->list, p->n * sizeof(hts_pair64_t));
2454
0
                q->n += p->n;
2455
0
                free(p->list);
2456
0
                kh_del(bin, bidx, k);
2457
0
            }
2458
0
        }
2459
0
    }
2460
0
    k = kh_get(bin, bidx, 0);
2461
0
    if (k != kh_end(bidx)) ks_introsort(_off, kh_val(bidx, k).n, kh_val(bidx, k).list);
2462
    // merge adjacent chunks that start from the same BGZF block
2463
0
    for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
2464
0
        bins_t *p;
2465
0
        if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins) continue;
2466
0
        p = &kh_value(bidx, k);
2467
0
        for (l = 1, m = 0; l < p->n; ++l) {
2468
0
            if (p->list[m].v>>16 >= p->list[l].u>>16) {
2469
0
                if (p->list[m].v < p->list[l].v) p->list[m].v = p->list[l].v;
2470
0
            } else p->list[++m] = p->list[l];
2471
0
        }
2472
0
        p->n = m + 1;
2473
0
    }
2474
0
    return 0;
2475
0
}
2476
2477
int hts_idx_finish(hts_idx_t *idx, uint64_t final_offset)
2478
0
{
2479
0
    int i, ret = 0;
2480
0
    if (idx == NULL || idx->z.finished) return 0; // do not run this function on an empty index or multiple times
2481
0
    if (idx->z.save_tid >= 0) {
2482
0
        ret |= insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin, idx->z.save_off, final_offset);
2483
0
        ret |= insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.off_beg, final_offset);
2484
0
        ret |= insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.n_mapped, idx->z.n_unmapped);
2485
0
    }
2486
0
    for (i = 0; i < idx->n; ++i) {
2487
0
        update_loff(idx, i, (idx->fmt == HTS_FMT_CSI));
2488
0
        ret |= compress_binning(idx, i);
2489
0
    }
2490
0
    idx->z.finished = 1;
2491
2492
0
    return ret;
2493
0
}
2494
2495
static inline hts_pos_t hts_idx_maxpos(const hts_idx_t *idx)
2496
0
{
2497
0
    return hts_bin_maxpos(idx->min_shift, idx->n_lvls);
2498
0
}
2499
2500
int hts_idx_check_range(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end)
2501
0
{
2502
0
    hts_pos_t maxpos = hts_idx_maxpos(idx);
2503
0
    if (tid < 0 || (beg <= maxpos && end <= maxpos))
2504
0
        return 0;
2505
2506
0
    if (idx->fmt == HTS_FMT_CSI) {
2507
0
        hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos" "
2508
0
                      "cannot be stored in a csi index with these parameters. "
2509
0
                      "Please use a larger min_shift or depth",
2510
0
                      beg, end);
2511
0
    } else {
2512
0
        hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos
2513
0
                      " cannot be stored in a %s index. Try using a csi index",
2514
0
                      beg, end, idx_format_name(idx->fmt));
2515
0
    }
2516
0
    errno = ERANGE;
2517
0
    return -1;
2518
0
}
2519
2520
int hts_idx_push(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, uint64_t offset, int is_mapped)
2521
0
{
2522
0
    int bin;
2523
0
    if (tid<0) beg = -1, end = 0;
2524
0
    if (hts_idx_check_range(idx, tid, beg, end) < 0)
2525
0
        return -1;
2526
0
    if (tid >= idx->m) { // enlarge the index
2527
0
        uint32_t new_m = idx->m * 2 > tid + 1 ? idx->m * 2 : tid + 1;
2528
0
        bidx_t **new_bidx;
2529
0
        lidx_t *new_lidx;
2530
2531
0
        new_bidx = (bidx_t**)realloc(idx->bidx, new_m * sizeof(bidx_t*));
2532
0
        if (!new_bidx) return -1;
2533
0
        idx->bidx = new_bidx;
2534
2535
0
        new_lidx = (lidx_t*) realloc(idx->lidx, new_m * sizeof(lidx_t));
2536
0
        if (!new_lidx) return -1;
2537
0
        idx->lidx = new_lidx;
2538
2539
0
        memset(&idx->bidx[idx->m], 0, (new_m - idx->m) * sizeof(bidx_t*));
2540
0
        memset(&idx->lidx[idx->m], 0, (new_m - idx->m) * sizeof(lidx_t));
2541
0
        idx->m = new_m;
2542
0
    }
2543
0
    if (idx->n < tid + 1) idx->n = tid + 1;
2544
0
    if (idx->z.finished) return 0;
2545
0
    if (idx->z.last_tid != tid || (idx->z.last_tid >= 0 && tid < 0)) { // change of chromosome
2546
0
        if ( tid>=0 && idx->n_no_coor )
2547
0
        {
2548
0
            hts_log_error("NO_COOR reads not in a single block at the end %d %d", tid, idx->z.last_tid);
2549
0
            return -1;
2550
0
        }
2551
0
        if (tid>=0 && idx->bidx[tid] != 0)
2552
0
        {
2553
0
            hts_log_error("Chromosome blocks not continuous");
2554
0
            return -1;
2555
0
        }
2556
0
        idx->z.last_tid = tid;
2557
0
        idx->z.last_bin = 0xffffffffu;
2558
0
    } else if (tid >= 0 && idx->z.last_coor > beg) { // test if positions are out of order
2559
0
        hts_log_error("Unsorted positions on sequence #%d: %"PRIhts_pos" followed by %"PRIhts_pos, tid+1, idx->z.last_coor+1, beg+1);
2560
0
        return -1;
2561
0
    }
2562
0
    if (end < beg) {
2563
        // Malformed ranges are errors. (Empty ranges (beg==end) are unusual but acceptable.)
2564
0
        hts_log_error("Invalid record on sequence #%d: end %"PRId64" < begin %"PRId64, tid+1, end, beg+1);
2565
0
        return -1;
2566
0
    }
2567
0
    if ( tid>=0 )
2568
0
    {
2569
0
        if (idx->bidx[tid] == 0) idx->bidx[tid] = kh_init(bin);
2570
        // shoehorn [-1,0) (VCF POS=0) into the leftmost bottom-level bin
2571
0
        if (beg < 0)  beg = 0;
2572
0
        if (end <= 0) end = 1;
2573
        // idx->z.last_off points to the start of the current record
2574
0
        if (insert_to_l(&idx->lidx[tid], beg, end,
2575
0
                        idx->z.last_off, idx->min_shift) < 0) return -1;
2576
0
    }
2577
0
    else idx->n_no_coor++;
2578
0
    bin = hts_reg2bin(beg, end, idx->min_shift, idx->n_lvls);
2579
0
    if ((int)idx->z.last_bin != bin) { // then possibly write the binning index
2580
0
        if (idx->z.save_bin != 0xffffffffu) { // save_bin==0xffffffffu only happens to the first record
2581
0
            if (insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin,
2582
0
                            idx->z.save_off, idx->z.last_off) < 0) return -1;
2583
0
        }
2584
0
        if (idx->z.last_bin == 0xffffffffu && idx->z.save_bin != 0xffffffffu) { // change of chr; keep meta information
2585
0
            idx->z.off_end = idx->z.last_off;
2586
0
            if (insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx),
2587
0
                            idx->z.off_beg, idx->z.off_end) < 0) return -1;
2588
0
            if (insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx),
2589
0
                            idx->z.n_mapped, idx->z.n_unmapped) < 0) return -1;
2590
0
            idx->z.n_mapped = idx->z.n_unmapped = 0;
2591
0
            idx->z.off_beg = idx->z.off_end;
2592
0
        }
2593
0
        idx->z.save_off = idx->z.last_off;
2594
0
        idx->z.save_bin = idx->z.last_bin = bin;
2595
0
        idx->z.save_tid = tid;
2596
0
    }
2597
0
    if (is_mapped) ++idx->z.n_mapped;
2598
0
    else ++idx->z.n_unmapped;
2599
0
    idx->z.last_off = offset;
2600
0
    idx->z.last_coor = beg;
2601
0
    return 0;
2602
0
}
2603
2604
// Needed for TBI only.  Ensure 'tid' with 'name' is in the index meta data.
2605
// idx->meta needs to have been initialised first with an appropriate Tabix
2606
// configuration via hts_idx_set_meta.
2607
//
2608
// NB number of references (first 4 bytes of tabix header) aren't in
2609
// idx->meta, but held in idx->n instead.
2610
0
int hts_idx_tbi_name(hts_idx_t *idx, int tid, const char *name) {
2611
    // Horrid - we have to map incoming tid to a tbi alternative tid.
2612
    // This is because TBI counts tids by "covered" refs while everything
2613
    // else counts by Nth SQ/contig record in header.
2614
0
    if (tid == idx->last_tbi_tid || tid < 0 || !name)
2615
0
        return idx->tbi_n;
2616
2617
0
    uint32_t len = strlen(name)+1;
2618
0
    uint8_t *tmp = (uint8_t *)realloc(idx->meta, idx->l_meta + len);
2619
0
    if (!tmp)
2620
0
        return -1;
2621
2622
    // Append name
2623
0
    idx->meta = tmp;
2624
0
    strcpy((char *)idx->meta + idx->l_meta, name);
2625
0
    idx->l_meta += len;
2626
2627
    // Update seq length
2628
0
    u32_to_le(le_to_u32(idx->meta+24)+len, idx->meta+24);
2629
2630
0
    idx->last_tbi_tid = tid;
2631
0
    return ++idx->tbi_n;
2632
0
}
2633
2634
// When doing samtools index we have a read_bam / hts_idx_push(bgzf_tell())
2635
// loop.  idx->z.last_off is the previous bzgf_tell location, so we know
2636
// the location the current bam record started at as well as where it ends.
2637
//
2638
// When building an index on the fly via a write_bam / hts_idx_push loop,
2639
// this isn't quite identical as we may amend the virtual coord returned
2640
// by bgzf_tell to the start of a new block if the next bam struct doesn't
2641
// fit.  It's essentially the same thing, but for bit-identical indices
2642
// we need to amend the idx->z.last_off when we know we're starting a new
2643
// block.
2644
void hts_idx_amend_last(hts_idx_t *idx, uint64_t offset)
2645
0
{
2646
0
    idx->z.last_off = offset;
2647
0
}
2648
2649
void hts_idx_destroy(hts_idx_t *idx)
2650
93.2k
{
2651
93.2k
    khint_t k;
2652
93.2k
    int i;
2653
93.2k
    if (idx == 0) return;
2654
2655
    // For HTS_FMT_CRAI, idx actually points to a different type -- see sam.c
2656
0
    if (idx->fmt == HTS_FMT_CRAI) {
2657
0
        hts_cram_idx_t *cidx = (hts_cram_idx_t *) idx;
2658
0
        cram_index_free(cidx->cram);
2659
0
        free(cidx);
2660
0
        return;
2661
0
    }
2662
2663
0
    for (i = 0; i < idx->m; ++i) {
2664
0
        bidx_t *bidx = idx->bidx[i];
2665
0
        free(idx->lidx[i].offset);
2666
0
        if (bidx == 0) continue;
2667
0
        for (k = kh_begin(bidx); k != kh_end(bidx); ++k)
2668
0
            if (kh_exist(bidx, k))
2669
0
                free(kh_value(bidx, k).list);
2670
0
        kh_destroy(bin, bidx);
2671
0
    }
2672
0
    free(idx->bidx); free(idx->lidx); free(idx->meta);
2673
0
    free(idx);
2674
0
}
2675
2676
0
int hts_idx_fmt(hts_idx_t *idx) {
2677
0
    return idx->fmt;
2678
0
}
2679
2680
// The optimizer eliminates these ed_is_big() calls; still it would be good to
2681
// TODO Determine endianness at configure- or compile-time
2682
2683
static inline ssize_t HTS_RESULT_USED idx_write_int32(BGZF *fp, int32_t x)
2684
0
{
2685
0
    if (ed_is_big()) x = ed_swap_4(x);
2686
0
    return bgzf_write(fp, &x, sizeof x);
2687
0
}
2688
2689
static inline ssize_t HTS_RESULT_USED idx_write_uint32(BGZF *fp, uint32_t x)
2690
0
{
2691
0
    if (ed_is_big()) x = ed_swap_4(x);
2692
0
    return bgzf_write(fp, &x, sizeof x);
2693
0
}
2694
2695
static inline ssize_t HTS_RESULT_USED idx_write_uint64(BGZF *fp, uint64_t x)
2696
0
{
2697
0
    if (ed_is_big()) x = ed_swap_8(x);
2698
0
    return bgzf_write(fp, &x, sizeof x);
2699
0
}
2700
2701
static inline void swap_bins(bins_t *p)
2702
0
{
2703
0
    int i;
2704
0
    for (i = 0; i < p->n; ++i) {
2705
0
        ed_swap_8p(&p->list[i].u);
2706
0
        ed_swap_8p(&p->list[i].v);
2707
0
    }
2708
0
}
2709
2710
static int need_idx_ugly_delay_hack(const hts_idx_t *idx)
2711
0
{
2712
    // Ugly hack for on-the-fly BAI indexes.  As these are uncompressed,
2713
    // we need to delay writing a few bytes of data until file close
2714
    // so that we have something to force a modification time update.
2715
    //
2716
    // (For compressed indexes like CSI, the BGZF EOF block serves the same
2717
    // purpose).
2718
0
    return idx->otf_fp && !idx->otf_fp->is_compressed;
2719
0
}
2720
2721
static int idx_save_core(const hts_idx_t *idx, BGZF *fp, int fmt)
2722
0
{
2723
0
    int32_t i, j;
2724
2725
0
    #define check(ret) if ((ret) < 0) return -1
2726
2727
    // VCF TBI/CSI only writes IDs for non-empty bins (ie covered references)
2728
    //
2729
    // NOTE: CSI meta is undefined in spec, so this code has an assumption
2730
    // that we're only using it for Tabix data.
2731
0
    int nids = idx->n;
2732
0
    if (idx->meta && idx->l_meta >= 4 && le_to_u32(idx->meta) == TBX_VCF) {
2733
0
        for (i = nids = 0; i < idx->n; ++i) {
2734
0
            if (idx->bidx[i])
2735
0
                nids++;
2736
0
        }
2737
0
    }
2738
0
    check(idx_write_int32(fp, nids));
2739
0
    if (fmt == HTS_FMT_TBI && idx->l_meta)
2740
0
        check(bgzf_write(fp, idx->meta, idx->l_meta));
2741
2742
0
    for (i = 0; i < idx->n; ++i) {
2743
0
        khint_t k;
2744
0
        bidx_t *bidx = idx->bidx[i];
2745
0
        lidx_t *lidx = &idx->lidx[i];
2746
2747
        // write binning index
2748
0
        if (nids == idx->n || bidx)
2749
0
            check(idx_write_int32(fp, bidx? kh_size(bidx) : 0));
2750
0
        if (bidx)
2751
0
            for (k = kh_begin(bidx); k != kh_end(bidx); ++k)
2752
0
                if (kh_exist(bidx, k)) {
2753
0
                    bins_t *p = &kh_value(bidx, k);
2754
0
                    check(idx_write_uint32(fp, kh_key(bidx, k)));
2755
0
                    if (fmt == HTS_FMT_CSI) check(idx_write_uint64(fp, p->loff));
2756
                    //int j;for(j=0;j<p->n;++j)fprintf(stderr,"%d,%llx,%d,%llx:%llx\n",kh_key(bidx,k),kh_val(bidx, k).loff,j,p->list[j].u,p->list[j].v);
2757
0
                    check(idx_write_int32(fp, p->n));
2758
0
                    for (j = 0; j < p->n; ++j) {
2759
                        //fprintf(stderr, "\t%ld\t%ld\n", p->list[j].u, p->list[j].v);
2760
0
                        check(idx_write_uint64(fp, p->list[j].u));
2761
0
                        check(idx_write_uint64(fp, p->list[j].v));
2762
0
                    }
2763
0
                }
2764
2765
        // write linear index
2766
0
        if (fmt != HTS_FMT_CSI) {
2767
0
            check(idx_write_int32(fp, lidx->n));
2768
0
            for (j = 0; j < lidx->n; ++j)
2769
0
                check(idx_write_uint64(fp, lidx->offset[j]));
2770
0
        }
2771
0
    }
2772
2773
0
    if (!need_idx_ugly_delay_hack(idx)) {
2774
        // Write this for compressed (CSI) indexes, but for BAI we
2775
        // need to save a bit for later.  See hts_idx_close_otf_fp()
2776
0
        check(idx_write_uint64(fp, idx->n_no_coor));
2777
0
    }
2778
2779
#ifdef DEBUG_INDEX
2780
    idx_dump(idx);
2781
#endif
2782
2783
0
    return 0;
2784
0
    #undef check
2785
0
}
2786
2787
int hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt)
2788
0
{
2789
0
    int ret, save;
2790
0
    if (idx == NULL || fn == NULL) { errno = EINVAL; return -1; }
2791
0
    char *fnidx = (char*)calloc(1, strlen(fn) + 5);
2792
0
    if (fnidx == NULL) return -1;
2793
2794
0
    strcpy(fnidx, fn);
2795
0
    switch (fmt) {
2796
0
    case HTS_FMT_BAI: strcat(fnidx, ".bai"); break;
2797
0
    case HTS_FMT_CSI: strcat(fnidx, ".csi"); break;
2798
0
    case HTS_FMT_TBI: strcat(fnidx, ".tbi"); break;
2799
0
    default: abort();
2800
0
    }
2801
2802
0
    ret = hts_idx_save_as(idx, fn, fnidx, fmt);
2803
0
    save = errno;
2804
0
    free(fnidx);
2805
0
    errno = save;
2806
0
    return ret;
2807
0
}
2808
2809
static int hts_idx_write_out(const hts_idx_t *idx, BGZF *fp, int fmt)
2810
0
{
2811
0
    #define check(ret) if ((ret) < 0) return -1
2812
2813
0
    if (fmt == HTS_FMT_CSI) {
2814
0
        check(bgzf_write(fp, "CSI\1", 4));
2815
0
        check(idx_write_int32(fp, idx->min_shift));
2816
0
        check(idx_write_int32(fp, idx->n_lvls));
2817
0
        check(idx_write_uint32(fp, idx->l_meta));
2818
0
        if (idx->l_meta) check(bgzf_write(fp, idx->meta, idx->l_meta));
2819
0
    } else if (fmt == HTS_FMT_TBI) {
2820
0
        check(bgzf_write(fp, "TBI\1", 4));
2821
0
    } else if (fmt == HTS_FMT_BAI) {
2822
0
        check(bgzf_write(fp, "BAI\1", 4));
2823
0
    } else abort();
2824
2825
0
    check(idx_save_core(idx, fp, fmt));
2826
2827
0
    #undef check
2828
0
    return 0;
2829
0
}
2830
2831
int hts_idx_save_as(const hts_idx_t *idx, const char *fn, const char *fnidx, int fmt)
2832
0
{
2833
0
    BGZF *fp;
2834
2835
0
    if (fnidx == NULL)
2836
0
        return hts_idx_save(idx, fn, fmt);
2837
2838
0
    fp = bgzf_open(fnidx, (fmt == HTS_FMT_BAI)? "wu" : "w");
2839
0
    if (fp == NULL) return -1;
2840
2841
0
    if (hts_idx_write_out(idx, fp, fmt) < 0) {
2842
0
        int save_errno = errno;
2843
0
        bgzf_close(fp);
2844
0
        errno = save_errno;
2845
0
        return -1;
2846
0
    }
2847
2848
0
    return bgzf_close(fp);
2849
0
}
2850
2851
// idx_save for on-the-fly indexes.  Mostly duplicated from above, except
2852
// idx is not const because we want to store the file handle in it, and
2853
// the index file handle is not closed.  This allows the index file to be
2854
// closed after the EOF block on the indexed file has been written out,
2855
// so the modification times on the two files will be in the correct order.
2856
int hts_idx_save_but_not_close(hts_idx_t *idx, const char *fnidx, int fmt)
2857
0
{
2858
0
    idx->otf_fp = bgzf_open(fnidx, (fmt == HTS_FMT_BAI)? "wu" : "w");
2859
0
    if (idx->otf_fp == NULL) return -1;
2860
2861
0
    if (hts_idx_write_out(idx, idx->otf_fp, fmt) < 0) {
2862
0
        int save_errno = errno;
2863
0
        bgzf_close(idx->otf_fp);
2864
0
        idx->otf_fp = NULL;
2865
0
        errno = save_errno;
2866
0
        return -1;
2867
0
    }
2868
2869
0
    return bgzf_flush(idx->otf_fp);
2870
0
}
2871
2872
static int hts_idx_close_otf_fp(hts_idx_t *idx)
2873
0
{
2874
0
    if (idx && idx->otf_fp) {
2875
0
        int ret = 0;
2876
0
        if (need_idx_ugly_delay_hack(idx)) {
2877
            // BAI index - write out the bytes we deferred earlier
2878
0
            ret = idx_write_uint64(idx->otf_fp, idx->n_no_coor) < 0;
2879
0
        }
2880
0
        ret |= bgzf_close(idx->otf_fp) < 0;
2881
0
        idx->otf_fp = NULL;
2882
0
        return ret == 0 ? 0 : -1;
2883
0
    }
2884
0
    return 0;
2885
0
}
2886
2887
static int idx_read_core(hts_idx_t *idx, BGZF *fp, int fmt)
2888
0
{
2889
0
    int32_t i, n, is_be;
2890
0
    is_be = ed_is_big();
2891
0
    if (idx == NULL) return -4;
2892
0
    for (i = 0; i < idx->n; ++i) {
2893
0
        bidx_t *h;
2894
0
        lidx_t *l = &idx->lidx[i];
2895
0
        uint32_t key;
2896
0
        int j, absent;
2897
0
        bins_t *p;
2898
0
        h = idx->bidx[i] = kh_init(bin);
2899
0
        if (bgzf_read(fp, &n, 4) != 4) return -1;
2900
0
        if (is_be) ed_swap_4p(&n);
2901
0
        if (n < 0) return -3;
2902
0
        for (j = 0; j < n; ++j) {
2903
0
            khint_t k;
2904
0
            if (bgzf_read(fp, &key, 4) != 4) return -1;
2905
0
            if (is_be) ed_swap_4p(&key);
2906
0
            k = kh_put(bin, h, key, &absent);
2907
0
            if (absent <  0) return -2; // No memory
2908
0
            if (absent == 0) return -3; // Duplicate bin number
2909
0
            p = &kh_val(h, k);
2910
0
            if (fmt == HTS_FMT_CSI) {
2911
0
                if (bgzf_read(fp, &p->loff, 8) != 8) return -1;
2912
0
                if (is_be) ed_swap_8p(&p->loff);
2913
0
            } else p->loff = 0;
2914
0
            if (bgzf_read(fp, &p->n, 4) != 4) return -1;
2915
0
            if (is_be) ed_swap_4p(&p->n);
2916
0
            if (p->n < 0) return -3;
2917
0
            if ((size_t) p->n > SIZE_MAX / sizeof(hts_pair64_t)) return -2;
2918
0
            p->m = p->n;
2919
0
            p->list = (hts_pair64_t*)malloc(p->m * sizeof(hts_pair64_t));
2920
0
            if (p->list == NULL) return -2;
2921
0
            if (bgzf_read(fp, p->list, ((size_t) p->n)<<4) != ((size_t) p->n)<<4) return -1;
2922
0
            if (is_be) swap_bins(p);
2923
0
        }
2924
0
        if (fmt != HTS_FMT_CSI) { // load linear index
2925
0
            int j, k;
2926
0
            uint32_t x;
2927
0
            if (bgzf_read(fp, &x, 4) != 4) return -1;
2928
0
            if (is_be) ed_swap_4p(&x);
2929
0
            l->n = x;
2930
0
            if (l->n < 0) return -3;
2931
0
            if ((size_t) l->n > SIZE_MAX / sizeof(uint64_t)) return -2;
2932
0
            l->m = l->n;
2933
0
            l->offset = (uint64_t*)malloc(l->n * sizeof(uint64_t));
2934
0
            if (l->offset == NULL) return -2;
2935
0
            if (bgzf_read(fp, l->offset, l->n << 3) != l->n << 3) return -1;
2936
0
            if (is_be) for (j = 0; j < l->n; ++j) ed_swap_8p(&l->offset[j]);
2937
0
            for (k = j = 0; j < l->n && l->offset[j] == 0; k = ++j); // stop at the first non-zero entry
2938
0
            for (j = l->n-1; j > k; j--) // fill missing values; may happen given older samtools and tabix
2939
0
                if (l->offset[j-1] == 0) l->offset[j-1] = l->offset[j];
2940
0
            update_loff(idx, i, 0);
2941
0
        }
2942
0
    }
2943
0
    if (bgzf_read(fp, &idx->n_no_coor, 8) != 8) idx->n_no_coor = 0;
2944
0
    if (is_be) ed_swap_8p(&idx->n_no_coor);
2945
#ifdef DEBUG_INDEX
2946
    idx_dump(idx);
2947
#endif
2948
2949
0
    return 0;
2950
0
}
2951
2952
static hts_idx_t *idx_read(const char *fn)
2953
0
{
2954
0
    uint8_t magic[4];
2955
0
    int i, is_be;
2956
0
    hts_idx_t *idx = NULL;
2957
0
    uint8_t *meta = NULL;
2958
0
    BGZF *fp = bgzf_open(fn, "r");
2959
0
    if (fp == NULL) return NULL;
2960
0
    is_be = ed_is_big();
2961
0
    if (bgzf_read(fp, magic, 4) != 4) goto fail;
2962
2963
0
    if (memcmp(magic, "CSI\1", 4) == 0) {
2964
0
        uint32_t x[3], n;
2965
0
        if (bgzf_read(fp, x, 12) != 12) goto fail;
2966
0
        if (is_be) for (i = 0; i < 3; ++i) ed_swap_4p(&x[i]);
2967
0
        if (x[2]) {
2968
0
            if (SIZE_MAX - x[2] < 1) goto fail; // Prevent possible overflow
2969
0
            if ((meta = (uint8_t*)malloc((size_t) x[2] + 1)) == NULL) goto fail;
2970
0
            if (bgzf_read(fp, meta, x[2]) != x[2]) goto fail;
2971
            // Prevent possible strlen past the end in tbx_index_load2
2972
0
            meta[x[2]] = '\0';
2973
0
        }
2974
0
        if (bgzf_read(fp, &n, 4) != 4) goto fail;
2975
0
        if (is_be) ed_swap_4p(&n);
2976
0
        if (n > INT32_MAX) goto fail;
2977
0
        if ((idx = hts_idx_init(n, HTS_FMT_CSI, 0, x[0], x[1])) == NULL) goto fail;
2978
0
        idx->l_meta = x[2];
2979
0
        idx->meta = meta;
2980
0
        meta = NULL;
2981
0
        if (idx_read_core(idx, fp, HTS_FMT_CSI) < 0) goto fail;
2982
0
    }
2983
0
    else if (memcmp(magic, "TBI\1", 4) == 0) {
2984
0
        uint8_t x[8 * 4];
2985
0
        uint32_t n;
2986
        // Read file header
2987
0
        if (bgzf_read(fp, x, sizeof(x)) != sizeof(x)) goto fail;
2988
0
        n = le_to_u32(&x[0]); // location of n_ref
2989
0
        if (n > INT32_MAX) goto fail;
2990
0
        if ((idx = hts_idx_init(n, HTS_FMT_TBI, 0, 14, 5)) == NULL) goto fail;
2991
0
        n = le_to_u32(&x[7*4]); // location of l_nm
2992
0
        if (n > UINT32_MAX - 29) goto fail; // Prevent possible overflow
2993
0
        idx->l_meta = 28 + n;
2994
0
        if ((idx->meta = (uint8_t*)malloc(idx->l_meta + 1)) == NULL) goto fail;
2995
        // copy format, col_seq, col_beg, col_end, meta, skip, l_nm
2996
        // N.B. left in little-endian byte order.
2997
0
        memcpy(idx->meta, &x[1*4], 28);
2998
        // Read in sequence names.
2999
0
        if (bgzf_read(fp, idx->meta + 28, n) != n) goto fail;
3000
        // Prevent possible strlen past the end in tbx_index_load2
3001
0
        idx->meta[idx->l_meta] = '\0';
3002
0
        if (idx_read_core(idx, fp, HTS_FMT_TBI) < 0) goto fail;
3003
0
    }
3004
0
    else if (memcmp(magic, "BAI\1", 4) == 0) {
3005
0
        uint32_t n;
3006
0
        if (bgzf_read(fp, &n, 4) != 4) goto fail;
3007
0
        if (is_be) ed_swap_4p(&n);
3008
0
        if (n > INT32_MAX) goto fail;
3009
0
        if ((idx = hts_idx_init(n, HTS_FMT_BAI, 0, 14, 5)) == NULL) goto fail;
3010
0
        if (idx_read_core(idx, fp, HTS_FMT_BAI) < 0) goto fail;
3011
0
    }
3012
0
    else { errno = EINVAL; goto fail; }
3013
3014
0
    bgzf_close(fp);
3015
0
    return idx;
3016
3017
0
fail:
3018
0
    bgzf_close(fp);
3019
0
    hts_idx_destroy(idx);
3020
0
    free(meta);
3021
0
    return NULL;
3022
0
}
3023
3024
int hts_idx_set_meta(hts_idx_t *idx, uint32_t l_meta, uint8_t *meta,
3025
                      int is_copy)
3026
0
{
3027
0
    uint8_t *new_meta = meta;
3028
0
    if (is_copy) {
3029
0
        size_t l = l_meta;
3030
0
        if (l > SIZE_MAX - 1) {
3031
0
            errno = ENOMEM;
3032
0
            return -1;
3033
0
        }
3034
0
        new_meta = malloc(l + 1);
3035
0
        if (!new_meta) return -1;
3036
0
        memcpy(new_meta, meta, l);
3037
        // Prevent possible strlen past the end in tbx_index_load2
3038
0
        new_meta[l] = '\0';
3039
0
    }
3040
0
    if (idx->meta) free(idx->meta);
3041
0
    idx->l_meta = l_meta;
3042
0
    idx->meta = new_meta;
3043
0
    return 0;
3044
0
}
3045
3046
uint8_t *hts_idx_get_meta(hts_idx_t *idx, uint32_t *l_meta)
3047
0
{
3048
0
    *l_meta = idx->l_meta;
3049
0
    return idx->meta;
3050
0
}
3051
3052
const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr)
3053
0
{
3054
0
    if ( !idx || !idx->n )
3055
0
    {
3056
0
        *n = 0;
3057
0
        return NULL;
3058
0
    }
3059
3060
0
    int tid = 0, i;
3061
0
    const char **names = (const char**) calloc(idx->n,sizeof(const char*));
3062
0
    for (i=0; i<idx->n; i++)
3063
0
    {
3064
0
        bidx_t *bidx = idx->bidx[i];
3065
0
        if ( !bidx ) continue;
3066
0
        names[tid++] = getid(hdr,i);
3067
0
    }
3068
0
    *n = tid;
3069
0
    return names;
3070
0
}
3071
3072
0
int hts_idx_nseq(const hts_idx_t *idx) {
3073
0
    if (!idx) return -1;
3074
0
    return idx->n;
3075
0
}
3076
3077
int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped, uint64_t* unmapped)
3078
0
{
3079
0
    if (!idx) return -1;
3080
0
    if ( idx->fmt == HTS_FMT_CRAI ) {
3081
0
        *mapped = 0; *unmapped = 0;
3082
0
        return -1;
3083
0
    }
3084
3085
0
    bidx_t *h = idx->bidx[tid];
3086
0
    if (!h) return -1;
3087
0
    khint_t k = kh_get(bin, h, META_BIN(idx));
3088
0
    if (k != kh_end(h)) {
3089
0
        *mapped = kh_val(h, k).list[1].u;
3090
0
        *unmapped = kh_val(h, k).list[1].v;
3091
0
        return 0;
3092
0
    } else {
3093
0
        *mapped = 0; *unmapped = 0;
3094
0
        return -1;
3095
0
    }
3096
0
}
3097
3098
uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx)
3099
0
{
3100
0
    if (idx->fmt == HTS_FMT_CRAI) return 0;
3101
0
    return idx->n_no_coor;
3102
0
}
3103
3104
/****************
3105
 *** Iterator ***
3106
 ****************/
3107
3108
// Note: even with 32-bit hts_pos_t, end needs to be 64-bit here due to 1LL<<s.
3109
static inline int reg2bins_narrow(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls, bidx_t *bidx)
3110
0
{
3111
0
    int l, t, s = min_shift + (n_lvls<<1) + n_lvls;
3112
0
    for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
3113
0
        hts_pos_t b, e;
3114
0
        int i;
3115
0
        b = t + (beg>>s); e = t + (end>>s);
3116
0
        for (i = b; i <= e; ++i) {
3117
0
            if (kh_get(bin, bidx, i) != kh_end(bidx)) {
3118
0
                assert(itr->bins.n < itr->bins.m);
3119
0
                itr->bins.a[itr->bins.n++] = i;
3120
0
            }
3121
0
        }
3122
0
    }
3123
0
    return itr->bins.n;
3124
0
}
3125
3126
static inline int reg2bins_wide(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls, bidx_t *bidx)
3127
0
{
3128
0
    khint_t i;
3129
0
    hts_pos_t max_shift = 3 * n_lvls + min_shift;
3130
0
    --end;
3131
0
    if (beg < 0) beg = 0;
3132
0
    for (i = kh_begin(bidx); i != kh_end(bidx); i++) {
3133
0
        if (!kh_exist(bidx, i)) continue;
3134
0
        hts_pos_t bin = (hts_pos_t) kh_key(bidx, i);
3135
0
        int level = hts_bin_level(bin);
3136
0
        if (level > n_lvls) continue; // Dodgy index?
3137
0
        hts_pos_t first = hts_bin_first(level);
3138
0
        hts_pos_t beg_at_level = first + (beg >> (max_shift - 3 * level));
3139
0
        hts_pos_t end_at_level = first + (end >> (max_shift - 3 * level));
3140
0
        if (beg_at_level <= bin && bin <= end_at_level) {
3141
0
            assert(itr->bins.n < itr->bins.m);
3142
0
            itr->bins.a[itr->bins.n++] = bin;
3143
0
        }
3144
0
    }
3145
0
    return itr->bins.n;
3146
0
}
3147
3148
static inline int reg2bins(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls, bidx_t *bidx)
3149
0
{
3150
0
    int l, t, s = min_shift + (n_lvls<<1) + n_lvls;
3151
0
    size_t reg_bin_count = 0, hash_bin_count = kh_n_buckets(bidx), max_bins;
3152
0
    hts_pos_t end1;
3153
0
    if (end >= 1LL<<s) end = 1LL<<s;
3154
0
    if (beg >= end) return 0;
3155
0
    end1 = end - 1;
3156
3157
    // Count bins to see if it's faster to iterate through the hash table
3158
    // or the set of bins covering the region
3159
0
    for (l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
3160
0
        reg_bin_count += (end1 >> s) - (beg >> s) + 1;
3161
0
    }
3162
0
    max_bins = reg_bin_count < kh_size(bidx) ? reg_bin_count : kh_size(bidx);
3163
0
    if (itr->bins.m - itr->bins.n < max_bins) {
3164
        // Worst-case memory usage.  May be wasteful on very sparse
3165
        // data, but the bin list usually won't be too big anyway.
3166
0
        size_t new_m = max_bins + itr->bins.n;
3167
0
        if (new_m > INT_MAX || new_m > SIZE_MAX / sizeof(int)) {
3168
0
            errno = ENOMEM;
3169
0
            return -1;
3170
0
        }
3171
0
        int *new_a = realloc(itr->bins.a, new_m * sizeof(*new_a));
3172
0
        if (!new_a) return -1;
3173
0
        itr->bins.a = new_a;
3174
0
        itr->bins.m = new_m;
3175
0
    }
3176
0
    if (reg_bin_count < hash_bin_count) {
3177
0
        return reg2bins_narrow(beg, end, itr, min_shift, n_lvls, bidx);
3178
0
    } else {
3179
0
        return reg2bins_wide(beg, end, itr, min_shift, n_lvls, bidx);
3180
0
    }
3181
0
}
3182
3183
static inline int add_to_interval(hts_itr_t *iter, bins_t *bin,
3184
                                  int tid, uint32_t interval,
3185
                                  uint64_t min_off, uint64_t max_off)
3186
0
{
3187
0
    hts_pair64_max_t *off;
3188
0
    int j;
3189
3190
0
    if (!bin->n)
3191
0
        return 0;
3192
0
    off = realloc(iter->off, (iter->n_off + bin->n) * sizeof(*off));
3193
0
    if (!off)
3194
0
        return -2;
3195
3196
0
    iter->off = off;
3197
0
    for (j = 0; j < bin->n; ++j) {
3198
0
        if (bin->list[j].v > min_off && bin->list[j].u < max_off) {
3199
0
            iter->off[iter->n_off].u = min_off > bin->list[j].u
3200
0
                ? min_off : bin->list[j].u;
3201
0
            iter->off[iter->n_off].v = max_off < bin->list[j].v
3202
0
                ? max_off : bin->list[j].v;
3203
            // hts_pair64_max_t::max is now used to link
3204
            // file offsets to region list entries.
3205
            // The iterator can use this to decide if it
3206
            // can skip some file regions.
3207
0
            iter->off[iter->n_off].max = ((uint64_t) tid << 32) | interval;
3208
0
            iter->n_off++;
3209
0
        }
3210
0
    }
3211
0
    return 0;
3212
0
}
3213
3214
static inline int reg2intervals_narrow(hts_itr_t *iter, const bidx_t *bidx,
3215
                                       int tid, int64_t beg, int64_t end,
3216
                                       uint32_t interval,
3217
                                       uint64_t min_off, uint64_t max_off,
3218
                                       int min_shift, int n_lvls)
3219
0
{
3220
0
    int l, t, s = min_shift + n_lvls * 3;
3221
0
    hts_pos_t b, e, i;
3222
3223
0
    for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
3224
0
        b = t + (beg>>s); e = t + (end>>s);
3225
0
        for (i = b; i <= e; ++i) {
3226
0
            khint_t k = kh_get(bin, bidx, i);
3227
0
            if (k != kh_end(bidx)) {
3228
0
                bins_t *bin = &kh_value(bidx, k);
3229
0
                int res = add_to_interval(iter, bin, tid, interval, min_off, max_off);
3230
0
                if (res < 0)
3231
0
                    return res;
3232
0
            }
3233
0
        }
3234
0
    }
3235
0
    return 0;
3236
0
}
3237
3238
static inline int reg2intervals_wide(hts_itr_t *iter, const bidx_t *bidx,
3239
                                       int tid, int64_t beg, int64_t end,
3240
                                       uint32_t interval,
3241
                                       uint64_t min_off, uint64_t max_off,
3242
                                       int min_shift, int n_lvls)
3243
0
{
3244
0
    khint_t i;
3245
0
    hts_pos_t max_shift = 3 * n_lvls + min_shift;
3246
0
    --end;
3247
0
    if (beg < 0) beg = 0;
3248
0
    for (i = kh_begin(bidx); i != kh_end(bidx); i++) {
3249
0
        if (!kh_exist(bidx, i)) continue;
3250
0
        hts_pos_t bin = (hts_pos_t) kh_key(bidx, i);
3251
0
        int level = hts_bin_level(bin);
3252
0
        if (level > n_lvls) continue; // Dodgy index?
3253
0
        hts_pos_t first = hts_bin_first(level);
3254
0
        hts_pos_t beg_at_level = first + (beg >> (max_shift - 3 * level));
3255
0
        hts_pos_t end_at_level = first + (end >> (max_shift - 3 * level));
3256
0
        if (beg_at_level <= bin && bin <= end_at_level) {
3257
0
            bins_t *bin = &kh_value(bidx, i);
3258
0
            int res = add_to_interval(iter, bin, tid, interval, min_off, max_off);
3259
0
            if (res < 0)
3260
0
                return res;
3261
0
        }
3262
0
    }
3263
0
    return 0;
3264
0
}
3265
3266
static inline int reg2intervals(hts_itr_t *iter, const hts_idx_t *idx, int tid, int64_t beg, int64_t end, uint32_t interval, uint64_t min_off, uint64_t max_off, int min_shift, int n_lvls)
3267
0
{
3268
0
    int l, t, s;
3269
0
    int i, j;
3270
0
    hts_pos_t end1;
3271
0
    bidx_t *bidx;
3272
0
    int start_n_off;
3273
0
    size_t reg_bin_count = 0, hash_bin_count;
3274
0
    int res;
3275
3276
0
    if (!iter || !idx || (bidx = idx->bidx[tid]) == NULL || beg > end)
3277
0
        return -1;
3278
3279
0
    hash_bin_count = kh_n_buckets(bidx);
3280
3281
0
    s = min_shift + (n_lvls<<1) + n_lvls;
3282
0
    if (end >= 1LL<<s)
3283
0
        end = 1LL<<s;
3284
3285
0
    end1 = end - 1;
3286
    // Count bins to see if it's faster to iterate through the hash table
3287
    // or the set of bins covering the region
3288
0
    for (l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
3289
0
        reg_bin_count += (end1 >> s) - (beg >> s) + 1;
3290
0
    }
3291
3292
0
    start_n_off = iter->n_off;
3293
3294
    // Populate iter->off with the intervals for this region
3295
0
    if (reg_bin_count < hash_bin_count) {
3296
0
        res = reg2intervals_narrow(iter, bidx, tid, beg, end, interval,
3297
0
                                   min_off, max_off, min_shift, n_lvls);
3298
0
    } else {
3299
0
        res = reg2intervals_wide(iter, bidx, tid, beg, end, interval,
3300
0
                                 min_off, max_off, min_shift, n_lvls);
3301
0
    }
3302
0
    if (res < 0)
3303
0
        return res;
3304
3305
0
    if (iter->n_off - start_n_off > 1) {
3306
0
        ks_introsort(_off_max, iter->n_off - start_n_off, iter->off + start_n_off);
3307
0
        for (i = start_n_off, j = start_n_off + 1; j < iter->n_off; j++) {
3308
0
            if (iter->off[i].v >= iter->off[j].u) {
3309
0
                if (iter->off[i].v < iter->off[j].v)
3310
0
                    iter->off[i].v = iter->off[j].v;
3311
0
            } else {
3312
0
                i++;
3313
0
                if (i < j)
3314
0
                    iter->off[i] = iter->off[j];
3315
0
            }
3316
0
        }
3317
0
        iter->n_off = i + 1;
3318
0
    }
3319
3320
0
    return iter->n_off;
3321
0
}
3322
3323
0
static int compare_regions(const void *r1, const void *r2) {
3324
0
    hts_reglist_t *reg1 = (hts_reglist_t *)r1;
3325
0
    hts_reglist_t *reg2 = (hts_reglist_t *)r2;
3326
3327
0
    if (reg1->tid < 0 && reg2->tid >= 0)
3328
0
        return 1;
3329
0
    else if (reg1->tid >= 0 && reg2->tid < 0)
3330
0
        return -1;
3331
0
    else
3332
0
        return reg1->tid - reg2->tid;
3333
0
}
3334
3335
0
uint64_t hts_itr_off(const hts_idx_t* idx, int tid) {
3336
3337
0
    int i;
3338
0
    bidx_t* bidx;
3339
0
    uint64_t off0 = (uint64_t) -1;
3340
0
    khint_t k;
3341
0
    switch (tid) {
3342
0
    case HTS_IDX_START:
3343
        // Find the smallest offset, note that sequence ids may not be ordered sequentially
3344
0
        for (i = 0; i < idx->n; i++) {
3345
0
            bidx = idx->bidx[i];
3346
0
            k = kh_get(bin, bidx, META_BIN(idx));
3347
0
            if (k == kh_end(bidx))
3348
0
                continue;
3349
3350
0
            if (off0 > kh_val(bidx, k).list[0].u)
3351
0
                off0 = kh_val(bidx, k).list[0].u;
3352
0
        }
3353
0
        if (off0 == (uint64_t) -1 && idx->n_no_coor)
3354
0
            off0 = 0;
3355
        // only no-coor reads in this bam
3356
0
        break;
3357
0
    case HTS_IDX_NOCOOR:
3358
        /* No-coor reads sort after all of the mapped reads.  The position
3359
           is not stored in the index itself, so need to find the end
3360
           offset for the last mapped read.  A loop is needed here in
3361
           case references at the end of the file have no mapped reads,
3362
           or sequence ids are not ordered sequentially.
3363
           See issue samtools#568 and commits b2aab8, 60c22d and cc207d. */
3364
0
        for (i = 0; i < idx->n; i++) {
3365
0
            bidx = idx->bidx[i];
3366
0
            k = kh_get(bin, bidx, META_BIN(idx));
3367
0
            if (k != kh_end(bidx)) {
3368
0
                if (off0 == (uint64_t) -1 || off0 < kh_val(bidx, k).list[0].v) {
3369
0
                    off0 = kh_val(bidx, k).list[0].v;
3370
0
                }
3371
0
            }
3372
0
        }
3373
0
        if (off0 == (uint64_t) -1 && idx->n_no_coor)
3374
0
            off0 = 0;
3375
        // only no-coor reads in this bam
3376
0
        break;
3377
0
    case HTS_IDX_REST:
3378
0
        off0 = 0;
3379
0
        break;
3380
0
    case HTS_IDX_NONE:
3381
0
        off0 = 0;
3382
0
        break;
3383
0
    }
3384
3385
0
    return off0;
3386
0
}
3387
3388
hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec)
3389
0
{
3390
0
    int i, n_off, l, bin;
3391
0
    hts_pair64_max_t *off;
3392
0
    khint_t k;
3393
0
    bidx_t *bidx;
3394
0
    uint64_t min_off, max_off;
3395
0
    hts_pos_t idx_maxpos;
3396
0
    hts_itr_t *iter;
3397
0
    uint32_t unmapped = 0, rel_off;
3398
3399
    // It's possible to call this function with NULL idx iff
3400
    // tid is one of the special values HTS_IDX_REST or HTS_IDX_NONE
3401
0
    if (!idx && !(tid == HTS_IDX_REST || tid == HTS_IDX_NONE)) {
3402
0
        errno = EINVAL;
3403
0
        return NULL;
3404
0
    }
3405
3406
0
    iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t));
3407
0
    if (iter) {
3408
0
        if (tid < 0) {
3409
0
            uint64_t off = hts_itr_off(idx, tid);
3410
0
            if (off != (uint64_t) -1) {
3411
0
                iter->read_rest = 1;
3412
0
                iter->curr_off = off;
3413
0
                iter->readrec = readrec;
3414
0
                if (tid == HTS_IDX_NONE)
3415
0
                    iter->finished = 1;
3416
0
            } else {
3417
0
                free(iter);
3418
0
                iter = NULL;
3419
0
            }
3420
0
        } else if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL) {
3421
0
            iter->finished = 1;
3422
0
        } else {
3423
0
            if (beg < 0) beg = 0;
3424
0
            if (end < beg) {
3425
0
              free(iter);
3426
0
              return NULL;
3427
0
            }
3428
3429
0
            k = kh_get(bin, bidx, META_BIN(idx));
3430
0
            if (k != kh_end(bidx))
3431
0
                unmapped = kh_val(bidx, k).list[1].v;
3432
0
            else
3433
0
                unmapped = 1;
3434
3435
0
            iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
3436
0
            iter->readrec = readrec;
3437
3438
0
            if ( !kh_size(bidx) ) { iter->finished = 1; return iter; }
3439
3440
0
            idx_maxpos = hts_idx_maxpos(idx);
3441
0
            if (beg >= idx_maxpos) { iter->finished = 1; return iter; }
3442
3443
0
            rel_off = beg>>idx->min_shift;
3444
            // compute min_off
3445
0
            bin = hts_bin_first(idx->n_lvls) + rel_off;
3446
0
            do {
3447
0
                int first;
3448
0
                k = kh_get(bin, bidx, bin);
3449
0
                if (k != kh_end(bidx)) break;
3450
0
                first = (hts_bin_parent(bin)<<3) + 1;
3451
0
                if (bin > first) --bin;
3452
0
                else bin = hts_bin_parent(bin);
3453
0
            } while (bin);
3454
0
            if (bin == 0) k = kh_get(bin, bidx, bin);
3455
0
            min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0;
3456
            // min_off can be calculated more accurately if the
3457
            // linear index is available
3458
0
            if (idx->lidx[tid].offset
3459
0
                && rel_off < idx->lidx[tid].n) {
3460
0
                if (min_off < idx->lidx[tid].offset[rel_off])
3461
0
                    min_off = idx->lidx[tid].offset[rel_off];
3462
0
                if (unmapped) {
3463
                    // unmapped reads are not covered by the linear index,
3464
                    // so search backwards for a smaller offset
3465
0
                    int tmp_off;
3466
0
                    for (tmp_off = rel_off-1; tmp_off >= 0; tmp_off--) {
3467
0
                        if (idx->lidx[tid].offset[tmp_off] < min_off) {
3468
0
                            min_off = idx->lidx[tid].offset[tmp_off];
3469
0
                            break;
3470
0
                        }
3471
0
                    }
3472
                    // if the search went too far back or no satisfactory entry
3473
                    // was found, revert to the bin index loff value
3474
0
                    if (k != kh_end(bidx) && (min_off < kh_val(bidx, k).loff || tmp_off < 0))
3475
0
                        min_off = kh_val(bidx, k).loff;
3476
0
                }
3477
0
            } else if (unmapped) { //CSI index
3478
0
                if (k != kh_end(bidx))
3479
0
                    min_off = kh_val(bidx, k).loff;
3480
0
            }
3481
3482
            // compute max_off: a virtual offset from a bin to the right of end
3483
            // First check if end lies within the range of the index (it won't
3484
            // if it's HTS_POS_MAX)
3485
0
            if (end <= idx_maxpos) {
3486
0
                bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1;
3487
0
                if (bin >= idx->n_bins) bin = 0;
3488
0
                while (1) {
3489
                    // search for an extant bin by moving right, but moving up to the
3490
                    // parent whenever we get to a first child (which also covers falling
3491
                    // off the RHS, which wraps around and immediately goes up to bin 0)
3492
0
                    while (bin % 8 == 1) bin = hts_bin_parent(bin);
3493
0
                    if (bin == 0) { max_off = UINT64_MAX; break; }
3494
0
                    k = kh_get(bin, bidx, bin);
3495
0
                    if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) { max_off = kh_val(bidx, k).list[0].u; break; }
3496
0
                    bin++;
3497
0
                }
3498
0
            } else {
3499
                // Searching to end of reference
3500
0
                max_off = UINT64_MAX;
3501
0
            }
3502
3503
            // retrieve bins
3504
0
            if (reg2bins(beg, end, iter, idx->min_shift, idx->n_lvls, bidx) < 0) {
3505
0
                hts_itr_destroy(iter);
3506
0
                return NULL;
3507
0
            }
3508
3509
0
            for (i = n_off = 0; i < iter->bins.n; ++i)
3510
0
                if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx))
3511
0
                    n_off += kh_value(bidx, k).n;
3512
0
            if (n_off == 0) {
3513
                // No overlapping bins means the iterator has already finished.
3514
0
                iter->finished = 1;
3515
0
                return iter;
3516
0
            }
3517
0
            off = calloc(n_off, sizeof(*off));
3518
0
            for (i = n_off = 0; i < iter->bins.n; ++i) {
3519
0
                if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx)) {
3520
0
                    int j;
3521
0
                    bins_t *p = &kh_value(bidx, k);
3522
0
                    for (j = 0; j < p->n; ++j)
3523
0
                        if (p->list[j].v > min_off && p->list[j].u < max_off) {
3524
0
                            off[n_off].u = min_off > p->list[j].u
3525
0
                                ? min_off : p->list[j].u;
3526
0
                            off[n_off].v = max_off < p->list[j].v
3527
0
                                ? max_off : p->list[j].v;
3528
                            // hts_pair64_max_t::max is now used to link
3529
                            // file offsets to region list entries.
3530
                            // The iterator can use this to decide if it
3531
                            // can skip some file regions.
3532
0
                            off[n_off].max = ((uint64_t) tid << 32) | j;
3533
0
                            n_off++;
3534
0
                        }
3535
0
                }
3536
0
            }
3537
3538
0
            if (n_off == 0) {
3539
0
                free(off);
3540
0
                iter->finished = 1;
3541
0
                return iter;
3542
0
            }
3543
0
            ks_introsort(_off_max, n_off, off);
3544
            // resolve completely contained adjacent blocks
3545
0
            for (i = 1, l = 0; i < n_off; ++i)
3546
0
                if (off[l].v < off[i].v) off[++l] = off[i];
3547
0
            n_off = l + 1;
3548
            // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
3549
0
            for (i = 1; i < n_off; ++i)
3550
0
                if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
3551
            // merge adjacent blocks
3552
0
            for (i = 1, l = 0; i < n_off; ++i) {
3553
0
                if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
3554
0
                else off[++l] = off[i];
3555
0
            }
3556
0
            n_off = l + 1;
3557
0
            iter->n_off = n_off; iter->off = off;
3558
0
        }
3559
0
    }
3560
3561
0
    return iter;
3562
0
}
3563
3564
int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter)
3565
0
{
3566
0
    int i, j, bin;
3567
0
    khint_t k;
3568
0
    bidx_t *bidx;
3569
0
    uint64_t min_off, max_off, t_off = (uint64_t)-1;
3570
0
    int tid;
3571
0
    hts_pos_t beg, end, idx_maxpos;
3572
0
    hts_reglist_t *curr_reg;
3573
0
    uint32_t unmapped = 0, rel_off;
3574
3575
0
    if (!idx || !iter || !iter->multi)
3576
0
        return -1;
3577
3578
0
    iter->i = -1;
3579
0
    for (i=0; i<iter->n_reg; i++) {
3580
3581
0
        curr_reg = &iter->reg_list[i];
3582
0
        tid = curr_reg->tid;
3583
3584
0
        if (tid < 0) {
3585
0
            t_off = hts_itr_off(idx, tid);
3586
0
            if (t_off != (uint64_t)-1) {
3587
0
                switch (tid) {
3588
0
                case HTS_IDX_NONE:
3589
0
                    iter->finished = 1;
3590
                    // fall through
3591
0
                case HTS_IDX_START:
3592
0
                case HTS_IDX_REST:
3593
0
                    iter->curr_off = t_off;
3594
0
                    iter->n_reg = 0;
3595
0
                    iter->reg_list = NULL;
3596
0
                    iter->read_rest = 1;
3597
0
                    return 0;
3598
0
                case HTS_IDX_NOCOOR:
3599
0
                    iter->nocoor = 1;
3600
0
                    iter->nocoor_off = t_off;
3601
0
                }
3602
0
            }
3603
0
        } else {
3604
0
            if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL || !kh_size(bidx))
3605
0
                continue;
3606
3607
0
            k = kh_get(bin, bidx, META_BIN(idx));
3608
0
            if (k != kh_end(bidx))
3609
0
                unmapped = kh_val(bidx, k).list[1].v;
3610
0
            else
3611
0
                unmapped = 1;
3612
3613
0
            idx_maxpos = hts_idx_maxpos(idx);
3614
3615
0
            for(j=0; j<curr_reg->count; j++) {
3616
0
                hts_pair32_t *curr_intv = &curr_reg->intervals[j];
3617
0
                if (curr_intv->end < curr_intv->beg)
3618
0
                    continue;
3619
3620
0
                beg = curr_intv->beg;
3621
0
                end = curr_intv->end;
3622
0
                if (beg >= idx_maxpos)
3623
0
                    continue;
3624
0
                rel_off = beg>>idx->min_shift;
3625
3626
                /* Compute 'min_off' by searching the lowest level bin containing 'beg'.
3627
                       If the computed bin is not in the index, try the next bin to the
3628
                       left, belonging to the same parent. If it is the first sibling bin,
3629
                       try the parent bin. */
3630
0
                bin = hts_bin_first(idx->n_lvls) + rel_off;
3631
0
                do {
3632
0
                    int first;
3633
0
                    k = kh_get(bin, bidx, bin);
3634
0
                    if (k != kh_end(bidx)) break;
3635
0
                    first = (hts_bin_parent(bin)<<3) + 1;
3636
0
                    if (bin > first) --bin;
3637
0
                    else bin = hts_bin_parent(bin);
3638
0
                } while (bin);
3639
0
                if (bin == 0)
3640
0
                    k = kh_get(bin, bidx, bin);
3641
0
                min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0;
3642
                // min_off can be calculated more accurately if the
3643
                // linear index is available
3644
0
                if (idx->lidx[tid].offset
3645
0
                    && rel_off < idx->lidx[tid].n) {
3646
0
                    if (min_off < idx->lidx[tid].offset[rel_off])
3647
0
                        min_off = idx->lidx[tid].offset[rel_off];
3648
0
                    if (unmapped) {
3649
0
                        int tmp_off;
3650
0
                        for (tmp_off = rel_off-1; tmp_off >= 0; tmp_off--) {
3651
0
                            if (idx->lidx[tid].offset[tmp_off] < min_off) {
3652
0
                                min_off = idx->lidx[tid].offset[tmp_off];
3653
0
                                break;
3654
0
                            }
3655
0
                        }
3656
3657
0
                        if (k != kh_end(bidx) && (min_off < kh_val(bidx, k).loff || tmp_off < 0))
3658
0
                            min_off = kh_val(bidx, k).loff;
3659
0
                    }
3660
0
                } else if (unmapped) { //CSI index
3661
0
                    if (k != kh_end(bidx))
3662
0
                        min_off = kh_val(bidx, k).loff;
3663
0
                }
3664
3665
                // compute max_off: a virtual offset from a bin to the right of end
3666
                // First check if end lies within the range of the index (it
3667
                // won't if it's HTS_POS_MAX)
3668
0
                if (end <= idx_maxpos) {
3669
0
                    bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1;
3670
0
                    if (bin >= idx->n_bins) bin = 0;
3671
0
                    while (1) {
3672
                        // search for an extant bin by moving right, but moving up to the
3673
                        // parent whenever we get to a first child (which also covers falling
3674
                        // off the RHS, which wraps around and immediately goes up to bin 0)
3675
0
                        while (bin % 8 == 1) bin = hts_bin_parent(bin);
3676
0
                        if (bin == 0) { max_off = UINT64_MAX; break; }
3677
0
                        k = kh_get(bin, bidx, bin);
3678
0
                        if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) {
3679
0
                            max_off = kh_val(bidx, k).list[0].u;
3680
0
                            break;
3681
0
                        }
3682
0
                        bin++;
3683
0
                    }
3684
0
                } else {
3685
                    // Searching to end of reference
3686
0
                    max_off = UINT64_MAX;
3687
0
                }
3688
3689
                //convert coordinates to file offsets
3690
0
                if (reg2intervals(iter, idx, tid, beg, end, j,
3691
0
                                  min_off, max_off,
3692
0
                                  idx->min_shift, idx->n_lvls) < 0) {
3693
0
                    return -1;
3694
0
                }
3695
0
            }
3696
0
        }
3697
0
    }
3698
3699
0
    if (iter->n_off > 1)
3700
0
        ks_introsort(_off_max, iter->n_off, iter->off);
3701
3702
0
    if(!iter->n_off && !iter->nocoor)
3703
0
        iter->finished = 1;
3704
3705
0
    return 0;
3706
0
}
3707
3708
int hts_itr_multi_cram(const hts_idx_t *idx, hts_itr_t *iter)
3709
0
{
3710
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
3711
0
    int tid, i, n_off = 0;
3712
0
    uint32_t j;
3713
0
    hts_pos_t beg, end;
3714
0
    hts_reglist_t *curr_reg;
3715
0
    hts_pair32_t *curr_intv;
3716
0
    hts_pair64_max_t *off = NULL, *tmp;
3717
0
    cram_index *e = NULL;
3718
3719
0
    if (!cidx || !iter || !iter->multi)
3720
0
        return -1;
3721
3722
0
    iter->is_cram = 1;
3723
0
    iter->read_rest = 0;
3724
0
    iter->off = NULL;
3725
0
    iter->n_off = 0;
3726
0
    iter->curr_off = 0;
3727
0
    iter->i = -1;
3728
3729
0
    for (i=0; i<iter->n_reg; i++) {
3730
3731
0
        curr_reg = &iter->reg_list[i];
3732
0
        tid = curr_reg->tid;
3733
3734
0
        if (tid >= 0) {
3735
0
            tmp = realloc(off, (n_off + curr_reg->count) * sizeof(*off));
3736
0
            if (!tmp)
3737
0
                goto err;
3738
0
            off = tmp;
3739
3740
0
            for (j=0; j < curr_reg->count; j++) {
3741
0
                curr_intv = &curr_reg->intervals[j];
3742
0
                if (curr_intv->end < curr_intv->beg)
3743
0
                    continue;
3744
3745
0
                beg = curr_intv->beg;
3746
0
                end = curr_intv->end;
3747
3748
/* First, fetch the container overlapping 'beg' and assign its file offset to u, then
3749
 * find the container overlapping 'end' and assign the relative end of the slice to v.
3750
 * The cram_ptell function will adjust with the container offset, which is not stored
3751
 * in the index.
3752
 */
3753
0
                e = cram_index_query(cidx->cram, tid, beg+1, NULL);
3754
0
                if (e) {
3755
0
                    off[n_off].u = e->offset;
3756
                    // hts_pair64_max_t::max is now used to link
3757
                    // file offsets to region list entries.
3758
                    // The iterator can use this to decide if it
3759
                    // can skip some file regions.
3760
0
                    off[n_off].max = ((uint64_t) tid << 32) | j;
3761
3762
0
                    if (end >= HTS_POS_MAX) {
3763
0
                       e = cram_index_last(cidx->cram, tid, NULL);
3764
0
                    } else {
3765
0
                       e = cram_index_query_last(cidx->cram, tid, end+1);
3766
0
                    }
3767
3768
0
                    if (e) {
3769
0
                        off[n_off++].v = e->e_next
3770
0
                            ? e->e_next->offset
3771
0
                            : e->offset + e->slice + e->len;
3772
0
                    } else {
3773
0
                        hts_log_warning("Could not set offset end for region %d:%"PRIhts_pos"-%"PRIhts_pos". Skipping", tid, beg, end);
3774
0
                    }
3775
0
                }
3776
0
            }
3777
0
        } else {
3778
0
            switch (tid) {
3779
0
                case HTS_IDX_NOCOOR:
3780
0
                    e = cram_index_query(cidx->cram, tid, 1, NULL);
3781
0
                    if (e) {
3782
0
                        iter->nocoor = 1;
3783
0
                        iter->nocoor_off = e->offset;
3784
0
                    } else {
3785
0
                        hts_log_warning("No index entry for NOCOOR region");
3786
0
                    }
3787
0
                    break;
3788
0
                case HTS_IDX_START:
3789
0
                    e = cram_index_query(cidx->cram, tid, 1, NULL);
3790
0
                    if (e) {
3791
0
                        iter->read_rest = 1;
3792
0
                        tmp = realloc(off, sizeof(*off));
3793
0
                        if (!tmp)
3794
0
                            goto err;
3795
0
                        off = tmp;
3796
0
                        off[0].u = e->offset;
3797
0
                        off[0].v = 0;
3798
0
                        n_off=1;
3799
0
                    } else {
3800
0
                        hts_log_warning("No index entries");
3801
0
                    }
3802
0
                    break;
3803
0
                case HTS_IDX_REST:
3804
0
                    break;
3805
0
                case HTS_IDX_NONE:
3806
0
                    iter->finished = 1;
3807
0
                    break;
3808
0
                default:
3809
0
                    hts_log_error("Query with tid=%d not implemented for CRAM files", tid);
3810
0
            }
3811
0
        }
3812
0
    }
3813
3814
0
    if (n_off) {
3815
0
        ks_introsort(_off_max, n_off, off);
3816
0
        iter->n_off = n_off; iter->off = off;
3817
0
    }
3818
3819
0
    if(!n_off && !iter->nocoor)
3820
0
        iter->finished = 1;
3821
3822
0
    return 0;
3823
3824
0
 err:
3825
0
    free(off);
3826
0
    return -1;
3827
0
}
3828
3829
void hts_itr_destroy(hts_itr_t *iter)
3830
0
{
3831
0
    if (iter) {
3832
0
        if (iter->multi) {
3833
0
            hts_reglist_free(iter->reg_list, iter->n_reg);
3834
0
        } else {
3835
0
            free(iter->bins.a);
3836
0
        }
3837
3838
0
        if (iter->off)
3839
0
            free(iter->off);
3840
0
        free(iter);
3841
0
    }
3842
0
}
3843
3844
static inline unsigned long long push_digit(unsigned long long i, char c)
3845
0
{
3846
    // ensure subtraction occurs first, avoiding overflow for >= MAX-48 or so
3847
0
    int digit = c - '0';
3848
0
    return 10 * i + digit;
3849
0
}
3850
3851
long long hts_parse_decimal(const char *str, char **strend, int flags)
3852
0
{
3853
0
    unsigned long long n = 0;
3854
0
    int digits = 0, decimals = 0, e = 0, lost = 0;
3855
0
    char sign = '+', esign = '+';
3856
0
    const char *s, *str_orig = str;
3857
3858
0
    while (isspace_c(*str)) str++;
3859
0
    s = str;
3860
3861
0
    if (*s == '+' || *s == '-') sign = *s++;
3862
0
    while (*s)
3863
0
        if (isdigit_c(*s)) digits++, n = push_digit(n, *s++);
3864
0
        else if (*s == ',' && (flags & HTS_PARSE_THOUSANDS_SEP)) s++;
3865
0
        else break;
3866
3867
0
    if (*s == '.') {
3868
0
        s++;
3869
0
        while (isdigit_c(*s)) decimals++, digits++, n = push_digit(n, *s++);
3870
0
    }
3871
3872
0
    switch (*s) {
3873
0
    case 'e': case 'E':
3874
0
        s++;
3875
0
        if (*s == '+' || *s == '-') esign = *s++;
3876
0
        while (isdigit_c(*s)) e = push_digit(e, *s++);
3877
0
        if (esign == '-') e = -e;
3878
0
        break;
3879
3880
0
    case 'k': case 'K': e += 3; s++; break;
3881
0
    case 'm': case 'M': e += 6; s++; break;
3882
0
    case 'g': case 'G': e += 9; s++; break;
3883
0
    }
3884
3885
0
    e -= decimals;
3886
0
    while (e > 0) n *= 10, e--;
3887
0
    while (e < 0) lost += n % 10, n /= 10, e++;
3888
3889
0
    if (lost > 0) {
3890
0
        hts_log_warning("Discarding fractional part of %.*s", (int)(s - str), str);
3891
0
    }
3892
3893
0
    if (strend) {
3894
        // Set to the original input str pointer if not valid number syntax
3895
0
        *strend = (digits > 0)? (char *)s : (char *)str_orig;
3896
0
    } else if (digits == 0) {
3897
0
        hts_log_warning("Invalid numeric value %.8s[truncated]", str);
3898
0
    } else if (*s) {
3899
0
        if ((flags & HTS_PARSE_THOUSANDS_SEP) || (!(flags & HTS_PARSE_THOUSANDS_SEP) && *s != ','))
3900
0
            hts_log_warning("Ignoring unknown characters after %.*s[%s]", (int)(s - str), str, s);
3901
0
    }
3902
3903
0
    return (sign == '+')? n : -n;
3904
0
}
3905
3906
0
static void *hts_memrchr(const void *s, int c, size_t n) {
3907
0
    size_t i;
3908
0
    unsigned char *u = (unsigned char *)s;
3909
0
    for (i = n; i > 0; i--) {
3910
0
        if (u[i-1] == c)
3911
0
            return u+i-1;
3912
0
    }
3913
3914
0
    return NULL;
3915
0
}
3916
3917
/*
3918
 * A variant of hts_parse_reg which is reference-id aware.  It uses
3919
 * the iterator name2id callbacks to validate the region tokenisation works.
3920
 *
3921
 * This is necessary due to GRCh38 HLA additions which have reference names
3922
 * like "HLA-DRB1*12:17".
3923
 *
3924
 * All parameters are mandatory.
3925
 *
3926
 * To work around ambiguous parsing issues, eg both "chr1" and "chr1:100-200"
3927
 * are reference names, we may quote using curly braces.
3928
 * Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example.
3929
 *
3930
 * Flags are used to control how parsing works, and can be one of the below.
3931
 *
3932
 * HTS_PARSE_LIST:
3933
 *     If present, the region is assmed to be a comma separated list and
3934
 *     position parsing will not contain commas (this implicitly
3935
 *     clears HTS_PARSE_THOUSANDS_SEP in the call to hts_parse_decimal).
3936
 *     On success the return pointer will be the start of the next region, ie
3937
 *     the character after the comma.  (If *ret != '\0' then the caller can
3938
 *     assume another region is present in the list.)
3939
 *
3940
 *     If not set then positions may contain commas.  In this case the return
3941
 *     value should point to the end of the string, or NULL on failure.
3942
 *
3943
 * HTS_PARSE_ONE_COORD:
3944
 *     If present, X:100 is treated as the single base pair region X:100-100.
3945
 *     In this case X:-100 is shorthand for X:1-100 and X:100- is X:100-<end>.
3946
 *     (This is the standard bcftools region convention.)
3947
 *
3948
 *     When not set X:100 is considered to be X:100-<end> where <end> is
3949
 *     the end of chromosome X (set to HTS_POS_MAX here).  X:100- and X:-100
3950
 *     are invalid.
3951
 *     (This is the standard samtools region convention.)
3952
 *
3953
 * Note the supplied string expects 1 based inclusive coordinates, but the
3954
 * returned coordinates start from 0 and are half open, so pos0 is valid
3955
 * for use in e.g. "for (pos0 = beg; pos0 < end; pos0++) {...}"
3956
 *
3957
 * On success a pointer to the byte after the end of the entire region
3958
 *            specifier is returned (plus any trailing comma), and tid,
3959
 *            beg & end will be set.
3960
 * On failure NULL is returned.
3961
 */
3962
const char *hts_parse_region(const char *s, int *tid, hts_pos_t *beg,
3963
                             hts_pos_t *end, hts_name2id_f getid, void *hdr,
3964
                             int flags)
3965
0
{
3966
0
    if (!s || !tid || !beg || !end || !getid)
3967
0
        return NULL;
3968
3969
0
    size_t s_len = strlen(s);
3970
0
    kstring_t ks = { 0, 0, NULL };
3971
3972
0
    const char *colon = NULL, *comma = NULL;
3973
0
    int quoted = 0;
3974
3975
0
    if (flags & HTS_PARSE_LIST)
3976
0
        flags &= ~HTS_PARSE_THOUSANDS_SEP;
3977
0
    else
3978
0
        flags |= HTS_PARSE_THOUSANDS_SEP;
3979
3980
0
    const char *s_end = s + s_len;
3981
3982
    // Braced quoting of references is permitted to resolve ambiguities.
3983
0
    if (*s == '{') {
3984
0
        const char *close = memchr(s, '}', s_len);
3985
0
        if (!close) {
3986
0
            hts_log_error("Mismatching braces in \"%s\"", s);
3987
0
            *tid = -1;
3988
0
            return NULL;
3989
0
        }
3990
0
        s++;
3991
0
        s_len--;
3992
0
        if (close[1] == ':')
3993
0
            colon = close+1;
3994
0
        quoted = 1; // number of trailing characters to trim
3995
3996
        // Truncate to this item only, if appropriate.
3997
0
        if (flags & HTS_PARSE_LIST) {
3998
0
            comma = strchr(close, ',');
3999
0
            if (comma) {
4000
0
                s_len = comma-s;
4001
0
                s_end = comma+1;
4002
0
            }
4003
0
        }
4004
0
    } else {
4005
        // Truncate to this item only, if appropriate.
4006
0
        if (flags & HTS_PARSE_LIST) {
4007
0
            comma = strchr(s, ',');
4008
0
            if (comma) {
4009
0
                s_len = comma-s;
4010
0
                s_end = comma+1;
4011
0
            }
4012
0
        }
4013
4014
0
        colon = hts_memrchr(s, ':', s_len);
4015
0
    }
4016
4017
    // No colon is simplest case; just check and return.
4018
0
    if (colon == NULL) {
4019
0
        *beg = 0; *end = HTS_POS_MAX;
4020
0
        kputsn(s, s_len-quoted, &ks); // convert to nul terminated string
4021
0
        if (!ks.s) {
4022
0
            *tid = -2;
4023
0
            return NULL;
4024
0
        }
4025
4026
0
        *tid = getid(hdr, ks.s);
4027
0
        free(ks.s);
4028
4029
0
        return *tid >= 0 ? s_end : NULL;
4030
0
    }
4031
4032
    // Has a colon, but check whole name first.
4033
0
    if (!quoted) {
4034
0
        *beg = 0; *end = HTS_POS_MAX;
4035
0
        kputsn(s, s_len, &ks); // convert to nul terminated string
4036
0
        if (!ks.s) {
4037
0
            *tid = -2;
4038
0
            return NULL;
4039
0
        }
4040
0
        if ((*tid = getid(hdr, ks.s)) >= 0) {
4041
            // Entire name matches, but also check this isn't
4042
            // ambiguous.  eg we have ref chr1 and ref chr1:100-200
4043
            // both present.
4044
0
            ks.l = 0;
4045
0
            kputsn(s, colon-s, &ks); // convert to nul terminated string
4046
0
            if (!ks.s) {
4047
0
                *tid = -2;
4048
0
                return NULL;
4049
0
            }
4050
0
            if (getid(hdr, ks.s) >= 0) {
4051
0
                free(ks.s);
4052
0
                *tid = -1;
4053
0
                hts_log_error("Range is ambiguous. "
4054
0
                              "Use {%s} or {%.*s}%s instead",
4055
0
                              s, (int)(colon-s), s, colon);
4056
0
                return NULL;
4057
0
            }
4058
0
            free(ks.s);
4059
4060
0
            return s_end;
4061
0
        }
4062
0
        if (*tid < -1) // Failed to parse header
4063
0
            return NULL;
4064
0
    }
4065
4066
    // Quoted, or unquoted and whole string isn't a name.
4067
    // Check the pre-colon part is valid.
4068
0
    ks.l = 0;
4069
0
    kputsn(s, colon-s-quoted, &ks); // convert to nul terminated string
4070
0
    if (!ks.s) {
4071
0
        *tid = -2;
4072
0
        return NULL;
4073
0
    }
4074
0
    *tid = getid(hdr, ks.s);
4075
0
    free(ks.s);
4076
0
    if (*tid < 0)
4077
0
        return NULL;
4078
4079
    // Finally parse the post-colon coordinates
4080
0
    char *hyphen;
4081
0
    *beg = hts_parse_decimal(colon+1, &hyphen, flags) - 1;
4082
0
    if (*beg < 0) {
4083
0
        if (*beg != -1 && *hyphen == '-' && colon[1] != '\0') {
4084
            // User specified zero, but we're 1-based.
4085
0
            hts_log_error("Coordinates must be > 0");
4086
0
            return NULL;
4087
0
        }
4088
0
        if (isdigit_c(*hyphen) || *hyphen == '\0' || *hyphen == ',') {
4089
            // interpret chr:-100 as chr:1-100
4090
0
            *end = *beg==-1 ? HTS_POS_MAX : -(*beg+1);
4091
0
            *beg = 0;
4092
0
            return s_end;
4093
0
        } else if (*beg < -1) {
4094
0
            hts_log_error("Unexpected string \"%s\" after region", hyphen);
4095
0
            return NULL;
4096
0
        }
4097
0
    }
4098
4099
0
    if (*hyphen == '\0' || ((flags & HTS_PARSE_LIST) && *hyphen == ',')) {
4100
0
        *end = flags & HTS_PARSE_ONE_COORD ? *beg+1 : HTS_POS_MAX;
4101
0
    } else if (*hyphen == '-') {
4102
0
        *end = hts_parse_decimal(hyphen+1, &hyphen, flags);
4103
0
        if (*hyphen != '\0' && *hyphen != ',') {
4104
0
            hts_log_error("Unexpected string \"%s\" after region", hyphen);
4105
0
            return NULL;
4106
0
        }
4107
0
    } else {
4108
0
        hts_log_error("Unexpected string \"%s\" after region", hyphen);
4109
0
        return NULL;
4110
0
    }
4111
4112
0
    if (*end == 0)
4113
0
        *end = HTS_POS_MAX; // interpret chr:100- as chr:100-<end>
4114
4115
0
    if (*beg >= *end) return NULL;
4116
4117
0
    return s_end;
4118
0
}
4119
4120
// Next release we should mark this as deprecated?
4121
// Use hts_parse_region above instead.
4122
const char *hts_parse_reg64(const char *s, hts_pos_t *beg, hts_pos_t *end)
4123
0
{
4124
0
    char *hyphen;
4125
0
    const char *colon = strrchr(s, ':');
4126
0
    if (colon == NULL) {
4127
0
        *beg = 0; *end = HTS_POS_MAX;
4128
0
        return s + strlen(s);
4129
0
    }
4130
4131
0
    *beg = hts_parse_decimal(colon+1, &hyphen, HTS_PARSE_THOUSANDS_SEP) - 1;
4132
0
    if (*beg < 0) *beg = 0;
4133
4134
0
    if (*hyphen == '\0') *end = HTS_POS_MAX;
4135
0
    else if (*hyphen == '-') *end = hts_parse_decimal(hyphen+1, NULL, HTS_PARSE_THOUSANDS_SEP);
4136
0
    else return NULL;
4137
4138
0
    if (*beg >= *end) return NULL;
4139
0
    return colon;
4140
0
}
4141
4142
const char *hts_parse_reg(const char *s, int *beg, int *end)
4143
0
{
4144
0
    hts_pos_t beg64 = 0, end64 = 0;
4145
0
    const char *colon = hts_parse_reg64(s, &beg64, &end64);
4146
0
    if (beg64 > INT_MAX) {
4147
0
        hts_log_error("Position %"PRId64" too large", beg64);
4148
0
        return NULL;
4149
0
    }
4150
0
    if (end64 > INT_MAX) {
4151
0
        if (end64 == HTS_POS_MAX) {
4152
0
            end64 = INT_MAX;
4153
0
        } else {
4154
0
            hts_log_error("Position %"PRId64" too large", end64);
4155
0
            return NULL;
4156
0
        }
4157
0
    }
4158
0
    *beg = beg64;
4159
0
    *end = end64;
4160
0
    return colon;
4161
0
}
4162
4163
hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec)
4164
0
{
4165
0
    int tid;
4166
0
    hts_pos_t beg, end;
4167
4168
0
    if (strcmp(reg, ".") == 0)
4169
0
        return itr_query(idx, HTS_IDX_START, 0, 0, readrec);
4170
0
    else if (strcmp(reg, "*") == 0)
4171
0
        return itr_query(idx, HTS_IDX_NOCOOR, 0, 0, readrec);
4172
4173
0
    if (!hts_parse_region(reg, &tid, &beg, &end, getid, hdr, HTS_PARSE_THOUSANDS_SEP))
4174
0
        return NULL;
4175
4176
0
    return itr_query(idx, tid, beg, end, readrec);
4177
0
}
4178
4179
0
hts_itr_t *hts_itr_regions(const hts_idx_t *idx, hts_reglist_t *reglist, int count, hts_name2id_f getid, void *hdr, hts_itr_multi_query_func *itr_specific, hts_readrec_func *readrec, hts_seek_func *seek, hts_tell_func *tell) {
4180
4181
0
    int i;
4182
4183
0
    if (!reglist)
4184
0
        return NULL;
4185
4186
0
    hts_itr_t *itr = (hts_itr_t*)calloc(1, sizeof(hts_itr_t));
4187
0
    if (itr) {
4188
0
        itr->n_reg = count;
4189
0
        itr->readrec = readrec;
4190
0
        itr->seek = seek;
4191
0
        itr->tell = tell;
4192
0
        itr->reg_list = reglist;
4193
0
        itr->finished = 0;
4194
0
        itr->nocoor = 0;
4195
0
        itr->multi = 1;
4196
4197
0
        for (i = 0; i < itr->n_reg; i++) {
4198
0
            if (itr->reg_list[i].reg) {
4199
0
                if (!strcmp(itr->reg_list[i].reg, ".")) {
4200
0
                    itr->reg_list[i].tid = HTS_IDX_START;
4201
0
                    continue;
4202
0
                }
4203
4204
0
                if (!strcmp(itr->reg_list[i].reg, "*")) {
4205
0
                    itr->reg_list[i].tid = HTS_IDX_NOCOOR;
4206
0
                    continue;
4207
0
                }
4208
4209
0
                itr->reg_list[i].tid = getid(hdr, reglist[i].reg);
4210
0
                if (itr->reg_list[i].tid < 0) {
4211
0
                    if (itr->reg_list[i].tid < -1) {
4212
0
                        hts_log_error("Failed to parse header");
4213
0
                        hts_itr_destroy(itr);
4214
0
                        return NULL;
4215
0
                    } else {
4216
0
                        hts_log_warning("Region '%s' specifies an unknown reference name. Continue anyway", reglist[i].reg);
4217
0
                    }
4218
0
                }
4219
0
            }
4220
0
        }
4221
4222
0
        qsort(itr->reg_list, itr->n_reg, sizeof(hts_reglist_t), compare_regions);
4223
0
        if (itr_specific(idx, itr) != 0) {
4224
0
            hts_log_error("Failed to create the multi-region iterator!");
4225
0
            hts_itr_destroy(itr);
4226
0
            itr = NULL;
4227
0
        }
4228
0
    }
4229
4230
0
    return itr;
4231
0
}
4232
4233
int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data)
4234
0
{
4235
0
    int ret, tid;
4236
0
    hts_pos_t beg, end;
4237
0
    if (iter == NULL || iter->finished) return -1;
4238
0
    if (iter->read_rest) {
4239
0
        if (iter->curr_off) { // seek to the start
4240
0
            if (bgzf_seek(fp, iter->curr_off, SEEK_SET) < 0) {
4241
0
                hts_log_error("Failed to seek to offset %"PRIu64"%s%s",
4242
0
                              iter->curr_off,
4243
0
                              errno ? ": " : "", strerror(errno));
4244
0
                return -2;
4245
0
            }
4246
0
            iter->curr_off = 0; // only seek once
4247
0
        }
4248
0
        ret = iter->readrec(fp, data, r, &tid, &beg, &end);
4249
0
        if (ret < 0) iter->finished = 1;
4250
0
        iter->curr_tid = tid;
4251
0
        iter->curr_beg = beg;
4252
0
        iter->curr_end = end;
4253
0
        return ret;
4254
0
    }
4255
    // A NULL iter->off should always be accompanied by iter->finished.
4256
0
    assert(iter->off != NULL);
4257
0
    for (;;) {
4258
0
        if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
4259
0
            if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks
4260
0
            if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
4261
0
                if (bgzf_seek(fp, iter->off[iter->i+1].u, SEEK_SET) < 0) {
4262
0
                    hts_log_error("Failed to seek to offset %"PRIu64"%s%s",
4263
0
                                  iter->off[iter->i+1].u,
4264
0
                                  errno ? ": " : "", strerror(errno));
4265
0
                    return -2;
4266
0
                }
4267
0
                iter->curr_off = bgzf_tell(fp);
4268
0
            }
4269
0
            ++iter->i;
4270
0
        }
4271
0
        if ((ret = iter->readrec(fp, data, r, &tid, &beg, &end)) >= 0) {
4272
0
            iter->curr_off = bgzf_tell(fp);
4273
0
            if (tid != iter->tid || beg >= iter->end) { // no need to proceed
4274
0
                ret = -1; break;
4275
0
            } else if (end > iter->beg && iter->end > beg) {
4276
0
                iter->curr_tid = tid;
4277
0
                iter->curr_beg = beg;
4278
0
                iter->curr_end = end;
4279
0
                return ret;
4280
0
            }
4281
0
        } else break; // end of file or error
4282
0
    }
4283
0
    iter->finished = 1;
4284
0
    return ret;
4285
0
}
4286
4287
int hts_itr_multi_next(htsFile *fd, hts_itr_t *iter, void *r)
4288
0
{
4289
0
    void *fp;
4290
0
    int ret, tid, i, cr, ci;
4291
0
    hts_pos_t beg, end;
4292
0
    hts_reglist_t *found_reg;
4293
4294
0
    if (iter == NULL || iter->finished) return -1;
4295
4296
0
    if (iter->is_cram) {
4297
0
        fp = fd->fp.cram;
4298
0
    } else {
4299
0
        fp = fd->fp.bgzf;
4300
0
    }
4301
4302
0
    if (iter->read_rest) {
4303
0
        if (iter->curr_off) { // seek to the start
4304
0
            if (iter->seek(fp, iter->curr_off, SEEK_SET) < 0) {
4305
0
                hts_log_error("Seek at offset %" PRIu64 " failed.", iter->curr_off);
4306
0
                return -2;
4307
0
            }
4308
0
            iter->curr_off = 0; // only seek once
4309
0
        }
4310
4311
0
        ret = iter->readrec(fp, fd, r, &tid, &beg, &end);
4312
0
        if (ret < 0)
4313
0
            iter->finished = 1;
4314
4315
0
        iter->curr_tid = tid;
4316
0
        iter->curr_beg = beg;
4317
0
        iter->curr_end = end;
4318
4319
0
        return ret;
4320
0
    }
4321
    // A NULL iter->off should always be accompanied by iter->finished.
4322
0
    assert(iter->off != NULL || iter->nocoor != 0);
4323
4324
0
    int next_range = 0;
4325
0
    for (;;) {
4326
        // Note that due to the way bam indexing works, iter->off may contain
4327
        // file chunks that are not actually needed as they contain data
4328
        // beyond the end of the requested region.  These are filtered out
4329
        // by comparing the tid and index into hts_reglist_t::intervals
4330
        // (packed for reasons of convenience into iter->off[iter->i].max)
4331
        // associated with the file region with iter->curr_tid and
4332
        // iter->curr_intv.
4333
4334
0
        if (next_range
4335
0
            || iter->curr_off == 0
4336
0
            || iter->i >= iter->n_off
4337
0
            || iter->curr_off >= iter->off[iter->i].v
4338
0
            || (iter->off[iter->i].max >> 32 == iter->curr_tid
4339
0
                && (iter->off[iter->i].max & 0xffffffff) < iter->curr_intv)) {
4340
4341
            // Jump to the next chunk.  It may be necessary to skip more
4342
            // than one as the iter->off list can include overlapping entries.
4343
0
            do {
4344
0
                iter->i++;
4345
0
            } while (iter->i < iter->n_off
4346
0
                     && (iter->curr_off >= iter->off[iter->i].v
4347
0
                         || (iter->off[iter->i].max >> 32 == iter->curr_tid
4348
0
                             && (iter->off[iter->i].max & 0xffffffff) < iter->curr_intv)));
4349
4350
0
            if (iter->is_cram && iter->i < iter->n_off) {
4351
                // Ensure iter->curr_reg is correct.
4352
                //
4353
                // We need this for CRAM as we shortcut some of the later
4354
                // logic by getting an end-of-range and continuing to the
4355
                // next offset.
4356
                //
4357
                // We cannot do this for BAM (and fortunately do not need to
4358
                // either) because in BAM world a query to genomic positions
4359
                // GX and GY leading to a seek offsets PX and PY may have
4360
                // GX > GY and PX < PY.  (This is due to the R-tree and falling
4361
                // between intervals, bumping up to a higher bin.)
4362
                // CRAM strictly follows PX >= PY if GX >= GY, so this logic
4363
                // works.
4364
0
                int want_tid = iter->off[iter->i].max >> 32;
4365
0
                if (!(iter->curr_reg < iter->n_reg &&
4366
0
                      iter->reg_list[iter->curr_reg].tid == want_tid)) {
4367
0
                    int j;
4368
0
                    for (j = 0; j < iter->n_reg; j++)
4369
0
                        if (iter->reg_list[j].tid == want_tid)
4370
0
                            break;
4371
0
                    if (j == iter->n_reg)
4372
0
                        return -1;
4373
0
                    iter->curr_reg = j;
4374
0
                    iter->curr_tid = iter->reg_list[iter->curr_reg].tid;
4375
0
                };
4376
0
                iter->curr_intv = iter->off[iter->i].max & 0xffffffff;
4377
0
            }
4378
4379
0
            if (iter->i >= iter->n_off) { // no more chunks, except NOCOORs
4380
0
                if (iter->nocoor) {
4381
0
                    next_range = 0;
4382
0
                    if (iter->seek(fp, iter->nocoor_off, SEEK_SET) < 0) {
4383
0
                        hts_log_error("Seek at offset %" PRIu64 " failed.", iter->nocoor_off);
4384
0
                        return -2;
4385
0
                    }
4386
0
                    if (iter->is_cram) {
4387
0
                        cram_range r = { HTS_IDX_NOCOOR };
4388
0
                        cram_set_option(fp, CRAM_OPT_RANGE_NOSEEK, &r);
4389
0
                    }
4390
4391
                    // The first slice covering the unmapped reads might
4392
                    // contain a few mapped reads, so scroll
4393
                    // forward until finding the first unmapped read.
4394
0
                    do {
4395
0
                        ret = iter->readrec(fp, fd, r, &tid, &beg, &end);
4396
0
                    } while (tid >= 0 && ret >=0);
4397
4398
0
                    if (ret < 0)
4399
0
                        iter->finished = 1;
4400
0
                    else
4401
0
                        iter->read_rest = 1;
4402
4403
0
                    iter->curr_off = 0; // don't seek any more
4404
0
                    iter->curr_tid = tid;
4405
0
                    iter->curr_beg = beg;
4406
0
                    iter->curr_end = end;
4407
4408
0
                    return ret;
4409
0
                } else {
4410
0
                    ret = -1; break;
4411
0
                }
4412
0
            } else if (iter->i < iter->n_off) {
4413
                // New chunk may overlap the last one, so ensure we
4414
                // only seek forwards.
4415
0
                if (iter->curr_off < iter->off[iter->i].u || next_range) {
4416
0
                    iter->curr_off = iter->off[iter->i].u;
4417
4418
                    // CRAM has the capability of setting an end location.
4419
                    // This means multi-threaded decodes can stop once they
4420
                    // reach that point, rather than pointlessly decoding
4421
                    // more slices than we'll be using.
4422
                    //
4423
                    // We have to be careful here.  Whenever we set the cram
4424
                    // range we need a corresponding seek in order to ensure
4425
                    // we can safely decode at that offset.  We use next_range
4426
                    // var to ensure this is always true; this is set on
4427
                    // end-of-range condition. It's never modified for BAM.
4428
0
                    if (iter->is_cram) {
4429
                        // Next offset.[uv] tuple, but it's already been
4430
                        // included in our cram range, so don't seek and don't
4431
                        // reset range so we can efficiently multi-thread.
4432
0
                        if (next_range || iter->curr_off >= iter->end) {
4433
0
                            if (iter->seek(fp, iter->curr_off, SEEK_SET) < 0) {
4434
0
                                hts_log_error("Seek at offset %" PRIu64
4435
0
                                        " failed.", iter->curr_off);
4436
0
                                return -2;
4437
0
                            }
4438
4439
                            // Find the genomic range matching this interval.
4440
0
                            int j;
4441
0
                            hts_reglist_t *rl = &iter->reg_list[iter->curr_reg];
4442
0
                            cram_range r = {
4443
0
                                    rl->tid,
4444
0
                                    rl->intervals[iter->curr_intv].beg,
4445
0
                                    rl->intervals[iter->curr_intv].end
4446
0
                            };
4447
4448
                            // Expand it up to cover neighbouring intervals.
4449
                            // Note we can only have a single chromosome in a
4450
                            // range, so if we detect our blocks span chromosomes
4451
                            // or we have a multi-ref mode slice, we just use
4452
                            // HTS_IDX_START refid instead.  This doesn't actually
4453
                            // seek (due to CRAM_OPT_RANGE_NOSEEK) and is simply
4454
                            // and indicator of decoding with no end limit.
4455
                            //
4456
                            // That isn't as efficient as it could be, but it's
4457
                            // no poorer than before and it works.
4458
0
                            int tid = r.refid;
4459
0
                            int64_t end = r.end;
4460
0
                            int64_t v = iter->off[iter->i].v;
4461
0
                            j = iter->i+1;
4462
0
                            while (j < iter->n_off) {
4463
0
                                if (iter->off[j].u > v)
4464
0
                                    break;
4465
4466
0
                                uint64_t max = iter->off[j].max;
4467
0
                                if ((max>>32) != tid) {
4468
0
                                    tid = HTS_IDX_START; // => no range limit
4469
0
                                } else {
4470
0
                                    if (end < rl->intervals[max & 0xffffffff].end)
4471
0
                                        end = rl->intervals[max & 0xffffffff].end;
4472
0
                                }
4473
0
                                if (v < iter->off[j].v)
4474
0
                                    v = iter->off[j].v;
4475
0
                                j++;
4476
0
                            }
4477
0
                            r.refid = tid;
4478
0
                            r.end = end;
4479
4480
                            // Remember maximum 'v' here so we don't do
4481
                            // unnecessary subsequent seeks for the next
4482
                            // regions.  We can't change curr_off, but
4483
                            // beg/end are used only by single region iterator so
4484
                            // we cache it there to avoid changing the struct.
4485
0
                            iter->end = v;
4486
4487
0
                            cram_set_option(fp, CRAM_OPT_RANGE_NOSEEK, &r);
4488
0
                            next_range = 0;
4489
0
                        }
4490
0
                    } else { // Not CRAM
4491
0
                        if (iter->seek(fp, iter->curr_off, SEEK_SET) < 0) {
4492
0
                            hts_log_error("Seek at offset %" PRIu64 " failed.",
4493
0
                                          iter->curr_off);
4494
0
                            return -2;
4495
0
                        }
4496
0
                    }
4497
0
                }
4498
0
            }
4499
0
        }
4500
4501
0
        ret = iter->readrec(fp, fd, r, &tid, &beg, &end);
4502
0
        if (ret < 0) {
4503
0
            if (iter->is_cram && cram_eof(fp)) {
4504
                // Skip to end of range
4505
                //
4506
                // We should never be adjusting curr_off manually unless
4507
                // we also can guarantee we'll be doing a seek after to
4508
                // a new location.  Otherwise we'll be reading wrong offset
4509
                // for the next container.
4510
                //
4511
                // We ensure this by adjusting our CRAM_OPT_RANGE
4512
                // accordingly above, but to double check we also
4513
                // set the skipped_block flag to enforce a seek also.
4514
0
                iter->curr_off = iter->off[iter->i].v;
4515
0
                next_range = 1;
4516
4517
                // Next region
4518
0
                if (++iter->curr_intv >= iter->reg_list[iter->curr_reg].count){
4519
0
                    if (++iter->curr_reg >= iter->n_reg)
4520
0
                        break;
4521
0
                    iter->curr_intv = 0;
4522
0
                    iter->curr_tid = iter->reg_list[iter->curr_reg].tid;
4523
0
                }
4524
0
                continue;
4525
0
            } else {
4526
0
                break;
4527
0
            }
4528
0
        }
4529
4530
0
        iter->curr_off = iter->tell(fp);
4531
4532
0
        if (tid != iter->curr_tid) {
4533
0
            hts_reglist_t key;
4534
0
            key.tid = tid;
4535
4536
0
            found_reg = (hts_reglist_t *)bsearch(&key, iter->reg_list,
4537
0
                                                 iter->n_reg,
4538
0
                                                 sizeof(hts_reglist_t),
4539
0
                                                 compare_regions);
4540
0
            if (!found_reg)
4541
0
                continue;
4542
4543
0
            iter->curr_reg = (found_reg - iter->reg_list);
4544
0
            iter->curr_tid = tid;
4545
0
            iter->curr_intv = 0;
4546
0
        }
4547
4548
0
        cr = iter->curr_reg;
4549
0
        ci = iter->curr_intv;
4550
4551
0
        for (i = ci; i < iter->reg_list[cr].count; i++) {
4552
0
            if (end > iter->reg_list[cr].intervals[i].beg &&
4553
0
                iter->reg_list[cr].intervals[i].end > beg) {
4554
0
                iter->curr_beg = beg;
4555
0
                iter->curr_end = end;
4556
0
                iter->curr_intv = i;
4557
4558
0
                return ret;
4559
0
            }
4560
4561
            // Check if the read starts beyond intervals[i].end
4562
            // If so, the interval is finished so move on to the next.
4563
0
            if (beg > iter->reg_list[cr].intervals[i].end)
4564
0
                iter->curr_intv = i + 1;
4565
4566
            // No need to keep searching if the read ends before intervals[i].beg
4567
0
            if (end < iter->reg_list[cr].intervals[i].beg)
4568
0
                break;
4569
0
        }
4570
0
    }
4571
0
    iter->finished = 1;
4572
4573
0
    return ret;
4574
0
}
4575
4576
/**********************
4577
 *** Retrieve index ***
4578
 **********************/
4579
// Local_fn and local_len will return a sub-region of 'fn'.
4580
// Eg http://elsewhere/dir/foo.bam.bai?a=b may return
4581
// foo.bam.bai via local_fn and local_len.
4582
//
4583
// Returns -1 if index couldn't be opened.
4584
//         -2 on other errors
4585
static int idx_test_and_fetch(const char *fn, const char **local_fn, int *local_len, int download)
4586
0
{
4587
0
    hFILE *remote_hfp = NULL;
4588
0
    hFILE *local_fp = NULL;
4589
0
    int save_errno;
4590
0
    htsFormat fmt;
4591
0
    kstring_t s = KS_INITIALIZE;
4592
0
    kstring_t tmps = KS_INITIALIZE;
4593
4594
0
    if (hisremote(fn)) {
4595
0
        const int buf_size = 1 * 1024 * 1024;
4596
0
        int l;
4597
0
        const char *p, *e;
4598
        // Ignore ?# params: eg any file.fmt?param=val, except for S3 URLs
4599
0
        e = fn + ((strncmp(fn, "s3://", 5) && strncmp(fn, "s3+http://", 10) && strncmp(fn, "s3+https://", 11)) ? strcspn(fn, "?#") : strcspn(fn, "?"));
4600
        // Find the previous slash from there.
4601
0
        p = e;
4602
0
        while (p > fn && *p != '/') p--;
4603
0
        if (*p == '/') p++;
4604
4605
        // Attempt to open local file first
4606
0
        kputsn(p, e-p, &s);
4607
0
        if (access(s.s, R_OK) == 0)
4608
0
        {
4609
0
            free(s.s);
4610
0
            *local_fn = p;
4611
0
            *local_len = e-p;
4612
0
            return 0;
4613
0
        }
4614
4615
        // Attempt to open remote file. Stay quiet on failure, it is OK to fail when trying first .csi then .bai or .tbi index.
4616
0
        if ((remote_hfp = hopen(fn, "r")) == 0) {
4617
0
            hts_log_info("Failed to open index file '%s'", fn);
4618
0
            free(s.s);
4619
0
            return -1;
4620
0
        }
4621
0
        if (hts_detect_format2(remote_hfp, fn, &fmt)) {
4622
0
            hts_log_error("Failed to detect format of index file '%s'", fn);
4623
0
            goto fail;
4624
0
        }
4625
0
        if (fmt.category != index_file || (fmt.format != bai &&  fmt.format != csi && fmt.format != tbi
4626
0
                && fmt.format != crai && fmt.format != fai_format)) {
4627
0
            hts_log_error("Format of index file '%s' is not supported", fn);
4628
0
            goto fail;
4629
0
        }
4630
4631
0
        if (download) {
4632
0
            if ((local_fp = hts_open_tmpfile(s.s, "wx", &tmps)) == NULL) {
4633
0
                hts_log_error("Failed to create file %s in the working directory", p);
4634
0
                goto fail;
4635
0
            }
4636
0
            hts_log_info("Downloading file %s to local directory", fn);
4637
0
            uint8_t *buf = (uint8_t*)calloc(buf_size, 1);
4638
0
            if (!buf) {
4639
0
                hts_log_error("%s", strerror(errno));
4640
0
                goto fail;
4641
0
            }
4642
0
            while ((l = hread(remote_hfp, buf, buf_size)) > 0) {
4643
0
                if (hwrite(local_fp, buf, l) != l) {
4644
0
                    hts_log_error("Failed to write data to %s : %s",
4645
0
                            fn, strerror(errno));
4646
0
                    free(buf);
4647
0
                    goto fail;
4648
0
                }
4649
0
            }
4650
0
            free(buf);
4651
0
            if (l < 0) {
4652
0
                hts_log_error("Error reading \"%s\"", fn);
4653
0
                goto fail;
4654
0
            }
4655
0
            if (hclose(local_fp) < 0) {
4656
0
                hts_log_error("Error closing %s : %s", fn, strerror(errno));
4657
0
                local_fp = NULL;
4658
0
                goto fail;
4659
0
            }
4660
0
            local_fp = NULL;
4661
0
            if (rename(tmps.s, s.s) < 0) {
4662
0
                hts_log_error("Error renaming %s : %s", tmps.s, strerror(errno));
4663
0
                goto fail;
4664
0
            }
4665
0
            ks_clear(&tmps);
4666
4667
0
            *local_fn = p;
4668
0
            *local_len = e-p;
4669
0
        } else {
4670
0
            *local_fn = fn;
4671
0
            *local_len = e-fn;
4672
0
        }
4673
4674
0
        if (hclose(remote_hfp) != 0) {
4675
0
            hts_log_error("Failed to close remote file %s", fn);
4676
0
        }
4677
4678
0
        free(tmps.s);
4679
0
        free(s.s);
4680
0
        return 0;
4681
0
    } else {
4682
0
        hFILE *local_hfp;
4683
0
        if ((local_hfp = hopen(fn, "r")) == 0) return -1;
4684
0
        hclose_abruptly(local_hfp);
4685
0
        *local_fn = fn;
4686
0
        *local_len = strlen(fn);
4687
0
        return 0;
4688
0
    }
4689
4690
0
 fail:
4691
0
    save_errno = errno;
4692
0
    if (remote_hfp) hclose_abruptly(remote_hfp);
4693
0
    if (local_fp) hclose_abruptly(local_fp);
4694
0
    if (tmps.l > 0) unlink(tmps.s);
4695
0
    free(tmps.s);
4696
0
    free(s.s);
4697
0
    errno = save_errno;
4698
0
    return -2;
4699
0
}
4700
4701
/*
4702
 * Check the existence of a local index file using part of the alignment
4703
 * file name.
4704
 *
4705
 * For a filename fn of fn.fmt (eg fn.bam or fn.cram) the order of checks is
4706
 * fn.fmt.csi,  fn.csi,
4707
 * fn.fmt.bai,  fn.bai  - if fmt is HTS_FMT_BAI
4708
 * fn.fmt.tbi,  fn.tbi  - if fmt is HTS_FMT_TBI
4709
 * fn.fmt.crai, fn.crai - if fmt is HTS_FMT_CRAI
4710
 * fn.fmt.fai           - if fmt is HTS_FMT_FAI
4711
 *   also .gzi if fmt is ".gz"
4712
 *
4713
 * @param fn    - pointer to the file name
4714
 * @param fmt   - one of the HTS_FMT index formats
4715
 * @param fnidx - pointer to the index file name placeholder
4716
 * @return        1 for success, 0 for failure
4717
 */
4718
5.09k
int hts_idx_check_local(const char *fn, int fmt, char **fnidx) {
4719
5.09k
    int i, l_fn, l_ext;
4720
5.09k
    const char *fn_tmp = NULL;
4721
5.09k
    char *fnidx_tmp;
4722
5.09k
    const char *csi_ext = ".csi";
4723
5.09k
    const char *bai_ext = ".bai";
4724
5.09k
    const char *tbi_ext = ".tbi";
4725
5.09k
    const char *crai_ext = ".crai";
4726
5.09k
    const char *fai_ext = ".fai";
4727
5.09k
    const char *gzi_ext = ".gzi";
4728
4729
5.09k
    if (!fn)
4730
0
        return 0;
4731
4732
5.09k
    if (hisremote(fn)) {
4733
0
        for (i = strlen(fn) - 1; i >= 0; --i)
4734
0
            if (fn[i] == '/') {
4735
0
                fn_tmp = (char *)&fn[i+1];
4736
0
                break;
4737
0
            }
4738
5.09k
    } else {
4739
        // Borrowed from hopen_fd_fileuri()
4740
5.09k
        if (strncmp(fn, "file://localhost/", 17) == 0) fn_tmp = fn + 16;
4741
5.09k
        else if (strncmp(fn, "file:///", 8) == 0) fn_tmp = fn + 7;
4742
5.09k
        else fn_tmp = fn;
4743
#if defined(_WIN32) || defined(__MSYS__)
4744
        // For cases like C:/foo
4745
        if (fn_tmp[0] == '/' && fn_tmp[1] && fn_tmp[2] == ':' && fn_tmp[3] == '/')
4746
            fn_tmp++;
4747
#endif
4748
5.09k
    }
4749
4750
5.09k
    if (!fn_tmp) return 0;
4751
5.09k
    hts_log_info("Using alignment file '%s'", fn_tmp);
4752
5.09k
    l_fn = strlen(fn_tmp); l_ext = 5;
4753
5.09k
    fnidx_tmp = (char*)calloc(l_fn + l_ext + 1, 1);
4754
5.09k
    if (!fnidx_tmp) return 0;
4755
4756
5.09k
    struct stat sbuf;
4757
4758
    // Try alignment.bam.csi first
4759
5.09k
    strcpy(fnidx_tmp, fn_tmp); strcpy(fnidx_tmp + l_fn, csi_ext);
4760
5.09k
    if(stat(fnidx_tmp, &sbuf) == 0) {
4761
0
        *fnidx = fnidx_tmp;
4762
0
        return 1;
4763
5.09k
    } else { // Then try alignment.csi
4764
20.3k
        for (i = l_fn - 1; i > 0; --i)
4765
15.2k
            if (fnidx_tmp[i] == '.') {
4766
0
                strcpy(fnidx_tmp + i, csi_ext);
4767
0
                if(stat(fnidx_tmp, &sbuf) == 0) {
4768
0
                    *fnidx = fnidx_tmp;
4769
0
                    return 1;
4770
0
                }
4771
0
                break;
4772
0
            }
4773
5.09k
    }
4774
5.09k
    if (fmt == HTS_FMT_BAI) {
4775
        // Next, try alignment.bam.bai
4776
0
        strcpy(fnidx_tmp, fn_tmp); strcpy(fnidx_tmp + l_fn, bai_ext);
4777
0
        if(stat(fnidx_tmp, &sbuf) == 0) {
4778
0
            *fnidx = fnidx_tmp;
4779
0
            return 1;
4780
0
        } else { // And finally, try alignment.bai
4781
0
            for (i = l_fn - 1; i > 0; --i)
4782
0
                if (fnidx_tmp[i] == '.') {
4783
0
                    strcpy(fnidx_tmp + i, bai_ext);
4784
0
                    if(stat(fnidx_tmp, &sbuf) == 0) {
4785
0
                        *fnidx = fnidx_tmp;
4786
0
                        return 1;
4787
0
                    }
4788
0
                    break;
4789
0
                }
4790
0
        }
4791
5.09k
    } else if (fmt == HTS_FMT_TBI) { // Or .tbi
4792
5.09k
        strcpy(fnidx_tmp, fn_tmp); strcpy(fnidx_tmp + l_fn, tbi_ext);
4793
5.09k
        if(stat(fnidx_tmp, &sbuf) == 0) {
4794
0
            *fnidx = fnidx_tmp;
4795
0
            return 1;
4796
5.09k
        } else {
4797
20.3k
            for (i = l_fn - 1; i > 0; --i)
4798
15.2k
                if (fnidx_tmp[i] == '.') {
4799
0
                    strcpy(fnidx_tmp + i, tbi_ext);
4800
0
                    if(stat(fnidx_tmp, &sbuf) == 0) {
4801
0
                        *fnidx = fnidx_tmp;
4802
0
                        return 1;
4803
0
                    }
4804
0
                    break;
4805
0
                }
4806
5.09k
        }
4807
5.09k
    } else if (fmt == HTS_FMT_CRAI) { // Or .crai
4808
0
        strcpy(fnidx_tmp, fn_tmp); strcpy(fnidx_tmp + l_fn, crai_ext);
4809
0
        if(stat(fnidx_tmp, &sbuf) == 0) {
4810
0
            *fnidx = fnidx_tmp;
4811
0
            return 1;
4812
0
        } else {
4813
0
            for (i = l_fn - 1; i > 0; --i)
4814
0
                if (fnidx_tmp[i] == '.') {
4815
0
                    strcpy(fnidx_tmp + i, crai_ext);
4816
0
                    if(stat(fnidx_tmp, &sbuf) == 0) {
4817
0
                        *fnidx = fnidx_tmp;
4818
0
                        return 1;
4819
0
                    }
4820
0
                    break;
4821
0
                }
4822
0
        }
4823
0
    } else if (fmt == HTS_FMT_FAI) { // Or .fai
4824
        // Check .gzi if we have a .gz file
4825
0
        strcpy(fnidx_tmp, fn_tmp);
4826
0
        int gzi_ok = 1;
4827
0
        if ((l_fn > 3 && strcmp(fn_tmp+l_fn-3, ".gz") == 0) ||
4828
0
            (l_fn > 5 && strcmp(fn_tmp+l_fn-5, ".bgzf") == 0)) {
4829
0
            strcpy(fnidx_tmp + l_fn, gzi_ext);
4830
0
            gzi_ok = stat(fnidx_tmp, &sbuf)==0;
4831
0
        }
4832
4833
        // Now check for .fai.  Occurs second as we're returning this
4834
        // in *fnidx irrespective of whether we did gzi check.
4835
0
        strcpy(fnidx_tmp + l_fn, fai_ext);
4836
0
        *fnidx = fnidx_tmp;
4837
0
        if (stat(fnidx_tmp, &sbuf) == 0)
4838
0
            return gzi_ok;
4839
0
        else
4840
0
            return 0;
4841
0
    }
4842
4843
5.09k
    free(fnidx_tmp);
4844
5.09k
    return 0;
4845
5.09k
}
4846
4847
0
static char *idx_filename(const char *fn, const char *ext, int download) {
4848
0
    int ret, local_len;
4849
0
    char *fnidx;
4850
0
    const char *local_fn = NULL;
4851
0
    kstring_t buffer = KS_INITIALIZE;
4852
4853
    // First try : append `ext` to `fn`
4854
0
    if (!(fnidx = haddextension(&buffer, fn, 0, ext))) {
4855
0
        free(buffer.s);
4856
0
        return NULL;
4857
0
    }
4858
0
    if ((ret = idx_test_and_fetch(fnidx, &local_fn, &local_len, download)) == -1) {
4859
        // Second try : replace suffix of `fn` with `ext`
4860
0
        if (!(fnidx = haddextension(&buffer, fn, 1, ext))) {
4861
0
            free(buffer.s);
4862
0
            return NULL;
4863
0
        }
4864
0
        ret = idx_test_and_fetch(fnidx, &local_fn, &local_len, download);
4865
0
    }
4866
4867
0
    if (ret < 0) {
4868
0
        free(buffer.s);
4869
0
        return NULL;
4870
0
    }
4871
4872
0
    memmove(fnidx, local_fn, local_len);
4873
0
    fnidx[local_len] = 0;
4874
0
    return fnidx;
4875
0
}
4876
4877
char *hts_idx_getfn(const char *fn, const char *ext)
4878
0
{
4879
0
    return idx_filename(fn, ext, HTS_IDX_SAVE_REMOTE);
4880
0
}
4881
4882
char *hts_idx_locatefn(const char *fn, const char *ext)
4883
0
{
4884
0
    return idx_filename(fn, ext, 0);
4885
0
}
4886
4887
static hts_idx_t *idx_find_and_load(const char *fn, int fmt, int flags)
4888
5.09k
{
4889
5.09k
    char *fnidx = strstr(fn, HTS_IDX_DELIM);
4890
5.09k
    hts_idx_t *idx;
4891
4892
5.09k
    if ( fnidx ) {
4893
0
        char *fn2 = strdup(fn);
4894
0
        if (!fn2) {
4895
0
            hts_log_error("%s", strerror(errno));
4896
0
            return NULL;
4897
0
        }
4898
0
        fn2[fnidx - fn] = '\0';
4899
0
        fnidx += strlen(HTS_IDX_DELIM);
4900
0
        idx = hts_idx_load3(fn2, fnidx, fmt, flags);
4901
0
        free(fn2);
4902
0
        return idx;
4903
0
    }
4904
4905
5.09k
    if (hts_idx_check_local(fn, fmt, &fnidx) == 0 && hisremote(fn)) {
4906
0
        if (flags & HTS_IDX_SAVE_REMOTE) {
4907
0
            fnidx = idx_filename(fn, ".csi", HTS_IDX_SAVE_REMOTE);
4908
0
            if (!fnidx) {
4909
0
                switch (fmt) {
4910
0
                case HTS_FMT_BAI: fnidx = idx_filename(fn, ".bai", HTS_IDX_SAVE_REMOTE); break;
4911
0
                case HTS_FMT_TBI: fnidx = idx_filename(fn, ".tbi", HTS_IDX_SAVE_REMOTE); break;
4912
0
                default: break;
4913
0
                }
4914
0
            }
4915
0
        } else {
4916
0
            fnidx = idx_filename(fn, ".csi", 0);
4917
0
            if (!fnidx) {
4918
0
                switch (fmt) {
4919
0
                case HTS_FMT_BAI: fnidx = idx_filename(fn, ".bai", 0); break;
4920
0
                case HTS_FMT_TBI: fnidx = idx_filename(fn, ".tbi", 0); break;
4921
0
                default: break;
4922
0
                }
4923
0
            }
4924
0
        }
4925
0
    }
4926
5.09k
    if (!fnidx) {
4927
5.09k
        if (!(flags & HTS_IDX_SILENT_FAIL))
4928
0
            hts_log_error("Could not retrieve index file for '%s'", fn);
4929
5.09k
        return 0;
4930
5.09k
    }
4931
4932
0
    if (flags & HTS_IDX_SAVE_REMOTE)
4933
0
        idx = hts_idx_load3(fn, fnidx, fmt, flags);
4934
0
    else
4935
0
        idx = idx_read(fnidx);
4936
0
    free(fnidx);
4937
0
    return idx;
4938
5.09k
}
4939
4940
0
hts_idx_t *hts_idx_load(const char *fn, int fmt) {
4941
0
    return idx_find_and_load(fn, fmt, 1);
4942
0
}
4943
4944
hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx)
4945
0
{
4946
0
    return hts_idx_load3(fn, fnidx, 0, 0);
4947
0
}
4948
4949
hts_idx_t *hts_idx_load3(const char *fn, const char *fnidx, int fmt, int flags)
4950
5.09k
{
4951
5.09k
    const char *local_fn = NULL;
4952
5.09k
    char *local_fnidx = NULL;
4953
5.09k
    int local_len;
4954
5.09k
    if (!fnidx)
4955
5.09k
        return idx_find_and_load(fn, fmt, flags);
4956
4957
    // Check that the index file is up to date, the main file might have changed
4958
0
    struct stat stat_idx,stat_main;
4959
0
    int remote_fn = hisremote(fn), remote_fnidx = hisremote(fnidx);
4960
0
    if ( !remote_fn && !remote_fnidx
4961
0
         && !stat(fn, &stat_main) && !stat(fnidx, &stat_idx) )
4962
0
    {
4963
0
        if ( stat_idx.st_mtime < stat_main.st_mtime )
4964
0
            hts_log_warning("The index file is older than the data file: %s", fnidx);
4965
0
    }
4966
4967
0
    if (remote_fnidx && (flags & HTS_IDX_SAVE_REMOTE))
4968
0
    {
4969
0
        int ret = idx_test_and_fetch(fnidx, &local_fn, &local_len, 1);
4970
0
        if (ret == 0) {
4971
0
            local_fnidx = strdup(local_fn);
4972
0
            if (local_fnidx) {
4973
0
                local_fnidx[local_len] = '\0';
4974
0
                fnidx = local_fnidx;
4975
0
            }
4976
0
        }
4977
0
    }
4978
4979
0
    hts_idx_t *idx = idx_read(fnidx);
4980
0
    if (!idx && !(flags & HTS_IDX_SILENT_FAIL))
4981
0
        hts_log_error("Could not load local index file '%s'%s%s", fnidx,
4982
0
                      errno ? " : " : "", errno ? strerror(errno) : "");
4983
4984
4985
0
    free(local_fnidx);
4986
4987
0
    return idx;
4988
5.09k
}
4989
4990
4991
4992
/**********************
4993
 ***     Memory     ***
4994
 **********************/
4995
4996
/* For use with hts_expand macros *only* */
4997
HTSLIB_EXPORT
4998
size_t hts_realloc_or_die(size_t n, size_t m, size_t m_sz, size_t size,
4999
4.87k
                          int clear, void **ptr, const char *func) {
5000
    /* If new_m and size are both below this limit, multiplying them
5001
       together can't overflow */
5002
4.87k
    const size_t safe = (size_t) 1 << (sizeof(size_t) * 4);
5003
4.87k
    void *new_ptr;
5004
4.87k
    size_t bytes, new_m;
5005
5006
4.87k
    new_m = n;
5007
4.87k
    kroundup_size_t(new_m);
5008
5009
4.87k
    bytes = size * new_m;
5010
5011
    /* Check for overflow.  Both ensure that new_m will fit in m (we make the
5012
       pessimistic assumption that m is signed), and that bytes has not
5013
       wrapped around. */
5014
4.87k
    if (new_m > (((size_t) 1 << (m_sz * 8 - 1)) - 1)
5015
4.87k
        || ((size > safe || new_m > safe)
5016
0
            && bytes / new_m != size)) {
5017
0
        errno = ENOMEM;
5018
0
        goto die;
5019
0
    }
5020
5021
4.87k
    new_ptr = realloc(*ptr, bytes);
5022
4.87k
    if (new_ptr == NULL) goto die;
5023
5024
4.87k
    if (clear) {
5025
0
        if (new_m > m) {
5026
0
            memset((char *) new_ptr + m * size, 0, (new_m - m) * size);
5027
0
        }
5028
0
    }
5029
5030
4.87k
    *ptr = new_ptr;
5031
5032
4.87k
    return new_m;
5033
5034
0
 die:
5035
0
    hts_log_error("%s", strerror(errno));
5036
0
    exit(1);
5037
4.87k
}
5038
5039
/*
5040
 * Companion to hts_resize() macro that does the actual allocation.
5041
 *
5042
 * Somewhat complicated as hts_resize() needs to write the new allocated
5043
 * size back into *size_in_out, and the value pointed to may either be
5044
 * int32_t, uint32_t or size_t depending on which array is being resized.
5045
 * This is solved by making `size_in_out` a void pointer, getting the macro
5046
 * to pass in the size of the item pointed to (in `size_sz`) and then using
5047
 * an appropriate cast (based on the value of size_sz).  The function
5048
 * ensures that the maximum size will be storable in a signed type of
5049
 * the given size so storing to an int32_t should work correctly.
5050
 *
5051
 * Assumes that sizeof(uint32_t) and sizeof(int32_t) is 4,
5052
 * sizeof(uint64_t) and sizeof(int64_t) is 8 and sizeof(size_t) is
5053
 * either 4 or 8.  It also assumes casting from unsigned to signed will
5054
 * work as long as the top bit isn't set.
5055
 */
5056
5057
int hts_resize_array_(size_t item_size, size_t num, size_t size_sz,
5058
                      void *size_in_out, void **ptr_in_out, int flags,
5059
27.4k
                      const char *func) {
5060
    /* If new_size and item_size are both below this limit, multiplying them
5061
       together can't overflow */
5062
27.4k
    const size_t safe = (size_t) 1 << (sizeof(size_t) * 4);
5063
27.4k
    void *new_ptr;
5064
27.4k
    size_t bytes, new_size;
5065
5066
27.4k
    new_size = num;
5067
27.4k
    kroundup_size_t(new_size);
5068
27.4k
    bytes = item_size * new_size;
5069
5070
    /* Check for overflow.  Both ensure that alloc will fit in alloc_in_out (we
5071
       make the pessimistic assumption that *alloc_in_out is signed), and that
5072
       bytes has not wrapped around. */
5073
5074
27.4k
    if ((new_size > (((size_t) 1 << (size_sz * 8 - 1)) - 1))
5075
27.4k
        || (((item_size > safe) || (new_size > safe))
5076
0
            && bytes / new_size != item_size)) {
5077
0
        hts_log(HTS_LOG_ERROR, func, "Memory allocation too large");
5078
0
        errno = ENOMEM;
5079
0
        return -1;
5080
0
    }
5081
5082
27.4k
    new_ptr = realloc(*ptr_in_out, bytes);
5083
27.4k
    if (new_ptr == NULL) {
5084
0
        int save_errno = errno;
5085
0
        hts_log(HTS_LOG_ERROR, func, "%s", strerror(errno));
5086
0
        errno = save_errno;
5087
0
        return -1;
5088
0
    }
5089
5090
27.4k
    if (flags & HTS_RESIZE_CLEAR) {
5091
27.4k
        size_t old_size;
5092
27.4k
        switch (size_sz) {
5093
27.4k
        case 4: old_size = *((uint32_t *) size_in_out); break;
5094
0
        case 8: old_size = *((uint64_t *) size_in_out); break;
5095
0
        default: abort();
5096
27.4k
        }
5097
27.4k
        if (new_size > old_size) {
5098
27.4k
            memset((char *) new_ptr + old_size * item_size, 0,
5099
27.4k
                   (new_size - old_size) * item_size);
5100
27.4k
        }
5101
27.4k
    }
5102
5103
27.4k
    switch (size_sz) {
5104
27.4k
    case 4: *((uint32_t *) size_in_out) = new_size; break;
5105
0
    case 8: *((uint64_t *) size_in_out) = new_size; break;
5106
0
    default: abort();
5107
27.4k
    }
5108
5109
27.4k
    *ptr_in_out = new_ptr;
5110
27.4k
    return 0;
5111
27.4k
}
5112
5113
void hts_lib_shutdown(void)
5114
0
{
5115
0
    hfile_shutdown(1);
5116
0
}
5117
5118
0
void hts_free(void *ptr) {
5119
0
    free(ptr);
5120
0
}
5121
5122
void hts_set_log_level(enum htsLogLevel level)
5123
0
{
5124
0
    hts_verbose = level;
5125
0
}
5126
5127
enum htsLogLevel hts_get_log_level(void)
5128
0
{
5129
0
    return hts_verbose;
5130
0
}
5131
5132
static char get_severity_tag(enum htsLogLevel severity)
5133
1.44M
{
5134
1.44M
    switch (severity) {
5135
47.2k
    case HTS_LOG_ERROR:
5136
47.2k
        return 'E';
5137
1.40M
    case HTS_LOG_WARNING:
5138
1.40M
        return 'W';
5139
0
    case HTS_LOG_INFO:
5140
0
        return 'I';
5141
0
    case HTS_LOG_DEBUG:
5142
0
        return 'D';
5143
0
    case HTS_LOG_TRACE:
5144
0
        return 'T';
5145
0
    default:
5146
0
        break;
5147
1.44M
    }
5148
5149
0
    return '*';
5150
1.44M
}
5151
5152
void hts_log(enum htsLogLevel severity, const char *context, const char *format, ...)
5153
1.83M
{
5154
1.83M
    int save_errno = errno;
5155
1.83M
    if (severity <= hts_verbose) {
5156
1.44M
        va_list argptr;
5157
5158
1.44M
        fprintf(stderr, "[%c::%s] ", get_severity_tag(severity), context);
5159
5160
1.44M
        va_start(argptr, format);
5161
1.44M
        vfprintf(stderr, format, argptr);
5162
1.44M
        va_end(argptr);
5163
5164
1.44M
        fprintf(stderr, "\n");
5165
1.44M
    }
5166
    errno = save_errno;
5167
1.83M
}