Coverage Report

Created: 2026-05-30 06:09

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/hts.c
Line
Count
Source
1
/*  hts.c -- format-neutral I/O, indexing, and iterator API functions.
2
3
    Copyright (C) 2008, 2009, 2012-2025 Genome Research Ltd.
4
    Copyright (C) 2012, 2013 Broad Institute.
5
6
    Author: Heng Li <lh3@sanger.ac.uk>
7
8
Permission is hereby granted, free of charge, to any person obtaining a copy
9
of this software and associated documentation files (the "Software"), to deal
10
in the Software without restriction, including without limitation the rights
11
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
copies of the Software, and to permit persons to whom the Software is
13
furnished to do so, subject to the following conditions:
14
15
The above copyright notice and this permission notice shall be included in
16
all copies or substantial portions of the Software.
17
18
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24
DEALINGS IN THE SOFTWARE.  */
25
26
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
27
#include <config.h>
28
29
#include <zlib.h>
30
#include <stdio.h>
31
#include <string.h>
32
#include <strings.h>
33
#include <stdlib.h>
34
#include <unistd.h>
35
#include <inttypes.h>
36
#include <limits.h>
37
#include <stdint.h>
38
#include <fcntl.h>
39
#include <errno.h>
40
#include <time.h>
41
#include <sys/stat.h>
42
#include <assert.h>
43
44
#ifdef HAVE_LIBLZMA
45
#ifdef HAVE_LZMA_H
46
#include <lzma.h>
47
#else
48
#include "os/lzma_stub.h"
49
#endif
50
#endif
51
52
#include "htslib/hts.h"
53
#include "htslib/bgzf.h"
54
#include "cram/cram.h"
55
#include "htslib/hfile.h"
56
#include "htslib/hts_endian.h"
57
#include "version.h"
58
#include "config_vars.h"
59
#include "hts_internal.h"
60
#include "hfile_internal.h"
61
#include "sam_internal.h"
62
#include "htslib/hts_alloc.h"
63
#include "htslib/hts_expr.h"
64
#include "htslib/hts_os.h" // drand48
65
66
#include "htslib/khash.h"
67
#include "htslib/kseq.h"
68
#include "htslib/ksort.h"
69
#include "htslib/tbx.h"
70
#if defined(HAVE_EXTERNAL_LIBHTSCODECS)
71
#include <htscodecs/htscodecs.h>
72
#else
73
#include "htscodecs/htscodecs/htscodecs.h"
74
#endif
75
76
#ifndef EFTYPE
77
72
#define EFTYPE ENOEXEC
78
#endif
79
80
1.95k
KHASH_INIT2(s2i,, kh_cstr_t, int64_t, 1, kh_str_hash_func, kh_str_hash_equal)
Unexecuted instantiation: kh_grow_to_fit_s2i
Unexecuted instantiation: kh_stats_s2i
81
82
HTSLIB_EXPORT
83
int hts_verbose = HTS_LOG_WARNING;
84
85
const char *hts_version(void)
86
2
{
87
2
    return HTS_VERSION_TEXT;
88
2
}
89
90
0
unsigned int hts_features(void) {
91
0
    unsigned int feat = HTS_FEATURE_HTSCODECS; // Always present
92
93
0
#ifdef PACKAGE_URL
94
0
    feat |= HTS_FEATURE_CONFIGURE;
95
0
#endif
96
97
#ifdef ENABLE_PLUGINS
98
    feat |= HTS_FEATURE_PLUGINS;
99
#endif
100
101
0
#ifdef HAVE_LIBCURL
102
0
    feat |= HTS_FEATURE_LIBCURL;
103
0
#endif
104
105
0
#ifdef ENABLE_S3
106
0
    feat |= HTS_FEATURE_S3;
107
0
#endif
108
109
0
#ifdef ENABLE_GCS
110
0
    feat |= HTS_FEATURE_GCS;
111
0
#endif
112
113
#ifdef HAVE_LIBDEFLATE
114
    feat |= HTS_FEATURE_LIBDEFLATE;
115
#endif
116
117
0
#ifdef HAVE_LIBLZMA
118
0
    feat |= HTS_FEATURE_LZMA;
119
0
#endif
120
121
0
#ifdef HAVE_LIBBZ2
122
0
    feat |= HTS_FEATURE_BZIP2;
123
0
#endif
124
125
0
    return feat;
126
0
}
127
128
0
const char *hts_test_feature(unsigned int id) {
129
0
    unsigned int feat = hts_features();
130
131
0
    switch (id) {
132
0
    case HTS_FEATURE_CONFIGURE:
133
0
        return feat & HTS_FEATURE_CONFIGURE ? "yes" : NULL;
134
0
    case HTS_FEATURE_PLUGINS:
135
0
        return feat & HTS_FEATURE_PLUGINS ? "yes" : NULL;
136
0
    case HTS_FEATURE_LIBCURL:
137
0
        return feat & HTS_FEATURE_LIBCURL ? "yes" : NULL;
138
0
    case HTS_FEATURE_S3:
139
0
        return feat & HTS_FEATURE_S3 ? "yes" : NULL;
140
0
    case HTS_FEATURE_GCS:
141
0
        return feat & HTS_FEATURE_GCS ? "yes" : NULL;
142
0
    case HTS_FEATURE_LIBDEFLATE:
143
0
        return feat & HTS_FEATURE_LIBDEFLATE ? "yes" : NULL;
144
0
    case HTS_FEATURE_BZIP2:
145
0
        return feat & HTS_FEATURE_BZIP2 ? "yes" : NULL;
146
0
    case HTS_FEATURE_LZMA:
147
0
        return feat & HTS_FEATURE_LZMA ? "yes" : NULL;
148
149
0
    case HTS_FEATURE_HTSCODECS:
150
0
        return htscodecs_version();
151
152
0
    case HTS_FEATURE_CC:
153
0
        return HTS_CC;
154
0
    case HTS_FEATURE_CFLAGS:
155
0
        return HTS_CFLAGS;
156
0
    case HTS_FEATURE_LDFLAGS:
157
0
        return HTS_LDFLAGS;
158
0
    case HTS_FEATURE_CPPFLAGS:
159
0
        return HTS_CPPFLAGS;
160
161
0
    default:
162
0
        fprintf(stderr, "Unknown feature code: %u\n", id);
163
0
    }
164
165
0
    return NULL;
166
0
}
167
168
// Note this implementation also means we can just "strings" the library
169
// to find the configuration parameters.
170
0
const char *hts_feature_string(void) {
171
0
    static char config[1200];
172
0
    const char *flags=
173
174
0
#ifdef PACKAGE_URL
175
0
    "build=configure "
176
#else
177
    "build=Makefile "
178
#endif
179
180
0
#ifdef HAVE_LIBCURL
181
0
    "libcurl=yes "
182
#else
183
    "libcurl=no "
184
#endif
185
186
0
#ifdef ENABLE_S3
187
0
    "S3=yes "
188
#else
189
    "S3=no "
190
#endif
191
192
0
#ifdef ENABLE_GCS
193
0
    "GCS=yes "
194
#else
195
    "GCS=no "
196
#endif
197
198
#ifdef HAVE_LIBDEFLATE
199
    "libdeflate=yes "
200
#else
201
0
    "libdeflate=no "
202
0
#endif
203
204
0
#ifdef HAVE_LIBLZMA
205
0
    "lzma=yes "
206
#else
207
    "lzma=no "
208
#endif
209
210
0
#ifdef HAVE_LIBBZ2
211
0
    "bzip2=yes "
212
#else
213
    "bzip2=no "
214
#endif
215
216
// "plugins=" must stay at the end as it is followed by "plugin-path="
217
#ifdef ENABLE_PLUGINS
218
    "plugins=yes";
219
#else
220
0
    "plugins=no";
221
0
#endif
222
223
#ifdef ENABLE_PLUGINS
224
    snprintf(config, sizeof(config),
225
             "%s plugin-path=%.1000s htscodecs=%.40s",
226
             flags, hts_plugin_path(), htscodecs_version());
227
#else
228
0
    snprintf(config, sizeof(config),
229
0
             "%s htscodecs=%.40s",
230
0
             flags, htscodecs_version());
231
0
#endif
232
0
    return config;
233
0
}
234
235
236
// Converts ASCII to BAM nibble encoding.
237
// Note 0123 is treated as ACGT (ABI colourspace encoding) and
238
// U is treated as T.
239
HTSLIB_EXPORT
240
const unsigned char seq_nt16_table[256] = {
241
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
242
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
243
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
244
     1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15,
245
    15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
246
    15,15, 5, 6,  8, 8, 7, 9, 15,10,15,15, 15,15,15,15,
247
    15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
248
    15,15, 5, 6,  8, 8, 7, 9, 15,10,15,15, 15,15,15,15,
249
250
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
251
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
252
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
253
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
254
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
255
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
256
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
257
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
258
};
259
260
HTSLIB_EXPORT
261
const char seq_nt16_str[] = "=ACMGRSVTWYHKDBN";
262
263
HTSLIB_EXPORT
264
const int seq_nt16_int[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
265
266
/**********************
267
 *** Basic file I/O ***
268
 **********************/
269
270
static enum htsFormatCategory format_category(enum htsExactFormat fmt)
271
33.0k
{
272
33.0k
    switch (fmt) {
273
0
    case bam:
274
0
    case sam:
275
8.88k
    case cram:
276
8.88k
    case fastq_format:
277
8.88k
    case fasta_format:
278
8.88k
        return sequence_data;
279
280
0
    case vcf:
281
0
    case bcf:
282
0
        return variant_data;
283
284
0
    case bai:
285
0
    case crai:
286
0
    case csi:
287
0
    case fai_format:
288
0
    case fqi_format:
289
0
    case gzi:
290
0
    case tbi:
291
0
        return index_file;
292
293
0
    case bed:
294
0
    case d4_format:
295
0
        return region_list;
296
297
0
    case htsget:
298
0
    case hts_crypt4gh_format:
299
0
        return unknown_category;
300
301
0
    case unknown_format:
302
12.1k
    case binary_format:
303
24.2k
    case text_format:
304
24.2k
    case empty_format:
305
24.2k
    case format_maximum:
306
24.2k
        break;
307
33.0k
    }
308
309
24.2k
    return unknown_category;
310
33.0k
}
311
312
// Decompress several hundred bytes by peeking at the file, which must be
313
// positioned at the start of a GZIP block.
314
static ssize_t
315
decompress_peek_gz(hFILE *fp, unsigned char *dest, size_t destsize)
316
2.20k
{
317
2.20k
    unsigned char buffer[2048];
318
2.20k
    z_stream zs;
319
2.20k
    ssize_t npeek = hpeek(fp, buffer, sizeof buffer);
320
321
2.20k
    if (npeek < 0) return -1;
322
323
2.20k
    zs.zalloc = NULL;
324
2.20k
    zs.zfree = NULL;
325
2.20k
    zs.next_in = buffer;
326
2.20k
    zs.avail_in = npeek;
327
2.20k
    zs.next_out = dest;
328
2.20k
    zs.avail_out = destsize;
329
2.20k
    if (inflateInit2(&zs, 31) != Z_OK) return -1;
330
331
2.20k
    int ret;
332
2.20k
    const unsigned char *last_in = buffer;
333
4.35k
    while (zs.avail_out > 0) {
334
2.46k
        ret = inflate(&zs, Z_SYNC_FLUSH);
335
2.46k
        if (ret == Z_STREAM_END) {
336
108
            if (last_in == zs.next_in)
337
0
                break; // Paranoia to avoid potential looping. Shouldn't happen
338
108
            else
339
108
                last_in = zs.next_in;
340
108
            inflateReset(&zs);
341
2.35k
        } else if (ret != Z_OK) {
342
            // eg Z_BUF_ERROR due to avail_in/out becoming zero
343
308
            break;
344
308
        }
345
2.46k
    }
346
347
    // NB: zs.total_out is changed by inflateReset, so use pointer diff instead
348
2.20k
    destsize = zs.next_out - dest;
349
2.20k
    inflateEnd(&zs);
350
351
2.20k
    return destsize;
352
2.20k
}
353
354
#ifdef HAVE_LIBLZMA
355
// Similarly decompress a portion by peeking at the file, which must be
356
// positioned at the start of the file.
357
static ssize_t
358
decompress_peek_xz(hFILE *fp, unsigned char *dest, size_t destsize)
359
2
{
360
2
    unsigned char buffer[2048];
361
2
    ssize_t npeek = hpeek(fp, buffer, sizeof buffer);
362
2
    if (npeek < 0) return -1;
363
364
2
    lzma_stream ls = LZMA_STREAM_INIT;
365
2
    if (lzma_stream_decoder(&ls, lzma_easy_decoder_memusage(9), 0) != LZMA_OK)
366
0
        return -1;
367
368
2
    ls.next_in = buffer;
369
2
    ls.avail_in = npeek;
370
2
    ls.next_out = dest;
371
2
    ls.avail_out = destsize;
372
373
2
    int r = lzma_code(&ls, LZMA_RUN);
374
2
    if (! (r == LZMA_OK || r == LZMA_STREAM_END)) {
375
1
        lzma_end(&ls);
376
1
        return -1;
377
1
    }
378
379
1
    destsize = ls.total_out;
380
1
    lzma_end(&ls);
381
382
1
    return destsize;
383
2
}
384
#endif
385
386
// Parse "x.y" text, taking care because the string is not NUL-terminated
387
// and filling in major/minor only when the digits are followed by a delimiter,
388
// so we don't misread "1.10" as "1.1" due to reaching the end of the buffer.
389
static void
390
parse_version(htsFormat *fmt, const unsigned char *u, const unsigned char *ulim)
391
301
{
392
301
    const char *s    = (const char *) u;
393
301
    const char *slim = (const char *) ulim;
394
301
    short v;
395
396
301
    fmt->version.major = fmt->version.minor = -1;
397
398
1.36k
    for (v = 0; s < slim && isdigit_c(*s); s++)
399
1.06k
        v = 10 * v + *s - '0';
400
401
301
    if (s < slim) {
402
291
        fmt->version.major = v;
403
291
        if (*s == '.') {
404
62
            s++;
405
407
            for (v = 0; s < slim && isdigit_c(*s); s++)
406
345
                v = 10 * v + *s - '0';
407
62
            if (s < slim)
408
32
                fmt->version.minor = v;
409
62
        }
410
229
        else
411
229
            fmt->version.minor = 0;
412
291
    }
413
301
}
414
415
static int
416
cmp_nonblank(const char *key, const unsigned char *u, const unsigned char *ulim)
417
6.75k
{
418
6.75k
    const unsigned char *ukey = (const unsigned char *) key;
419
420
7.09k
    while (*ukey)
421
7.09k
        if (u >= ulim) return +1;
422
7.07k
        else if (isspace_c(*u)) u++;
423
6.76k
        else if (*u != *ukey) return (*ukey < *u)? -1 : +1;
424
31
        else u++, ukey++;
425
426
0
    return 0;
427
6.75k
}
428
429
static int is_text_only(const unsigned char *u, const unsigned char *ulim)
430
6.71k
{
431
167k
    for (; u < ulim; u++)
432
160k
        if (! (*u >= ' ' || *u == '\t' || *u == '\r' || *u == '\n'))
433
82
            return 0;
434
435
6.63k
    return 1;
436
6.71k
}
437
438
static inline int
439
alternate_zeros(const unsigned char *u, const unsigned char *ulim)
440
0
{
441
0
    for (; u < ulim; u += 2)
442
0
        if (*u != '\0') return 0;
443
0
    return 1;
444
0
}
445
446
static int is_utf16_text(const unsigned char *u, const unsigned char *ulim)
447
0
{
448
0
    if (ulim - u >= 6 &&
449
0
        ((u[0] == 0xfe && u[1] == 0xff && alternate_zeros(u+2, ulim)) ||
450
0
         (u[0] == 0xff && u[1] == 0xfe && alternate_zeros(u+3, ulim))))
451
0
        return 2;
452
0
    else if (ulim - u >= 8 &&
453
0
             (alternate_zeros(u, ulim) || alternate_zeros(u+1, ulim)))
454
0
        return 1;
455
0
    else
456
0
        return 0;
457
0
}
458
459
static int is_fastaq(const unsigned char *u, const unsigned char *ulim)
460
6.50k
{
461
6.50k
    const unsigned char *eol = memchr(u, '\n', ulim - u);
462
463
    // Check that the first line is entirely textual
464
6.50k
    if (! is_text_only(u, eol? eol : ulim)) return 0;
465
466
    // If the first line is very long, consider the file to indeed be FASTA/Q
467
6.48k
    if (eol == NULL) return 1;
468
469
5.24k
    u = eol+1; // Now points to the first character of the second line
470
471
    // Scan over all base-encoding letters (including 'N' but not SEQ's '=')
472
6.86k
    while (u < ulim && (seq_nt16_table[*u] != 15 || toupper(*u) == 'N')) {
473
1.62k
        if (*u == '=') return 0;
474
1.62k
        u++;
475
1.62k
    }
476
477
5.24k
    return (u == ulim || *u == '\r' || *u == '\n')? 1 : 0;
478
5.24k
}
479
480
// Parse tab-delimited text, filling in a string of column types and returning
481
// the number of columns spotted (within [u,ulim), and up to column_len) or -1
482
// if non-printable characters were seen.  Column types:
483
//     i: integer, s: strand sign, C: CIGAR, O: SAM optional field, Z: anything
484
static int
485
parse_tabbed_text(char *columns, int column_len,
486
                  const unsigned char *u, const unsigned char *ulim,
487
                  int *complete)
488
286
{
489
286
    const char *str  = (const char *) u;
490
286
    const char *slim = (const char *) ulim;
491
286
    const char *s;
492
286
    int ncolumns = 0;
493
494
286
    enum { digit = 1, leading_sign = 2, cigar_operator = 4, other = 8 };
495
286
    unsigned seen = 0;
496
286
    *complete = 0;
497
498
6.67k
    for (s = str; s < slim; s++)
499
6.53k
        if (*s >= ' ') {
500
5.20k
            if (isdigit_c(*s))
501
1.54k
                seen |= digit;
502
3.66k
            else if ((*s == '+' || *s == '-') && s == str)
503
120
                seen |= leading_sign;
504
3.54k
            else if (strchr(BAM_CIGAR_STR, *s) && s > str && isdigit_c(s[-1]))
505
162
                seen |= cigar_operator;
506
3.38k
            else
507
3.38k
                seen |= other;
508
5.20k
        }
509
1.32k
        else if (*s == '\t' || *s == '\r' || *s == '\n') {
510
1.25k
            size_t len = s - str;
511
1.25k
            char type;
512
513
1.25k
            if (seen == digit || seen == (leading_sign|digit)) type = 'i';
514
724
            else if (seen == (digit|cigar_operator)) type = 'C';
515
679
            else if (len == 1)
516
277
                switch (str[0]) {
517
140
                case '*': type = 'C'; break;
518
58
                case '+': case '-': case '.': type = 's'; break;
519
79
                default: type = 'Z'; break;
520
277
                }
521
402
            else if (len >= 5 && str[2] == ':' && str[4] == ':') type = 'O';
522
372
            else type = 'Z';
523
524
1.25k
            columns[ncolumns++] = type;
525
1.25k
            if (*s != '\t' || ncolumns >= column_len - 1) {
526
67
                *complete = 1; // finished the line or more columns than needed
527
67
                break;
528
67
            }
529
530
1.18k
            str = s + 1;
531
1.18k
            seen = 0;
532
1.18k
        }
533
73
        else return -1;
534
535
213
    columns[ncolumns] = '\0';
536
213
    return ncolumns;
537
286
}
538
539
// Match COLUMNS as a prefix against PATTERN (so COLUMNS may run out first).
540
// Returns len(COLUMNS) (modulo '+'), or 0 if there is a mismatched entry.
541
static int colmatch(const char *columns, const char *pattern)
542
267
{
543
267
    int i;
544
1.34k
    for (i = 0; columns[i] != '\0'; i++) {
545
1.17k
        if (pattern[i] == '+') return i;
546
1.17k
        if (! (columns[i] == pattern[i] || pattern[i] == 'Z')) return 0;
547
1.17k
    }
548
549
167
    return i;
550
267
}
551
552
int hts_detect_format(hFILE *hfile, htsFormat *fmt)
553
0
{
554
0
    return hts_detect_format2(hfile, NULL, fmt);
555
0
}
556
557
int hts_detect_format2(hFILE *hfile, const char *fname, htsFormat *fmt)
558
47.0k
{
559
47.0k
    char extension[HTS_MAX_EXT_LEN], columns[24];
560
47.0k
    unsigned char s[1024];
561
47.0k
    int complete = 0;
562
47.0k
    ssize_t len = hpeek(hfile, s, 18);
563
47.0k
    if (len < 0) return -1;
564
565
47.0k
    fmt->category = unknown_category;
566
47.0k
    fmt->format = unknown_format;
567
47.0k
    fmt->version.major = fmt->version.minor = -1;
568
47.0k
    fmt->compression = no_compression;
569
47.0k
    fmt->compression_level = -1;
570
47.0k
    fmt->specific = NULL;
571
572
47.0k
    if (len >= 2 && s[0] == 0x1f && s[1] == 0x8b) {
573
        // The stream is either gzip-compressed or BGZF-compressed.
574
        // Determine which, and decompress the first few records or lines.
575
2.20k
        fmt->compression = gzip;
576
2.20k
        if (len >= 18 && (s[3] & 4)) {
577
1.33k
            if (memcmp(&s[12], "BC\2\0", 4) == 0)
578
177
                fmt->compression = bgzf;
579
1.16k
            else if (memcmp(&s[12], "RAZF", 4) == 0)
580
49
                fmt->compression = razf_compression;
581
1.33k
        }
582
2.20k
        if (len >= 9 && s[2] == 8)
583
2.18k
            fmt->compression_level = (s[8] == 2)? 9 : (s[8] == 4)? 1 : -1;
584
585
2.20k
        len = decompress_peek_gz(hfile, s, sizeof s);
586
2.20k
    }
587
44.8k
    else if (len >= 10 && memcmp(s, "BZh", 3) == 0 &&
588
12
             (memcmp(&s[4], "\x31\x41\x59\x26\x53\x59", 6) == 0 ||
589
10
              memcmp(&s[4], "\x17\x72\x45\x38\x50\x90", 6) == 0)) {
590
3
        fmt->compression = bzip2_compression;
591
3
        fmt->compression_level = s[3] - '0';
592
        // Decompressing via libbz2 produces no output until it has a whole
593
        // block (of size 100Kb x level), which is too large for peeking.
594
        // So unfortunately we can recognise bzip2 but not the contents,
595
        // except that \x1772... magic indicates the stream is empty.
596
3
        if (s[4] == '\x31') return 0;
597
1
        else len = 0;
598
3
    }
599
44.8k
    else if (len >= 6 && memcmp(s, "\xfd""7zXZ\0", 6) == 0) {
600
2
        fmt->compression = xz_compression;
601
2
#ifdef HAVE_LIBLZMA
602
2
        len = decompress_peek_xz(hfile, s, sizeof s);
603
#else
604
        // Without liblzma, we can't recognise the decompressed contents.
605
        return 0;
606
#endif
607
2
    }
608
44.8k
    else if (len >= 4 && memcmp(s, "\x28\xb5\x2f\xfd", 4) == 0) {
609
1
        fmt->compression = zstd_compression;
610
1
        return 0;
611
1
    }
612
44.8k
    else {
613
44.8k
        len = hpeek(hfile, s, sizeof s);
614
44.8k
    }
615
47.0k
    if (len < 0) return -1;
616
617
47.0k
    if (len == 0) {
618
53
        fmt->format = empty_format;
619
53
        return 0;
620
53
    }
621
622
    // We avoid using filename extensions wherever possible (as filenames are
623
    // not always available), but in a few cases they must be considered:
624
    //  - FASTA/Q indexes are simply tab-separated text; files that match these
625
    //    patterns but not the fai/fqi extension are usually generic BED files
626
    //  - GZI indexes have no magic numbers so can only be detected by filename
627
47.0k
    if (fname && strcmp(fname, "-") != 0) {
628
47.0k
        char *s;
629
47.0k
        if (find_file_extension(fname, extension) < 0) extension[0] = '\0';
630
47.0k
        for (s = extension; *s; s++) *s = tolower_c(*s);
631
47.0k
    }
632
0
    else extension[0] = '\0';
633
634
47.0k
    if (len >= 6 && memcmp(s,"CRAM",4) == 0 && s[4]>=1 && s[4]<=7 && s[5]<=7) {
635
9.32k
        fmt->category = sequence_data;
636
9.32k
        fmt->format = cram;
637
9.32k
        fmt->version.major = s[4], fmt->version.minor = s[5];
638
9.32k
        fmt->compression = custom;
639
9.32k
        return 0;
640
9.32k
    }
641
37.7k
    else if (len >= 4 && s[3] <= '\4') {
642
3.88k
        if (memcmp(s, "BAM\1", 4) == 0) {
643
2.84k
            fmt->category = sequence_data;
644
2.84k
            fmt->format = bam;
645
            // TODO Decompress enough to pick version from @HD-VN header
646
2.84k
            fmt->version.major = 1, fmt->version.minor = -1;
647
2.84k
            return 0;
648
2.84k
        }
649
1.03k
        else if (memcmp(s, "BAI\1", 4) == 0) {
650
1
            fmt->category = index_file;
651
1
            fmt->format = bai;
652
1
            fmt->version.major = -1, fmt->version.minor = -1;
653
1
            return 0;
654
1
        }
655
1.03k
        else if (memcmp(s, "BCF\4", 4) == 0) {
656
6
            fmt->category = variant_data;
657
6
            fmt->format = bcf;
658
6
            fmt->version.major = 1, fmt->version.minor = -1;
659
6
            return 0;
660
6
        }
661
1.03k
        else if (memcmp(s, "BCF\2", 4) == 0) {
662
1.02k
            fmt->category = variant_data;
663
1.02k
            fmt->format = bcf;
664
1.02k
            fmt->version.major = s[3];
665
1.02k
            fmt->version.minor = (len >= 5 && s[4] <= 2)? s[4] : 0;
666
1.02k
            return 0;
667
1.02k
        }
668
7
        else if (memcmp(s, "CSI\1", 4) == 0) {
669
1
            fmt->category = index_file;
670
1
            fmt->format = csi;
671
1
            fmt->version.major = 1, fmt->version.minor = -1;
672
1
            return 0;
673
1
        }
674
6
        else if (memcmp(s, "TBI\1", 4) == 0) {
675
1
            fmt->category = index_file;
676
1
            fmt->format = tbi;
677
1
            return 0;
678
1
        }
679
        // GZI indexes have no magic numbers, so must be recognised solely by
680
        // filename extension.
681
5
        else if (strcmp(extension, "gzi") == 0) {
682
0
            fmt->category = index_file;
683
0
            fmt->format = gzi;
684
0
            return 0;
685
0
        }
686
3.88k
    }
687
33.8k
    else if (len >= 16 && memcmp(s, "##fileformat=VCF", 16) == 0) {
688
8.63k
        fmt->category = variant_data;
689
8.63k
        fmt->format = vcf;
690
8.63k
        if (len >= 21 && s[16] == 'v')
691
189
            parse_version(fmt, &s[17], &s[len]);
692
8.63k
        return 0;
693
8.63k
    }
694
25.1k
    else if (len >= 4 && s[0] == '@' &&
695
18.7k
             (memcmp(s, "@HD\t", 4) == 0 || memcmp(s, "@SQ\t", 4) == 0 ||
696
6.48k
              memcmp(s, "@RG\t", 4) == 0 || memcmp(s, "@PG\t", 4) == 0 ||
697
18.4k
              memcmp(s, "@CO\t", 4) == 0)) {
698
18.4k
        fmt->category = sequence_data;
699
18.4k
        fmt->format = sam;
700
        // @HD-VN is not guaranteed to be the first tag, but then @HD is
701
        // not guaranteed to be present at all...
702
18.4k
        if (len >= 9 && memcmp(s, "@HD\tVN:", 7) == 0)
703
112
            parse_version(fmt, &s[7], &s[len]);
704
18.3k
        else
705
18.3k
            fmt->version.major = 1, fmt->version.minor = -1;
706
18.4k
        return 0;
707
18.4k
    }
708
6.75k
    else if (len >= 8 && memcmp(s, "d4\xdd\xdd", 4) == 0) {
709
1
        fmt->category = region_list;
710
1
        fmt->format = d4_format;
711
        // How to decode the D4 Format Version bytes is not yet specified
712
        // so we don't try to set fmt->version.{major,minor}.
713
1
        return 0;
714
1
    }
715
6.75k
    else if (cmp_nonblank("{\"htsget\":", s, &s[len]) == 0) {
716
0
        fmt->category = unknown_category;
717
0
        fmt->format = htsget;
718
0
        return 0;
719
0
    }
720
6.75k
    else if (len > 8 && memcmp(s, "crypt4gh", 8) == 0) {
721
2
        fmt->category = unknown_category;
722
2
        fmt->format = hts_crypt4gh_format;
723
2
        return 0;
724
2
    }
725
6.75k
    else if (len >= 1 && s[0] == '>' && is_fastaq(s, &s[len])) {
726
6.14k
        fmt->category = sequence_data;
727
6.14k
        fmt->format = fasta_format;
728
6.14k
        return 0;
729
6.14k
    }
730
610
    else if (len >= 1 && s[0] == '@' && is_fastaq(s, &s[len])) {
731
324
        fmt->category = sequence_data;
732
324
        fmt->format = fastq_format;
733
324
        return 0;
734
324
    }
735
286
    else if (parse_tabbed_text(columns, sizeof columns, s,
736
286
                               &s[len], &complete) > 0) {
737
        // A complete SAM line is at least 11 columns.  On unmapped long reads may
738
        // be missing two.  (On mapped long reads we must have an @ header so long
739
        // CIGAR is irrelevant.)
740
169
        if (colmatch(columns, "ZiZiiCZiiZZOOOOOOOOOOOOOOOOOOOO+")
741
169
            >= 9 + 2*complete) {
742
80
            fmt->category = sequence_data;
743
80
            fmt->format = sam;
744
80
            fmt->version.major = 1, fmt->version.minor = -1;
745
80
            return 0;
746
80
        }
747
89
        else if (fmt->compression == gzip && colmatch(columns, "iiiiii") == 6) {
748
1
            fmt->category = index_file;
749
1
            fmt->format = crai;
750
1
            return 0;
751
1
        }
752
88
        else if (strstr(extension, "fqi") && colmatch(columns, "Ziiiii") == 6) {
753
0
            fmt->category = index_file;
754
0
            fmt->format = fqi_format;
755
0
            return 0;
756
0
        }
757
88
        else if (strstr(extension, "fai") && colmatch(columns, "Ziiii") == 5) {
758
0
            fmt->category = index_file;
759
0
            fmt->format = fai_format;
760
0
            return 0;
761
0
        }
762
88
        else if (colmatch(columns, "Zii+") >= 3) {
763
3
            fmt->category = region_list;
764
3
            fmt->format = bed;
765
3
            return 0;
766
3
        }
767
169
    }
768
769
    // Arbitrary text files can be read using hts_getline().
770
207
    if (is_text_only(s, &s[len])) fmt->format = text_format;
771
772
    // Nothing recognised: leave unset fmt-> fields as unknown.
773
207
    return 0;
774
47.0k
}
775
776
char *hts_format_description(const htsFormat *format)
777
0
{
778
0
    kstring_t str = { 0, 0, NULL };
779
780
0
    switch (format->format) {
781
0
    case sam:   kputs("SAM", &str); break;
782
0
    case bam:   kputs("BAM", &str); break;
783
0
    case cram:  kputs("CRAM", &str); break;
784
0
    case fasta_format:  kputs("FASTA", &str); break;
785
0
    case fastq_format:  kputs("FASTQ", &str); break;
786
0
    case vcf:   kputs("VCF", &str); break;
787
0
    case bcf:
788
0
        if (format->version.major == 1) kputs("Legacy BCF", &str);
789
0
        else kputs("BCF", &str);
790
0
        break;
791
0
    case bai:   kputs("BAI", &str); break;
792
0
    case crai:  kputs("CRAI", &str); break;
793
0
    case csi:   kputs("CSI", &str); break;
794
0
    case fai_format:    kputs("FASTA-IDX", &str); break;
795
0
    case fqi_format:    kputs("FASTQ-IDX", &str); break;
796
0
    case gzi:   kputs("GZI", &str); break;
797
0
    case tbi:   kputs("Tabix", &str); break;
798
0
    case bed:   kputs("BED", &str); break;
799
0
    case d4_format:     kputs("D4", &str); break;
800
0
    case htsget: kputs("htsget", &str); break;
801
0
    case hts_crypt4gh_format: kputs("crypt4gh", &str); break;
802
0
    case empty_format:  kputs("empty", &str); break;
803
0
    default:    kputs("unknown", &str); break;
804
0
    }
805
806
0
    if (format->version.major >= 0) {
807
0
        kputs(" version ", &str);
808
0
        kputw(format->version.major, &str);
809
0
        if (format->version.minor >= 0) {
810
0
            kputc('.', &str);
811
0
            kputw(format->version.minor, &str);
812
0
        }
813
0
    }
814
815
0
    switch (format->compression) {
816
0
    case bzip2_compression:  kputs(" bzip2-compressed", &str); break;
817
0
    case razf_compression:   kputs(" legacy-RAZF-compressed", &str); break;
818
0
    case xz_compression:     kputs(" XZ-compressed", &str); break;
819
0
    case zstd_compression:   kputs(" Zstandard-compressed", &str); break;
820
0
    case custom: kputs(" compressed", &str); break;
821
0
    case gzip:   kputs(" gzip-compressed", &str); break;
822
823
0
    case bgzf:
824
0
        switch (format->format) {
825
0
        case bam:
826
0
        case bcf:
827
0
        case csi:
828
0
        case tbi:
829
            // These are by definition BGZF, so just use the generic term
830
0
            kputs(" compressed", &str);
831
0
            break;
832
0
        default:
833
0
            kputs(" BGZF-compressed", &str);
834
0
            break;
835
0
        }
836
0
        break;
837
838
0
    case no_compression:
839
0
        switch (format->format) {
840
0
        case bam:
841
0
        case bcf:
842
0
        case cram:
843
0
        case csi:
844
0
        case tbi:
845
            // These are normally compressed, so emphasise that this one isn't
846
0
            kputs(" uncompressed", &str);
847
0
            break;
848
0
        default:
849
0
            break;
850
0
        }
851
0
        break;
852
853
0
    default: break;
854
0
    }
855
856
0
    switch (format->category) {
857
0
    case sequence_data: kputs(" sequence", &str); break;
858
0
    case variant_data:  kputs(" variant calling", &str); break;
859
0
    case index_file:    kputs(" index", &str); break;
860
0
    case region_list:   kputs(" genomic region", &str); break;
861
0
    default: break;
862
0
    }
863
864
0
    if (format->compression == no_compression)
865
0
        switch (format->format) {
866
0
        case text_format:
867
0
        case sam:
868
0
        case crai:
869
0
        case vcf:
870
0
        case bed:
871
0
        case fai_format:
872
0
        case fqi_format:
873
0
        case fasta_format:
874
0
        case fastq_format:
875
0
        case htsget:
876
0
            kputs(" text", &str);
877
0
            break;
878
879
0
        case empty_format:
880
0
            break;
881
882
0
        default:
883
0
            kputs(" data", &str);
884
0
            break;
885
0
        }
886
0
    else
887
0
        kputs(" data", &str);
888
889
0
    return ks_release(&str);
890
0
}
891
892
htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt)
893
33.0k
{
894
33.0k
    char smode[101], *cp, *cp2, *mode_c, *uncomp = NULL;
895
33.0k
    htsFile *fp = NULL;
896
33.0k
    hFILE *hfile = NULL;
897
33.0k
    char fmt_code = '\0';
898
    // see enum htsExactFormat in htslib/hts.h
899
33.0k
    const char format_to_mode[] = "\0g\0\0b\0c\0\0b\0g\0\0\0\0\0Ff\0\0";
900
901
33.0k
    strncpy(smode, mode, 99);
902
33.0k
    smode[99]=0;
903
33.0k
    if ((cp = strchr(smode, ',')))
904
0
        *cp = '\0';
905
906
    // Migrate format code (b or c) to the end of the smode buffer.
907
87.1k
    for (cp2 = cp = smode; *cp; cp++) {
908
54.0k
        if (*cp == 'b')
909
12.1k
            fmt_code = 'b';
910
41.9k
        else if (*cp == 'c')
911
8.88k
            fmt_code = 'c';
912
33.0k
        else {
913
33.0k
            *cp2++ = *cp;
914
            // Cache the uncompress flag 'u' pos if present
915
33.0k
            if (!uncomp && (*cp == 'u')) {
916
0
                uncomp = cp2 - 1;
917
0
            }
918
33.0k
        }
919
54.0k
    }
920
33.0k
    mode_c = cp2;
921
33.0k
    *cp2++ = fmt_code;
922
33.0k
    *cp2++ = 0;
923
924
    // Set or reset the format code if opts->format is used
925
33.0k
    if (fmt && fmt->format > unknown_format
926
0
        && fmt->format < sizeof(format_to_mode)) {
927
0
        *mode_c = format_to_mode[fmt->format];
928
0
    }
929
930
    // Uncompressed bam/bcf is not supported, change 'u' to '0' on write
931
33.0k
    if (uncomp && *mode_c == 'b' && (strchr(smode, 'w') || strchr(smode, 'a'))) {
932
0
        *uncomp = '0';
933
0
    }
934
935
    // If we really asked for a compressed text format then mode_c above will
936
    // point to nul.  We set to 'z' to enable bgzf.
937
33.0k
    if (strchr(mode, 'w') && fmt && fmt->compression == bgzf) {
938
0
        if (fmt->format == sam || fmt->format == vcf || fmt->format == text_format)
939
0
            *mode_c = 'z';
940
0
    }
941
942
33.0k
    char *rmme = NULL, *fnidx = strstr(fn, HTS_IDX_DELIM);
943
33.0k
    if ( fnidx ) {
944
0
        rmme = strdup(fn);
945
0
        if ( !rmme ) goto error;
946
0
        rmme[fnidx-fn] = 0;
947
0
        fn = rmme;
948
0
    }
949
950
33.0k
    hfile = hopen(fn, smode);
951
33.0k
    if (hfile == NULL) goto error;
952
953
33.0k
    fp = hts_hopen(hfile, fn, smode);
954
33.0k
    if (fp == NULL) goto error;
955
956
    // Compensate for the loss of exactness in htsExactFormat.
957
    // hts_hopen returns generics such as binary or text, but we
958
    // have been given something explicit here so use that instead.
959
33.0k
    if (fp->is_write && fmt &&
960
0
        (fmt->format == bam || fmt->format == sam ||
961
0
         fmt->format == vcf || fmt->format == bcf ||
962
0
         fmt->format == bed || fmt->format == fasta_format ||
963
0
         fmt->format == fastq_format))
964
0
        fp->format.format = fmt->format;
965
966
33.0k
    if (fmt && fmt->specific) {
967
0
        if (hts_opt_apply(fp, fmt->specific) != 0) {
968
0
            if (((hts_opt*)fmt->specific)->opt == CRAM_OPT_REFERENCE &&
969
0
                (errno == ENOENT || errno == EIO || errno == EBADF ||
970
0
                  errno == EACCES || errno == EISDIR)) {
971
                /* error during reference file operation
972
                 for these specific errors, set the error as EINVAL */
973
0
                errno = EINVAL;
974
0
            }
975
0
            goto error;
976
0
        }
977
0
    }
978
33.0k
    if ( rmme ) free(rmme);
979
33.0k
    return fp;
980
981
0
error:
982
0
    hts_log_error("Failed to open file \"%s\"%s%s", fn,
983
0
                  errno ? " : " : "", errno ? strerror(errno) : "");
984
0
    if ( rmme ) free(rmme);
985
986
0
    if (hfile)
987
0
        hclose_abruptly(hfile);
988
989
0
    return NULL;
990
33.0k
}
991
992
33.0k
htsFile *hts_open(const char *fn, const char *mode) {
993
33.0k
    return hts_open_format(fn, mode, NULL);
994
33.0k
}
995
996
/*
997
 * Splits str into a prefix, delimiter ('\0' or delim), and suffix, writing
998
 * the prefix in lowercase into buf and returning a pointer to the suffix.
999
 * On return, buf is always NUL-terminated; thus assumes that the "keyword"
1000
 * prefix should be one of several known values of maximum length buflen-2.
1001
 * (If delim is not found, returns a pointer to the '\0'.)
1002
 */
1003
static const char *
1004
scan_keyword(const char *str, char delim, char *buf, size_t buflen)
1005
0
{
1006
0
    size_t i = 0;
1007
0
    while (*str && *str != delim) {
1008
0
        if (i < buflen-1) buf[i++] = tolower_c(*str);
1009
0
        str++;
1010
0
    }
1011
1012
0
    buf[i] = '\0';
1013
0
    return *str? str+1 : str;
1014
0
}
1015
1016
/*
1017
 * Parses arg and appends it to the option list.
1018
 *
1019
 * Returns 0 on success;
1020
 *        -1 on failure.
1021
 */
1022
0
int hts_opt_add(hts_opt **opts, const char *c_arg) {
1023
0
    hts_opt *o, *t;
1024
0
    char *val;
1025
1026
    /*
1027
     * IMPORTANT!!!
1028
     * If you add another string option here, don't forget to also add
1029
     * it to the case statement in hts_opt_apply.
1030
     */
1031
1032
0
    if (!c_arg)
1033
0
        return -1;
1034
1035
0
    if (!(o =  malloc(sizeof(*o))))
1036
0
        return -1;
1037
1038
0
    if (!(o->arg = strdup(c_arg))) {
1039
0
        free(o);
1040
0
        return -1;
1041
0
    }
1042
1043
0
    if (!(val = strchr(o->arg, '=')))
1044
0
        val = "1"; // assume boolean
1045
0
    else
1046
0
        *val++ = '\0';
1047
1048
0
    if (strcmp(o->arg, "decode_md") == 0 ||
1049
0
        strcmp(o->arg, "DECODE_MD") == 0)
1050
0
        o->opt = CRAM_OPT_DECODE_MD, o->val.i = atoi(val);
1051
1052
0
    else if (strcmp(o->arg, "verbosity") == 0 ||
1053
0
             strcmp(o->arg, "VERBOSITY") == 0)
1054
0
        o->opt = CRAM_OPT_VERBOSITY, o->val.i = atoi(val);
1055
1056
0
    else if (strcmp(o->arg, "seqs_per_slice") == 0 ||
1057
0
             strcmp(o->arg, "SEQS_PER_SLICE") == 0)
1058
0
        o->opt = CRAM_OPT_SEQS_PER_SLICE, o->val.i = atoi(val);
1059
1060
0
    else if (strcmp(o->arg, "bases_per_slice") == 0 ||
1061
0
             strcmp(o->arg, "BASES_PER_SLICE") == 0)
1062
0
        o->opt = CRAM_OPT_BASES_PER_SLICE, o->val.i = atoi(val);
1063
1064
0
    else if (strcmp(o->arg, "slices_per_container") == 0 ||
1065
0
             strcmp(o->arg, "SLICES_PER_CONTAINER") == 0)
1066
0
        o->opt = CRAM_OPT_SLICES_PER_CONTAINER, o->val.i = atoi(val);
1067
1068
0
    else if (strcmp(o->arg, "embed_ref") == 0 ||
1069
0
             strcmp(o->arg, "EMBED_REF") == 0)
1070
0
        o->opt = CRAM_OPT_EMBED_REF, o->val.i = atoi(val);
1071
1072
0
    else if (strcmp(o->arg, "no_ref") == 0 ||
1073
0
             strcmp(o->arg, "NO_REF") == 0)
1074
0
        o->opt = CRAM_OPT_NO_REF, o->val.i = atoi(val);
1075
1076
0
    else if (strcmp(o->arg, "pos_delta") == 0 ||
1077
0
             strcmp(o->arg, "POS_DELTA") == 0)
1078
0
        o->opt = CRAM_OPT_POS_DELTA, o->val.i = atoi(val);
1079
1080
0
    else if (strcmp(o->arg, "ignore_md5") == 0 ||
1081
0
             strcmp(o->arg, "IGNORE_MD5") == 0)
1082
0
        o->opt = CRAM_OPT_IGNORE_MD5, o->val.i = atoi(val);
1083
1084
0
    else if (strcmp(o->arg, "use_bzip2") == 0 ||
1085
0
             strcmp(o->arg, "USE_BZIP2") == 0)
1086
0
        o->opt = CRAM_OPT_USE_BZIP2, o->val.i = atoi(val);
1087
1088
0
    else if (strcmp(o->arg, "use_rans") == 0 ||
1089
0
             strcmp(o->arg, "USE_RANS") == 0)
1090
0
        o->opt = CRAM_OPT_USE_RANS, o->val.i = atoi(val);
1091
1092
0
    else if (strcmp(o->arg, "use_lzma") == 0 ||
1093
0
             strcmp(o->arg, "USE_LZMA") == 0)
1094
0
        o->opt = CRAM_OPT_USE_LZMA, o->val.i = atoi(val);
1095
1096
0
    else if (strcmp(o->arg, "use_tok") == 0 ||
1097
0
             strcmp(o->arg, "USE_TOK") == 0)
1098
0
        o->opt = CRAM_OPT_USE_TOK, o->val.i = atoi(val);
1099
1100
0
    else if (strcmp(o->arg, "use_fqz") == 0 ||
1101
0
             strcmp(o->arg, "USE_FQZ") == 0)
1102
0
        o->opt = CRAM_OPT_USE_FQZ, o->val.i = atoi(val);
1103
1104
0
    else if (strcmp(o->arg, "use_arith") == 0 ||
1105
0
             strcmp(o->arg, "USE_ARITH") == 0)
1106
0
        o->opt = CRAM_OPT_USE_ARITH, o->val.i = atoi(val);
1107
1108
0
    else if (strcmp(o->arg, "fast") == 0 ||
1109
0
             strcmp(o->arg, "FAST") == 0)
1110
0
        o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_FAST;
1111
1112
0
    else if (strcmp(o->arg, "normal") == 0 ||
1113
0
             strcmp(o->arg, "NORMAL") == 0)
1114
0
        o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_NORMAL;
1115
1116
0
    else if (strcmp(o->arg, "small") == 0 ||
1117
0
             strcmp(o->arg, "SMALL") == 0)
1118
0
        o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_SMALL;
1119
1120
0
    else if (strcmp(o->arg, "archive") == 0 ||
1121
0
             strcmp(o->arg, "ARCHIVE") == 0)
1122
0
        o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_ARCHIVE;
1123
1124
0
    else if (strcmp(o->arg, "reference") == 0 ||
1125
0
             strcmp(o->arg, "REFERENCE") == 0)
1126
0
        o->opt = CRAM_OPT_REFERENCE, o->val.s = val;
1127
1128
0
    else if (strcmp(o->arg, "version") == 0 ||
1129
0
             strcmp(o->arg, "VERSION") == 0)
1130
0
        o->opt = CRAM_OPT_VERSION, o->val.s =val;
1131
1132
0
    else if (strcmp(o->arg, "multi_seq_per_slice") == 0 ||
1133
0
             strcmp(o->arg, "MULTI_SEQ_PER_SLICE") == 0)
1134
0
        o->opt = CRAM_OPT_MULTI_SEQ_PER_SLICE, o->val.i = atoi(val);
1135
1136
0
    else if (strcmp(o->arg, "nthreads") == 0 ||
1137
0
             strcmp(o->arg, "NTHREADS") == 0)
1138
0
        o->opt = HTS_OPT_NTHREADS, o->val.i = atoi(val);
1139
1140
0
    else if (strcmp(o->arg, "cache_size") == 0 ||
1141
0
             strcmp(o->arg, "CACHE_SIZE") == 0) {
1142
0
        char *endp;
1143
0
        o->opt = HTS_OPT_CACHE_SIZE;
1144
0
        o->val.i = strtol(val, &endp, 0);
1145
        // NB: Doesn't support floats, eg 1.5g
1146
        // TODO: extend hts_parse_decimal? See also samtools sort.
1147
0
        switch (*endp) {
1148
0
        case 'g': case 'G': o->val.i *= 1024; // fall through
1149
0
        case 'm': case 'M': o->val.i *= 1024; // fall through
1150
0
        case 'k': case 'K': o->val.i *= 1024; break;
1151
0
        case '\0': break;
1152
0
        default:
1153
0
            hts_log_error("Unrecognised cache size suffix '%c'", *endp);
1154
0
            free(o->arg);
1155
0
            free(o);
1156
0
            return -1;
1157
0
        }
1158
0
    }
1159
1160
0
    else if (strcmp(o->arg, "required_fields") == 0 ||
1161
0
             strcmp(o->arg, "REQUIRED_FIELDS") == 0)
1162
0
        o->opt = CRAM_OPT_REQUIRED_FIELDS, o->val.i = strtol(val, NULL, 0);
1163
1164
0
    else if (strcmp(o->arg, "lossy_names") == 0 ||
1165
0
             strcmp(o->arg, "LOSSY_NAMES") == 0)
1166
0
        o->opt = CRAM_OPT_LOSSY_NAMES, o->val.i = strtol(val, NULL, 0);
1167
1168
0
    else if (strcmp(o->arg, "name_prefix") == 0 ||
1169
0
             strcmp(o->arg, "NAME_PREFIX") == 0)
1170
0
        o->opt = CRAM_OPT_PREFIX, o->val.s = val;
1171
1172
0
    else if (strcmp(o->arg, "store_md") == 0 ||
1173
0
             strcmp(o->arg, "store_md") == 0)
1174
0
        o->opt = CRAM_OPT_STORE_MD, o->val.i = atoi(val);
1175
1176
0
    else if (strcmp(o->arg, "store_nm") == 0 ||
1177
0
             strcmp(o->arg, "store_nm") == 0)
1178
0
        o->opt = CRAM_OPT_STORE_NM, o->val.i = atoi(val);
1179
1180
0
    else if (strcmp(o->arg, "block_size") == 0 ||
1181
0
             strcmp(o->arg, "BLOCK_SIZE") == 0)
1182
0
        o->opt = HTS_OPT_BLOCK_SIZE, o->val.i = strtol(val, NULL, 0);
1183
1184
0
    else if (strcmp(o->arg, "level") == 0 ||
1185
0
             strcmp(o->arg, "LEVEL") == 0)
1186
0
        o->opt = HTS_OPT_COMPRESSION_LEVEL, o->val.i = strtol(val, NULL, 0);
1187
1188
0
    else if (strcmp(o->arg, "filter") == 0 ||
1189
0
             strcmp(o->arg, "FILTER") == 0)
1190
0
        o->opt = HTS_OPT_FILTER, o->val.s = val;
1191
1192
0
    else if (strcmp(o->arg, "fastq_aux") == 0 ||
1193
0
        strcmp(o->arg, "FASTQ_AUX") == 0)
1194
0
        o->opt = FASTQ_OPT_AUX, o->val.s = val;
1195
1196
0
    else if (strcmp(o->arg, "fastq_barcode") == 0 ||
1197
0
        strcmp(o->arg, "FASTQ_BARCODE") == 0)
1198
0
        o->opt = FASTQ_OPT_BARCODE, o->val.s = val;
1199
1200
0
    else if (strcmp(o->arg, "fastq_rnum") == 0 ||
1201
0
        strcmp(o->arg, "FASTQ_RNUM") == 0)
1202
0
        o->opt = FASTQ_OPT_RNUM, o->val.i = 1;
1203
1204
0
    else if (strcmp(o->arg, "fastq_casava") == 0 ||
1205
0
        strcmp(o->arg, "FASTQ_CASAVA") == 0)
1206
0
        o->opt = FASTQ_OPT_CASAVA, o->val.i = 1;
1207
1208
0
    else if (strcmp(o->arg, "fastq_name2") == 0 ||
1209
0
        strcmp(o->arg, "FASTQ_NAME2") == 0)
1210
0
        o->opt = FASTQ_OPT_NAME2, o->val.i = 1;
1211
1212
0
    else if (strcmp(o->arg, "fastq_umi") == 0 ||
1213
0
        strcmp(o->arg, "FASTQ_UMI") == 0)
1214
0
        o->opt = FASTQ_OPT_UMI, o->val.s = val;
1215
1216
0
    else if (strcmp(o->arg, "fastq_umi_regex") == 0 ||
1217
0
        strcmp(o->arg, "FASTQ_UMI_REGEX") == 0)
1218
0
        o->opt = FASTQ_OPT_UMI_REGEX, o->val.s = val;
1219
1220
0
    else {
1221
0
        hts_log_error("Unknown option '%s'", o->arg);
1222
0
        free(o->arg);
1223
0
        free(o);
1224
0
        return -1;
1225
0
    }
1226
1227
0
    o->next = NULL;
1228
1229
    // Append; assumes small list.
1230
0
    if (*opts) {
1231
0
        t = *opts;
1232
0
        while (t->next)
1233
0
            t = t->next;
1234
0
        t->next = o;
1235
0
    } else {
1236
0
        *opts = o;
1237
0
    }
1238
1239
0
    return 0;
1240
0
}
1241
1242
/*
1243
 * Applies an hts_opt option list to a given htsFile.
1244
 *
1245
 * Returns 0 on success
1246
 *        -1 on failure
1247
 */
1248
0
int hts_opt_apply(htsFile *fp, hts_opt *opts) {
1249
0
    hts_opt *last = NULL;
1250
1251
0
    for (; opts;  opts = (last=opts)->next) {
1252
0
        switch (opts->opt) {
1253
0
            case CRAM_OPT_REFERENCE:
1254
0
                if (!(fp->fn_aux = strdup(opts->val.s)))
1255
0
                    return -1;
1256
                // fall through
1257
0
            case CRAM_OPT_VERSION:
1258
0
            case CRAM_OPT_PREFIX:
1259
0
            case HTS_OPT_FILTER:
1260
0
            case FASTQ_OPT_AUX:
1261
0
            case FASTQ_OPT_BARCODE:
1262
0
            case FASTQ_OPT_UMI:
1263
0
            case FASTQ_OPT_UMI_REGEX:
1264
0
                if (hts_set_opt(fp,  opts->opt,  opts->val.s) != 0)
1265
0
                    return -1;
1266
0
                break;
1267
0
            default:
1268
0
                if (hts_set_opt(fp,  opts->opt,  opts->val.i) != 0)
1269
0
                    return -1;
1270
0
                break;
1271
0
        }
1272
0
    }
1273
1274
0
    return 0;
1275
0
}
1276
1277
/*
1278
 * Frees an hts_opt list.
1279
 */
1280
0
void hts_opt_free(hts_opt *opts) {
1281
0
    hts_opt *last = NULL;
1282
0
    while (opts) {
1283
0
        opts = (last=opts)->next;
1284
0
        free(last->arg);
1285
0
        free(last);
1286
0
    }
1287
0
}
1288
1289
1290
/*
1291
 * Tokenise options as (key(=value)?,)*(key(=value)?)?
1292
 * NB: No provision for ',' appearing in the value!
1293
 * Add backslashing rules?
1294
 *
1295
 * This could be used as part of a general command line option parser or
1296
 * as a string concatenated onto the file open mode.
1297
 *
1298
 * Returns 0 on success
1299
 *        -1 on failure.
1300
 */
1301
0
int hts_parse_opt_list(htsFormat *fmt, const char *str) {
1302
0
    while (str && *str) {
1303
0
        const char *str_start;
1304
0
        int len;
1305
0
        char arg[8001];
1306
1307
0
        while (*str && *str == ',')
1308
0
            str++;
1309
1310
0
        for (str_start = str; *str && *str != ','; str++);
1311
0
        len = str - str_start;
1312
1313
        // Produce a nul terminated copy of the option
1314
0
        strncpy(arg, str_start, len < 8000 ? len : 8000);
1315
0
        arg[len < 8000 ? len : 8000] = '\0';
1316
1317
0
        if (hts_opt_add((hts_opt **)&fmt->specific, arg) != 0)
1318
0
            return -1;
1319
1320
0
        if (*str)
1321
0
            str++;
1322
0
    }
1323
1324
0
    return 0;
1325
0
}
1326
1327
/*
1328
 * Accepts a string file format (sam, bam, cram, vcf, bam) optionally
1329
 * followed by a comma separated list of key=value options and splits
1330
 * these up into the fields of htsFormat struct.
1331
 *
1332
 * format is assumed to be already initialised, either to blank
1333
 * "unknown" values or via previous hts_opt_add calls.
1334
 *
1335
 * Returns 0 on success
1336
 *        -1 on failure.
1337
 */
1338
0
int hts_parse_format(htsFormat *format, const char *str) {
1339
0
    char fmt[9];
1340
0
    const char *cp = scan_keyword(str, ',', fmt, sizeof fmt);
1341
1342
0
    format->version.minor = 0; // unknown
1343
0
    format->version.major = 0; // unknown
1344
1345
0
    if (strcmp(fmt, "sam") == 0) {
1346
0
        format->category          = sequence_data;
1347
0
        format->format            = sam;
1348
0
        format->compression       = no_compression;
1349
0
        format->compression_level = 0;
1350
0
    } else if (strcmp(fmt, "sam.gz") == 0) {
1351
0
        format->category          = sequence_data;
1352
0
        format->format            = sam;
1353
0
        format->compression       = bgzf;
1354
0
        format->compression_level = -1;
1355
0
    } else if (strcmp(fmt, "bam") == 0) {
1356
0
        format->category          = sequence_data;
1357
0
        format->format            = bam;
1358
0
        format->compression       = bgzf;
1359
0
        format->compression_level = -1;
1360
0
    } else if (strcmp(fmt, "cram") == 0) {
1361
0
        format->category          = sequence_data;
1362
0
        format->format            = cram;
1363
0
        format->compression       = custom;
1364
0
        format->compression_level = -1;
1365
0
    } else if (strcmp(fmt, "vcf") == 0) {
1366
0
        format->category          = variant_data;
1367
0
        format->format            = vcf;
1368
0
        format->compression       = no_compression;
1369
0
        format->compression_level = 0;
1370
0
    } else if (strcmp(fmt, "bcf") == 0) {
1371
0
        format->category          = variant_data;
1372
0
        format->format            = bcf;
1373
0
        format->compression       = bgzf;
1374
0
        format->compression_level = -1;
1375
0
    } else if (strcmp(fmt, "fastq") == 0 || strcmp(fmt, "fq") == 0) {
1376
0
        format->category          = sequence_data;
1377
0
        format->format            = fastq_format;
1378
0
        format->compression       = no_compression;
1379
0
        format->compression_level = 0;
1380
0
    } else if (strcmp(fmt, "fastq.gz") == 0 || strcmp(fmt, "fq.gz") == 0) {
1381
0
        format->category          = sequence_data;
1382
0
        format->format            = fastq_format;
1383
0
        format->compression       = bgzf;
1384
0
        format->compression_level = 0;
1385
0
    } else if (strcmp(fmt, "fasta") == 0 || strcmp(fmt, "fa") == 0) {
1386
0
        format->category          = sequence_data;
1387
0
        format->format            = fasta_format;
1388
0
        format->compression       = no_compression;
1389
0
        format->compression_level = 0;
1390
0
    } else if (strcmp(fmt, "fasta.gz") == 0 || strcmp(fmt, "fa.gz") == 0) {
1391
0
        format->category          = sequence_data;
1392
0
        format->format            = fasta_format;
1393
0
        format->compression       = bgzf;
1394
0
        format->compression_level = 0;
1395
0
    } else {
1396
0
        return -1;
1397
0
    }
1398
1399
0
    return hts_parse_opt_list(format, cp);
1400
0
}
1401
1402
1403
/*
1404
 * Tokenise options as (key(=value)?,)*(key(=value)?)?
1405
 * NB: No provision for ',' appearing in the value!
1406
 * Add backslashing rules?
1407
 *
1408
 * This could be used as part of a general command line option parser or
1409
 * as a string concatenated onto the file open mode.
1410
 *
1411
 * Returns 0 on success
1412
 *        -1 on failure.
1413
 */
1414
0
static int hts_process_opts(htsFile *fp, const char *opts) {
1415
0
    htsFormat fmt;
1416
1417
0
    fmt.specific = NULL;
1418
0
    if (hts_parse_opt_list(&fmt, opts) != 0)
1419
0
        return -1;
1420
1421
0
    if (hts_opt_apply(fp, fmt.specific) != 0) {
1422
0
        hts_opt_free(fmt.specific);
1423
0
        return -1;
1424
0
    }
1425
1426
0
    hts_opt_free(fmt.specific);
1427
1428
0
    return 0;
1429
0
}
1430
1431
static int hts_crypt4gh_redirect(const char *fn, const char *mode,
1432
2
                                 hFILE **hfile_ptr, htsFile *fp) {
1433
2
    hFILE *hfile1 = *hfile_ptr;
1434
2
    hFILE *hfile2 = NULL;
1435
2
    char fn_buf[512], *fn2 = fn_buf;
1436
2
    char mode2[102]; // Size set by sizeof(simple_mode) in hts_hopen()
1437
2
    const char *prefix = "crypt4gh:";
1438
2
    size_t fn2_len = strlen(prefix) + strlen(fn) + 1;
1439
2
    int ret = -1;
1440
1441
2
    if (fn2_len > sizeof(fn_buf)) {
1442
0
        if (fn2_len >= INT_MAX) // Silence gcc format-truncation warning
1443
0
            return -1;
1444
0
        fn2 = malloc(fn2_len);
1445
0
        if (!fn2) return -1;
1446
0
    }
1447
1448
    // Reopen fn using the crypt4gh plug-in (if available)
1449
2
    snprintf(fn2, fn2_len, "%s%s", prefix, fn);
1450
2
    snprintf(mode2, sizeof(mode2), "%s%s", mode, strchr(mode, ':') ? "" : ":");
1451
2
    hfile2 = hopen(fn2, mode2, "parent", hfile1, NULL);
1452
2
    if (hfile2) {
1453
        // Replace original hfile with the new one.  The original is now
1454
        // enclosed within hfile2
1455
0
        *hfile_ptr = hfile2;
1456
0
        ret = 0;
1457
0
    }
1458
1459
2
    if (fn2 != fn_buf)
1460
0
        free(fn2);
1461
2
    return ret;
1462
2
}
1463
1464
htsFile *hts_hopen(hFILE *hfile, const char *fn, const char *mode)
1465
80.1k
{
1466
80.1k
    hFILE *hfile_orig = hfile;
1467
80.1k
    hFILE *hfile_cleanup = hfile;
1468
80.1k
    htsFile *fp = (htsFile*)calloc(1, sizeof(htsFile));
1469
80.1k
    char simple_mode[101], *cp, *opts;
1470
80.1k
    simple_mode[100] = '\0';
1471
1472
80.1k
    if (fp == NULL) goto error;
1473
1474
80.1k
    fp->fn = strdup(fn);
1475
80.1k
    fp->is_be = ed_is_big();
1476
1477
    // Split mode into simple_mode,opts strings
1478
80.1k
    if ((cp = strchr(mode, ','))) {
1479
0
        strncpy(simple_mode, mode, cp-mode <= 100 ? cp-mode : 100);
1480
0
        simple_mode[cp-mode] = '\0';
1481
0
        opts = cp+1;
1482
80.1k
    } else {
1483
80.1k
        strncpy(simple_mode, mode, 100);
1484
80.1k
        opts = NULL;
1485
80.1k
    }
1486
1487
80.1k
    if (strchr(simple_mode, 'r')) {
1488
47.0k
        const int max_loops = 5; // Should be plenty
1489
47.0k
        int loops = 0;
1490
47.0k
        if (hts_detect_format2(hfile, fn, &fp->format) < 0) goto error;
1491
1492
        // Deal with formats that re-direct an underlying file via a plug-in.
1493
        // Loops as we may have crypt4gh served via htsget, or
1494
        // crypt4gh-in-crypt4gh.
1495
1496
47.0k
        while (fp->format.format == htsget ||
1497
47.0k
               fp->format.format == hts_crypt4gh_format) {
1498
            // Ensure we don't get stuck in an endless redirect loop
1499
2
            if (++loops > max_loops) {
1500
0
                errno = ELOOP;
1501
0
                goto error;
1502
0
            }
1503
1504
2
            if (fp->format.format == htsget) {
1505
0
                hFILE *hfile2 = hopen_htsget_redirect(hfile, simple_mode);
1506
0
                if (hfile2 == NULL) goto error;
1507
1508
0
                if (hfile != hfile_cleanup) {
1509
                    // Close the result of an earlier redirection
1510
0
                    hclose_abruptly(hfile);
1511
0
                }
1512
1513
0
                hfile = hfile2;
1514
0
            }
1515
2
            else if (fp->format.format == hts_crypt4gh_format) {
1516
2
                int should_preserve = (hfile == hfile_orig);
1517
2
                int update_cleanup = (hfile == hfile_cleanup);
1518
2
                if (hts_crypt4gh_redirect(fn, simple_mode, &hfile, fp) < 0)
1519
2
                    goto error;
1520
0
                if (should_preserve) {
1521
                    // The original hFILE is now contained in a crypt4gh
1522
                    // wrapper.  Should we need to close the wrapper due
1523
                    // to a later error, we need to prevent the wrapped
1524
                    // handle from being closed as the caller will see
1525
                    // this function return NULL and try to clean up itself.
1526
0
                    hfile_orig->preserve = 1;
1527
0
                }
1528
0
                if (update_cleanup) {
1529
                    // Update handle to close at the end if redirected by htsget
1530
0
                    hfile_cleanup = hfile;
1531
0
                }
1532
0
            }
1533
1534
            // Re-detect format against the result of the redirection
1535
0
            if (hts_detect_format2(hfile, fn, &fp->format) < 0) goto error;
1536
0
        }
1537
47.0k
    }
1538
33.0k
    else if (strchr(simple_mode, 'w') || strchr(simple_mode, 'a')) {
1539
33.0k
        htsFormat *fmt = &fp->format;
1540
33.0k
        fp->is_write = 1;
1541
1542
33.0k
        if (strchr(simple_mode, 'b')) fmt->format = binary_format;
1543
20.9k
        else if (strchr(simple_mode, 'c')) fmt->format = cram;
1544
12.1k
        else if (strchr(simple_mode, 'f')) fmt->format = fastq_format;
1545
12.1k
        else if (strchr(simple_mode, 'F')) fmt->format = fasta_format;
1546
12.1k
        else fmt->format = text_format;
1547
1548
33.0k
        if (strchr(simple_mode, 'z')) fmt->compression = bgzf;
1549
33.0k
        else if (strchr(simple_mode, 'g')) fmt->compression = gzip;
1550
33.0k
        else if (strchr(simple_mode, 'u')) fmt->compression = no_compression;
1551
33.0k
        else {
1552
            // No compression mode specified, set to the default for the format
1553
33.0k
            switch (fmt->format) {
1554
12.1k
            case binary_format: fmt->compression = bgzf; break;
1555
8.88k
            case cram: fmt->compression = custom; break;
1556
0
            case fastq_format: fmt->compression = no_compression; break;
1557
0
            case fasta_format: fmt->compression = no_compression; break;
1558
12.1k
            case text_format: fmt->compression = no_compression; break;
1559
0
            default: abort();
1560
33.0k
            }
1561
33.0k
        }
1562
1563
        // Fill in category (if determinable; e.g. 'b' could be BAM or BCF)
1564
33.0k
        fmt->category = format_category(fmt->format);
1565
1566
33.0k
        fmt->version.major = fmt->version.minor = -1;
1567
33.0k
        fmt->compression_level = -1;
1568
33.0k
        fmt->specific = NULL;
1569
33.0k
    }
1570
0
    else { errno = EINVAL; goto error; }
1571
1572
80.1k
    switch (fp->format.format) {
1573
12.1k
    case binary_format:
1574
14.9k
    case bam:
1575
15.9k
    case bcf:
1576
15.9k
        fp->fp.bgzf = bgzf_hopen(hfile, simple_mode);
1577
15.9k
        if (fp->fp.bgzf == NULL) goto error;
1578
15.9k
        fp->is_bin = fp->is_bgzf = 1;
1579
15.9k
        break;
1580
1581
18.2k
    case cram:
1582
18.2k
        fp->fp.cram = cram_dopen(hfile, fn, simple_mode);
1583
18.2k
        if (fp->fp.cram == NULL) goto error;
1584
16.6k
        if (!fp->is_write)
1585
7.71k
            cram_set_option(fp->fp.cram, CRAM_OPT_DECODE_MD, -1); // auto
1586
16.6k
        fp->is_cram = 1;
1587
16.6k
        break;
1588
1589
53
    case empty_format:
1590
12.3k
    case text_format:
1591
12.3k
    case bed:
1592
18.4k
    case fasta_format:
1593
18.7k
    case fastq_format:
1594
37.2k
    case sam:
1595
45.9k
    case vcf:
1596
45.9k
        if (fp->format.compression != no_compression) {
1597
2.16k
            fp->fp.bgzf = bgzf_hopen(hfile, simple_mode);
1598
2.16k
            if (fp->fp.bgzf == NULL) goto error;
1599
2.11k
            fp->is_bgzf = 1;
1600
2.11k
        }
1601
43.7k
        else
1602
43.7k
            fp->fp.hfile = hfile;
1603
45.8k
        break;
1604
1605
45.8k
    default:
1606
72
        errno = EFTYPE;
1607
72
        goto error;
1608
80.1k
    }
1609
1610
78.4k
    if (opts)
1611
0
        hts_process_opts(fp, opts);
1612
1613
    // Allow original file to close if it was preserved earlier by crypt4gh
1614
78.4k
    hfile_orig->preserve = 0;
1615
1616
    // If redirecting via htsget, close the original hFILE now (pedantically
1617
    // we would instead close it in hts_close(), but this a simplifying
1618
    // optimisation)
1619
78.4k
    if (hfile != hfile_cleanup) hclose_abruptly(hfile_cleanup);
1620
1621
78.4k
    return fp;
1622
1623
1.73k
error:
1624
1.73k
    hts_log_error("Failed to open file %s", fn);
1625
1626
    // If redirecting, close the failed redirection hFILE that we have opened
1627
1.73k
    if (hfile != hfile_orig) hclose_abruptly(hfile);
1628
1.73k
    hfile_orig->preserve = 0; // Allow caller to close the original hfile
1629
1630
1.73k
    if (fp) {
1631
1.73k
        free(fp->fn);
1632
1.73k
        free(fp->fn_aux);
1633
1.73k
        free(fp);
1634
1.73k
    }
1635
1.73k
    return NULL;
1636
80.1k
}
1637
1638
static int hts_idx_close_otf_fp(hts_idx_t *idx);
1639
1640
int hts_close(htsFile *fp)
1641
78.4k
{
1642
78.4k
    int ret = 0, save;
1643
78.4k
    if (!fp) {
1644
0
        errno = EINVAL;
1645
0
        return -1;
1646
0
    }
1647
1648
78.4k
    switch (fp->format.format) {
1649
4.13k
    case binary_format:
1650
12.6k
    case bam:
1651
15.9k
    case bcf:
1652
15.9k
        ret = bgzf_close(fp->fp.bgzf);
1653
15.9k
        break;
1654
1655
16.6k
    case cram:
1656
16.6k
        if (!fp->is_write) {
1657
7.71k
            switch (cram_eof(fp->fp.cram)) {
1658
54
            case 2:
1659
54
                hts_log_warning("EOF marker is absent. The input is probably truncated");
1660
54
                break;
1661
5.73k
            case 0:  /* not at EOF, but may not have wanted all seqs */
1662
7.66k
            default: /* case 1, expected EOF */
1663
7.66k
                break;
1664
7.71k
            }
1665
7.71k
        }
1666
16.6k
        ret = cram_close(fp->fp.cram);
1667
16.6k
        break;
1668
1669
12
    case empty_format:
1670
4.28k
    case text_format:
1671
4.28k
    case bed:
1672
10.4k
    case fasta_format:
1673
10.7k
    case fastq_format:
1674
34.8k
    case sam:
1675
45.8k
    case vcf:
1676
45.8k
        if (fp->format.format == sam)
1677
24.1k
            ret = sam_state_destroy(fp);
1678
21.7k
        else if (fp->format.format == fastq_format ||
1679
21.3k
                 fp->format.format == fasta_format)
1680
6.46k
            fastq_state_destroy(fp);
1681
1682
45.8k
        if (fp->format.compression != no_compression)
1683
2.11k
            ret |= bgzf_close(fp->fp.bgzf);
1684
43.7k
        else
1685
43.7k
            ret |= hclose(fp->fp.hfile);
1686
45.8k
        break;
1687
1688
0
    default:
1689
0
        ret = -1;
1690
0
        break;
1691
78.4k
    }
1692
1693
78.4k
    if (fp->idx) {
1694
        // Close deferred index file handle, if present.
1695
        // Unfortunately this means errors on the index will get mixed with
1696
        // those on the main file, but as we only have the EOF block left to
1697
        // write it hopefully won't happen that often.
1698
0
        ret |= hts_idx_close_otf_fp(fp->idx);
1699
0
    }
1700
1701
78.4k
    save = errno;
1702
78.4k
    sam_hdr_destroy(fp->bam_header);
1703
78.4k
    hts_idx_destroy(fp->idx);
1704
78.4k
    hts_filter_free(fp->filter);
1705
78.4k
    free(fp->fn);
1706
78.4k
    free(fp->fn_aux);
1707
78.4k
    free(fp->line.s);
1708
78.4k
    free(fp);
1709
78.4k
    errno = save;
1710
78.4k
    return ret;
1711
78.4k
}
1712
1713
int hts_flush(htsFile *fp)
1714
0
{
1715
0
    if (fp == NULL) return 0;
1716
1717
0
    switch (fp->format.format) {
1718
0
    case binary_format:
1719
0
    case bam:
1720
0
    case bcf:
1721
0
        return bgzf_flush(fp->fp.bgzf);
1722
1723
0
    case cram:
1724
0
        return cram_flush(fp->fp.cram);
1725
1726
0
    case empty_format:
1727
0
    case text_format:
1728
0
    case bed:
1729
0
    case fasta_format:
1730
0
    case fastq_format:
1731
0
    case sam:
1732
0
    case vcf:
1733
0
        if (fp->format.compression != no_compression)
1734
0
            return bgzf_flush(fp->fp.bgzf);
1735
0
        else
1736
0
            return hflush(fp->fp.hfile);
1737
1738
0
    default:
1739
0
        break;
1740
0
    }
1741
1742
0
    return 0;
1743
0
}
1744
1745
const htsFormat *hts_get_format(htsFile *fp)
1746
0
{
1747
0
    return fp? &fp->format : NULL;
1748
0
}
1749
1750
0
const char *hts_format_file_extension(const htsFormat *format) {
1751
0
    if (!format)
1752
0
        return "?";
1753
1754
0
    switch (format->format) {
1755
0
    case sam:  return "sam";
1756
0
    case bam:  return "bam";
1757
0
    case bai:  return "bai";
1758
0
    case cram: return "cram";
1759
0
    case crai: return "crai";
1760
0
    case vcf:  return "vcf";
1761
0
    case bcf:  return "bcf";
1762
0
    case csi:  return "csi";
1763
0
    case fai_format:   return "fai";
1764
0
    case fqi_format:   return "fqi";
1765
0
    case gzi:  return "gzi";
1766
0
    case tbi:  return "tbi";
1767
0
    case bed:  return "bed";
1768
0
    case d4_format:    return "d4";
1769
0
    case fasta_format: return "fa";
1770
0
    case fastq_format: return "fq";
1771
0
    default:   return "?";
1772
0
    }
1773
0
}
1774
1775
0
static hFILE *hts_hfile(htsFile *fp) {
1776
0
    switch (fp->format.format) {
1777
0
    case binary_format:// fall through
1778
0
    case bcf:          // fall through
1779
0
    case bam:          return bgzf_hfile(fp->fp.bgzf);
1780
0
    case cram:         return cram_hfile(fp->fp.cram);
1781
0
    case text_format:  // fall through
1782
0
    case vcf:          // fall through
1783
0
    case fastq_format: // fall through
1784
0
    case fasta_format: // fall through
1785
0
    case sam:          return fp->format.compression != no_compression
1786
0
                              ? bgzf_hfile(fp->fp.bgzf)
1787
0
                              : fp->fp.hfile;
1788
0
    default:           return NULL;
1789
0
    }
1790
0
}
1791
1792
0
int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) {
1793
0
    int r;
1794
0
    va_list args;
1795
1796
0
    switch (opt) {
1797
0
    case HTS_OPT_NTHREADS: {
1798
0
        va_start(args, opt);
1799
0
        int nthreads = va_arg(args, int);
1800
0
        va_end(args);
1801
0
        return hts_set_threads(fp, nthreads);
1802
0
    }
1803
1804
0
    case HTS_OPT_BLOCK_SIZE: {
1805
0
        hFILE *hf = hts_hfile(fp);
1806
1807
0
        if (hf) {
1808
0
            va_start(args, opt);
1809
0
            if (hfile_set_blksize(hf, va_arg(args, int)) != 0)
1810
0
                hts_log_warning("Failed to change block size");
1811
0
            va_end(args);
1812
0
        }
1813
0
        else {
1814
            // To do - implement for vcf/bcf.
1815
0
            hts_log_warning("Cannot change block size for this format");
1816
0
        }
1817
1818
0
        return 0;
1819
0
    }
1820
1821
0
    case HTS_OPT_THREAD_POOL: {
1822
0
        va_start(args, opt);
1823
0
        htsThreadPool *p = va_arg(args, htsThreadPool *);
1824
0
        va_end(args);
1825
0
        return hts_set_thread_pool(fp, p);
1826
0
    }
1827
1828
0
    case HTS_OPT_CACHE_SIZE: {
1829
0
        va_start(args, opt);
1830
0
        int cache_size = va_arg(args, int);
1831
0
        va_end(args);
1832
0
        hts_set_cache_size(fp, cache_size);
1833
0
        return 0;
1834
0
    }
1835
1836
0
    case FASTQ_OPT_CASAVA:
1837
0
    case FASTQ_OPT_RNUM:
1838
0
    case FASTQ_OPT_NAME2:
1839
0
        if (fp->format.format == fastq_format ||
1840
0
            fp->format.format == fasta_format)
1841
0
            return fastq_state_set(fp, opt);
1842
0
        return 0;
1843
1844
0
    case FASTQ_OPT_AUX:
1845
0
        if (fp->format.format == fastq_format ||
1846
0
            fp->format.format == fasta_format) {
1847
0
            va_start(args, opt);
1848
0
            char *list = va_arg(args, char *);
1849
0
            va_end(args);
1850
0
            return fastq_state_set(fp, opt, list);
1851
0
        }
1852
0
        return 0;
1853
1854
0
    case FASTQ_OPT_BARCODE:
1855
0
    case FASTQ_OPT_UMI:
1856
0
    case FASTQ_OPT_UMI_REGEX:
1857
0
        if (fp->format.format == fastq_format ||
1858
0
            fp->format.format == fasta_format) {
1859
0
            va_start(args, opt);
1860
0
            char *bc = va_arg(args, char *);
1861
0
            va_end(args);
1862
0
            return fastq_state_set(fp, opt, bc);
1863
0
        }
1864
0
        return 0;
1865
1866
    // Options below here flow through to cram_set_voption
1867
0
    case HTS_OPT_COMPRESSION_LEVEL: {
1868
0
        va_start(args, opt);
1869
0
        int level = va_arg(args, int);
1870
0
        va_end(args);
1871
0
        if (fp->is_bgzf)
1872
0
            fp->fp.bgzf->compress_level = level;
1873
0
        else if (fp->format.format == cram)
1874
0
            return cram_set_option(fp->fp.cram, opt, level);
1875
0
        return 0;
1876
0
    }
1877
1878
0
    case HTS_OPT_FILTER: {
1879
0
        va_start(args, opt);
1880
0
        char *expr = va_arg(args, char *);
1881
0
        va_end(args);
1882
0
        return hts_set_filter_expression(fp, expr);
1883
0
    }
1884
1885
0
    case HTS_OPT_PROFILE: {
1886
0
        va_start(args, opt);
1887
0
        enum hts_profile_option prof = va_arg(args, int);
1888
0
        va_end(args);
1889
0
        if (fp->is_bgzf) {
1890
0
            switch (prof) {
1891
#ifdef HAVE_LIBDEFLATE
1892
            case HTS_PROFILE_FAST:    fp->fp.bgzf->compress_level =  2; break;
1893
            case HTS_PROFILE_NORMAL:  fp->fp.bgzf->compress_level = -1; break;
1894
            case HTS_PROFILE_SMALL:   fp->fp.bgzf->compress_level = 10; break;
1895
            case HTS_PROFILE_ARCHIVE: fp->fp.bgzf->compress_level = 12; break;
1896
#else
1897
0
            case HTS_PROFILE_FAST:    fp->fp.bgzf->compress_level =  1; break;
1898
0
            case HTS_PROFILE_NORMAL:  fp->fp.bgzf->compress_level = -1; break;
1899
0
            case HTS_PROFILE_SMALL:   fp->fp.bgzf->compress_level =  8; break;
1900
0
            case HTS_PROFILE_ARCHIVE: fp->fp.bgzf->compress_level =  9; break;
1901
0
#endif
1902
0
            }
1903
0
        } // else CRAM manages this in its own way
1904
0
        break;
1905
0
    }
1906
1907
0
    default:
1908
0
        break;
1909
0
    }
1910
1911
0
    if (fp->format.format != cram)
1912
0
        return 0;
1913
1914
0
    va_start(args, opt);
1915
0
    r = cram_set_voption(fp->fp.cram, opt, args);
1916
0
    va_end(args);
1917
1918
0
    return r;
1919
0
}
1920
1921
BGZF *hts_get_bgzfp(htsFile *fp);
1922
1923
int hts_set_threads(htsFile *fp, int n)
1924
0
{
1925
0
    if (fp->format.format == sam) {
1926
0
        return sam_set_threads(fp, n);
1927
0
    } else if (fp->format.compression == bgzf) {
1928
0
        return bgzf_mt(hts_get_bgzfp(fp), n, 256/*unused*/);
1929
0
    } else if (fp->format.format == cram) {
1930
0
        return hts_set_opt(fp, CRAM_OPT_NTHREADS, n);
1931
0
    }
1932
0
    else return 0;
1933
0
}
1934
1935
0
int hts_set_thread_pool(htsFile *fp, htsThreadPool *p) {
1936
0
    if (fp->format.format == sam || fp->format.format == text_format) {
1937
0
        return sam_set_thread_pool(fp, p);
1938
0
    } else if (fp->format.compression == bgzf) {
1939
0
        return bgzf_thread_pool(hts_get_bgzfp(fp), p->pool, p->qsize);
1940
0
    } else if (fp->format.format == cram) {
1941
0
        return hts_set_opt(fp, CRAM_OPT_THREAD_POOL, p);
1942
0
    }
1943
0
    else return 0;
1944
0
}
1945
1946
void hts_set_cache_size(htsFile *fp, int n)
1947
0
{
1948
0
    if (fp->format.compression == bgzf)
1949
0
        bgzf_set_cache_size(hts_get_bgzfp(fp), n);
1950
0
}
1951
1952
int hts_set_fai_filename(htsFile *fp, const char *fn_aux)
1953
0
{
1954
0
    free(fp->fn_aux);
1955
0
    if (fn_aux) {
1956
0
        fp->fn_aux = strdup(fn_aux);
1957
0
        if (fp->fn_aux == NULL) return -1;
1958
0
    }
1959
0
    else fp->fn_aux = NULL;
1960
1961
0
    if (fp->format.format == cram)
1962
0
        if (cram_set_option(fp->fp.cram, CRAM_OPT_REFERENCE, fp->fn_aux))
1963
0
            return -1;
1964
1965
0
    return 0;
1966
0
}
1967
1968
int hts_set_filter_expression(htsFile *fp, const char *expr)
1969
0
{
1970
0
    if (fp->filter)
1971
0
        hts_filter_free(fp->filter);
1972
1973
0
    if (!expr)
1974
0
        return 0;
1975
1976
0
    return (fp->filter = hts_filter_init(expr))
1977
0
        ? 0 : -1;
1978
0
}
1979
1980
hFILE *hts_open_tmpfile(const char *fname, const char *mode, kstring_t *tmpname)
1981
0
{
1982
0
    int pid = (int) getpid();
1983
0
    unsigned ptr = (uintptr_t) tmpname;
1984
0
    int n = 0;
1985
0
    hFILE *fp = NULL;
1986
1987
0
    do {
1988
        // Attempt to further uniquify the temporary filename
1989
0
        unsigned t = ((unsigned) time(NULL)) ^ ((unsigned) clock()) ^ ptr;
1990
0
        n++;
1991
1992
0
        ks_clear(tmpname);
1993
0
        if (ksprintf(tmpname, "%s.tmp_%d_%d_%u", fname, pid, n, t) < 0) break;
1994
1995
0
        fp = hopen(tmpname->s, mode);
1996
0
    } while (fp == NULL && errno == EEXIST && n < 100);
1997
1998
0
    return fp;
1999
0
}
2000
2001
int hts_is_utf16_text(const kstring_t *str)
2002
0
{
2003
0
    const unsigned char *u = (const unsigned char *) (str->s);
2004
0
    return (str->l > 0 && str->s)? is_utf16_text(u, u + str->l) : 0;
2005
0
}
2006
2007
// For VCF/BCF backward sweeper. Not exposing these functions because their
2008
// future is uncertain. Things will probably have to change with hFILE...
2009
BGZF *hts_get_bgzfp(htsFile *fp)
2010
0
{
2011
0
    if (fp->is_bgzf)
2012
0
        return fp->fp.bgzf;
2013
0
    else
2014
0
        return NULL;
2015
0
}
2016
int hts_useek(htsFile *fp, off_t uoffset, int where)
2017
0
{
2018
0
    if (fp->is_bgzf)
2019
0
        return bgzf_useek(fp->fp.bgzf, uoffset, where);
2020
0
    else
2021
0
        return (hseek(fp->fp.hfile, uoffset, SEEK_SET) >= 0)? 0 : -1;
2022
0
}
2023
off_t hts_utell(htsFile *fp)
2024
0
{
2025
0
    if (fp->is_bgzf)
2026
0
        return bgzf_utell(fp->fp.bgzf);
2027
0
    else
2028
0
        return htell(fp->fp.hfile);
2029
0
}
2030
2031
// Wrap hgetln() with a kgets_func2 signature for kgetline2()
2032
1.26M
static ssize_t hgetln_wrapper(char *buf, size_t len, void *vfp) {
2033
1.26M
    return hgetln(buf, len, (hFILE *) vfp);
2034
1.26M
}
2035
2036
int hts_getline(htsFile *fp, int delimiter, kstring_t *str)
2037
71.1M
{
2038
71.1M
    int ret;
2039
71.1M
    if (! (delimiter == KS_SEP_LINE || delimiter == '\n')) {
2040
0
        hts_log_error("Unexpected delimiter %d", delimiter);
2041
0
        abort();
2042
0
    }
2043
2044
71.1M
    switch (fp->format.compression) {
2045
1.23M
    case no_compression:
2046
1.23M
        str->l = 0;
2047
1.23M
        ret = kgetline2(str, hgetln_wrapper, fp->fp.hfile);
2048
1.23M
        if (ret >= 0) ret = (str->l <= INT_MAX)? (int) str->l : INT_MAX;
2049
8.36k
        else if (herrno(fp->fp.hfile)) ret = -2, errno = herrno(fp->fp.hfile);
2050
8.36k
        else ret = -1;
2051
1.23M
        break;
2052
2053
69.8M
    case gzip:
2054
69.9M
    case bgzf:
2055
69.9M
        ret = bgzf_getline(fp->fp.bgzf, '\n', str);
2056
69.9M
        break;
2057
2058
0
    default:
2059
0
        abort();
2060
71.1M
    }
2061
2062
71.1M
    ++fp->lineno;
2063
71.1M
    return ret;
2064
71.1M
}
2065
2066
char **hts_readlist(const char *string, int is_file, int *_n)
2067
0
{
2068
0
    unsigned int m = 0, n = 0;
2069
0
    char **s = 0, **s_new;
2070
0
    if ( is_file )
2071
0
    {
2072
0
        BGZF *fp = bgzf_open(string, "r");
2073
0
        if ( !fp ) return NULL;
2074
2075
0
        kstring_t str;
2076
0
        int ret;
2077
0
        str.s = 0; str.l = str.m = 0;
2078
0
        while ((ret = bgzf_getline(fp, '\n', &str)) >= 0)
2079
0
        {
2080
0
            if (str.l == 0) continue;
2081
0
            if (n == 0 && hts_is_utf16_text(&str))
2082
0
                hts_log_warning("'%s' appears to be encoded as UTF-16", string);
2083
0
            if (hts_resize(char*, n + 1, &m, &s, 0) < 0)
2084
0
                goto err;
2085
0
            s[n] = strdup(str.s);
2086
0
            if (!s[n])
2087
0
                goto err;
2088
0
            n++;
2089
0
        }
2090
0
        if (ret < -1) // Read error
2091
0
            goto err;
2092
0
        bgzf_close(fp);
2093
0
        free(str.s);
2094
0
    }
2095
0
    else
2096
0
    {
2097
0
        const char *q = string, *p = string;
2098
0
        while ( 1 )
2099
0
        {
2100
0
            if (*p == ',' || *p == 0)
2101
0
            {
2102
0
                if (hts_resize(char*, n + 1, &m, &s, 0) < 0)
2103
0
                    goto err;
2104
0
                s[n] = (char*)calloc(p - q + 1, 1);
2105
0
                if (!s[n])
2106
0
                    goto err;
2107
0
                strncpy(s[n++], q, p - q);
2108
0
                q = p + 1;
2109
0
            }
2110
0
            if ( !*p ) break;
2111
0
            p++;
2112
0
        }
2113
0
    }
2114
    // Try to shrink s to the minimum size needed
2115
0
    s_new = hts_realloc_p(s, sizeof(char*), n);
2116
0
    if (!s_new)
2117
0
        goto err;
2118
2119
0
    s = s_new;
2120
0
    assert(n < INT_MAX); // hts_resize() should ensure this
2121
0
    *_n = n;
2122
0
    return s;
2123
2124
0
 err:
2125
0
    for (m = 0; m < n; m++)
2126
0
        free(s[m]);
2127
0
    free(s);
2128
0
    return NULL;
2129
0
}
2130
2131
char **hts_readlines(const char *fn, int *_n)
2132
0
{
2133
0
    unsigned int m = 0, n = 0;
2134
0
    char **s = 0, **s_new;
2135
0
    BGZF *fp = bgzf_open(fn, "r");
2136
0
    if ( fp ) { // read from file
2137
0
        kstring_t str;
2138
0
        int ret;
2139
0
        str.s = 0; str.l = str.m = 0;
2140
0
        while ((ret = bgzf_getline(fp, '\n', &str)) >= 0) {
2141
0
            if (str.l == 0) continue;
2142
0
            if (n == 0 && hts_is_utf16_text(&str))
2143
0
                hts_log_warning("'%s' appears to be encoded as UTF-16", fn);
2144
0
            if (hts_resize(char *, n + 1, &m, &s, 0) < 0)
2145
0
                goto err;
2146
0
            s[n] = strdup(str.s);
2147
0
            if (!s[n])
2148
0
                goto err;
2149
0
            n++;
2150
0
        }
2151
0
        if (ret < -1) // Read error
2152
0
            goto err;
2153
0
        bgzf_close(fp);
2154
0
        free(str.s);
2155
0
    } else if (*fn == ':') { // read from string
2156
0
        const char *q, *p;
2157
0
        for (q = p = fn + 1;; ++p)
2158
0
            if (*p == ',' || *p == 0) {
2159
0
                if (hts_resize(char *, n + 1, &m, &s, 0) < 0)
2160
0
                    goto err;
2161
0
                s[n] = (char*)calloc(p - q + 1, 1);
2162
0
                if (!s[n])
2163
0
                    goto err;
2164
0
                strncpy(s[n++], q, p - q);
2165
0
                q = p + 1;
2166
0
                if (*p == 0) break;
2167
0
            }
2168
0
    } else return 0;
2169
    // Try to shrink s to the minimum size needed
2170
0
    s_new = hts_realloc_p(s, sizeof(char*), n);
2171
0
    if (!s_new)
2172
0
        goto err;
2173
2174
0
    s = s_new;
2175
0
    assert(n < INT_MAX); // hts_resize() should ensure this
2176
0
    *_n = n;
2177
0
    return s;
2178
2179
0
 err:
2180
0
    for (m = 0; m < n; m++)
2181
0
        free(s[m]);
2182
0
    free(s);
2183
0
    return NULL;
2184
0
}
2185
2186
// DEPRECATED: To be removed in a future HTSlib release
2187
int hts_file_type(const char *fname)
2188
0
{
2189
0
    int len = strlen(fname);
2190
0
    if ( !strcasecmp(".vcf.gz",fname+len-7) ) return FT_VCF_GZ;
2191
0
    if ( !strcasecmp(".vcf",fname+len-4) ) return FT_VCF;
2192
0
    if ( !strcasecmp(".bcf",fname+len-4) ) return FT_BCF_GZ;
2193
0
    if ( !strcmp("-",fname) ) return FT_STDIN;
2194
2195
0
    hFILE *f = hopen(fname, "r");
2196
0
    if (f == NULL) return 0;
2197
2198
0
    htsFormat fmt;
2199
0
    if (hts_detect_format2(f, fname, &fmt) < 0) { hclose_abruptly(f); return 0; }
2200
0
    if (hclose(f) < 0) return 0;
2201
2202
0
    switch (fmt.format) {
2203
0
    case vcf: return (fmt.compression == no_compression)? FT_VCF : FT_VCF_GZ;
2204
0
    case bcf: return (fmt.compression == no_compression)? FT_BCF : FT_BCF_GZ;
2205
0
    default:  return 0;
2206
0
    }
2207
0
}
2208
2209
int hts_check_EOF(htsFile *fp)
2210
0
{
2211
0
    if (fp->format.compression == bgzf)
2212
0
        return bgzf_check_EOF(hts_get_bgzfp(fp));
2213
0
    else if (fp->format.format == cram)
2214
0
        return cram_check_EOF(fp->fp.cram);
2215
0
    else
2216
0
        return 3;
2217
0
}
2218
2219
2220
/****************
2221
 *** Indexing ***
2222
 ****************/
2223
2224
0
#define HTS_MIN_MARKER_DIST 0x10000
2225
2226
// Finds the special meta bin
2227
//  ((1<<(3 * n_lvls + 3)) - 1) / 7 + 1
2228
0
#define META_BIN(idx) ((idx)->n_bins + 1)
2229
2230
0
#define pair64_lt(a,b) ((a).u < (b).u)
2231
0
#define pair64max_lt(a,b) ((a).u < (b).u || \
2232
0
                           ((a).u == (b).u && (a).max < (b).max))
2233
2234
0
KSORT_INIT_STATIC(_off, hts_pair64_t, pair64_lt)
2235
0
KSORT_INIT_STATIC(_off_max, hts_pair64_max_t, pair64max_lt)
2236
2237
typedef struct {
2238
    int32_t m, n;
2239
    uint64_t loff;
2240
    hts_pair64_t *list;
2241
} bins_t;
2242
2243
KHASH_MAP_INIT_INT(bin, bins_t)
2244
typedef khash_t(bin) bidx_t;
2245
2246
typedef struct {
2247
    hts_pos_t n, m;
2248
    uint64_t *offset;
2249
} lidx_t;
2250
2251
struct hts_idx_t {
2252
    int fmt, min_shift, n_lvls, n_bins;
2253
    uint32_t l_meta;
2254
    int32_t n, m;
2255
    uint64_t n_no_coor;
2256
    bidx_t **bidx;
2257
    lidx_t *lidx;
2258
    uint8_t *meta; // MUST have a terminating NUL on the end
2259
    int tbi_n, last_tbi_tid;
2260
    struct {
2261
        uint32_t last_bin, save_bin;
2262
        hts_pos_t last_coor;
2263
        int last_tid, save_tid, finished;
2264
        uint64_t last_off, save_off;
2265
        uint64_t off_beg, off_end;
2266
        uint64_t n_mapped, n_unmapped;
2267
    } z; // keep internal states
2268
    BGZF *otf_fp;  // Index on-the-fly output file
2269
};
2270
2271
0
static char * idx_format_name(int fmt) {
2272
0
    switch (fmt) {
2273
0
        case HTS_FMT_CSI: return "csi";
2274
0
        case HTS_FMT_BAI: return "bai";
2275
0
        case HTS_FMT_TBI: return "tbi";
2276
0
        case HTS_FMT_CRAI: return "crai";
2277
0
        default: return "unknown";
2278
0
    }
2279
0
}
2280
2281
#ifdef DEBUG_INDEX
2282
static void idx_dump(const hts_idx_t *idx) {
2283
    int i;
2284
    int64_t j;
2285
2286
    if (!idx) fprintf(stderr, "Null index\n");
2287
2288
    fprintf(stderr, "format='%s', min_shift=%d, n_lvls=%d, n_bins=%d, l_meta=%u ",
2289
            idx_format_name(idx->fmt), idx->min_shift, idx->n_lvls, idx->n_bins, idx->l_meta);
2290
    fprintf(stderr, "n=%d, m=%d, n_no_coor=%"PRIu64"\n", idx->n, idx->m, idx->n_no_coor);
2291
    for (i = 0; i < idx->n; i++) {
2292
        bidx_t *bidx = idx->bidx[i];
2293
        lidx_t *lidx = &idx->lidx[i];
2294
        if (bidx) {
2295
            fprintf(stderr, "======== BIN Index - tid=%d, n_buckets=%d, size=%d\n", i, bidx->n_buckets, bidx->size);
2296
            int b;
2297
            for (b = 0; b < META_BIN(idx); b++) {
2298
                khint_t k;
2299
                if ((k = kh_get(bin, bidx, b)) != kh_end(bidx)) {
2300
                    bins_t *entries = &kh_value(bidx, k);
2301
                    int l = hts_bin_level(b);
2302
                    int64_t bin_width = 1LL << ((idx->n_lvls - l) * 3 + idx->min_shift);
2303
                    fprintf(stderr, "\tbin=%d, level=%d, parent=%d, n_chunks=%d, loff=%"PRIu64", interval=[%"PRId64" - %"PRId64"]\n",
2304
                        b, l, hts_bin_parent(b), entries->n, entries->loff, (b-hts_bin_first(l))*bin_width+1, (b+1-hts_bin_first(l))*bin_width);
2305
                    for (j = 0; j < entries->n; j++)
2306
                        fprintf(stderr, "\t\tchunk=%"PRId64", u=%"PRIu64", v=%"PRIu64"\n", j, entries->list[j].u, entries->list[j].v);
2307
                }
2308
            }
2309
        }
2310
        if (lidx) {
2311
            fprintf(stderr, "======== LINEAR Index - tid=%d, n_values=%"PRId64"\n", i, lidx->n);
2312
            for (j = 0; j < lidx->n; j++) {
2313
                fprintf(stderr, "\t\tentry=%"PRId64", offset=%"PRIu64", interval=[%"PRId64" - %"PRId64"]\n",
2314
                    j, lidx->offset[j], j*(1<<idx->min_shift)+1, (j+1)*(1<<idx->min_shift));
2315
            }
2316
        }
2317
    }
2318
}
2319
#endif
2320
2321
static inline int insert_to_b(bidx_t *b, int bin, uint64_t beg, uint64_t end)
2322
0
{
2323
0
    khint_t k;
2324
0
    bins_t *l;
2325
0
    int absent;
2326
0
    k = kh_put(bin, b, bin, &absent);
2327
0
    if (absent < 0) return -1; // Out of memory
2328
0
    l = &kh_value(b, k);
2329
0
    if (absent) {
2330
0
        l->m = 1; l->n = 0;
2331
0
        l->list = (hts_pair64_t*)calloc(l->m, sizeof(hts_pair64_t));
2332
0
        if (!l->list) {
2333
0
            kh_del(bin, b, k);
2334
0
            return -1;
2335
0
        }
2336
0
    } else if (l->n == l->m) {
2337
0
        uint32_t new_m = l->m ? l->m << 1 : 1;
2338
0
        hts_pair64_t *new_list = hts_realloc_p(l->list,
2339
0
                                               sizeof(hts_pair64_t), new_m);
2340
0
        if (!new_list) return -1;
2341
0
        l->list = new_list;
2342
0
        l->m = new_m;
2343
0
    }
2344
0
    l->list[l->n].u = beg;
2345
0
    l->list[l->n++].v = end;
2346
0
    return 0;
2347
0
}
2348
2349
static inline int insert_to_l(lidx_t *l, int64_t _beg, int64_t _end, uint64_t offset, int min_shift)
2350
0
{
2351
0
    int i;
2352
0
    hts_pos_t beg, end;
2353
0
    beg = _beg >> min_shift;
2354
0
    end = (_end - 1) >> min_shift;
2355
0
    if (l->m < end + 1) {
2356
0
        size_t new_m = l->m * 2 > end + 1 ? l->m * 2 : end + 1;
2357
0
        uint64_t *new_offset;
2358
2359
0
        new_offset = hts_realloc_p(l->offset, sizeof(uint64_t), new_m);
2360
0
        if (!new_offset) return -1;
2361
2362
        // fill unused memory with (uint64_t)-1
2363
0
        memset(new_offset + l->m, 0xff, sizeof(uint64_t) * (new_m - l->m));
2364
0
        l->m = new_m;
2365
0
        l->offset = new_offset;
2366
0
    }
2367
0
    for (i = beg; i <= end; ++i) {
2368
0
        if (l->offset[i] == (uint64_t)-1) l->offset[i] = offset;
2369
0
    }
2370
0
    if (l->n < end + 1) l->n = end + 1;
2371
0
    return 0;
2372
0
}
2373
2374
void hts_adjust_csi_settings(int64_t max_len_in, int *min_shift_, int *n_lvls_)
2375
0
{
2376
0
    const int max_n_lvls = 9; // To prevent bin number overflow
2377
0
    int min_shift = *min_shift_;
2378
0
    int n_lvls = *n_lvls_;
2379
0
    int64_t max_len = max_len_in + 256, maxpos;
2380
2381
    // Check if we need to adjust n_lvls or min_shift to get the range needed
2382
0
    if (max_len <= hts_bin_maxpos(min_shift, max_n_lvls)) {
2383
        // Can get required range by adjusting n_lvls
2384
0
        maxpos = hts_bin_maxpos(min_shift, n_lvls);
2385
0
        while (max_len > maxpos) {
2386
0
            ++n_lvls;
2387
0
            maxpos *= 8;
2388
0
        }
2389
0
        *n_lvls_ = n_lvls;
2390
0
    } else {
2391
        // No room to change n_lvls - adjust min_shift instead
2392
        // This was likely user-supplied so warn about the change too.
2393
0
        n_lvls = max_n_lvls;
2394
0
        maxpos = hts_bin_maxpos(min_shift, n_lvls);
2395
0
        while (max_len > maxpos) {
2396
0
            ++min_shift;
2397
0
            maxpos *= 2;
2398
0
        }
2399
0
        hts_log_warning("Adjusted min_shift from %d to %d"
2400
0
                        " due to longest reference of %"PRId64" bases.",
2401
0
                        *min_shift_, min_shift, max_len_in);
2402
0
        *n_lvls_ = n_lvls;
2403
0
        *min_shift_ = min_shift;
2404
0
    }
2405
0
}
2406
2407
hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls)
2408
0
{
2409
0
    hts_idx_t *idx;
2410
0
    idx = (hts_idx_t*)calloc(1, sizeof(hts_idx_t));
2411
0
    if (idx == NULL) return NULL;
2412
0
    idx->fmt = fmt;
2413
0
    idx->min_shift = min_shift;
2414
0
    idx->n_lvls = n_lvls;
2415
0
    idx->n_bins = ((1<<(3 * n_lvls + 3)) - 1) / 7;
2416
0
    idx->z.save_tid = idx->z.last_tid = -1;
2417
0
    idx->z.save_bin = idx->z.last_bin = 0xffffffffu;
2418
0
    idx->z.save_off = idx->z.last_off = idx->z.off_beg = idx->z.off_end = offset0;
2419
0
    idx->z.last_coor = 0xffffffffu;
2420
0
    if (n) {
2421
0
        idx->n = idx->m = n;
2422
0
        idx->bidx = (bidx_t**)calloc(n, sizeof(bidx_t*));
2423
0
        if (idx->bidx == NULL) { free(idx); return NULL; }
2424
0
        idx->lidx = (lidx_t*) calloc(n, sizeof(lidx_t));
2425
0
        if (idx->lidx == NULL) { free(idx->bidx); free(idx); return NULL; }
2426
0
    }
2427
0
    idx->tbi_n = -1;
2428
0
    idx->last_tbi_tid = -1;
2429
0
    idx->otf_fp = NULL;
2430
0
    return idx;
2431
0
}
2432
2433
static void update_loff(hts_idx_t *idx, int i, int free_lidx)
2434
0
{
2435
0
    bidx_t *bidx = idx->bidx[i];
2436
0
    lidx_t *lidx = &idx->lidx[i];
2437
0
    khint_t k;
2438
0
    int l;
2439
    // the last entry is always valid
2440
0
    for (l=lidx->n-2; l >= 0; l--) {
2441
0
        if (lidx->offset[l] == (uint64_t)-1)
2442
0
            lidx->offset[l] = lidx->offset[l+1];
2443
0
    }
2444
0
    if (bidx == 0) return;
2445
0
    for (k = kh_begin(bidx); k != kh_end(bidx); ++k) // set loff
2446
0
        if (kh_exist(bidx, k))
2447
0
        {
2448
0
            if ( kh_key(bidx, k) < idx->n_bins )
2449
0
            {
2450
0
                int bot_bin = hts_bin_bot(kh_key(bidx, k), idx->n_lvls);
2451
                // disable linear index if bot_bin out of bounds
2452
0
                kh_val(bidx, k).loff = bot_bin < lidx->n ? lidx->offset[bot_bin] : 0;
2453
0
            }
2454
0
            else
2455
0
                kh_val(bidx, k).loff = 0;
2456
0
        }
2457
0
    if (free_lidx) {
2458
0
        free(lidx->offset);
2459
0
        lidx->m = lidx->n = 0;
2460
0
        lidx->offset = 0;
2461
0
    }
2462
0
}
2463
2464
static int compress_binning(hts_idx_t *idx, int i)
2465
0
{
2466
0
    bidx_t *bidx = idx->bidx[i];
2467
0
    khint_t k;
2468
0
    int l, m;
2469
0
    if (bidx == 0) return 0;
2470
    // merge a bin to its parent if the bin is too small
2471
0
    for (l = idx->n_lvls; l > 0; --l) {
2472
0
        unsigned start = hts_bin_first(l);
2473
0
        for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
2474
0
            bins_t *p, *q;
2475
0
            if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins || kh_key(bidx, k) < start) continue;
2476
0
            p = &kh_value(bidx, k);
2477
0
            if (l < idx->n_lvls && p->n > 1) ks_introsort(_off, p->n, p->list);
2478
0
            if ((p->list[p->n - 1].v>>16) - (p->list[0].u>>16) < HTS_MIN_MARKER_DIST) {
2479
0
                khint_t kp;
2480
0
                kp = kh_get(bin, bidx, hts_bin_parent(kh_key(bidx, k)));
2481
0
                if (kp == kh_end(bidx)) continue;
2482
0
                q = &kh_val(bidx, kp);
2483
0
                if (q->n + p->n > q->m) {
2484
0
                    uint32_t new_m = q->n + p->n;
2485
0
                    hts_pair64_t *new_list;
2486
0
                    kroundup32(new_m);
2487
0
                    if (new_m > INT32_MAX) return -1; // Limited by index format
2488
0
                    new_list = hts_realloc_p(q->list, sizeof(*new_list), new_m);
2489
0
                    if (!new_list) return -1;
2490
0
                    q->m = new_m;
2491
0
                    q->list = new_list;
2492
0
                }
2493
0
                memcpy(q->list + q->n, p->list, p->n * sizeof(hts_pair64_t));
2494
0
                q->n += p->n;
2495
0
                free(p->list);
2496
0
                kh_del(bin, bidx, k);
2497
0
            }
2498
0
        }
2499
0
    }
2500
0
    k = kh_get(bin, bidx, 0);
2501
0
    if (k != kh_end(bidx)) ks_introsort(_off, kh_val(bidx, k).n, kh_val(bidx, k).list);
2502
    // merge adjacent chunks that start from the same BGZF block
2503
0
    for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
2504
0
        bins_t *p;
2505
0
        if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins) continue;
2506
0
        p = &kh_value(bidx, k);
2507
0
        for (l = 1, m = 0; l < p->n; ++l) {
2508
0
            if (p->list[m].v>>16 >= p->list[l].u>>16) {
2509
0
                if (p->list[m].v < p->list[l].v) p->list[m].v = p->list[l].v;
2510
0
            } else p->list[++m] = p->list[l];
2511
0
        }
2512
0
        p->n = m + 1;
2513
0
    }
2514
0
    return 0;
2515
0
}
2516
2517
int hts_idx_finish(hts_idx_t *idx, uint64_t final_offset)
2518
0
{
2519
0
    int i, ret = 0;
2520
0
    if (idx == NULL || idx->z.finished) return 0; // do not run this function on an empty index or multiple times
2521
0
    if (idx->z.save_tid >= 0) {
2522
0
        ret |= insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin, idx->z.save_off, final_offset);
2523
0
        ret |= insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.off_beg, final_offset);
2524
0
        ret |= insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.n_mapped, idx->z.n_unmapped);
2525
0
    }
2526
0
    for (i = 0; i < idx->n; ++i) {
2527
0
        update_loff(idx, i, (idx->fmt == HTS_FMT_CSI));
2528
0
        ret |= compress_binning(idx, i);
2529
0
    }
2530
0
    idx->z.finished = 1;
2531
2532
0
    return ret;
2533
0
}
2534
2535
static inline hts_pos_t hts_idx_maxpos(const hts_idx_t *idx)
2536
0
{
2537
0
    return hts_bin_maxpos(idx->min_shift, idx->n_lvls);
2538
0
}
2539
2540
int hts_idx_check_range(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end)
2541
0
{
2542
0
    hts_pos_t maxpos = hts_idx_maxpos(idx);
2543
0
    if (tid < 0 || (beg <= maxpos && end <= maxpos))
2544
0
        return 0;
2545
2546
0
    if (idx->fmt == HTS_FMT_CSI) {
2547
0
        hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos" "
2548
0
                      "cannot be stored in a csi index with these parameters. "
2549
0
                      "Please use a larger min_shift or depth",
2550
0
                      beg, end);
2551
0
    } else {
2552
0
        hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos
2553
0
                      " cannot be stored in a %s index. Try using a csi index",
2554
0
                      beg, end, idx_format_name(idx->fmt));
2555
0
    }
2556
0
    errno = ERANGE;
2557
0
    return -1;
2558
0
}
2559
2560
int hts_idx_push(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, uint64_t offset, int is_mapped)
2561
0
{
2562
0
    int bin;
2563
0
    if (tid<0) beg = -1, end = 0;
2564
0
    if (hts_idx_check_range(idx, tid, beg, end) < 0)
2565
0
        return -1;
2566
0
    if (tid >= idx->m) { // enlarge the index
2567
0
        uint32_t new_m = idx->m * 2 > tid + 1 ? idx->m * 2 : tid + 1;
2568
0
        bidx_t **new_bidx;
2569
0
        lidx_t *new_lidx;
2570
2571
0
        new_bidx = hts_realloc_p(idx->bidx, sizeof(bidx_t*), new_m);
2572
0
        if (!new_bidx) return -1;
2573
0
        idx->bidx = new_bidx;
2574
2575
0
        new_lidx = hts_realloc_p(idx->lidx, sizeof(lidx_t), new_m);
2576
0
        if (!new_lidx) return -1;
2577
0
        idx->lidx = new_lidx;
2578
2579
0
        memset(&idx->bidx[idx->m], 0, (new_m - idx->m) * sizeof(bidx_t*));
2580
0
        memset(&idx->lidx[idx->m], 0, (new_m - idx->m) * sizeof(lidx_t));
2581
0
        idx->m = new_m;
2582
0
    }
2583
0
    if (idx->n < tid + 1) idx->n = tid + 1;
2584
0
    if (idx->z.finished) return 0;
2585
0
    if (idx->z.last_tid != tid || (idx->z.last_tid >= 0 && tid < 0)) { // change of chromosome
2586
0
        if ( tid>=0 && idx->n_no_coor )
2587
0
        {
2588
0
            hts_log_error("NO_COOR reads not in a single block at the end %d %d", tid, idx->z.last_tid);
2589
0
            return -1;
2590
0
        }
2591
0
        if (tid>=0 && idx->bidx[tid] != 0)
2592
0
        {
2593
0
            hts_log_error("Chromosome blocks not continuous");
2594
0
            return -1;
2595
0
        }
2596
0
        idx->z.last_tid = tid;
2597
0
        idx->z.last_bin = 0xffffffffu;
2598
0
    } else if (tid >= 0 && idx->z.last_coor > beg) { // test if positions are out of order
2599
0
        hts_log_error("Unsorted positions on sequence #%d: %"PRIhts_pos" followed by %"PRIhts_pos, tid+1, idx->z.last_coor+1, beg+1);
2600
0
        return -1;
2601
0
    }
2602
0
    if (end < beg) {
2603
        // Malformed ranges are errors. (Empty ranges (beg==end) are unusual but acceptable.)
2604
0
        hts_log_error("Invalid record on sequence #%d: end %"PRId64" < begin %"PRId64, tid+1, end, beg+1);
2605
0
        return -1;
2606
0
    }
2607
0
    if ( tid>=0 )
2608
0
    {
2609
0
        if (idx->bidx[tid] == 0) idx->bidx[tid] = kh_init(bin);
2610
        // shoehorn [-1,0) (VCF POS=0) into the leftmost bottom-level bin
2611
0
        if (beg < 0)  beg = 0;
2612
0
        if (end <= 0) end = 1;
2613
        // idx->z.last_off points to the start of the current record
2614
0
        if (insert_to_l(&idx->lidx[tid], beg, end,
2615
0
                        idx->z.last_off, idx->min_shift) < 0) return -1;
2616
0
    }
2617
0
    else idx->n_no_coor++;
2618
0
    bin = hts_reg2bin(beg, end, idx->min_shift, idx->n_lvls);
2619
0
    if ((int)idx->z.last_bin != bin) { // then possibly write the binning index
2620
0
        if (idx->z.save_bin != 0xffffffffu) { // save_bin==0xffffffffu only happens to the first record
2621
0
            if (insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin,
2622
0
                            idx->z.save_off, idx->z.last_off) < 0) return -1;
2623
0
        }
2624
0
        if (idx->z.last_bin == 0xffffffffu && idx->z.save_bin != 0xffffffffu) { // change of chr; keep meta information
2625
0
            idx->z.off_end = idx->z.last_off;
2626
0
            if (insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx),
2627
0
                            idx->z.off_beg, idx->z.off_end) < 0) return -1;
2628
0
            if (insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx),
2629
0
                            idx->z.n_mapped, idx->z.n_unmapped) < 0) return -1;
2630
0
            idx->z.n_mapped = idx->z.n_unmapped = 0;
2631
0
            idx->z.off_beg = idx->z.off_end;
2632
0
        }
2633
0
        idx->z.save_off = idx->z.last_off;
2634
0
        idx->z.save_bin = idx->z.last_bin = bin;
2635
0
        idx->z.save_tid = tid;
2636
0
    }
2637
0
    if (is_mapped) ++idx->z.n_mapped;
2638
0
    else ++idx->z.n_unmapped;
2639
0
    idx->z.last_off = offset;
2640
0
    idx->z.last_coor = beg;
2641
0
    return 0;
2642
0
}
2643
2644
// Needed for TBI only.  Ensure 'tid' with 'name' is in the index meta data.
2645
// idx->meta needs to have been initialised first with an appropriate Tabix
2646
// configuration via hts_idx_set_meta.
2647
//
2648
// NB number of references (first 4 bytes of tabix header) aren't in
2649
// idx->meta, but held in idx->n instead.
2650
0
int hts_idx_tbi_name(hts_idx_t *idx, int tid, const char *name) {
2651
    // Horrid - we have to map incoming tid to a tbi alternative tid.
2652
    // This is because TBI counts tids by "covered" refs while everything
2653
    // else counts by Nth SQ/contig record in header.
2654
0
    if (tid == idx->last_tbi_tid || tid < 0 || !name)
2655
0
        return idx->tbi_n;
2656
2657
0
    uint32_t len = strlen(name)+1;
2658
0
    uint8_t *tmp = hts_realloc_ps(idx->meta, sizeof(*tmp), idx->l_meta, len);
2659
0
    if (!tmp)
2660
0
        return -1;
2661
2662
    // Append name
2663
0
    idx->meta = tmp;
2664
0
    strcpy((char *)idx->meta + idx->l_meta, name);
2665
0
    idx->l_meta += len;
2666
2667
    // Update seq length
2668
0
    u32_to_le(le_to_u32(idx->meta+24)+len, idx->meta+24);
2669
2670
0
    idx->last_tbi_tid = tid;
2671
0
    return ++idx->tbi_n;
2672
0
}
2673
2674
// When doing samtools index we have a read_bam / hts_idx_push(bgzf_tell())
2675
// loop.  idx->z.last_off is the previous bzgf_tell location, so we know
2676
// the location the current bam record started at as well as where it ends.
2677
//
2678
// When building an index on the fly via a write_bam / hts_idx_push loop,
2679
// this isn't quite identical as we may amend the virtual coord returned
2680
// by bgzf_tell to the start of a new block if the next bam struct doesn't
2681
// fit.  It's essentially the same thing, but for bit-identical indices
2682
// we need to amend the idx->z.last_off when we know we're starting a new
2683
// block.
2684
void hts_idx_amend_last(hts_idx_t *idx, uint64_t offset)
2685
0
{
2686
0
    idx->z.last_off = offset;
2687
0
}
2688
2689
void hts_idx_destroy(hts_idx_t *idx)
2690
78.4k
{
2691
78.4k
    khint_t k;
2692
78.4k
    int i;
2693
78.4k
    if (idx == 0) return;
2694
2695
    // For HTS_FMT_CRAI, idx actually points to a different type -- see sam.c
2696
0
    if (idx->fmt == HTS_FMT_CRAI) {
2697
0
        hts_cram_idx_t *cidx = (hts_cram_idx_t *) idx;
2698
0
        cram_index_free(cidx->cram);
2699
0
        free(cidx);
2700
0
        return;
2701
0
    }
2702
2703
0
    for (i = 0; i < idx->m; ++i) {
2704
0
        bidx_t *bidx = idx->bidx[i];
2705
0
        free(idx->lidx[i].offset);
2706
0
        if (bidx == 0) continue;
2707
0
        for (k = kh_begin(bidx); k != kh_end(bidx); ++k)
2708
0
            if (kh_exist(bidx, k))
2709
0
                free(kh_value(bidx, k).list);
2710
0
        kh_destroy(bin, bidx);
2711
0
    }
2712
0
    free(idx->bidx); free(idx->lidx); free(idx->meta);
2713
0
    free(idx);
2714
0
}
2715
2716
0
int hts_idx_fmt(hts_idx_t *idx) {
2717
0
    return idx->fmt;
2718
0
}
2719
2720
// The optimizer eliminates these ed_is_big() calls; still it would be good to
2721
// TODO Determine endianness at configure- or compile-time
2722
2723
static inline ssize_t HTS_RESULT_USED idx_write_int32(BGZF *fp, int32_t x)
2724
0
{
2725
0
    if (ed_is_big()) x = ed_swap_4(x);
2726
0
    return bgzf_write(fp, &x, sizeof x);
2727
0
}
2728
2729
static inline ssize_t HTS_RESULT_USED idx_write_uint32(BGZF *fp, uint32_t x)
2730
0
{
2731
0
    if (ed_is_big()) x = ed_swap_4(x);
2732
0
    return bgzf_write(fp, &x, sizeof x);
2733
0
}
2734
2735
static inline ssize_t HTS_RESULT_USED idx_write_uint64(BGZF *fp, uint64_t x)
2736
0
{
2737
0
    if (ed_is_big()) x = ed_swap_8(x);
2738
0
    return bgzf_write(fp, &x, sizeof x);
2739
0
}
2740
2741
static inline void swap_bins(bins_t *p)
2742
0
{
2743
0
    int i;
2744
0
    for (i = 0; i < p->n; ++i) {
2745
0
        ed_swap_8p(&p->list[i].u);
2746
0
        ed_swap_8p(&p->list[i].v);
2747
0
    }
2748
0
}
2749
2750
static int need_idx_ugly_delay_hack(const hts_idx_t *idx)
2751
0
{
2752
    // Ugly hack for on-the-fly BAI indexes.  As these are uncompressed,
2753
    // we need to delay writing a few bytes of data until file close
2754
    // so that we have something to force a modification time update.
2755
    //
2756
    // (For compressed indexes like CSI, the BGZF EOF block serves the same
2757
    // purpose).
2758
0
    return idx->otf_fp && !idx->otf_fp->is_compressed;
2759
0
}
2760
2761
static int idx_save_core(const hts_idx_t *idx, BGZF *fp, int fmt)
2762
0
{
2763
0
    int32_t i, j;
2764
2765
0
    #define check(ret) if ((ret) < 0) return -1
2766
2767
    // VCF TBI/CSI only writes IDs for non-empty bins (ie covered references)
2768
    //
2769
    // NOTE: CSI meta is undefined in spec, so this code has an assumption
2770
    // that we're only using it for Tabix data.
2771
0
    int nids = idx->n;
2772
0
    if (idx->meta && idx->l_meta >= 4 && le_to_u32(idx->meta) == TBX_VCF) {
2773
0
        for (i = nids = 0; i < idx->n; ++i) {
2774
0
            if (idx->bidx[i])
2775
0
                nids++;
2776
0
        }
2777
0
    }
2778
0
    check(idx_write_int32(fp, nids));
2779
0
    if (fmt == HTS_FMT_TBI && idx->l_meta)
2780
0
        check(bgzf_write(fp, idx->meta, idx->l_meta));
2781
2782
0
    for (i = 0; i < idx->n; ++i) {
2783
0
        khint_t k;
2784
0
        bidx_t *bidx = idx->bidx[i];
2785
0
        lidx_t *lidx = &idx->lidx[i];
2786
2787
        // write binning index
2788
0
        if (nids == idx->n || bidx)
2789
0
            check(idx_write_int32(fp, bidx? kh_size(bidx) : 0));
2790
0
        if (bidx)
2791
0
            for (k = kh_begin(bidx); k != kh_end(bidx); ++k)
2792
0
                if (kh_exist(bidx, k)) {
2793
0
                    bins_t *p = &kh_value(bidx, k);
2794
0
                    check(idx_write_uint32(fp, kh_key(bidx, k)));
2795
0
                    if (fmt == HTS_FMT_CSI) check(idx_write_uint64(fp, p->loff));
2796
                    //int j;for(j=0;j<p->n;++j)fprintf(stderr,"%d,%llx,%d,%llx:%llx\n",kh_key(bidx,k),kh_val(bidx, k).loff,j,p->list[j].u,p->list[j].v);
2797
0
                    check(idx_write_int32(fp, p->n));
2798
0
                    for (j = 0; j < p->n; ++j) {
2799
                        //fprintf(stderr, "\t%ld\t%ld\n", p->list[j].u, p->list[j].v);
2800
0
                        check(idx_write_uint64(fp, p->list[j].u));
2801
0
                        check(idx_write_uint64(fp, p->list[j].v));
2802
0
                    }
2803
0
                }
2804
2805
        // write linear index
2806
0
        if (fmt != HTS_FMT_CSI) {
2807
0
            check(idx_write_int32(fp, lidx->n));
2808
0
            for (j = 0; j < lidx->n; ++j)
2809
0
                check(idx_write_uint64(fp, lidx->offset[j]));
2810
0
        }
2811
0
    }
2812
2813
0
    if (!need_idx_ugly_delay_hack(idx)) {
2814
        // Write this for compressed (CSI) indexes, but for BAI we
2815
        // need to save a bit for later.  See hts_idx_close_otf_fp()
2816
0
        check(idx_write_uint64(fp, idx->n_no_coor));
2817
0
    }
2818
2819
#ifdef DEBUG_INDEX
2820
    idx_dump(idx);
2821
#endif
2822
2823
0
    return 0;
2824
0
    #undef check
2825
0
}
2826
2827
int hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt)
2828
0
{
2829
0
    int ret, save;
2830
0
    if (idx == NULL || fn == NULL) { errno = EINVAL; return -1; }
2831
0
    char *fnidx = (char*)calloc(1, strlen(fn) + 5);
2832
0
    if (fnidx == NULL) return -1;
2833
2834
0
    strcpy(fnidx, fn);
2835
0
    switch (fmt) {
2836
0
    case HTS_FMT_BAI: strcat(fnidx, ".bai"); break;
2837
0
    case HTS_FMT_CSI: strcat(fnidx, ".csi"); break;
2838
0
    case HTS_FMT_TBI: strcat(fnidx, ".tbi"); break;
2839
0
    default: abort();
2840
0
    }
2841
2842
0
    ret = hts_idx_save_as(idx, fn, fnidx, fmt);
2843
0
    save = errno;
2844
0
    free(fnidx);
2845
0
    errno = save;
2846
0
    return ret;
2847
0
}
2848
2849
static int hts_idx_write_out(const hts_idx_t *idx, BGZF *fp, int fmt)
2850
0
{
2851
0
    #define check(ret) if ((ret) < 0) return -1
2852
2853
0
    if (fmt == HTS_FMT_CSI) {
2854
0
        check(bgzf_write(fp, "CSI\1", 4));
2855
0
        check(idx_write_int32(fp, idx->min_shift));
2856
0
        check(idx_write_int32(fp, idx->n_lvls));
2857
0
        check(idx_write_uint32(fp, idx->l_meta));
2858
0
        if (idx->l_meta) check(bgzf_write(fp, idx->meta, idx->l_meta));
2859
0
    } else if (fmt == HTS_FMT_TBI) {
2860
0
        check(bgzf_write(fp, "TBI\1", 4));
2861
0
    } else if (fmt == HTS_FMT_BAI) {
2862
0
        check(bgzf_write(fp, "BAI\1", 4));
2863
0
    } else abort();
2864
2865
0
    check(idx_save_core(idx, fp, fmt));
2866
2867
0
    #undef check
2868
0
    return 0;
2869
0
}
2870
2871
int hts_idx_save_as(const hts_idx_t *idx, const char *fn, const char *fnidx, int fmt)
2872
0
{
2873
0
    BGZF *fp;
2874
2875
0
    if (fnidx == NULL)
2876
0
        return hts_idx_save(idx, fn, fmt);
2877
2878
0
    fp = bgzf_open(fnidx, (fmt == HTS_FMT_BAI)? "wu" : "w");
2879
0
    if (fp == NULL) return -1;
2880
2881
0
    if (hts_idx_write_out(idx, fp, fmt) < 0) {
2882
0
        int save_errno = errno;
2883
0
        bgzf_close(fp);
2884
0
        errno = save_errno;
2885
0
        return -1;
2886
0
    }
2887
2888
0
    return bgzf_close(fp);
2889
0
}
2890
2891
// idx_save for on-the-fly indexes.  Mostly duplicated from above, except
2892
// idx is not const because we want to store the file handle in it, and
2893
// the index file handle is not closed.  This allows the index file to be
2894
// closed after the EOF block on the indexed file has been written out,
2895
// so the modification times on the two files will be in the correct order.
2896
int hts_idx_save_but_not_close(hts_idx_t *idx, const char *fnidx, int fmt)
2897
0
{
2898
0
    idx->otf_fp = bgzf_open(fnidx, (fmt == HTS_FMT_BAI)? "wu" : "w");
2899
0
    if (idx->otf_fp == NULL) return -1;
2900
2901
0
    if (hts_idx_write_out(idx, idx->otf_fp, fmt) < 0) {
2902
0
        int save_errno = errno;
2903
0
        bgzf_close(idx->otf_fp);
2904
0
        idx->otf_fp = NULL;
2905
0
        errno = save_errno;
2906
0
        return -1;
2907
0
    }
2908
2909
0
    return bgzf_flush(idx->otf_fp);
2910
0
}
2911
2912
static int hts_idx_close_otf_fp(hts_idx_t *idx)
2913
0
{
2914
0
    if (idx && idx->otf_fp) {
2915
0
        int ret = 0;
2916
0
        if (need_idx_ugly_delay_hack(idx)) {
2917
            // BAI index - write out the bytes we deferred earlier
2918
0
            ret = idx_write_uint64(idx->otf_fp, idx->n_no_coor) < 0;
2919
0
        }
2920
0
        ret |= bgzf_close(idx->otf_fp) < 0;
2921
0
        idx->otf_fp = NULL;
2922
0
        return ret == 0 ? 0 : -1;
2923
0
    }
2924
0
    return 0;
2925
0
}
2926
2927
static int idx_read_core(hts_idx_t *idx, BGZF *fp, int fmt)
2928
0
{
2929
0
    int32_t i, n, is_be;
2930
0
    is_be = ed_is_big();
2931
0
    if (idx == NULL) return -4;
2932
0
    for (i = 0; i < idx->n; ++i) {
2933
0
        bidx_t *h;
2934
0
        lidx_t *l = &idx->lidx[i];
2935
0
        uint32_t key;
2936
0
        int j, absent;
2937
0
        bins_t *p;
2938
0
        h = idx->bidx[i] = kh_init(bin);
2939
0
        if (bgzf_read(fp, &n, 4) != 4) return -1;
2940
0
        if (is_be) ed_swap_4p(&n);
2941
0
        if (n < 0) return -3;
2942
0
        for (j = 0; j < n; ++j) {
2943
0
            khint_t k;
2944
0
            if (bgzf_read(fp, &key, 4) != 4) return -1;
2945
0
            if (is_be) ed_swap_4p(&key);
2946
0
            k = kh_put(bin, h, key, &absent);
2947
0
            if (absent <  0) return -2; // No memory
2948
0
            if (absent == 0) return -3; // Duplicate bin number
2949
0
            p = &kh_val(h, k);
2950
0
            if (fmt == HTS_FMT_CSI) {
2951
0
                if (bgzf_read(fp, &p->loff, 8) != 8) return -1;
2952
0
                if (is_be) ed_swap_8p(&p->loff);
2953
0
            } else p->loff = 0;
2954
0
            if (bgzf_read(fp, &p->n, 4) != 4) return -1;
2955
0
            if (is_be) ed_swap_4p(&p->n);
2956
0
            if (p->n < 0) return -3;
2957
0
            if ((size_t) p->n > SIZE_MAX / sizeof(hts_pair64_t)) return -2;
2958
0
            p->m = p->n;
2959
0
            p->list = hts_malloc_p(sizeof(hts_pair64_t), p->m);
2960
0
            if (p->list == NULL) return -2;
2961
0
            if (bgzf_read(fp, p->list, ((size_t) p->n)<<4) != ((size_t) p->n)<<4) return -1;
2962
0
            if (is_be) swap_bins(p);
2963
0
        }
2964
0
        if (fmt != HTS_FMT_CSI) { // load linear index
2965
0
            int j, k;
2966
0
            uint32_t x;
2967
0
            if (bgzf_read(fp, &x, 4) != 4) return -1;
2968
0
            if (is_be) ed_swap_4p(&x);
2969
0
            l->n = x;
2970
0
            if (l->n < 0) return -3;
2971
0
            if ((size_t) l->n > SIZE_MAX / sizeof(uint64_t)) return -2;
2972
0
            l->m = l->n;
2973
0
            l->offset = hts_malloc_p(sizeof(uint64_t), l->n);
2974
0
            if (l->offset == NULL) return -2;
2975
0
            if (bgzf_read(fp, l->offset, l->n << 3) != l->n << 3) return -1;
2976
0
            if (is_be) for (j = 0; j < l->n; ++j) ed_swap_8p(&l->offset[j]);
2977
0
            for (k = j = 0; j < l->n && l->offset[j] == 0; k = ++j); // stop at the first non-zero entry
2978
0
            for (j = l->n-1; j > k; j--) // fill missing values; may happen given older samtools and tabix
2979
0
                if (l->offset[j-1] == 0) l->offset[j-1] = l->offset[j];
2980
0
            update_loff(idx, i, 0);
2981
0
        }
2982
0
    }
2983
0
    if (bgzf_read(fp, &idx->n_no_coor, 8) != 8) idx->n_no_coor = 0;
2984
0
    if (is_be) ed_swap_8p(&idx->n_no_coor);
2985
#ifdef DEBUG_INDEX
2986
    idx_dump(idx);
2987
#endif
2988
2989
0
    return 0;
2990
0
}
2991
2992
static hts_idx_t *idx_read(const char *fn)
2993
0
{
2994
0
    uint8_t magic[4];
2995
0
    int i, is_be;
2996
0
    hts_idx_t *idx = NULL;
2997
0
    uint8_t *meta = NULL;
2998
0
    BGZF *fp = bgzf_open(fn, "r");
2999
0
    if (fp == NULL) return NULL;
3000
0
    is_be = ed_is_big();
3001
0
    if (bgzf_read(fp, magic, 4) != 4) goto fail;
3002
3003
0
    if (memcmp(magic, "CSI\1", 4) == 0) {
3004
0
        uint32_t x[3], n;
3005
0
        if (bgzf_read(fp, x, 12) != 12) goto fail;
3006
0
        if (is_be) for (i = 0; i < 3; ++i) ed_swap_4p(&x[i]);
3007
0
        if (x[2]) {
3008
0
            if (SIZE_MAX - x[2] < 1) goto fail; // Prevent possible overflow
3009
0
            if ((meta = hts_malloc_ps(sizeof(*meta), x[2], 1)) == NULL) goto fail;
3010
0
            if (bgzf_read(fp, meta, x[2]) != x[2]) goto fail;
3011
            // Prevent possible strlen past the end in tbx_index_load2
3012
0
            meta[x[2]] = '\0';
3013
0
        }
3014
0
        if (bgzf_read(fp, &n, 4) != 4) goto fail;
3015
0
        if (is_be) ed_swap_4p(&n);
3016
0
        if (n > INT32_MAX) goto fail;
3017
0
        if ((idx = hts_idx_init(n, HTS_FMT_CSI, 0, x[0], x[1])) == NULL) goto fail;
3018
0
        idx->l_meta = x[2];
3019
0
        idx->meta = meta;
3020
0
        meta = NULL;
3021
0
        if (idx_read_core(idx, fp, HTS_FMT_CSI) < 0) goto fail;
3022
0
    }
3023
0
    else if (memcmp(magic, "TBI\1", 4) == 0) {
3024
0
        uint8_t x[8 * 4];
3025
0
        uint32_t n;
3026
        // Read file header
3027
0
        if (bgzf_read(fp, x, sizeof(x)) != sizeof(x)) goto fail;
3028
0
        n = le_to_u32(&x[0]); // location of n_ref
3029
0
        if (n > INT32_MAX) goto fail;
3030
0
        if ((idx = hts_idx_init(n, HTS_FMT_TBI, 0, 14, 5)) == NULL) goto fail;
3031
0
        n = le_to_u32(&x[7*4]); // location of l_nm
3032
0
        if (n > UINT32_MAX - 29) goto fail; // Prevent possible overflow
3033
0
        idx->l_meta = 28 + n;
3034
0
        if ((idx->meta = hts_malloc_ps(sizeof(*idx->meta), idx->l_meta, 1)) == NULL) goto fail;
3035
        // copy format, col_seq, col_beg, col_end, meta, skip, l_nm
3036
        // N.B. left in little-endian byte order.
3037
0
        memcpy(idx->meta, &x[1*4], 28);
3038
        // Read in sequence names.
3039
0
        if (bgzf_read(fp, idx->meta + 28, n) != n) goto fail;
3040
        // Prevent possible strlen past the end in tbx_index_load2
3041
0
        idx->meta[idx->l_meta] = '\0';
3042
0
        if (idx_read_core(idx, fp, HTS_FMT_TBI) < 0) goto fail;
3043
0
    }
3044
0
    else if (memcmp(magic, "BAI\1", 4) == 0) {
3045
0
        uint32_t n;
3046
0
        if (bgzf_read(fp, &n, 4) != 4) goto fail;
3047
0
        if (is_be) ed_swap_4p(&n);
3048
0
        if (n > INT32_MAX) goto fail;
3049
0
        if ((idx = hts_idx_init(n, HTS_FMT_BAI, 0, 14, 5)) == NULL) goto fail;
3050
0
        if (idx_read_core(idx, fp, HTS_FMT_BAI) < 0) goto fail;
3051
0
    }
3052
0
    else { errno = EINVAL; goto fail; }
3053
3054
0
    bgzf_close(fp);
3055
0
    return idx;
3056
3057
0
fail:
3058
0
    bgzf_close(fp);
3059
0
    hts_idx_destroy(idx);
3060
0
    free(meta);
3061
0
    return NULL;
3062
0
}
3063
3064
int hts_idx_set_meta(hts_idx_t *idx, uint32_t l_meta, uint8_t *meta,
3065
                      int is_copy)
3066
0
{
3067
0
    uint8_t *new_meta = meta;
3068
0
    if (is_copy) {
3069
0
        size_t l = l_meta;
3070
0
        if (l > SIZE_MAX - 1) {
3071
0
            errno = ENOMEM;
3072
0
            return -1;
3073
0
        }
3074
0
        new_meta = hts_malloc_ps(sizeof(*new_meta), l, 1);
3075
0
        if (!new_meta) return -1;
3076
0
        memcpy(new_meta, meta, l);
3077
        // Prevent possible strlen past the end in tbx_index_load2
3078
0
        new_meta[l] = '\0';
3079
0
    }
3080
0
    if (idx->meta) free(idx->meta);
3081
0
    idx->l_meta = l_meta;
3082
0
    idx->meta = new_meta;
3083
0
    return 0;
3084
0
}
3085
3086
uint8_t *hts_idx_get_meta(hts_idx_t *idx, uint32_t *l_meta)
3087
0
{
3088
0
    *l_meta = idx->l_meta;
3089
0
    return idx->meta;
3090
0
}
3091
3092
const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr)
3093
0
{
3094
0
    if ( !idx || !idx->n )
3095
0
    {
3096
0
        *n = 0;
3097
0
        return NULL;
3098
0
    }
3099
3100
0
    int tid = 0, i;
3101
0
    const char **names = (const char**) calloc(idx->n,sizeof(const char*));
3102
0
    for (i=0; i<idx->n; i++)
3103
0
    {
3104
0
        bidx_t *bidx = idx->bidx[i];
3105
0
        if ( !bidx ) continue;
3106
0
        names[tid++] = getid(hdr,i);
3107
0
    }
3108
0
    *n = tid;
3109
0
    return names;
3110
0
}
3111
3112
0
int hts_idx_nseq(const hts_idx_t *idx) {
3113
0
    if (!idx) return -1;
3114
0
    return idx->n;
3115
0
}
3116
3117
int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped, uint64_t* unmapped)
3118
0
{
3119
0
    if (!idx) return -1;
3120
0
    if ( idx->fmt == HTS_FMT_CRAI ) {
3121
0
        *mapped = 0; *unmapped = 0;
3122
0
        return -1;
3123
0
    }
3124
3125
0
    bidx_t *h = idx->bidx[tid];
3126
0
    if (!h) return -1;
3127
0
    khint_t k = kh_get(bin, h, META_BIN(idx));
3128
0
    if (k != kh_end(h)) {
3129
0
        *mapped = kh_val(h, k).list[1].u;
3130
0
        *unmapped = kh_val(h, k).list[1].v;
3131
0
        return 0;
3132
0
    } else {
3133
0
        *mapped = 0; *unmapped = 0;
3134
0
        return -1;
3135
0
    }
3136
0
}
3137
3138
uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx)
3139
0
{
3140
0
    if (idx->fmt == HTS_FMT_CRAI) return 0;
3141
0
    return idx->n_no_coor;
3142
0
}
3143
3144
/****************
3145
 *** Iterator ***
3146
 ****************/
3147
3148
// Note: even with 32-bit hts_pos_t, end needs to be 64-bit here due to 1LL<<s.
3149
static inline int reg2bins_narrow(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls, bidx_t *bidx)
3150
0
{
3151
0
    int l, t, s = min_shift + (n_lvls<<1) + n_lvls;
3152
0
    for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
3153
0
        hts_pos_t b, e;
3154
0
        int i;
3155
0
        b = t + (beg>>s); e = t + (end>>s);
3156
0
        for (i = b; i <= e; ++i) {
3157
0
            if (kh_get(bin, bidx, i) != kh_end(bidx)) {
3158
0
                assert(itr->bins.n < itr->bins.m);
3159
0
                itr->bins.a[itr->bins.n++] = i;
3160
0
            }
3161
0
        }
3162
0
    }
3163
0
    return itr->bins.n;
3164
0
}
3165
3166
static inline int reg2bins_wide(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls, bidx_t *bidx)
3167
0
{
3168
0
    khint_t i;
3169
0
    hts_pos_t max_shift = 3 * n_lvls + min_shift;
3170
0
    --end;
3171
0
    if (beg < 0) beg = 0;
3172
0
    for (i = kh_begin(bidx); i != kh_end(bidx); i++) {
3173
0
        if (!kh_exist(bidx, i)) continue;
3174
0
        hts_pos_t bin = (hts_pos_t) kh_key(bidx, i);
3175
0
        int level = hts_bin_level(bin);
3176
0
        if (level > n_lvls) continue; // Dodgy index?
3177
0
        hts_pos_t first = hts_bin_first(level);
3178
0
        hts_pos_t beg_at_level = first + (beg >> (max_shift - 3 * level));
3179
0
        hts_pos_t end_at_level = first + (end >> (max_shift - 3 * level));
3180
0
        if (beg_at_level <= bin && bin <= end_at_level) {
3181
0
            assert(itr->bins.n < itr->bins.m);
3182
0
            itr->bins.a[itr->bins.n++] = bin;
3183
0
        }
3184
0
    }
3185
0
    return itr->bins.n;
3186
0
}
3187
3188
static inline int reg2bins(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls, bidx_t *bidx)
3189
0
{
3190
0
    int l, s = min_shift + (n_lvls<<1) + n_lvls;
3191
0
    size_t reg_bin_count = 0, hash_bin_count = kh_n_buckets(bidx), max_bins;
3192
0
    hts_pos_t end1;
3193
0
    if (end >= 1LL<<s) end = 1LL<<s;
3194
0
    if (beg >= end) return 0;
3195
0
    end1 = end - 1;
3196
3197
    // Count bins to see if it's faster to iterate through the hash table
3198
    // or the set of bins covering the region
3199
0
    for (l = 0; l <= n_lvls; s -= 3, ++l) {
3200
0
        reg_bin_count += (end1 >> s) - (beg >> s) + 1;
3201
0
    }
3202
0
    max_bins = reg_bin_count < kh_size(bidx) ? reg_bin_count : kh_size(bidx);
3203
0
    if (itr->bins.m - itr->bins.n < max_bins) {
3204
        // Worst-case memory usage.  May be wasteful on very sparse
3205
        // data, but the bin list usually won't be too big anyway.
3206
0
        size_t new_m = max_bins + itr->bins.n;
3207
0
        if (new_m > INT_MAX || new_m > SIZE_MAX / sizeof(int)) {
3208
0
            errno = ENOMEM;
3209
0
            return -1;
3210
0
        }
3211
0
        int *new_a = hts_realloc_p(itr->bins.a, sizeof(*new_a), new_m);
3212
0
        if (!new_a) return -1;
3213
0
        itr->bins.a = new_a;
3214
0
        itr->bins.m = new_m;
3215
0
    }
3216
0
    if (reg_bin_count < hash_bin_count) {
3217
0
        return reg2bins_narrow(beg, end, itr, min_shift, n_lvls, bidx);
3218
0
    } else {
3219
0
        return reg2bins_wide(beg, end, itr, min_shift, n_lvls, bidx);
3220
0
    }
3221
0
}
3222
3223
static inline int add_to_interval(hts_itr_t *iter, bins_t *bin,
3224
                                  int tid, uint32_t interval,
3225
                                  uint64_t min_off, uint64_t max_off)
3226
0
{
3227
0
    hts_pair64_max_t *off;
3228
0
    int j;
3229
3230
0
    if (!bin->n)
3231
0
        return 0;
3232
0
    off = hts_realloc_ps(iter->off, sizeof(*off), iter->n_off, bin->n);
3233
0
    if (!off)
3234
0
        return -2;
3235
3236
0
    iter->off = off;
3237
0
    for (j = 0; j < bin->n; ++j) {
3238
0
        if (bin->list[j].v > min_off && bin->list[j].u < max_off) {
3239
0
            iter->off[iter->n_off].u = min_off > bin->list[j].u
3240
0
                ? min_off : bin->list[j].u;
3241
0
            iter->off[iter->n_off].v = max_off < bin->list[j].v
3242
0
                ? max_off : bin->list[j].v;
3243
            // hts_pair64_max_t::max is now used to link
3244
            // file offsets to region list entries.
3245
            // The iterator can use this to decide if it
3246
            // can skip some file regions.
3247
0
            iter->off[iter->n_off].max = ((uint64_t) tid << 32) | interval;
3248
0
            iter->n_off++;
3249
0
        }
3250
0
    }
3251
0
    return 0;
3252
0
}
3253
3254
static inline int reg2intervals_narrow(hts_itr_t *iter, const bidx_t *bidx,
3255
                                       int tid, int64_t beg, int64_t end,
3256
                                       uint32_t interval,
3257
                                       uint64_t min_off, uint64_t max_off,
3258
                                       int min_shift, int n_lvls)
3259
0
{
3260
0
    int l, t, s = min_shift + n_lvls * 3;
3261
0
    hts_pos_t b, e, i;
3262
3263
0
    for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
3264
0
        b = t + (beg>>s); e = t + (end>>s);
3265
0
        for (i = b; i <= e; ++i) {
3266
0
            khint_t k = kh_get(bin, bidx, i);
3267
0
            if (k != kh_end(bidx)) {
3268
0
                bins_t *bin = &kh_value(bidx, k);
3269
0
                int res = add_to_interval(iter, bin, tid, interval, min_off, max_off);
3270
0
                if (res < 0)
3271
0
                    return res;
3272
0
            }
3273
0
        }
3274
0
    }
3275
0
    return 0;
3276
0
}
3277
3278
static inline int reg2intervals_wide(hts_itr_t *iter, const bidx_t *bidx,
3279
                                       int tid, int64_t beg, int64_t end,
3280
                                       uint32_t interval,
3281
                                       uint64_t min_off, uint64_t max_off,
3282
                                       int min_shift, int n_lvls)
3283
0
{
3284
0
    khint_t i;
3285
0
    hts_pos_t max_shift = 3 * n_lvls + min_shift;
3286
0
    --end;
3287
0
    if (beg < 0) beg = 0;
3288
0
    for (i = kh_begin(bidx); i != kh_end(bidx); i++) {
3289
0
        if (!kh_exist(bidx, i)) continue;
3290
0
        hts_pos_t bin = (hts_pos_t) kh_key(bidx, i);
3291
0
        int level = hts_bin_level(bin);
3292
0
        if (level > n_lvls) continue; // Dodgy index?
3293
0
        hts_pos_t first = hts_bin_first(level);
3294
0
        hts_pos_t beg_at_level = first + (beg >> (max_shift - 3 * level));
3295
0
        hts_pos_t end_at_level = first + (end >> (max_shift - 3 * level));
3296
0
        if (beg_at_level <= bin && bin <= end_at_level) {
3297
0
            bins_t *bin = &kh_value(bidx, i);
3298
0
            int res = add_to_interval(iter, bin, tid, interval, min_off, max_off);
3299
0
            if (res < 0)
3300
0
                return res;
3301
0
        }
3302
0
    }
3303
0
    return 0;
3304
0
}
3305
3306
static inline int reg2intervals(hts_itr_t *iter, const hts_idx_t *idx, int tid, int64_t beg, int64_t end, uint32_t interval, uint64_t min_off, uint64_t max_off, int min_shift, int n_lvls)
3307
0
{
3308
0
    int l, s;
3309
0
    int i, j;
3310
0
    hts_pos_t end1;
3311
0
    bidx_t *bidx;
3312
0
    int start_n_off;
3313
0
    size_t reg_bin_count = 0, hash_bin_count;
3314
0
    int res;
3315
3316
0
    if (!iter || !idx || (bidx = idx->bidx[tid]) == NULL || beg > end)
3317
0
        return -1;
3318
3319
0
    hash_bin_count = kh_n_buckets(bidx);
3320
3321
0
    s = min_shift + (n_lvls<<1) + n_lvls;
3322
0
    if (end >= 1LL<<s)
3323
0
        end = 1LL<<s;
3324
3325
0
    end1 = end - 1;
3326
    // Count bins to see if it's faster to iterate through the hash table
3327
    // or the set of bins covering the region
3328
0
    for (l = 0; l <= n_lvls; s -= 3, ++l) {
3329
0
        reg_bin_count += (end1 >> s) - (beg >> s) + 1;
3330
0
    }
3331
3332
0
    start_n_off = iter->n_off;
3333
3334
    // Populate iter->off with the intervals for this region
3335
0
    if (reg_bin_count < hash_bin_count) {
3336
0
        res = reg2intervals_narrow(iter, bidx, tid, beg, end, interval,
3337
0
                                   min_off, max_off, min_shift, n_lvls);
3338
0
    } else {
3339
0
        res = reg2intervals_wide(iter, bidx, tid, beg, end, interval,
3340
0
                                 min_off, max_off, min_shift, n_lvls);
3341
0
    }
3342
0
    if (res < 0)
3343
0
        return res;
3344
3345
0
    if (iter->n_off - start_n_off > 1) {
3346
0
        ks_introsort(_off_max, iter->n_off - start_n_off, iter->off + start_n_off);
3347
0
        for (i = start_n_off, j = start_n_off + 1; j < iter->n_off; j++) {
3348
0
            if (iter->off[i].v >= iter->off[j].u) {
3349
0
                if (iter->off[i].v < iter->off[j].v)
3350
0
                    iter->off[i].v = iter->off[j].v;
3351
0
            } else {
3352
0
                i++;
3353
0
                if (i < j)
3354
0
                    iter->off[i] = iter->off[j];
3355
0
            }
3356
0
        }
3357
0
        iter->n_off = i + 1;
3358
0
    }
3359
3360
0
    return iter->n_off;
3361
0
}
3362
3363
0
static int compare_regions(const void *r1, const void *r2) {
3364
0
    hts_reglist_t *reg1 = (hts_reglist_t *)r1;
3365
0
    hts_reglist_t *reg2 = (hts_reglist_t *)r2;
3366
3367
0
    if (reg1->tid < 0 && reg2->tid >= 0)
3368
0
        return 1;
3369
0
    else if (reg1->tid >= 0 && reg2->tid < 0)
3370
0
        return -1;
3371
0
    else
3372
0
        return reg1->tid - reg2->tid;
3373
0
}
3374
3375
0
uint64_t hts_itr_off(const hts_idx_t* idx, int tid) {
3376
3377
0
    int i;
3378
0
    bidx_t* bidx;
3379
0
    uint64_t off0 = (uint64_t) -1;
3380
0
    khint_t k;
3381
0
    switch (tid) {
3382
0
    case HTS_IDX_START:
3383
        // Find the smallest offset, note that sequence ids may not be ordered sequentially
3384
0
        for (i = 0; i < idx->n; i++) {
3385
0
            bidx = idx->bidx[i];
3386
0
            k = kh_get(bin, bidx, META_BIN(idx));
3387
0
            if (k == kh_end(bidx))
3388
0
                continue;
3389
3390
0
            if (off0 > kh_val(bidx, k).list[0].u)
3391
0
                off0 = kh_val(bidx, k).list[0].u;
3392
0
        }
3393
0
        if (off0 == (uint64_t) -1 && idx->n_no_coor)
3394
0
            off0 = 0;
3395
        // only no-coor reads in this bam
3396
0
        break;
3397
0
    case HTS_IDX_NOCOOR:
3398
        /* No-coor reads sort after all of the mapped reads.  The position
3399
           is not stored in the index itself, so need to find the end
3400
           offset for the last mapped read.  A loop is needed here in
3401
           case references at the end of the file have no mapped reads,
3402
           or sequence ids are not ordered sequentially.
3403
           See issue samtools#568 and commits b2aab8, 60c22d and cc207d. */
3404
0
        for (i = 0; i < idx->n; i++) {
3405
0
            bidx = idx->bidx[i];
3406
0
            k = kh_get(bin, bidx, META_BIN(idx));
3407
0
            if (k != kh_end(bidx)) {
3408
0
                if (off0 == (uint64_t) -1 || off0 < kh_val(bidx, k).list[0].v) {
3409
0
                    off0 = kh_val(bidx, k).list[0].v;
3410
0
                }
3411
0
            }
3412
0
        }
3413
0
        if (off0 == (uint64_t) -1 && idx->n_no_coor)
3414
0
            off0 = 0;
3415
        // only no-coor reads in this bam
3416
0
        break;
3417
0
    case HTS_IDX_REST:
3418
0
        off0 = 0;
3419
0
        break;
3420
0
    case HTS_IDX_NONE:
3421
0
        off0 = 0;
3422
0
        break;
3423
0
    }
3424
3425
0
    return off0;
3426
0
}
3427
3428
hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec)
3429
0
{
3430
0
    int i, n_off, l, bin;
3431
0
    hts_pair64_max_t *off;
3432
0
    khint_t k;
3433
0
    bidx_t *bidx;
3434
0
    uint64_t min_off, max_off;
3435
0
    hts_pos_t idx_maxpos;
3436
0
    hts_itr_t *iter;
3437
0
    uint32_t unmapped = 0, rel_off;
3438
3439
    // It's possible to call this function with NULL idx iff
3440
    // tid is one of the special values HTS_IDX_REST or HTS_IDX_NONE
3441
0
    if (!idx && !(tid == HTS_IDX_REST || tid == HTS_IDX_NONE)) {
3442
0
        errno = EINVAL;
3443
0
        return NULL;
3444
0
    }
3445
3446
0
    iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t));
3447
0
    if (iter) {
3448
0
        if (tid < 0) {
3449
0
            uint64_t off = hts_itr_off(idx, tid);
3450
0
            if (off != (uint64_t) -1) {
3451
0
                iter->read_rest = 1;
3452
0
                iter->curr_off = off;
3453
0
                iter->readrec = readrec;
3454
0
                if (tid == HTS_IDX_NONE)
3455
0
                    iter->finished = 1;
3456
0
            } else {
3457
0
                free(iter);
3458
0
                iter = NULL;
3459
0
            }
3460
0
        } else if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL) {
3461
0
            iter->finished = 1;
3462
0
        } else {
3463
0
            if (beg < 0) beg = 0;
3464
0
            if (end < beg) {
3465
0
              free(iter);
3466
0
              return NULL;
3467
0
            }
3468
3469
0
            k = kh_get(bin, bidx, META_BIN(idx));
3470
0
            if (k != kh_end(bidx))
3471
0
                unmapped = kh_val(bidx, k).list[1].v;
3472
0
            else
3473
0
                unmapped = 1;
3474
3475
0
            iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
3476
0
            iter->readrec = readrec;
3477
3478
0
            if ( !kh_size(bidx) ) { iter->finished = 1; return iter; }
3479
3480
0
            idx_maxpos = hts_idx_maxpos(idx);
3481
0
            if (beg >= idx_maxpos) { iter->finished = 1; return iter; }
3482
3483
0
            rel_off = beg>>idx->min_shift;
3484
            // compute min_off
3485
0
            bin = hts_bin_first(idx->n_lvls) + rel_off;
3486
0
            do {
3487
0
                int first;
3488
0
                k = kh_get(bin, bidx, bin);
3489
0
                if (k != kh_end(bidx)) break;
3490
0
                first = (hts_bin_parent(bin)<<3) + 1;
3491
0
                if (bin > first) --bin;
3492
0
                else bin = hts_bin_parent(bin);
3493
0
            } while (bin);
3494
0
            if (bin == 0) k = kh_get(bin, bidx, bin);
3495
0
            min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0;
3496
            // min_off can be calculated more accurately if the
3497
            // linear index is available
3498
0
            if (idx->lidx[tid].offset
3499
0
                && rel_off < idx->lidx[tid].n) {
3500
0
                if (min_off < idx->lidx[tid].offset[rel_off])
3501
0
                    min_off = idx->lidx[tid].offset[rel_off];
3502
0
                if (unmapped) {
3503
                    // unmapped reads are not covered by the linear index,
3504
                    // so search backwards for a smaller offset
3505
0
                    int tmp_off;
3506
0
                    for (tmp_off = rel_off-1; tmp_off >= 0; tmp_off--) {
3507
0
                        if (idx->lidx[tid].offset[tmp_off] < min_off) {
3508
0
                            min_off = idx->lidx[tid].offset[tmp_off];
3509
0
                            break;
3510
0
                        }
3511
0
                    }
3512
                    // if the search went too far back or no satisfactory entry
3513
                    // was found, revert to the bin index loff value
3514
0
                    if (k != kh_end(bidx) && (min_off < kh_val(bidx, k).loff || tmp_off < 0))
3515
0
                        min_off = kh_val(bidx, k).loff;
3516
0
                }
3517
0
            } else if (unmapped) { //CSI index
3518
0
                if (k != kh_end(bidx))
3519
0
                    min_off = kh_val(bidx, k).loff;
3520
0
            }
3521
3522
            // compute max_off: a virtual offset from a bin to the right of end
3523
            // First check if end lies within the range of the index (it won't
3524
            // if it's HTS_POS_MAX)
3525
0
            if (end <= idx_maxpos) {
3526
0
                bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1;
3527
0
                if (bin >= idx->n_bins) bin = 0;
3528
0
                while (1) {
3529
                    // search for an extant bin by moving right, but moving up to the
3530
                    // parent whenever we get to a first child (which also covers falling
3531
                    // off the RHS, which wraps around and immediately goes up to bin 0)
3532
0
                    while (bin % 8 == 1) bin = hts_bin_parent(bin);
3533
0
                    if (bin == 0) { max_off = UINT64_MAX; break; }
3534
0
                    k = kh_get(bin, bidx, bin);
3535
0
                    if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) { max_off = kh_val(bidx, k).list[0].u; break; }
3536
0
                    bin++;
3537
0
                }
3538
0
            } else {
3539
                // Searching to end of reference
3540
0
                max_off = UINT64_MAX;
3541
0
            }
3542
3543
            // retrieve bins
3544
0
            if (reg2bins(beg, end, iter, idx->min_shift, idx->n_lvls, bidx) < 0) {
3545
0
                hts_itr_destroy(iter);
3546
0
                return NULL;
3547
0
            }
3548
3549
0
            for (i = n_off = 0; i < iter->bins.n; ++i)
3550
0
                if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx))
3551
0
                    n_off += kh_value(bidx, k).n;
3552
0
            if (n_off == 0) {
3553
                // No overlapping bins means the iterator has already finished.
3554
0
                iter->finished = 1;
3555
0
                return iter;
3556
0
            }
3557
0
            off = calloc(n_off, sizeof(*off));
3558
0
            for (i = n_off = 0; i < iter->bins.n; ++i) {
3559
0
                if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx)) {
3560
0
                    int j;
3561
0
                    bins_t *p = &kh_value(bidx, k);
3562
0
                    for (j = 0; j < p->n; ++j)
3563
0
                        if (p->list[j].v > min_off && p->list[j].u < max_off) {
3564
0
                            off[n_off].u = min_off > p->list[j].u
3565
0
                                ? min_off : p->list[j].u;
3566
0
                            off[n_off].v = max_off < p->list[j].v
3567
0
                                ? max_off : p->list[j].v;
3568
                            // hts_pair64_max_t::max is now used to link
3569
                            // file offsets to region list entries.
3570
                            // The iterator can use this to decide if it
3571
                            // can skip some file regions.
3572
0
                            off[n_off].max = ((uint64_t) tid << 32) | j;
3573
0
                            n_off++;
3574
0
                        }
3575
0
                }
3576
0
            }
3577
3578
0
            if (n_off == 0) {
3579
0
                free(off);
3580
0
                iter->finished = 1;
3581
0
                return iter;
3582
0
            }
3583
0
            ks_introsort(_off_max, n_off, off);
3584
            // resolve completely contained adjacent blocks
3585
0
            for (i = 1, l = 0; i < n_off; ++i)
3586
0
                if (off[l].v < off[i].v) off[++l] = off[i];
3587
0
            n_off = l + 1;
3588
            // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
3589
0
            for (i = 1; i < n_off; ++i)
3590
0
                if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
3591
            // merge adjacent blocks
3592
0
            for (i = 1, l = 0; i < n_off; ++i) {
3593
0
                if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
3594
0
                else off[++l] = off[i];
3595
0
            }
3596
0
            n_off = l + 1;
3597
0
            iter->n_off = n_off; iter->off = off;
3598
0
        }
3599
0
    }
3600
3601
0
    return iter;
3602
0
}
3603
3604
int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter)
3605
0
{
3606
0
    int i, j, bin;
3607
0
    khint_t k;
3608
0
    bidx_t *bidx;
3609
0
    uint64_t min_off, max_off, t_off = (uint64_t)-1;
3610
0
    int tid;
3611
0
    hts_pos_t beg, end, idx_maxpos;
3612
0
    hts_reglist_t *curr_reg;
3613
0
    uint32_t unmapped = 0, rel_off;
3614
3615
0
    if (!idx || !iter || !iter->multi)
3616
0
        return -1;
3617
3618
0
    iter->i = -1;
3619
0
    for (i=0; i<iter->n_reg; i++) {
3620
3621
0
        curr_reg = &iter->reg_list[i];
3622
0
        tid = curr_reg->tid;
3623
3624
0
        if (tid < 0) {
3625
0
            t_off = hts_itr_off(idx, tid);
3626
0
            if (t_off != (uint64_t)-1) {
3627
0
                switch (tid) {
3628
0
                case HTS_IDX_NONE:
3629
0
                    iter->finished = 1;
3630
                    // fall through
3631
0
                case HTS_IDX_START:
3632
0
                case HTS_IDX_REST:
3633
0
                    iter->curr_off = t_off;
3634
0
                    iter->n_reg = 0;
3635
0
                    iter->reg_list = NULL;
3636
0
                    iter->read_rest = 1;
3637
0
                    return 0;
3638
0
                case HTS_IDX_NOCOOR:
3639
0
                    iter->nocoor = 1;
3640
0
                    iter->nocoor_off = t_off;
3641
0
                }
3642
0
            }
3643
0
        } else {
3644
0
            if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL || !kh_size(bidx))
3645
0
                continue;
3646
3647
0
            k = kh_get(bin, bidx, META_BIN(idx));
3648
0
            if (k != kh_end(bidx))
3649
0
                unmapped = kh_val(bidx, k).list[1].v;
3650
0
            else
3651
0
                unmapped = 1;
3652
3653
0
            idx_maxpos = hts_idx_maxpos(idx);
3654
3655
0
            for(j=0; j<curr_reg->count; j++) {
3656
0
                hts_pair32_t *curr_intv = &curr_reg->intervals[j];
3657
0
                if (curr_intv->end < curr_intv->beg)
3658
0
                    continue;
3659
3660
0
                beg = curr_intv->beg;
3661
0
                end = curr_intv->end;
3662
0
                if (beg >= idx_maxpos)
3663
0
                    continue;
3664
0
                rel_off = beg>>idx->min_shift;
3665
3666
                /* Compute 'min_off' by searching the lowest level bin containing 'beg'.
3667
                       If the computed bin is not in the index, try the next bin to the
3668
                       left, belonging to the same parent. If it is the first sibling bin,
3669
                       try the parent bin. */
3670
0
                bin = hts_bin_first(idx->n_lvls) + rel_off;
3671
0
                do {
3672
0
                    int first;
3673
0
                    k = kh_get(bin, bidx, bin);
3674
0
                    if (k != kh_end(bidx)) break;
3675
0
                    first = (hts_bin_parent(bin)<<3) + 1;
3676
0
                    if (bin > first) --bin;
3677
0
                    else bin = hts_bin_parent(bin);
3678
0
                } while (bin);
3679
0
                if (bin == 0)
3680
0
                    k = kh_get(bin, bidx, bin);
3681
0
                min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0;
3682
                // min_off can be calculated more accurately if the
3683
                // linear index is available
3684
0
                if (idx->lidx[tid].offset
3685
0
                    && rel_off < idx->lidx[tid].n) {
3686
0
                    if (min_off < idx->lidx[tid].offset[rel_off])
3687
0
                        min_off = idx->lidx[tid].offset[rel_off];
3688
0
                    if (unmapped) {
3689
0
                        int tmp_off;
3690
0
                        for (tmp_off = rel_off-1; tmp_off >= 0; tmp_off--) {
3691
0
                            if (idx->lidx[tid].offset[tmp_off] < min_off) {
3692
0
                                min_off = idx->lidx[tid].offset[tmp_off];
3693
0
                                break;
3694
0
                            }
3695
0
                        }
3696
3697
0
                        if (k != kh_end(bidx) && (min_off < kh_val(bidx, k).loff || tmp_off < 0))
3698
0
                            min_off = kh_val(bidx, k).loff;
3699
0
                    }
3700
0
                } else if (unmapped) { //CSI index
3701
0
                    if (k != kh_end(bidx))
3702
0
                        min_off = kh_val(bidx, k).loff;
3703
0
                }
3704
3705
                // compute max_off: a virtual offset from a bin to the right of end
3706
                // First check if end lies within the range of the index (it
3707
                // won't if it's HTS_POS_MAX)
3708
0
                if (end <= idx_maxpos) {
3709
0
                    bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1;
3710
0
                    if (bin >= idx->n_bins) bin = 0;
3711
0
                    while (1) {
3712
                        // search for an extant bin by moving right, but moving up to the
3713
                        // parent whenever we get to a first child (which also covers falling
3714
                        // off the RHS, which wraps around and immediately goes up to bin 0)
3715
0
                        while (bin % 8 == 1) bin = hts_bin_parent(bin);
3716
0
                        if (bin == 0) { max_off = UINT64_MAX; break; }
3717
0
                        k = kh_get(bin, bidx, bin);
3718
0
                        if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) {
3719
0
                            max_off = kh_val(bidx, k).list[0].u;
3720
0
                            break;
3721
0
                        }
3722
0
                        bin++;
3723
0
                    }
3724
0
                } else {
3725
                    // Searching to end of reference
3726
0
                    max_off = UINT64_MAX;
3727
0
                }
3728
3729
                //convert coordinates to file offsets
3730
0
                if (reg2intervals(iter, idx, tid, beg, end, j,
3731
0
                                  min_off, max_off,
3732
0
                                  idx->min_shift, idx->n_lvls) < 0) {
3733
0
                    return -1;
3734
0
                }
3735
0
            }
3736
0
        }
3737
0
    }
3738
3739
0
    if (iter->n_off > 1)
3740
0
        ks_introsort(_off_max, iter->n_off, iter->off);
3741
3742
0
    if(!iter->n_off && !iter->nocoor)
3743
0
        iter->finished = 1;
3744
3745
0
    return 0;
3746
0
}
3747
3748
int hts_itr_multi_cram(const hts_idx_t *idx, hts_itr_t *iter)
3749
0
{
3750
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
3751
0
    int tid, i, n_off = 0;
3752
0
    uint32_t j;
3753
0
    hts_pos_t beg, end;
3754
0
    hts_reglist_t *curr_reg;
3755
0
    hts_pair32_t *curr_intv;
3756
0
    hts_pair64_max_t *off = NULL, *tmp;
3757
0
    cram_index *e = NULL;
3758
3759
0
    if (!cidx || !iter || !iter->multi)
3760
0
        return -1;
3761
3762
0
    iter->is_cram = 1;
3763
0
    iter->read_rest = 0;
3764
0
    iter->off = NULL;
3765
0
    iter->n_off = 0;
3766
0
    iter->curr_off = 0;
3767
0
    iter->i = -1;
3768
3769
0
    for (i=0; i<iter->n_reg; i++) {
3770
3771
0
        curr_reg = &iter->reg_list[i];
3772
0
        tid = curr_reg->tid;
3773
3774
0
        if (tid >= 0) {
3775
0
            tmp = hts_realloc_ps(off, sizeof(*off), n_off, curr_reg->count);
3776
0
            if (!tmp)
3777
0
                goto err;
3778
0
            off = tmp;
3779
3780
0
            for (j=0; j < curr_reg->count; j++) {
3781
0
                curr_intv = &curr_reg->intervals[j];
3782
0
                if (curr_intv->end < curr_intv->beg)
3783
0
                    continue;
3784
3785
0
                beg = curr_intv->beg;
3786
0
                end = curr_intv->end;
3787
3788
/* First, fetch the container overlapping 'beg' and assign its file offset to u, then
3789
 * find the container overlapping 'end' and assign the relative end of the slice to v.
3790
 * The cram_ptell function will adjust with the container offset, which is not stored
3791
 * in the index.
3792
 */
3793
0
                e = cram_index_query(cidx->cram, tid, beg+1, NULL);
3794
0
                if (e) {
3795
0
                    off[n_off].u = e->offset;
3796
                    // hts_pair64_max_t::max is now used to link
3797
                    // file offsets to region list entries.
3798
                    // The iterator can use this to decide if it
3799
                    // can skip some file regions.
3800
0
                    off[n_off].max = ((uint64_t) tid << 32) | j;
3801
3802
0
                    if (end >= HTS_POS_MAX) {
3803
0
                       e = cram_index_last(cidx->cram, tid, NULL);
3804
0
                    } else {
3805
0
                       e = cram_index_query_last(cidx->cram, tid, end+1);
3806
0
                    }
3807
3808
0
                    if (e) {
3809
0
                        off[n_off++].v = e->e_next
3810
0
                            ? e->e_next->offset
3811
0
                            : e->offset + e->slice + e->len;
3812
0
                    } else {
3813
0
                        hts_log_warning("Could not set offset end for region %d:%"PRIhts_pos"-%"PRIhts_pos". Skipping", tid, beg, end);
3814
0
                    }
3815
0
                }
3816
0
            }
3817
0
        } else {
3818
0
            switch (tid) {
3819
0
                case HTS_IDX_NOCOOR:
3820
0
                    e = cram_index_query(cidx->cram, tid, 1, NULL);
3821
0
                    if (e) {
3822
0
                        iter->nocoor = 1;
3823
0
                        iter->nocoor_off = e->offset;
3824
0
                    } else {
3825
0
                        hts_log_warning("No index entry for NOCOOR region");
3826
0
                    }
3827
0
                    break;
3828
0
                case HTS_IDX_START:
3829
0
                    e = cram_index_query(cidx->cram, tid, 1, NULL);
3830
0
                    if (e) {
3831
0
                        iter->read_rest = 1;
3832
0
                        tmp = realloc(off, sizeof(*off));
3833
0
                        if (!tmp)
3834
0
                            goto err;
3835
0
                        off = tmp;
3836
0
                        off[0].u = e->offset;
3837
0
                        off[0].v = 0;
3838
0
                        n_off=1;
3839
0
                    } else {
3840
0
                        hts_log_warning("No index entries");
3841
0
                    }
3842
0
                    break;
3843
0
                case HTS_IDX_REST:
3844
0
                    break;
3845
0
                case HTS_IDX_NONE:
3846
0
                    iter->finished = 1;
3847
0
                    break;
3848
0
                default:
3849
0
                    hts_log_error("Query with tid=%d not implemented for CRAM files", tid);
3850
0
            }
3851
0
        }
3852
0
    }
3853
3854
0
    if (n_off) {
3855
0
        ks_introsort(_off_max, n_off, off);
3856
0
        iter->n_off = n_off; iter->off = off;
3857
0
    }
3858
3859
0
    if(!n_off && !iter->nocoor)
3860
0
        iter->finished = 1;
3861
3862
0
    return 0;
3863
3864
0
 err:
3865
0
    free(off);
3866
0
    return -1;
3867
0
}
3868
3869
void hts_itr_destroy(hts_itr_t *iter)
3870
0
{
3871
0
    if (iter) {
3872
0
        if (iter->multi) {
3873
0
            hts_reglist_free(iter->reg_list, iter->n_reg);
3874
0
        } else {
3875
0
            free(iter->bins.a);
3876
0
        }
3877
3878
0
        if (iter->off)
3879
0
            free(iter->off);
3880
0
        free(iter);
3881
0
    }
3882
0
}
3883
3884
static inline unsigned long long push_digit(unsigned long long i, char c)
3885
0
{
3886
    // ensure subtraction occurs first, avoiding overflow for >= MAX-48 or so
3887
0
    int digit = c - '0';
3888
0
    return 10 * i + digit;
3889
0
}
3890
3891
long long hts_parse_decimal(const char *str, char **strend, int flags)
3892
0
{
3893
0
    unsigned long long n = 0;
3894
0
    int digits = 0, decimals = 0, e = 0, lost = 0;
3895
0
    char sign = '+', esign = '+';
3896
0
    const char *s, *str_orig = str;
3897
3898
0
    while (isspace_c(*str)) str++;
3899
0
    s = str;
3900
3901
0
    if (*s == '+' || *s == '-') sign = *s++;
3902
0
    while (*s)
3903
0
        if (isdigit_c(*s)) digits++, n = push_digit(n, *s++);
3904
0
        else if (*s == ',' && (flags & HTS_PARSE_THOUSANDS_SEP)) s++;
3905
0
        else break;
3906
3907
0
    if (*s == '.') {
3908
0
        s++;
3909
0
        while (isdigit_c(*s)) decimals++, digits++, n = push_digit(n, *s++);
3910
0
    }
3911
3912
0
    switch (*s) {
3913
0
    case 'e': case 'E':
3914
0
        s++;
3915
0
        if (*s == '+' || *s == '-') esign = *s++;
3916
0
        while (isdigit_c(*s)) e = push_digit(e, *s++);
3917
0
        if (esign == '-') e = -e;
3918
0
        break;
3919
3920
0
    case 'k': case 'K': e += 3; s++; break;
3921
0
    case 'm': case 'M': e += 6; s++; break;
3922
0
    case 'g': case 'G': e += 9; s++; break;
3923
0
    }
3924
3925
0
    e -= decimals;
3926
0
    while (e > 0) n *= 10, e--;
3927
0
    while (e < 0) lost += n % 10, n /= 10, e++;
3928
3929
0
    if (lost > 0) {
3930
0
        hts_log_warning("Discarding fractional part of %.*s", (int)(s - str), str);
3931
0
    }
3932
3933
0
    if (strend) {
3934
        // Set to the original input str pointer if not valid number syntax
3935
0
        *strend = (digits > 0)? (char *)s : (char *)str_orig;
3936
0
    } else if (digits == 0) {
3937
0
        hts_log_warning("Invalid numeric value %.8s[truncated]", str);
3938
0
    } else if (*s) {
3939
0
        if ((flags & HTS_PARSE_THOUSANDS_SEP) || (!(flags & HTS_PARSE_THOUSANDS_SEP) && *s != ','))
3940
0
            hts_log_warning("Ignoring unknown characters after %.*s[%s]", (int)(s - str), str, s);
3941
0
    }
3942
3943
0
    return (sign == '+')? n : -n;
3944
0
}
3945
3946
0
static void *hts_memrchr(const void *s, int c, size_t n) {
3947
0
    size_t i;
3948
0
    unsigned char *u = (unsigned char *)s;
3949
0
    for (i = n; i > 0; i--) {
3950
0
        if (u[i-1] == c)
3951
0
            return u+i-1;
3952
0
    }
3953
3954
0
    return NULL;
3955
0
}
3956
3957
/*
3958
 * A variant of hts_parse_reg which is reference-id aware.  It uses
3959
 * the iterator name2id callbacks to validate the region tokenisation works.
3960
 *
3961
 * This is necessary due to GRCh38 HLA additions which have reference names
3962
 * like "HLA-DRB1*12:17".
3963
 *
3964
 * All parameters are mandatory.
3965
 *
3966
 * To work around ambiguous parsing issues, eg both "chr1" and "chr1:100-200"
3967
 * are reference names, we may quote using curly braces.
3968
 * Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example.
3969
 *
3970
 * Flags are used to control how parsing works, and can be one of the below.
3971
 *
3972
 * HTS_PARSE_LIST:
3973
 *     If present, the region is assmed to be a comma separated list and
3974
 *     position parsing will not contain commas (this implicitly
3975
 *     clears HTS_PARSE_THOUSANDS_SEP in the call to hts_parse_decimal).
3976
 *     On success the return pointer will be the start of the next region, ie
3977
 *     the character after the comma.  (If *ret != '\0' then the caller can
3978
 *     assume another region is present in the list.)
3979
 *
3980
 *     If not set then positions may contain commas.  In this case the return
3981
 *     value should point to the end of the string, or NULL on failure.
3982
 *
3983
 * HTS_PARSE_ONE_COORD:
3984
 *     If present, X:100 is treated as the single base pair region X:100-100.
3985
 *     In this case X:-100 is shorthand for X:1-100 and X:100- is X:100-<end>.
3986
 *     (This is the standard bcftools region convention.)
3987
 *
3988
 *     When not set X:100 is considered to be X:100-<end> where <end> is
3989
 *     the end of chromosome X (set to HTS_POS_MAX here).  X:100- and X:-100
3990
 *     are invalid.
3991
 *     (This is the standard samtools region convention.)
3992
 *
3993
 * Note the supplied string expects 1 based inclusive coordinates, but the
3994
 * returned coordinates start from 0 and are half open, so pos0 is valid
3995
 * for use in e.g. "for (pos0 = beg; pos0 < end; pos0++) {...}"
3996
 *
3997
 * On success a pointer to the byte after the end of the entire region
3998
 *            specifier is returned (plus any trailing comma), and tid,
3999
 *            beg & end will be set.
4000
 * On failure NULL is returned.
4001
 */
4002
const char *hts_parse_region(const char *s, int *tid, hts_pos_t *beg,
4003
                             hts_pos_t *end, hts_name2id_f getid, void *hdr,
4004
                             int flags)
4005
0
{
4006
0
    if (!s || !tid || !beg || !end || !getid)
4007
0
        return NULL;
4008
4009
0
    size_t s_len = strlen(s);
4010
0
    kstring_t ks = { 0, 0, NULL };
4011
4012
0
    const char *colon = NULL, *comma = NULL;
4013
0
    int quoted = 0;
4014
4015
0
    if (flags & HTS_PARSE_LIST)
4016
0
        flags &= ~HTS_PARSE_THOUSANDS_SEP;
4017
0
    else
4018
0
        flags |= HTS_PARSE_THOUSANDS_SEP;
4019
4020
0
    const char *s_end = s + s_len;
4021
4022
    // Braced quoting of references is permitted to resolve ambiguities.
4023
0
    if (*s == '{') {
4024
0
        const char *close = memchr(s, '}', s_len);
4025
0
        if (!close) {
4026
0
            hts_log_error("Mismatching braces in \"%s\"", s);
4027
0
            *tid = -1;
4028
0
            return NULL;
4029
0
        }
4030
0
        s++;
4031
0
        s_len--;
4032
0
        if (close[1] == ':')
4033
0
            colon = close+1;
4034
0
        quoted = 1; // number of trailing characters to trim
4035
4036
        // Truncate to this item only, if appropriate.
4037
0
        if (flags & HTS_PARSE_LIST) {
4038
0
            comma = strchr(close, ',');
4039
0
            if (comma) {
4040
0
                s_len = comma-s;
4041
0
                s_end = comma+1;
4042
0
            }
4043
0
        }
4044
0
    } else {
4045
        // Truncate to this item only, if appropriate.
4046
0
        if (flags & HTS_PARSE_LIST) {
4047
0
            comma = strchr(s, ',');
4048
0
            if (comma) {
4049
0
                s_len = comma-s;
4050
0
                s_end = comma+1;
4051
0
            }
4052
0
        }
4053
4054
0
        colon = hts_memrchr(s, ':', s_len);
4055
0
    }
4056
4057
    // No colon is simplest case; just check and return.
4058
0
    if (colon == NULL) {
4059
0
        *beg = 0; *end = HTS_POS_MAX;
4060
0
        kputsn(s, s_len-quoted, &ks); // convert to nul terminated string
4061
0
        if (!ks.s) {
4062
0
            *tid = -2;
4063
0
            return NULL;
4064
0
        }
4065
4066
0
        *tid = getid(hdr, ks.s);
4067
0
        free(ks.s);
4068
4069
0
        return *tid >= 0 ? s_end : NULL;
4070
0
    }
4071
4072
    // Has a colon, but check whole name first.
4073
0
    if (!quoted) {
4074
0
        *beg = 0; *end = HTS_POS_MAX;
4075
0
        kputsn(s, s_len, &ks); // convert to nul terminated string
4076
0
        if (!ks.s) {
4077
0
            *tid = -2;
4078
0
            return NULL;
4079
0
        }
4080
0
        if ((*tid = getid(hdr, ks.s)) >= 0) {
4081
            // Entire name matches, but also check this isn't
4082
            // ambiguous.  eg we have ref chr1 and ref chr1:100-200
4083
            // both present.
4084
0
            ks.l = 0;
4085
0
            kputsn(s, colon-s, &ks); // convert to nul terminated string
4086
0
            if (!ks.s) {
4087
0
                *tid = -2;
4088
0
                return NULL;
4089
0
            }
4090
0
            if (getid(hdr, ks.s) >= 0) {
4091
0
                free(ks.s);
4092
0
                *tid = -1;
4093
0
                hts_log_error("Range is ambiguous. "
4094
0
                              "Use {%s} or {%.*s}%s instead",
4095
0
                              s, (int)(colon-s), s, colon);
4096
0
                return NULL;
4097
0
            }
4098
0
            free(ks.s);
4099
4100
0
            return s_end;
4101
0
        }
4102
0
        if (*tid < -1) // Failed to parse header
4103
0
            return NULL;
4104
0
    }
4105
4106
    // Quoted, or unquoted and whole string isn't a name.
4107
    // Check the pre-colon part is valid.
4108
0
    ks.l = 0;
4109
0
    kputsn(s, colon-s-quoted, &ks); // convert to nul terminated string
4110
0
    if (!ks.s) {
4111
0
        *tid = -2;
4112
0
        return NULL;
4113
0
    }
4114
0
    *tid = getid(hdr, ks.s);
4115
0
    free(ks.s);
4116
0
    if (*tid < 0)
4117
0
        return NULL;
4118
4119
    // Finally parse the post-colon coordinates
4120
0
    char *hyphen;
4121
0
    *beg = hts_parse_decimal(colon+1, &hyphen, flags) - 1;
4122
0
    if (*beg < 0) {
4123
0
        if (*beg != -1 && *hyphen == '-' && colon[1] != '\0') {
4124
            // User specified zero, but we're 1-based.
4125
0
            hts_log_error("Coordinates must be > 0");
4126
0
            return NULL;
4127
0
        }
4128
0
        if (isdigit_c(*hyphen) || *hyphen == '\0' || *hyphen == ',') {
4129
            // interpret chr:-100 as chr:1-100
4130
0
            *end = *beg==-1 ? HTS_POS_MAX : -(*beg+1);
4131
0
            *beg = 0;
4132
0
            return s_end;
4133
0
        } else if (*beg < -1) {
4134
0
            hts_log_error("Unexpected string \"%s\" after region", hyphen);
4135
0
            return NULL;
4136
0
        }
4137
0
    }
4138
4139
0
    if (*hyphen == '\0' || ((flags & HTS_PARSE_LIST) && *hyphen == ',')) {
4140
0
        *end = flags & HTS_PARSE_ONE_COORD ? *beg+1 : HTS_POS_MAX;
4141
0
    } else if (*hyphen == '-') {
4142
0
        *end = hts_parse_decimal(hyphen+1, &hyphen, flags);
4143
0
        if (*hyphen != '\0' && *hyphen != ',') {
4144
0
            hts_log_error("Unexpected string \"%s\" after region", hyphen);
4145
0
            return NULL;
4146
0
        }
4147
0
    } else {
4148
0
        hts_log_error("Unexpected string \"%s\" after region", hyphen);
4149
0
        return NULL;
4150
0
    }
4151
4152
0
    if (*end == 0)
4153
0
        *end = HTS_POS_MAX; // interpret chr:100- as chr:100-<end>
4154
4155
0
    if (*beg >= *end) return NULL;
4156
4157
0
    return s_end;
4158
0
}
4159
4160
// Next release we should mark this as deprecated?
4161
// Use hts_parse_region above instead.
4162
const char *hts_parse_reg64(const char *s, hts_pos_t *beg, hts_pos_t *end)
4163
0
{
4164
0
    char *hyphen;
4165
0
    const char *colon = strrchr(s, ':');
4166
0
    if (colon == NULL) {
4167
0
        *beg = 0; *end = HTS_POS_MAX;
4168
0
        return s + strlen(s);
4169
0
    }
4170
4171
0
    *beg = hts_parse_decimal(colon+1, &hyphen, HTS_PARSE_THOUSANDS_SEP) - 1;
4172
0
    if (*beg < 0) *beg = 0;
4173
4174
0
    if (*hyphen == '\0') *end = HTS_POS_MAX;
4175
0
    else if (*hyphen == '-') *end = hts_parse_decimal(hyphen+1, NULL, HTS_PARSE_THOUSANDS_SEP);
4176
0
    else return NULL;
4177
4178
0
    if (*beg >= *end) return NULL;
4179
0
    return colon;
4180
0
}
4181
4182
const char *hts_parse_reg(const char *s, int *beg, int *end)
4183
0
{
4184
0
    hts_pos_t beg64 = 0, end64 = 0;
4185
0
    const char *colon = hts_parse_reg64(s, &beg64, &end64);
4186
0
    if (beg64 > INT_MAX) {
4187
0
        hts_log_error("Position %"PRId64" too large", beg64);
4188
0
        return NULL;
4189
0
    }
4190
0
    if (end64 > INT_MAX) {
4191
0
        if (end64 == HTS_POS_MAX) {
4192
0
            end64 = INT_MAX;
4193
0
        } else {
4194
0
            hts_log_error("Position %"PRId64" too large", end64);
4195
0
            return NULL;
4196
0
        }
4197
0
    }
4198
0
    *beg = beg64;
4199
0
    *end = end64;
4200
0
    return colon;
4201
0
}
4202
4203
hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec)
4204
0
{
4205
0
    int tid;
4206
0
    hts_pos_t beg, end;
4207
4208
0
    if (strcmp(reg, ".") == 0)
4209
0
        return itr_query(idx, HTS_IDX_START, 0, 0, readrec);
4210
0
    else if (strcmp(reg, "*") == 0)
4211
0
        return itr_query(idx, HTS_IDX_NOCOOR, 0, 0, readrec);
4212
4213
0
    if (!hts_parse_region(reg, &tid, &beg, &end, getid, hdr, HTS_PARSE_THOUSANDS_SEP))
4214
0
        return NULL;
4215
4216
0
    return itr_query(idx, tid, beg, end, readrec);
4217
0
}
4218
4219
0
hts_itr_t *hts_itr_regions(const hts_idx_t *idx, hts_reglist_t *reglist, int count, hts_name2id_f getid, void *hdr, hts_itr_multi_query_func *itr_specific, hts_readrec_func *readrec, hts_seek_func *seek, hts_tell_func *tell) {
4220
4221
0
    int i;
4222
4223
0
    if (!reglist)
4224
0
        return NULL;
4225
4226
0
    hts_itr_t *itr = (hts_itr_t*)calloc(1, sizeof(hts_itr_t));
4227
0
    if (itr) {
4228
0
        itr->n_reg = count;
4229
0
        itr->readrec = readrec;
4230
0
        itr->seek = seek;
4231
0
        itr->tell = tell;
4232
0
        itr->reg_list = reglist;
4233
0
        itr->finished = 0;
4234
0
        itr->nocoor = 0;
4235
0
        itr->multi = 1;
4236
4237
0
        for (i = 0; i < itr->n_reg; i++) {
4238
0
            if (itr->reg_list[i].reg) {
4239
0
                if (!strcmp(itr->reg_list[i].reg, ".")) {
4240
0
                    itr->reg_list[i].tid = HTS_IDX_START;
4241
0
                    continue;
4242
0
                }
4243
4244
0
                if (!strcmp(itr->reg_list[i].reg, "*")) {
4245
0
                    itr->reg_list[i].tid = HTS_IDX_NOCOOR;
4246
0
                    continue;
4247
0
                }
4248
4249
0
                itr->reg_list[i].tid = getid(hdr, reglist[i].reg);
4250
0
                if (itr->reg_list[i].tid < 0) {
4251
0
                    if (itr->reg_list[i].tid < -1) {
4252
0
                        hts_log_error("Failed to parse header");
4253
0
                        hts_itr_destroy(itr);
4254
0
                        return NULL;
4255
0
                    } else {
4256
0
                        hts_log_warning("Region '%s' specifies an unknown reference name. Continue anyway", reglist[i].reg);
4257
0
                    }
4258
0
                }
4259
0
            }
4260
0
        }
4261
4262
0
        qsort(itr->reg_list, itr->n_reg, sizeof(hts_reglist_t), compare_regions);
4263
0
        if (itr_specific(idx, itr) != 0) {
4264
0
            hts_log_error("Failed to create the multi-region iterator!");
4265
0
            hts_itr_destroy(itr);
4266
0
            itr = NULL;
4267
0
        }
4268
0
    }
4269
4270
0
    return itr;
4271
0
}
4272
4273
int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data)
4274
0
{
4275
0
    int ret, tid;
4276
0
    hts_pos_t beg, end;
4277
0
    if (iter == NULL || iter->finished) return -1;
4278
0
    if (iter->read_rest) {
4279
0
        if (iter->curr_off) { // seek to the start
4280
0
            if (bgzf_seek(fp, iter->curr_off, SEEK_SET) < 0) {
4281
0
                hts_log_error("Failed to seek to offset %"PRIu64"%s%s",
4282
0
                              iter->curr_off,
4283
0
                              errno ? ": " : "", strerror(errno));
4284
0
                return -2;
4285
0
            }
4286
0
            iter->curr_off = 0; // only seek once
4287
0
        }
4288
0
        ret = iter->readrec(fp, data, r, &tid, &beg, &end);
4289
0
        if (ret < 0) iter->finished = 1;
4290
0
        iter->curr_tid = tid;
4291
0
        iter->curr_beg = beg;
4292
0
        iter->curr_end = end;
4293
0
        return ret;
4294
0
    }
4295
    // A NULL iter->off should always be accompanied by iter->finished.
4296
0
    assert(iter->off != NULL);
4297
0
    for (;;) {
4298
0
        if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
4299
0
            if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks
4300
0
            if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
4301
0
                if (bgzf_seek(fp, iter->off[iter->i+1].u, SEEK_SET) < 0) {
4302
0
                    hts_log_error("Failed to seek to offset %"PRIu64"%s%s",
4303
0
                                  iter->off[iter->i+1].u,
4304
0
                                  errno ? ": " : "", strerror(errno));
4305
0
                    return -2;
4306
0
                }
4307
0
                iter->curr_off = bgzf_tell(fp);
4308
0
            }
4309
0
            ++iter->i;
4310
0
        }
4311
0
        if ((ret = iter->readrec(fp, data, r, &tid, &beg, &end)) >= 0) {
4312
0
            iter->curr_off = bgzf_tell(fp);
4313
0
            if (tid != iter->tid || beg >= iter->end) { // no need to proceed
4314
0
                ret = -1; break;
4315
0
            } else if (end > iter->beg && iter->end > beg) {
4316
0
                iter->curr_tid = tid;
4317
0
                iter->curr_beg = beg;
4318
0
                iter->curr_end = end;
4319
0
                return ret;
4320
0
            }
4321
0
        } else break; // end of file or error
4322
0
    }
4323
0
    iter->finished = 1;
4324
0
    return ret;
4325
0
}
4326
4327
int hts_itr_multi_next(htsFile *fd, hts_itr_t *iter, void *r)
4328
0
{
4329
0
    void *fp;
4330
0
    int ret, tid, i, cr, ci;
4331
0
    hts_pos_t beg, end;
4332
0
    hts_reglist_t *found_reg;
4333
4334
0
    if (iter == NULL || iter->finished) return -1;
4335
4336
0
    if (iter->is_cram) {
4337
0
        fp = fd->fp.cram;
4338
0
    } else {
4339
0
        fp = fd->fp.bgzf;
4340
0
    }
4341
4342
0
    if (iter->read_rest) {
4343
0
        if (iter->curr_off) { // seek to the start
4344
0
            if (iter->seek(fp, iter->curr_off, SEEK_SET) < 0) {
4345
0
                hts_log_error("Seek at offset %" PRIu64 " failed.", iter->curr_off);
4346
0
                return -2;
4347
0
            }
4348
0
            iter->curr_off = 0; // only seek once
4349
0
        }
4350
4351
0
        ret = iter->readrec(fp, fd, r, &tid, &beg, &end);
4352
0
        if (ret < 0)
4353
0
            iter->finished = 1;
4354
4355
0
        iter->curr_tid = tid;
4356
0
        iter->curr_beg = beg;
4357
0
        iter->curr_end = end;
4358
4359
0
        return ret;
4360
0
    }
4361
    // A NULL iter->off should always be accompanied by iter->finished.
4362
0
    assert(iter->off != NULL || iter->nocoor != 0);
4363
4364
0
    int next_range = 0;
4365
0
    for (;;) {
4366
        // Note that due to the way bam indexing works, iter->off may contain
4367
        // file chunks that are not actually needed as they contain data
4368
        // beyond the end of the requested region.  These are filtered out
4369
        // by comparing the tid and index into hts_reglist_t::intervals
4370
        // (packed for reasons of convenience into iter->off[iter->i].max)
4371
        // associated with the file region with iter->curr_tid and
4372
        // iter->curr_intv.
4373
4374
0
        if (next_range
4375
0
            || iter->curr_off == 0
4376
0
            || iter->i >= iter->n_off
4377
0
            || iter->curr_off >= iter->off[iter->i].v
4378
0
            || (iter->off[iter->i].max >> 32 == iter->curr_tid
4379
0
                && (iter->off[iter->i].max & 0xffffffff) < iter->curr_intv)) {
4380
4381
            // Jump to the next chunk.  It may be necessary to skip more
4382
            // than one as the iter->off list can include overlapping entries.
4383
0
            do {
4384
0
                iter->i++;
4385
0
            } while (iter->i < iter->n_off
4386
0
                     && (iter->curr_off >= iter->off[iter->i].v
4387
0
                         || (iter->off[iter->i].max >> 32 == iter->curr_tid
4388
0
                             && (iter->off[iter->i].max & 0xffffffff) < iter->curr_intv)));
4389
4390
0
            if (iter->is_cram && iter->i < iter->n_off) {
4391
                // Ensure iter->curr_reg is correct.
4392
                //
4393
                // We need this for CRAM as we shortcut some of the later
4394
                // logic by getting an end-of-range and continuing to the
4395
                // next offset.
4396
                //
4397
                // We cannot do this for BAM (and fortunately do not need to
4398
                // either) because in BAM world a query to genomic positions
4399
                // GX and GY leading to a seek offsets PX and PY may have
4400
                // GX > GY and PX < PY.  (This is due to the R-tree and falling
4401
                // between intervals, bumping up to a higher bin.)
4402
                // CRAM strictly follows PX >= PY if GX >= GY, so this logic
4403
                // works.
4404
0
                int want_tid = iter->off[iter->i].max >> 32;
4405
0
                if (!(iter->curr_reg < iter->n_reg &&
4406
0
                      iter->reg_list[iter->curr_reg].tid == want_tid)) {
4407
0
                    int j;
4408
0
                    for (j = 0; j < iter->n_reg; j++)
4409
0
                        if (iter->reg_list[j].tid == want_tid)
4410
0
                            break;
4411
0
                    if (j == iter->n_reg)
4412
0
                        return -1;
4413
0
                    iter->curr_reg = j;
4414
0
                    iter->curr_tid = iter->reg_list[iter->curr_reg].tid;
4415
0
                };
4416
0
                iter->curr_intv = iter->off[iter->i].max & 0xffffffff;
4417
0
            }
4418
4419
0
            if (iter->i >= iter->n_off) { // no more chunks, except NOCOORs
4420
0
                if (iter->nocoor) {
4421
0
                    next_range = 0;
4422
0
                    if (iter->seek(fp, iter->nocoor_off, SEEK_SET) < 0) {
4423
0
                        hts_log_error("Seek at offset %" PRIu64 " failed.", iter->nocoor_off);
4424
0
                        return -2;
4425
0
                    }
4426
0
                    if (iter->is_cram) {
4427
0
                        cram_range r = { HTS_IDX_NOCOOR };
4428
0
                        cram_set_option(fp, CRAM_OPT_RANGE_NOSEEK, &r);
4429
0
                    }
4430
4431
                    // The first slice covering the unmapped reads might
4432
                    // contain a few mapped reads, so scroll
4433
                    // forward until finding the first unmapped read.
4434
0
                    do {
4435
0
                        ret = iter->readrec(fp, fd, r, &tid, &beg, &end);
4436
0
                    } while (tid >= 0 && ret >=0);
4437
4438
0
                    if (ret < 0)
4439
0
                        iter->finished = 1;
4440
0
                    else
4441
0
                        iter->read_rest = 1;
4442
4443
0
                    iter->curr_off = 0; // don't seek any more
4444
0
                    iter->curr_tid = tid;
4445
0
                    iter->curr_beg = beg;
4446
0
                    iter->curr_end = end;
4447
4448
0
                    return ret;
4449
0
                } else {
4450
0
                    ret = -1; break;
4451
0
                }
4452
0
            } else if (iter->i < iter->n_off) {
4453
                // New chunk may overlap the last one, so ensure we
4454
                // only seek forwards.
4455
0
                if (iter->curr_off < iter->off[iter->i].u || next_range) {
4456
0
                    iter->curr_off = iter->off[iter->i].u;
4457
4458
                    // CRAM has the capability of setting an end location.
4459
                    // This means multi-threaded decodes can stop once they
4460
                    // reach that point, rather than pointlessly decoding
4461
                    // more slices than we'll be using.
4462
                    //
4463
                    // We have to be careful here.  Whenever we set the cram
4464
                    // range we need a corresponding seek in order to ensure
4465
                    // we can safely decode at that offset.  We use next_range
4466
                    // var to ensure this is always true; this is set on
4467
                    // end-of-range condition. It's never modified for BAM.
4468
0
                    if (iter->is_cram) {
4469
                        // Next offset.[uv] tuple, but it's already been
4470
                        // included in our cram range, so don't seek and don't
4471
                        // reset range so we can efficiently multi-thread.
4472
0
                        if (next_range || iter->curr_off >= iter->end) {
4473
0
                            if (iter->seek(fp, iter->curr_off, SEEK_SET) < 0) {
4474
0
                                hts_log_error("Seek at offset %" PRIu64
4475
0
                                        " failed.", iter->curr_off);
4476
0
                                return -2;
4477
0
                            }
4478
4479
                            // Find the genomic range matching this interval.
4480
0
                            int j;
4481
0
                            hts_reglist_t *rl = &iter->reg_list[iter->curr_reg];
4482
0
                            cram_range r = {
4483
0
                                    rl->tid,
4484
0
                                    rl->intervals[iter->curr_intv].beg,
4485
0
                                    rl->intervals[iter->curr_intv].end
4486
0
                            };
4487
4488
                            // Expand it up to cover neighbouring intervals.
4489
                            // Note we can only have a single chromosome in a
4490
                            // range, so if we detect our blocks span chromosomes
4491
                            // or we have a multi-ref mode slice, we just use
4492
                            // HTS_IDX_START refid instead.  This doesn't actually
4493
                            // seek (due to CRAM_OPT_RANGE_NOSEEK) and is simply
4494
                            // and indicator of decoding with no end limit.
4495
                            //
4496
                            // That isn't as efficient as it could be, but it's
4497
                            // no poorer than before and it works.
4498
0
                            int tid = r.refid;
4499
0
                            int64_t end = r.end;
4500
0
                            int64_t v = iter->off[iter->i].v;
4501
0
                            j = iter->i+1;
4502
0
                            while (j < iter->n_off) {
4503
0
                                if (iter->off[j].u > v)
4504
0
                                    break;
4505
4506
0
                                uint64_t max = iter->off[j].max;
4507
0
                                if ((max>>32) != tid) {
4508
0
                                    tid = HTS_IDX_START; // => no range limit
4509
0
                                } else {
4510
0
                                    if (end < rl->intervals[max & 0xffffffff].end)
4511
0
                                        end = rl->intervals[max & 0xffffffff].end;
4512
0
                                }
4513
0
                                if (v < iter->off[j].v)
4514
0
                                    v = iter->off[j].v;
4515
0
                                j++;
4516
0
                            }
4517
0
                            r.refid = tid;
4518
0
                            r.end = end;
4519
4520
                            // Remember maximum 'v' here so we don't do
4521
                            // unnecessary subsequent seeks for the next
4522
                            // regions.  We can't change curr_off, but
4523
                            // beg/end are used only by single region iterator so
4524
                            // we cache it there to avoid changing the struct.
4525
0
                            iter->end = v;
4526
4527
0
                            cram_set_option(fp, CRAM_OPT_RANGE_NOSEEK, &r);
4528
0
                            next_range = 0;
4529
0
                        }
4530
0
                    } else { // Not CRAM
4531
0
                        if (iter->seek(fp, iter->curr_off, SEEK_SET) < 0) {
4532
0
                            hts_log_error("Seek at offset %" PRIu64 " failed.",
4533
0
                                          iter->curr_off);
4534
0
                            return -2;
4535
0
                        }
4536
0
                    }
4537
0
                }
4538
0
            }
4539
0
        }
4540
4541
0
        ret = iter->readrec(fp, fd, r, &tid, &beg, &end);
4542
0
        if (ret < 0) {
4543
0
            if (iter->is_cram && cram_eof(fp)) {
4544
                // Skip to end of range
4545
                //
4546
                // We should never be adjusting curr_off manually unless
4547
                // we also can guarantee we'll be doing a seek after to
4548
                // a new location.  Otherwise we'll be reading wrong offset
4549
                // for the next container.
4550
                //
4551
                // We ensure this by adjusting our CRAM_OPT_RANGE
4552
                // accordingly above, but to double check we also
4553
                // set the skipped_block flag to enforce a seek also.
4554
0
                iter->curr_off = iter->off[iter->i].v;
4555
0
                next_range = 1;
4556
4557
                // Next region
4558
0
                if (++iter->curr_intv >= iter->reg_list[iter->curr_reg].count){
4559
0
                    if (++iter->curr_reg >= iter->n_reg)
4560
0
                        break;
4561
0
                    iter->curr_intv = 0;
4562
0
                    iter->curr_tid = iter->reg_list[iter->curr_reg].tid;
4563
0
                }
4564
0
                continue;
4565
0
            } else {
4566
0
                break;
4567
0
            }
4568
0
        }
4569
4570
0
        iter->curr_off = iter->tell(fp);
4571
4572
0
        if (tid != iter->curr_tid) {
4573
0
            hts_reglist_t key;
4574
0
            key.tid = tid;
4575
4576
0
            found_reg = (hts_reglist_t *)bsearch(&key, iter->reg_list,
4577
0
                                                 iter->n_reg,
4578
0
                                                 sizeof(hts_reglist_t),
4579
0
                                                 compare_regions);
4580
0
            if (!found_reg)
4581
0
                continue;
4582
4583
0
            iter->curr_reg = (found_reg - iter->reg_list);
4584
0
            iter->curr_tid = tid;
4585
0
            iter->curr_intv = 0;
4586
0
        }
4587
4588
0
        cr = iter->curr_reg;
4589
0
        ci = iter->curr_intv;
4590
4591
0
        for (i = ci; i < iter->reg_list[cr].count; i++) {
4592
0
            if (end > iter->reg_list[cr].intervals[i].beg &&
4593
0
                iter->reg_list[cr].intervals[i].end > beg) {
4594
0
                iter->curr_beg = beg;
4595
0
                iter->curr_end = end;
4596
0
                iter->curr_intv = i;
4597
4598
0
                return ret;
4599
0
            }
4600
4601
            // Check if the read starts beyond intervals[i].end
4602
            // If so, the interval is finished so move on to the next.
4603
0
            if (beg > iter->reg_list[cr].intervals[i].end)
4604
0
                iter->curr_intv = i + 1;
4605
4606
            // No need to keep searching if the read ends before intervals[i].beg
4607
0
            if (end < iter->reg_list[cr].intervals[i].beg)
4608
0
                break;
4609
0
        }
4610
0
    }
4611
0
    iter->finished = 1;
4612
4613
0
    return ret;
4614
0
}
4615
4616
/**********************
4617
 *** Retrieve index ***
4618
 **********************/
4619
// Local_fn and local_len will return a sub-region of 'fn'.
4620
// Eg http://elsewhere/dir/foo.bam.bai?a=b may return
4621
// foo.bam.bai via local_fn and local_len.
4622
//
4623
// Returns -1 if index couldn't be opened.
4624
//         -2 on other errors
4625
static int idx_test_and_fetch(const char *fn, const char **local_fn, int *local_len, int download)
4626
0
{
4627
0
    hFILE *remote_hfp = NULL;
4628
0
    hFILE *local_fp = NULL;
4629
0
    int save_errno;
4630
0
    htsFormat fmt;
4631
0
    kstring_t s = KS_INITIALIZE;
4632
0
    kstring_t tmps = KS_INITIALIZE;
4633
4634
0
    if (hisremote(fn)) {
4635
0
        const int buf_size = 1 * 1024 * 1024;
4636
0
        int l;
4637
0
        const char *p, *e;
4638
        // Ignore ?# params: eg any file.fmt?param=val, except for S3 URLs
4639
0
        e = fn + ((strncmp(fn, "s3://", 5) && strncmp(fn, "s3+http://", 10) && strncmp(fn, "s3+https://", 11)) ? strcspn(fn, "?#") : strcspn(fn, "?"));
4640
        // Find the previous slash from there.
4641
0
        p = e;
4642
0
        while (p > fn && *p != '/') p--;
4643
0
        if (*p == '/') p++;
4644
4645
        // Attempt to open local file first
4646
0
        kputsn(p, e-p, &s);
4647
0
        if (access(s.s, R_OK) == 0)
4648
0
        {
4649
0
            free(s.s);
4650
0
            *local_fn = p;
4651
0
            *local_len = e-p;
4652
0
            return 0;
4653
0
        }
4654
4655
        // Attempt to open remote file. Stay quiet on failure, it is OK to fail when trying first .csi then .bai or .tbi index.
4656
0
        if ((remote_hfp = hopen(fn, "r")) == 0) {
4657
0
            hts_log_info("Failed to open index file '%s'", fn);
4658
0
            free(s.s);
4659
0
            return -1;
4660
0
        }
4661
0
        if (hts_detect_format2(remote_hfp, fn, &fmt)) {
4662
0
            hts_log_error("Failed to detect format of index file '%s'", fn);
4663
0
            goto fail;
4664
0
        }
4665
0
        if (fmt.category != index_file || (fmt.format != bai &&  fmt.format != csi && fmt.format != tbi
4666
0
                && fmt.format != crai && fmt.format != fai_format)) {
4667
0
            hts_log_error("Format of index file '%s' is not supported", fn);
4668
0
            goto fail;
4669
0
        }
4670
4671
0
        if (download) {
4672
0
            if ((local_fp = hts_open_tmpfile(s.s, "wx", &tmps)) == NULL) {
4673
0
                hts_log_error("Failed to create file %s in the working directory", p);
4674
0
                goto fail;
4675
0
            }
4676
0
            hts_log_info("Downloading file %s to local directory", fn);
4677
0
            uint8_t *buf = (uint8_t*)calloc(buf_size, 1);
4678
0
            if (!buf) {
4679
0
                hts_log_error("%s", strerror(errno));
4680
0
                goto fail;
4681
0
            }
4682
0
            while ((l = hread(remote_hfp, buf, buf_size)) > 0) {
4683
0
                if (hwrite(local_fp, buf, l) != l) {
4684
0
                    hts_log_error("Failed to write data to %s : %s",
4685
0
                            fn, strerror(errno));
4686
0
                    free(buf);
4687
0
                    goto fail;
4688
0
                }
4689
0
            }
4690
0
            free(buf);
4691
0
            if (l < 0) {
4692
0
                hts_log_error("Error reading \"%s\"", fn);
4693
0
                goto fail;
4694
0
            }
4695
0
            if (hclose(local_fp) < 0) {
4696
0
                hts_log_error("Error closing %s : %s", fn, strerror(errno));
4697
0
                local_fp = NULL;
4698
0
                goto fail;
4699
0
            }
4700
0
            local_fp = NULL;
4701
0
            if (rename(tmps.s, s.s) < 0) {
4702
0
                hts_log_error("Error renaming %s : %s", tmps.s, strerror(errno));
4703
0
                goto fail;
4704
0
            }
4705
0
            ks_clear(&tmps);
4706
4707
0
            *local_fn = p;
4708
0
            *local_len = e-p;
4709
0
        } else {
4710
0
            *local_fn = fn;
4711
0
            *local_len = e-fn;
4712
0
        }
4713
4714
0
        if (hclose(remote_hfp) != 0) {
4715
0
            hts_log_error("Failed to close remote file %s", fn);
4716
0
        }
4717
4718
0
        free(tmps.s);
4719
0
        free(s.s);
4720
0
        return 0;
4721
0
    } else {
4722
0
        hFILE *local_hfp;
4723
0
        if ((local_hfp = hopen(fn, "r")) == 0) return -1;
4724
0
        hclose_abruptly(local_hfp);
4725
0
        *local_fn = fn;
4726
0
        *local_len = strlen(fn);
4727
0
        return 0;
4728
0
    }
4729
4730
0
 fail:
4731
0
    save_errno = errno;
4732
0
    if (remote_hfp) hclose_abruptly(remote_hfp);
4733
0
    if (local_fp) hclose_abruptly(local_fp);
4734
0
    if (tmps.l > 0) unlink(tmps.s);
4735
0
    free(tmps.s);
4736
0
    free(s.s);
4737
0
    errno = save_errno;
4738
0
    return -2;
4739
0
}
4740
4741
/*
4742
 * Check the existence of a local index file using part of the alignment
4743
 * file name.
4744
 *
4745
 * For a filename fn of fn.fmt (eg fn.bam or fn.cram) the order of checks is
4746
 * fn.fmt.csi,  fn.csi,
4747
 * fn.fmt.bai,  fn.bai  - if fmt is HTS_FMT_BAI
4748
 * fn.fmt.tbi,  fn.tbi  - if fmt is HTS_FMT_TBI
4749
 * fn.fmt.crai, fn.crai - if fmt is HTS_FMT_CRAI
4750
 * fn.fmt.fai           - if fmt is HTS_FMT_FAI
4751
 *   also .gzi if fmt is ".gz"
4752
 *
4753
 * @param fn    - pointer to the file name
4754
 * @param fmt   - one of the HTS_FMT index formats
4755
 * @param fnidx - pointer to the index file name placeholder
4756
 * @return        1 for success, 0 for failure
4757
 */
4758
4.24k
int hts_idx_check_local(const char *fn, int fmt, char **fnidx) {
4759
4.24k
    int i, l_fn, l_ext;
4760
4.24k
    const char *fn_tmp = NULL;
4761
4.24k
    char *fnidx_tmp;
4762
4.24k
    const char *csi_ext = ".csi";
4763
4.24k
    const char *bai_ext = ".bai";
4764
4.24k
    const char *tbi_ext = ".tbi";
4765
4.24k
    const char *crai_ext = ".crai";
4766
4.24k
    const char *fai_ext = ".fai";
4767
4.24k
    const char *gzi_ext = ".gzi";
4768
4769
4.24k
    if (!fn)
4770
0
        return 0;
4771
4772
4.24k
    if (hisremote(fn)) {
4773
0
        for (i = strlen(fn) - 1; i >= 0; --i)
4774
0
            if (fn[i] == '/') {
4775
0
                fn_tmp = (char *)&fn[i+1];
4776
0
                break;
4777
0
            }
4778
4.24k
    } else {
4779
        // Borrowed from hopen_fd_fileuri()
4780
4.24k
        if (strncmp(fn, "file://localhost/", 17) == 0) fn_tmp = fn + 16;
4781
4.24k
        else if (strncmp(fn, "file:///", 8) == 0) fn_tmp = fn + 7;
4782
4.24k
        else fn_tmp = fn;
4783
#if defined(_WIN32) || defined(__MSYS__)
4784
        // For cases like C:/foo
4785
        if (fn_tmp[0] == '/' && fn_tmp[1] && fn_tmp[2] == ':' && fn_tmp[3] == '/')
4786
            fn_tmp++;
4787
#endif
4788
4.24k
    }
4789
4790
4.24k
    if (!fn_tmp) return 0;
4791
4.24k
    hts_log_info("Using alignment file '%s'", fn_tmp);
4792
4.24k
    l_fn = strlen(fn_tmp); l_ext = 5;
4793
4.24k
    fnidx_tmp = (char*)calloc(l_fn + l_ext + 1, 1);
4794
4.24k
    if (!fnidx_tmp) return 0;
4795
4796
4.24k
    struct stat sbuf;
4797
4798
    // Try alignment.bam.csi first
4799
4.24k
    strcpy(fnidx_tmp, fn_tmp); strcpy(fnidx_tmp + l_fn, csi_ext);
4800
4.24k
    if(stat(fnidx_tmp, &sbuf) == 0) {
4801
0
        *fnidx = fnidx_tmp;
4802
0
        return 1;
4803
4.24k
    } else { // Then try alignment.csi
4804
16.9k
        for (i = l_fn - 1; i > 0; --i)
4805
12.7k
            if (fnidx_tmp[i] == '.') {
4806
0
                strcpy(fnidx_tmp + i, csi_ext);
4807
0
                if(stat(fnidx_tmp, &sbuf) == 0) {
4808
0
                    *fnidx = fnidx_tmp;
4809
0
                    return 1;
4810
0
                }
4811
0
                break;
4812
0
            }
4813
4.24k
    }
4814
4.24k
    if (fmt == HTS_FMT_BAI) {
4815
        // Next, try alignment.bam.bai
4816
0
        strcpy(fnidx_tmp, fn_tmp); strcpy(fnidx_tmp + l_fn, bai_ext);
4817
0
        if(stat(fnidx_tmp, &sbuf) == 0) {
4818
0
            *fnidx = fnidx_tmp;
4819
0
            return 1;
4820
0
        } else { // And finally, try alignment.bai
4821
0
            for (i = l_fn - 1; i > 0; --i)
4822
0
                if (fnidx_tmp[i] == '.') {
4823
0
                    strcpy(fnidx_tmp + i, bai_ext);
4824
0
                    if(stat(fnidx_tmp, &sbuf) == 0) {
4825
0
                        *fnidx = fnidx_tmp;
4826
0
                        return 1;
4827
0
                    }
4828
0
                    break;
4829
0
                }
4830
0
        }
4831
4.24k
    } else if (fmt == HTS_FMT_TBI) { // Or .tbi
4832
4.24k
        strcpy(fnidx_tmp, fn_tmp); strcpy(fnidx_tmp + l_fn, tbi_ext);
4833
4.24k
        if(stat(fnidx_tmp, &sbuf) == 0) {
4834
0
            *fnidx = fnidx_tmp;
4835
0
            return 1;
4836
4.24k
        } else {
4837
16.9k
            for (i = l_fn - 1; i > 0; --i)
4838
12.7k
                if (fnidx_tmp[i] == '.') {
4839
0
                    strcpy(fnidx_tmp + i, tbi_ext);
4840
0
                    if(stat(fnidx_tmp, &sbuf) == 0) {
4841
0
                        *fnidx = fnidx_tmp;
4842
0
                        return 1;
4843
0
                    }
4844
0
                    break;
4845
0
                }
4846
4.24k
        }
4847
4.24k
    } else if (fmt == HTS_FMT_CRAI) { // Or .crai
4848
0
        strcpy(fnidx_tmp, fn_tmp); strcpy(fnidx_tmp + l_fn, crai_ext);
4849
0
        if(stat(fnidx_tmp, &sbuf) == 0) {
4850
0
            *fnidx = fnidx_tmp;
4851
0
            return 1;
4852
0
        } else {
4853
0
            for (i = l_fn - 1; i > 0; --i)
4854
0
                if (fnidx_tmp[i] == '.') {
4855
0
                    strcpy(fnidx_tmp + i, crai_ext);
4856
0
                    if(stat(fnidx_tmp, &sbuf) == 0) {
4857
0
                        *fnidx = fnidx_tmp;
4858
0
                        return 1;
4859
0
                    }
4860
0
                    break;
4861
0
                }
4862
0
        }
4863
0
    } else if (fmt == HTS_FMT_FAI) { // Or .fai
4864
        // Check .gzi if we have a .gz file
4865
0
        strcpy(fnidx_tmp, fn_tmp);
4866
0
        int gzi_ok = 1;
4867
0
        if ((l_fn > 3 && strcmp(fn_tmp+l_fn-3, ".gz") == 0) ||
4868
0
            (l_fn > 5 && strcmp(fn_tmp+l_fn-5, ".bgzf") == 0)) {
4869
0
            strcpy(fnidx_tmp + l_fn, gzi_ext);
4870
0
            gzi_ok = stat(fnidx_tmp, &sbuf)==0;
4871
0
        }
4872
4873
        // Now check for .fai.  Occurs second as we're returning this
4874
        // in *fnidx irrespective of whether we did gzi check.
4875
0
        strcpy(fnidx_tmp + l_fn, fai_ext);
4876
0
        *fnidx = fnidx_tmp;
4877
0
        if (stat(fnidx_tmp, &sbuf) == 0)
4878
0
            return gzi_ok;
4879
0
        else
4880
0
            return 0;
4881
0
    }
4882
4883
4.24k
    free(fnidx_tmp);
4884
4.24k
    return 0;
4885
4.24k
}
4886
4887
0
static char *idx_filename(const char *fn, const char *ext, int download) {
4888
0
    int ret, local_len;
4889
0
    char *fnidx;
4890
0
    const char *local_fn = NULL;
4891
0
    kstring_t buffer = KS_INITIALIZE;
4892
4893
    // First try : append `ext` to `fn`
4894
0
    if (!(fnidx = haddextension(&buffer, fn, 0, ext))) {
4895
0
        free(buffer.s);
4896
0
        return NULL;
4897
0
    }
4898
0
    if ((ret = idx_test_and_fetch(fnidx, &local_fn, &local_len, download)) == -1) {
4899
        // Second try : replace suffix of `fn` with `ext`
4900
0
        if (!(fnidx = haddextension(&buffer, fn, 1, ext))) {
4901
0
            free(buffer.s);
4902
0
            return NULL;
4903
0
        }
4904
0
        ret = idx_test_and_fetch(fnidx, &local_fn, &local_len, download);
4905
0
    }
4906
4907
0
    if (ret < 0) {
4908
0
        free(buffer.s);
4909
0
        return NULL;
4910
0
    }
4911
4912
0
    memmove(fnidx, local_fn, local_len);
4913
0
    fnidx[local_len] = 0;
4914
0
    return fnidx;
4915
0
}
4916
4917
char *hts_idx_getfn(const char *fn, const char *ext)
4918
0
{
4919
0
    return idx_filename(fn, ext, HTS_IDX_SAVE_REMOTE);
4920
0
}
4921
4922
char *hts_idx_locatefn(const char *fn, const char *ext)
4923
0
{
4924
0
    return idx_filename(fn, ext, 0);
4925
0
}
4926
4927
static hts_idx_t *idx_find_and_load(const char *fn, int fmt, int flags)
4928
4.24k
{
4929
4.24k
    char *fnidx = strstr(fn, HTS_IDX_DELIM);
4930
4.24k
    hts_idx_t *idx;
4931
4932
4.24k
    if ( fnidx ) {
4933
0
        char *fn2 = strdup(fn);
4934
0
        if (!fn2) {
4935
0
            hts_log_error("%s", strerror(errno));
4936
0
            return NULL;
4937
0
        }
4938
0
        fn2[fnidx - fn] = '\0';
4939
0
        fnidx += strlen(HTS_IDX_DELIM);
4940
0
        idx = hts_idx_load3(fn2, fnidx, fmt, flags);
4941
0
        free(fn2);
4942
0
        return idx;
4943
0
    }
4944
4945
4.24k
    if (hts_idx_check_local(fn, fmt, &fnidx) == 0 && hisremote(fn)) {
4946
0
        if (flags & HTS_IDX_SAVE_REMOTE) {
4947
0
            fnidx = idx_filename(fn, ".csi", HTS_IDX_SAVE_REMOTE);
4948
0
            if (!fnidx) {
4949
0
                switch (fmt) {
4950
0
                case HTS_FMT_BAI: fnidx = idx_filename(fn, ".bai", HTS_IDX_SAVE_REMOTE); break;
4951
0
                case HTS_FMT_TBI: fnidx = idx_filename(fn, ".tbi", HTS_IDX_SAVE_REMOTE); break;
4952
0
                default: break;
4953
0
                }
4954
0
            }
4955
0
        } else {
4956
0
            fnidx = idx_filename(fn, ".csi", 0);
4957
0
            if (!fnidx) {
4958
0
                switch (fmt) {
4959
0
                case HTS_FMT_BAI: fnidx = idx_filename(fn, ".bai", 0); break;
4960
0
                case HTS_FMT_TBI: fnidx = idx_filename(fn, ".tbi", 0); break;
4961
0
                default: break;
4962
0
                }
4963
0
            }
4964
0
        }
4965
0
    }
4966
4.24k
    if (!fnidx) {
4967
4.24k
        if (!(flags & HTS_IDX_SILENT_FAIL))
4968
0
            hts_log_error("Could not retrieve index file for '%s'", fn);
4969
4.24k
        return 0;
4970
4.24k
    }
4971
4972
0
    if (flags & HTS_IDX_SAVE_REMOTE)
4973
0
        idx = hts_idx_load3(fn, fnidx, fmt, flags);
4974
0
    else
4975
0
        idx = idx_read(fnidx);
4976
0
    free(fnidx);
4977
0
    return idx;
4978
4.24k
}
4979
4980
0
hts_idx_t *hts_idx_load(const char *fn, int fmt) {
4981
0
    return idx_find_and_load(fn, fmt, 1);
4982
0
}
4983
4984
hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx)
4985
0
{
4986
0
    return hts_idx_load3(fn, fnidx, 0, 0);
4987
0
}
4988
4989
hts_idx_t *hts_idx_load3(const char *fn, const char *fnidx, int fmt, int flags)
4990
4.24k
{
4991
4.24k
    const char *local_fn = NULL;
4992
4.24k
    char *local_fnidx = NULL;
4993
4.24k
    int local_len;
4994
4.24k
    if (!fnidx)
4995
4.24k
        return idx_find_and_load(fn, fmt, flags);
4996
4997
    // Check that the index file is up to date, the main file might have changed
4998
0
    struct stat stat_idx,stat_main;
4999
0
    int remote_fn = hisremote(fn), remote_fnidx = hisremote(fnidx);
5000
0
    if ( !remote_fn && !remote_fnidx
5001
0
         && !stat(fn, &stat_main) && !stat(fnidx, &stat_idx) )
5002
0
    {
5003
0
        if ( stat_idx.st_mtime < stat_main.st_mtime )
5004
0
            hts_log_warning("The index file is older than the data file: %s", fnidx);
5005
0
    }
5006
5007
0
    if (remote_fnidx && (flags & HTS_IDX_SAVE_REMOTE))
5008
0
    {
5009
0
        int ret = idx_test_and_fetch(fnidx, &local_fn, &local_len, 1);
5010
0
        if (ret == 0) {
5011
0
            local_fnidx = strdup(local_fn);
5012
0
            if (local_fnidx) {
5013
0
                local_fnidx[local_len] = '\0';
5014
0
                fnidx = local_fnidx;
5015
0
            }
5016
0
        }
5017
0
    }
5018
5019
0
    hts_idx_t *idx = idx_read(fnidx);
5020
0
    if (!idx && !(flags & HTS_IDX_SILENT_FAIL))
5021
0
        hts_log_error("Could not load local index file '%s'%s%s", fnidx,
5022
0
                      errno ? " : " : "", errno ? strerror(errno) : "");
5023
5024
5025
0
    free(local_fnidx);
5026
5027
0
    return idx;
5028
4.24k
}
5029
5030
5031
5032
/**********************
5033
 ***     Memory     ***
5034
 **********************/
5035
5036
/* For use with hts_expand macros *only* */
5037
HTSLIB_EXPORT
5038
size_t hts_realloc_or_die(size_t n, size_t m, size_t m_sz, size_t size,
5039
4.00k
                          int clear, void **ptr, const char *func) {
5040
    /* If new_m and size are both below this limit, multiplying them
5041
       together can't overflow */
5042
4.00k
    const size_t safe = (size_t) 1 << (sizeof(size_t) * 4);
5043
4.00k
    void *new_ptr;
5044
4.00k
    size_t bytes, new_m;
5045
5046
4.00k
    new_m = n;
5047
4.00k
    kroundup_size_t(new_m);
5048
5049
4.00k
    bytes = size * new_m;
5050
5051
    /* Check for overflow.  Both ensure that new_m will fit in m (we make the
5052
       pessimistic assumption that m is signed), and that bytes has not
5053
       wrapped around. */
5054
4.00k
    if (new_m > (((size_t) 1 << (m_sz * 8 - 1)) - 1)
5055
4.00k
        || ((size > safe || new_m > safe)
5056
0
            && bytes / new_m != size)) {
5057
0
        errno = ENOMEM;
5058
0
        goto die;
5059
0
    }
5060
5061
4.00k
    new_ptr = realloc(*ptr, bytes);
5062
4.00k
    if (new_ptr == NULL) goto die;
5063
5064
4.00k
    if (clear) {
5065
0
        if (new_m > m) {
5066
0
            memset((char *) new_ptr + m * size, 0, (new_m - m) * size);
5067
0
        }
5068
0
    }
5069
5070
4.00k
    *ptr = new_ptr;
5071
5072
4.00k
    return new_m;
5073
5074
0
 die:
5075
0
    hts_log_error("%s", strerror(errno));
5076
0
    exit(1);
5077
4.00k
}
5078
5079
/*
5080
 * Companion to hts_resize() macro that does the actual allocation.
5081
 *
5082
 * Somewhat complicated as hts_resize() needs to write the new allocated
5083
 * size back into *size_in_out, and the value pointed to may either be
5084
 * int32_t, uint32_t or size_t depending on which array is being resized.
5085
 * This is solved by making `size_in_out` a void pointer, getting the macro
5086
 * to pass in the size of the item pointed to (in `size_sz`) and then using
5087
 * an appropriate cast (based on the value of size_sz).  The function
5088
 * ensures that the maximum size will be storable in a signed type of
5089
 * the given size so storing to an int32_t should work correctly.
5090
 *
5091
 * Assumes that sizeof(uint32_t) and sizeof(int32_t) is 4,
5092
 * sizeof(uint64_t) and sizeof(int64_t) is 8 and sizeof(size_t) is
5093
 * either 4 or 8.  It also assumes casting from unsigned to signed will
5094
 * work as long as the top bit isn't set.
5095
 */
5096
5097
int hts_resize_array_(size_t item_size, size_t num, size_t size_sz,
5098
                      void *size_in_out, void **ptr_in_out, int flags,
5099
22.4k
                      const char *func) {
5100
    /* If new_size and item_size are both below this limit, multiplying them
5101
       together can't overflow */
5102
22.4k
    const size_t safe = (size_t) 1 << (sizeof(size_t) * 4);
5103
22.4k
    void *new_ptr;
5104
22.4k
    size_t bytes, new_size;
5105
5106
22.4k
    new_size = num;
5107
22.4k
    kroundup_size_t(new_size);
5108
22.4k
    bytes = item_size * new_size;
5109
5110
    /* Check for overflow.  Both ensure that alloc will fit in alloc_in_out (we
5111
       make the pessimistic assumption that *alloc_in_out is signed), and that
5112
       bytes has not wrapped around. */
5113
5114
22.4k
    if ((new_size > (((size_t) 1 << (size_sz * 8 - 1)) - 1))
5115
22.4k
        || (((item_size > safe) || (new_size > safe))
5116
0
            && bytes / new_size != item_size)) {
5117
0
        hts_log(HTS_LOG_ERROR, func, "Memory allocation too large");
5118
0
        errno = ENOMEM;
5119
0
        return -1;
5120
0
    }
5121
5122
22.4k
    new_ptr = realloc(*ptr_in_out, bytes);
5123
22.4k
    if (new_ptr == NULL) {
5124
0
        int save_errno = errno;
5125
0
        hts_log(HTS_LOG_ERROR, func, "%s", strerror(errno));
5126
0
        errno = save_errno;
5127
0
        return -1;
5128
0
    }
5129
5130
22.4k
    if (flags & HTS_RESIZE_CLEAR) {
5131
22.4k
        size_t old_size;
5132
22.4k
        switch (size_sz) {
5133
22.4k
        case 4: old_size = *((uint32_t *) size_in_out); break;
5134
0
        case 8: old_size = *((uint64_t *) size_in_out); break;
5135
0
        default: abort();
5136
22.4k
        }
5137
22.4k
        if (new_size > old_size) {
5138
22.4k
            memset((char *) new_ptr + old_size * item_size, 0,
5139
22.4k
                   (new_size - old_size) * item_size);
5140
22.4k
        }
5141
22.4k
    }
5142
5143
22.4k
    switch (size_sz) {
5144
22.4k
    case 4: *((uint32_t *) size_in_out) = new_size; break;
5145
0
    case 8: *((uint64_t *) size_in_out) = new_size; break;
5146
0
    default: abort();
5147
22.4k
    }
5148
5149
22.4k
    *ptr_in_out = new_ptr;
5150
22.4k
    return 0;
5151
22.4k
}
5152
5153
void hts_lib_shutdown(void)
5154
0
{
5155
0
    hfile_shutdown(1);
5156
0
}
5157
5158
0
void hts_free(void *ptr) {
5159
0
    free(ptr);
5160
0
}
5161
5162
void hts_set_log_level(enum htsLogLevel level)
5163
0
{
5164
0
    hts_verbose = level;
5165
0
}
5166
5167
enum htsLogLevel hts_get_log_level(void)
5168
0
{
5169
0
    return hts_verbose;
5170
0
}
5171
5172
static char get_severity_tag(enum htsLogLevel severity)
5173
375k
{
5174
375k
    switch (severity) {
5175
38.4k
    case HTS_LOG_ERROR:
5176
38.4k
        return 'E';
5177
337k
    case HTS_LOG_WARNING:
5178
337k
        return 'W';
5179
0
    case HTS_LOG_INFO:
5180
0
        return 'I';
5181
0
    case HTS_LOG_DEBUG:
5182
0
        return 'D';
5183
0
    case HTS_LOG_TRACE:
5184
0
        return 'T';
5185
0
    default:
5186
0
        break;
5187
375k
    }
5188
5189
0
    return '*';
5190
375k
}
5191
5192
void hts_log(enum htsLogLevel severity, const char *context, const char *format, ...)
5193
413k
{
5194
413k
    int save_errno = errno;
5195
413k
    if (severity <= hts_verbose) {
5196
375k
        va_list argptr;
5197
5198
375k
        fprintf(stderr, "[%c::%s] ", get_severity_tag(severity), context);
5199
5200
375k
        va_start(argptr, format);
5201
375k
        vfprintf(stderr, format, argptr);
5202
375k
        va_end(argptr);
5203
5204
375k
        fprintf(stderr, "\n");
5205
375k
    }
5206
    errno = save_errno;
5207
413k
}