Coverage Report

Created: 2025-08-29 06:39

/src/htslib/hts.c
Line
Count
Source (jump to first uncovered line)
1
/*  hts.c -- format-neutral I/O, indexing, and iterator API functions.
2
3
    Copyright (C) 2008, 2009, 2012-2024 Genome Research Ltd.
4
    Copyright (C) 2012, 2013 Broad Institute.
5
6
    Author: Heng Li <lh3@sanger.ac.uk>
7
8
Permission is hereby granted, free of charge, to any person obtaining a copy
9
of this software and associated documentation files (the "Software"), to deal
10
in the Software without restriction, including without limitation the rights
11
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
copies of the Software, and to permit persons to whom the Software is
13
furnished to do so, subject to the following conditions:
14
15
The above copyright notice and this permission notice shall be included in
16
all copies or substantial portions of the Software.
17
18
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24
DEALINGS IN THE SOFTWARE.  */
25
26
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
27
#include <config.h>
28
29
#include <zlib.h>
30
#include <stdio.h>
31
#include <string.h>
32
#include <strings.h>
33
#include <stdlib.h>
34
#include <unistd.h>
35
#include <inttypes.h>
36
#include <limits.h>
37
#include <stdint.h>
38
#include <fcntl.h>
39
#include <errno.h>
40
#include <time.h>
41
#include <sys/stat.h>
42
#include <assert.h>
43
44
#ifdef HAVE_LIBLZMA
45
#ifdef HAVE_LZMA_H
46
#include <lzma.h>
47
#else
48
#include "os/lzma_stub.h"
49
#endif
50
#endif
51
52
#include "htslib/hts.h"
53
#include "htslib/bgzf.h"
54
#include "cram/cram.h"
55
#include "htslib/hfile.h"
56
#include "htslib/hts_endian.h"
57
#include "version.h"
58
#include "config_vars.h"
59
#include "hts_internal.h"
60
#include "hfile_internal.h"
61
#include "sam_internal.h"
62
#include "htslib/hts_expr.h"
63
#include "htslib/hts_os.h" // drand48
64
65
#include "htslib/khash.h"
66
#include "htslib/kseq.h"
67
#include "htslib/ksort.h"
68
#include "htslib/tbx.h"
69
#if defined(HAVE_EXTERNAL_LIBHTSCODECS)
70
#include <htscodecs/htscodecs.h>
71
#else
72
#include "htscodecs/htscodecs/htscodecs.h"
73
#endif
74
75
#ifndef EFTYPE
76
1
#define EFTYPE ENOEXEC
77
#endif
78
79
24.5k
KHASH_INIT2(s2i,, kh_cstr_t, int64_t, 1, kh_str_hash_func, kh_str_hash_equal)
80
81
HTSLIB_EXPORT
82
int hts_verbose = HTS_LOG_WARNING;
83
84
const char *hts_version(void)
85
2
{
86
2
    return HTS_VERSION_TEXT;
87
2
}
88
89
0
unsigned int hts_features(void) {
90
0
    unsigned int feat = HTS_FEATURE_HTSCODECS; // Always present
91
92
0
#ifdef PACKAGE_URL
93
0
    feat |= HTS_FEATURE_CONFIGURE;
94
0
#endif
95
96
#ifdef ENABLE_PLUGINS
97
    feat |= HTS_FEATURE_PLUGINS;
98
#endif
99
100
0
#ifdef HAVE_LIBCURL
101
0
    feat |= HTS_FEATURE_LIBCURL;
102
0
#endif
103
104
0
#ifdef ENABLE_S3
105
0
    feat |= HTS_FEATURE_S3;
106
0
#endif
107
108
0
#ifdef ENABLE_GCS
109
0
    feat |= HTS_FEATURE_GCS;
110
0
#endif
111
112
#ifdef HAVE_LIBDEFLATE
113
    feat |= HTS_FEATURE_LIBDEFLATE;
114
#endif
115
116
0
#ifdef HAVE_LIBLZMA
117
0
    feat |= HTS_FEATURE_LZMA;
118
0
#endif
119
120
0
#ifdef HAVE_LIBBZ2
121
0
    feat |= HTS_FEATURE_BZIP2;
122
0
#endif
123
124
0
    return feat;
125
0
}
126
127
0
const char *hts_test_feature(unsigned int id) {
128
0
    unsigned int feat = hts_features();
129
130
0
    switch (id) {
131
0
    case HTS_FEATURE_CONFIGURE:
132
0
        return feat & HTS_FEATURE_CONFIGURE ? "yes" : NULL;
133
0
    case HTS_FEATURE_PLUGINS:
134
0
        return feat & HTS_FEATURE_PLUGINS ? "yes" : NULL;
135
0
    case HTS_FEATURE_LIBCURL:
136
0
        return feat & HTS_FEATURE_LIBCURL ? "yes" : NULL;
137
0
    case HTS_FEATURE_S3:
138
0
        return feat & HTS_FEATURE_S3 ? "yes" : NULL;
139
0
    case HTS_FEATURE_GCS:
140
0
        return feat & HTS_FEATURE_GCS ? "yes" : NULL;
141
0
    case HTS_FEATURE_LIBDEFLATE:
142
0
        return feat & HTS_FEATURE_LIBDEFLATE ? "yes" : NULL;
143
0
    case HTS_FEATURE_BZIP2:
144
0
        return feat & HTS_FEATURE_BZIP2 ? "yes" : NULL;
145
0
    case HTS_FEATURE_LZMA:
146
0
        return feat & HTS_FEATURE_LZMA ? "yes" : NULL;
147
148
0
    case HTS_FEATURE_HTSCODECS:
149
0
        return htscodecs_version();
150
151
0
    case HTS_FEATURE_CC:
152
0
        return HTS_CC;
153
0
    case HTS_FEATURE_CFLAGS:
154
0
        return HTS_CFLAGS;
155
0
    case HTS_FEATURE_LDFLAGS:
156
0
        return HTS_LDFLAGS;
157
0
    case HTS_FEATURE_CPPFLAGS:
158
0
        return HTS_CPPFLAGS;
159
160
0
    default:
161
0
        fprintf(stderr, "Unknown feature code: %u\n", id);
162
0
    }
163
164
0
    return NULL;
165
0
}
166
167
// Note this implementation also means we can just "strings" the library
168
// to find the configuration parameters.
169
0
const char *hts_feature_string(void) {
170
0
    static char config[1200];
171
0
    const char *flags=
172
173
0
#ifdef PACKAGE_URL
174
0
    "build=configure "
175
#else
176
    "build=Makefile "
177
#endif
178
179
0
#ifdef HAVE_LIBCURL
180
0
    "libcurl=yes "
181
#else
182
    "libcurl=no "
183
#endif
184
185
0
#ifdef ENABLE_S3
186
0
    "S3=yes "
187
#else
188
    "S3=no "
189
#endif
190
191
0
#ifdef ENABLE_GCS
192
0
    "GCS=yes "
193
#else
194
    "GCS=no "
195
#endif
196
197
#ifdef HAVE_LIBDEFLATE
198
    "libdeflate=yes "
199
#else
200
0
    "libdeflate=no "
201
0
#endif
202
203
0
#ifdef HAVE_LIBLZMA
204
0
    "lzma=yes "
205
#else
206
    "lzma=no "
207
#endif
208
209
0
#ifdef HAVE_LIBBZ2
210
0
    "bzip2=yes "
211
#else
212
    "bzip2=no "
213
#endif
214
215
// "plugins=" must stay at the end as it is followed by "plugin-path="
216
#ifdef ENABLE_PLUGINS
217
    "plugins=yes";
218
#else
219
0
    "plugins=no";
220
0
#endif
221
222
#ifdef ENABLE_PLUGINS
223
    snprintf(config, sizeof(config),
224
             "%s plugin-path=%.1000s htscodecs=%.40s",
225
             flags, hts_plugin_path(), htscodecs_version());
226
#else
227
0
    snprintf(config, sizeof(config),
228
0
             "%s htscodecs=%.40s",
229
0
             flags, htscodecs_version());
230
0
#endif
231
0
    return config;
232
0
}
233
234
235
// Converts ASCII to BAM nibble encoding.
236
// Note 0123 is treated as ACGT (ABI colourspace encoding) and
237
// U is treated as T.
238
HTSLIB_EXPORT
239
const unsigned char seq_nt16_table[256] = {
240
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
241
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
242
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
243
     1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15,
244
    15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
245
    15,15, 5, 6,  8, 8, 7, 9, 15,10,15,15, 15,15,15,15,
246
    15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
247
    15,15, 5, 6,  8, 8, 7, 9, 15,10,15,15, 15,15,15,15,
248
249
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
250
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
251
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
252
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
253
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
254
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
255
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
256
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
257
};
258
259
HTSLIB_EXPORT
260
const char seq_nt16_str[] = "=ACMGRSVTWYHKDBN";
261
262
HTSLIB_EXPORT
263
const int seq_nt16_int[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
264
265
/**********************
266
 *** Basic file I/O ***
267
 **********************/
268
269
static enum htsFormatCategory format_category(enum htsExactFormat fmt)
270
3.94k
{
271
3.94k
    switch (fmt) {
272
0
    case bam:
273
0
    case sam:
274
979
    case cram:
275
979
    case fastq_format:
276
979
    case fasta_format:
277
979
        return sequence_data;
278
279
0
    case vcf:
280
0
    case bcf:
281
0
        return variant_data;
282
283
0
    case bai:
284
0
    case crai:
285
0
    case csi:
286
0
    case fai_format:
287
0
    case fqi_format:
288
0
    case gzi:
289
0
    case tbi:
290
0
        return index_file;
291
292
0
    case bed:
293
0
    case d4_format:
294
0
        return region_list;
295
296
0
    case htsget:
297
0
    case hts_crypt4gh_format:
298
0
        return unknown_category;
299
300
0
    case unknown_format:
301
1.48k
    case binary_format:
302
2.96k
    case text_format:
303
2.96k
    case empty_format:
304
2.96k
    case format_maximum:
305
2.96k
        break;
306
3.94k
    }
307
308
2.96k
    return unknown_category;
309
3.94k
}
310
311
// Decompress several hundred bytes by peeking at the file, which must be
312
// positioned at the start of a GZIP block.
313
static ssize_t
314
decompress_peek_gz(hFILE *fp, unsigned char *dest, size_t destsize)
315
1.11k
{
316
1.11k
    unsigned char buffer[2048];
317
1.11k
    z_stream zs;
318
1.11k
    ssize_t npeek = hpeek(fp, buffer, sizeof buffer);
319
320
1.11k
    if (npeek < 0) return -1;
321
322
1.11k
    zs.zalloc = NULL;
323
1.11k
    zs.zfree = NULL;
324
1.11k
    zs.next_in = buffer;
325
1.11k
    zs.avail_in = npeek;
326
1.11k
    zs.next_out = dest;
327
1.11k
    zs.avail_out = destsize;
328
1.11k
    if (inflateInit2(&zs, 31) != Z_OK) return -1;
329
330
1.11k
    int ret;
331
1.11k
    const unsigned char *last_in = buffer;
332
2.23k
    while (zs.avail_out > 0) {
333
1.12k
        ret = inflate(&zs, Z_SYNC_FLUSH);
334
1.12k
        if (ret == Z_STREAM_END) {
335
3
            if (last_in == zs.next_in)
336
0
                break; // Paranoia to avoid potential looping. Shouldn't happen
337
3
            else
338
3
                last_in = zs.next_in;
339
3
            inflateReset(&zs);
340
1.11k
        } else if (ret != Z_OK) {
341
            // eg Z_BUF_ERROR due to avail_in/out becoming zero
342
5
            break;
343
5
        }
344
1.12k
    }
345
346
    // NB: zs.total_out is changed by inflateReset, so use pointer diff instead
347
1.11k
    destsize = zs.next_out - dest;
348
1.11k
    inflateEnd(&zs);
349
350
1.11k
    return destsize;
351
1.11k
}
352
353
#ifdef HAVE_LIBLZMA
354
// Similarly decompress a portion by peeking at the file, which must be
355
// positioned at the start of the file.
356
static ssize_t
357
decompress_peek_xz(hFILE *fp, unsigned char *dest, size_t destsize)
358
0
{
359
0
    unsigned char buffer[2048];
360
0
    ssize_t npeek = hpeek(fp, buffer, sizeof buffer);
361
0
    if (npeek < 0) return -1;
362
363
0
    lzma_stream ls = LZMA_STREAM_INIT;
364
0
    if (lzma_stream_decoder(&ls, lzma_easy_decoder_memusage(9), 0) != LZMA_OK)
365
0
        return -1;
366
367
0
    ls.next_in = buffer;
368
0
    ls.avail_in = npeek;
369
0
    ls.next_out = dest;
370
0
    ls.avail_out = destsize;
371
372
0
    int r = lzma_code(&ls, LZMA_RUN);
373
0
    if (! (r == LZMA_OK || r == LZMA_STREAM_END)) {
374
0
        lzma_end(&ls);
375
0
        return -1;
376
0
    }
377
378
0
    destsize = ls.total_out;
379
0
    lzma_end(&ls);
380
381
0
    return destsize;
382
0
}
383
#endif
384
385
// Parse "x.y" text, taking care because the string is not NUL-terminated
386
// and filling in major/minor only when the digits are followed by a delimiter,
387
// so we don't misread "1.10" as "1.1" due to reaching the end of the buffer.
388
static void
389
parse_version(htsFormat *fmt, const unsigned char *u, const unsigned char *ulim)
390
46
{
391
46
    const char *s    = (const char *) u;
392
46
    const char *slim = (const char *) ulim;
393
46
    short v;
394
395
46
    fmt->version.major = fmt->version.minor = -1;
396
397
76
    for (v = 0; s < slim && isdigit_c(*s); s++)
398
30
        v = 10 * v + *s - '0';
399
400
46
    if (s < slim) {
401
46
        fmt->version.major = v;
402
46
        if (*s == '.') {
403
18
            s++;
404
81
            for (v = 0; s < slim && isdigit_c(*s); s++)
405
63
                v = 10 * v + *s - '0';
406
18
            if (s < slim)
407
18
                fmt->version.minor = v;
408
18
        }
409
28
        else
410
28
            fmt->version.minor = 0;
411
46
    }
412
46
}
413
414
static int
415
cmp_nonblank(const char *key, const unsigned char *u, const unsigned char *ulim)
416
958
{
417
958
    const unsigned char *ukey = (const unsigned char *) key;
418
419
958
    while (*ukey)
420
958
        if (u >= ulim) return +1;
421
958
        else if (isspace_c(*u)) u++;
422
958
        else if (*u != *ukey) return (*ukey < *u)? -1 : +1;
423
0
        else u++, ukey++;
424
425
0
    return 0;
426
958
}
427
428
static int is_text_only(const unsigned char *u, const unsigned char *ulim)
429
957
{
430
34.9k
    for (; u < ulim; u++)
431
33.9k
        if (! (*u >= ' ' || *u == '\t' || *u == '\r' || *u == '\n'))
432
1
            return 0;
433
434
956
    return 1;
435
957
}
436
437
static inline int
438
alternate_zeros(const unsigned char *u, const unsigned char *ulim)
439
0
{
440
0
    for (; u < ulim; u += 2)
441
0
        if (*u != '\0') return 0;
442
0
    return 1;
443
0
}
444
445
static int is_utf16_text(const unsigned char *u, const unsigned char *ulim)
446
0
{
447
0
    if (ulim - u >= 6 &&
448
0
        ((u[0] == 0xfe && u[1] == 0xff && alternate_zeros(u+2, ulim)) ||
449
0
         (u[0] == 0xff && u[1] == 0xfe && alternate_zeros(u+3, ulim))))
450
0
        return 2;
451
0
    else if (ulim - u >= 8 &&
452
0
             (alternate_zeros(u, ulim) || alternate_zeros(u+1, ulim)))
453
0
        return 1;
454
0
    else
455
0
        return 0;
456
0
}
457
458
static int is_fastaq(const unsigned char *u, const unsigned char *ulim)
459
956
{
460
956
    const unsigned char *eol = memchr(u, '\n', ulim - u);
461
462
    // Check that the first line is entirely textual
463
956
    if (! is_text_only(u, eol? eol : ulim)) return 0;
464
465
    // If the first line is very long, consider the file to indeed be FASTA/Q
466
956
    if (eol == NULL) return 1;
467
468
944
    u = eol+1; // Now points to the first character of the second line
469
470
    // Scan over all base-encoding letters (including 'N' but not SEQ's '=')
471
1.01k
    while (u < ulim && (seq_nt16_table[*u] != 15 || toupper(*u) == 'N')) {
472
68
        if (*u == '=') return 0;
473
68
        u++;
474
68
    }
475
476
944
    return (u == ulim || *u == '\r' || *u == '\n')? 1 : 0;
477
944
}
478
479
// Parse tab-delimited text, filling in a string of column types and returning
480
// the number of columns spotted (within [u,ulim), and up to column_len) or -1
481
// if non-printable characters were seen.  Column types:
482
//     i: integer, s: strand sign, C: CIGAR, O: SAM optional field, Z: anything
483
static int
484
parse_tabbed_text(char *columns, int column_len,
485
                  const unsigned char *u, const unsigned char *ulim,
486
                  int *complete)
487
1
{
488
1
    const char *str  = (const char *) u;
489
1
    const char *slim = (const char *) ulim;
490
1
    const char *s;
491
1
    int ncolumns = 0;
492
493
1
    enum { digit = 1, leading_sign = 2, cigar_operator = 4, other = 8 };
494
1
    unsigned seen = 0;
495
1
    *complete = 0;
496
497
1
    for (s = str; s < slim; s++)
498
1
        if (*s >= ' ') {
499
0
            if (isdigit_c(*s))
500
0
                seen |= digit;
501
0
            else if ((*s == '+' || *s == '-') && s == str)
502
0
                seen |= leading_sign;
503
0
            else if (strchr(BAM_CIGAR_STR, *s) && s > str && isdigit_c(s[-1]))
504
0
                seen |= cigar_operator;
505
0
            else
506
0
                seen |= other;
507
0
        }
508
1
        else if (*s == '\t' || *s == '\r' || *s == '\n') {
509
0
            size_t len = s - str;
510
0
            char type;
511
512
0
            if (seen == digit || seen == (leading_sign|digit)) type = 'i';
513
0
            else if (seen == (digit|cigar_operator)) type = 'C';
514
0
            else if (len == 1)
515
0
                switch (str[0]) {
516
0
                case '*': type = 'C'; break;
517
0
                case '+': case '-': case '.': type = 's'; break;
518
0
                default: type = 'Z'; break;
519
0
                }
520
0
            else if (len >= 5 && str[2] == ':' && str[4] == ':') type = 'O';
521
0
            else type = 'Z';
522
523
0
            columns[ncolumns++] = type;
524
0
            if (*s != '\t' || ncolumns >= column_len - 1) {
525
0
                *complete = 1; // finished the line or more columns than needed
526
0
                break;
527
0
            }
528
529
0
            str = s + 1;
530
0
            seen = 0;
531
0
        }
532
1
        else return -1;
533
534
0
    columns[ncolumns] = '\0';
535
0
    return ncolumns;
536
1
}
537
538
// Match COLUMNS as a prefix against PATTERN (so COLUMNS may run out first).
539
// Returns len(COLUMNS) (modulo '+'), or 0 if there is a mismatched entry.
540
static int colmatch(const char *columns, const char *pattern)
541
0
{
542
0
    int i;
543
0
    for (i = 0; columns[i] != '\0'; i++) {
544
0
        if (pattern[i] == '+') return i;
545
0
        if (! (columns[i] == pattern[i] || pattern[i] == 'Z')) return 0;
546
0
    }
547
548
0
    return i;
549
0
}
550
551
int hts_detect_format(hFILE *hfile, htsFormat *fmt)
552
0
{
553
0
    return hts_detect_format2(hfile, NULL, fmt);
554
0
}
555
556
int hts_detect_format2(hFILE *hfile, const char *fname, htsFormat *fmt)
557
5.44k
{
558
5.44k
    char extension[HTS_MAX_EXT_LEN], columns[24];
559
5.44k
    unsigned char s[1024];
560
5.44k
    int complete = 0;
561
5.44k
    ssize_t len = hpeek(hfile, s, 18);
562
5.44k
    if (len < 0) return -1;
563
564
5.44k
    fmt->category = unknown_category;
565
5.44k
    fmt->format = unknown_format;
566
5.44k
    fmt->version.major = fmt->version.minor = -1;
567
5.44k
    fmt->compression = no_compression;
568
5.44k
    fmt->compression_level = -1;
569
5.44k
    fmt->specific = NULL;
570
571
5.44k
    if (len >= 2 && s[0] == 0x1f && s[1] == 0x8b) {
572
        // The stream is either gzip-compressed or BGZF-compressed.
573
        // Determine which, and decompress the first few records or lines.
574
1.11k
        fmt->compression = gzip;
575
1.11k
        if (len >= 18 && (s[3] & 4)) {
576
521
            if (memcmp(&s[12], "BC\2\0", 4) == 0)
577
16
                fmt->compression = bgzf;
578
505
            else if (memcmp(&s[12], "RAZF", 4) == 0)
579
1
                fmt->compression = razf_compression;
580
521
        }
581
1.11k
        if (len >= 9 && s[2] == 8)
582
1.11k
            fmt->compression_level = (s[8] == 2)? 9 : (s[8] == 4)? 1 : -1;
583
584
1.11k
        len = decompress_peek_gz(hfile, s, sizeof s);
585
1.11k
    }
586
4.33k
    else if (len >= 10 && memcmp(s, "BZh", 3) == 0 &&
587
4.33k
             (memcmp(&s[4], "\x31\x41\x59\x26\x53\x59", 6) == 0 ||
588
0
              memcmp(&s[4], "\x17\x72\x45\x38\x50\x90", 6) == 0)) {
589
0
        fmt->compression = bzip2_compression;
590
0
        fmt->compression_level = s[3] - '0';
591
        // Decompressing via libbz2 produces no output until it has a whole
592
        // block (of size 100Kb x level), which is too large for peeking.
593
        // So unfortunately we can recognise bzip2 but not the contents,
594
        // except that \x1772... magic indicates the stream is empty.
595
0
        if (s[4] == '\x31') return 0;
596
0
        else len = 0;
597
0
    }
598
4.33k
    else if (len >= 6 && memcmp(s, "\xfd""7zXZ\0", 6) == 0) {
599
0
        fmt->compression = xz_compression;
600
0
#ifdef HAVE_LIBLZMA
601
0
        len = decompress_peek_xz(hfile, s, sizeof s);
602
#else
603
        // Without liblzma, we can't recognise the decompressed contents.
604
        return 0;
605
#endif
606
0
    }
607
4.33k
    else if (len >= 4 && memcmp(s, "\x28\xb5\x2f\xfd", 4) == 0) {
608
0
        fmt->compression = zstd_compression;
609
0
        return 0;
610
0
    }
611
4.33k
    else {
612
4.33k
        len = hpeek(hfile, s, sizeof s);
613
4.33k
    }
614
5.44k
    if (len < 0) return -1;
615
616
5.44k
    if (len == 0) {
617
1
        fmt->format = empty_format;
618
1
        return 0;
619
1
    }
620
621
    // We avoid using filename extensions wherever possible (as filenames are
622
    // not always available), but in a few cases they must be considered:
623
    //  - FASTA/Q indexes are simply tab-separated text; files that match these
624
    //    patterns but not the fai/fqi extension are usually generic BED files
625
    //  - GZI indexes have no magic numbers so can only be detected by filename
626
5.44k
    if (fname && strcmp(fname, "-") != 0) {
627
5.44k
        char *s;
628
5.44k
        if (find_file_extension(fname, extension) < 0) extension[0] = '\0';
629
5.44k
        for (s = extension; *s; s++) *s = tolower_c(*s);
630
5.44k
    }
631
0
    else extension[0] = '\0';
632
633
5.44k
    if (len >= 6 && memcmp(s,"CRAM",4) == 0 && s[4]>=1 && s[4]<=7 && s[5]<=7) {
634
636
        fmt->category = sequence_data;
635
636
        fmt->format = cram;
636
636
        fmt->version.major = s[4], fmt->version.minor = s[5];
637
636
        fmt->compression = custom;
638
636
        return 0;
639
636
    }
640
4.81k
    else if (len >= 4 && s[3] <= '\4') {
641
217
        if (memcmp(s, "BAM\1", 4) == 0) {
642
76
            fmt->category = sequence_data;
643
76
            fmt->format = bam;
644
            // TODO Decompress enough to pick version from @HD-VN header
645
76
            fmt->version.major = 1, fmt->version.minor = -1;
646
76
            return 0;
647
76
        }
648
141
        else if (memcmp(s, "BAI\1", 4) == 0) {
649
0
            fmt->category = index_file;
650
0
            fmt->format = bai;
651
0
            fmt->version.major = -1, fmt->version.minor = -1;
652
0
            return 0;
653
0
        }
654
141
        else if (memcmp(s, "BCF\4", 4) == 0) {
655
0
            fmt->category = variant_data;
656
0
            fmt->format = bcf;
657
0
            fmt->version.major = 1, fmt->version.minor = -1;
658
0
            return 0;
659
0
        }
660
141
        else if (memcmp(s, "BCF\2", 4) == 0) {
661
141
            fmt->category = variant_data;
662
141
            fmt->format = bcf;
663
141
            fmt->version.major = s[3];
664
141
            fmt->version.minor = (len >= 5 && s[4] <= 2)? s[4] : 0;
665
141
            return 0;
666
141
        }
667
0
        else if (memcmp(s, "CSI\1", 4) == 0) {
668
0
            fmt->category = index_file;
669
0
            fmt->format = csi;
670
0
            fmt->version.major = 1, fmt->version.minor = -1;
671
0
            return 0;
672
0
        }
673
0
        else if (memcmp(s, "TBI\1", 4) == 0) {
674
0
            fmt->category = index_file;
675
0
            fmt->format = tbi;
676
0
            return 0;
677
0
        }
678
        // GZI indexes have no magic numbers, so must be recognised solely by
679
        // filename extension.
680
0
        else if (strcmp(extension, "gzi") == 0) {
681
0
            fmt->category = index_file;
682
0
            fmt->format = gzi;
683
0
            return 0;
684
0
        }
685
217
    }
686
4.59k
    else if (len >= 16 && memcmp(s, "##fileformat=VCF", 16) == 0) {
687
1.36k
        fmt->category = variant_data;
688
1.36k
        fmt->format = vcf;
689
1.36k
        if (len >= 21 && s[16] == 'v')
690
18
            parse_version(fmt, &s[17], &s[len]);
691
1.36k
        return 0;
692
1.36k
    }
693
3.22k
    else if (len >= 4 && s[0] == '@' &&
694
3.22k
             (memcmp(s, "@HD\t", 4) == 0 || memcmp(s, "@SQ\t", 4) == 0 ||
695
2.30k
              memcmp(s, "@RG\t", 4) == 0 || memcmp(s, "@PG\t", 4) == 0 ||
696
2.30k
              memcmp(s, "@CO\t", 4) == 0)) {
697
2.26k
        fmt->category = sequence_data;
698
2.26k
        fmt->format = sam;
699
        // @HD-VN is not guaranteed to be the first tag, but then @HD is
700
        // not guaranteed to be present at all...
701
2.26k
        if (len >= 9 && memcmp(s, "@HD\tVN:", 7) == 0)
702
28
            parse_version(fmt, &s[7], &s[len]);
703
2.24k
        else
704
2.24k
            fmt->version.major = 1, fmt->version.minor = -1;
705
2.26k
        return 0;
706
2.26k
    }
707
958
    else if (len >= 8 && memcmp(s, "d4\xdd\xdd", 4) == 0) {
708
0
        fmt->category = region_list;
709
0
        fmt->format = d4_format;
710
        // How to decode the D4 Format Version bytes is not yet specified
711
        // so we don't try to set fmt->version.{major,minor}.
712
0
        return 0;
713
0
    }
714
958
    else if (cmp_nonblank("{\"htsget\":", s, &s[len]) == 0) {
715
0
        fmt->category = unknown_category;
716
0
        fmt->format = htsget;
717
0
        return 0;
718
0
    }
719
958
    else if (len > 8 && memcmp(s, "crypt4gh", 8) == 0) {
720
1
        fmt->category = unknown_category;
721
1
        fmt->format = hts_crypt4gh_format;
722
1
        return 0;
723
1
    }
724
957
    else if (len >= 1 && s[0] == '>' && is_fastaq(s, &s[len])) {
725
924
        fmt->category = sequence_data;
726
924
        fmt->format = fasta_format;
727
924
        return 0;
728
924
    }
729
33
    else if (len >= 1 && s[0] == '@' && is_fastaq(s, &s[len])) {
730
32
        fmt->category = sequence_data;
731
32
        fmt->format = fastq_format;
732
32
        return 0;
733
32
    }
734
1
    else if (parse_tabbed_text(columns, sizeof columns, s,
735
1
                               &s[len], &complete) > 0) {
736
        // A complete SAM line is at least 11 columns.  On unmapped long reads may
737
        // be missing two.  (On mapped long reads we must have an @ header so long
738
        // CIGAR is irrelevant.)
739
0
        if (colmatch(columns, "ZiZiiCZiiZZOOOOOOOOOOOOOOOOOOOO+")
740
0
            >= 9 + 2*complete) {
741
0
            fmt->category = sequence_data;
742
0
            fmt->format = sam;
743
0
            fmt->version.major = 1, fmt->version.minor = -1;
744
0
            return 0;
745
0
        }
746
0
        else if (fmt->compression == gzip && colmatch(columns, "iiiiii") == 6) {
747
0
            fmt->category = index_file;
748
0
            fmt->format = crai;
749
0
            return 0;
750
0
        }
751
0
        else if (strstr(extension, "fqi") && colmatch(columns, "Ziiiii") == 6) {
752
0
            fmt->category = index_file;
753
0
            fmt->format = fqi_format;
754
0
            return 0;
755
0
        }
756
0
        else if (strstr(extension, "fai") && colmatch(columns, "Ziiii") == 5) {
757
0
            fmt->category = index_file;
758
0
            fmt->format = fai_format;
759
0
            return 0;
760
0
        }
761
0
        else if (colmatch(columns, "Zii+") >= 3) {
762
0
            fmt->category = region_list;
763
0
            fmt->format = bed;
764
0
            return 0;
765
0
        }
766
0
    }
767
768
    // Arbitrary text files can be read using hts_getline().
769
1
    if (is_text_only(s, &s[len])) fmt->format = text_format;
770
771
    // Nothing recognised: leave unset fmt-> fields as unknown.
772
1
    return 0;
773
5.44k
}
774
775
char *hts_format_description(const htsFormat *format)
776
0
{
777
0
    kstring_t str = { 0, 0, NULL };
778
779
0
    switch (format->format) {
780
0
    case sam:   kputs("SAM", &str); break;
781
0
    case bam:   kputs("BAM", &str); break;
782
0
    case cram:  kputs("CRAM", &str); break;
783
0
    case fasta_format:  kputs("FASTA", &str); break;
784
0
    case fastq_format:  kputs("FASTQ", &str); break;
785
0
    case vcf:   kputs("VCF", &str); break;
786
0
    case bcf:
787
0
        if (format->version.major == 1) kputs("Legacy BCF", &str);
788
0
        else kputs("BCF", &str);
789
0
        break;
790
0
    case bai:   kputs("BAI", &str); break;
791
0
    case crai:  kputs("CRAI", &str); break;
792
0
    case csi:   kputs("CSI", &str); break;
793
0
    case fai_format:    kputs("FASTA-IDX", &str); break;
794
0
    case fqi_format:    kputs("FASTQ-IDX", &str); break;
795
0
    case gzi:   kputs("GZI", &str); break;
796
0
    case tbi:   kputs("Tabix", &str); break;
797
0
    case bed:   kputs("BED", &str); break;
798
0
    case d4_format:     kputs("D4", &str); break;
799
0
    case htsget: kputs("htsget", &str); break;
800
0
    case hts_crypt4gh_format: kputs("crypt4gh", &str); break;
801
0
    case empty_format:  kputs("empty", &str); break;
802
0
    default:    kputs("unknown", &str); break;
803
0
    }
804
805
0
    if (format->version.major >= 0) {
806
0
        kputs(" version ", &str);
807
0
        kputw(format->version.major, &str);
808
0
        if (format->version.minor >= 0) {
809
0
            kputc('.', &str);
810
0
            kputw(format->version.minor, &str);
811
0
        }
812
0
    }
813
814
0
    switch (format->compression) {
815
0
    case bzip2_compression:  kputs(" bzip2-compressed", &str); break;
816
0
    case razf_compression:   kputs(" legacy-RAZF-compressed", &str); break;
817
0
    case xz_compression:     kputs(" XZ-compressed", &str); break;
818
0
    case zstd_compression:   kputs(" Zstandard-compressed", &str); break;
819
0
    case custom: kputs(" compressed", &str); break;
820
0
    case gzip:   kputs(" gzip-compressed", &str); break;
821
822
0
    case bgzf:
823
0
        switch (format->format) {
824
0
        case bam:
825
0
        case bcf:
826
0
        case csi:
827
0
        case tbi:
828
            // These are by definition BGZF, so just use the generic term
829
0
            kputs(" compressed", &str);
830
0
            break;
831
0
        default:
832
0
            kputs(" BGZF-compressed", &str);
833
0
            break;
834
0
        }
835
0
        break;
836
837
0
    case no_compression:
838
0
        switch (format->format) {
839
0
        case bam:
840
0
        case bcf:
841
0
        case cram:
842
0
        case csi:
843
0
        case tbi:
844
            // These are normally compressed, so emphasise that this one isn't
845
0
            kputs(" uncompressed", &str);
846
0
            break;
847
0
        default:
848
0
            break;
849
0
        }
850
0
        break;
851
852
0
    default: break;
853
0
    }
854
855
0
    switch (format->category) {
856
0
    case sequence_data: kputs(" sequence", &str); break;
857
0
    case variant_data:  kputs(" variant calling", &str); break;
858
0
    case index_file:    kputs(" index", &str); break;
859
0
    case region_list:   kputs(" genomic region", &str); break;
860
0
    default: break;
861
0
    }
862
863
0
    if (format->compression == no_compression)
864
0
        switch (format->format) {
865
0
        case text_format:
866
0
        case sam:
867
0
        case crai:
868
0
        case vcf:
869
0
        case bed:
870
0
        case fai_format:
871
0
        case fqi_format:
872
0
        case fasta_format:
873
0
        case fastq_format:
874
0
        case htsget:
875
0
            kputs(" text", &str);
876
0
            break;
877
878
0
        case empty_format:
879
0
            break;
880
881
0
        default:
882
0
            kputs(" data", &str);
883
0
            break;
884
0
        }
885
0
    else
886
0
        kputs(" data", &str);
887
888
0
    return ks_release(&str);
889
0
}
890
891
htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt)
892
3.94k
{
893
3.94k
    char smode[101], *cp, *cp2, *mode_c, *uncomp = NULL;
894
3.94k
    htsFile *fp = NULL;
895
3.94k
    hFILE *hfile = NULL;
896
3.94k
    char fmt_code = '\0';
897
    // see enum htsExactFormat in htslib/hts.h
898
3.94k
    const char format_to_mode[] = "\0g\0\0b\0c\0\0b\0g\0\0\0\0\0Ff\0\0";
899
900
3.94k
    strncpy(smode, mode, 99);
901
3.94k
    smode[99]=0;
902
3.94k
    if ((cp = strchr(smode, ',')))
903
0
        *cp = '\0';
904
905
    // Migrate format code (b or c) to the end of the smode buffer.
906
10.3k
    for (cp2 = cp = smode; *cp; cp++) {
907
6.40k
        if (*cp == 'b')
908
1.48k
            fmt_code = 'b';
909
4.92k
        else if (*cp == 'c')
910
979
            fmt_code = 'c';
911
3.94k
        else {
912
3.94k
            *cp2++ = *cp;
913
            // Cache the uncompress flag 'u' pos if present
914
3.94k
            if (!uncomp && (*cp == 'u')) {
915
0
                uncomp = cp2 - 1;
916
0
            }
917
3.94k
        }
918
6.40k
    }
919
3.94k
    mode_c = cp2;
920
3.94k
    *cp2++ = fmt_code;
921
3.94k
    *cp2++ = 0;
922
923
    // Set or reset the format code if opts->format is used
924
3.94k
    if (fmt && fmt->format > unknown_format
925
3.94k
        && fmt->format < sizeof(format_to_mode)) {
926
0
        *mode_c = format_to_mode[fmt->format];
927
0
    }
928
929
    // Uncompressed bam/bcf is not supported, change 'u' to '0' on write
930
3.94k
    if (uncomp && *mode_c == 'b' && (strchr(smode, 'w') || strchr(smode, 'a'))) {
931
0
        *uncomp = '0';
932
0
    }
933
934
    // If we really asked for a compressed text format then mode_c above will
935
    // point to nul.  We set to 'z' to enable bgzf.
936
3.94k
    if (strchr(mode, 'w') && fmt && fmt->compression == bgzf) {
937
0
        if (fmt->format == sam || fmt->format == vcf || fmt->format == text_format)
938
0
            *mode_c = 'z';
939
0
    }
940
941
3.94k
    char *rmme = NULL, *fnidx = strstr(fn, HTS_IDX_DELIM);
942
3.94k
    if ( fnidx ) {
943
0
        rmme = strdup(fn);
944
0
        if ( !rmme ) goto error;
945
0
        rmme[fnidx-fn] = 0;
946
0
        fn = rmme;
947
0
    }
948
949
3.94k
    hfile = hopen(fn, smode);
950
3.94k
    if (hfile == NULL) goto error;
951
952
3.94k
    fp = hts_hopen(hfile, fn, smode);
953
3.94k
    if (fp == NULL) goto error;
954
955
    // Compensate for the loss of exactness in htsExactFormat.
956
    // hts_hopen returns generics such as binary or text, but we
957
    // have been given something explicit here so use that instead.
958
3.94k
    if (fp->is_write && fmt &&
959
3.94k
        (fmt->format == bam || fmt->format == sam ||
960
0
         fmt->format == vcf || fmt->format == bcf ||
961
0
         fmt->format == bed || fmt->format == fasta_format ||
962
0
         fmt->format == fastq_format))
963
0
        fp->format.format = fmt->format;
964
965
3.94k
    if (fmt && fmt->specific) {
966
0
        if (hts_opt_apply(fp, fmt->specific) != 0) {
967
0
            if (((hts_opt*)fmt->specific)->opt == CRAM_OPT_REFERENCE &&
968
0
                (errno == ENOENT || errno == EIO || errno == EBADF ||
969
0
                  errno == EACCES || errno == EISDIR)) {
970
                /* error during reference file operation
971
                 for these specific errors, set the error as EINVAL */
972
0
                errno = EINVAL;
973
0
            }
974
0
            goto error;
975
0
        }
976
0
    }
977
3.94k
    if ( rmme ) free(rmme);
978
3.94k
    return fp;
979
980
0
error:
981
0
    hts_log_error("Failed to open file \"%s\"%s%s", fn,
982
0
                  errno ? " : " : "", errno ? strerror(errno) : "");
983
0
    if ( rmme ) free(rmme);
984
985
0
    if (hfile)
986
0
        hclose_abruptly(hfile);
987
988
0
    return NULL;
989
3.94k
}
990
991
3.94k
htsFile *hts_open(const char *fn, const char *mode) {
992
3.94k
    return hts_open_format(fn, mode, NULL);
993
3.94k
}
994
995
/*
996
 * Splits str into a prefix, delimiter ('\0' or delim), and suffix, writing
997
 * the prefix in lowercase into buf and returning a pointer to the suffix.
998
 * On return, buf is always NUL-terminated; thus assumes that the "keyword"
999
 * prefix should be one of several known values of maximum length buflen-2.
1000
 * (If delim is not found, returns a pointer to the '\0'.)
1001
 */
1002
static const char *
1003
scan_keyword(const char *str, char delim, char *buf, size_t buflen)
1004
0
{
1005
0
    size_t i = 0;
1006
0
    while (*str && *str != delim) {
1007
0
        if (i < buflen-1) buf[i++] = tolower_c(*str);
1008
0
        str++;
1009
0
    }
1010
1011
0
    buf[i] = '\0';
1012
0
    return *str? str+1 : str;
1013
0
}
1014
1015
/*
1016
 * Parses arg and appends it to the option list.
1017
 *
1018
 * Returns 0 on success;
1019
 *        -1 on failure.
1020
 */
1021
0
int hts_opt_add(hts_opt **opts, const char *c_arg) {
1022
0
    hts_opt *o, *t;
1023
0
    char *val;
1024
1025
    /*
1026
     * IMPORTANT!!!
1027
     * If you add another string option here, don't forget to also add
1028
     * it to the case statement in hts_opt_apply.
1029
     */
1030
1031
0
    if (!c_arg)
1032
0
        return -1;
1033
1034
0
    if (!(o =  malloc(sizeof(*o))))
1035
0
        return -1;
1036
1037
0
    if (!(o->arg = strdup(c_arg))) {
1038
0
        free(o);
1039
0
        return -1;
1040
0
    }
1041
1042
0
    if (!(val = strchr(o->arg, '=')))
1043
0
        val = "1"; // assume boolean
1044
0
    else
1045
0
        *val++ = '\0';
1046
1047
0
    if (strcmp(o->arg, "decode_md") == 0 ||
1048
0
        strcmp(o->arg, "DECODE_MD") == 0)
1049
0
        o->opt = CRAM_OPT_DECODE_MD, o->val.i = atoi(val);
1050
1051
0
    else if (strcmp(o->arg, "verbosity") == 0 ||
1052
0
             strcmp(o->arg, "VERBOSITY") == 0)
1053
0
        o->opt = CRAM_OPT_VERBOSITY, o->val.i = atoi(val);
1054
1055
0
    else if (strcmp(o->arg, "seqs_per_slice") == 0 ||
1056
0
             strcmp(o->arg, "SEQS_PER_SLICE") == 0)
1057
0
        o->opt = CRAM_OPT_SEQS_PER_SLICE, o->val.i = atoi(val);
1058
1059
0
    else if (strcmp(o->arg, "bases_per_slice") == 0 ||
1060
0
             strcmp(o->arg, "BASES_PER_SLICE") == 0)
1061
0
        o->opt = CRAM_OPT_BASES_PER_SLICE, o->val.i = atoi(val);
1062
1063
0
    else if (strcmp(o->arg, "slices_per_container") == 0 ||
1064
0
             strcmp(o->arg, "SLICES_PER_CONTAINER") == 0)
1065
0
        o->opt = CRAM_OPT_SLICES_PER_CONTAINER, o->val.i = atoi(val);
1066
1067
0
    else if (strcmp(o->arg, "embed_ref") == 0 ||
1068
0
             strcmp(o->arg, "EMBED_REF") == 0)
1069
0
        o->opt = CRAM_OPT_EMBED_REF, o->val.i = atoi(val);
1070
1071
0
    else if (strcmp(o->arg, "no_ref") == 0 ||
1072
0
             strcmp(o->arg, "NO_REF") == 0)
1073
0
        o->opt = CRAM_OPT_NO_REF, o->val.i = atoi(val);
1074
1075
0
    else if (strcmp(o->arg, "pos_delta") == 0 ||
1076
0
             strcmp(o->arg, "POS_DELTA") == 0)
1077
0
        o->opt = CRAM_OPT_POS_DELTA, o->val.i = atoi(val);
1078
1079
0
    else if (strcmp(o->arg, "ignore_md5") == 0 ||
1080
0
             strcmp(o->arg, "IGNORE_MD5") == 0)
1081
0
        o->opt = CRAM_OPT_IGNORE_MD5, o->val.i = atoi(val);
1082
1083
0
    else if (strcmp(o->arg, "use_bzip2") == 0 ||
1084
0
             strcmp(o->arg, "USE_BZIP2") == 0)
1085
0
        o->opt = CRAM_OPT_USE_BZIP2, o->val.i = atoi(val);
1086
1087
0
    else if (strcmp(o->arg, "use_rans") == 0 ||
1088
0
             strcmp(o->arg, "USE_RANS") == 0)
1089
0
        o->opt = CRAM_OPT_USE_RANS, o->val.i = atoi(val);
1090
1091
0
    else if (strcmp(o->arg, "use_lzma") == 0 ||
1092
0
             strcmp(o->arg, "USE_LZMA") == 0)
1093
0
        o->opt = CRAM_OPT_USE_LZMA, o->val.i = atoi(val);
1094
1095
0
    else if (strcmp(o->arg, "use_tok") == 0 ||
1096
0
             strcmp(o->arg, "USE_TOK") == 0)
1097
0
        o->opt = CRAM_OPT_USE_TOK, o->val.i = atoi(val);
1098
1099
0
    else if (strcmp(o->arg, "use_fqz") == 0 ||
1100
0
             strcmp(o->arg, "USE_FQZ") == 0)
1101
0
        o->opt = CRAM_OPT_USE_FQZ, o->val.i = atoi(val);
1102
1103
0
    else if (strcmp(o->arg, "use_arith") == 0 ||
1104
0
             strcmp(o->arg, "USE_ARITH") == 0)
1105
0
        o->opt = CRAM_OPT_USE_ARITH, o->val.i = atoi(val);
1106
1107
0
    else if (strcmp(o->arg, "fast") == 0 ||
1108
0
             strcmp(o->arg, "FAST") == 0)
1109
0
        o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_FAST;
1110
1111
0
    else if (strcmp(o->arg, "normal") == 0 ||
1112
0
             strcmp(o->arg, "NORMAL") == 0)
1113
0
        o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_NORMAL;
1114
1115
0
    else if (strcmp(o->arg, "small") == 0 ||
1116
0
             strcmp(o->arg, "SMALL") == 0)
1117
0
        o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_SMALL;
1118
1119
0
    else if (strcmp(o->arg, "archive") == 0 ||
1120
0
             strcmp(o->arg, "ARCHIVE") == 0)
1121
0
        o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_ARCHIVE;
1122
1123
0
    else if (strcmp(o->arg, "reference") == 0 ||
1124
0
             strcmp(o->arg, "REFERENCE") == 0)
1125
0
        o->opt = CRAM_OPT_REFERENCE, o->val.s = val;
1126
1127
0
    else if (strcmp(o->arg, "version") == 0 ||
1128
0
             strcmp(o->arg, "VERSION") == 0)
1129
0
        o->opt = CRAM_OPT_VERSION, o->val.s =val;
1130
1131
0
    else if (strcmp(o->arg, "multi_seq_per_slice") == 0 ||
1132
0
             strcmp(o->arg, "MULTI_SEQ_PER_SLICE") == 0)
1133
0
        o->opt = CRAM_OPT_MULTI_SEQ_PER_SLICE, o->val.i = atoi(val);
1134
1135
0
    else if (strcmp(o->arg, "nthreads") == 0 ||
1136
0
             strcmp(o->arg, "NTHREADS") == 0)
1137
0
        o->opt = HTS_OPT_NTHREADS, o->val.i = atoi(val);
1138
1139
0
    else if (strcmp(o->arg, "cache_size") == 0 ||
1140
0
             strcmp(o->arg, "CACHE_SIZE") == 0) {
1141
0
        char *endp;
1142
0
        o->opt = HTS_OPT_CACHE_SIZE;
1143
0
        o->val.i = strtol(val, &endp, 0);
1144
        // NB: Doesn't support floats, eg 1.5g
1145
        // TODO: extend hts_parse_decimal? See also samtools sort.
1146
0
        switch (*endp) {
1147
0
        case 'g': case 'G': o->val.i *= 1024; // fall through
1148
0
        case 'm': case 'M': o->val.i *= 1024; // fall through
1149
0
        case 'k': case 'K': o->val.i *= 1024; break;
1150
0
        case '\0': break;
1151
0
        default:
1152
0
            hts_log_error("Unrecognised cache size suffix '%c'", *endp);
1153
0
            free(o->arg);
1154
0
            free(o);
1155
0
            return -1;
1156
0
        }
1157
0
    }
1158
1159
0
    else if (strcmp(o->arg, "required_fields") == 0 ||
1160
0
             strcmp(o->arg, "REQUIRED_FIELDS") == 0)
1161
0
        o->opt = CRAM_OPT_REQUIRED_FIELDS, o->val.i = strtol(val, NULL, 0);
1162
1163
0
    else if (strcmp(o->arg, "lossy_names") == 0 ||
1164
0
             strcmp(o->arg, "LOSSY_NAMES") == 0)
1165
0
        o->opt = CRAM_OPT_LOSSY_NAMES, o->val.i = strtol(val, NULL, 0);
1166
1167
0
    else if (strcmp(o->arg, "name_prefix") == 0 ||
1168
0
             strcmp(o->arg, "NAME_PREFIX") == 0)
1169
0
        o->opt = CRAM_OPT_PREFIX, o->val.s = val;
1170
1171
0
    else if (strcmp(o->arg, "store_md") == 0 ||
1172
0
             strcmp(o->arg, "store_md") == 0)
1173
0
        o->opt = CRAM_OPT_STORE_MD, o->val.i = atoi(val);
1174
1175
0
    else if (strcmp(o->arg, "store_nm") == 0 ||
1176
0
             strcmp(o->arg, "store_nm") == 0)
1177
0
        o->opt = CRAM_OPT_STORE_NM, o->val.i = atoi(val);
1178
1179
0
    else if (strcmp(o->arg, "block_size") == 0 ||
1180
0
             strcmp(o->arg, "BLOCK_SIZE") == 0)
1181
0
        o->opt = HTS_OPT_BLOCK_SIZE, o->val.i = strtol(val, NULL, 0);
1182
1183
0
    else if (strcmp(o->arg, "level") == 0 ||
1184
0
             strcmp(o->arg, "LEVEL") == 0)
1185
0
        o->opt = HTS_OPT_COMPRESSION_LEVEL, o->val.i = strtol(val, NULL, 0);
1186
1187
0
    else if (strcmp(o->arg, "filter") == 0 ||
1188
0
             strcmp(o->arg, "FILTER") == 0)
1189
0
        o->opt = HTS_OPT_FILTER, o->val.s = val;
1190
1191
0
    else if (strcmp(o->arg, "fastq_aux") == 0 ||
1192
0
        strcmp(o->arg, "FASTQ_AUX") == 0)
1193
0
        o->opt = FASTQ_OPT_AUX, o->val.s = val;
1194
1195
0
    else if (strcmp(o->arg, "fastq_barcode") == 0 ||
1196
0
        strcmp(o->arg, "FASTQ_BARCODE") == 0)
1197
0
        o->opt = FASTQ_OPT_BARCODE, o->val.s = val;
1198
1199
0
    else if (strcmp(o->arg, "fastq_rnum") == 0 ||
1200
0
        strcmp(o->arg, "FASTQ_RNUM") == 0)
1201
0
        o->opt = FASTQ_OPT_RNUM, o->val.i = 1;
1202
1203
0
    else if (strcmp(o->arg, "fastq_casava") == 0 ||
1204
0
        strcmp(o->arg, "FASTQ_CASAVA") == 0)
1205
0
        o->opt = FASTQ_OPT_CASAVA, o->val.i = 1;
1206
1207
0
    else if (strcmp(o->arg, "fastq_name2") == 0 ||
1208
0
        strcmp(o->arg, "FASTQ_NAME2") == 0)
1209
0
        o->opt = FASTQ_OPT_NAME2, o->val.i = 1;
1210
1211
0
    else {
1212
0
        hts_log_error("Unknown option '%s'", o->arg);
1213
0
        free(o->arg);
1214
0
        free(o);
1215
0
        return -1;
1216
0
    }
1217
1218
0
    o->next = NULL;
1219
1220
    // Append; assumes small list.
1221
0
    if (*opts) {
1222
0
        t = *opts;
1223
0
        while (t->next)
1224
0
            t = t->next;
1225
0
        t->next = o;
1226
0
    } else {
1227
0
        *opts = o;
1228
0
    }
1229
1230
0
    return 0;
1231
0
}
1232
1233
/*
1234
 * Applies an hts_opt option list to a given htsFile.
1235
 *
1236
 * Returns 0 on success
1237
 *        -1 on failure
1238
 */
1239
0
int hts_opt_apply(htsFile *fp, hts_opt *opts) {
1240
0
    hts_opt *last = NULL;
1241
1242
0
    for (; opts;  opts = (last=opts)->next) {
1243
0
        switch (opts->opt) {
1244
0
            case CRAM_OPT_REFERENCE:
1245
0
                if (!(fp->fn_aux = strdup(opts->val.s)))
1246
0
                    return -1;
1247
                // fall through
1248
0
            case CRAM_OPT_VERSION:
1249
0
            case CRAM_OPT_PREFIX:
1250
0
            case HTS_OPT_FILTER:
1251
0
            case FASTQ_OPT_AUX:
1252
0
            case FASTQ_OPT_BARCODE:
1253
0
                if (hts_set_opt(fp,  opts->opt,  opts->val.s) != 0)
1254
0
                    return -1;
1255
0
                break;
1256
0
            default:
1257
0
                if (hts_set_opt(fp,  opts->opt,  opts->val.i) != 0)
1258
0
                    return -1;
1259
0
                break;
1260
0
        }
1261
0
    }
1262
1263
0
    return 0;
1264
0
}
1265
1266
/*
1267
 * Frees an hts_opt list.
1268
 */
1269
0
void hts_opt_free(hts_opt *opts) {
1270
0
    hts_opt *last = NULL;
1271
0
    while (opts) {
1272
0
        opts = (last=opts)->next;
1273
0
        free(last->arg);
1274
0
        free(last);
1275
0
    }
1276
0
}
1277
1278
1279
/*
1280
 * Tokenise options as (key(=value)?,)*(key(=value)?)?
1281
 * NB: No provision for ',' appearing in the value!
1282
 * Add backslashing rules?
1283
 *
1284
 * This could be used as part of a general command line option parser or
1285
 * as a string concatenated onto the file open mode.
1286
 *
1287
 * Returns 0 on success
1288
 *        -1 on failure.
1289
 */
1290
0
int hts_parse_opt_list(htsFormat *fmt, const char *str) {
1291
0
    while (str && *str) {
1292
0
        const char *str_start;
1293
0
        int len;
1294
0
        char arg[8001];
1295
1296
0
        while (*str && *str == ',')
1297
0
            str++;
1298
1299
0
        for (str_start = str; *str && *str != ','; str++);
1300
0
        len = str - str_start;
1301
1302
        // Produce a nul terminated copy of the option
1303
0
        strncpy(arg, str_start, len < 8000 ? len : 8000);
1304
0
        arg[len < 8000 ? len : 8000] = '\0';
1305
1306
0
        if (hts_opt_add((hts_opt **)&fmt->specific, arg) != 0)
1307
0
            return -1;
1308
1309
0
        if (*str)
1310
0
            str++;
1311
0
    }
1312
1313
0
    return 0;
1314
0
}
1315
1316
/*
1317
 * Accepts a string file format (sam, bam, cram, vcf, bam) optionally
1318
 * followed by a comma separated list of key=value options and splits
1319
 * these up into the fields of htsFormat struct.
1320
 *
1321
 * format is assumed to be already initialised, either to blank
1322
 * "unknown" values or via previous hts_opt_add calls.
1323
 *
1324
 * Returns 0 on success
1325
 *        -1 on failure.
1326
 */
1327
0
int hts_parse_format(htsFormat *format, const char *str) {
1328
0
    char fmt[9];
1329
0
    const char *cp = scan_keyword(str, ',', fmt, sizeof fmt);
1330
1331
0
    format->version.minor = 0; // unknown
1332
0
    format->version.major = 0; // unknown
1333
1334
0
    if (strcmp(fmt, "sam") == 0) {
1335
0
        format->category          = sequence_data;
1336
0
        format->format            = sam;
1337
0
        format->compression       = no_compression;
1338
0
        format->compression_level = 0;
1339
0
    } else if (strcmp(fmt, "sam.gz") == 0) {
1340
0
        format->category          = sequence_data;
1341
0
        format->format            = sam;
1342
0
        format->compression       = bgzf;
1343
0
        format->compression_level = -1;
1344
0
    } else if (strcmp(fmt, "bam") == 0) {
1345
0
        format->category          = sequence_data;
1346
0
        format->format            = bam;
1347
0
        format->compression       = bgzf;
1348
0
        format->compression_level = -1;
1349
0
    } else if (strcmp(fmt, "cram") == 0) {
1350
0
        format->category          = sequence_data;
1351
0
        format->format            = cram;
1352
0
        format->compression       = custom;
1353
0
        format->compression_level = -1;
1354
0
    } else if (strcmp(fmt, "vcf") == 0) {
1355
0
        format->category          = variant_data;
1356
0
        format->format            = vcf;
1357
0
        format->compression       = no_compression;
1358
0
        format->compression_level = 0;
1359
0
    } else if (strcmp(fmt, "bcf") == 0) {
1360
0
        format->category          = variant_data;
1361
0
        format->format            = bcf;
1362
0
        format->compression       = bgzf;
1363
0
        format->compression_level = -1;
1364
0
    } else if (strcmp(fmt, "fastq") == 0 || strcmp(fmt, "fq") == 0) {
1365
0
        format->category          = sequence_data;
1366
0
        format->format            = fastq_format;
1367
0
        format->compression       = no_compression;
1368
0
        format->compression_level = 0;
1369
0
    } else if (strcmp(fmt, "fastq.gz") == 0 || strcmp(fmt, "fq.gz") == 0) {
1370
0
        format->category          = sequence_data;
1371
0
        format->format            = fastq_format;
1372
0
        format->compression       = bgzf;
1373
0
        format->compression_level = 0;
1374
0
    } else if (strcmp(fmt, "fasta") == 0 || strcmp(fmt, "fa") == 0) {
1375
0
        format->category          = sequence_data;
1376
0
        format->format            = fasta_format;
1377
0
        format->compression       = no_compression;
1378
0
        format->compression_level = 0;
1379
0
    } else if (strcmp(fmt, "fasta.gz") == 0 || strcmp(fmt, "fa.gz") == 0) {
1380
0
        format->category          = sequence_data;
1381
0
        format->format            = fasta_format;
1382
0
        format->compression       = bgzf;
1383
0
        format->compression_level = 0;
1384
0
    } else {
1385
0
        return -1;
1386
0
    }
1387
1388
0
    return hts_parse_opt_list(format, cp);
1389
0
}
1390
1391
1392
/*
1393
 * Tokenise options as (key(=value)?,)*(key(=value)?)?
1394
 * NB: No provision for ',' appearing in the value!
1395
 * Add backslashing rules?
1396
 *
1397
 * This could be used as part of a general command line option parser or
1398
 * as a string concatenated onto the file open mode.
1399
 *
1400
 * Returns 0 on success
1401
 *        -1 on failure.
1402
 */
1403
0
static int hts_process_opts(htsFile *fp, const char *opts) {
1404
0
    htsFormat fmt;
1405
1406
0
    fmt.specific = NULL;
1407
0
    if (hts_parse_opt_list(&fmt, opts) != 0)
1408
0
        return -1;
1409
1410
0
    if (hts_opt_apply(fp, fmt.specific) != 0) {
1411
0
        hts_opt_free(fmt.specific);
1412
0
        return -1;
1413
0
    }
1414
1415
0
    hts_opt_free(fmt.specific);
1416
1417
0
    return 0;
1418
0
}
1419
1420
static int hts_crypt4gh_redirect(const char *fn, const char *mode,
1421
1
                                 hFILE **hfile_ptr, htsFile *fp) {
1422
1
    hFILE *hfile1 = *hfile_ptr;
1423
1
    hFILE *hfile2 = NULL;
1424
1
    char fn_buf[512], *fn2 = fn_buf;
1425
1
    char mode2[102]; // Size set by sizeof(simple_mode) in hts_hopen()
1426
1
    const char *prefix = "crypt4gh:";
1427
1
    size_t fn2_len = strlen(prefix) + strlen(fn) + 1;
1428
1
    int ret = -1;
1429
1430
1
    if (fn2_len > sizeof(fn_buf)) {
1431
0
        if (fn2_len >= INT_MAX) // Silence gcc format-truncation warning
1432
0
            return -1;
1433
0
        fn2 = malloc(fn2_len);
1434
0
        if (!fn2) return -1;
1435
0
    }
1436
1437
    // Reopen fn using the crypt4gh plug-in (if available)
1438
1
    snprintf(fn2, fn2_len, "%s%s", prefix, fn);
1439
1
    snprintf(mode2, sizeof(mode2), "%s%s", mode, strchr(mode, ':') ? "" : ":");
1440
1
    hfile2 = hopen(fn2, mode2, "parent", hfile1, NULL);
1441
1
    if (hfile2) {
1442
        // Replace original hfile with the new one.  The original is now
1443
        // enclosed within hfile2
1444
0
        *hfile_ptr = hfile2;
1445
0
        ret = 0;
1446
0
    }
1447
1448
1
    if (fn2 != fn_buf)
1449
0
        free(fn2);
1450
1
    return ret;
1451
1
}
1452
1453
htsFile *hts_hopen(hFILE *hfile, const char *fn, const char *mode)
1454
9.39k
{
1455
9.39k
    hFILE *hfile_orig = hfile;
1456
9.39k
    hFILE *hfile_cleanup = hfile;
1457
9.39k
    htsFile *fp = (htsFile*)calloc(1, sizeof(htsFile));
1458
9.39k
    char simple_mode[101], *cp, *opts;
1459
9.39k
    simple_mode[100] = '\0';
1460
1461
9.39k
    if (fp == NULL) goto error;
1462
1463
9.39k
    fp->fn = strdup(fn);
1464
9.39k
    fp->is_be = ed_is_big();
1465
1466
    // Split mode into simple_mode,opts strings
1467
9.39k
    if ((cp = strchr(mode, ','))) {
1468
0
        strncpy(simple_mode, mode, cp-mode <= 100 ? cp-mode : 100);
1469
0
        simple_mode[cp-mode] = '\0';
1470
0
        opts = cp+1;
1471
9.39k
    } else {
1472
9.39k
        strncpy(simple_mode, mode, 100);
1473
9.39k
        opts = NULL;
1474
9.39k
    }
1475
1476
9.39k
    if (strchr(simple_mode, 'r')) {
1477
5.44k
        const int max_loops = 5; // Should be plenty
1478
5.44k
        int loops = 0;
1479
5.44k
        if (hts_detect_format2(hfile, fn, &fp->format) < 0) goto error;
1480
1481
        // Deal with formats that re-direct an underlying file via a plug-in.
1482
        // Loops as we may have crypt4gh served via htsget, or
1483
        // crypt4gh-in-crypt4gh.
1484
1485
5.44k
        while (fp->format.format == htsget ||
1486
5.44k
               fp->format.format == hts_crypt4gh_format) {
1487
            // Ensure we don't get stuck in an endless redirect loop
1488
1
            if (++loops > max_loops) {
1489
0
                errno = ELOOP;
1490
0
                goto error;
1491
0
            }
1492
1493
1
            if (fp->format.format == htsget) {
1494
0
                hFILE *hfile2 = hopen_htsget_redirect(hfile, simple_mode);
1495
0
                if (hfile2 == NULL) goto error;
1496
1497
0
                if (hfile != hfile_cleanup) {
1498
                    // Close the result of an earlier redirection
1499
0
                    hclose_abruptly(hfile);
1500
0
                }
1501
1502
0
                hfile = hfile2;
1503
0
            }
1504
1
            else if (fp->format.format == hts_crypt4gh_format) {
1505
1
                int should_preserve = (hfile == hfile_orig);
1506
1
                int update_cleanup = (hfile == hfile_cleanup);
1507
1
                if (hts_crypt4gh_redirect(fn, simple_mode, &hfile, fp) < 0)
1508
1
                    goto error;
1509
0
                if (should_preserve) {
1510
                    // The original hFILE is now contained in a crypt4gh
1511
                    // wrapper.  Should we need to close the wrapper due
1512
                    // to a later error, we need to prevent the wrapped
1513
                    // handle from being closed as the caller will see
1514
                    // this function return NULL and try to clean up itself.
1515
0
                    hfile_orig->preserve = 1;
1516
0
                }
1517
0
                if (update_cleanup) {
1518
                    // Update handle to close at the end if redirected by htsget
1519
0
                    hfile_cleanup = hfile;
1520
0
                }
1521
0
            }
1522
1523
            // Re-detect format against the result of the redirection
1524
0
            if (hts_detect_format2(hfile, fn, &fp->format) < 0) goto error;
1525
0
        }
1526
5.44k
    }
1527
3.94k
    else if (strchr(simple_mode, 'w') || strchr(simple_mode, 'a')) {
1528
3.94k
        htsFormat *fmt = &fp->format;
1529
3.94k
        fp->is_write = 1;
1530
1531
3.94k
        if (strchr(simple_mode, 'b')) fmt->format = binary_format;
1532
2.46k
        else if (strchr(simple_mode, 'c')) fmt->format = cram;
1533
1.48k
        else if (strchr(simple_mode, 'f')) fmt->format = fastq_format;
1534
1.48k
        else if (strchr(simple_mode, 'F')) fmt->format = fasta_format;
1535
1.48k
        else fmt->format = text_format;
1536
1537
3.94k
        if (strchr(simple_mode, 'z')) fmt->compression = bgzf;
1538
3.94k
        else if (strchr(simple_mode, 'g')) fmt->compression = gzip;
1539
3.94k
        else if (strchr(simple_mode, 'u')) fmt->compression = no_compression;
1540
3.94k
        else {
1541
            // No compression mode specified, set to the default for the format
1542
3.94k
            switch (fmt->format) {
1543
1.48k
            case binary_format: fmt->compression = bgzf; break;
1544
979
            case cram: fmt->compression = custom; break;
1545
0
            case fastq_format: fmt->compression = no_compression; break;
1546
0
            case fasta_format: fmt->compression = no_compression; break;
1547
1.48k
            case text_format: fmt->compression = no_compression; break;
1548
0
            default: abort();
1549
3.94k
            }
1550
3.94k
        }
1551
1552
        // Fill in category (if determinable; e.g. 'b' could be BAM or BCF)
1553
3.94k
        fmt->category = format_category(fmt->format);
1554
1555
3.94k
        fmt->version.major = fmt->version.minor = -1;
1556
3.94k
        fmt->compression_level = -1;
1557
3.94k
        fmt->specific = NULL;
1558
3.94k
    }
1559
0
    else { errno = EINVAL; goto error; }
1560
1561
9.39k
    switch (fp->format.format) {
1562
1.48k
    case binary_format:
1563
1.55k
    case bam:
1564
1.69k
    case bcf:
1565
1.69k
        fp->fp.bgzf = bgzf_hopen(hfile, simple_mode);
1566
1.69k
        if (fp->fp.bgzf == NULL) goto error;
1567
1.69k
        fp->is_bin = fp->is_bgzf = 1;
1568
1.69k
        break;
1569
1570
1.61k
    case cram:
1571
1.61k
        fp->fp.cram = cram_dopen(hfile, fn, simple_mode);
1572
1.61k
        if (fp->fp.cram == NULL) goto error;
1573
1.59k
        if (!fp->is_write)
1574
616
            cram_set_option(fp->fp.cram, CRAM_OPT_DECODE_MD, -1); // auto
1575
1.59k
        fp->is_cram = 1;
1576
1.59k
        break;
1577
1578
1
    case empty_format:
1579
1.48k
    case text_format:
1580
1.48k
    case bed:
1581
2.40k
    case fasta_format:
1582
2.43k
    case fastq_format:
1583
4.70k
    case sam:
1584
6.07k
    case vcf:
1585
6.07k
        if (fp->format.compression != no_compression) {
1586
1.10k
            fp->fp.bgzf = bgzf_hopen(hfile, simple_mode);
1587
1.10k
            if (fp->fp.bgzf == NULL) goto error;
1588
1.10k
            fp->is_bgzf = 1;
1589
1.10k
        }
1590
4.97k
        else
1591
4.97k
            fp->fp.hfile = hfile;
1592
6.07k
        break;
1593
1594
6.07k
    default:
1595
1
        errno = EFTYPE;
1596
1
        goto error;
1597
9.39k
    }
1598
1599
9.36k
    if (opts)
1600
0
        hts_process_opts(fp, opts);
1601
1602
    // Allow original file to close if it was preserved earlier by crypt4gh
1603
9.36k
    hfile_orig->preserve = 0;
1604
1605
    // If redirecting via htsget, close the original hFILE now (pedantically
1606
    // we would instead close it in hts_close(), but this a simplifying
1607
    // optimisation)
1608
9.36k
    if (hfile != hfile_cleanup) hclose_abruptly(hfile_cleanup);
1609
1610
9.36k
    return fp;
1611
1612
23
error:
1613
23
    hts_log_error("Failed to open file %s", fn);
1614
1615
    // If redirecting, close the failed redirection hFILE that we have opened
1616
23
    if (hfile != hfile_orig) hclose_abruptly(hfile);
1617
23
    hfile_orig->preserve = 0; // Allow caller to close the original hfile
1618
1619
23
    if (fp) {
1620
23
        free(fp->fn);
1621
23
        free(fp->fn_aux);
1622
23
        free(fp);
1623
23
    }
1624
23
    return NULL;
1625
9.39k
}
1626
1627
static int hts_idx_close_otf_fp(hts_idx_t *idx);
1628
1629
int hts_close(htsFile *fp)
1630
9.36k
{
1631
9.36k
    int ret = 0, save;
1632
9.36k
    if (!fp) {
1633
0
        errno = EINVAL;
1634
0
        return -1;
1635
0
    }
1636
1637
9.36k
    switch (fp->format.format) {
1638
91
    case binary_format:
1639
1.12k
    case bam:
1640
1.69k
    case bcf:
1641
1.69k
        ret = bgzf_close(fp->fp.bgzf);
1642
1.69k
        break;
1643
1644
1.59k
    case cram:
1645
1.59k
        if (!fp->is_write) {
1646
616
            switch (cram_eof(fp->fp.cram)) {
1647
0
            case 2:
1648
0
                hts_log_warning("EOF marker is absent. The input is probably truncated");
1649
0
                break;
1650
439
            case 0:  /* not at EOF, but may not have wanted all seqs */
1651
616
            default: /* case 1, expected EOF */
1652
616
                break;
1653
616
            }
1654
616
        }
1655
1.59k
        ret = cram_close(fp->fp.cram);
1656
1.59k
        break;
1657
1658
0
    case empty_format:
1659
91
    case text_format:
1660
91
    case bed:
1661
1.01k
    case fasta_format:
1662
1.04k
    case fastq_format:
1663
4.27k
    case sam:
1664
6.07k
    case vcf:
1665
6.07k
        if (fp->format.format == sam)
1666
3.22k
            ret = sam_state_destroy(fp);
1667
2.84k
        else if (fp->format.format == fastq_format ||
1668
2.84k
                 fp->format.format == fasta_format)
1669
956
            fastq_state_destroy(fp);
1670
1671
6.07k
        if (fp->format.compression != no_compression)
1672
1.10k
            ret |= bgzf_close(fp->fp.bgzf);
1673
4.97k
        else
1674
4.97k
            ret |= hclose(fp->fp.hfile);
1675
6.07k
        break;
1676
1677
0
    default:
1678
0
        ret = -1;
1679
0
        break;
1680
9.36k
    }
1681
1682
9.36k
    if (fp->idx) {
1683
        // Close deferred index file handle, if present.
1684
        // Unfortunately this means errors on the index will get mixed with
1685
        // those on the main file, but as we only have the EOF block left to
1686
        // write it hopefully won't happen that often.
1687
0
        ret |= hts_idx_close_otf_fp(fp->idx);
1688
0
    }
1689
1690
9.36k
    save = errno;
1691
9.36k
    sam_hdr_destroy(fp->bam_header);
1692
9.36k
    hts_idx_destroy(fp->idx);
1693
9.36k
    hts_filter_free(fp->filter);
1694
9.36k
    free(fp->fn);
1695
9.36k
    free(fp->fn_aux);
1696
9.36k
    free(fp->line.s);
1697
9.36k
    free(fp);
1698
9.36k
    errno = save;
1699
9.36k
    return ret;
1700
9.36k
}
1701
1702
int hts_flush(htsFile *fp)
1703
0
{
1704
0
    if (fp == NULL) return 0;
1705
1706
0
    switch (fp->format.format) {
1707
0
    case binary_format:
1708
0
    case bam:
1709
0
    case bcf:
1710
0
        return bgzf_flush(fp->fp.bgzf);
1711
1712
0
    case cram:
1713
0
        return cram_flush(fp->fp.cram);
1714
1715
0
    case empty_format:
1716
0
    case text_format:
1717
0
    case bed:
1718
0
    case fasta_format:
1719
0
    case fastq_format:
1720
0
    case sam:
1721
0
    case vcf:
1722
0
        if (fp->format.compression != no_compression)
1723
0
            return bgzf_flush(fp->fp.bgzf);
1724
0
        else
1725
0
            return hflush(fp->fp.hfile);
1726
1727
0
    default:
1728
0
        break;
1729
0
    }
1730
1731
0
    return 0;
1732
0
}
1733
1734
const htsFormat *hts_get_format(htsFile *fp)
1735
0
{
1736
0
    return fp? &fp->format : NULL;
1737
0
}
1738
1739
0
const char *hts_format_file_extension(const htsFormat *format) {
1740
0
    if (!format)
1741
0
        return "?";
1742
1743
0
    switch (format->format) {
1744
0
    case sam:  return "sam";
1745
0
    case bam:  return "bam";
1746
0
    case bai:  return "bai";
1747
0
    case cram: return "cram";
1748
0
    case crai: return "crai";
1749
0
    case vcf:  return "vcf";
1750
0
    case bcf:  return "bcf";
1751
0
    case csi:  return "csi";
1752
0
    case fai_format:   return "fai";
1753
0
    case fqi_format:   return "fqi";
1754
0
    case gzi:  return "gzi";
1755
0
    case tbi:  return "tbi";
1756
0
    case bed:  return "bed";
1757
0
    case d4_format:    return "d4";
1758
0
    case fasta_format: return "fa";
1759
0
    case fastq_format: return "fq";
1760
0
    default:   return "?";
1761
0
    }
1762
0
}
1763
1764
0
static hFILE *hts_hfile(htsFile *fp) {
1765
0
    switch (fp->format.format) {
1766
0
    case binary_format:// fall through
1767
0
    case bcf:          // fall through
1768
0
    case bam:          return bgzf_hfile(fp->fp.bgzf);
1769
0
    case cram:         return cram_hfile(fp->fp.cram);
1770
0
    case text_format:  // fall through
1771
0
    case vcf:          // fall through
1772
0
    case fastq_format: // fall through
1773
0
    case fasta_format: // fall through
1774
0
    case sam:          return fp->format.compression != no_compression
1775
0
                              ? bgzf_hfile(fp->fp.bgzf)
1776
0
                              : fp->fp.hfile;
1777
0
    default:           return NULL;
1778
0
    }
1779
0
}
1780
1781
0
int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) {
1782
0
    int r;
1783
0
    va_list args;
1784
1785
0
    switch (opt) {
1786
0
    case HTS_OPT_NTHREADS: {
1787
0
        va_start(args, opt);
1788
0
        int nthreads = va_arg(args, int);
1789
0
        va_end(args);
1790
0
        return hts_set_threads(fp, nthreads);
1791
0
    }
1792
1793
0
    case HTS_OPT_BLOCK_SIZE: {
1794
0
        hFILE *hf = hts_hfile(fp);
1795
1796
0
        if (hf) {
1797
0
            va_start(args, opt);
1798
0
            if (hfile_set_blksize(hf, va_arg(args, int)) != 0)
1799
0
                hts_log_warning("Failed to change block size");
1800
0
            va_end(args);
1801
0
        }
1802
0
        else {
1803
            // To do - implement for vcf/bcf.
1804
0
            hts_log_warning("Cannot change block size for this format");
1805
0
        }
1806
1807
0
        return 0;
1808
0
    }
1809
1810
0
    case HTS_OPT_THREAD_POOL: {
1811
0
        va_start(args, opt);
1812
0
        htsThreadPool *p = va_arg(args, htsThreadPool *);
1813
0
        va_end(args);
1814
0
        return hts_set_thread_pool(fp, p);
1815
0
    }
1816
1817
0
    case HTS_OPT_CACHE_SIZE: {
1818
0
        va_start(args, opt);
1819
0
        int cache_size = va_arg(args, int);
1820
0
        va_end(args);
1821
0
        hts_set_cache_size(fp, cache_size);
1822
0
        return 0;
1823
0
    }
1824
1825
0
    case FASTQ_OPT_CASAVA:
1826
0
    case FASTQ_OPT_RNUM:
1827
0
    case FASTQ_OPT_NAME2:
1828
0
        if (fp->format.format == fastq_format ||
1829
0
            fp->format.format == fasta_format)
1830
0
            return fastq_state_set(fp, opt);
1831
0
        return 0;
1832
1833
0
    case FASTQ_OPT_AUX:
1834
0
        if (fp->format.format == fastq_format ||
1835
0
            fp->format.format == fasta_format) {
1836
0
            va_start(args, opt);
1837
0
            char *list = va_arg(args, char *);
1838
0
            va_end(args);
1839
0
            return fastq_state_set(fp, opt, list);
1840
0
        }
1841
0
        return 0;
1842
1843
0
    case FASTQ_OPT_BARCODE:
1844
0
        if (fp->format.format == fastq_format ||
1845
0
            fp->format.format == fasta_format) {
1846
0
            va_start(args, opt);
1847
0
            char *bc = va_arg(args, char *);
1848
0
            va_end(args);
1849
0
            return fastq_state_set(fp, opt, bc);
1850
0
        }
1851
0
        return 0;
1852
1853
    // Options below here flow through to cram_set_voption
1854
0
    case HTS_OPT_COMPRESSION_LEVEL: {
1855
0
        va_start(args, opt);
1856
0
        int level = va_arg(args, int);
1857
0
        va_end(args);
1858
0
        if (fp->is_bgzf)
1859
0
            fp->fp.bgzf->compress_level = level;
1860
0
        else if (fp->format.format == cram)
1861
0
            return cram_set_option(fp->fp.cram, opt, level);
1862
0
        return 0;
1863
0
    }
1864
1865
0
    case HTS_OPT_FILTER: {
1866
0
        va_start(args, opt);
1867
0
        char *expr = va_arg(args, char *);
1868
0
        va_end(args);
1869
0
        return hts_set_filter_expression(fp, expr);
1870
0
    }
1871
1872
0
    case HTS_OPT_PROFILE: {
1873
0
        va_start(args, opt);
1874
0
        enum hts_profile_option prof = va_arg(args, int);
1875
0
        va_end(args);
1876
0
        if (fp->is_bgzf) {
1877
0
            switch (prof) {
1878
#ifdef HAVE_LIBDEFLATE
1879
            case HTS_PROFILE_FAST:    fp->fp.bgzf->compress_level =  2; break;
1880
            case HTS_PROFILE_NORMAL:  fp->fp.bgzf->compress_level = -1; break;
1881
            case HTS_PROFILE_SMALL:   fp->fp.bgzf->compress_level = 10; break;
1882
            case HTS_PROFILE_ARCHIVE: fp->fp.bgzf->compress_level = 12; break;
1883
#else
1884
0
            case HTS_PROFILE_FAST:    fp->fp.bgzf->compress_level =  1; break;
1885
0
            case HTS_PROFILE_NORMAL:  fp->fp.bgzf->compress_level = -1; break;
1886
0
            case HTS_PROFILE_SMALL:   fp->fp.bgzf->compress_level =  8; break;
1887
0
            case HTS_PROFILE_ARCHIVE: fp->fp.bgzf->compress_level =  9; break;
1888
0
#endif
1889
0
            }
1890
0
        } // else CRAM manages this in its own way
1891
0
        break;
1892
0
    }
1893
1894
0
    default:
1895
0
        break;
1896
0
    }
1897
1898
0
    if (fp->format.format != cram)
1899
0
        return 0;
1900
1901
0
    va_start(args, opt);
1902
0
    r = cram_set_voption(fp->fp.cram, opt, args);
1903
0
    va_end(args);
1904
1905
0
    return r;
1906
0
}
1907
1908
BGZF *hts_get_bgzfp(htsFile *fp);
1909
1910
int hts_set_threads(htsFile *fp, int n)
1911
0
{
1912
0
    if (fp->format.format == sam) {
1913
0
        return sam_set_threads(fp, n);
1914
0
    } else if (fp->format.compression == bgzf) {
1915
0
        return bgzf_mt(hts_get_bgzfp(fp), n, 256/*unused*/);
1916
0
    } else if (fp->format.format == cram) {
1917
0
        return hts_set_opt(fp, CRAM_OPT_NTHREADS, n);
1918
0
    }
1919
0
    else return 0;
1920
0
}
1921
1922
0
int hts_set_thread_pool(htsFile *fp, htsThreadPool *p) {
1923
0
    if (fp->format.format == sam || fp->format.format == text_format) {
1924
0
        return sam_set_thread_pool(fp, p);
1925
0
    } else if (fp->format.compression == bgzf) {
1926
0
        return bgzf_thread_pool(hts_get_bgzfp(fp), p->pool, p->qsize);
1927
0
    } else if (fp->format.format == cram) {
1928
0
        return hts_set_opt(fp, CRAM_OPT_THREAD_POOL, p);
1929
0
    }
1930
0
    else return 0;
1931
0
}
1932
1933
void hts_set_cache_size(htsFile *fp, int n)
1934
0
{
1935
0
    if (fp->format.compression == bgzf)
1936
0
        bgzf_set_cache_size(hts_get_bgzfp(fp), n);
1937
0
}
1938
1939
int hts_set_fai_filename(htsFile *fp, const char *fn_aux)
1940
0
{
1941
0
    free(fp->fn_aux);
1942
0
    if (fn_aux) {
1943
0
        fp->fn_aux = strdup(fn_aux);
1944
0
        if (fp->fn_aux == NULL) return -1;
1945
0
    }
1946
0
    else fp->fn_aux = NULL;
1947
1948
0
    if (fp->format.format == cram)
1949
0
        if (cram_set_option(fp->fp.cram, CRAM_OPT_REFERENCE, fp->fn_aux))
1950
0
            return -1;
1951
1952
0
    return 0;
1953
0
}
1954
1955
int hts_set_filter_expression(htsFile *fp, const char *expr)
1956
0
{
1957
0
    if (fp->filter)
1958
0
        hts_filter_free(fp->filter);
1959
1960
0
    if (!expr)
1961
0
        return 0;
1962
1963
0
    return (fp->filter = hts_filter_init(expr))
1964
0
        ? 0 : -1;
1965
0
}
1966
1967
hFILE *hts_open_tmpfile(const char *fname, const char *mode, kstring_t *tmpname)
1968
0
{
1969
0
    int pid = (int) getpid();
1970
0
    unsigned ptr = (uintptr_t) tmpname;
1971
0
    int n = 0;
1972
0
    hFILE *fp = NULL;
1973
1974
0
    do {
1975
        // Attempt to further uniquify the temporary filename
1976
0
        unsigned t = ((unsigned) time(NULL)) ^ ((unsigned) clock()) ^ ptr;
1977
0
        n++;
1978
1979
0
        ks_clear(tmpname);
1980
0
        if (ksprintf(tmpname, "%s.tmp_%d_%d_%u", fname, pid, n, t) < 0) break;
1981
1982
0
        fp = hopen(tmpname->s, mode);
1983
0
    } while (fp == NULL && errno == EEXIST && n < 100);
1984
1985
0
    return fp;
1986
0
}
1987
1988
int hts_is_utf16_text(const kstring_t *str)
1989
0
{
1990
0
    const unsigned char *u = (const unsigned char *) (str->s);
1991
0
    return (str->l > 0 && str->s)? is_utf16_text(u, u + str->l) : 0;
1992
0
}
1993
1994
// For VCF/BCF backward sweeper. Not exposing these functions because their
1995
// future is uncertain. Things will probably have to change with hFILE...
1996
BGZF *hts_get_bgzfp(htsFile *fp)
1997
0
{
1998
0
    if (fp->is_bgzf)
1999
0
        return fp->fp.bgzf;
2000
0
    else
2001
0
        return NULL;
2002
0
}
2003
int hts_useek(htsFile *fp, off_t uoffset, int where)
2004
0
{
2005
0
    if (fp->is_bgzf)
2006
0
        return bgzf_useek(fp->fp.bgzf, uoffset, where);
2007
0
    else
2008
0
        return (hseek(fp->fp.hfile, uoffset, SEEK_SET) >= 0)? 0 : -1;
2009
0
}
2010
off_t hts_utell(htsFile *fp)
2011
0
{
2012
0
    if (fp->is_bgzf)
2013
0
        return bgzf_utell(fp->fp.bgzf);
2014
0
    else
2015
0
        return htell(fp->fp.hfile);
2016
0
}
2017
2018
int hts_getline(htsFile *fp, int delimiter, kstring_t *str)
2019
36.2M
{
2020
36.2M
    int ret;
2021
36.2M
    if (! (delimiter == KS_SEP_LINE || delimiter == '\n')) {
2022
0
        hts_log_error("Unexpected delimiter %d", delimiter);
2023
0
        abort();
2024
0
    }
2025
2026
36.2M
    switch (fp->format.compression) {
2027
920k
    case no_compression:
2028
920k
        str->l = 0;
2029
920k
        ret = kgetline2(str, (kgets_func2 *) hgetln, fp->fp.hfile);
2030
920k
        if (ret >= 0) ret = (str->l <= INT_MAX)? (int) str->l : INT_MAX;
2031
1.31k
        else if (herrno(fp->fp.hfile)) ret = -2, errno = herrno(fp->fp.hfile);
2032
1.31k
        else ret = -1;
2033
920k
        break;
2034
2035
35.3M
    case gzip:
2036
35.3M
    case bgzf:
2037
35.3M
        ret = bgzf_getline(fp->fp.bgzf, '\n', str);
2038
35.3M
        break;
2039
2040
0
    default:
2041
0
        abort();
2042
36.2M
    }
2043
2044
36.2M
    ++fp->lineno;
2045
36.2M
    return ret;
2046
36.2M
}
2047
2048
char **hts_readlist(const char *string, int is_file, int *_n)
2049
0
{
2050
0
    unsigned int m = 0, n = 0;
2051
0
    char **s = 0, **s_new;
2052
0
    if ( is_file )
2053
0
    {
2054
0
        BGZF *fp = bgzf_open(string, "r");
2055
0
        if ( !fp ) return NULL;
2056
2057
0
        kstring_t str;
2058
0
        int ret;
2059
0
        str.s = 0; str.l = str.m = 0;
2060
0
        while ((ret = bgzf_getline(fp, '\n', &str)) >= 0)
2061
0
        {
2062
0
            if (str.l == 0) continue;
2063
0
            if (n == 0 && hts_is_utf16_text(&str))
2064
0
                hts_log_warning("'%s' appears to be encoded as UTF-16", string);
2065
0
            if (hts_resize(char*, n + 1, &m, &s, 0) < 0)
2066
0
                goto err;
2067
0
            s[n] = strdup(str.s);
2068
0
            if (!s[n])
2069
0
                goto err;
2070
0
            n++;
2071
0
        }
2072
0
        if (ret < -1) // Read error
2073
0
            goto err;
2074
0
        bgzf_close(fp);
2075
0
        free(str.s);
2076
0
    }
2077
0
    else
2078
0
    {
2079
0
        const char *q = string, *p = string;
2080
0
        while ( 1 )
2081
0
        {
2082
0
            if (*p == ',' || *p == 0)
2083
0
            {
2084
0
                if (hts_resize(char*, n + 1, &m, &s, 0) < 0)
2085
0
                    goto err;
2086
0
                s[n] = (char*)calloc(p - q + 1, 1);
2087
0
                if (!s[n])
2088
0
                    goto err;
2089
0
                strncpy(s[n++], q, p - q);
2090
0
                q = p + 1;
2091
0
            }
2092
0
            if ( !*p ) break;
2093
0
            p++;
2094
0
        }
2095
0
    }
2096
    // Try to shrink s to the minimum size needed
2097
0
    s_new = (char**)realloc(s, n * sizeof(char*));
2098
0
    if (!s_new)
2099
0
        goto err;
2100
2101
0
    s = s_new;
2102
0
    assert(n < INT_MAX); // hts_resize() should ensure this
2103
0
    *_n = n;
2104
0
    return s;
2105
2106
0
 err:
2107
0
    for (m = 0; m < n; m++)
2108
0
        free(s[m]);
2109
0
    free(s);
2110
0
    return NULL;
2111
0
}
2112
2113
char **hts_readlines(const char *fn, int *_n)
2114
0
{
2115
0
    unsigned int m = 0, n = 0;
2116
0
    char **s = 0, **s_new;
2117
0
    BGZF *fp = bgzf_open(fn, "r");
2118
0
    if ( fp ) { // read from file
2119
0
        kstring_t str;
2120
0
        int ret;
2121
0
        str.s = 0; str.l = str.m = 0;
2122
0
        while ((ret = bgzf_getline(fp, '\n', &str)) >= 0) {
2123
0
            if (str.l == 0) continue;
2124
0
            if (n == 0 && hts_is_utf16_text(&str))
2125
0
                hts_log_warning("'%s' appears to be encoded as UTF-16", fn);
2126
0
            if (hts_resize(char *, n + 1, &m, &s, 0) < 0)
2127
0
                goto err;
2128
0
            s[n] = strdup(str.s);
2129
0
            if (!s[n])
2130
0
                goto err;
2131
0
            n++;
2132
0
        }
2133
0
        if (ret < -1) // Read error
2134
0
            goto err;
2135
0
        bgzf_close(fp);
2136
0
        free(str.s);
2137
0
    } else if (*fn == ':') { // read from string
2138
0
        const char *q, *p;
2139
0
        for (q = p = fn + 1;; ++p)
2140
0
            if (*p == ',' || *p == 0) {
2141
0
                if (hts_resize(char *, n + 1, &m, &s, 0) < 0)
2142
0
                    goto err;
2143
0
                s[n] = (char*)calloc(p - q + 1, 1);
2144
0
                if (!s[n])
2145
0
                    goto err;
2146
0
                strncpy(s[n++], q, p - q);
2147
0
                q = p + 1;
2148
0
                if (*p == 0) break;
2149
0
            }
2150
0
    } else return 0;
2151
    // Try to shrink s to the minimum size needed
2152
0
    s_new = (char**)realloc(s, n * sizeof(char*));
2153
0
    if (!s_new)
2154
0
        goto err;
2155
2156
0
    s = s_new;
2157
0
    assert(n < INT_MAX); // hts_resize() should ensure this
2158
0
    *_n = n;
2159
0
    return s;
2160
2161
0
 err:
2162
0
    for (m = 0; m < n; m++)
2163
0
        free(s[m]);
2164
0
    free(s);
2165
0
    return NULL;
2166
0
}
2167
2168
// DEPRECATED: To be removed in a future HTSlib release
2169
int hts_file_type(const char *fname)
2170
0
{
2171
0
    int len = strlen(fname);
2172
0
    if ( !strcasecmp(".vcf.gz",fname+len-7) ) return FT_VCF_GZ;
2173
0
    if ( !strcasecmp(".vcf",fname+len-4) ) return FT_VCF;
2174
0
    if ( !strcasecmp(".bcf",fname+len-4) ) return FT_BCF_GZ;
2175
0
    if ( !strcmp("-",fname) ) return FT_STDIN;
2176
2177
0
    hFILE *f = hopen(fname, "r");
2178
0
    if (f == NULL) return 0;
2179
2180
0
    htsFormat fmt;
2181
0
    if (hts_detect_format2(f, fname, &fmt) < 0) { hclose_abruptly(f); return 0; }
2182
0
    if (hclose(f) < 0) return 0;
2183
2184
0
    switch (fmt.format) {
2185
0
    case vcf: return (fmt.compression == no_compression)? FT_VCF : FT_VCF_GZ;
2186
0
    case bcf: return (fmt.compression == no_compression)? FT_BCF : FT_BCF_GZ;
2187
0
    default:  return 0;
2188
0
    }
2189
0
}
2190
2191
int hts_check_EOF(htsFile *fp)
2192
0
{
2193
0
    if (fp->format.compression == bgzf)
2194
0
        return bgzf_check_EOF(hts_get_bgzfp(fp));
2195
0
    else if (fp->format.format == cram)
2196
0
        return cram_check_EOF(fp->fp.cram);
2197
0
    else
2198
0
        return 3;
2199
0
}
2200
2201
2202
/****************
2203
 *** Indexing ***
2204
 ****************/
2205
2206
0
#define HTS_MIN_MARKER_DIST 0x10000
2207
2208
// Finds the special meta bin
2209
//  ((1<<(3 * n_lvls + 3)) - 1) / 7 + 1
2210
0
#define META_BIN(idx) ((idx)->n_bins + 1)
2211
2212
0
#define pair64_lt(a,b) ((a).u < (b).u)
2213
0
#define pair64max_lt(a,b) ((a).u < (b).u || \
2214
0
                           ((a).u == (b).u && (a).max < (b).max))
2215
2216
0
KSORT_INIT_STATIC(_off, hts_pair64_t, pair64_lt)
2217
0
KSORT_INIT_STATIC(_off_max, hts_pair64_max_t, pair64max_lt)
2218
2219
typedef struct {
2220
    int32_t m, n;
2221
    uint64_t loff;
2222
    hts_pair64_t *list;
2223
} bins_t;
2224
2225
KHASH_MAP_INIT_INT(bin, bins_t)
2226
typedef khash_t(bin) bidx_t;
2227
2228
typedef struct {
2229
    hts_pos_t n, m;
2230
    uint64_t *offset;
2231
} lidx_t;
2232
2233
struct hts_idx_t {
2234
    int fmt, min_shift, n_lvls, n_bins;
2235
    uint32_t l_meta;
2236
    int32_t n, m;
2237
    uint64_t n_no_coor;
2238
    bidx_t **bidx;
2239
    lidx_t *lidx;
2240
    uint8_t *meta; // MUST have a terminating NUL on the end
2241
    int tbi_n, last_tbi_tid;
2242
    struct {
2243
        uint32_t last_bin, save_bin;
2244
        hts_pos_t last_coor;
2245
        int last_tid, save_tid, finished;
2246
        uint64_t last_off, save_off;
2247
        uint64_t off_beg, off_end;
2248
        uint64_t n_mapped, n_unmapped;
2249
    } z; // keep internal states
2250
    BGZF *otf_fp;  // Index on-the-fly output file
2251
};
2252
2253
0
static char * idx_format_name(int fmt) {
2254
0
    switch (fmt) {
2255
0
        case HTS_FMT_CSI: return "csi";
2256
0
        case HTS_FMT_BAI: return "bai";
2257
0
        case HTS_FMT_TBI: return "tbi";
2258
0
        case HTS_FMT_CRAI: return "crai";
2259
0
        default: return "unknown";
2260
0
    }
2261
0
}
2262
2263
#ifdef DEBUG_INDEX
2264
static void idx_dump(const hts_idx_t *idx) {
2265
    int i;
2266
    int64_t j;
2267
2268
    if (!idx) fprintf(stderr, "Null index\n");
2269
2270
    fprintf(stderr, "format='%s', min_shift=%d, n_lvls=%d, n_bins=%d, l_meta=%u ",
2271
            idx_format_name(idx->fmt), idx->min_shift, idx->n_lvls, idx->n_bins, idx->l_meta);
2272
    fprintf(stderr, "n=%d, m=%d, n_no_coor=%"PRIu64"\n", idx->n, idx->m, idx->n_no_coor);
2273
    for (i = 0; i < idx->n; i++) {
2274
        bidx_t *bidx = idx->bidx[i];
2275
        lidx_t *lidx = &idx->lidx[i];
2276
        if (bidx) {
2277
            fprintf(stderr, "======== BIN Index - tid=%d, n_buckets=%d, size=%d\n", i, bidx->n_buckets, bidx->size);
2278
            int b;
2279
            for (b = 0; b < META_BIN(idx); b++) {
2280
                khint_t k;
2281
                if ((k = kh_get(bin, bidx, b)) != kh_end(bidx)) {
2282
                    bins_t *entries = &kh_value(bidx, k);
2283
                    int l = hts_bin_level(b);
2284
                    int64_t bin_width = 1LL << ((idx->n_lvls - l) * 3 + idx->min_shift);
2285
                    fprintf(stderr, "\tbin=%d, level=%d, parent=%d, n_chunks=%d, loff=%"PRIu64", interval=[%"PRId64" - %"PRId64"]\n",
2286
                        b, l, hts_bin_parent(b), entries->n, entries->loff, (b-hts_bin_first(l))*bin_width+1, (b+1-hts_bin_first(l))*bin_width);
2287
                    for (j = 0; j < entries->n; j++)
2288
                        fprintf(stderr, "\t\tchunk=%"PRId64", u=%"PRIu64", v=%"PRIu64"\n", j, entries->list[j].u, entries->list[j].v);
2289
                }
2290
            }
2291
        }
2292
        if (lidx) {
2293
            fprintf(stderr, "======== LINEAR Index - tid=%d, n_values=%"PRId64"\n", i, lidx->n);
2294
            for (j = 0; j < lidx->n; j++) {
2295
                fprintf(stderr, "\t\tentry=%"PRId64", offset=%"PRIu64", interval=[%"PRId64" - %"PRId64"]\n",
2296
                    j, lidx->offset[j], j*(1<<idx->min_shift)+1, (j+1)*(1<<idx->min_shift));
2297
            }
2298
        }
2299
    }
2300
}
2301
#endif
2302
2303
static inline int insert_to_b(bidx_t *b, int bin, uint64_t beg, uint64_t end)
2304
0
{
2305
0
    khint_t k;
2306
0
    bins_t *l;
2307
0
    int absent;
2308
0
    k = kh_put(bin, b, bin, &absent);
2309
0
    if (absent < 0) return -1; // Out of memory
2310
0
    l = &kh_value(b, k);
2311
0
    if (absent) {
2312
0
        l->m = 1; l->n = 0;
2313
0
        l->list = (hts_pair64_t*)calloc(l->m, sizeof(hts_pair64_t));
2314
0
        if (!l->list) {
2315
0
            kh_del(bin, b, k);
2316
0
            return -1;
2317
0
        }
2318
0
    } else if (l->n == l->m) {
2319
0
        uint32_t new_m = l->m ? l->m << 1 : 1;
2320
0
        hts_pair64_t *new_list = realloc(l->list, new_m * sizeof(hts_pair64_t));
2321
0
        if (!new_list) return -1;
2322
0
        l->list = new_list;
2323
0
        l->m = new_m;
2324
0
    }
2325
0
    l->list[l->n].u = beg;
2326
0
    l->list[l->n++].v = end;
2327
0
    return 0;
2328
0
}
2329
2330
static inline int insert_to_l(lidx_t *l, int64_t _beg, int64_t _end, uint64_t offset, int min_shift)
2331
0
{
2332
0
    int i;
2333
0
    hts_pos_t beg, end;
2334
0
    beg = _beg >> min_shift;
2335
0
    end = (_end - 1) >> min_shift;
2336
0
    if (l->m < end + 1) {
2337
0
        size_t new_m = l->m * 2 > end + 1 ? l->m * 2 : end + 1;
2338
0
        uint64_t *new_offset;
2339
2340
0
        new_offset = (uint64_t*)realloc(l->offset, new_m * sizeof(uint64_t));
2341
0
        if (!new_offset) return -1;
2342
2343
        // fill unused memory with (uint64_t)-1
2344
0
        memset(new_offset + l->m, 0xff, sizeof(uint64_t) * (new_m - l->m));
2345
0
        l->m = new_m;
2346
0
        l->offset = new_offset;
2347
0
    }
2348
0
    for (i = beg; i <= end; ++i) {
2349
0
        if (l->offset[i] == (uint64_t)-1) l->offset[i] = offset;
2350
0
    }
2351
0
    if (l->n < end + 1) l->n = end + 1;
2352
0
    return 0;
2353
0
}
2354
2355
hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls)
2356
0
{
2357
0
    hts_idx_t *idx;
2358
0
    idx = (hts_idx_t*)calloc(1, sizeof(hts_idx_t));
2359
0
    if (idx == NULL) return NULL;
2360
0
    idx->fmt = fmt;
2361
0
    idx->min_shift = min_shift;
2362
0
    idx->n_lvls = n_lvls;
2363
0
    idx->n_bins = ((1<<(3 * n_lvls + 3)) - 1) / 7;
2364
0
    idx->z.save_tid = idx->z.last_tid = -1;
2365
0
    idx->z.save_bin = idx->z.last_bin = 0xffffffffu;
2366
0
    idx->z.save_off = idx->z.last_off = idx->z.off_beg = idx->z.off_end = offset0;
2367
0
    idx->z.last_coor = 0xffffffffu;
2368
0
    if (n) {
2369
0
        idx->n = idx->m = n;
2370
0
        idx->bidx = (bidx_t**)calloc(n, sizeof(bidx_t*));
2371
0
        if (idx->bidx == NULL) { free(idx); return NULL; }
2372
0
        idx->lidx = (lidx_t*) calloc(n, sizeof(lidx_t));
2373
0
        if (idx->lidx == NULL) { free(idx->bidx); free(idx); return NULL; }
2374
0
    }
2375
0
    idx->tbi_n = -1;
2376
0
    idx->last_tbi_tid = -1;
2377
0
    idx->otf_fp = NULL;
2378
0
    return idx;
2379
0
}
2380
2381
static void update_loff(hts_idx_t *idx, int i, int free_lidx)
2382
0
{
2383
0
    bidx_t *bidx = idx->bidx[i];
2384
0
    lidx_t *lidx = &idx->lidx[i];
2385
0
    khint_t k;
2386
0
    int l;
2387
    // the last entry is always valid
2388
0
    for (l=lidx->n-2; l >= 0; l--) {
2389
0
        if (lidx->offset[l] == (uint64_t)-1)
2390
0
            lidx->offset[l] = lidx->offset[l+1];
2391
0
    }
2392
0
    if (bidx == 0) return;
2393
0
    for (k = kh_begin(bidx); k != kh_end(bidx); ++k) // set loff
2394
0
        if (kh_exist(bidx, k))
2395
0
        {
2396
0
            if ( kh_key(bidx, k) < idx->n_bins )
2397
0
            {
2398
0
                int bot_bin = hts_bin_bot(kh_key(bidx, k), idx->n_lvls);
2399
                // disable linear index if bot_bin out of bounds
2400
0
                kh_val(bidx, k).loff = bot_bin < lidx->n ? lidx->offset[bot_bin] : 0;
2401
0
            }
2402
0
            else
2403
0
                kh_val(bidx, k).loff = 0;
2404
0
        }
2405
0
    if (free_lidx) {
2406
0
        free(lidx->offset);
2407
0
        lidx->m = lidx->n = 0;
2408
0
        lidx->offset = 0;
2409
0
    }
2410
0
}
2411
2412
static int compress_binning(hts_idx_t *idx, int i)
2413
0
{
2414
0
    bidx_t *bidx = idx->bidx[i];
2415
0
    khint_t k;
2416
0
    int l, m;
2417
0
    if (bidx == 0) return 0;
2418
    // merge a bin to its parent if the bin is too small
2419
0
    for (l = idx->n_lvls; l > 0; --l) {
2420
0
        unsigned start = hts_bin_first(l);
2421
0
        for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
2422
0
            bins_t *p, *q;
2423
0
            if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins || kh_key(bidx, k) < start) continue;
2424
0
            p = &kh_value(bidx, k);
2425
0
            if (l < idx->n_lvls && p->n > 1) ks_introsort(_off, p->n, p->list);
2426
0
            if ((p->list[p->n - 1].v>>16) - (p->list[0].u>>16) < HTS_MIN_MARKER_DIST) {
2427
0
                khint_t kp;
2428
0
                kp = kh_get(bin, bidx, hts_bin_parent(kh_key(bidx, k)));
2429
0
                if (kp == kh_end(bidx)) continue;
2430
0
                q = &kh_val(bidx, kp);
2431
0
                if (q->n + p->n > q->m) {
2432
0
                    uint32_t new_m = q->n + p->n;
2433
0
                    hts_pair64_t *new_list;
2434
0
                    kroundup32(new_m);
2435
0
                    if (new_m > INT32_MAX) return -1; // Limited by index format
2436
0
                    new_list = realloc(q->list, new_m * sizeof(*new_list));
2437
0
                    if (!new_list) return -1;
2438
0
                    q->m = new_m;
2439
0
                    q->list = new_list;
2440
0
                }
2441
0
                memcpy(q->list + q->n, p->list, p->n * sizeof(hts_pair64_t));
2442
0
                q->n += p->n;
2443
0
                free(p->list);
2444
0
                kh_del(bin, bidx, k);
2445
0
            }
2446
0
        }
2447
0
    }
2448
0
    k = kh_get(bin, bidx, 0);
2449
0
    if (k != kh_end(bidx)) ks_introsort(_off, kh_val(bidx, k).n, kh_val(bidx, k).list);
2450
    // merge adjacent chunks that start from the same BGZF block
2451
0
    for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
2452
0
        bins_t *p;
2453
0
        if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins) continue;
2454
0
        p = &kh_value(bidx, k);
2455
0
        for (l = 1, m = 0; l < p->n; ++l) {
2456
0
            if (p->list[m].v>>16 >= p->list[l].u>>16) {
2457
0
                if (p->list[m].v < p->list[l].v) p->list[m].v = p->list[l].v;
2458
0
            } else p->list[++m] = p->list[l];
2459
0
        }
2460
0
        p->n = m + 1;
2461
0
    }
2462
0
    return 0;
2463
0
}
2464
2465
int hts_idx_finish(hts_idx_t *idx, uint64_t final_offset)
2466
0
{
2467
0
    int i, ret = 0;
2468
0
    if (idx == NULL || idx->z.finished) return 0; // do not run this function on an empty index or multiple times
2469
0
    if (idx->z.save_tid >= 0) {
2470
0
        ret |= insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin, idx->z.save_off, final_offset);
2471
0
        ret |= insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.off_beg, final_offset);
2472
0
        ret |= insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.n_mapped, idx->z.n_unmapped);
2473
0
    }
2474
0
    for (i = 0; i < idx->n; ++i) {
2475
0
        update_loff(idx, i, (idx->fmt == HTS_FMT_CSI));
2476
0
        ret |= compress_binning(idx, i);
2477
0
    }
2478
0
    idx->z.finished = 1;
2479
2480
0
    return ret;
2481
0
}
2482
2483
static inline hts_pos_t hts_idx_maxpos(const hts_idx_t *idx)
2484
0
{
2485
0
    return hts_bin_maxpos(idx->min_shift, idx->n_lvls);
2486
0
}
2487
2488
int hts_idx_check_range(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end)
2489
0
{
2490
0
    hts_pos_t maxpos = hts_idx_maxpos(idx);
2491
0
    if (tid < 0 || (beg <= maxpos && end <= maxpos))
2492
0
        return 0;
2493
2494
0
    if (idx->fmt == HTS_FMT_CSI) {
2495
0
        hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos" "
2496
0
                      "cannot be stored in a csi index with these parameters. "
2497
0
                      "Please use a larger min_shift or depth",
2498
0
                      beg, end);
2499
0
    } else {
2500
0
        hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos
2501
0
                      " cannot be stored in a %s index. Try using a csi index",
2502
0
                      beg, end, idx_format_name(idx->fmt));
2503
0
    }
2504
0
    errno = ERANGE;
2505
0
    return -1;
2506
0
}
2507
2508
int hts_idx_push(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, uint64_t offset, int is_mapped)
2509
0
{
2510
0
    int bin;
2511
0
    if (tid<0) beg = -1, end = 0;
2512
0
    if (hts_idx_check_range(idx, tid, beg, end) < 0)
2513
0
        return -1;
2514
0
    if (tid >= idx->m) { // enlarge the index
2515
0
        uint32_t new_m = idx->m * 2 > tid + 1 ? idx->m * 2 : tid + 1;
2516
0
        bidx_t **new_bidx;
2517
0
        lidx_t *new_lidx;
2518
2519
0
        new_bidx = (bidx_t**)realloc(idx->bidx, new_m * sizeof(bidx_t*));
2520
0
        if (!new_bidx) return -1;
2521
0
        idx->bidx = new_bidx;
2522
2523
0
        new_lidx = (lidx_t*) realloc(idx->lidx, new_m * sizeof(lidx_t));
2524
0
        if (!new_lidx) return -1;
2525
0
        idx->lidx = new_lidx;
2526
2527
0
        memset(&idx->bidx[idx->m], 0, (new_m - idx->m) * sizeof(bidx_t*));
2528
0
        memset(&idx->lidx[idx->m], 0, (new_m - idx->m) * sizeof(lidx_t));
2529
0
        idx->m = new_m;
2530
0
    }
2531
0
    if (idx->n < tid + 1) idx->n = tid + 1;
2532
0
    if (idx->z.finished) return 0;
2533
0
    if (idx->z.last_tid != tid || (idx->z.last_tid >= 0 && tid < 0)) { // change of chromosome
2534
0
        if ( tid>=0 && idx->n_no_coor )
2535
0
        {
2536
0
            hts_log_error("NO_COOR reads not in a single block at the end %d %d", tid, idx->z.last_tid);
2537
0
            return -1;
2538
0
        }
2539
0
        if (tid>=0 && idx->bidx[tid] != 0)
2540
0
        {
2541
0
            hts_log_error("Chromosome blocks not continuous");
2542
0
            return -1;
2543
0
        }
2544
0
        idx->z.last_tid = tid;
2545
0
        idx->z.last_bin = 0xffffffffu;
2546
0
    } else if (tid >= 0 && idx->z.last_coor > beg) { // test if positions are out of order
2547
0
        hts_log_error("Unsorted positions on sequence #%d: %"PRIhts_pos" followed by %"PRIhts_pos, tid+1, idx->z.last_coor+1, beg+1);
2548
0
        return -1;
2549
0
    }
2550
0
    if (end < beg) {
2551
        // Malformed ranges are errors. (Empty ranges (beg==end) are unusual but acceptable.)
2552
0
        hts_log_error("Invalid record on sequence #%d: end %"PRId64" < begin %"PRId64, tid+1, end, beg+1);
2553
0
        return -1;
2554
0
    }
2555
0
    if ( tid>=0 )
2556
0
    {
2557
0
        if (idx->bidx[tid] == 0) idx->bidx[tid] = kh_init(bin);
2558
        // shoehorn [-1,0) (VCF POS=0) into the leftmost bottom-level bin
2559
0
        if (beg < 0)  beg = 0;
2560
0
        if (end <= 0) end = 1;
2561
        // idx->z.last_off points to the start of the current record
2562
0
        if (insert_to_l(&idx->lidx[tid], beg, end,
2563
0
                        idx->z.last_off, idx->min_shift) < 0) return -1;
2564
0
    }
2565
0
    else idx->n_no_coor++;
2566
0
    bin = hts_reg2bin(beg, end, idx->min_shift, idx->n_lvls);
2567
0
    if ((int)idx->z.last_bin != bin) { // then possibly write the binning index
2568
0
        if (idx->z.save_bin != 0xffffffffu) { // save_bin==0xffffffffu only happens to the first record
2569
0
            if (insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin,
2570
0
                            idx->z.save_off, idx->z.last_off) < 0) return -1;
2571
0
        }
2572
0
        if (idx->z.last_bin == 0xffffffffu && idx->z.save_bin != 0xffffffffu) { // change of chr; keep meta information
2573
0
            idx->z.off_end = idx->z.last_off;
2574
0
            if (insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx),
2575
0
                            idx->z.off_beg, idx->z.off_end) < 0) return -1;
2576
0
            if (insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx),
2577
0
                            idx->z.n_mapped, idx->z.n_unmapped) < 0) return -1;
2578
0
            idx->z.n_mapped = idx->z.n_unmapped = 0;
2579
0
            idx->z.off_beg = idx->z.off_end;
2580
0
        }
2581
0
        idx->z.save_off = idx->z.last_off;
2582
0
        idx->z.save_bin = idx->z.last_bin = bin;
2583
0
        idx->z.save_tid = tid;
2584
0
    }
2585
0
    if (is_mapped) ++idx->z.n_mapped;
2586
0
    else ++idx->z.n_unmapped;
2587
0
    idx->z.last_off = offset;
2588
0
    idx->z.last_coor = beg;
2589
0
    return 0;
2590
0
}
2591
2592
// Needed for TBI only.  Ensure 'tid' with 'name' is in the index meta data.
2593
// idx->meta needs to have been initialised first with an appropriate Tabix
2594
// configuration via hts_idx_set_meta.
2595
//
2596
// NB number of references (first 4 bytes of tabix header) aren't in
2597
// idx->meta, but held in idx->n instead.
2598
0
int hts_idx_tbi_name(hts_idx_t *idx, int tid, const char *name) {
2599
    // Horrid - we have to map incoming tid to a tbi alternative tid.
2600
    // This is because TBI counts tids by "covered" refs while everything
2601
    // else counts by Nth SQ/contig record in header.
2602
0
    if (tid == idx->last_tbi_tid || tid < 0 || !name)
2603
0
        return idx->tbi_n;
2604
2605
0
    uint32_t len = strlen(name)+1;
2606
0
    uint8_t *tmp = (uint8_t *)realloc(idx->meta, idx->l_meta + len);
2607
0
    if (!tmp)
2608
0
        return -1;
2609
2610
    // Append name
2611
0
    idx->meta = tmp;
2612
0
    strcpy((char *)idx->meta + idx->l_meta, name);
2613
0
    idx->l_meta += len;
2614
2615
    // Update seq length
2616
0
    u32_to_le(le_to_u32(idx->meta+24)+len, idx->meta+24);
2617
2618
0
    idx->last_tbi_tid = tid;
2619
0
    return ++idx->tbi_n;
2620
0
}
2621
2622
// When doing samtools index we have a read_bam / hts_idx_push(bgzf_tell())
2623
// loop.  idx->z.last_off is the previous bzgf_tell location, so we know
2624
// the location the current bam record started at as well as where it ends.
2625
//
2626
// When building an index on the fly via a write_bam / hts_idx_push loop,
2627
// this isn't quite identical as we may amend the virtual coord returned
2628
// by bgzf_tell to the start of a new block if the next bam struct doesn't
2629
// fit.  It's essentially the same thing, but for bit-identical indices
2630
// we need to amend the idx->z.last_off when we know we're starting a new
2631
// block.
2632
void hts_idx_amend_last(hts_idx_t *idx, uint64_t offset)
2633
0
{
2634
0
    idx->z.last_off = offset;
2635
0
}
2636
2637
void hts_idx_destroy(hts_idx_t *idx)
2638
9.36k
{
2639
9.36k
    khint_t k;
2640
9.36k
    int i;
2641
9.36k
    if (idx == 0) return;
2642
2643
    // For HTS_FMT_CRAI, idx actually points to a different type -- see sam.c
2644
0
    if (idx->fmt == HTS_FMT_CRAI) {
2645
0
        hts_cram_idx_t *cidx = (hts_cram_idx_t *) idx;
2646
0
        cram_index_free(cidx->cram);
2647
0
        free(cidx);
2648
0
        return;
2649
0
    }
2650
2651
0
    for (i = 0; i < idx->m; ++i) {
2652
0
        bidx_t *bidx = idx->bidx[i];
2653
0
        free(idx->lidx[i].offset);
2654
0
        if (bidx == 0) continue;
2655
0
        for (k = kh_begin(bidx); k != kh_end(bidx); ++k)
2656
0
            if (kh_exist(bidx, k))
2657
0
                free(kh_value(bidx, k).list);
2658
0
        kh_destroy(bin, bidx);
2659
0
    }
2660
0
    free(idx->bidx); free(idx->lidx); free(idx->meta);
2661
0
    free(idx);
2662
0
}
2663
2664
0
int hts_idx_fmt(hts_idx_t *idx) {
2665
0
    return idx->fmt;
2666
0
}
2667
2668
// The optimizer eliminates these ed_is_big() calls; still it would be good to
2669
// TODO Determine endianness at configure- or compile-time
2670
2671
static inline ssize_t HTS_RESULT_USED idx_write_int32(BGZF *fp, int32_t x)
2672
0
{
2673
0
    if (ed_is_big()) x = ed_swap_4(x);
2674
0
    return bgzf_write(fp, &x, sizeof x);
2675
0
}
2676
2677
static inline ssize_t HTS_RESULT_USED idx_write_uint32(BGZF *fp, uint32_t x)
2678
0
{
2679
0
    if (ed_is_big()) x = ed_swap_4(x);
2680
0
    return bgzf_write(fp, &x, sizeof x);
2681
0
}
2682
2683
static inline ssize_t HTS_RESULT_USED idx_write_uint64(BGZF *fp, uint64_t x)
2684
0
{
2685
0
    if (ed_is_big()) x = ed_swap_8(x);
2686
0
    return bgzf_write(fp, &x, sizeof x);
2687
0
}
2688
2689
static inline void swap_bins(bins_t *p)
2690
0
{
2691
0
    int i;
2692
0
    for (i = 0; i < p->n; ++i) {
2693
0
        ed_swap_8p(&p->list[i].u);
2694
0
        ed_swap_8p(&p->list[i].v);
2695
0
    }
2696
0
}
2697
2698
static int need_idx_ugly_delay_hack(const hts_idx_t *idx)
2699
0
{
2700
    // Ugly hack for on-the-fly BAI indexes.  As these are uncompressed,
2701
    // we need to delay writing a few bytes of data until file close
2702
    // so that we have something to force a modification time update.
2703
    //
2704
    // (For compressed indexes like CSI, the BGZF EOF block serves the same
2705
    // purpose).
2706
0
    return idx->otf_fp && !idx->otf_fp->is_compressed;
2707
0
}
2708
2709
static int idx_save_core(const hts_idx_t *idx, BGZF *fp, int fmt)
2710
0
{
2711
0
    int32_t i, j;
2712
2713
0
    #define check(ret) if ((ret) < 0) return -1
2714
2715
    // VCF TBI/CSI only writes IDs for non-empty bins (ie covered references)
2716
    //
2717
    // NOTE: CSI meta is undefined in spec, so this code has an assumption
2718
    // that we're only using it for Tabix data.
2719
0
    int nids = idx->n;
2720
0
    if (idx->meta && idx->l_meta >= 4 && le_to_u32(idx->meta) == TBX_VCF) {
2721
0
        for (i = nids = 0; i < idx->n; ++i) {
2722
0
            if (idx->bidx[i])
2723
0
                nids++;
2724
0
        }
2725
0
    }
2726
0
    check(idx_write_int32(fp, nids));
2727
0
    if (fmt == HTS_FMT_TBI && idx->l_meta)
2728
0
        check(bgzf_write(fp, idx->meta, idx->l_meta));
2729
2730
0
    for (i = 0; i < idx->n; ++i) {
2731
0
        khint_t k;
2732
0
        bidx_t *bidx = idx->bidx[i];
2733
0
        lidx_t *lidx = &idx->lidx[i];
2734
2735
        // write binning index
2736
0
        if (nids == idx->n || bidx)
2737
0
            check(idx_write_int32(fp, bidx? kh_size(bidx) : 0));
2738
0
        if (bidx)
2739
0
            for (k = kh_begin(bidx); k != kh_end(bidx); ++k)
2740
0
                if (kh_exist(bidx, k)) {
2741
0
                    bins_t *p = &kh_value(bidx, k);
2742
0
                    check(idx_write_uint32(fp, kh_key(bidx, k)));
2743
0
                    if (fmt == HTS_FMT_CSI) check(idx_write_uint64(fp, p->loff));
2744
                    //int j;for(j=0;j<p->n;++j)fprintf(stderr,"%d,%llx,%d,%llx:%llx\n",kh_key(bidx,k),kh_val(bidx, k).loff,j,p->list[j].u,p->list[j].v);
2745
0
                    check(idx_write_int32(fp, p->n));
2746
0
                    for (j = 0; j < p->n; ++j) {
2747
                        //fprintf(stderr, "\t%ld\t%ld\n", p->list[j].u, p->list[j].v);
2748
0
                        check(idx_write_uint64(fp, p->list[j].u));
2749
0
                        check(idx_write_uint64(fp, p->list[j].v));
2750
0
                    }
2751
0
                }
2752
2753
        // write linear index
2754
0
        if (fmt != HTS_FMT_CSI) {
2755
0
            check(idx_write_int32(fp, lidx->n));
2756
0
            for (j = 0; j < lidx->n; ++j)
2757
0
                check(idx_write_uint64(fp, lidx->offset[j]));
2758
0
        }
2759
0
    }
2760
2761
0
    if (!need_idx_ugly_delay_hack(idx)) {
2762
        // Write this for compressed (CSI) indexes, but for BAI we
2763
        // need to save a bit for later.  See hts_idx_close_otf_fp()
2764
0
        check(idx_write_uint64(fp, idx->n_no_coor));
2765
0
    }
2766
2767
#ifdef DEBUG_INDEX
2768
    idx_dump(idx);
2769
#endif
2770
2771
0
    return 0;
2772
0
    #undef check
2773
0
}
2774
2775
int hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt)
2776
0
{
2777
0
    int ret, save;
2778
0
    if (idx == NULL || fn == NULL) { errno = EINVAL; return -1; }
2779
0
    char *fnidx = (char*)calloc(1, strlen(fn) + 5);
2780
0
    if (fnidx == NULL) return -1;
2781
2782
0
    strcpy(fnidx, fn);
2783
0
    switch (fmt) {
2784
0
    case HTS_FMT_BAI: strcat(fnidx, ".bai"); break;
2785
0
    case HTS_FMT_CSI: strcat(fnidx, ".csi"); break;
2786
0
    case HTS_FMT_TBI: strcat(fnidx, ".tbi"); break;
2787
0
    default: abort();
2788
0
    }
2789
2790
0
    ret = hts_idx_save_as(idx, fn, fnidx, fmt);
2791
0
    save = errno;
2792
0
    free(fnidx);
2793
0
    errno = save;
2794
0
    return ret;
2795
0
}
2796
2797
static int hts_idx_write_out(const hts_idx_t *idx, BGZF *fp, int fmt)
2798
0
{
2799
0
    #define check(ret) if ((ret) < 0) return -1
2800
2801
0
    if (fmt == HTS_FMT_CSI) {
2802
0
        check(bgzf_write(fp, "CSI\1", 4));
2803
0
        check(idx_write_int32(fp, idx->min_shift));
2804
0
        check(idx_write_int32(fp, idx->n_lvls));
2805
0
        check(idx_write_uint32(fp, idx->l_meta));
2806
0
        if (idx->l_meta) check(bgzf_write(fp, idx->meta, idx->l_meta));
2807
0
    } else if (fmt == HTS_FMT_TBI) {
2808
0
        check(bgzf_write(fp, "TBI\1", 4));
2809
0
    } else if (fmt == HTS_FMT_BAI) {
2810
0
        check(bgzf_write(fp, "BAI\1", 4));
2811
0
    } else abort();
2812
2813
0
    check(idx_save_core(idx, fp, fmt));
2814
2815
0
    #undef check
2816
0
    return 0;
2817
0
}
2818
2819
int hts_idx_save_as(const hts_idx_t *idx, const char *fn, const char *fnidx, int fmt)
2820
0
{
2821
0
    BGZF *fp;
2822
2823
0
    if (fnidx == NULL)
2824
0
        return hts_idx_save(idx, fn, fmt);
2825
2826
0
    fp = bgzf_open(fnidx, (fmt == HTS_FMT_BAI)? "wu" : "w");
2827
0
    if (fp == NULL) return -1;
2828
2829
0
    if (hts_idx_write_out(idx, fp, fmt) < 0) {
2830
0
        int save_errno = errno;
2831
0
        bgzf_close(fp);
2832
0
        errno = save_errno;
2833
0
        return -1;
2834
0
    }
2835
2836
0
    return bgzf_close(fp);
2837
0
}
2838
2839
// idx_save for on-the-fly indexes.  Mostly duplicated from above, except
2840
// idx is not const because we want to store the file handle in it, and
2841
// the index file handle is not closed.  This allows the index file to be
2842
// closed after the EOF block on the indexed file has been written out,
2843
// so the modification times on the two files will be in the correct order.
2844
int hts_idx_save_but_not_close(hts_idx_t *idx, const char *fnidx, int fmt)
2845
0
{
2846
0
    idx->otf_fp = bgzf_open(fnidx, (fmt == HTS_FMT_BAI)? "wu" : "w");
2847
0
    if (idx->otf_fp == NULL) return -1;
2848
2849
0
    if (hts_idx_write_out(idx, idx->otf_fp, fmt) < 0) {
2850
0
        int save_errno = errno;
2851
0
        bgzf_close(idx->otf_fp);
2852
0
        idx->otf_fp = NULL;
2853
0
        errno = save_errno;
2854
0
        return -1;
2855
0
    }
2856
2857
0
    return bgzf_flush(idx->otf_fp);
2858
0
}
2859
2860
static int hts_idx_close_otf_fp(hts_idx_t *idx)
2861
0
{
2862
0
    if (idx && idx->otf_fp) {
2863
0
        int ret = 0;
2864
0
        if (need_idx_ugly_delay_hack(idx)) {
2865
            // BAI index - write out the bytes we deferred earlier
2866
0
            ret = idx_write_uint64(idx->otf_fp, idx->n_no_coor) < 0;
2867
0
        }
2868
0
        ret |= bgzf_close(idx->otf_fp) < 0;
2869
0
        idx->otf_fp = NULL;
2870
0
        return ret == 0 ? 0 : -1;
2871
0
    }
2872
0
    return 0;
2873
0
}
2874
2875
static int idx_read_core(hts_idx_t *idx, BGZF *fp, int fmt)
2876
0
{
2877
0
    int32_t i, n, is_be;
2878
0
    is_be = ed_is_big();
2879
0
    if (idx == NULL) return -4;
2880
0
    for (i = 0; i < idx->n; ++i) {
2881
0
        bidx_t *h;
2882
0
        lidx_t *l = &idx->lidx[i];
2883
0
        uint32_t key;
2884
0
        int j, absent;
2885
0
        bins_t *p;
2886
0
        h = idx->bidx[i] = kh_init(bin);
2887
0
        if (bgzf_read(fp, &n, 4) != 4) return -1;
2888
0
        if (is_be) ed_swap_4p(&n);
2889
0
        if (n < 0) return -3;
2890
0
        for (j = 0; j < n; ++j) {
2891
0
            khint_t k;
2892
0
            if (bgzf_read(fp, &key, 4) != 4) return -1;
2893
0
            if (is_be) ed_swap_4p(&key);
2894
0
            k = kh_put(bin, h, key, &absent);
2895
0
            if (absent <  0) return -2; // No memory
2896
0
            if (absent == 0) return -3; // Duplicate bin number
2897
0
            p = &kh_val(h, k);
2898
0
            if (fmt == HTS_FMT_CSI) {
2899
0
                if (bgzf_read(fp, &p->loff, 8) != 8) return -1;
2900
0
                if (is_be) ed_swap_8p(&p->loff);
2901
0
            } else p->loff = 0;
2902
0
            if (bgzf_read(fp, &p->n, 4) != 4) return -1;
2903
0
            if (is_be) ed_swap_4p(&p->n);
2904
0
            if (p->n < 0) return -3;
2905
0
            if ((size_t) p->n > SIZE_MAX / sizeof(hts_pair64_t)) return -2;
2906
0
            p->m = p->n;
2907
0
            p->list = (hts_pair64_t*)malloc(p->m * sizeof(hts_pair64_t));
2908
0
            if (p->list == NULL) return -2;
2909
0
            if (bgzf_read(fp, p->list, ((size_t) p->n)<<4) != ((size_t) p->n)<<4) return -1;
2910
0
            if (is_be) swap_bins(p);
2911
0
        }
2912
0
        if (fmt != HTS_FMT_CSI) { // load linear index
2913
0
            int j, k;
2914
0
            uint32_t x;
2915
0
            if (bgzf_read(fp, &x, 4) != 4) return -1;
2916
0
            if (is_be) ed_swap_4p(&x);
2917
0
            l->n = x;
2918
0
            if (l->n < 0) return -3;
2919
0
            if ((size_t) l->n > SIZE_MAX / sizeof(uint64_t)) return -2;
2920
0
            l->m = l->n;
2921
0
            l->offset = (uint64_t*)malloc(l->n * sizeof(uint64_t));
2922
0
            if (l->offset == NULL) return -2;
2923
0
            if (bgzf_read(fp, l->offset, l->n << 3) != l->n << 3) return -1;
2924
0
            if (is_be) for (j = 0; j < l->n; ++j) ed_swap_8p(&l->offset[j]);
2925
0
            for (k = j = 0; j < l->n && l->offset[j] == 0; k = ++j); // stop at the first non-zero entry
2926
0
            for (j = l->n-1; j > k; j--) // fill missing values; may happen given older samtools and tabix
2927
0
                if (l->offset[j-1] == 0) l->offset[j-1] = l->offset[j];
2928
0
            update_loff(idx, i, 0);
2929
0
        }
2930
0
    }
2931
0
    if (bgzf_read(fp, &idx->n_no_coor, 8) != 8) idx->n_no_coor = 0;
2932
0
    if (is_be) ed_swap_8p(&idx->n_no_coor);
2933
#ifdef DEBUG_INDEX
2934
    idx_dump(idx);
2935
#endif
2936
2937
0
    return 0;
2938
0
}
2939
2940
static hts_idx_t *idx_read(const char *fn)
2941
0
{
2942
0
    uint8_t magic[4];
2943
0
    int i, is_be;
2944
0
    hts_idx_t *idx = NULL;
2945
0
    uint8_t *meta = NULL;
2946
0
    BGZF *fp = bgzf_open(fn, "r");
2947
0
    if (fp == NULL) return NULL;
2948
0
    is_be = ed_is_big();
2949
0
    if (bgzf_read(fp, magic, 4) != 4) goto fail;
2950
2951
0
    if (memcmp(magic, "CSI\1", 4) == 0) {
2952
0
        uint32_t x[3], n;
2953
0
        if (bgzf_read(fp, x, 12) != 12) goto fail;
2954
0
        if (is_be) for (i = 0; i < 3; ++i) ed_swap_4p(&x[i]);
2955
0
        if (x[2]) {
2956
0
            if (SIZE_MAX - x[2] < 1) goto fail; // Prevent possible overflow
2957
0
            if ((meta = (uint8_t*)malloc((size_t) x[2] + 1)) == NULL) goto fail;
2958
0
            if (bgzf_read(fp, meta, x[2]) != x[2]) goto fail;
2959
            // Prevent possible strlen past the end in tbx_index_load2
2960
0
            meta[x[2]] = '\0';
2961
0
        }
2962
0
        if (bgzf_read(fp, &n, 4) != 4) goto fail;
2963
0
        if (is_be) ed_swap_4p(&n);
2964
0
        if (n > INT32_MAX) goto fail;
2965
0
        if ((idx = hts_idx_init(n, HTS_FMT_CSI, 0, x[0], x[1])) == NULL) goto fail;
2966
0
        idx->l_meta = x[2];
2967
0
        idx->meta = meta;
2968
0
        meta = NULL;
2969
0
        if (idx_read_core(idx, fp, HTS_FMT_CSI) < 0) goto fail;
2970
0
    }
2971
0
    else if (memcmp(magic, "TBI\1", 4) == 0) {
2972
0
        uint8_t x[8 * 4];
2973
0
        uint32_t n;
2974
        // Read file header
2975
0
        if (bgzf_read(fp, x, sizeof(x)) != sizeof(x)) goto fail;
2976
0
        n = le_to_u32(&x[0]); // location of n_ref
2977
0
        if (n > INT32_MAX) goto fail;
2978
0
        if ((idx = hts_idx_init(n, HTS_FMT_TBI, 0, 14, 5)) == NULL) goto fail;
2979
0
        n = le_to_u32(&x[7*4]); // location of l_nm
2980
0
        if (n > UINT32_MAX - 29) goto fail; // Prevent possible overflow
2981
0
        idx->l_meta = 28 + n;
2982
0
        if ((idx->meta = (uint8_t*)malloc(idx->l_meta + 1)) == NULL) goto fail;
2983
        // copy format, col_seq, col_beg, col_end, meta, skip, l_nm
2984
        // N.B. left in little-endian byte order.
2985
0
        memcpy(idx->meta, &x[1*4], 28);
2986
        // Read in sequence names.
2987
0
        if (bgzf_read(fp, idx->meta + 28, n) != n) goto fail;
2988
        // Prevent possible strlen past the end in tbx_index_load2
2989
0
        idx->meta[idx->l_meta] = '\0';
2990
0
        if (idx_read_core(idx, fp, HTS_FMT_TBI) < 0) goto fail;
2991
0
    }
2992
0
    else if (memcmp(magic, "BAI\1", 4) == 0) {
2993
0
        uint32_t n;
2994
0
        if (bgzf_read(fp, &n, 4) != 4) goto fail;
2995
0
        if (is_be) ed_swap_4p(&n);
2996
0
        if (n > INT32_MAX) goto fail;
2997
0
        if ((idx = hts_idx_init(n, HTS_FMT_BAI, 0, 14, 5)) == NULL) goto fail;
2998
0
        if (idx_read_core(idx, fp, HTS_FMT_BAI) < 0) goto fail;
2999
0
    }
3000
0
    else { errno = EINVAL; goto fail; }
3001
3002
0
    bgzf_close(fp);
3003
0
    return idx;
3004
3005
0
fail:
3006
0
    bgzf_close(fp);
3007
0
    hts_idx_destroy(idx);
3008
0
    free(meta);
3009
0
    return NULL;
3010
0
}
3011
3012
int hts_idx_set_meta(hts_idx_t *idx, uint32_t l_meta, uint8_t *meta,
3013
                      int is_copy)
3014
0
{
3015
0
    uint8_t *new_meta = meta;
3016
0
    if (is_copy) {
3017
0
        size_t l = l_meta;
3018
0
        if (l > SIZE_MAX - 1) {
3019
0
            errno = ENOMEM;
3020
0
            return -1;
3021
0
        }
3022
0
        new_meta = malloc(l + 1);
3023
0
        if (!new_meta) return -1;
3024
0
        memcpy(new_meta, meta, l);
3025
        // Prevent possible strlen past the end in tbx_index_load2
3026
0
        new_meta[l] = '\0';
3027
0
    }
3028
0
    if (idx->meta) free(idx->meta);
3029
0
    idx->l_meta = l_meta;
3030
0
    idx->meta = new_meta;
3031
0
    return 0;
3032
0
}
3033
3034
uint8_t *hts_idx_get_meta(hts_idx_t *idx, uint32_t *l_meta)
3035
0
{
3036
0
    *l_meta = idx->l_meta;
3037
0
    return idx->meta;
3038
0
}
3039
3040
const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr)
3041
0
{
3042
0
    if ( !idx || !idx->n )
3043
0
    {
3044
0
        *n = 0;
3045
0
        return NULL;
3046
0
    }
3047
3048
0
    int tid = 0, i;
3049
0
    const char **names = (const char**) calloc(idx->n,sizeof(const char*));
3050
0
    for (i=0; i<idx->n; i++)
3051
0
    {
3052
0
        bidx_t *bidx = idx->bidx[i];
3053
0
        if ( !bidx ) continue;
3054
0
        names[tid++] = getid(hdr,i);
3055
0
    }
3056
0
    *n = tid;
3057
0
    return names;
3058
0
}
3059
3060
0
int hts_idx_nseq(const hts_idx_t *idx) {
3061
0
    if (!idx) return -1;
3062
0
    return idx->n;
3063
0
}
3064
3065
int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped, uint64_t* unmapped)
3066
0
{
3067
0
    if (!idx) return -1;
3068
0
    if ( idx->fmt == HTS_FMT_CRAI ) {
3069
0
        *mapped = 0; *unmapped = 0;
3070
0
        return -1;
3071
0
    }
3072
3073
0
    bidx_t *h = idx->bidx[tid];
3074
0
    if (!h) return -1;
3075
0
    khint_t k = kh_get(bin, h, META_BIN(idx));
3076
0
    if (k != kh_end(h)) {
3077
0
        *mapped = kh_val(h, k).list[1].u;
3078
0
        *unmapped = kh_val(h, k).list[1].v;
3079
0
        return 0;
3080
0
    } else {
3081
0
        *mapped = 0; *unmapped = 0;
3082
0
        return -1;
3083
0
    }
3084
0
}
3085
3086
uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx)
3087
0
{
3088
0
    if (idx->fmt == HTS_FMT_CRAI) return 0;
3089
0
    return idx->n_no_coor;
3090
0
}
3091
3092
/****************
3093
 *** Iterator ***
3094
 ****************/
3095
3096
// Note: even with 32-bit hts_pos_t, end needs to be 64-bit here due to 1LL<<s.
3097
static inline int reg2bins_narrow(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls, bidx_t *bidx)
3098
0
{
3099
0
    int l, t, s = min_shift + (n_lvls<<1) + n_lvls;
3100
0
    for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
3101
0
        hts_pos_t b, e;
3102
0
        int i;
3103
0
        b = t + (beg>>s); e = t + (end>>s);
3104
0
        for (i = b; i <= e; ++i) {
3105
0
            if (kh_get(bin, bidx, i) != kh_end(bidx)) {
3106
0
                assert(itr->bins.n < itr->bins.m);
3107
0
                itr->bins.a[itr->bins.n++] = i;
3108
0
            }
3109
0
        }
3110
0
    }
3111
0
    return itr->bins.n;
3112
0
}
3113
3114
static inline int reg2bins_wide(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls, bidx_t *bidx)
3115
0
{
3116
0
    khint_t i;
3117
0
    hts_pos_t max_shift = 3 * n_lvls + min_shift;
3118
0
    --end;
3119
0
    if (beg < 0) beg = 0;
3120
0
    for (i = kh_begin(bidx); i != kh_end(bidx); i++) {
3121
0
        if (!kh_exist(bidx, i)) continue;
3122
0
        hts_pos_t bin = (hts_pos_t) kh_key(bidx, i);
3123
0
        int level = hts_bin_level(bin);
3124
0
        if (level > n_lvls) continue; // Dodgy index?
3125
0
        hts_pos_t first = hts_bin_first(level);
3126
0
        hts_pos_t beg_at_level = first + (beg >> (max_shift - 3 * level));
3127
0
        hts_pos_t end_at_level = first + (end >> (max_shift - 3 * level));
3128
0
        if (beg_at_level <= bin && bin <= end_at_level) {
3129
0
            assert(itr->bins.n < itr->bins.m);
3130
0
            itr->bins.a[itr->bins.n++] = bin;
3131
0
        }
3132
0
    }
3133
0
    return itr->bins.n;
3134
0
}
3135
3136
static inline int reg2bins(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls, bidx_t *bidx)
3137
0
{
3138
0
    int l, t, s = min_shift + (n_lvls<<1) + n_lvls;
3139
0
    size_t reg_bin_count = 0, hash_bin_count = kh_n_buckets(bidx), max_bins;
3140
0
    hts_pos_t end1;
3141
0
    if (end >= 1LL<<s) end = 1LL<<s;
3142
0
    if (beg >= end) return 0;
3143
0
    end1 = end - 1;
3144
3145
    // Count bins to see if it's faster to iterate through the hash table
3146
    // or the set of bins covering the region
3147
0
    for (l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
3148
0
        reg_bin_count += (end1 >> s) - (beg >> s) + 1;
3149
0
    }
3150
0
    max_bins = reg_bin_count < kh_size(bidx) ? reg_bin_count : kh_size(bidx);
3151
0
    if (itr->bins.m - itr->bins.n < max_bins) {
3152
        // Worst-case memory usage.  May be wasteful on very sparse
3153
        // data, but the bin list usually won't be too big anyway.
3154
0
        size_t new_m = max_bins + itr->bins.n;
3155
0
        if (new_m > INT_MAX || new_m > SIZE_MAX / sizeof(int)) {
3156
0
            errno = ENOMEM;
3157
0
            return -1;
3158
0
        }
3159
0
        int *new_a = realloc(itr->bins.a, new_m * sizeof(*new_a));
3160
0
        if (!new_a) return -1;
3161
0
        itr->bins.a = new_a;
3162
0
        itr->bins.m = new_m;
3163
0
    }
3164
0
    if (reg_bin_count < hash_bin_count) {
3165
0
        return reg2bins_narrow(beg, end, itr, min_shift, n_lvls, bidx);
3166
0
    } else {
3167
0
        return reg2bins_wide(beg, end, itr, min_shift, n_lvls, bidx);
3168
0
    }
3169
0
}
3170
3171
static inline int add_to_interval(hts_itr_t *iter, bins_t *bin,
3172
                                  int tid, uint32_t interval,
3173
                                  uint64_t min_off, uint64_t max_off)
3174
0
{
3175
0
    hts_pair64_max_t *off;
3176
0
    int j;
3177
3178
0
    if (!bin->n)
3179
0
        return 0;
3180
0
    off = realloc(iter->off, (iter->n_off + bin->n) * sizeof(*off));
3181
0
    if (!off)
3182
0
        return -2;
3183
3184
0
    iter->off = off;
3185
0
    for (j = 0; j < bin->n; ++j) {
3186
0
        if (bin->list[j].v > min_off && bin->list[j].u < max_off) {
3187
0
            iter->off[iter->n_off].u = min_off > bin->list[j].u
3188
0
                ? min_off : bin->list[j].u;
3189
0
            iter->off[iter->n_off].v = max_off < bin->list[j].v
3190
0
                ? max_off : bin->list[j].v;
3191
            // hts_pair64_max_t::max is now used to link
3192
            // file offsets to region list entries.
3193
            // The iterator can use this to decide if it
3194
            // can skip some file regions.
3195
0
            iter->off[iter->n_off].max = ((uint64_t) tid << 32) | interval;
3196
0
            iter->n_off++;
3197
0
        }
3198
0
    }
3199
0
    return 0;
3200
0
}
3201
3202
static inline int reg2intervals_narrow(hts_itr_t *iter, const bidx_t *bidx,
3203
                                       int tid, int64_t beg, int64_t end,
3204
                                       uint32_t interval,
3205
                                       uint64_t min_off, uint64_t max_off,
3206
                                       int min_shift, int n_lvls)
3207
0
{
3208
0
    int l, t, s = min_shift + n_lvls * 3;
3209
0
    hts_pos_t b, e, i;
3210
3211
0
    for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
3212
0
        b = t + (beg>>s); e = t + (end>>s);
3213
0
        for (i = b; i <= e; ++i) {
3214
0
            khint_t k = kh_get(bin, bidx, i);
3215
0
            if (k != kh_end(bidx)) {
3216
0
                bins_t *bin = &kh_value(bidx, k);
3217
0
                int res = add_to_interval(iter, bin, tid, interval, min_off, max_off);
3218
0
                if (res < 0)
3219
0
                    return res;
3220
0
            }
3221
0
        }
3222
0
    }
3223
0
    return 0;
3224
0
}
3225
3226
static inline int reg2intervals_wide(hts_itr_t *iter, const bidx_t *bidx,
3227
                                       int tid, int64_t beg, int64_t end,
3228
                                       uint32_t interval,
3229
                                       uint64_t min_off, uint64_t max_off,
3230
                                       int min_shift, int n_lvls)
3231
0
{
3232
0
    khint_t i;
3233
0
    hts_pos_t max_shift = 3 * n_lvls + min_shift;
3234
0
    --end;
3235
0
    if (beg < 0) beg = 0;
3236
0
    for (i = kh_begin(bidx); i != kh_end(bidx); i++) {
3237
0
        if (!kh_exist(bidx, i)) continue;
3238
0
        hts_pos_t bin = (hts_pos_t) kh_key(bidx, i);
3239
0
        int level = hts_bin_level(bin);
3240
0
        if (level > n_lvls) continue; // Dodgy index?
3241
0
        hts_pos_t first = hts_bin_first(level);
3242
0
        hts_pos_t beg_at_level = first + (beg >> (max_shift - 3 * level));
3243
0
        hts_pos_t end_at_level = first + (end >> (max_shift - 3 * level));
3244
0
        if (beg_at_level <= bin && bin <= end_at_level) {
3245
0
            bins_t *bin = &kh_value(bidx, i);
3246
0
            int res = add_to_interval(iter, bin, tid, interval, min_off, max_off);
3247
0
            if (res < 0)
3248
0
                return res;
3249
0
        }
3250
0
    }
3251
0
    return 0;
3252
0
}
3253
3254
static inline int reg2intervals(hts_itr_t *iter, const hts_idx_t *idx, int tid, int64_t beg, int64_t end, uint32_t interval, uint64_t min_off, uint64_t max_off, int min_shift, int n_lvls)
3255
0
{
3256
0
    int l, t, s;
3257
0
    int i, j;
3258
0
    hts_pos_t end1;
3259
0
    bidx_t *bidx;
3260
0
    int start_n_off;
3261
0
    size_t reg_bin_count = 0, hash_bin_count;
3262
0
    int res;
3263
3264
0
    if (!iter || !idx || (bidx = idx->bidx[tid]) == NULL || beg > end)
3265
0
        return -1;
3266
3267
0
    hash_bin_count = kh_n_buckets(bidx);
3268
3269
0
    s = min_shift + (n_lvls<<1) + n_lvls;
3270
0
    if (end >= 1LL<<s)
3271
0
        end = 1LL<<s;
3272
3273
0
    end1 = end - 1;
3274
    // Count bins to see if it's faster to iterate through the hash table
3275
    // or the set of bins covering the region
3276
0
    for (l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
3277
0
        reg_bin_count += (end1 >> s) - (beg >> s) + 1;
3278
0
    }
3279
3280
0
    start_n_off = iter->n_off;
3281
3282
    // Populate iter->off with the intervals for this region
3283
0
    if (reg_bin_count < hash_bin_count) {
3284
0
        res = reg2intervals_narrow(iter, bidx, tid, beg, end, interval,
3285
0
                                   min_off, max_off, min_shift, n_lvls);
3286
0
    } else {
3287
0
        res = reg2intervals_wide(iter, bidx, tid, beg, end, interval,
3288
0
                                 min_off, max_off, min_shift, n_lvls);
3289
0
    }
3290
0
    if (res < 0)
3291
0
        return res;
3292
3293
0
    if (iter->n_off - start_n_off > 1) {
3294
0
        ks_introsort(_off_max, iter->n_off - start_n_off, iter->off + start_n_off);
3295
0
        for (i = start_n_off, j = start_n_off + 1; j < iter->n_off; j++) {
3296
0
            if (iter->off[i].v >= iter->off[j].u) {
3297
0
                if (iter->off[i].v < iter->off[j].v)
3298
0
                    iter->off[i].v = iter->off[j].v;
3299
0
            } else {
3300
0
                i++;
3301
0
                if (i < j)
3302
0
                    iter->off[i] = iter->off[j];
3303
0
            }
3304
0
        }
3305
0
        iter->n_off = i + 1;
3306
0
    }
3307
3308
0
    return iter->n_off;
3309
0
}
3310
3311
0
static int compare_regions(const void *r1, const void *r2) {
3312
0
    hts_reglist_t *reg1 = (hts_reglist_t *)r1;
3313
0
    hts_reglist_t *reg2 = (hts_reglist_t *)r2;
3314
3315
0
    if (reg1->tid < 0 && reg2->tid >= 0)
3316
0
        return 1;
3317
0
    else if (reg1->tid >= 0 && reg2->tid < 0)
3318
0
        return -1;
3319
0
    else
3320
0
        return reg1->tid - reg2->tid;
3321
0
}
3322
3323
0
uint64_t hts_itr_off(const hts_idx_t* idx, int tid) {
3324
3325
0
    int i;
3326
0
    bidx_t* bidx;
3327
0
    uint64_t off0 = (uint64_t) -1;
3328
0
    khint_t k;
3329
0
    switch (tid) {
3330
0
    case HTS_IDX_START:
3331
        // Find the smallest offset, note that sequence ids may not be ordered sequentially
3332
0
        for (i = 0; i < idx->n; i++) {
3333
0
            bidx = idx->bidx[i];
3334
0
            k = kh_get(bin, bidx, META_BIN(idx));
3335
0
            if (k == kh_end(bidx))
3336
0
                continue;
3337
3338
0
            if (off0 > kh_val(bidx, k).list[0].u)
3339
0
                off0 = kh_val(bidx, k).list[0].u;
3340
0
        }
3341
0
        if (off0 == (uint64_t) -1 && idx->n_no_coor)
3342
0
            off0 = 0;
3343
        // only no-coor reads in this bam
3344
0
        break;
3345
0
    case HTS_IDX_NOCOOR:
3346
        /* No-coor reads sort after all of the mapped reads.  The position
3347
           is not stored in the index itself, so need to find the end
3348
           offset for the last mapped read.  A loop is needed here in
3349
           case references at the end of the file have no mapped reads,
3350
           or sequence ids are not ordered sequentially.
3351
           See issue samtools#568 and commits b2aab8, 60c22d and cc207d. */
3352
0
        for (i = 0; i < idx->n; i++) {
3353
0
            bidx = idx->bidx[i];
3354
0
            k = kh_get(bin, bidx, META_BIN(idx));
3355
0
            if (k != kh_end(bidx)) {
3356
0
                if (off0 == (uint64_t) -1 || off0 < kh_val(bidx, k).list[0].v) {
3357
0
                    off0 = kh_val(bidx, k).list[0].v;
3358
0
                }
3359
0
            }
3360
0
        }
3361
0
        if (off0 == (uint64_t) -1 && idx->n_no_coor)
3362
0
            off0 = 0;
3363
        // only no-coor reads in this bam
3364
0
        break;
3365
0
    case HTS_IDX_REST:
3366
0
        off0 = 0;
3367
0
        break;
3368
0
    case HTS_IDX_NONE:
3369
0
        off0 = 0;
3370
0
        break;
3371
0
    }
3372
3373
0
    return off0;
3374
0
}
3375
3376
hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec)
3377
0
{
3378
0
    int i, n_off, l, bin;
3379
0
    hts_pair64_max_t *off;
3380
0
    khint_t k;
3381
0
    bidx_t *bidx;
3382
0
    uint64_t min_off, max_off;
3383
0
    hts_pos_t idx_maxpos;
3384
0
    hts_itr_t *iter;
3385
0
    uint32_t unmapped = 0, rel_off;
3386
3387
    // It's possible to call this function with NULL idx iff
3388
    // tid is one of the special values HTS_IDX_REST or HTS_IDX_NONE
3389
0
    if (!idx && !(tid == HTS_IDX_REST || tid == HTS_IDX_NONE)) {
3390
0
        errno = EINVAL;
3391
0
        return NULL;
3392
0
    }
3393
3394
0
    iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t));
3395
0
    if (iter) {
3396
0
        if (tid < 0) {
3397
0
            uint64_t off = hts_itr_off(idx, tid);
3398
0
            if (off != (uint64_t) -1) {
3399
0
                iter->read_rest = 1;
3400
0
                iter->curr_off = off;
3401
0
                iter->readrec = readrec;
3402
0
                if (tid == HTS_IDX_NONE)
3403
0
                    iter->finished = 1;
3404
0
            } else {
3405
0
                free(iter);
3406
0
                iter = NULL;
3407
0
            }
3408
0
        } else if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL) {
3409
0
            iter->finished = 1;
3410
0
        } else {
3411
0
            if (beg < 0) beg = 0;
3412
0
            if (end < beg) {
3413
0
              free(iter);
3414
0
              return NULL;
3415
0
            }
3416
3417
0
            k = kh_get(bin, bidx, META_BIN(idx));
3418
0
            if (k != kh_end(bidx))
3419
0
                unmapped = kh_val(bidx, k).list[1].v;
3420
0
            else
3421
0
                unmapped = 1;
3422
3423
0
            iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
3424
0
            iter->readrec = readrec;
3425
3426
0
            if ( !kh_size(bidx) ) { iter->finished = 1; return iter; }
3427
3428
0
            idx_maxpos = hts_idx_maxpos(idx);
3429
0
            if (beg >= idx_maxpos) { iter->finished = 1; return iter; }
3430
3431
0
            rel_off = beg>>idx->min_shift;
3432
            // compute min_off
3433
0
            bin = hts_bin_first(idx->n_lvls) + rel_off;
3434
0
            do {
3435
0
                int first;
3436
0
                k = kh_get(bin, bidx, bin);
3437
0
                if (k != kh_end(bidx)) break;
3438
0
                first = (hts_bin_parent(bin)<<3) + 1;
3439
0
                if (bin > first) --bin;
3440
0
                else bin = hts_bin_parent(bin);
3441
0
            } while (bin);
3442
0
            if (bin == 0) k = kh_get(bin, bidx, bin);
3443
0
            min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0;
3444
            // min_off can be calculated more accurately if the
3445
            // linear index is available
3446
0
            if (idx->lidx[tid].offset
3447
0
                && rel_off < idx->lidx[tid].n) {
3448
0
                if (min_off < idx->lidx[tid].offset[rel_off])
3449
0
                    min_off = idx->lidx[tid].offset[rel_off];
3450
0
                if (unmapped) {
3451
                    // unmapped reads are not covered by the linear index,
3452
                    // so search backwards for a smaller offset
3453
0
                    int tmp_off;
3454
0
                    for (tmp_off = rel_off-1; tmp_off >= 0; tmp_off--) {
3455
0
                        if (idx->lidx[tid].offset[tmp_off] < min_off) {
3456
0
                            min_off = idx->lidx[tid].offset[tmp_off];
3457
0
                            break;
3458
0
                        }
3459
0
                    }
3460
                    // if the search went too far back or no satisfactory entry
3461
                    // was found, revert to the bin index loff value
3462
0
                    if (k != kh_end(bidx) && (min_off < kh_val(bidx, k).loff || tmp_off < 0))
3463
0
                        min_off = kh_val(bidx, k).loff;
3464
0
                }
3465
0
            } else if (unmapped) { //CSI index
3466
0
                if (k != kh_end(bidx))
3467
0
                    min_off = kh_val(bidx, k).loff;
3468
0
            }
3469
3470
            // compute max_off: a virtual offset from a bin to the right of end
3471
            // First check if end lies within the range of the index (it won't
3472
            // if it's HTS_POS_MAX)
3473
0
            if (end <= idx_maxpos) {
3474
0
                bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1;
3475
0
                if (bin >= idx->n_bins) bin = 0;
3476
0
                while (1) {
3477
                    // search for an extant bin by moving right, but moving up to the
3478
                    // parent whenever we get to a first child (which also covers falling
3479
                    // off the RHS, which wraps around and immediately goes up to bin 0)
3480
0
                    while (bin % 8 == 1) bin = hts_bin_parent(bin);
3481
0
                    if (bin == 0) { max_off = UINT64_MAX; break; }
3482
0
                    k = kh_get(bin, bidx, bin);
3483
0
                    if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) { max_off = kh_val(bidx, k).list[0].u; break; }
3484
0
                    bin++;
3485
0
                }
3486
0
            } else {
3487
                // Searching to end of reference
3488
0
                max_off = UINT64_MAX;
3489
0
            }
3490
3491
            // retrieve bins
3492
0
            if (reg2bins(beg, end, iter, idx->min_shift, idx->n_lvls, bidx) < 0) {
3493
0
                hts_itr_destroy(iter);
3494
0
                return NULL;
3495
0
            }
3496
3497
0
            for (i = n_off = 0; i < iter->bins.n; ++i)
3498
0
                if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx))
3499
0
                    n_off += kh_value(bidx, k).n;
3500
0
            if (n_off == 0) {
3501
                // No overlapping bins means the iterator has already finished.
3502
0
                iter->finished = 1;
3503
0
                return iter;
3504
0
            }
3505
0
            off = calloc(n_off, sizeof(*off));
3506
0
            for (i = n_off = 0; i < iter->bins.n; ++i) {
3507
0
                if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx)) {
3508
0
                    int j;
3509
0
                    bins_t *p = &kh_value(bidx, k);
3510
0
                    for (j = 0; j < p->n; ++j)
3511
0
                        if (p->list[j].v > min_off && p->list[j].u < max_off) {
3512
0
                            off[n_off].u = min_off > p->list[j].u
3513
0
                                ? min_off : p->list[j].u;
3514
0
                            off[n_off].v = max_off < p->list[j].v
3515
0
                                ? max_off : p->list[j].v;
3516
                            // hts_pair64_max_t::max is now used to link
3517
                            // file offsets to region list entries.
3518
                            // The iterator can use this to decide if it
3519
                            // can skip some file regions.
3520
0
                            off[n_off].max = ((uint64_t) tid << 32) | j;
3521
0
                            n_off++;
3522
0
                        }
3523
0
                }
3524
0
            }
3525
3526
0
            if (n_off == 0) {
3527
0
                free(off);
3528
0
                iter->finished = 1;
3529
0
                return iter;
3530
0
            }
3531
0
            ks_introsort(_off_max, n_off, off);
3532
            // resolve completely contained adjacent blocks
3533
0
            for (i = 1, l = 0; i < n_off; ++i)
3534
0
                if (off[l].v < off[i].v) off[++l] = off[i];
3535
0
            n_off = l + 1;
3536
            // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
3537
0
            for (i = 1; i < n_off; ++i)
3538
0
                if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
3539
            // merge adjacent blocks
3540
0
            for (i = 1, l = 0; i < n_off; ++i) {
3541
0
                if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
3542
0
                else off[++l] = off[i];
3543
0
            }
3544
0
            n_off = l + 1;
3545
0
            iter->n_off = n_off; iter->off = off;
3546
0
        }
3547
0
    }
3548
3549
0
    return iter;
3550
0
}
3551
3552
int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter)
3553
0
{
3554
0
    int i, j, bin;
3555
0
    khint_t k;
3556
0
    bidx_t *bidx;
3557
0
    uint64_t min_off, max_off, t_off = (uint64_t)-1;
3558
0
    int tid;
3559
0
    hts_pos_t beg, end, idx_maxpos;
3560
0
    hts_reglist_t *curr_reg;
3561
0
    uint32_t unmapped = 0, rel_off;
3562
3563
0
    if (!idx || !iter || !iter->multi)
3564
0
        return -1;
3565
3566
0
    iter->i = -1;
3567
0
    for (i=0; i<iter->n_reg; i++) {
3568
3569
0
        curr_reg = &iter->reg_list[i];
3570
0
        tid = curr_reg->tid;
3571
3572
0
        if (tid < 0) {
3573
0
            t_off = hts_itr_off(idx, tid);
3574
0
            if (t_off != (uint64_t)-1) {
3575
0
                switch (tid) {
3576
0
                case HTS_IDX_NONE:
3577
0
                    iter->finished = 1;
3578
                    // fall through
3579
0
                case HTS_IDX_START:
3580
0
                case HTS_IDX_REST:
3581
0
                    iter->curr_off = t_off;
3582
0
                    iter->n_reg = 0;
3583
0
                    iter->reg_list = NULL;
3584
0
                    iter->read_rest = 1;
3585
0
                    return 0;
3586
0
                case HTS_IDX_NOCOOR:
3587
0
                    iter->nocoor = 1;
3588
0
                    iter->nocoor_off = t_off;
3589
0
                }
3590
0
            }
3591
0
        } else {
3592
0
            if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL || !kh_size(bidx))
3593
0
                continue;
3594
3595
0
            k = kh_get(bin, bidx, META_BIN(idx));
3596
0
            if (k != kh_end(bidx))
3597
0
                unmapped = kh_val(bidx, k).list[1].v;
3598
0
            else
3599
0
                unmapped = 1;
3600
3601
0
            idx_maxpos = hts_idx_maxpos(idx);
3602
3603
0
            for(j=0; j<curr_reg->count; j++) {
3604
0
                hts_pair32_t *curr_intv = &curr_reg->intervals[j];
3605
0
                if (curr_intv->end < curr_intv->beg)
3606
0
                    continue;
3607
3608
0
                beg = curr_intv->beg;
3609
0
                end = curr_intv->end;
3610
0
                if (beg >= idx_maxpos)
3611
0
                    continue;
3612
0
                rel_off = beg>>idx->min_shift;
3613
3614
                /* Compute 'min_off' by searching the lowest level bin containing 'beg'.
3615
                       If the computed bin is not in the index, try the next bin to the
3616
                       left, belonging to the same parent. If it is the first sibling bin,
3617
                       try the parent bin. */
3618
0
                bin = hts_bin_first(idx->n_lvls) + rel_off;
3619
0
                do {
3620
0
                    int first;
3621
0
                    k = kh_get(bin, bidx, bin);
3622
0
                    if (k != kh_end(bidx)) break;
3623
0
                    first = (hts_bin_parent(bin)<<3) + 1;
3624
0
                    if (bin > first) --bin;
3625
0
                    else bin = hts_bin_parent(bin);
3626
0
                } while (bin);
3627
0
                if (bin == 0)
3628
0
                    k = kh_get(bin, bidx, bin);
3629
0
                min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0;
3630
                // min_off can be calculated more accurately if the
3631
                // linear index is available
3632
0
                if (idx->lidx[tid].offset
3633
0
                    && rel_off < idx->lidx[tid].n) {
3634
0
                    if (min_off < idx->lidx[tid].offset[rel_off])
3635
0
                        min_off = idx->lidx[tid].offset[rel_off];
3636
0
                    if (unmapped) {
3637
0
                        int tmp_off;
3638
0
                        for (tmp_off = rel_off-1; tmp_off >= 0; tmp_off--) {
3639
0
                            if (idx->lidx[tid].offset[tmp_off] < min_off) {
3640
0
                                min_off = idx->lidx[tid].offset[tmp_off];
3641
0
                                break;
3642
0
                            }
3643
0
                        }
3644
3645
0
                        if (k != kh_end(bidx) && (min_off < kh_val(bidx, k).loff || tmp_off < 0))
3646
0
                            min_off = kh_val(bidx, k).loff;
3647
0
                    }
3648
0
                } else if (unmapped) { //CSI index
3649
0
                    if (k != kh_end(bidx))
3650
0
                        min_off = kh_val(bidx, k).loff;
3651
0
                }
3652
3653
                // compute max_off: a virtual offset from a bin to the right of end
3654
                // First check if end lies within the range of the index (it
3655
                // won't if it's HTS_POS_MAX)
3656
0
                if (end <= idx_maxpos) {
3657
0
                    bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1;
3658
0
                    if (bin >= idx->n_bins) bin = 0;
3659
0
                    while (1) {
3660
                        // search for an extant bin by moving right, but moving up to the
3661
                        // parent whenever we get to a first child (which also covers falling
3662
                        // off the RHS, which wraps around and immediately goes up to bin 0)
3663
0
                        while (bin % 8 == 1) bin = hts_bin_parent(bin);
3664
0
                        if (bin == 0) { max_off = UINT64_MAX; break; }
3665
0
                        k = kh_get(bin, bidx, bin);
3666
0
                        if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) {
3667
0
                            max_off = kh_val(bidx, k).list[0].u;
3668
0
                            break;
3669
0
                        }
3670
0
                        bin++;
3671
0
                    }
3672
0
                } else {
3673
                    // Searching to end of reference
3674
0
                    max_off = UINT64_MAX;
3675
0
                }
3676
3677
                //convert coordinates to file offsets
3678
0
                if (reg2intervals(iter, idx, tid, beg, end, j,
3679
0
                                  min_off, max_off,
3680
0
                                  idx->min_shift, idx->n_lvls) < 0) {
3681
0
                    return -1;
3682
0
                }
3683
0
            }
3684
0
        }
3685
0
    }
3686
3687
0
    if (iter->n_off > 1)
3688
0
        ks_introsort(_off_max, iter->n_off, iter->off);
3689
3690
0
    if(!iter->n_off && !iter->nocoor)
3691
0
        iter->finished = 1;
3692
3693
0
    return 0;
3694
0
}
3695
3696
int hts_itr_multi_cram(const hts_idx_t *idx, hts_itr_t *iter)
3697
0
{
3698
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
3699
0
    int tid, i, n_off = 0;
3700
0
    uint32_t j;
3701
0
    hts_pos_t beg, end;
3702
0
    hts_reglist_t *curr_reg;
3703
0
    hts_pair32_t *curr_intv;
3704
0
    hts_pair64_max_t *off = NULL, *tmp;
3705
0
    cram_index *e = NULL;
3706
3707
0
    if (!cidx || !iter || !iter->multi)
3708
0
        return -1;
3709
3710
0
    iter->is_cram = 1;
3711
0
    iter->read_rest = 0;
3712
0
    iter->off = NULL;
3713
0
    iter->n_off = 0;
3714
0
    iter->curr_off = 0;
3715
0
    iter->i = -1;
3716
3717
0
    for (i=0; i<iter->n_reg; i++) {
3718
3719
0
        curr_reg = &iter->reg_list[i];
3720
0
        tid = curr_reg->tid;
3721
3722
0
        if (tid >= 0) {
3723
0
            tmp = realloc(off, (n_off + curr_reg->count) * sizeof(*off));
3724
0
            if (!tmp)
3725
0
                goto err;
3726
0
            off = tmp;
3727
3728
0
            for (j=0; j < curr_reg->count; j++) {
3729
0
                curr_intv = &curr_reg->intervals[j];
3730
0
                if (curr_intv->end < curr_intv->beg)
3731
0
                    continue;
3732
3733
0
                beg = curr_intv->beg;
3734
0
                end = curr_intv->end;
3735
3736
/* First, fetch the container overlapping 'beg' and assign its file offset to u, then
3737
 * find the container overlapping 'end' and assign the relative end of the slice to v.
3738
 * The cram_ptell function will adjust with the container offset, which is not stored
3739
 * in the index.
3740
 */
3741
0
                e = cram_index_query(cidx->cram, tid, beg+1, NULL);
3742
0
                if (e) {
3743
0
                    off[n_off].u = e->offset;
3744
                    // hts_pair64_max_t::max is now used to link
3745
                    // file offsets to region list entries.
3746
                    // The iterator can use this to decide if it
3747
                    // can skip some file regions.
3748
0
                    off[n_off].max = ((uint64_t) tid << 32) | j;
3749
3750
0
                    if (end >= HTS_POS_MAX) {
3751
0
                       e = cram_index_last(cidx->cram, tid, NULL);
3752
0
                    } else {
3753
0
                       e = cram_index_query_last(cidx->cram, tid, end+1);
3754
0
                    }
3755
3756
0
                    if (e) {
3757
0
                        off[n_off++].v = e->e_next
3758
0
                            ? e->e_next->offset
3759
0
                            : e->offset + e->slice + e->len;
3760
0
                    } else {
3761
0
                        hts_log_warning("Could not set offset end for region %d:%"PRIhts_pos"-%"PRIhts_pos". Skipping", tid, beg, end);
3762
0
                    }
3763
0
                }
3764
0
            }
3765
0
        } else {
3766
0
            switch (tid) {
3767
0
                case HTS_IDX_NOCOOR:
3768
0
                    e = cram_index_query(cidx->cram, tid, 1, NULL);
3769
0
                    if (e) {
3770
0
                        iter->nocoor = 1;
3771
0
                        iter->nocoor_off = e->offset;
3772
0
                    } else {
3773
0
                        hts_log_warning("No index entry for NOCOOR region");
3774
0
                    }
3775
0
                    break;
3776
0
                case HTS_IDX_START:
3777
0
                    e = cram_index_query(cidx->cram, tid, 1, NULL);
3778
0
                    if (e) {
3779
0
                        iter->read_rest = 1;
3780
0
                        tmp = realloc(off, sizeof(*off));
3781
0
                        if (!tmp)
3782
0
                            goto err;
3783
0
                        off = tmp;
3784
0
                        off[0].u = e->offset;
3785
0
                        off[0].v = 0;
3786
0
                        n_off=1;
3787
0
                    } else {
3788
0
                        hts_log_warning("No index entries");
3789
0
                    }
3790
0
                    break;
3791
0
                case HTS_IDX_REST:
3792
0
                    break;
3793
0
                case HTS_IDX_NONE:
3794
0
                    iter->finished = 1;
3795
0
                    break;
3796
0
                default:
3797
0
                    hts_log_error("Query with tid=%d not implemented for CRAM files", tid);
3798
0
            }
3799
0
        }
3800
0
    }
3801
3802
0
    if (n_off) {
3803
0
        ks_introsort(_off_max, n_off, off);
3804
0
        iter->n_off = n_off; iter->off = off;
3805
0
    }
3806
3807
0
    if(!n_off && !iter->nocoor)
3808
0
        iter->finished = 1;
3809
3810
0
    return 0;
3811
3812
0
 err:
3813
0
    free(off);
3814
0
    return -1;
3815
0
}
3816
3817
void hts_itr_destroy(hts_itr_t *iter)
3818
0
{
3819
0
    if (iter) {
3820
0
        if (iter->multi) {
3821
0
            hts_reglist_free(iter->reg_list, iter->n_reg);
3822
0
        } else {
3823
0
            free(iter->bins.a);
3824
0
        }
3825
3826
0
        if (iter->off)
3827
0
            free(iter->off);
3828
0
        free(iter);
3829
0
    }
3830
0
}
3831
3832
static inline unsigned long long push_digit(unsigned long long i, char c)
3833
0
{
3834
    // ensure subtraction occurs first, avoiding overflow for >= MAX-48 or so
3835
0
    int digit = c - '0';
3836
0
    return 10 * i + digit;
3837
0
}
3838
3839
long long hts_parse_decimal(const char *str, char **strend, int flags)
3840
0
{
3841
0
    unsigned long long n = 0;
3842
0
    int digits = 0, decimals = 0, e = 0, lost = 0;
3843
0
    char sign = '+', esign = '+';
3844
0
    const char *s, *str_orig = str;
3845
3846
0
    while (isspace_c(*str)) str++;
3847
0
    s = str;
3848
3849
0
    if (*s == '+' || *s == '-') sign = *s++;
3850
0
    while (*s)
3851
0
        if (isdigit_c(*s)) digits++, n = push_digit(n, *s++);
3852
0
        else if (*s == ',' && (flags & HTS_PARSE_THOUSANDS_SEP)) s++;
3853
0
        else break;
3854
3855
0
    if (*s == '.') {
3856
0
        s++;
3857
0
        while (isdigit_c(*s)) decimals++, digits++, n = push_digit(n, *s++);
3858
0
    }
3859
3860
0
    switch (*s) {
3861
0
    case 'e': case 'E':
3862
0
        s++;
3863
0
        if (*s == '+' || *s == '-') esign = *s++;
3864
0
        while (isdigit_c(*s)) e = push_digit(e, *s++);
3865
0
        if (esign == '-') e = -e;
3866
0
        break;
3867
3868
0
    case 'k': case 'K': e += 3; s++; break;
3869
0
    case 'm': case 'M': e += 6; s++; break;
3870
0
    case 'g': case 'G': e += 9; s++; break;
3871
0
    }
3872
3873
0
    e -= decimals;
3874
0
    while (e > 0) n *= 10, e--;
3875
0
    while (e < 0) lost += n % 10, n /= 10, e++;
3876
3877
0
    if (lost > 0) {
3878
0
        hts_log_warning("Discarding fractional part of %.*s", (int)(s - str), str);
3879
0
    }
3880
3881
0
    if (strend) {
3882
        // Set to the original input str pointer if not valid number syntax
3883
0
        *strend = (digits > 0)? (char *)s : (char *)str_orig;
3884
0
    } else if (digits == 0) {
3885
0
        hts_log_warning("Invalid numeric value %.8s[truncated]", str);
3886
0
    } else if (*s) {
3887
0
        if ((flags & HTS_PARSE_THOUSANDS_SEP) || (!(flags & HTS_PARSE_THOUSANDS_SEP) && *s != ','))
3888
0
            hts_log_warning("Ignoring unknown characters after %.*s[%s]", (int)(s - str), str, s);
3889
0
    }
3890
3891
0
    return (sign == '+')? n : -n;
3892
0
}
3893
3894
0
static void *hts_memrchr(const void *s, int c, size_t n) {
3895
0
    size_t i;
3896
0
    unsigned char *u = (unsigned char *)s;
3897
0
    for (i = n; i > 0; i--) {
3898
0
        if (u[i-1] == c)
3899
0
            return u+i-1;
3900
0
    }
3901
3902
0
    return NULL;
3903
0
}
3904
3905
/*
3906
 * A variant of hts_parse_reg which is reference-id aware.  It uses
3907
 * the iterator name2id callbacks to validate the region tokenisation works.
3908
 *
3909
 * This is necessary due to GRCh38 HLA additions which have reference names
3910
 * like "HLA-DRB1*12:17".
3911
 *
3912
 * All parameters are mandatory.
3913
 *
3914
 * To work around ambiguous parsing issues, eg both "chr1" and "chr1:100-200"
3915
 * are reference names, we may quote using curly braces.
3916
 * Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example.
3917
 *
3918
 * Flags are used to control how parsing works, and can be one of the below.
3919
 *
3920
 * HTS_PARSE_LIST:
3921
 *     If present, the region is assmed to be a comma separated list and
3922
 *     position parsing will not contain commas (this implicitly
3923
 *     clears HTS_PARSE_THOUSANDS_SEP in the call to hts_parse_decimal).
3924
 *     On success the return pointer will be the start of the next region, ie
3925
 *     the character after the comma.  (If *ret != '\0' then the caller can
3926
 *     assume another region is present in the list.)
3927
 *
3928
 *     If not set then positions may contain commas.  In this case the return
3929
 *     value should point to the end of the string, or NULL on failure.
3930
 *
3931
 * HTS_PARSE_ONE_COORD:
3932
 *     If present, X:100 is treated as the single base pair region X:100-100.
3933
 *     In this case X:-100 is shorthand for X:1-100 and X:100- is X:100-<end>.
3934
 *     (This is the standard bcftools region convention.)
3935
 *
3936
 *     When not set X:100 is considered to be X:100-<end> where <end> is
3937
 *     the end of chromosome X (set to HTS_POS_MAX here).  X:100- and X:-100
3938
 *     are invalid.
3939
 *     (This is the standard samtools region convention.)
3940
 *
3941
 * Note the supplied string expects 1 based inclusive coordinates, but the
3942
 * returned coordinates start from 0 and are half open, so pos0 is valid
3943
 * for use in e.g. "for (pos0 = beg; pos0 < end; pos0++) {...}"
3944
 *
3945
 * On success a pointer to the byte after the end of the entire region
3946
 *            specifier is returned (plus any trailing comma), and tid,
3947
 *            beg & end will be set.
3948
 * On failure NULL is returned.
3949
 */
3950
const char *hts_parse_region(const char *s, int *tid, hts_pos_t *beg,
3951
                             hts_pos_t *end, hts_name2id_f getid, void *hdr,
3952
                             int flags)
3953
0
{
3954
0
    if (!s || !tid || !beg || !end || !getid)
3955
0
        return NULL;
3956
3957
0
    size_t s_len = strlen(s);
3958
0
    kstring_t ks = { 0, 0, NULL };
3959
3960
0
    const char *colon = NULL, *comma = NULL;
3961
0
    int quoted = 0;
3962
3963
0
    if (flags & HTS_PARSE_LIST)
3964
0
        flags &= ~HTS_PARSE_THOUSANDS_SEP;
3965
0
    else
3966
0
        flags |= HTS_PARSE_THOUSANDS_SEP;
3967
3968
0
    const char *s_end = s + s_len;
3969
3970
    // Braced quoting of references is permitted to resolve ambiguities.
3971
0
    if (*s == '{') {
3972
0
        const char *close = memchr(s, '}', s_len);
3973
0
        if (!close) {
3974
0
            hts_log_error("Mismatching braces in \"%s\"", s);
3975
0
            *tid = -1;
3976
0
            return NULL;
3977
0
        }
3978
0
        s++;
3979
0
        s_len--;
3980
0
        if (close[1] == ':')
3981
0
            colon = close+1;
3982
0
        quoted = 1; // number of trailing characters to trim
3983
3984
        // Truncate to this item only, if appropriate.
3985
0
        if (flags & HTS_PARSE_LIST) {
3986
0
            comma = strchr(close, ',');
3987
0
            if (comma) {
3988
0
                s_len = comma-s;
3989
0
                s_end = comma+1;
3990
0
            }
3991
0
        }
3992
0
    } else {
3993
        // Truncate to this item only, if appropriate.
3994
0
        if (flags & HTS_PARSE_LIST) {
3995
0
            comma = strchr(s, ',');
3996
0
            if (comma) {
3997
0
                s_len = comma-s;
3998
0
                s_end = comma+1;
3999
0
            }
4000
0
        }
4001
4002
0
        colon = hts_memrchr(s, ':', s_len);
4003
0
    }
4004
4005
    // No colon is simplest case; just check and return.
4006
0
    if (colon == NULL) {
4007
0
        *beg = 0; *end = HTS_POS_MAX;
4008
0
        kputsn(s, s_len-quoted, &ks); // convert to nul terminated string
4009
0
        if (!ks.s) {
4010
0
            *tid = -2;
4011
0
            return NULL;
4012
0
        }
4013
4014
0
        *tid = getid(hdr, ks.s);
4015
0
        free(ks.s);
4016
4017
0
        return *tid >= 0 ? s_end : NULL;
4018
0
    }
4019
4020
    // Has a colon, but check whole name first.
4021
0
    if (!quoted) {
4022
0
        *beg = 0; *end = HTS_POS_MAX;
4023
0
        kputsn(s, s_len, &ks); // convert to nul terminated string
4024
0
        if (!ks.s) {
4025
0
            *tid = -2;
4026
0
            return NULL;
4027
0
        }
4028
0
        if ((*tid = getid(hdr, ks.s)) >= 0) {
4029
            // Entire name matches, but also check this isn't
4030
            // ambiguous.  eg we have ref chr1 and ref chr1:100-200
4031
            // both present.
4032
0
            ks.l = 0;
4033
0
            kputsn(s, colon-s, &ks); // convert to nul terminated string
4034
0
            if (!ks.s) {
4035
0
                *tid = -2;
4036
0
                return NULL;
4037
0
            }
4038
0
            if (getid(hdr, ks.s) >= 0) {
4039
0
                free(ks.s);
4040
0
                *tid = -1;
4041
0
                hts_log_error("Range is ambiguous. "
4042
0
                              "Use {%s} or {%.*s}%s instead",
4043
0
                              s, (int)(colon-s), s, colon);
4044
0
                return NULL;
4045
0
            }
4046
0
            free(ks.s);
4047
4048
0
            return s_end;
4049
0
        }
4050
0
        if (*tid < -1) // Failed to parse header
4051
0
            return NULL;
4052
0
    }
4053
4054
    // Quoted, or unquoted and whole string isn't a name.
4055
    // Check the pre-colon part is valid.
4056
0
    ks.l = 0;
4057
0
    kputsn(s, colon-s-quoted, &ks); // convert to nul terminated string
4058
0
    if (!ks.s) {
4059
0
        *tid = -2;
4060
0
        return NULL;
4061
0
    }
4062
0
    *tid = getid(hdr, ks.s);
4063
0
    free(ks.s);
4064
0
    if (*tid < 0)
4065
0
        return NULL;
4066
4067
    // Finally parse the post-colon coordinates
4068
0
    char *hyphen;
4069
0
    *beg = hts_parse_decimal(colon+1, &hyphen, flags) - 1;
4070
0
    if (*beg < 0) {
4071
0
        if (*beg != -1 && *hyphen == '-' && colon[1] != '\0') {
4072
            // User specified zero, but we're 1-based.
4073
0
            hts_log_error("Coordinates must be > 0");
4074
0
            return NULL;
4075
0
        }
4076
0
        if (isdigit_c(*hyphen) || *hyphen == '\0' || *hyphen == ',') {
4077
            // interpret chr:-100 as chr:1-100
4078
0
            *end = *beg==-1 ? HTS_POS_MAX : -(*beg+1);
4079
0
            *beg = 0;
4080
0
            return s_end;
4081
0
        } else if (*beg < -1) {
4082
0
            hts_log_error("Unexpected string \"%s\" after region", hyphen);
4083
0
            return NULL;
4084
0
        }
4085
0
    }
4086
4087
0
    if (*hyphen == '\0' || ((flags & HTS_PARSE_LIST) && *hyphen == ',')) {
4088
0
        *end = flags & HTS_PARSE_ONE_COORD ? *beg+1 : HTS_POS_MAX;
4089
0
    } else if (*hyphen == '-') {
4090
0
        *end = hts_parse_decimal(hyphen+1, &hyphen, flags);
4091
0
        if (*hyphen != '\0' && *hyphen != ',') {
4092
0
            hts_log_error("Unexpected string \"%s\" after region", hyphen);
4093
0
            return NULL;
4094
0
        }
4095
0
    } else {
4096
0
        hts_log_error("Unexpected string \"%s\" after region", hyphen);
4097
0
        return NULL;
4098
0
    }
4099
4100
0
    if (*end == 0)
4101
0
        *end = HTS_POS_MAX; // interpret chr:100- as chr:100-<end>
4102
4103
0
    if (*beg >= *end) return NULL;
4104
4105
0
    return s_end;
4106
0
}
4107
4108
// Next release we should mark this as deprecated?
4109
// Use hts_parse_region above instead.
4110
const char *hts_parse_reg64(const char *s, hts_pos_t *beg, hts_pos_t *end)
4111
0
{
4112
0
    char *hyphen;
4113
0
    const char *colon = strrchr(s, ':');
4114
0
    if (colon == NULL) {
4115
0
        *beg = 0; *end = HTS_POS_MAX;
4116
0
        return s + strlen(s);
4117
0
    }
4118
4119
0
    *beg = hts_parse_decimal(colon+1, &hyphen, HTS_PARSE_THOUSANDS_SEP) - 1;
4120
0
    if (*beg < 0) *beg = 0;
4121
4122
0
    if (*hyphen == '\0') *end = HTS_POS_MAX;
4123
0
    else if (*hyphen == '-') *end = hts_parse_decimal(hyphen+1, NULL, HTS_PARSE_THOUSANDS_SEP);
4124
0
    else return NULL;
4125
4126
0
    if (*beg >= *end) return NULL;
4127
0
    return colon;
4128
0
}
4129
4130
const char *hts_parse_reg(const char *s, int *beg, int *end)
4131
0
{
4132
0
    hts_pos_t beg64 = 0, end64 = 0;
4133
0
    const char *colon = hts_parse_reg64(s, &beg64, &end64);
4134
0
    if (beg64 > INT_MAX) {
4135
0
        hts_log_error("Position %"PRId64" too large", beg64);
4136
0
        return NULL;
4137
0
    }
4138
0
    if (end64 > INT_MAX) {
4139
0
        if (end64 == HTS_POS_MAX) {
4140
0
            end64 = INT_MAX;
4141
0
        } else {
4142
0
            hts_log_error("Position %"PRId64" too large", end64);
4143
0
            return NULL;
4144
0
        }
4145
0
    }
4146
0
    *beg = beg64;
4147
0
    *end = end64;
4148
0
    return colon;
4149
0
}
4150
4151
hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec)
4152
0
{
4153
0
    int tid;
4154
0
    hts_pos_t beg, end;
4155
4156
0
    if (strcmp(reg, ".") == 0)
4157
0
        return itr_query(idx, HTS_IDX_START, 0, 0, readrec);
4158
0
    else if (strcmp(reg, "*") == 0)
4159
0
        return itr_query(idx, HTS_IDX_NOCOOR, 0, 0, readrec);
4160
4161
0
    if (!hts_parse_region(reg, &tid, &beg, &end, getid, hdr, HTS_PARSE_THOUSANDS_SEP))
4162
0
        return NULL;
4163
4164
0
    return itr_query(idx, tid, beg, end, readrec);
4165
0
}
4166
4167
0
hts_itr_t *hts_itr_regions(const hts_idx_t *idx, hts_reglist_t *reglist, int count, hts_name2id_f getid, void *hdr, hts_itr_multi_query_func *itr_specific, hts_readrec_func *readrec, hts_seek_func *seek, hts_tell_func *tell) {
4168
4169
0
    int i;
4170
4171
0
    if (!reglist)
4172
0
        return NULL;
4173
4174
0
    hts_itr_t *itr = (hts_itr_t*)calloc(1, sizeof(hts_itr_t));
4175
0
    if (itr) {
4176
0
        itr->n_reg = count;
4177
0
        itr->readrec = readrec;
4178
0
        itr->seek = seek;
4179
0
        itr->tell = tell;
4180
0
        itr->reg_list = reglist;
4181
0
        itr->finished = 0;
4182
0
        itr->nocoor = 0;
4183
0
        itr->multi = 1;
4184
4185
0
        for (i = 0; i < itr->n_reg; i++) {
4186
0
            if (itr->reg_list[i].reg) {
4187
0
                if (!strcmp(itr->reg_list[i].reg, ".")) {
4188
0
                    itr->reg_list[i].tid = HTS_IDX_START;
4189
0
                    continue;
4190
0
                }
4191
4192
0
                if (!strcmp(itr->reg_list[i].reg, "*")) {
4193
0
                    itr->reg_list[i].tid = HTS_IDX_NOCOOR;
4194
0
                    continue;
4195
0
                }
4196
4197
0
                itr->reg_list[i].tid = getid(hdr, reglist[i].reg);
4198
0
                if (itr->reg_list[i].tid < 0) {
4199
0
                    if (itr->reg_list[i].tid < -1) {
4200
0
                        hts_log_error("Failed to parse header");
4201
0
                        hts_itr_destroy(itr);
4202
0
                        return NULL;
4203
0
                    } else {
4204
0
                        hts_log_warning("Region '%s' specifies an unknown reference name. Continue anyway", reglist[i].reg);
4205
0
                    }
4206
0
                }
4207
0
            }
4208
0
        }
4209
4210
0
        qsort(itr->reg_list, itr->n_reg, sizeof(hts_reglist_t), compare_regions);
4211
0
        if (itr_specific(idx, itr) != 0) {
4212
0
            hts_log_error("Failed to create the multi-region iterator!");
4213
0
            hts_itr_destroy(itr);
4214
0
            itr = NULL;
4215
0
        }
4216
0
    }
4217
4218
0
    return itr;
4219
0
}
4220
4221
int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data)
4222
0
{
4223
0
    int ret, tid;
4224
0
    hts_pos_t beg, end;
4225
0
    if (iter == NULL || iter->finished) return -1;
4226
0
    if (iter->read_rest) {
4227
0
        if (iter->curr_off) { // seek to the start
4228
0
            if (bgzf_seek(fp, iter->curr_off, SEEK_SET) < 0) {
4229
0
                hts_log_error("Failed to seek to offset %"PRIu64"%s%s",
4230
0
                              iter->curr_off,
4231
0
                              errno ? ": " : "", strerror(errno));
4232
0
                return -2;
4233
0
            }
4234
0
            iter->curr_off = 0; // only seek once
4235
0
        }
4236
0
        ret = iter->readrec(fp, data, r, &tid, &beg, &end);
4237
0
        if (ret < 0) iter->finished = 1;
4238
0
        iter->curr_tid = tid;
4239
0
        iter->curr_beg = beg;
4240
0
        iter->curr_end = end;
4241
0
        return ret;
4242
0
    }
4243
    // A NULL iter->off should always be accompanied by iter->finished.
4244
0
    assert(iter->off != NULL);
4245
0
    for (;;) {
4246
0
        if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
4247
0
            if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks
4248
0
            if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
4249
0
                if (bgzf_seek(fp, iter->off[iter->i+1].u, SEEK_SET) < 0) {
4250
0
                    hts_log_error("Failed to seek to offset %"PRIu64"%s%s",
4251
0
                                  iter->off[iter->i+1].u,
4252
0
                                  errno ? ": " : "", strerror(errno));
4253
0
                    return -2;
4254
0
                }
4255
0
                iter->curr_off = bgzf_tell(fp);
4256
0
            }
4257
0
            ++iter->i;
4258
0
        }
4259
0
        if ((ret = iter->readrec(fp, data, r, &tid, &beg, &end)) >= 0) {
4260
0
            iter->curr_off = bgzf_tell(fp);
4261
0
            if (tid != iter->tid || beg >= iter->end) { // no need to proceed
4262
0
                ret = -1; break;
4263
0
            } else if (end > iter->beg && iter->end > beg) {
4264
0
                iter->curr_tid = tid;
4265
0
                iter->curr_beg = beg;
4266
0
                iter->curr_end = end;
4267
0
                return ret;
4268
0
            }
4269
0
        } else break; // end of file or error
4270
0
    }
4271
0
    iter->finished = 1;
4272
0
    return ret;
4273
0
}
4274
4275
int hts_itr_multi_next(htsFile *fd, hts_itr_t *iter, void *r)
4276
0
{
4277
0
    void *fp;
4278
0
    int ret, tid, i, cr, ci;
4279
0
    hts_pos_t beg, end;
4280
0
    hts_reglist_t *found_reg;
4281
4282
0
    if (iter == NULL || iter->finished) return -1;
4283
4284
0
    if (iter->is_cram) {
4285
0
        fp = fd->fp.cram;
4286
0
    } else {
4287
0
        fp = fd->fp.bgzf;
4288
0
    }
4289
4290
0
    if (iter->read_rest) {
4291
0
        if (iter->curr_off) { // seek to the start
4292
0
            if (iter->seek(fp, iter->curr_off, SEEK_SET) < 0) {
4293
0
                hts_log_error("Seek at offset %" PRIu64 " failed.", iter->curr_off);
4294
0
                return -2;
4295
0
            }
4296
0
            iter->curr_off = 0; // only seek once
4297
0
        }
4298
4299
0
        ret = iter->readrec(fp, fd, r, &tid, &beg, &end);
4300
0
        if (ret < 0)
4301
0
            iter->finished = 1;
4302
4303
0
        iter->curr_tid = tid;
4304
0
        iter->curr_beg = beg;
4305
0
        iter->curr_end = end;
4306
4307
0
        return ret;
4308
0
    }
4309
    // A NULL iter->off should always be accompanied by iter->finished.
4310
0
    assert(iter->off != NULL || iter->nocoor != 0);
4311
4312
0
    int next_range = 0;
4313
0
    for (;;) {
4314
        // Note that due to the way bam indexing works, iter->off may contain
4315
        // file chunks that are not actually needed as they contain data
4316
        // beyond the end of the requested region.  These are filtered out
4317
        // by comparing the tid and index into hts_reglist_t::intervals
4318
        // (packed for reasons of convenience into iter->off[iter->i].max)
4319
        // associated with the file region with iter->curr_tid and
4320
        // iter->curr_intv.
4321
4322
0
        if (next_range
4323
0
            || iter->curr_off == 0
4324
0
            || iter->i >= iter->n_off
4325
0
            || iter->curr_off >= iter->off[iter->i].v
4326
0
            || (iter->off[iter->i].max >> 32 == iter->curr_tid
4327
0
                && (iter->off[iter->i].max & 0xffffffff) < iter->curr_intv)) {
4328
4329
            // Jump to the next chunk.  It may be necessary to skip more
4330
            // than one as the iter->off list can include overlapping entries.
4331
0
            do {
4332
0
                iter->i++;
4333
0
            } while (iter->i < iter->n_off
4334
0
                     && (iter->curr_off >= iter->off[iter->i].v
4335
0
                         || (iter->off[iter->i].max >> 32 == iter->curr_tid
4336
0
                             && (iter->off[iter->i].max & 0xffffffff) < iter->curr_intv)));
4337
4338
0
            if (iter->is_cram && iter->i < iter->n_off) {
4339
                // Ensure iter->curr_reg is correct.
4340
                //
4341
                // We need this for CRAM as we shortcut some of the later
4342
                // logic by getting an end-of-range and continuing to the
4343
                // next offset.
4344
                //
4345
                // We cannot do this for BAM (and fortunately do not need to
4346
                // either) because in BAM world a query to genomic positions
4347
                // GX and GY leading to a seek offsets PX and PY may have
4348
                // GX > GY and PX < PY.  (This is due to the R-tree and falling
4349
                // between intervals, bumping up to a higher bin.)
4350
                // CRAM strictly follows PX >= PY if GX >= GY, so this logic
4351
                // works.
4352
0
                int want_tid = iter->off[iter->i].max >> 32;
4353
0
                if (!(iter->curr_reg < iter->n_reg &&
4354
0
                      iter->reg_list[iter->curr_reg].tid == want_tid)) {
4355
0
                    int j;
4356
0
                    for (j = 0; j < iter->n_reg; j++)
4357
0
                        if (iter->reg_list[j].tid == want_tid)
4358
0
                            break;
4359
0
                    if (j == iter->n_reg)
4360
0
                        return -1;
4361
0
                    iter->curr_reg = j;
4362
0
                    iter->curr_tid = iter->reg_list[iter->curr_reg].tid;
4363
0
                };
4364
0
                iter->curr_intv = iter->off[iter->i].max & 0xffffffff;
4365
0
            }
4366
4367
0
            if (iter->i >= iter->n_off) { // no more chunks, except NOCOORs
4368
0
                if (iter->nocoor) {
4369
0
                    next_range = 0;
4370
0
                    if (iter->seek(fp, iter->nocoor_off, SEEK_SET) < 0) {
4371
0
                        hts_log_error("Seek at offset %" PRIu64 " failed.", iter->nocoor_off);
4372
0
                        return -2;
4373
0
                    }
4374
0
                    if (iter->is_cram) {
4375
0
                        cram_range r = { HTS_IDX_NOCOOR };
4376
0
                        cram_set_option(fp, CRAM_OPT_RANGE_NOSEEK, &r);
4377
0
                    }
4378
4379
                    // The first slice covering the unmapped reads might
4380
                    // contain a few mapped reads, so scroll
4381
                    // forward until finding the first unmapped read.
4382
0
                    do {
4383
0
                        ret = iter->readrec(fp, fd, r, &tid, &beg, &end);
4384
0
                    } while (tid >= 0 && ret >=0);
4385
4386
0
                    if (ret < 0)
4387
0
                        iter->finished = 1;
4388
0
                    else
4389
0
                        iter->read_rest = 1;
4390
4391
0
                    iter->curr_off = 0; // don't seek any more
4392
0
                    iter->curr_tid = tid;
4393
0
                    iter->curr_beg = beg;
4394
0
                    iter->curr_end = end;
4395
4396
0
                    return ret;
4397
0
                } else {
4398
0
                    ret = -1; break;
4399
0
                }
4400
0
            } else if (iter->i < iter->n_off) {
4401
                // New chunk may overlap the last one, so ensure we
4402
                // only seek forwards.
4403
0
                if (iter->curr_off < iter->off[iter->i].u || next_range) {
4404
0
                    iter->curr_off = iter->off[iter->i].u;
4405
4406
                    // CRAM has the capability of setting an end location.
4407
                    // This means multi-threaded decodes can stop once they
4408
                    // reach that point, rather than pointlessly decoding
4409
                    // more slices than we'll be using.
4410
                    //
4411
                    // We have to be careful here.  Whenever we set the cram
4412
                    // range we need a corresponding seek in order to ensure
4413
                    // we can safely decode at that offset.  We use next_range
4414
                    // var to ensure this is always true; this is set on
4415
                    // end-of-range condition. It's never modified for BAM.
4416
0
                    if (iter->is_cram) {
4417
                        // Next offset.[uv] tuple, but it's already been
4418
                        // included in our cram range, so don't seek and don't
4419
                        // reset range so we can efficiently multi-thread.
4420
0
                        if (next_range || iter->curr_off >= iter->end) {
4421
0
                            if (iter->seek(fp, iter->curr_off, SEEK_SET) < 0) {
4422
0
                                hts_log_error("Seek at offset %" PRIu64
4423
0
                                        " failed.", iter->curr_off);
4424
0
                                return -2;
4425
0
                            }
4426
4427
                            // Find the genomic range matching this interval.
4428
0
                            int j;
4429
0
                            hts_reglist_t *rl = &iter->reg_list[iter->curr_reg];
4430
0
                            cram_range r = {
4431
0
                                    rl->tid,
4432
0
                                    rl->intervals[iter->curr_intv].beg,
4433
0
                                    rl->intervals[iter->curr_intv].end
4434
0
                            };
4435
4436
                            // Expand it up to cover neighbouring intervals.
4437
                            // Note we can only have a single chromosome in a
4438
                            // range, so if we detect our blocks span chromosomes
4439
                            // or we have a multi-ref mode slice, we just use
4440
                            // HTS_IDX_START refid instead.  This doesn't actually
4441
                            // seek (due to CRAM_OPT_RANGE_NOSEEK) and is simply
4442
                            // and indicator of decoding with no end limit.
4443
                            //
4444
                            // That isn't as efficient as it could be, but it's
4445
                            // no poorer than before and it works.
4446
0
                            int tid = r.refid;
4447
0
                            int64_t end = r.end;
4448
0
                            int64_t v = iter->off[iter->i].v;
4449
0
                            j = iter->i+1;
4450
0
                            while (j < iter->n_off) {
4451
0
                                if (iter->off[j].u > v)
4452
0
                                    break;
4453
4454
0
                                uint64_t max = iter->off[j].max;
4455
0
                                if ((max>>32) != tid) {
4456
0
                                    tid = HTS_IDX_START; // => no range limit
4457
0
                                } else {
4458
0
                                    if (end < rl->intervals[max & 0xffffffff].end)
4459
0
                                        end = rl->intervals[max & 0xffffffff].end;
4460
0
                                }
4461
0
                                if (v < iter->off[j].v)
4462
0
                                    v = iter->off[j].v;
4463
0
                                j++;
4464
0
                            }
4465
0
                            r.refid = tid;
4466
0
                            r.end = end;
4467
4468
                            // Remember maximum 'v' here so we don't do
4469
                            // unnecessary subsequent seeks for the next
4470
                            // regions.  We can't change curr_off, but
4471
                            // beg/end are used only by single region iterator so
4472
                            // we cache it there to avoid changing the struct.
4473
0
                            iter->end = v;
4474
4475
0
                            cram_set_option(fp, CRAM_OPT_RANGE_NOSEEK, &r);
4476
0
                            next_range = 0;
4477
0
                        }
4478
0
                    } else { // Not CRAM
4479
0
                        if (iter->seek(fp, iter->curr_off, SEEK_SET) < 0) {
4480
0
                            hts_log_error("Seek at offset %" PRIu64 " failed.",
4481
0
                                          iter->curr_off);
4482
0
                            return -2;
4483
0
                        }
4484
0
                    }
4485
0
                }
4486
0
            }
4487
0
        }
4488
4489
0
        ret = iter->readrec(fp, fd, r, &tid, &beg, &end);
4490
0
        if (ret < 0) {
4491
0
            if (iter->is_cram && cram_eof(fp)) {
4492
                // Skip to end of range
4493
                //
4494
                // We should never be adjusting curr_off manually unless
4495
                // we also can guarantee we'll be doing a seek after to
4496
                // a new location.  Otherwise we'll be reading wrong offset
4497
                // for the next container.
4498
                //
4499
                // We ensure this by adjusting our CRAM_OPT_RANGE
4500
                // accordingly above, but to double check we also
4501
                // set the skipped_block flag to enforce a seek also.
4502
0
                iter->curr_off = iter->off[iter->i].v;
4503
0
                next_range = 1;
4504
4505
                // Next region
4506
0
                if (++iter->curr_intv >= iter->reg_list[iter->curr_reg].count){
4507
0
                    if (++iter->curr_reg >= iter->n_reg)
4508
0
                        break;
4509
0
                    iter->curr_intv = 0;
4510
0
                    iter->curr_tid = iter->reg_list[iter->curr_reg].tid;
4511
0
                }
4512
0
                continue;
4513
0
            } else {
4514
0
                break;
4515
0
            }
4516
0
        }
4517
4518
0
        iter->curr_off = iter->tell(fp);
4519
4520
0
        if (tid != iter->curr_tid) {
4521
0
            hts_reglist_t key;
4522
0
            key.tid = tid;
4523
4524
0
            found_reg = (hts_reglist_t *)bsearch(&key, iter->reg_list,
4525
0
                                                 iter->n_reg,
4526
0
                                                 sizeof(hts_reglist_t),
4527
0
                                                 compare_regions);
4528
0
            if (!found_reg)
4529
0
                continue;
4530
4531
0
            iter->curr_reg = (found_reg - iter->reg_list);
4532
0
            iter->curr_tid = tid;
4533
0
            iter->curr_intv = 0;
4534
0
        }
4535
4536
0
        cr = iter->curr_reg;
4537
0
        ci = iter->curr_intv;
4538
4539
0
        for (i = ci; i < iter->reg_list[cr].count; i++) {
4540
0
            if (end > iter->reg_list[cr].intervals[i].beg &&
4541
0
                iter->reg_list[cr].intervals[i].end > beg) {
4542
0
                iter->curr_beg = beg;
4543
0
                iter->curr_end = end;
4544
0
                iter->curr_intv = i;
4545
4546
0
                return ret;
4547
0
            }
4548
4549
            // Check if the read starts beyond intervals[i].end
4550
            // If so, the interval is finished so move on to the next.
4551
0
            if (beg > iter->reg_list[cr].intervals[i].end)
4552
0
                iter->curr_intv = i + 1;
4553
4554
            // No need to keep searching if the read ends before intervals[i].beg
4555
0
            if (end < iter->reg_list[cr].intervals[i].beg)
4556
0
                break;
4557
0
        }
4558
0
    }
4559
0
    iter->finished = 1;
4560
4561
0
    return ret;
4562
0
}
4563
4564
/**********************
4565
 *** Retrieve index ***
4566
 **********************/
4567
// Local_fn and local_len will return a sub-region of 'fn'.
4568
// Eg http://elsewhere/dir/foo.bam.bai?a=b may return
4569
// foo.bam.bai via local_fn and local_len.
4570
//
4571
// Returns -1 if index couldn't be opened.
4572
//         -2 on other errors
4573
static int idx_test_and_fetch(const char *fn, const char **local_fn, int *local_len, int download)
4574
0
{
4575
0
    hFILE *remote_hfp = NULL;
4576
0
    hFILE *local_fp = NULL;
4577
0
    int save_errno;
4578
0
    htsFormat fmt;
4579
0
    kstring_t s = KS_INITIALIZE;
4580
0
    kstring_t tmps = KS_INITIALIZE;
4581
4582
0
    if (hisremote(fn)) {
4583
0
        const int buf_size = 1 * 1024 * 1024;
4584
0
        int l;
4585
0
        const char *p, *e;
4586
        // Ignore ?# params: eg any file.fmt?param=val, except for S3 URLs
4587
0
        e = fn + ((strncmp(fn, "s3://", 5) && strncmp(fn, "s3+http://", 10) && strncmp(fn, "s3+https://", 11)) ? strcspn(fn, "?#") : strcspn(fn, "?"));
4588
        // Find the previous slash from there.
4589
0
        p = e;
4590
0
        while (p > fn && *p != '/') p--;
4591
0
        if (*p == '/') p++;
4592
4593
        // Attempt to open local file first
4594
0
        kputsn(p, e-p, &s);
4595
0
        if (access(s.s, R_OK) == 0)
4596
0
        {
4597
0
            free(s.s);
4598
0
            *local_fn = p;
4599
0
            *local_len = e-p;
4600
0
            return 0;
4601
0
        }
4602
4603
        // Attempt to open remote file. Stay quiet on failure, it is OK to fail when trying first .csi then .bai or .tbi index.
4604
0
        if ((remote_hfp = hopen(fn, "r")) == 0) {
4605
0
            hts_log_info("Failed to open index file '%s'", fn);
4606
0
            free(s.s);
4607
0
            return -1;
4608
0
        }
4609
0
        if (hts_detect_format2(remote_hfp, fn, &fmt)) {
4610
0
            hts_log_error("Failed to detect format of index file '%s'", fn);
4611
0
            goto fail;
4612
0
        }
4613
0
        if (fmt.category != index_file || (fmt.format != bai &&  fmt.format != csi && fmt.format != tbi
4614
0
                && fmt.format != crai && fmt.format != fai_format)) {
4615
0
            hts_log_error("Format of index file '%s' is not supported", fn);
4616
0
            goto fail;
4617
0
        }
4618
4619
0
        if (download) {
4620
0
            if ((local_fp = hts_open_tmpfile(s.s, "wx", &tmps)) == NULL) {
4621
0
                hts_log_error("Failed to create file %s in the working directory", p);
4622
0
                goto fail;
4623
0
            }
4624
0
            hts_log_info("Downloading file %s to local directory", fn);
4625
0
            uint8_t *buf = (uint8_t*)calloc(buf_size, 1);
4626
0
            if (!buf) {
4627
0
                hts_log_error("%s", strerror(errno));
4628
0
                goto fail;
4629
0
            }
4630
0
            while ((l = hread(remote_hfp, buf, buf_size)) > 0) {
4631
0
                if (hwrite(local_fp, buf, l) != l) {
4632
0
                    hts_log_error("Failed to write data to %s : %s",
4633
0
                            fn, strerror(errno));
4634
0
                    free(buf);
4635
0
                    goto fail;
4636
0
                }
4637
0
            }
4638
0
            free(buf);
4639
0
            if (l < 0) {
4640
0
                hts_log_error("Error reading \"%s\"", fn);
4641
0
                goto fail;
4642
0
            }
4643
0
            if (hclose(local_fp) < 0) {
4644
0
                hts_log_error("Error closing %s : %s", fn, strerror(errno));
4645
0
                local_fp = NULL;
4646
0
                goto fail;
4647
0
            }
4648
0
            local_fp = NULL;
4649
0
            if (rename(tmps.s, s.s) < 0) {
4650
0
                hts_log_error("Error renaming %s : %s", tmps.s, strerror(errno));
4651
0
                goto fail;
4652
0
            }
4653
0
            ks_clear(&tmps);
4654
4655
0
            *local_fn = p;
4656
0
            *local_len = e-p;
4657
0
        } else {
4658
0
            *local_fn = fn;
4659
0
            *local_len = e-fn;
4660
0
        }
4661
4662
0
        if (hclose(remote_hfp) != 0) {
4663
0
            hts_log_error("Failed to close remote file %s", fn);
4664
0
        }
4665
4666
0
        free(tmps.s);
4667
0
        free(s.s);
4668
0
        return 0;
4669
0
    } else {
4670
0
        hFILE *local_hfp;
4671
0
        if ((local_hfp = hopen(fn, "r")) == 0) return -1;
4672
0
        hclose_abruptly(local_hfp);
4673
0
        *local_fn = fn;
4674
0
        *local_len = strlen(fn);
4675
0
        return 0;
4676
0
    }
4677
4678
0
 fail:
4679
0
    save_errno = errno;
4680
0
    if (remote_hfp) hclose_abruptly(remote_hfp);
4681
0
    if (local_fp) hclose_abruptly(local_fp);
4682
0
    if (tmps.l > 0) unlink(tmps.s);
4683
0
    free(tmps.s);
4684
0
    free(s.s);
4685
0
    errno = save_errno;
4686
0
    return -2;
4687
0
}
4688
4689
/*
4690
 * Check the existence of a local index file using part of the alignment
4691
 * file name.
4692
 *
4693
 * For a filename fn of fn.fmt (eg fn.bam or fn.cram) the order of checks is
4694
 * fn.fmt.csi,  fn.csi,
4695
 * fn.fmt.bai,  fn.bai  - if fmt is HTS_FMT_BAI
4696
 * fn.fmt.tbi,  fn.tbi  - if fmt is HTS_FMT_TBI
4697
 * fn.fmt.crai, fn.crai - if fmt is HTS_FMT_CRAI
4698
 * fn.fmt.fai           - if fmt is HTS_FMT_FAI
4699
 *   also .gzi if fmt is ".gz"
4700
 *
4701
 * @param fn    - pointer to the file name
4702
 * @param fmt   - one of the HTS_FMT index formats
4703
 * @param fnidx - pointer to the index file name placeholder
4704
 * @return        1 for success, 0 for failure
4705
 */
4706
772
int hts_idx_check_local(const char *fn, int fmt, char **fnidx) {
4707
772
    int i, l_fn, l_ext;
4708
772
    const char *fn_tmp = NULL;
4709
772
    char *fnidx_tmp;
4710
772
    const char *csi_ext = ".csi";
4711
772
    const char *bai_ext = ".bai";
4712
772
    const char *tbi_ext = ".tbi";
4713
772
    const char *crai_ext = ".crai";
4714
772
    const char *fai_ext = ".fai";
4715
772
    const char *gzi_ext = ".gzi";
4716
4717
772
    if (!fn)
4718
0
        return 0;
4719
4720
772
    if (hisremote(fn)) {
4721
0
        for (i = strlen(fn) - 1; i >= 0; --i)
4722
0
            if (fn[i] == '/') {
4723
0
                fn_tmp = (char *)&fn[i+1];
4724
0
                break;
4725
0
            }
4726
772
    } else {
4727
        // Borrowed from hopen_fd_fileuri()
4728
772
        if (strncmp(fn, "file://localhost/", 17) == 0) fn_tmp = fn + 16;
4729
772
        else if (strncmp(fn, "file:///", 8) == 0) fn_tmp = fn + 7;
4730
772
        else fn_tmp = fn;
4731
#if defined(_WIN32) || defined(__MSYS__)
4732
        // For cases like C:/foo
4733
        if (fn_tmp[0] == '/' && fn_tmp[1] && fn_tmp[2] == ':' && fn_tmp[3] == '/')
4734
            fn_tmp++;
4735
#endif
4736
772
    }
4737
4738
772
    if (!fn_tmp) return 0;
4739
772
    hts_log_info("Using alignment file '%s'", fn_tmp);
4740
772
    l_fn = strlen(fn_tmp); l_ext = 5;
4741
772
    fnidx_tmp = (char*)calloc(l_fn + l_ext + 1, 1);
4742
772
    if (!fnidx_tmp) return 0;
4743
4744
772
    struct stat sbuf;
4745
4746
    // Try alignment.bam.csi first
4747
772
    strcpy(fnidx_tmp, fn_tmp); strcpy(fnidx_tmp + l_fn, csi_ext);
4748
772
    if(stat(fnidx_tmp, &sbuf) == 0) {
4749
0
        *fnidx = fnidx_tmp;
4750
0
        return 1;
4751
772
    } else { // Then try alignment.csi
4752
3.08k
        for (i = l_fn - 1; i > 0; --i)
4753
2.31k
            if (fnidx_tmp[i] == '.') {
4754
0
                strcpy(fnidx_tmp + i, csi_ext);
4755
0
                if(stat(fnidx_tmp, &sbuf) == 0) {
4756
0
                    *fnidx = fnidx_tmp;
4757
0
                    return 1;
4758
0
                }
4759
0
                break;
4760
0
            }
4761
772
    }
4762
772
    if (fmt == HTS_FMT_BAI) {
4763
        // Next, try alignment.bam.bai
4764
0
        strcpy(fnidx_tmp, fn_tmp); strcpy(fnidx_tmp + l_fn, bai_ext);
4765
0
        if(stat(fnidx_tmp, &sbuf) == 0) {
4766
0
            *fnidx = fnidx_tmp;
4767
0
            return 1;
4768
0
        } else { // And finally, try alignment.bai
4769
0
            for (i = l_fn - 1; i > 0; --i)
4770
0
                if (fnidx_tmp[i] == '.') {
4771
0
                    strcpy(fnidx_tmp + i, bai_ext);
4772
0
                    if(stat(fnidx_tmp, &sbuf) == 0) {
4773
0
                        *fnidx = fnidx_tmp;
4774
0
                        return 1;
4775
0
                    }
4776
0
                    break;
4777
0
                }
4778
0
        }
4779
772
    } else if (fmt == HTS_FMT_TBI) { // Or .tbi
4780
772
        strcpy(fnidx_tmp, fn_tmp); strcpy(fnidx_tmp + l_fn, tbi_ext);
4781
772
        if(stat(fnidx_tmp, &sbuf) == 0) {
4782
0
            *fnidx = fnidx_tmp;
4783
0
            return 1;
4784
772
        } else {
4785
3.08k
            for (i = l_fn - 1; i > 0; --i)
4786
2.31k
                if (fnidx_tmp[i] == '.') {
4787
0
                    strcpy(fnidx_tmp + i, tbi_ext);
4788
0
                    if(stat(fnidx_tmp, &sbuf) == 0) {
4789
0
                        *fnidx = fnidx_tmp;
4790
0
                        return 1;
4791
0
                    }
4792
0
                    break;
4793
0
                }
4794
772
        }
4795
772
    } else if (fmt == HTS_FMT_CRAI) { // Or .crai
4796
0
        strcpy(fnidx_tmp, fn_tmp); strcpy(fnidx_tmp + l_fn, crai_ext);
4797
0
        if(stat(fnidx_tmp, &sbuf) == 0) {
4798
0
            *fnidx = fnidx_tmp;
4799
0
            return 1;
4800
0
        } else {
4801
0
            for (i = l_fn - 1; i > 0; --i)
4802
0
                if (fnidx_tmp[i] == '.') {
4803
0
                    strcpy(fnidx_tmp + i, crai_ext);
4804
0
                    if(stat(fnidx_tmp, &sbuf) == 0) {
4805
0
                        *fnidx = fnidx_tmp;
4806
0
                        return 1;
4807
0
                    }
4808
0
                    break;
4809
0
                }
4810
0
        }
4811
0
    } else if (fmt == HTS_FMT_FAI) { // Or .fai
4812
        // Check .gzi if we have a .gz file
4813
0
        strcpy(fnidx_tmp, fn_tmp);
4814
0
        int gzi_ok = 1;
4815
0
        if ((l_fn > 3 && strcmp(fn_tmp+l_fn-3, ".gz") == 0) ||
4816
0
            (l_fn > 5 && strcmp(fn_tmp+l_fn-5, ".bgzf") == 0)) {
4817
0
            strcpy(fnidx_tmp + l_fn, gzi_ext);
4818
0
            gzi_ok = stat(fnidx_tmp, &sbuf)==0;
4819
0
        }
4820
4821
        // Now check for .fai.  Occurs second as we're returning this
4822
        // in *fnidx irrespective of whether we did gzi check.
4823
0
        strcpy(fnidx_tmp + l_fn, fai_ext);
4824
0
        *fnidx = fnidx_tmp;
4825
0
        if (stat(fnidx_tmp, &sbuf) == 0)
4826
0
            return gzi_ok;
4827
0
        else
4828
0
            return 0;
4829
0
    }
4830
4831
772
    free(fnidx_tmp);
4832
772
    return 0;
4833
772
}
4834
4835
0
static char *idx_filename(const char *fn, const char *ext, int download) {
4836
0
    int ret, local_len;
4837
0
    char *fnidx;
4838
0
    const char *local_fn = NULL;
4839
0
    kstring_t buffer = KS_INITIALIZE;
4840
4841
    // First try : append `ext` to `fn`
4842
0
    if (!(fnidx = haddextension(&buffer, fn, 0, ext))) {
4843
0
        free(buffer.s);
4844
0
        return NULL;
4845
0
    }
4846
0
    if ((ret = idx_test_and_fetch(fnidx, &local_fn, &local_len, download)) == -1) {
4847
        // Second try : replace suffix of `fn` with `ext`
4848
0
        if (!(fnidx = haddextension(&buffer, fn, 1, ext))) {
4849
0
            free(buffer.s);
4850
0
            return NULL;
4851
0
        }
4852
0
        ret = idx_test_and_fetch(fnidx, &local_fn, &local_len, download);
4853
0
    }
4854
4855
0
    if (ret < 0) {
4856
0
        free(buffer.s);
4857
0
        return NULL;
4858
0
    }
4859
4860
0
    memmove(fnidx, local_fn, local_len);
4861
0
    fnidx[local_len] = 0;
4862
0
    return fnidx;
4863
0
}
4864
4865
char *hts_idx_getfn(const char *fn, const char *ext)
4866
0
{
4867
0
    return idx_filename(fn, ext, HTS_IDX_SAVE_REMOTE);
4868
0
}
4869
4870
char *hts_idx_locatefn(const char *fn, const char *ext)
4871
0
{
4872
0
    return idx_filename(fn, ext, 0);
4873
0
}
4874
4875
static hts_idx_t *idx_find_and_load(const char *fn, int fmt, int flags)
4876
772
{
4877
772
    char *fnidx = strstr(fn, HTS_IDX_DELIM);
4878
772
    hts_idx_t *idx;
4879
4880
772
    if ( fnidx ) {
4881
0
        char *fn2 = strdup(fn);
4882
0
        if (!fn2) {
4883
0
            hts_log_error("%s", strerror(errno));
4884
0
            return NULL;
4885
0
        }
4886
0
        fn2[fnidx - fn] = '\0';
4887
0
        fnidx += strlen(HTS_IDX_DELIM);
4888
0
        idx = hts_idx_load3(fn2, fnidx, fmt, flags);
4889
0
        free(fn2);
4890
0
        return idx;
4891
0
    }
4892
4893
772
    if (hts_idx_check_local(fn, fmt, &fnidx) == 0 && hisremote(fn)) {
4894
0
        if (flags & HTS_IDX_SAVE_REMOTE) {
4895
0
            fnidx = idx_filename(fn, ".csi", HTS_IDX_SAVE_REMOTE);
4896
0
            if (!fnidx) {
4897
0
                switch (fmt) {
4898
0
                case HTS_FMT_BAI: fnidx = idx_filename(fn, ".bai", HTS_IDX_SAVE_REMOTE); break;
4899
0
                case HTS_FMT_TBI: fnidx = idx_filename(fn, ".tbi", HTS_IDX_SAVE_REMOTE); break;
4900
0
                default: break;
4901
0
                }
4902
0
            }
4903
0
        } else {
4904
0
            fnidx = idx_filename(fn, ".csi", 0);
4905
0
            if (!fnidx) {
4906
0
                switch (fmt) {
4907
0
                case HTS_FMT_BAI: fnidx = idx_filename(fn, ".bai", 0); break;
4908
0
                case HTS_FMT_TBI: fnidx = idx_filename(fn, ".tbi", 0); break;
4909
0
                default: break;
4910
0
                }
4911
0
            }
4912
0
        }
4913
0
    }
4914
772
    if (!fnidx) {
4915
772
        if (!(flags & HTS_IDX_SILENT_FAIL))
4916
0
            hts_log_error("Could not retrieve index file for '%s'", fn);
4917
772
        return 0;
4918
772
    }
4919
4920
0
    if (flags & HTS_IDX_SAVE_REMOTE)
4921
0
        idx = hts_idx_load3(fn, fnidx, fmt, flags);
4922
0
    else
4923
0
        idx = idx_read(fnidx);
4924
0
    free(fnidx);
4925
0
    return idx;
4926
772
}
4927
4928
0
hts_idx_t *hts_idx_load(const char *fn, int fmt) {
4929
0
    return idx_find_and_load(fn, fmt, 1);
4930
0
}
4931
4932
hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx)
4933
0
{
4934
0
    return hts_idx_load3(fn, fnidx, 0, 0);
4935
0
}
4936
4937
hts_idx_t *hts_idx_load3(const char *fn, const char *fnidx, int fmt, int flags)
4938
772
{
4939
772
    const char *local_fn = NULL;
4940
772
    char *local_fnidx = NULL;
4941
772
    int local_len;
4942
772
    if (!fnidx)
4943
772
        return idx_find_and_load(fn, fmt, flags);
4944
4945
    // Check that the index file is up to date, the main file might have changed
4946
0
    struct stat stat_idx,stat_main;
4947
0
    int remote_fn = hisremote(fn), remote_fnidx = hisremote(fnidx);
4948
0
    if ( !remote_fn && !remote_fnidx
4949
0
         && !stat(fn, &stat_main) && !stat(fnidx, &stat_idx) )
4950
0
    {
4951
0
        if ( stat_idx.st_mtime < stat_main.st_mtime )
4952
0
            hts_log_warning("The index file is older than the data file: %s", fnidx);
4953
0
    }
4954
4955
0
    if (remote_fnidx && (flags & HTS_IDX_SAVE_REMOTE))
4956
0
    {
4957
0
        int ret = idx_test_and_fetch(fnidx, &local_fn, &local_len, 1);
4958
0
        if (ret == 0) {
4959
0
            local_fnidx = strdup(local_fn);
4960
0
            if (local_fnidx) {
4961
0
                local_fnidx[local_len] = '\0';
4962
0
                fnidx = local_fnidx;
4963
0
            }
4964
0
        }
4965
0
    }
4966
4967
0
    hts_idx_t *idx = idx_read(fnidx);
4968
0
    if (!idx && !(flags & HTS_IDX_SILENT_FAIL))
4969
0
        hts_log_error("Could not load local index file '%s'%s%s", fnidx,
4970
0
                      errno ? " : " : "", errno ? strerror(errno) : "");
4971
4972
4973
0
    free(local_fnidx);
4974
4975
0
    return idx;
4976
772
}
4977
4978
4979
4980
/**********************
4981
 ***     Memory     ***
4982
 **********************/
4983
4984
/* For use with hts_expand macros *only* */
4985
HTSLIB_EXPORT
4986
size_t hts_realloc_or_die(size_t n, size_t m, size_t m_sz, size_t size,
4987
781
                          int clear, void **ptr, const char *func) {
4988
    /* If new_m and size are both below this limit, multiplying them
4989
       together can't overflow */
4990
781
    const size_t safe = (size_t) 1 << (sizeof(size_t) * 4);
4991
781
    void *new_ptr;
4992
781
    size_t bytes, new_m;
4993
4994
781
    new_m = n;
4995
781
    kroundup_size_t(new_m);
4996
4997
781
    bytes = size * new_m;
4998
4999
    /* Check for overflow.  Both ensure that new_m will fit in m (we make the
5000
       pessimistic assumption that m is signed), and that bytes has not
5001
       wrapped around. */
5002
781
    if (new_m > (((size_t) 1 << (m_sz * 8 - 1)) - 1)
5003
781
        || ((size > safe || new_m > safe)
5004
781
            && bytes / new_m != size)) {
5005
0
        errno = ENOMEM;
5006
0
        goto die;
5007
0
    }
5008
5009
781
    new_ptr = realloc(*ptr, bytes);
5010
781
    if (new_ptr == NULL) goto die;
5011
5012
781
    if (clear) {
5013
0
        if (new_m > m) {
5014
0
            memset((char *) new_ptr + m * size, 0, (new_m - m) * size);
5015
0
        }
5016
0
    }
5017
5018
781
    *ptr = new_ptr;
5019
5020
781
    return new_m;
5021
5022
0
 die:
5023
0
    hts_log_error("%s", strerror(errno));
5024
0
    exit(1);
5025
781
}
5026
5027
/*
5028
 * Companion to hts_resize() macro that does the actual allocation.
5029
 *
5030
 * Somewhat complicated as hts_resize() needs to write the new allocated
5031
 * size back into *size_in_out, and the value pointed to may either be
5032
 * int32_t, uint32_t or size_t depending on which array is being resized.
5033
 * This is solved by making `size_in_out` a void pointer, getting the macro
5034
 * to pass in the size of the item pointed to (in `size_sz`) and then using
5035
 * an appropriate cast (based on the value of size_sz).  The function
5036
 * ensures that the maximum size will be storable in a signed type of
5037
 * the given size so storing to an int32_t should work correctly.
5038
 *
5039
 * Assumes that sizeof(uint32_t) and sizeof(int32_t) is 4,
5040
 * sizeof(uint64_t) and sizeof(int64_t) is 8 and sizeof(size_t) is
5041
 * either 4 or 8.  It also assumes casting from unsigned to signed will
5042
 * work as long as the top bit isn't set.
5043
 */
5044
5045
int hts_resize_array_(size_t item_size, size_t num, size_t size_sz,
5046
                      void *size_in_out, void **ptr_in_out, int flags,
5047
4.54k
                      const char *func) {
5048
    /* If new_size and item_size are both below this limit, multiplying them
5049
       together can't overflow */
5050
4.54k
    const size_t safe = (size_t) 1 << (sizeof(size_t) * 4);
5051
4.54k
    void *new_ptr;
5052
4.54k
    size_t bytes, new_size;
5053
5054
4.54k
    new_size = num;
5055
4.54k
    kroundup_size_t(new_size);
5056
4.54k
    bytes = item_size * new_size;
5057
5058
    /* Check for overflow.  Both ensure that alloc will fit in alloc_in_out (we
5059
       make the pessimistic assumption that *alloc_in_out is signed), and that
5060
       bytes has not wrapped around. */
5061
5062
4.54k
    if ((new_size > (((size_t) 1 << (size_sz * 8 - 1)) - 1))
5063
4.54k
        || (((item_size > safe) || (new_size > safe))
5064
4.54k
            && bytes / new_size != item_size)) {
5065
0
        hts_log(HTS_LOG_ERROR, func, "Memory allocation too large");
5066
0
        errno = ENOMEM;
5067
0
        return -1;
5068
0
    }
5069
5070
4.54k
    new_ptr = realloc(*ptr_in_out, bytes);
5071
4.54k
    if (new_ptr == NULL) {
5072
0
        int save_errno = errno;
5073
0
        hts_log(HTS_LOG_ERROR, func, "%s", strerror(errno));
5074
0
        errno = save_errno;
5075
0
        return -1;
5076
0
    }
5077
5078
4.54k
    if (flags & HTS_RESIZE_CLEAR) {
5079
4.54k
        size_t old_size;
5080
4.54k
        switch (size_sz) {
5081
4.54k
        case 4: old_size = *((uint32_t *) size_in_out); break;
5082
0
        case 8: old_size = *((uint64_t *) size_in_out); break;
5083
0
        default: abort();
5084
4.54k
        }
5085
4.54k
        if (new_size > old_size) {
5086
4.54k
            memset((char *) new_ptr + old_size * item_size, 0,
5087
4.54k
                   (new_size - old_size) * item_size);
5088
4.54k
        }
5089
4.54k
    }
5090
5091
4.54k
    switch (size_sz) {
5092
4.54k
    case 4: *((uint32_t *) size_in_out) = new_size; break;
5093
0
    case 8: *((uint64_t *) size_in_out) = new_size; break;
5094
0
    default: abort();
5095
4.54k
    }
5096
5097
4.54k
    *ptr_in_out = new_ptr;
5098
4.54k
    return 0;
5099
4.54k
}
5100
5101
void hts_lib_shutdown(void)
5102
0
{
5103
0
    hfile_shutdown(1);
5104
0
}
5105
5106
0
void hts_free(void *ptr) {
5107
0
    free(ptr);
5108
0
}
5109
5110
void hts_set_log_level(enum htsLogLevel level)
5111
0
{
5112
0
    hts_verbose = level;
5113
0
}
5114
5115
enum htsLogLevel hts_get_log_level(void)
5116
0
{
5117
0
    return hts_verbose;
5118
0
}
5119
5120
static char get_severity_tag(enum htsLogLevel severity)
5121
498k
{
5122
498k
    switch (severity) {
5123
13.7k
    case HTS_LOG_ERROR:
5124
13.7k
        return 'E';
5125
485k
    case HTS_LOG_WARNING:
5126
485k
        return 'W';
5127
0
    case HTS_LOG_INFO:
5128
0
        return 'I';
5129
0
    case HTS_LOG_DEBUG:
5130
0
        return 'D';
5131
0
    case HTS_LOG_TRACE:
5132
0
        return 'T';
5133
0
    default:
5134
0
        break;
5135
498k
    }
5136
5137
0
    return '*';
5138
498k
}
5139
5140
void hts_log(enum htsLogLevel severity, const char *context, const char *format, ...)
5141
637k
{
5142
637k
    int save_errno = errno;
5143
637k
    if (severity <= hts_verbose) {
5144
498k
        va_list argptr;
5145
5146
498k
        fprintf(stderr, "[%c::%s] ", get_severity_tag(severity), context);
5147
5148
498k
        va_start(argptr, format);
5149
498k
        vfprintf(stderr, format, argptr);
5150
498k
        va_end(argptr);
5151
5152
498k
        fprintf(stderr, "\n");
5153
498k
    }
5154
637k
    errno = save_errno;
5155
637k
}