Coverage Report

Created: 2024-07-23 06:11

/src/htslib/hts.c
Line
Count
Source (jump to first uncovered line)
1
/*  hts.c -- format-neutral I/O, indexing, and iterator API functions.
2
3
    Copyright (C) 2008, 2009, 2012-2024 Genome Research Ltd.
4
    Copyright (C) 2012, 2013 Broad Institute.
5
6
    Author: Heng Li <lh3@sanger.ac.uk>
7
8
Permission is hereby granted, free of charge, to any person obtaining a copy
9
of this software and associated documentation files (the "Software"), to deal
10
in the Software without restriction, including without limitation the rights
11
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
copies of the Software, and to permit persons to whom the Software is
13
furnished to do so, subject to the following conditions:
14
15
The above copyright notice and this permission notice shall be included in
16
all copies or substantial portions of the Software.
17
18
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24
DEALINGS IN THE SOFTWARE.  */
25
26
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
27
#include <config.h>
28
29
#include <zlib.h>
30
#include <stdio.h>
31
#include <string.h>
32
#include <strings.h>
33
#include <stdlib.h>
34
#include <unistd.h>
35
#include <inttypes.h>
36
#include <limits.h>
37
#include <stdint.h>
38
#include <fcntl.h>
39
#include <errno.h>
40
#include <time.h>
41
#include <sys/stat.h>
42
#include <assert.h>
43
44
#ifdef HAVE_LIBLZMA
45
#ifdef HAVE_LZMA_H
46
#include <lzma.h>
47
#else
48
#include "os/lzma_stub.h"
49
#endif
50
#endif
51
52
#include "htslib/hts.h"
53
#include "htslib/bgzf.h"
54
#include "cram/cram.h"
55
#include "htslib/hfile.h"
56
#include "htslib/hts_endian.h"
57
#include "version.h"
58
#include "config_vars.h"
59
#include "hts_internal.h"
60
#include "hfile_internal.h"
61
#include "sam_internal.h"
62
#include "htslib/hts_expr.h"
63
#include "htslib/hts_os.h" // drand48
64
65
#include "htslib/khash.h"
66
#include "htslib/kseq.h"
67
#include "htslib/ksort.h"
68
#include "htslib/tbx.h"
69
#if defined(HAVE_EXTERNAL_LIBHTSCODECS)
70
#include <htscodecs/htscodecs.h>
71
#else
72
#include "htscodecs/htscodecs/htscodecs.h"
73
#endif
74
75
#ifndef EFTYPE
76
126
#define EFTYPE ENOEXEC
77
#endif
78
79
180k
KHASH_INIT2(s2i,, kh_cstr_t, int64_t, 1, kh_str_hash_func, kh_str_hash_equal)
80
81
HTSLIB_EXPORT
82
int hts_verbose = HTS_LOG_WARNING;
83
84
const char *hts_version(void)
85
2
{
86
2
    return HTS_VERSION_TEXT;
87
2
}
88
89
0
unsigned int hts_features(void) {
90
0
    unsigned int feat = HTS_FEATURE_HTSCODECS; // Always present
91
92
0
#ifdef PACKAGE_URL
93
0
    feat |= HTS_FEATURE_CONFIGURE;
94
0
#endif
95
96
#ifdef ENABLE_PLUGINS
97
    feat |= HTS_FEATURE_PLUGINS;
98
#endif
99
100
0
#ifdef HAVE_LIBCURL
101
0
    feat |= HTS_FEATURE_LIBCURL;
102
0
#endif
103
104
0
#ifdef ENABLE_S3
105
0
    feat |= HTS_FEATURE_S3;
106
0
#endif
107
108
0
#ifdef ENABLE_GCS
109
0
    feat |= HTS_FEATURE_GCS;
110
0
#endif
111
112
#ifdef HAVE_LIBDEFLATE
113
    feat |= HTS_FEATURE_LIBDEFLATE;
114
#endif
115
116
0
#ifdef HAVE_LIBLZMA
117
0
    feat |= HTS_FEATURE_LZMA;
118
0
#endif
119
120
0
#ifdef HAVE_LIBBZ2
121
0
    feat |= HTS_FEATURE_BZIP2;
122
0
#endif
123
124
0
    return feat;
125
0
}
126
127
0
const char *hts_test_feature(unsigned int id) {
128
0
    unsigned int feat = hts_features();
129
130
0
    switch (id) {
131
0
    case HTS_FEATURE_CONFIGURE:
132
0
        return feat & HTS_FEATURE_CONFIGURE ? "yes" : NULL;
133
0
    case HTS_FEATURE_PLUGINS:
134
0
        return feat & HTS_FEATURE_PLUGINS ? "yes" : NULL;
135
0
    case HTS_FEATURE_LIBCURL:
136
0
        return feat & HTS_FEATURE_LIBCURL ? "yes" : NULL;
137
0
    case HTS_FEATURE_S3:
138
0
        return feat & HTS_FEATURE_S3 ? "yes" : NULL;
139
0
    case HTS_FEATURE_GCS:
140
0
        return feat & HTS_FEATURE_GCS ? "yes" : NULL;
141
0
    case HTS_FEATURE_LIBDEFLATE:
142
0
        return feat & HTS_FEATURE_LIBDEFLATE ? "yes" : NULL;
143
0
    case HTS_FEATURE_BZIP2:
144
0
        return feat & HTS_FEATURE_BZIP2 ? "yes" : NULL;
145
0
    case HTS_FEATURE_LZMA:
146
0
        return feat & HTS_FEATURE_LZMA ? "yes" : NULL;
147
148
0
    case HTS_FEATURE_HTSCODECS:
149
0
        return htscodecs_version();
150
151
0
    case HTS_FEATURE_CC:
152
0
        return HTS_CC;
153
0
    case HTS_FEATURE_CFLAGS:
154
0
        return HTS_CFLAGS;
155
0
    case HTS_FEATURE_LDFLAGS:
156
0
        return HTS_LDFLAGS;
157
0
    case HTS_FEATURE_CPPFLAGS:
158
0
        return HTS_CPPFLAGS;
159
160
0
    default:
161
0
        fprintf(stderr, "Unknown feature code: %u\n", id);
162
0
    }
163
164
0
    return NULL;
165
0
}
166
167
// Note this implementation also means we can just "strings" the library
168
// to find the configuration parameters.
169
0
const char *hts_feature_string(void) {
170
0
    static char config[1200];
171
0
    const char *flags=
172
173
0
#ifdef PACKAGE_URL
174
0
    "build=configure "
175
#else
176
    "build=Makefile "
177
#endif
178
179
0
#ifdef HAVE_LIBCURL
180
0
    "libcurl=yes "
181
#else
182
    "libcurl=no "
183
#endif
184
185
0
#ifdef ENABLE_S3
186
0
    "S3=yes "
187
#else
188
    "S3=no "
189
#endif
190
191
0
#ifdef ENABLE_GCS
192
0
    "GCS=yes "
193
#else
194
    "GCS=no "
195
#endif
196
197
#ifdef HAVE_LIBDEFLATE
198
    "libdeflate=yes "
199
#else
200
0
    "libdeflate=no "
201
0
#endif
202
203
0
#ifdef HAVE_LIBLZMA
204
0
    "lzma=yes "
205
#else
206
    "lzma=no "
207
#endif
208
209
0
#ifdef HAVE_LIBBZ2
210
0
    "bzip2=yes "
211
#else
212
    "bzip2=no "
213
#endif
214
215
// "plugins=" must stay at the end as it is followed by "plugin-path="
216
#ifdef ENABLE_PLUGINS
217
    "plugins=yes";
218
#else
219
0
    "plugins=no";
220
0
#endif
221
222
#ifdef ENABLE_PLUGINS
223
    snprintf(config, sizeof(config),
224
             "%s plugin-path=%.1000s htscodecs=%.40s",
225
             flags, hts_plugin_path(), htscodecs_version());
226
#else
227
0
    snprintf(config, sizeof(config),
228
0
             "%s htscodecs=%.40s",
229
0
             flags, htscodecs_version());
230
0
#endif
231
0
    return config;
232
0
}
233
234
235
HTSLIB_EXPORT
236
const unsigned char seq_nt16_table[256] = {
237
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
238
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
239
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
240
     1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15,
241
    15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
242
    15,15, 5, 6,  8,15, 7, 9, 15,10,15,15, 15,15,15,15,
243
    15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
244
    15,15, 5, 6,  8,15, 7, 9, 15,10,15,15, 15,15,15,15,
245
246
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
247
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
248
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
249
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
250
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
251
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
252
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
253
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
254
};
255
256
HTSLIB_EXPORT
257
const char seq_nt16_str[] = "=ACMGRSVTWYHKDBN";
258
259
HTSLIB_EXPORT
260
const int seq_nt16_int[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
261
262
/**********************
263
 *** Basic file I/O ***
264
 **********************/
265
266
static enum htsFormatCategory format_category(enum htsExactFormat fmt)
267
47.1k
{
268
47.1k
    switch (fmt) {
269
0
    case bam:
270
0
    case sam:
271
12.5k
    case cram:
272
12.5k
    case fastq_format:
273
12.5k
    case fasta_format:
274
12.5k
        return sequence_data;
275
276
0
    case vcf:
277
0
    case bcf:
278
0
        return variant_data;
279
280
0
    case bai:
281
0
    case crai:
282
0
    case csi:
283
0
    case fai_format:
284
0
    case fqi_format:
285
0
    case gzi:
286
0
    case tbi:
287
0
        return index_file;
288
289
0
    case bed:
290
0
    case d4_format:
291
0
        return region_list;
292
293
0
    case htsget:
294
0
    case hts_crypt4gh_format:
295
0
        return unknown_category;
296
297
0
    case unknown_format:
298
17.3k
    case binary_format:
299
34.6k
    case text_format:
300
34.6k
    case empty_format:
301
34.6k
    case format_maximum:
302
34.6k
        break;
303
47.1k
    }
304
305
34.6k
    return unknown_category;
306
47.1k
}
307
308
// Decompress several hundred bytes by peeking at the file, which must be
309
// positioned at the start of a GZIP block.
310
static ssize_t
311
decompress_peek_gz(hFILE *fp, unsigned char *dest, size_t destsize)
312
3.18k
{
313
3.18k
    unsigned char buffer[2048];
314
3.18k
    z_stream zs;
315
3.18k
    ssize_t npeek = hpeek(fp, buffer, sizeof buffer);
316
317
3.18k
    if (npeek < 0) return -1;
318
319
3.18k
    zs.zalloc = NULL;
320
3.18k
    zs.zfree = NULL;
321
3.18k
    zs.next_in = buffer;
322
3.18k
    zs.avail_in = npeek;
323
3.18k
    zs.next_out = dest;
324
3.18k
    zs.avail_out = destsize;
325
3.18k
    if (inflateInit2(&zs, 31) != Z_OK) return -1;
326
327
3.18k
    int ret;
328
3.18k
    const unsigned char *last_in = buffer;
329
6.23k
    while (zs.avail_out > 0) {
330
3.51k
        ret = inflate(&zs, Z_SYNC_FLUSH);
331
3.51k
        if (ret == Z_STREAM_END) {
332
105
            if (last_in == zs.next_in)
333
0
                break; // Paranoia to avoid potential looping. Shouldn't happen
334
105
            else
335
105
                last_in = zs.next_in;
336
105
            inflateReset(&zs);
337
3.41k
        } else if (ret != Z_OK) {
338
            // eg Z_BUF_ERROR due to avail_in/out becoming zero
339
472
            break;
340
472
        }
341
3.51k
    }
342
343
    // NB: zs.total_out is changed by inflateReset, so use pointer diff instead
344
3.18k
    destsize = zs.next_out - dest;
345
3.18k
    inflateEnd(&zs);
346
347
3.18k
    return destsize;
348
3.18k
}
349
350
#ifdef HAVE_LIBLZMA
351
// Similarly decompress a portion by peeking at the file, which must be
352
// positioned at the start of the file.
353
static ssize_t
354
decompress_peek_xz(hFILE *fp, unsigned char *dest, size_t destsize)
355
2
{
356
2
    unsigned char buffer[2048];
357
2
    ssize_t npeek = hpeek(fp, buffer, sizeof buffer);
358
2
    if (npeek < 0) return -1;
359
360
2
    lzma_stream ls = LZMA_STREAM_INIT;
361
2
    if (lzma_stream_decoder(&ls, lzma_easy_decoder_memusage(9), 0) != LZMA_OK)
362
0
        return -1;
363
364
2
    ls.next_in = buffer;
365
2
    ls.avail_in = npeek;
366
2
    ls.next_out = dest;
367
2
    ls.avail_out = destsize;
368
369
2
    int r = lzma_code(&ls, LZMA_RUN);
370
2
    if (! (r == LZMA_OK || r == LZMA_STREAM_END)) {
371
1
        lzma_end(&ls);
372
1
        return -1;
373
1
    }
374
375
1
    destsize = ls.total_out;
376
1
    lzma_end(&ls);
377
378
1
    return destsize;
379
2
}
380
#endif
381
382
// Parse "x.y" text, taking care because the string is not NUL-terminated
383
// and filling in major/minor only when the digits are followed by a delimiter,
384
// so we don't misread "1.10" as "1.1" due to reaching the end of the buffer.
385
static void
386
parse_version(htsFormat *fmt, const unsigned char *u, const unsigned char *ulim)
387
301
{
388
301
    const char *s    = (const char *) u;
389
301
    const char *slim = (const char *) ulim;
390
301
    short v;
391
392
301
    fmt->version.major = fmt->version.minor = -1;
393
394
624
    for (v = 0; s < slim && isdigit_c(*s); s++)
395
323
        v = 10 * v + *s - '0';
396
397
301
    if (s < slim) {
398
281
        fmt->version.major = v;
399
281
        if (*s == '.') {
400
44
            s++;
401
330
            for (v = 0; s < slim && isdigit_c(*s); s++)
402
286
                v = 10 * v + *s - '0';
403
44
            if (s < slim)
404
24
                fmt->version.minor = v;
405
44
        }
406
237
        else
407
237
            fmt->version.minor = 0;
408
281
    }
409
301
}
410
411
static int
412
cmp_nonblank(const char *key, const unsigned char *u, const unsigned char *ulim)
413
4.78k
{
414
4.78k
    const unsigned char *ukey = (const unsigned char *) key;
415
416
5.30k
    while (*ukey)
417
5.30k
        if (u >= ulim) return +1;
418
5.28k
        else if (isspace_c(*u)) u++;
419
4.77k
        else if (*u != *ukey) return (*ukey < *u)? -1 : +1;
420
8
        else u++, ukey++;
421
422
0
    return 0;
423
4.78k
}
424
425
static int is_text_only(const unsigned char *u, const unsigned char *ulim)
426
4.77k
{
427
89.7k
    for (; u < ulim; u++)
428
85.0k
        if (! (*u >= ' ' || *u == '\t' || *u == '\r' || *u == '\n'))
429
137
            return 0;
430
431
4.63k
    return 1;
432
4.77k
}
433
434
static inline int
435
alternate_zeros(const unsigned char *u, const unsigned char *ulim)
436
0
{
437
0
    for (; u < ulim; u += 2)
438
0
        if (*u != '\0') return 0;
439
0
    return 1;
440
0
}
441
442
static int is_utf16_text(const unsigned char *u, const unsigned char *ulim)
443
0
{
444
0
    if (ulim - u >= 6 &&
445
0
        ((u[0] == 0xfe && u[1] == 0xff && alternate_zeros(u+2, ulim)) ||
446
0
         (u[0] == 0xff && u[1] == 0xfe && alternate_zeros(u+3, ulim))))
447
0
        return 2;
448
0
    else if (ulim - u >= 8 &&
449
0
             (alternate_zeros(u, ulim) || alternate_zeros(u+1, ulim)))
450
0
        return 1;
451
0
    else
452
0
        return 0;
453
0
}
454
455
static int is_fastaq(const unsigned char *u, const unsigned char *ulim)
456
4.41k
{
457
4.41k
    const unsigned char *eol = memchr(u, '\n', ulim - u);
458
459
    // Check that the first line is entirely textual
460
4.41k
    if (! is_text_only(u, eol? eol : ulim)) return 0;
461
462
    // If the first line is very long, consider the file to indeed be FASTA/Q
463
4.39k
    if (eol == NULL) return 1;
464
465
3.80k
    u = eol+1; // Now points to the first character of the second line
466
467
    // Scan over all base-encoding letters (including 'N' but not SEQ's '=')
468
6.20k
    while (u < ulim && (seq_nt16_table[*u] != 15 || toupper(*u) == 'N')) {
469
2.39k
        if (*u == '=') return 0;
470
2.39k
        u++;
471
2.39k
    }
472
473
3.80k
    return (u == ulim || *u == '\r' || *u == '\n')? 1 : 0;
474
3.80k
}
475
476
// Parse tab-delimited text, filling in a string of column types and returning
477
// the number of columns spotted (within [u,ulim), and up to column_len) or -1
478
// if non-printable characters were seen.  Column types:
479
//     i: integer, s: strand sign, C: CIGAR, O: SAM optional field, Z: anything
480
static int
481
parse_tabbed_text(char *columns, int column_len,
482
                  const unsigned char *u, const unsigned char *ulim,
483
                  int *complete)
484
402
{
485
402
    const char *str  = (const char *) u;
486
402
    const char *slim = (const char *) ulim;
487
402
    const char *s;
488
402
    int ncolumns = 0;
489
490
402
    enum { digit = 1, leading_sign = 2, cigar_operator = 4, other = 8 };
491
402
    unsigned seen = 0;
492
402
    *complete = 0;
493
494
6.82k
    for (s = str; s < slim; s++)
495
6.63k
        if (*s >= ' ') {
496
5.40k
            if (isdigit_c(*s))
497
1.20k
                seen |= digit;
498
4.20k
            else if ((*s == '+' || *s == '-') && s == str)
499
112
                seen |= leading_sign;
500
4.09k
            else if (strchr(BAM_CIGAR_STR, *s) && s > str && isdigit_c(s[-1]))
501
238
                seen |= cigar_operator;
502
3.85k
            else
503
3.85k
                seen |= other;
504
5.40k
        }
505
1.22k
        else if (*s == '\t' || *s == '\r' || *s == '\n') {
506
1.08k
            size_t len = s - str;
507
1.08k
            char type;
508
509
1.08k
            if (seen == digit || seen == (leading_sign|digit)) type = 'i';
510
732
            else if (seen == (digit|cigar_operator)) type = 'C';
511
694
            else if (len == 1)
512
259
                switch (str[0]) {
513
127
                case '*': type = 'C'; break;
514
44
                case '+': case '-': case '.': type = 's'; break;
515
88
                default: type = 'Z'; break;
516
259
                }
517
435
            else if (len >= 5 && str[2] == ':' && str[4] == ':') type = 'O';
518
387
            else type = 'Z';
519
520
1.08k
            columns[ncolumns++] = type;
521
1.08k
            if (*s != '\t' || ncolumns >= column_len - 1) {
522
63
                *complete = 1; // finished the line or more columns than needed
523
63
                break;
524
63
            }
525
526
1.01k
            str = s + 1;
527
1.01k
            seen = 0;
528
1.01k
        }
529
141
        else return -1;
530
531
261
    columns[ncolumns] = '\0';
532
261
    return ncolumns;
533
402
}
534
535
// Match COLUMNS as a prefix against PATTERN (so COLUMNS may run out first).
536
// Returns len(COLUMNS) (modulo '+'), or 0 if there is a mismatched entry.
537
static int colmatch(const char *columns, const char *pattern)
538
331
{
539
331
    int i;
540
1.19k
    for (i = 0; columns[i] != '\0'; i++) {
541
985
        if (pattern[i] == '+') return i;
542
979
        if (! (columns[i] == pattern[i] || pattern[i] == 'Z')) return 0;
543
979
    }
544
545
212
    return i;
546
331
}
547
548
int hts_detect_format(hFILE *hfile, htsFormat *fmt)
549
0
{
550
0
    return hts_detect_format2(hfile, NULL, fmt);
551
0
}
552
553
int hts_detect_format2(hFILE *hfile, const char *fname, htsFormat *fmt)
554
67.1k
{
555
67.1k
    char extension[HTS_MAX_EXT_LEN], columns[24];
556
67.1k
    unsigned char s[1024];
557
67.1k
    int complete = 0;
558
67.1k
    ssize_t len = hpeek(hfile, s, 18);
559
67.1k
    if (len < 0) return -1;
560
561
67.1k
    fmt->category = unknown_category;
562
67.1k
    fmt->format = unknown_format;
563
67.1k
    fmt->version.major = fmt->version.minor = -1;
564
67.1k
    fmt->compression = no_compression;
565
67.1k
    fmt->compression_level = -1;
566
67.1k
    fmt->specific = NULL;
567
568
67.1k
    if (len >= 2 && s[0] == 0x1f && s[1] == 0x8b) {
569
        // The stream is either gzip-compressed or BGZF-compressed.
570
        // Determine which, and decompress the first few records or lines.
571
3.18k
        fmt->compression = gzip;
572
3.18k
        if (len >= 18 && (s[3] & 4)) {
573
1.51k
            if (memcmp(&s[12], "BC\2\0", 4) == 0)
574
254
                fmt->compression = bgzf;
575
1.25k
            else if (memcmp(&s[12], "RAZF", 4) == 0)
576
94
                fmt->compression = razf_compression;
577
1.51k
        }
578
3.18k
        if (len >= 9 && s[2] == 8)
579
3.16k
            fmt->compression_level = (s[8] == 2)? 9 : (s[8] == 4)? 1 : -1;
580
581
3.18k
        len = decompress_peek_gz(hfile, s, sizeof s);
582
3.18k
    }
583
63.9k
    else if (len >= 10 && memcmp(s, "BZh", 3) == 0 &&
584
63.9k
             (memcmp(&s[4], "\x31\x41\x59\x26\x53\x59", 6) == 0 ||
585
10
              memcmp(&s[4], "\x17\x72\x45\x38\x50\x90", 6) == 0)) {
586
3
        fmt->compression = bzip2_compression;
587
3
        fmt->compression_level = s[3] - '0';
588
        // Decompressing via libbz2 produces no output until it has a whole
589
        // block (of size 100Kb x level), which is too large for peeking.
590
        // So unfortunately we can recognise bzip2 but not the contents,
591
        // except that \x1772... magic indicates the stream is empty.
592
3
        if (s[4] == '\x31') return 0;
593
2
        else len = 0;
594
3
    }
595
63.9k
    else if (len >= 6 && memcmp(s, "\xfd""7zXZ\0", 6) == 0) {
596
2
        fmt->compression = xz_compression;
597
2
#ifdef HAVE_LIBLZMA
598
2
        len = decompress_peek_xz(hfile, s, sizeof s);
599
#else
600
        // Without liblzma, we can't recognise the decompressed contents.
601
        return 0;
602
#endif
603
2
    }
604
63.9k
    else if (len >= 4 && memcmp(s, "\x28\xb5\x2f\xfd", 4) == 0) {
605
2
        fmt->compression = zstd_compression;
606
2
        return 0;
607
2
    }
608
63.9k
    else {
609
63.9k
        len = hpeek(hfile, s, sizeof s);
610
63.9k
    }
611
67.1k
    if (len < 0) return -1;
612
613
67.1k
    if (len == 0) {
614
101
        fmt->format = empty_format;
615
101
        return 0;
616
101
    }
617
618
    // We avoid using filename extensions wherever possible (as filenames are
619
    // not always available), but in a few cases they must be considered:
620
    //  - FASTA/Q indexes are simply tab-separated text; files that match these
621
    //    patterns but not the fai/fqi extension are usually generic BED files
622
    //  - GZI indexes have no magic numbers so can only be detected by filename
623
67.0k
    if (fname && strcmp(fname, "-") != 0) {
624
67.0k
        char *s;
625
67.0k
        if (find_file_extension(fname, extension) < 0) extension[0] = '\0';
626
67.0k
        for (s = extension; *s; s++) *s = tolower_c(*s);
627
67.0k
    }
628
0
    else extension[0] = '\0';
629
630
67.0k
    if (len >= 6 && memcmp(s,"CRAM",4) == 0 && s[4]>=1 && s[4]<=7 && s[5]<=7) {
631
13.9k
        fmt->category = sequence_data;
632
13.9k
        fmt->format = cram;
633
13.9k
        fmt->version.major = s[4], fmt->version.minor = s[5];
634
13.9k
        fmt->compression = custom;
635
13.9k
        return 0;
636
13.9k
    }
637
53.0k
    else if (len >= 4 && s[3] <= '\4') {
638
5.35k
        if (memcmp(s, "BAM\1", 4) == 0) {
639
3.62k
            fmt->category = sequence_data;
640
3.62k
            fmt->format = bam;
641
            // TODO Decompress enough to pick version from @HD-VN header
642
3.62k
            fmt->version.major = 1, fmt->version.minor = -1;
643
3.62k
            return 0;
644
3.62k
        }
645
1.73k
        else if (memcmp(s, "BAI\1", 4) == 0) {
646
1
            fmt->category = index_file;
647
1
            fmt->format = bai;
648
1
            fmt->version.major = -1, fmt->version.minor = -1;
649
1
            return 0;
650
1
        }
651
1.72k
        else if (memcmp(s, "BCF\4", 4) == 0) {
652
12
            fmt->category = variant_data;
653
12
            fmt->format = bcf;
654
12
            fmt->version.major = 1, fmt->version.minor = -1;
655
12
            return 0;
656
12
        }
657
1.71k
        else if (memcmp(s, "BCF\2", 4) == 0) {
658
1.69k
            fmt->category = variant_data;
659
1.69k
            fmt->format = bcf;
660
1.69k
            fmt->version.major = s[3];
661
1.69k
            fmt->version.minor = (len >= 5 && s[4] <= 2)? s[4] : 0;
662
1.69k
            return 0;
663
1.69k
        }
664
19
        else if (memcmp(s, "CSI\1", 4) == 0) {
665
1
            fmt->category = index_file;
666
1
            fmt->format = csi;
667
1
            fmt->version.major = 1, fmt->version.minor = -1;
668
1
            return 0;
669
1
        }
670
18
        else if (memcmp(s, "TBI\1", 4) == 0) {
671
1
            fmt->category = index_file;
672
1
            fmt->format = tbi;
673
1
            return 0;
674
1
        }
675
        // GZI indexes have no magic numbers, so must be recognised solely by
676
        // filename extension.
677
17
        else if (strcmp(extension, "gzi") == 0) {
678
0
            fmt->category = index_file;
679
0
            fmt->format = gzi;
680
0
            return 0;
681
0
        }
682
5.35k
    }
683
47.6k
    else if (len >= 16 && memcmp(s, "##fileformat=VCF", 16) == 0) {
684
12.6k
        fmt->category = variant_data;
685
12.6k
        fmt->format = vcf;
686
12.6k
        if (len >= 21 && s[16] == 'v')
687
33
            parse_version(fmt, &s[17], &s[len]);
688
12.6k
        return 0;
689
12.6k
    }
690
35.0k
    else if (len >= 4 && s[0] == '@' &&
691
35.0k
             (memcmp(s, "@HD\t", 4) == 0 || memcmp(s, "@SQ\t", 4) == 0 ||
692
30.7k
              memcmp(s, "@RG\t", 4) == 0 || memcmp(s, "@PG\t", 4) == 0 ||
693
30.7k
              memcmp(s, "@CO\t", 4) == 0)) {
694
30.2k
        fmt->category = sequence_data;
695
30.2k
        fmt->format = sam;
696
        // @HD-VN is not guaranteed to be the first tag, but then @HD is
697
        // not guaranteed to be present at all...
698
30.2k
        if (len >= 9 && memcmp(s, "@HD\tVN:", 7) == 0)
699
268
            parse_version(fmt, &s[7], &s[len]);
700
29.9k
        else
701
29.9k
            fmt->version.major = 1, fmt->version.minor = -1;
702
30.2k
        return 0;
703
30.2k
    }
704
4.78k
    else if (len >= 8 && memcmp(s, "d4\xdd\xdd", 4) == 0) {
705
1
        fmt->category = region_list;
706
1
        fmt->format = d4_format;
707
        // How to decode the D4 Format Version bytes is not yet specified
708
        // so we don't try to set fmt->version.{major,minor}.
709
1
        return 0;
710
1
    }
711
4.78k
    else if (cmp_nonblank("{\"htsget\":", s, &s[len]) == 0) {
712
0
        fmt->category = unknown_category;
713
0
        fmt->format = htsget;
714
0
        return 0;
715
0
    }
716
4.78k
    else if (len > 8 && memcmp(s, "crypt4gh", 8) == 0) {
717
1
        fmt->category = unknown_category;
718
1
        fmt->format = hts_crypt4gh_format;
719
1
        return 0;
720
1
    }
721
4.78k
    else if (len >= 1 && s[0] == '>' && is_fastaq(s, &s[len])) {
722
3.80k
        fmt->category = sequence_data;
723
3.80k
        fmt->format = fasta_format;
724
3.80k
        return 0;
725
3.80k
    }
726
978
    else if (len >= 1 && s[0] == '@' && is_fastaq(s, &s[len])) {
727
576
        fmt->category = sequence_data;
728
576
        fmt->format = fastq_format;
729
576
        return 0;
730
576
    }
731
402
    else if (parse_tabbed_text(columns, sizeof columns, s,
732
402
                               &s[len], &complete) > 0) {
733
        // A complete SAM line is at least 11 columns.  On unmapped long reads may
734
        // be missing two.  (On mapped long reads we must have an @ header so long
735
        // CIGAR is irrelevant.)
736
183
        if (colmatch(columns, "ZiZiiCZiiZZOOOOOOOOOOOOOOOOOOOO+")
737
183
            >= 9 + 2*complete) {
738
52
            fmt->category = sequence_data;
739
52
            fmt->format = sam;
740
52
            fmt->version.major = 1, fmt->version.minor = -1;
741
52
            return 0;
742
52
        }
743
131
        else if (fmt->compression == gzip && colmatch(columns, "iiiiii") == 6) {
744
0
            fmt->category = index_file;
745
0
            fmt->format = crai;
746
0
            return 0;
747
0
        }
748
131
        else if (strstr(extension, "fqi") && colmatch(columns, "Ziiiii") == 6) {
749
0
            fmt->category = index_file;
750
0
            fmt->format = fqi_format;
751
0
            return 0;
752
0
        }
753
131
        else if (strstr(extension, "fai") && colmatch(columns, "Ziiii") == 5) {
754
0
            fmt->category = index_file;
755
0
            fmt->format = fai_format;
756
0
            return 0;
757
0
        }
758
131
        else if (colmatch(columns, "Zii+") >= 3) {
759
9
            fmt->category = region_list;
760
9
            fmt->format = bed;
761
9
            return 0;
762
9
        }
763
183
    }
764
765
    // Arbitrary text files can be read using hts_getline().
766
358
    if (is_text_only(s, &s[len])) fmt->format = text_format;
767
768
    // Nothing recognised: leave unset fmt-> fields as unknown.
769
358
    return 0;
770
67.0k
}
771
772
char *hts_format_description(const htsFormat *format)
773
0
{
774
0
    kstring_t str = { 0, 0, NULL };
775
776
0
    switch (format->format) {
777
0
    case sam:   kputs("SAM", &str); break;
778
0
    case bam:   kputs("BAM", &str); break;
779
0
    case cram:  kputs("CRAM", &str); break;
780
0
    case fasta_format:  kputs("FASTA", &str); break;
781
0
    case fastq_format:  kputs("FASTQ", &str); break;
782
0
    case vcf:   kputs("VCF", &str); break;
783
0
    case bcf:
784
0
        if (format->version.major == 1) kputs("Legacy BCF", &str);
785
0
        else kputs("BCF", &str);
786
0
        break;
787
0
    case bai:   kputs("BAI", &str); break;
788
0
    case crai:  kputs("CRAI", &str); break;
789
0
    case csi:   kputs("CSI", &str); break;
790
0
    case fai_format:    kputs("FASTA-IDX", &str); break;
791
0
    case fqi_format:    kputs("FASTQ-IDX", &str); break;
792
0
    case gzi:   kputs("GZI", &str); break;
793
0
    case tbi:   kputs("Tabix", &str); break;
794
0
    case bed:   kputs("BED", &str); break;
795
0
    case d4_format:     kputs("D4", &str); break;
796
0
    case htsget: kputs("htsget", &str); break;
797
0
    case hts_crypt4gh_format: kputs("crypt4gh", &str); break;
798
0
    case empty_format:  kputs("empty", &str); break;
799
0
    default:    kputs("unknown", &str); break;
800
0
    }
801
802
0
    if (format->version.major >= 0) {
803
0
        kputs(" version ", &str);
804
0
        kputw(format->version.major, &str);
805
0
        if (format->version.minor >= 0) {
806
0
            kputc('.', &str);
807
0
            kputw(format->version.minor, &str);
808
0
        }
809
0
    }
810
811
0
    switch (format->compression) {
812
0
    case bzip2_compression:  kputs(" bzip2-compressed", &str); break;
813
0
    case razf_compression:   kputs(" legacy-RAZF-compressed", &str); break;
814
0
    case xz_compression:     kputs(" XZ-compressed", &str); break;
815
0
    case zstd_compression:   kputs(" Zstandard-compressed", &str); break;
816
0
    case custom: kputs(" compressed", &str); break;
817
0
    case gzip:   kputs(" gzip-compressed", &str); break;
818
819
0
    case bgzf:
820
0
        switch (format->format) {
821
0
        case bam:
822
0
        case bcf:
823
0
        case csi:
824
0
        case tbi:
825
            // These are by definition BGZF, so just use the generic term
826
0
            kputs(" compressed", &str);
827
0
            break;
828
0
        default:
829
0
            kputs(" BGZF-compressed", &str);
830
0
            break;
831
0
        }
832
0
        break;
833
834
0
    case no_compression:
835
0
        switch (format->format) {
836
0
        case bam:
837
0
        case bcf:
838
0
        case cram:
839
0
        case csi:
840
0
        case tbi:
841
            // These are normally compressed, so emphasise that this one isn't
842
0
            kputs(" uncompressed", &str);
843
0
            break;
844
0
        default:
845
0
            break;
846
0
        }
847
0
        break;
848
849
0
    default: break;
850
0
    }
851
852
0
    switch (format->category) {
853
0
    case sequence_data: kputs(" sequence", &str); break;
854
0
    case variant_data:  kputs(" variant calling", &str); break;
855
0
    case index_file:    kputs(" index", &str); break;
856
0
    case region_list:   kputs(" genomic region", &str); break;
857
0
    default: break;
858
0
    }
859
860
0
    if (format->compression == no_compression)
861
0
        switch (format->format) {
862
0
        case text_format:
863
0
        case sam:
864
0
        case crai:
865
0
        case vcf:
866
0
        case bed:
867
0
        case fai_format:
868
0
        case fqi_format:
869
0
        case fasta_format:
870
0
        case fastq_format:
871
0
        case htsget:
872
0
            kputs(" text", &str);
873
0
            break;
874
875
0
        case empty_format:
876
0
            break;
877
878
0
        default:
879
0
            kputs(" data", &str);
880
0
            break;
881
0
        }
882
0
    else
883
0
        kputs(" data", &str);
884
885
0
    return ks_release(&str);
886
0
}
887
888
htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt)
889
47.1k
{
890
47.1k
    char smode[101], *cp, *cp2, *mode_c, *uncomp = NULL;
891
47.1k
    htsFile *fp = NULL;
892
47.1k
    hFILE *hfile = NULL;
893
47.1k
    char fmt_code = '\0';
894
    // see enum htsExactFormat in htslib/hts.h
895
47.1k
    const char format_to_mode[] = "\0g\0\0b\0c\0\0b\0g\0\0\0\0\0Ff\0\0";
896
897
47.1k
    strncpy(smode, mode, 99);
898
47.1k
    smode[99]=0;
899
47.1k
    if ((cp = strchr(smode, ',')))
900
0
        *cp = '\0';
901
902
    // Migrate format code (b or c) to the end of the smode buffer.
903
124k
    for (cp2 = cp = smode; *cp; cp++) {
904
77.0k
        if (*cp == 'b')
905
17.3k
            fmt_code = 'b';
906
59.7k
        else if (*cp == 'c')
907
12.5k
            fmt_code = 'c';
908
47.1k
        else {
909
47.1k
            *cp2++ = *cp;
910
            // Cache the uncompress flag 'u' pos if present
911
47.1k
            if (!uncomp && (*cp == 'u')) {
912
0
                uncomp = cp2 - 1;
913
0
            }
914
47.1k
        }
915
77.0k
    }
916
47.1k
    mode_c = cp2;
917
47.1k
    *cp2++ = fmt_code;
918
47.1k
    *cp2++ = 0;
919
920
    // Set or reset the format code if opts->format is used
921
47.1k
    if (fmt && fmt->format > unknown_format
922
47.1k
        && fmt->format < sizeof(format_to_mode)) {
923
0
        *mode_c = format_to_mode[fmt->format];
924
0
    }
925
926
    // Uncompressed bam/bcf is not supported, change 'u' to '0' on write
927
47.1k
    if (uncomp && *mode_c == 'b' && (strchr(smode, 'w') || strchr(smode, 'a'))) {
928
0
        *uncomp = '0';
929
0
    }
930
931
    // If we really asked for a compressed text format then mode_c above will
932
    // point to nul.  We set to 'z' to enable bgzf.
933
47.1k
    if (strchr(mode, 'w') && fmt && fmt->compression == bgzf) {
934
0
        if (fmt->format == sam || fmt->format == vcf || fmt->format == text_format)
935
0
            *mode_c = 'z';
936
0
    }
937
938
47.1k
    char *rmme = NULL, *fnidx = strstr(fn, HTS_IDX_DELIM);
939
47.1k
    if ( fnidx ) {
940
0
        rmme = strdup(fn);
941
0
        if ( !rmme ) goto error;
942
0
        rmme[fnidx-fn] = 0;
943
0
        fn = rmme;
944
0
    }
945
946
47.1k
    hfile = hopen(fn, smode);
947
47.1k
    if (hfile == NULL) goto error;
948
949
47.1k
    fp = hts_hopen(hfile, fn, smode);
950
47.1k
    if (fp == NULL) goto error;
951
952
    // Compensate for the loss of exactness in htsExactFormat.
953
    // hts_hopen returns generics such as binary or text, but we
954
    // have been given something explicit here so use that instead.
955
47.1k
    if (fp->is_write && fmt &&
956
47.1k
        (fmt->format == bam || fmt->format == sam ||
957
0
         fmt->format == vcf || fmt->format == bcf ||
958
0
         fmt->format == bed || fmt->format == fasta_format ||
959
0
         fmt->format == fastq_format))
960
0
        fp->format.format = fmt->format;
961
962
47.1k
    if (fmt && fmt->specific) {
963
0
        if (hts_opt_apply(fp, fmt->specific) != 0) {
964
0
            if (((hts_opt*)fmt->specific)->opt == CRAM_OPT_REFERENCE &&
965
0
                (errno == ENOENT || errno == EIO || errno == EBADF ||
966
0
                  errno == EACCES || errno == EISDIR)) {
967
                /* error during reference file operation
968
                 for these specific errors, set the error as EINVAL */
969
0
                errno = EINVAL;
970
0
            }
971
0
            goto error;
972
0
        }
973
0
    }
974
47.1k
    if ( rmme ) free(rmme);
975
47.1k
    return fp;
976
977
0
error:
978
0
    hts_log_error("Failed to open file \"%s\"%s%s", fn,
979
0
                  errno ? " : " : "", errno ? strerror(errno) : "");
980
0
    if ( rmme ) free(rmme);
981
982
0
    if (hfile)
983
0
        hclose_abruptly(hfile);
984
985
0
    return NULL;
986
47.1k
}
987
988
47.1k
htsFile *hts_open(const char *fn, const char *mode) {
989
47.1k
    return hts_open_format(fn, mode, NULL);
990
47.1k
}
991
992
/*
993
 * Splits str into a prefix, delimiter ('\0' or delim), and suffix, writing
994
 * the prefix in lowercase into buf and returning a pointer to the suffix.
995
 * On return, buf is always NUL-terminated; thus assumes that the "keyword"
996
 * prefix should be one of several known values of maximum length buflen-2.
997
 * (If delim is not found, returns a pointer to the '\0'.)
998
 */
999
static const char *
1000
scan_keyword(const char *str, char delim, char *buf, size_t buflen)
1001
0
{
1002
0
    size_t i = 0;
1003
0
    while (*str && *str != delim) {
1004
0
        if (i < buflen-1) buf[i++] = tolower_c(*str);
1005
0
        str++;
1006
0
    }
1007
1008
0
    buf[i] = '\0';
1009
0
    return *str? str+1 : str;
1010
0
}
1011
1012
/*
1013
 * Parses arg and appends it to the option list.
1014
 *
1015
 * Returns 0 on success;
1016
 *        -1 on failure.
1017
 */
1018
0
int hts_opt_add(hts_opt **opts, const char *c_arg) {
1019
0
    hts_opt *o, *t;
1020
0
    char *val;
1021
1022
    /*
1023
     * IMPORTANT!!!
1024
     * If you add another string option here, don't forget to also add
1025
     * it to the case statement in hts_opt_apply.
1026
     */
1027
1028
0
    if (!c_arg)
1029
0
        return -1;
1030
1031
0
    if (!(o =  malloc(sizeof(*o))))
1032
0
        return -1;
1033
1034
0
    if (!(o->arg = strdup(c_arg))) {
1035
0
        free(o);
1036
0
        return -1;
1037
0
    }
1038
1039
0
    if (!(val = strchr(o->arg, '=')))
1040
0
        val = "1"; // assume boolean
1041
0
    else
1042
0
        *val++ = '\0';
1043
1044
0
    if (strcmp(o->arg, "decode_md") == 0 ||
1045
0
        strcmp(o->arg, "DECODE_MD") == 0)
1046
0
        o->opt = CRAM_OPT_DECODE_MD, o->val.i = atoi(val);
1047
1048
0
    else if (strcmp(o->arg, "verbosity") == 0 ||
1049
0
             strcmp(o->arg, "VERBOSITY") == 0)
1050
0
        o->opt = CRAM_OPT_VERBOSITY, o->val.i = atoi(val);
1051
1052
0
    else if (strcmp(o->arg, "seqs_per_slice") == 0 ||
1053
0
             strcmp(o->arg, "SEQS_PER_SLICE") == 0)
1054
0
        o->opt = CRAM_OPT_SEQS_PER_SLICE, o->val.i = atoi(val);
1055
1056
0
    else if (strcmp(o->arg, "bases_per_slice") == 0 ||
1057
0
             strcmp(o->arg, "BASES_PER_SLICE") == 0)
1058
0
        o->opt = CRAM_OPT_BASES_PER_SLICE, o->val.i = atoi(val);
1059
1060
0
    else if (strcmp(o->arg, "slices_per_container") == 0 ||
1061
0
             strcmp(o->arg, "SLICES_PER_CONTAINER") == 0)
1062
0
        o->opt = CRAM_OPT_SLICES_PER_CONTAINER, o->val.i = atoi(val);
1063
1064
0
    else if (strcmp(o->arg, "embed_ref") == 0 ||
1065
0
             strcmp(o->arg, "EMBED_REF") == 0)
1066
0
        o->opt = CRAM_OPT_EMBED_REF, o->val.i = atoi(val);
1067
1068
0
    else if (strcmp(o->arg, "no_ref") == 0 ||
1069
0
             strcmp(o->arg, "NO_REF") == 0)
1070
0
        o->opt = CRAM_OPT_NO_REF, o->val.i = atoi(val);
1071
1072
0
    else if (strcmp(o->arg, "pos_delta") == 0 ||
1073
0
             strcmp(o->arg, "POS_DELTA") == 0)
1074
0
        o->opt = CRAM_OPT_POS_DELTA, o->val.i = atoi(val);
1075
1076
0
    else if (strcmp(o->arg, "ignore_md5") == 0 ||
1077
0
             strcmp(o->arg, "IGNORE_MD5") == 0)
1078
0
        o->opt = CRAM_OPT_IGNORE_MD5, o->val.i = atoi(val);
1079
1080
0
    else if (strcmp(o->arg, "use_bzip2") == 0 ||
1081
0
             strcmp(o->arg, "USE_BZIP2") == 0)
1082
0
        o->opt = CRAM_OPT_USE_BZIP2, o->val.i = atoi(val);
1083
1084
0
    else if (strcmp(o->arg, "use_rans") == 0 ||
1085
0
             strcmp(o->arg, "USE_RANS") == 0)
1086
0
        o->opt = CRAM_OPT_USE_RANS, o->val.i = atoi(val);
1087
1088
0
    else if (strcmp(o->arg, "use_lzma") == 0 ||
1089
0
             strcmp(o->arg, "USE_LZMA") == 0)
1090
0
        o->opt = CRAM_OPT_USE_LZMA, o->val.i = atoi(val);
1091
1092
0
    else if (strcmp(o->arg, "use_tok") == 0 ||
1093
0
             strcmp(o->arg, "USE_TOK") == 0)
1094
0
        o->opt = CRAM_OPT_USE_TOK, o->val.i = atoi(val);
1095
1096
0
    else if (strcmp(o->arg, "use_fqz") == 0 ||
1097
0
             strcmp(o->arg, "USE_FQZ") == 0)
1098
0
        o->opt = CRAM_OPT_USE_FQZ, o->val.i = atoi(val);
1099
1100
0
    else if (strcmp(o->arg, "use_arith") == 0 ||
1101
0
             strcmp(o->arg, "USE_ARITH") == 0)
1102
0
        o->opt = CRAM_OPT_USE_ARITH, o->val.i = atoi(val);
1103
1104
0
    else if (strcmp(o->arg, "fast") == 0 ||
1105
0
             strcmp(o->arg, "FAST") == 0)
1106
0
        o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_FAST;
1107
1108
0
    else if (strcmp(o->arg, "normal") == 0 ||
1109
0
             strcmp(o->arg, "NORMAL") == 0)
1110
0
        o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_NORMAL;
1111
1112
0
    else if (strcmp(o->arg, "small") == 0 ||
1113
0
             strcmp(o->arg, "SMALL") == 0)
1114
0
        o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_SMALL;
1115
1116
0
    else if (strcmp(o->arg, "archive") == 0 ||
1117
0
             strcmp(o->arg, "ARCHIVE") == 0)
1118
0
        o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_ARCHIVE;
1119
1120
0
    else if (strcmp(o->arg, "reference") == 0 ||
1121
0
             strcmp(o->arg, "REFERENCE") == 0)
1122
0
        o->opt = CRAM_OPT_REFERENCE, o->val.s = val;
1123
1124
0
    else if (strcmp(o->arg, "version") == 0 ||
1125
0
             strcmp(o->arg, "VERSION") == 0)
1126
0
        o->opt = CRAM_OPT_VERSION, o->val.s =val;
1127
1128
0
    else if (strcmp(o->arg, "multi_seq_per_slice") == 0 ||
1129
0
             strcmp(o->arg, "MULTI_SEQ_PER_SLICE") == 0)
1130
0
        o->opt = CRAM_OPT_MULTI_SEQ_PER_SLICE, o->val.i = atoi(val);
1131
1132
0
    else if (strcmp(o->arg, "nthreads") == 0 ||
1133
0
             strcmp(o->arg, "NTHREADS") == 0)
1134
0
        o->opt = HTS_OPT_NTHREADS, o->val.i = atoi(val);
1135
1136
0
    else if (strcmp(o->arg, "cache_size") == 0 ||
1137
0
             strcmp(o->arg, "CACHE_SIZE") == 0) {
1138
0
        char *endp;
1139
0
        o->opt = HTS_OPT_CACHE_SIZE;
1140
0
        o->val.i = strtol(val, &endp, 0);
1141
        // NB: Doesn't support floats, eg 1.5g
1142
        // TODO: extend hts_parse_decimal? See also samtools sort.
1143
0
        switch (*endp) {
1144
0
        case 'g': case 'G': o->val.i *= 1024; // fall through
1145
0
        case 'm': case 'M': o->val.i *= 1024; // fall through
1146
0
        case 'k': case 'K': o->val.i *= 1024; break;
1147
0
        case '\0': break;
1148
0
        default:
1149
0
            hts_log_error("Unrecognised cache size suffix '%c'", *endp);
1150
0
            free(o->arg);
1151
0
            free(o);
1152
0
            return -1;
1153
0
        }
1154
0
    }
1155
1156
0
    else if (strcmp(o->arg, "required_fields") == 0 ||
1157
0
             strcmp(o->arg, "REQUIRED_FIELDS") == 0)
1158
0
        o->opt = CRAM_OPT_REQUIRED_FIELDS, o->val.i = strtol(val, NULL, 0);
1159
1160
0
    else if (strcmp(o->arg, "lossy_names") == 0 ||
1161
0
             strcmp(o->arg, "LOSSY_NAMES") == 0)
1162
0
        o->opt = CRAM_OPT_LOSSY_NAMES, o->val.i = strtol(val, NULL, 0);
1163
1164
0
    else if (strcmp(o->arg, "name_prefix") == 0 ||
1165
0
             strcmp(o->arg, "NAME_PREFIX") == 0)
1166
0
        o->opt = CRAM_OPT_PREFIX, o->val.s = val;
1167
1168
0
    else if (strcmp(o->arg, "store_md") == 0 ||
1169
0
             strcmp(o->arg, "store_md") == 0)
1170
0
        o->opt = CRAM_OPT_STORE_MD, o->val.i = atoi(val);
1171
1172
0
    else if (strcmp(o->arg, "store_nm") == 0 ||
1173
0
             strcmp(o->arg, "store_nm") == 0)
1174
0
        o->opt = CRAM_OPT_STORE_NM, o->val.i = atoi(val);
1175
1176
0
    else if (strcmp(o->arg, "block_size") == 0 ||
1177
0
             strcmp(o->arg, "BLOCK_SIZE") == 0)
1178
0
        o->opt = HTS_OPT_BLOCK_SIZE, o->val.i = strtol(val, NULL, 0);
1179
1180
0
    else if (strcmp(o->arg, "level") == 0 ||
1181
0
             strcmp(o->arg, "LEVEL") == 0)
1182
0
        o->opt = HTS_OPT_COMPRESSION_LEVEL, o->val.i = strtol(val, NULL, 0);
1183
1184
0
    else if (strcmp(o->arg, "filter") == 0 ||
1185
0
             strcmp(o->arg, "FILTER") == 0)
1186
0
        o->opt = HTS_OPT_FILTER, o->val.s = val;
1187
1188
0
    else if (strcmp(o->arg, "fastq_aux") == 0 ||
1189
0
        strcmp(o->arg, "FASTQ_AUX") == 0)
1190
0
        o->opt = FASTQ_OPT_AUX, o->val.s = val;
1191
1192
0
    else if (strcmp(o->arg, "fastq_barcode") == 0 ||
1193
0
        strcmp(o->arg, "FASTQ_BARCODE") == 0)
1194
0
        o->opt = FASTQ_OPT_BARCODE, o->val.s = val;
1195
1196
0
    else if (strcmp(o->arg, "fastq_rnum") == 0 ||
1197
0
        strcmp(o->arg, "FASTQ_RNUM") == 0)
1198
0
        o->opt = FASTQ_OPT_RNUM, o->val.i = 1;
1199
1200
0
    else if (strcmp(o->arg, "fastq_casava") == 0 ||
1201
0
        strcmp(o->arg, "FASTQ_CASAVA") == 0)
1202
0
        o->opt = FASTQ_OPT_CASAVA, o->val.i = 1;
1203
1204
0
    else if (strcmp(o->arg, "fastq_name2") == 0 ||
1205
0
        strcmp(o->arg, "FASTQ_NAME2") == 0)
1206
0
        o->opt = FASTQ_OPT_NAME2, o->val.i = 1;
1207
1208
0
    else {
1209
0
        hts_log_error("Unknown option '%s'", o->arg);
1210
0
        free(o->arg);
1211
0
        free(o);
1212
0
        return -1;
1213
0
    }
1214
1215
0
    o->next = NULL;
1216
1217
    // Append; assumes small list.
1218
0
    if (*opts) {
1219
0
        t = *opts;
1220
0
        while (t->next)
1221
0
            t = t->next;
1222
0
        t->next = o;
1223
0
    } else {
1224
0
        *opts = o;
1225
0
    }
1226
1227
0
    return 0;
1228
0
}
1229
1230
/*
1231
 * Applies an hts_opt option list to a given htsFile.
1232
 *
1233
 * Returns 0 on success
1234
 *        -1 on failure
1235
 */
1236
0
int hts_opt_apply(htsFile *fp, hts_opt *opts) {
1237
0
    hts_opt *last = NULL;
1238
1239
0
    for (; opts;  opts = (last=opts)->next) {
1240
0
        switch (opts->opt) {
1241
0
            case CRAM_OPT_REFERENCE:
1242
0
                if (!(fp->fn_aux = strdup(opts->val.s)))
1243
0
                    return -1;
1244
                // fall through
1245
0
            case CRAM_OPT_VERSION:
1246
0
            case CRAM_OPT_PREFIX:
1247
0
            case HTS_OPT_FILTER:
1248
0
            case FASTQ_OPT_AUX:
1249
0
            case FASTQ_OPT_BARCODE:
1250
0
                if (hts_set_opt(fp,  opts->opt,  opts->val.s) != 0)
1251
0
                    return -1;
1252
0
                break;
1253
0
            default:
1254
0
                if (hts_set_opt(fp,  opts->opt,  opts->val.i) != 0)
1255
0
                    return -1;
1256
0
                break;
1257
0
        }
1258
0
    }
1259
1260
0
    return 0;
1261
0
}
1262
1263
/*
1264
 * Frees an hts_opt list.
1265
 */
1266
0
void hts_opt_free(hts_opt *opts) {
1267
0
    hts_opt *last = NULL;
1268
0
    while (opts) {
1269
0
        opts = (last=opts)->next;
1270
0
        free(last->arg);
1271
0
        free(last);
1272
0
    }
1273
0
}
1274
1275
1276
/*
1277
 * Tokenise options as (key(=value)?,)*(key(=value)?)?
1278
 * NB: No provision for ',' appearing in the value!
1279
 * Add backslashing rules?
1280
 *
1281
 * This could be used as part of a general command line option parser or
1282
 * as a string concatenated onto the file open mode.
1283
 *
1284
 * Returns 0 on success
1285
 *        -1 on failure.
1286
 */
1287
0
int hts_parse_opt_list(htsFormat *fmt, const char *str) {
1288
0
    while (str && *str) {
1289
0
        const char *str_start;
1290
0
        int len;
1291
0
        char arg[8001];
1292
1293
0
        while (*str && *str == ',')
1294
0
            str++;
1295
1296
0
        for (str_start = str; *str && *str != ','; str++);
1297
0
        len = str - str_start;
1298
1299
        // Produce a nul terminated copy of the option
1300
0
        strncpy(arg, str_start, len < 8000 ? len : 8000);
1301
0
        arg[len < 8000 ? len : 8000] = '\0';
1302
1303
0
        if (hts_opt_add((hts_opt **)&fmt->specific, arg) != 0)
1304
0
            return -1;
1305
1306
0
        if (*str)
1307
0
            str++;
1308
0
    }
1309
1310
0
    return 0;
1311
0
}
1312
1313
/*
1314
 * Accepts a string file format (sam, bam, cram, vcf, bam) optionally
1315
 * followed by a comma separated list of key=value options and splits
1316
 * these up into the fields of htsFormat struct.
1317
 *
1318
 * format is assumed to be already initialised, either to blank
1319
 * "unknown" values or via previous hts_opt_add calls.
1320
 *
1321
 * Returns 0 on success
1322
 *        -1 on failure.
1323
 */
1324
0
int hts_parse_format(htsFormat *format, const char *str) {
1325
0
    char fmt[8];
1326
0
    const char *cp = scan_keyword(str, ',', fmt, sizeof fmt);
1327
1328
0
    format->version.minor = 0; // unknown
1329
0
    format->version.major = 0; // unknown
1330
1331
0
    if (strcmp(fmt, "sam") == 0) {
1332
0
        format->category          = sequence_data;
1333
0
        format->format            = sam;
1334
0
        format->compression       = no_compression;
1335
0
        format->compression_level = 0;
1336
0
    } else if (strcmp(fmt, "sam.gz") == 0) {
1337
0
        format->category          = sequence_data;
1338
0
        format->format            = sam;
1339
0
        format->compression       = bgzf;
1340
0
        format->compression_level = -1;
1341
0
    } else if (strcmp(fmt, "bam") == 0) {
1342
0
        format->category          = sequence_data;
1343
0
        format->format            = bam;
1344
0
        format->compression       = bgzf;
1345
0
        format->compression_level = -1;
1346
0
    } else if (strcmp(fmt, "cram") == 0) {
1347
0
        format->category          = sequence_data;
1348
0
        format->format            = cram;
1349
0
        format->compression       = custom;
1350
0
        format->compression_level = -1;
1351
0
    } else if (strcmp(fmt, "vcf") == 0) {
1352
0
        format->category          = variant_data;
1353
0
        format->format            = vcf;
1354
0
        format->compression       = no_compression;
1355
0
        format->compression_level = 0;
1356
0
    } else if (strcmp(fmt, "bcf") == 0) {
1357
0
        format->category          = variant_data;
1358
0
        format->format            = bcf;
1359
0
        format->compression       = bgzf;
1360
0
        format->compression_level = -1;
1361
0
    } else if (strcmp(fmt, "fastq") == 0 || strcmp(fmt, "fq") == 0) {
1362
0
        format->category          = sequence_data;
1363
0
        format->format            = fastq_format;
1364
0
        format->compression       = no_compression;
1365
0
        format->compression_level = 0;
1366
0
    } else if (strcmp(fmt, "fastq.gz") == 0 || strcmp(fmt, "fq.gz") == 0) {
1367
0
        format->category          = sequence_data;
1368
0
        format->format            = fastq_format;
1369
0
        format->compression       = bgzf;
1370
0
        format->compression_level = 0;
1371
0
    } else if (strcmp(fmt, "fasta") == 0 || strcmp(fmt, "fa") == 0) {
1372
0
        format->category          = sequence_data;
1373
0
        format->format            = fasta_format;
1374
0
        format->compression       = no_compression;
1375
0
        format->compression_level = 0;
1376
0
    } else if (strcmp(fmt, "fasta.gz") == 0 || strcmp(fmt, "fa.gz") == 0) {
1377
0
        format->category          = sequence_data;
1378
0
        format->format            = fasta_format;
1379
0
        format->compression       = bgzf;
1380
0
        format->compression_level = 0;
1381
0
    } else {
1382
0
        return -1;
1383
0
    }
1384
1385
0
    return hts_parse_opt_list(format, cp);
1386
0
}
1387
1388
1389
/*
1390
 * Tokenise options as (key(=value)?,)*(key(=value)?)?
1391
 * NB: No provision for ',' appearing in the value!
1392
 * Add backslashing rules?
1393
 *
1394
 * This could be used as part of a general command line option parser or
1395
 * as a string concatenated onto the file open mode.
1396
 *
1397
 * Returns 0 on success
1398
 *        -1 on failure.
1399
 */
1400
0
static int hts_process_opts(htsFile *fp, const char *opts) {
1401
0
    htsFormat fmt;
1402
1403
0
    fmt.specific = NULL;
1404
0
    if (hts_parse_opt_list(&fmt, opts) != 0)
1405
0
        return -1;
1406
1407
0
    if (hts_opt_apply(fp, fmt.specific) != 0) {
1408
0
        hts_opt_free(fmt.specific);
1409
0
        return -1;
1410
0
    }
1411
1412
0
    hts_opt_free(fmt.specific);
1413
1414
0
    return 0;
1415
0
}
1416
1417
static int hts_crypt4gh_redirect(const char *fn, const char *mode,
1418
1
                                 hFILE **hfile_ptr, htsFile *fp) {
1419
1
    hFILE *hfile1 = *hfile_ptr;
1420
1
    hFILE *hfile2 = NULL;
1421
1
    char fn_buf[512], *fn2 = fn_buf;
1422
1
    char mode2[102]; // Size set by sizeof(simple_mode) in hts_hopen()
1423
1
    const char *prefix = "crypt4gh:";
1424
1
    size_t fn2_len = strlen(prefix) + strlen(fn) + 1;
1425
1
    int ret = -1;
1426
1427
1
    if (fn2_len > sizeof(fn_buf)) {
1428
0
        if (fn2_len >= INT_MAX) // Silence gcc format-truncation warning
1429
0
            return -1;
1430
0
        fn2 = malloc(fn2_len);
1431
0
        if (!fn2) return -1;
1432
0
    }
1433
1434
    // Reopen fn using the crypt4gh plug-in (if available)
1435
1
    snprintf(fn2, fn2_len, "%s%s", prefix, fn);
1436
1
    snprintf(mode2, sizeof(mode2), "%s%s", mode, strchr(mode, ':') ? "" : ":");
1437
1
    hfile2 = hopen(fn2, mode2, "parent", hfile1, NULL);
1438
1
    if (hfile2) {
1439
        // Replace original hfile with the new one.  The original is now
1440
        // enclosed within hfile2
1441
0
        *hfile_ptr = hfile2;
1442
0
        ret = 0;
1443
0
    }
1444
1445
1
    if (fn2 != fn_buf)
1446
0
        free(fn2);
1447
1
    return ret;
1448
1
}
1449
1450
htsFile *hts_hopen(hFILE *hfile, const char *fn, const char *mode)
1451
114k
{
1452
114k
    hFILE *hfile_orig = hfile;
1453
114k
    hFILE *hfile_cleanup = hfile;
1454
114k
    htsFile *fp = (htsFile*)calloc(1, sizeof(htsFile));
1455
114k
    char simple_mode[101], *cp, *opts;
1456
114k
    simple_mode[100] = '\0';
1457
1458
114k
    if (fp == NULL) goto error;
1459
1460
114k
    fp->fn = strdup(fn);
1461
114k
    fp->is_be = ed_is_big();
1462
1463
    // Split mode into simple_mode,opts strings
1464
114k
    if ((cp = strchr(mode, ','))) {
1465
0
        strncpy(simple_mode, mode, cp-mode <= 100 ? cp-mode : 100);
1466
0
        simple_mode[cp-mode] = '\0';
1467
0
        opts = cp+1;
1468
114k
    } else {
1469
114k
        strncpy(simple_mode, mode, 100);
1470
114k
        opts = NULL;
1471
114k
    }
1472
1473
114k
    if (strchr(simple_mode, 'r')) {
1474
67.1k
        const int max_loops = 5; // Should be plenty
1475
67.1k
        int loops = 0;
1476
67.1k
        if (hts_detect_format2(hfile, fn, &fp->format) < 0) goto error;
1477
1478
        // Deal with formats that re-direct an underlying file via a plug-in.
1479
        // Loops as we may have crypt4gh served via htsget, or
1480
        // crypt4gh-in-crypt4gh.
1481
1482
67.1k
        while (fp->format.format == htsget ||
1483
67.1k
               fp->format.format == hts_crypt4gh_format) {
1484
            // Ensure we don't get stuck in an endless redirect loop
1485
1
            if (++loops > max_loops) {
1486
0
                errno = ELOOP;
1487
0
                goto error;
1488
0
            }
1489
1490
1
            if (fp->format.format == htsget) {
1491
0
                hFILE *hfile2 = hopen_htsget_redirect(hfile, simple_mode);
1492
0
                if (hfile2 == NULL) goto error;
1493
1494
0
                if (hfile != hfile_cleanup) {
1495
                    // Close the result of an earlier redirection
1496
0
                    hclose_abruptly(hfile);
1497
0
                }
1498
1499
0
                hfile = hfile2;
1500
0
            }
1501
1
            else if (fp->format.format == hts_crypt4gh_format) {
1502
1
                int should_preserve = (hfile == hfile_orig);
1503
1
                int update_cleanup = (hfile == hfile_cleanup);
1504
1
                if (hts_crypt4gh_redirect(fn, simple_mode, &hfile, fp) < 0)
1505
1
                    goto error;
1506
0
                if (should_preserve) {
1507
                    // The original hFILE is now contained in a crypt4gh
1508
                    // wrapper.  Should we need to close the wrapper due
1509
                    // to a later error, we need to prevent the wrapped
1510
                    // handle from being closed as the caller will see
1511
                    // this function return NULL and try to clean up itself.
1512
0
                    hfile_orig->preserve = 1;
1513
0
                }
1514
0
                if (update_cleanup) {
1515
                    // Update handle to close at the end if redirected by htsget
1516
0
                    hfile_cleanup = hfile;
1517
0
                }
1518
0
            }
1519
1520
            // Re-detect format against the result of the redirection
1521
0
            if (hts_detect_format2(hfile, fn, &fp->format) < 0) goto error;
1522
0
        }
1523
67.1k
    }
1524
47.1k
    else if (strchr(simple_mode, 'w') || strchr(simple_mode, 'a')) {
1525
47.1k
        htsFormat *fmt = &fp->format;
1526
47.1k
        fp->is_write = 1;
1527
1528
47.1k
        if (strchr(simple_mode, 'b')) fmt->format = binary_format;
1529
29.8k
        else if (strchr(simple_mode, 'c')) fmt->format = cram;
1530
17.3k
        else if (strchr(simple_mode, 'f')) fmt->format = fastq_format;
1531
17.3k
        else if (strchr(simple_mode, 'F')) fmt->format = fasta_format;
1532
17.3k
        else fmt->format = text_format;
1533
1534
47.1k
        if (strchr(simple_mode, 'z')) fmt->compression = bgzf;
1535
47.1k
        else if (strchr(simple_mode, 'g')) fmt->compression = gzip;
1536
47.1k
        else if (strchr(simple_mode, 'u')) fmt->compression = no_compression;
1537
47.1k
        else {
1538
            // No compression mode specified, set to the default for the format
1539
47.1k
            switch (fmt->format) {
1540
17.3k
            case binary_format: fmt->compression = bgzf; break;
1541
12.5k
            case cram: fmt->compression = custom; break;
1542
0
            case fastq_format: fmt->compression = no_compression; break;
1543
0
            case fasta_format: fmt->compression = no_compression; break;
1544
17.3k
            case text_format: fmt->compression = no_compression; break;
1545
0
            default: abort();
1546
47.1k
            }
1547
47.1k
        }
1548
1549
        // Fill in category (if determinable; e.g. 'b' could be BAM or BCF)
1550
47.1k
        fmt->category = format_category(fmt->format);
1551
1552
47.1k
        fmt->version.major = fmt->version.minor = -1;
1553
47.1k
        fmt->compression_level = -1;
1554
47.1k
        fmt->specific = NULL;
1555
47.1k
    }
1556
0
    else { errno = EINVAL; goto error; }
1557
1558
114k
    switch (fp->format.format) {
1559
17.3k
    case binary_format:
1560
20.9k
    case bam:
1561
22.6k
    case bcf:
1562
22.6k
        fp->fp.bgzf = bgzf_hopen(hfile, simple_mode);
1563
22.6k
        if (fp->fp.bgzf == NULL) goto error;
1564
22.6k
        fp->is_bin = fp->is_bgzf = 1;
1565
22.6k
        break;
1566
1567
26.5k
    case cram:
1568
26.5k
        fp->fp.cram = cram_dopen(hfile, fn, simple_mode);
1569
26.5k
        if (fp->fp.cram == NULL) goto error;
1570
24.3k
        if (!fp->is_write)
1571
11.8k
            cram_set_option(fp->fp.cram, CRAM_OPT_DECODE_MD, -1); // auto
1572
24.3k
        fp->is_cram = 1;
1573
24.3k
        break;
1574
1575
101
    case empty_format:
1576
17.6k
    case text_format:
1577
17.6k
    case bed:
1578
21.4k
    case fasta_format:
1579
22.0k
    case fastq_format:
1580
52.3k
    case sam:
1581
65.0k
    case vcf:
1582
65.0k
        if (fp->format.compression != no_compression) {
1583
3.16k
            fp->fp.bgzf = bgzf_hopen(hfile, simple_mode);
1584
3.16k
            if (fp->fp.bgzf == NULL) goto error;
1585
3.07k
            fp->is_bgzf = 1;
1586
3.07k
        }
1587
61.8k
        else
1588
61.8k
            fp->fp.hfile = hfile;
1589
64.9k
        break;
1590
1591
64.9k
    default:
1592
126
        errno = EFTYPE;
1593
126
        goto error;
1594
114k
    }
1595
1596
111k
    if (opts)
1597
0
        hts_process_opts(fp, opts);
1598
1599
    // Allow original file to close if it was preserved earlier by crypt4gh
1600
111k
    hfile_orig->preserve = 0;
1601
1602
    // If redirecting via htsget, close the original hFILE now (pedantically
1603
    // we would instead close it in hts_close(), but this a simplifying
1604
    // optimisation)
1605
111k
    if (hfile != hfile_cleanup) hclose_abruptly(hfile_cleanup);
1606
1607
111k
    return fp;
1608
1609
2.37k
error:
1610
2.37k
    hts_log_error("Failed to open file %s", fn);
1611
1612
    // If redirecting, close the failed redirection hFILE that we have opened
1613
2.37k
    if (hfile != hfile_orig) hclose_abruptly(hfile);
1614
2.37k
    hfile_orig->preserve = 0; // Allow caller to close the original hfile
1615
1616
2.37k
    if (fp) {
1617
2.37k
        free(fp->fn);
1618
2.37k
        free(fp->fn_aux);
1619
2.37k
        free(fp);
1620
2.37k
    }
1621
2.37k
    return NULL;
1622
114k
}
1623
1624
static int hts_idx_close_otf_fp(hts_idx_t *idx);
1625
1626
int hts_close(htsFile *fp)
1627
111k
{
1628
111k
    int ret = 0, save;
1629
111k
    if (!fp) {
1630
0
        errno = EINVAL;
1631
0
        return -1;
1632
0
    }
1633
1634
111k
    switch (fp->format.format) {
1635
1.59k
    case binary_format:
1636
17.4k
    case bam:
1637
22.6k
    case bcf:
1638
22.6k
        ret = bgzf_close(fp->fp.bgzf);
1639
22.6k
        break;
1640
1641
24.3k
    case cram:
1642
24.3k
        if (!fp->is_write) {
1643
11.8k
            switch (cram_eof(fp->fp.cram)) {
1644
129
            case 2:
1645
129
                hts_log_warning("EOF marker is absent. The input is probably truncated");
1646
129
                break;
1647
8.74k
            case 0:  /* not at EOF, but may not have wanted all seqs */
1648
11.7k
            default: /* case 1, expected EOF */
1649
11.7k
                break;
1650
11.8k
            }
1651
11.8k
        }
1652
24.3k
        ret = cram_close(fp->fp.cram);
1653
24.3k
        break;
1654
1655
26
    case empty_format:
1656
1.84k
    case text_format:
1657
1.85k
    case bed:
1658
5.65k
    case fasta_format:
1659
6.22k
    case fastq_format:
1660
48.7k
    case sam:
1661
64.9k
    case vcf:
1662
64.9k
        if (fp->format.format == sam)
1663
42.5k
            ret = sam_state_destroy(fp);
1664
22.4k
        else if (fp->format.format == fastq_format ||
1665
22.4k
                 fp->format.format == fasta_format)
1666
4.37k
            fastq_state_destroy(fp);
1667
1668
64.9k
        if (fp->format.compression != no_compression)
1669
3.07k
            ret |= bgzf_close(fp->fp.bgzf);
1670
61.8k
        else
1671
61.8k
            ret |= hclose(fp->fp.hfile);
1672
64.9k
        break;
1673
1674
0
    default:
1675
0
        ret = -1;
1676
0
        break;
1677
111k
    }
1678
1679
111k
    if (fp->idx) {
1680
        // Close deferred index file handle, if present.
1681
        // Unfortunately this means errors on the index will get mixed with
1682
        // those on the main file, but as we only have the EOF block left to
1683
        // write it hopefully won't happen that often.
1684
0
        ret |= hts_idx_close_otf_fp(fp->idx);
1685
0
    }
1686
1687
111k
    save = errno;
1688
111k
    sam_hdr_destroy(fp->bam_header);
1689
111k
    hts_idx_destroy(fp->idx);
1690
111k
    hts_filter_free(fp->filter);
1691
111k
    free(fp->fn);
1692
111k
    free(fp->fn_aux);
1693
111k
    free(fp->line.s);
1694
111k
    free(fp);
1695
111k
    errno = save;
1696
111k
    return ret;
1697
111k
}
1698
1699
int hts_flush(htsFile *fp)
1700
0
{
1701
0
    if (fp == NULL) return 0;
1702
1703
0
    switch (fp->format.format) {
1704
0
    case binary_format:
1705
0
    case bam:
1706
0
    case bcf:
1707
0
        return bgzf_flush(fp->fp.bgzf);
1708
1709
0
    case cram:
1710
0
        return cram_flush(fp->fp.cram);
1711
1712
0
    case empty_format:
1713
0
    case text_format:
1714
0
    case bed:
1715
0
    case fasta_format:
1716
0
    case fastq_format:
1717
0
    case sam:
1718
0
    case vcf:
1719
0
        if (fp->format.compression != no_compression)
1720
0
            return bgzf_flush(fp->fp.bgzf);
1721
0
        else
1722
0
            return hflush(fp->fp.hfile);
1723
1724
0
    default:
1725
0
        break;
1726
0
    }
1727
1728
0
    return 0;
1729
0
}
1730
1731
const htsFormat *hts_get_format(htsFile *fp)
1732
0
{
1733
0
    return fp? &fp->format : NULL;
1734
0
}
1735
1736
0
const char *hts_format_file_extension(const htsFormat *format) {
1737
0
    if (!format)
1738
0
        return "?";
1739
1740
0
    switch (format->format) {
1741
0
    case sam:  return "sam";
1742
0
    case bam:  return "bam";
1743
0
    case bai:  return "bai";
1744
0
    case cram: return "cram";
1745
0
    case crai: return "crai";
1746
0
    case vcf:  return "vcf";
1747
0
    case bcf:  return "bcf";
1748
0
    case csi:  return "csi";
1749
0
    case fai_format:   return "fai";
1750
0
    case fqi_format:   return "fqi";
1751
0
    case gzi:  return "gzi";
1752
0
    case tbi:  return "tbi";
1753
0
    case bed:  return "bed";
1754
0
    case d4_format:    return "d4";
1755
0
    case fasta_format: return "fa";
1756
0
    case fastq_format: return "fq";
1757
0
    default:   return "?";
1758
0
    }
1759
0
}
1760
1761
0
static hFILE *hts_hfile(htsFile *fp) {
1762
0
    switch (fp->format.format) {
1763
0
    case binary_format:// fall through
1764
0
    case bcf:          // fall through
1765
0
    case bam:          return bgzf_hfile(fp->fp.bgzf);
1766
0
    case cram:         return cram_hfile(fp->fp.cram);
1767
0
    case text_format:  return fp->fp.hfile;
1768
0
    case vcf:          // fall through
1769
0
    case fastq_format: // fall through
1770
0
    case fasta_format: // fall through
1771
0
    case sam:          return fp->format.compression != no_compression
1772
0
                              ? bgzf_hfile(fp->fp.bgzf)
1773
0
                              : fp->fp.hfile;
1774
0
    default:           return NULL;
1775
0
    }
1776
0
}
1777
1778
0
int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) {
1779
0
    int r;
1780
0
    va_list args;
1781
1782
0
    switch (opt) {
1783
0
    case HTS_OPT_NTHREADS: {
1784
0
        va_start(args, opt);
1785
0
        int nthreads = va_arg(args, int);
1786
0
        va_end(args);
1787
0
        return hts_set_threads(fp, nthreads);
1788
0
    }
1789
1790
0
    case HTS_OPT_BLOCK_SIZE: {
1791
0
        hFILE *hf = hts_hfile(fp);
1792
1793
0
        if (hf) {
1794
0
            va_start(args, opt);
1795
0
            if (hfile_set_blksize(hf, va_arg(args, int)) != 0)
1796
0
                hts_log_warning("Failed to change block size");
1797
0
            va_end(args);
1798
0
        }
1799
0
        else {
1800
            // To do - implement for vcf/bcf.
1801
0
            hts_log_warning("Cannot change block size for this format");
1802
0
        }
1803
1804
0
        return 0;
1805
0
    }
1806
1807
0
    case HTS_OPT_THREAD_POOL: {
1808
0
        va_start(args, opt);
1809
0
        htsThreadPool *p = va_arg(args, htsThreadPool *);
1810
0
        va_end(args);
1811
0
        return hts_set_thread_pool(fp, p);
1812
0
    }
1813
1814
0
    case HTS_OPT_CACHE_SIZE: {
1815
0
        va_start(args, opt);
1816
0
        int cache_size = va_arg(args, int);
1817
0
        va_end(args);
1818
0
        hts_set_cache_size(fp, cache_size);
1819
0
        return 0;
1820
0
    }
1821
1822
0
    case FASTQ_OPT_CASAVA:
1823
0
    case FASTQ_OPT_RNUM:
1824
0
    case FASTQ_OPT_NAME2:
1825
0
        if (fp->format.format == fastq_format ||
1826
0
            fp->format.format == fasta_format)
1827
0
            return fastq_state_set(fp, opt);
1828
0
        return 0;
1829
1830
0
    case FASTQ_OPT_AUX:
1831
0
        if (fp->format.format == fastq_format ||
1832
0
            fp->format.format == fasta_format) {
1833
0
            va_start(args, opt);
1834
0
            char *list = va_arg(args, char *);
1835
0
            va_end(args);
1836
0
            return fastq_state_set(fp, opt, list);
1837
0
        }
1838
0
        return 0;
1839
1840
0
    case FASTQ_OPT_BARCODE:
1841
0
        if (fp->format.format == fastq_format ||
1842
0
            fp->format.format == fasta_format) {
1843
0
            va_start(args, opt);
1844
0
            char *bc = va_arg(args, char *);
1845
0
            va_end(args);
1846
0
            return fastq_state_set(fp, opt, bc);
1847
0
        }
1848
0
        return 0;
1849
1850
    // Options below here flow through to cram_set_voption
1851
0
    case HTS_OPT_COMPRESSION_LEVEL: {
1852
0
        va_start(args, opt);
1853
0
        int level = va_arg(args, int);
1854
0
        va_end(args);
1855
0
        if (fp->is_bgzf)
1856
0
            fp->fp.bgzf->compress_level = level;
1857
0
        else if (fp->format.format == cram)
1858
0
            return cram_set_option(fp->fp.cram, opt, level);
1859
0
        return 0;
1860
0
    }
1861
1862
0
    case HTS_OPT_FILTER: {
1863
0
        va_start(args, opt);
1864
0
        char *expr = va_arg(args, char *);
1865
0
        va_end(args);
1866
0
        return hts_set_filter_expression(fp, expr);
1867
0
    }
1868
1869
0
    case HTS_OPT_PROFILE: {
1870
0
        va_start(args, opt);
1871
0
        enum hts_profile_option prof = va_arg(args, int);
1872
0
        va_end(args);
1873
0
        if (fp->is_bgzf) {
1874
0
            switch (prof) {
1875
#ifdef HAVE_LIBDEFLATE
1876
            case HTS_PROFILE_FAST:    fp->fp.bgzf->compress_level =  2; break;
1877
            case HTS_PROFILE_NORMAL:  fp->fp.bgzf->compress_level = -1; break;
1878
            case HTS_PROFILE_SMALL:   fp->fp.bgzf->compress_level = 10; break;
1879
            case HTS_PROFILE_ARCHIVE: fp->fp.bgzf->compress_level = 12; break;
1880
#else
1881
0
            case HTS_PROFILE_FAST:    fp->fp.bgzf->compress_level =  1; break;
1882
0
            case HTS_PROFILE_NORMAL:  fp->fp.bgzf->compress_level = -1; break;
1883
0
            case HTS_PROFILE_SMALL:   fp->fp.bgzf->compress_level =  8; break;
1884
0
            case HTS_PROFILE_ARCHIVE: fp->fp.bgzf->compress_level =  9; break;
1885
0
#endif
1886
0
            }
1887
0
        } // else CRAM manages this in its own way
1888
0
        break;
1889
0
    }
1890
1891
0
    default:
1892
0
        break;
1893
0
    }
1894
1895
0
    if (fp->format.format != cram)
1896
0
        return 0;
1897
1898
0
    va_start(args, opt);
1899
0
    r = cram_set_voption(fp->fp.cram, opt, args);
1900
0
    va_end(args);
1901
1902
0
    return r;
1903
0
}
1904
1905
BGZF *hts_get_bgzfp(htsFile *fp);
1906
1907
int hts_set_threads(htsFile *fp, int n)
1908
0
{
1909
0
    if (fp->format.format == sam) {
1910
0
        return sam_set_threads(fp, n);
1911
0
    } else if (fp->format.compression == bgzf) {
1912
0
        return bgzf_mt(hts_get_bgzfp(fp), n, 256/*unused*/);
1913
0
    } else if (fp->format.format == cram) {
1914
0
        return hts_set_opt(fp, CRAM_OPT_NTHREADS, n);
1915
0
    }
1916
0
    else return 0;
1917
0
}
1918
1919
0
int hts_set_thread_pool(htsFile *fp, htsThreadPool *p) {
1920
0
    if (fp->format.format == sam || fp->format.format == text_format) {
1921
0
        return sam_set_thread_pool(fp, p);
1922
0
    } else if (fp->format.compression == bgzf) {
1923
0
        return bgzf_thread_pool(hts_get_bgzfp(fp), p->pool, p->qsize);
1924
0
    } else if (fp->format.format == cram) {
1925
0
        return hts_set_opt(fp, CRAM_OPT_THREAD_POOL, p);
1926
0
    }
1927
0
    else return 0;
1928
0
}
1929
1930
void hts_set_cache_size(htsFile *fp, int n)
1931
0
{
1932
0
    if (fp->format.compression == bgzf)
1933
0
        bgzf_set_cache_size(hts_get_bgzfp(fp), n);
1934
0
}
1935
1936
int hts_set_fai_filename(htsFile *fp, const char *fn_aux)
1937
0
{
1938
0
    free(fp->fn_aux);
1939
0
    if (fn_aux) {
1940
0
        fp->fn_aux = strdup(fn_aux);
1941
0
        if (fp->fn_aux == NULL) return -1;
1942
0
    }
1943
0
    else fp->fn_aux = NULL;
1944
1945
0
    if (fp->format.format == cram)
1946
0
        if (cram_set_option(fp->fp.cram, CRAM_OPT_REFERENCE, fp->fn_aux))
1947
0
            return -1;
1948
1949
0
    return 0;
1950
0
}
1951
1952
int hts_set_filter_expression(htsFile *fp, const char *expr)
1953
0
{
1954
0
    if (fp->filter)
1955
0
        hts_filter_free(fp->filter);
1956
1957
0
    if (!expr)
1958
0
        return 0;
1959
1960
0
    return (fp->filter = hts_filter_init(expr))
1961
0
        ? 0 : -1;
1962
0
}
1963
1964
hFILE *hts_open_tmpfile(const char *fname, const char *mode, kstring_t *tmpname)
1965
0
{
1966
0
    int pid = (int) getpid();
1967
0
    unsigned ptr = (uintptr_t) tmpname;
1968
0
    int n = 0;
1969
0
    hFILE *fp = NULL;
1970
1971
0
    do {
1972
        // Attempt to further uniquify the temporary filename
1973
0
        unsigned t = ((unsigned) time(NULL)) ^ ((unsigned) clock()) ^ ptr;
1974
0
        n++;
1975
1976
0
        ks_clear(tmpname);
1977
0
        if (ksprintf(tmpname, "%s.tmp_%d_%d_%u", fname, pid, n, t) < 0) break;
1978
1979
0
        fp = hopen(tmpname->s, mode);
1980
0
    } while (fp == NULL && errno == EEXIST && n < 100);
1981
1982
0
    return fp;
1983
0
}
1984
1985
int hts_is_utf16_text(const kstring_t *str)
1986
0
{
1987
0
    const unsigned char *u = (const unsigned char *) (str->s);
1988
0
    return (str->l > 0 && str->s)? is_utf16_text(u, u + str->l) : 0;
1989
0
}
1990
1991
// For VCF/BCF backward sweeper. Not exposing these functions because their
1992
// future is uncertain. Things will probably have to change with hFILE...
1993
BGZF *hts_get_bgzfp(htsFile *fp)
1994
0
{
1995
0
    if (fp->is_bgzf)
1996
0
        return fp->fp.bgzf;
1997
0
    else
1998
0
        return NULL;
1999
0
}
2000
int hts_useek(htsFile *fp, off_t uoffset, int where)
2001
0
{
2002
0
    if (fp->is_bgzf)
2003
0
        return bgzf_useek(fp->fp.bgzf, uoffset, where);
2004
0
    else
2005
0
        return (hseek(fp->fp.hfile, uoffset, SEEK_SET) >= 0)? 0 : -1;
2006
0
}
2007
off_t hts_utell(htsFile *fp)
2008
0
{
2009
0
    if (fp->is_bgzf)
2010
0
        return bgzf_utell(fp->fp.bgzf);
2011
0
    else
2012
0
        return htell(fp->fp.hfile);
2013
0
}
2014
2015
int hts_getline(htsFile *fp, int delimiter, kstring_t *str)
2016
115M
{
2017
115M
    int ret;
2018
115M
    if (! (delimiter == KS_SEP_LINE || delimiter == '\n')) {
2019
0
        hts_log_error("Unexpected delimiter %d", delimiter);
2020
0
        abort();
2021
0
    }
2022
2023
115M
    switch (fp->format.compression) {
2024
669k
    case no_compression:
2025
669k
        str->l = 0;
2026
669k
        ret = kgetline2(str, (kgets_func2 *) hgetln, fp->fp.hfile);
2027
669k
        if (ret >= 0) ret = (str->l <= INT_MAX)? (int) str->l : INT_MAX;
2028
16.4k
        else if (herrno(fp->fp.hfile)) ret = -2, errno = herrno(fp->fp.hfile);
2029
16.4k
        else ret = -1;
2030
669k
        break;
2031
2032
115M
    case gzip:
2033
115M
    case bgzf:
2034
115M
        ret = bgzf_getline(fp->fp.bgzf, '\n', str);
2035
115M
        break;
2036
2037
0
    default:
2038
0
        abort();
2039
115M
    }
2040
2041
115M
    ++fp->lineno;
2042
115M
    return ret;
2043
115M
}
2044
2045
char **hts_readlist(const char *string, int is_file, int *_n)
2046
0
{
2047
0
    unsigned int m = 0, n = 0;
2048
0
    char **s = 0, **s_new;
2049
0
    if ( is_file )
2050
0
    {
2051
0
        BGZF *fp = bgzf_open(string, "r");
2052
0
        if ( !fp ) return NULL;
2053
2054
0
        kstring_t str;
2055
0
        int ret;
2056
0
        str.s = 0; str.l = str.m = 0;
2057
0
        while ((ret = bgzf_getline(fp, '\n', &str)) >= 0)
2058
0
        {
2059
0
            if (str.l == 0) continue;
2060
0
            if (n == 0 && hts_is_utf16_text(&str))
2061
0
                hts_log_warning("'%s' appears to be encoded as UTF-16", string);
2062
0
            if (hts_resize(char*, n + 1, &m, &s, 0) < 0)
2063
0
                goto err;
2064
0
            s[n] = strdup(str.s);
2065
0
            if (!s[n])
2066
0
                goto err;
2067
0
            n++;
2068
0
        }
2069
0
        if (ret < -1) // Read error
2070
0
            goto err;
2071
0
        bgzf_close(fp);
2072
0
        free(str.s);
2073
0
    }
2074
0
    else
2075
0
    {
2076
0
        const char *q = string, *p = string;
2077
0
        while ( 1 )
2078
0
        {
2079
0
            if (*p == ',' || *p == 0)
2080
0
            {
2081
0
                if (hts_resize(char*, n + 1, &m, &s, 0) < 0)
2082
0
                    goto err;
2083
0
                s[n] = (char*)calloc(p - q + 1, 1);
2084
0
                if (!s[n])
2085
0
                    goto err;
2086
0
                strncpy(s[n++], q, p - q);
2087
0
                q = p + 1;
2088
0
            }
2089
0
            if ( !*p ) break;
2090
0
            p++;
2091
0
        }
2092
0
    }
2093
    // Try to shrink s to the minimum size needed
2094
0
    s_new = (char**)realloc(s, n * sizeof(char*));
2095
0
    if (!s_new)
2096
0
        goto err;
2097
2098
0
    s = s_new;
2099
0
    assert(n < INT_MAX); // hts_resize() should ensure this
2100
0
    *_n = n;
2101
0
    return s;
2102
2103
0
 err:
2104
0
    for (m = 0; m < n; m++)
2105
0
        free(s[m]);
2106
0
    free(s);
2107
0
    return NULL;
2108
0
}
2109
2110
char **hts_readlines(const char *fn, int *_n)
2111
0
{
2112
0
    unsigned int m = 0, n = 0;
2113
0
    char **s = 0, **s_new;
2114
0
    BGZF *fp = bgzf_open(fn, "r");
2115
0
    if ( fp ) { // read from file
2116
0
        kstring_t str;
2117
0
        int ret;
2118
0
        str.s = 0; str.l = str.m = 0;
2119
0
        while ((ret = bgzf_getline(fp, '\n', &str)) >= 0) {
2120
0
            if (str.l == 0) continue;
2121
0
            if (n == 0 && hts_is_utf16_text(&str))
2122
0
                hts_log_warning("'%s' appears to be encoded as UTF-16", fn);
2123
0
            if (hts_resize(char *, n + 1, &m, &s, 0) < 0)
2124
0
                goto err;
2125
0
            s[n] = strdup(str.s);
2126
0
            if (!s[n])
2127
0
                goto err;
2128
0
            n++;
2129
0
        }
2130
0
        if (ret < -1) // Read error
2131
0
            goto err;
2132
0
        bgzf_close(fp);
2133
0
        free(str.s);
2134
0
    } else if (*fn == ':') { // read from string
2135
0
        const char *q, *p;
2136
0
        for (q = p = fn + 1;; ++p)
2137
0
            if (*p == ',' || *p == 0) {
2138
0
                if (hts_resize(char *, n + 1, &m, &s, 0) < 0)
2139
0
                    goto err;
2140
0
                s[n] = (char*)calloc(p - q + 1, 1);
2141
0
                if (!s[n])
2142
0
                    goto err;
2143
0
                strncpy(s[n++], q, p - q);
2144
0
                q = p + 1;
2145
0
                if (*p == 0) break;
2146
0
            }
2147
0
    } else return 0;
2148
    // Try to shrink s to the minimum size needed
2149
0
    s_new = (char**)realloc(s, n * sizeof(char*));
2150
0
    if (!s_new)
2151
0
        goto err;
2152
2153
0
    s = s_new;
2154
0
    assert(n < INT_MAX); // hts_resize() should ensure this
2155
0
    *_n = n;
2156
0
    return s;
2157
2158
0
 err:
2159
0
    for (m = 0; m < n; m++)
2160
0
        free(s[m]);
2161
0
    free(s);
2162
0
    return NULL;
2163
0
}
2164
2165
// DEPRECATED: To be removed in a future HTSlib release
2166
int hts_file_type(const char *fname)
2167
0
{
2168
0
    int len = strlen(fname);
2169
0
    if ( !strcasecmp(".vcf.gz",fname+len-7) ) return FT_VCF_GZ;
2170
0
    if ( !strcasecmp(".vcf",fname+len-4) ) return FT_VCF;
2171
0
    if ( !strcasecmp(".bcf",fname+len-4) ) return FT_BCF_GZ;
2172
0
    if ( !strcmp("-",fname) ) return FT_STDIN;
2173
2174
0
    hFILE *f = hopen(fname, "r");
2175
0
    if (f == NULL) return 0;
2176
2177
0
    htsFormat fmt;
2178
0
    if (hts_detect_format2(f, fname, &fmt) < 0) { hclose_abruptly(f); return 0; }
2179
0
    if (hclose(f) < 0) return 0;
2180
2181
0
    switch (fmt.format) {
2182
0
    case vcf: return (fmt.compression == no_compression)? FT_VCF : FT_VCF_GZ;
2183
0
    case bcf: return (fmt.compression == no_compression)? FT_BCF : FT_BCF_GZ;
2184
0
    default:  return 0;
2185
0
    }
2186
0
}
2187
2188
int hts_check_EOF(htsFile *fp)
2189
0
{
2190
0
    if (fp->format.compression == bgzf)
2191
0
        return bgzf_check_EOF(hts_get_bgzfp(fp));
2192
0
    else if (fp->format.format == cram)
2193
0
        return cram_check_EOF(fp->fp.cram);
2194
0
    else
2195
0
        return 3;
2196
0
}
2197
2198
2199
/****************
2200
 *** Indexing ***
2201
 ****************/
2202
2203
0
#define HTS_MIN_MARKER_DIST 0x10000
2204
2205
// Finds the special meta bin
2206
//  ((1<<(3 * n_lvls + 3)) - 1) / 7 + 1
2207
0
#define META_BIN(idx) ((idx)->n_bins + 1)
2208
2209
0
#define pair64_lt(a,b) ((a).u < (b).u)
2210
0
#define pair64max_lt(a,b) ((a).u < (b).u || \
2211
0
                           ((a).u == (b).u && (a).max < (b).max))
2212
2213
0
KSORT_INIT_STATIC(_off, hts_pair64_t, pair64_lt)
2214
0
KSORT_INIT_STATIC(_off_max, hts_pair64_max_t, pair64max_lt)
2215
2216
typedef struct {
2217
    int32_t m, n;
2218
    uint64_t loff;
2219
    hts_pair64_t *list;
2220
} bins_t;
2221
2222
KHASH_MAP_INIT_INT(bin, bins_t)
2223
typedef khash_t(bin) bidx_t;
2224
2225
typedef struct {
2226
    hts_pos_t n, m;
2227
    uint64_t *offset;
2228
} lidx_t;
2229
2230
struct hts_idx_t {
2231
    int fmt, min_shift, n_lvls, n_bins;
2232
    uint32_t l_meta;
2233
    int32_t n, m;
2234
    uint64_t n_no_coor;
2235
    bidx_t **bidx;
2236
    lidx_t *lidx;
2237
    uint8_t *meta; // MUST have a terminating NUL on the end
2238
    int tbi_n, last_tbi_tid;
2239
    struct {
2240
        uint32_t last_bin, save_bin;
2241
        hts_pos_t last_coor;
2242
        int last_tid, save_tid, finished;
2243
        uint64_t last_off, save_off;
2244
        uint64_t off_beg, off_end;
2245
        uint64_t n_mapped, n_unmapped;
2246
    } z; // keep internal states
2247
    BGZF *otf_fp;  // Index on-the-fly output file
2248
};
2249
2250
0
static char * idx_format_name(int fmt) {
2251
0
    switch (fmt) {
2252
0
        case HTS_FMT_CSI: return "csi";
2253
0
        case HTS_FMT_BAI: return "bai";
2254
0
        case HTS_FMT_TBI: return "tbi";
2255
0
        case HTS_FMT_CRAI: return "crai";
2256
0
        default: return "unknown";
2257
0
    }
2258
0
}
2259
2260
#ifdef DEBUG_INDEX
2261
static void idx_dump(const hts_idx_t *idx) {
2262
    int i;
2263
    int64_t j;
2264
2265
    if (!idx) fprintf(stderr, "Null index\n");
2266
2267
    fprintf(stderr, "format='%s', min_shift=%d, n_lvls=%d, n_bins=%d, l_meta=%u ",
2268
            idx_format_name(idx->fmt), idx->min_shift, idx->n_lvls, idx->n_bins, idx->l_meta);
2269
    fprintf(stderr, "n=%d, m=%d, n_no_coor=%"PRIu64"\n", idx->n, idx->m, idx->n_no_coor);
2270
    for (i = 0; i < idx->n; i++) {
2271
        bidx_t *bidx = idx->bidx[i];
2272
        lidx_t *lidx = &idx->lidx[i];
2273
        if (bidx) {
2274
            fprintf(stderr, "======== BIN Index - tid=%d, n_buckets=%d, size=%d\n", i, bidx->n_buckets, bidx->size);
2275
            int b;
2276
            for (b = 0; b < META_BIN(idx); b++) {
2277
                khint_t k;
2278
                if ((k = kh_get(bin, bidx, b)) != kh_end(bidx)) {
2279
                    bins_t *entries = &kh_value(bidx, k);
2280
                    int l = hts_bin_level(b);
2281
                    int64_t bin_width = 1LL << ((idx->n_lvls - l) * 3 + idx->min_shift);
2282
                    fprintf(stderr, "\tbin=%d, level=%d, parent=%d, n_chunks=%d, loff=%"PRIu64", interval=[%"PRId64" - %"PRId64"]\n",
2283
                        b, l, hts_bin_parent(b), entries->n, entries->loff, (b-hts_bin_first(l))*bin_width+1, (b+1-hts_bin_first(l))*bin_width);
2284
                    for (j = 0; j < entries->n; j++)
2285
                        fprintf(stderr, "\t\tchunk=%"PRId64", u=%"PRIu64", v=%"PRIu64"\n", j, entries->list[j].u, entries->list[j].v);
2286
                }
2287
            }
2288
        }
2289
        if (lidx) {
2290
            fprintf(stderr, "======== LINEAR Index - tid=%d, n_values=%"PRId64"\n", i, lidx->n);
2291
            for (j = 0; j < lidx->n; j++) {
2292
                fprintf(stderr, "\t\tentry=%"PRId64", offset=%"PRIu64", interval=[%"PRId64" - %"PRId64"]\n",
2293
                    j, lidx->offset[j], j*(1<<idx->min_shift)+1, (j+1)*(1<<idx->min_shift));
2294
            }
2295
        }
2296
    }
2297
}
2298
#endif
2299
2300
static inline int insert_to_b(bidx_t *b, int bin, uint64_t beg, uint64_t end)
2301
0
{
2302
0
    khint_t k;
2303
0
    bins_t *l;
2304
0
    int absent;
2305
0
    k = kh_put(bin, b, bin, &absent);
2306
0
    if (absent < 0) return -1; // Out of memory
2307
0
    l = &kh_value(b, k);
2308
0
    if (absent) {
2309
0
        l->m = 1; l->n = 0;
2310
0
        l->list = (hts_pair64_t*)calloc(l->m, sizeof(hts_pair64_t));
2311
0
        if (!l->list) {
2312
0
            kh_del(bin, b, k);
2313
0
            return -1;
2314
0
        }
2315
0
    } else if (l->n == l->m) {
2316
0
        uint32_t new_m = l->m ? l->m << 1 : 1;
2317
0
        hts_pair64_t *new_list = realloc(l->list, new_m * sizeof(hts_pair64_t));
2318
0
        if (!new_list) return -1;
2319
0
        l->list = new_list;
2320
0
        l->m = new_m;
2321
0
    }
2322
0
    l->list[l->n].u = beg;
2323
0
    l->list[l->n++].v = end;
2324
0
    return 0;
2325
0
}
2326
2327
static inline int insert_to_l(lidx_t *l, int64_t _beg, int64_t _end, uint64_t offset, int min_shift)
2328
0
{
2329
0
    int i;
2330
0
    hts_pos_t beg, end;
2331
0
    beg = _beg >> min_shift;
2332
0
    end = (_end - 1) >> min_shift;
2333
0
    if (l->m < end + 1) {
2334
0
        size_t new_m = l->m * 2 > end + 1 ? l->m * 2 : end + 1;
2335
0
        uint64_t *new_offset;
2336
2337
0
        new_offset = (uint64_t*)realloc(l->offset, new_m * sizeof(uint64_t));
2338
0
        if (!new_offset) return -1;
2339
2340
        // fill unused memory with (uint64_t)-1
2341
0
        memset(new_offset + l->m, 0xff, sizeof(uint64_t) * (new_m - l->m));
2342
0
        l->m = new_m;
2343
0
        l->offset = new_offset;
2344
0
    }
2345
0
    for (i = beg; i <= end; ++i) {
2346
0
        if (l->offset[i] == (uint64_t)-1) l->offset[i] = offset;
2347
0
    }
2348
0
    if (l->n < end + 1) l->n = end + 1;
2349
0
    return 0;
2350
0
}
2351
2352
hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls)
2353
0
{
2354
0
    hts_idx_t *idx;
2355
0
    idx = (hts_idx_t*)calloc(1, sizeof(hts_idx_t));
2356
0
    if (idx == NULL) return NULL;
2357
0
    idx->fmt = fmt;
2358
0
    idx->min_shift = min_shift;
2359
0
    idx->n_lvls = n_lvls;
2360
0
    idx->n_bins = ((1<<(3 * n_lvls + 3)) - 1) / 7;
2361
0
    idx->z.save_tid = idx->z.last_tid = -1;
2362
0
    idx->z.save_bin = idx->z.last_bin = 0xffffffffu;
2363
0
    idx->z.save_off = idx->z.last_off = idx->z.off_beg = idx->z.off_end = offset0;
2364
0
    idx->z.last_coor = 0xffffffffu;
2365
0
    if (n) {
2366
0
        idx->n = idx->m = n;
2367
0
        idx->bidx = (bidx_t**)calloc(n, sizeof(bidx_t*));
2368
0
        if (idx->bidx == NULL) { free(idx); return NULL; }
2369
0
        idx->lidx = (lidx_t*) calloc(n, sizeof(lidx_t));
2370
0
        if (idx->lidx == NULL) { free(idx->bidx); free(idx); return NULL; }
2371
0
    }
2372
0
    idx->tbi_n = -1;
2373
0
    idx->last_tbi_tid = -1;
2374
0
    idx->otf_fp = NULL;
2375
0
    return idx;
2376
0
}
2377
2378
static void update_loff(hts_idx_t *idx, int i, int free_lidx)
2379
0
{
2380
0
    bidx_t *bidx = idx->bidx[i];
2381
0
    lidx_t *lidx = &idx->lidx[i];
2382
0
    khint_t k;
2383
0
    int l;
2384
    // the last entry is always valid
2385
0
    for (l=lidx->n-2; l >= 0; l--) {
2386
0
        if (lidx->offset[l] == (uint64_t)-1)
2387
0
            lidx->offset[l] = lidx->offset[l+1];
2388
0
    }
2389
0
    if (bidx == 0) return;
2390
0
    for (k = kh_begin(bidx); k != kh_end(bidx); ++k) // set loff
2391
0
        if (kh_exist(bidx, k))
2392
0
        {
2393
0
            if ( kh_key(bidx, k) < idx->n_bins )
2394
0
            {
2395
0
                int bot_bin = hts_bin_bot(kh_key(bidx, k), idx->n_lvls);
2396
                // disable linear index if bot_bin out of bounds
2397
0
                kh_val(bidx, k).loff = bot_bin < lidx->n ? lidx->offset[bot_bin] : 0;
2398
0
            }
2399
0
            else
2400
0
                kh_val(bidx, k).loff = 0;
2401
0
        }
2402
0
    if (free_lidx) {
2403
0
        free(lidx->offset);
2404
0
        lidx->m = lidx->n = 0;
2405
0
        lidx->offset = 0;
2406
0
    }
2407
0
}
2408
2409
static int compress_binning(hts_idx_t *idx, int i)
2410
0
{
2411
0
    bidx_t *bidx = idx->bidx[i];
2412
0
    khint_t k;
2413
0
    int l, m;
2414
0
    if (bidx == 0) return 0;
2415
    // merge a bin to its parent if the bin is too small
2416
0
    for (l = idx->n_lvls; l > 0; --l) {
2417
0
        unsigned start = hts_bin_first(l);
2418
0
        for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
2419
0
            bins_t *p, *q;
2420
0
            if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins || kh_key(bidx, k) < start) continue;
2421
0
            p = &kh_value(bidx, k);
2422
0
            if (l < idx->n_lvls && p->n > 1) ks_introsort(_off, p->n, p->list);
2423
0
            if ((p->list[p->n - 1].v>>16) - (p->list[0].u>>16) < HTS_MIN_MARKER_DIST) {
2424
0
                khint_t kp;
2425
0
                kp = kh_get(bin, bidx, hts_bin_parent(kh_key(bidx, k)));
2426
0
                if (kp == kh_end(bidx)) continue;
2427
0
                q = &kh_val(bidx, kp);
2428
0
                if (q->n + p->n > q->m) {
2429
0
                    uint32_t new_m = q->n + p->n;
2430
0
                    hts_pair64_t *new_list;
2431
0
                    kroundup32(new_m);
2432
0
                    if (new_m > INT32_MAX) return -1; // Limited by index format
2433
0
                    new_list = realloc(q->list, new_m * sizeof(*new_list));
2434
0
                    if (!new_list) return -1;
2435
0
                    q->m = new_m;
2436
0
                    q->list = new_list;
2437
0
                }
2438
0
                memcpy(q->list + q->n, p->list, p->n * sizeof(hts_pair64_t));
2439
0
                q->n += p->n;
2440
0
                free(p->list);
2441
0
                kh_del(bin, bidx, k);
2442
0
            }
2443
0
        }
2444
0
    }
2445
0
    k = kh_get(bin, bidx, 0);
2446
0
    if (k != kh_end(bidx)) ks_introsort(_off, kh_val(bidx, k).n, kh_val(bidx, k).list);
2447
    // merge adjacent chunks that start from the same BGZF block
2448
0
    for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
2449
0
        bins_t *p;
2450
0
        if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins) continue;
2451
0
        p = &kh_value(bidx, k);
2452
0
        for (l = 1, m = 0; l < p->n; ++l) {
2453
0
            if (p->list[m].v>>16 >= p->list[l].u>>16) {
2454
0
                if (p->list[m].v < p->list[l].v) p->list[m].v = p->list[l].v;
2455
0
            } else p->list[++m] = p->list[l];
2456
0
        }
2457
0
        p->n = m + 1;
2458
0
    }
2459
0
    return 0;
2460
0
}
2461
2462
int hts_idx_finish(hts_idx_t *idx, uint64_t final_offset)
2463
0
{
2464
0
    int i, ret = 0;
2465
0
    if (idx == NULL || idx->z.finished) return 0; // do not run this function on an empty index or multiple times
2466
0
    if (idx->z.save_tid >= 0) {
2467
0
        ret |= insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin, idx->z.save_off, final_offset);
2468
0
        ret |= insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.off_beg, final_offset);
2469
0
        ret |= insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.n_mapped, idx->z.n_unmapped);
2470
0
    }
2471
0
    for (i = 0; i < idx->n; ++i) {
2472
0
        update_loff(idx, i, (idx->fmt == HTS_FMT_CSI));
2473
0
        ret |= compress_binning(idx, i);
2474
0
    }
2475
0
    idx->z.finished = 1;
2476
2477
0
    return ret;
2478
0
}
2479
2480
static inline hts_pos_t hts_idx_maxpos(const hts_idx_t *idx)
2481
0
{
2482
0
    return hts_bin_maxpos(idx->min_shift, idx->n_lvls);
2483
0
}
2484
2485
int hts_idx_check_range(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end)
2486
0
{
2487
0
    hts_pos_t maxpos = hts_idx_maxpos(idx);
2488
0
    if (tid < 0 || (beg <= maxpos && end <= maxpos))
2489
0
        return 0;
2490
2491
0
    if (idx->fmt == HTS_FMT_CSI) {
2492
0
        hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos" "
2493
0
                      "cannot be stored in a csi index with these parameters. "
2494
0
                      "Please use a larger min_shift or depth",
2495
0
                      beg, end);
2496
0
    } else {
2497
0
        hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos
2498
0
                      " cannot be stored in a %s index. Try using a csi index",
2499
0
                      beg, end, idx_format_name(idx->fmt));
2500
0
    }
2501
0
    errno = ERANGE;
2502
0
    return -1;
2503
0
}
2504
2505
int hts_idx_push(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, uint64_t offset, int is_mapped)
2506
0
{
2507
0
    int bin;
2508
0
    if (tid<0) beg = -1, end = 0;
2509
0
    if (hts_idx_check_range(idx, tid, beg, end) < 0)
2510
0
        return -1;
2511
0
    if (tid >= idx->m) { // enlarge the index
2512
0
        uint32_t new_m = idx->m * 2 > tid + 1 ? idx->m * 2 : tid + 1;
2513
0
        bidx_t **new_bidx;
2514
0
        lidx_t *new_lidx;
2515
2516
0
        new_bidx = (bidx_t**)realloc(idx->bidx, new_m * sizeof(bidx_t*));
2517
0
        if (!new_bidx) return -1;
2518
0
        idx->bidx = new_bidx;
2519
2520
0
        new_lidx = (lidx_t*) realloc(idx->lidx, new_m * sizeof(lidx_t));
2521
0
        if (!new_lidx) return -1;
2522
0
        idx->lidx = new_lidx;
2523
2524
0
        memset(&idx->bidx[idx->m], 0, (new_m - idx->m) * sizeof(bidx_t*));
2525
0
        memset(&idx->lidx[idx->m], 0, (new_m - idx->m) * sizeof(lidx_t));
2526
0
        idx->m = new_m;
2527
0
    }
2528
0
    if (idx->n < tid + 1) idx->n = tid + 1;
2529
0
    if (idx->z.finished) return 0;
2530
0
    if (idx->z.last_tid != tid || (idx->z.last_tid >= 0 && tid < 0)) { // change of chromosome
2531
0
        if ( tid>=0 && idx->n_no_coor )
2532
0
        {
2533
0
            hts_log_error("NO_COOR reads not in a single block at the end %d %d", tid, idx->z.last_tid);
2534
0
            return -1;
2535
0
        }
2536
0
        if (tid>=0 && idx->bidx[tid] != 0)
2537
0
        {
2538
0
            hts_log_error("Chromosome blocks not continuous");
2539
0
            return -1;
2540
0
        }
2541
0
        idx->z.last_tid = tid;
2542
0
        idx->z.last_bin = 0xffffffffu;
2543
0
    } else if (tid >= 0 && idx->z.last_coor > beg) { // test if positions are out of order
2544
0
        hts_log_error("Unsorted positions on sequence #%d: %"PRIhts_pos" followed by %"PRIhts_pos, tid+1, idx->z.last_coor+1, beg+1);
2545
0
        return -1;
2546
0
    }
2547
0
    if (end < beg) {
2548
        // Malformed ranges are errors. (Empty ranges (beg==end) are unusual but acceptable.)
2549
0
        hts_log_error("Invalid record on sequence #%d: end %"PRId64" < begin %"PRId64, tid+1, end, beg+1);
2550
0
        return -1;
2551
0
    }
2552
0
    if ( tid>=0 )
2553
0
    {
2554
0
        if (idx->bidx[tid] == 0) idx->bidx[tid] = kh_init(bin);
2555
        // shoehorn [-1,0) (VCF POS=0) into the leftmost bottom-level bin
2556
0
        if (beg < 0)  beg = 0;
2557
0
        if (end <= 0) end = 1;
2558
        // idx->z.last_off points to the start of the current record
2559
0
        if (insert_to_l(&idx->lidx[tid], beg, end,
2560
0
                        idx->z.last_off, idx->min_shift) < 0) return -1;
2561
0
    }
2562
0
    else idx->n_no_coor++;
2563
0
    bin = hts_reg2bin(beg, end, idx->min_shift, idx->n_lvls);
2564
0
    if ((int)idx->z.last_bin != bin) { // then possibly write the binning index
2565
0
        if (idx->z.save_bin != 0xffffffffu) { // save_bin==0xffffffffu only happens to the first record
2566
0
            if (insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin,
2567
0
                            idx->z.save_off, idx->z.last_off) < 0) return -1;
2568
0
        }
2569
0
        if (idx->z.last_bin == 0xffffffffu && idx->z.save_bin != 0xffffffffu) { // change of chr; keep meta information
2570
0
            idx->z.off_end = idx->z.last_off;
2571
0
            if (insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx),
2572
0
                            idx->z.off_beg, idx->z.off_end) < 0) return -1;
2573
0
            if (insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx),
2574
0
                            idx->z.n_mapped, idx->z.n_unmapped) < 0) return -1;
2575
0
            idx->z.n_mapped = idx->z.n_unmapped = 0;
2576
0
            idx->z.off_beg = idx->z.off_end;
2577
0
        }
2578
0
        idx->z.save_off = idx->z.last_off;
2579
0
        idx->z.save_bin = idx->z.last_bin = bin;
2580
0
        idx->z.save_tid = tid;
2581
0
    }
2582
0
    if (is_mapped) ++idx->z.n_mapped;
2583
0
    else ++idx->z.n_unmapped;
2584
0
    idx->z.last_off = offset;
2585
0
    idx->z.last_coor = beg;
2586
0
    return 0;
2587
0
}
2588
2589
// Needed for TBI only.  Ensure 'tid' with 'name' is in the index meta data.
2590
// idx->meta needs to have been initialised first with an appropriate Tabix
2591
// configuration via hts_idx_set_meta.
2592
//
2593
// NB number of references (first 4 bytes of tabix header) aren't in
2594
// idx->meta, but held in idx->n instead.
2595
0
int hts_idx_tbi_name(hts_idx_t *idx, int tid, const char *name) {
2596
    // Horrid - we have to map incoming tid to a tbi alternative tid.
2597
    // This is because TBI counts tids by "covered" refs while everything
2598
    // else counts by Nth SQ/contig record in header.
2599
0
    if (tid == idx->last_tbi_tid || tid < 0 || !name)
2600
0
        return idx->tbi_n;
2601
2602
0
    uint32_t len = strlen(name)+1;
2603
0
    uint8_t *tmp = (uint8_t *)realloc(idx->meta, idx->l_meta + len);
2604
0
    if (!tmp)
2605
0
        return -1;
2606
2607
    // Append name
2608
0
    idx->meta = tmp;
2609
0
    strcpy((char *)idx->meta + idx->l_meta, name);
2610
0
    idx->l_meta += len;
2611
2612
    // Update seq length
2613
0
    u32_to_le(le_to_u32(idx->meta+24)+len, idx->meta+24);
2614
2615
0
    idx->last_tbi_tid = tid;
2616
0
    return ++idx->tbi_n;
2617
0
}
2618
2619
// When doing samtools index we have a read_bam / hts_idx_push(bgzf_tell())
2620
// loop.  idx->z.last_off is the previous bzgf_tell location, so we know
2621
// the location the current bam record started at as well as where it ends.
2622
//
2623
// When building an index on the fly via a write_bam / hts_idx_push loop,
2624
// this isn't quite identical as we may amend the virtual coord returned
2625
// by bgzf_tell to the start of a new block if the next bam struct doesn't
2626
// fit.  It's essentially the same thing, but for bit-identical indices
2627
// we need to amend the idx->z.last_off when we know we're starting a new
2628
// block.
2629
void hts_idx_amend_last(hts_idx_t *idx, uint64_t offset)
2630
0
{
2631
0
    idx->z.last_off = offset;
2632
0
}
2633
2634
void hts_idx_destroy(hts_idx_t *idx)
2635
111k
{
2636
111k
    khint_t k;
2637
111k
    int i;
2638
111k
    if (idx == 0) return;
2639
2640
    // For HTS_FMT_CRAI, idx actually points to a different type -- see sam.c
2641
0
    if (idx->fmt == HTS_FMT_CRAI) {
2642
0
        hts_cram_idx_t *cidx = (hts_cram_idx_t *) idx;
2643
0
        cram_index_free(cidx->cram);
2644
0
        free(cidx);
2645
0
        return;
2646
0
    }
2647
2648
0
    for (i = 0; i < idx->m; ++i) {
2649
0
        bidx_t *bidx = idx->bidx[i];
2650
0
        free(idx->lidx[i].offset);
2651
0
        if (bidx == 0) continue;
2652
0
        for (k = kh_begin(bidx); k != kh_end(bidx); ++k)
2653
0
            if (kh_exist(bidx, k))
2654
0
                free(kh_value(bidx, k).list);
2655
0
        kh_destroy(bin, bidx);
2656
0
    }
2657
0
    free(idx->bidx); free(idx->lidx); free(idx->meta);
2658
0
    free(idx);
2659
0
}
2660
2661
0
int hts_idx_fmt(hts_idx_t *idx) {
2662
0
    return idx->fmt;
2663
0
}
2664
2665
// The optimizer eliminates these ed_is_big() calls; still it would be good to
2666
// TODO Determine endianness at configure- or compile-time
2667
2668
static inline ssize_t HTS_RESULT_USED idx_write_int32(BGZF *fp, int32_t x)
2669
0
{
2670
0
    if (ed_is_big()) x = ed_swap_4(x);
2671
0
    return bgzf_write(fp, &x, sizeof x);
2672
0
}
2673
2674
static inline ssize_t HTS_RESULT_USED idx_write_uint32(BGZF *fp, uint32_t x)
2675
0
{
2676
0
    if (ed_is_big()) x = ed_swap_4(x);
2677
0
    return bgzf_write(fp, &x, sizeof x);
2678
0
}
2679
2680
static inline ssize_t HTS_RESULT_USED idx_write_uint64(BGZF *fp, uint64_t x)
2681
0
{
2682
0
    if (ed_is_big()) x = ed_swap_8(x);
2683
0
    return bgzf_write(fp, &x, sizeof x);
2684
0
}
2685
2686
static inline void swap_bins(bins_t *p)
2687
0
{
2688
0
    int i;
2689
0
    for (i = 0; i < p->n; ++i) {
2690
0
        ed_swap_8p(&p->list[i].u);
2691
0
        ed_swap_8p(&p->list[i].v);
2692
0
    }
2693
0
}
2694
2695
static int need_idx_ugly_delay_hack(const hts_idx_t *idx)
2696
0
{
2697
    // Ugly hack for on-the-fly BAI indexes.  As these are uncompressed,
2698
    // we need to delay writing a few bytes of data until file close
2699
    // so that we have something to force a modification time update.
2700
    //
2701
    // (For compressed indexes like CSI, the BGZF EOF block serves the same
2702
    // purpose).
2703
0
    return idx->otf_fp && !idx->otf_fp->is_compressed;
2704
0
}
2705
2706
static int idx_save_core(const hts_idx_t *idx, BGZF *fp, int fmt)
2707
0
{
2708
0
    int32_t i, j;
2709
2710
0
    #define check(ret) if ((ret) < 0) return -1
2711
2712
    // VCF TBI/CSI only writes IDs for non-empty bins (ie covered references)
2713
    //
2714
    // NOTE: CSI meta is undefined in spec, so this code has an assumption
2715
    // that we're only using it for Tabix data.
2716
0
    int nids = idx->n;
2717
0
    if (idx->meta && idx->l_meta >= 4 && le_to_u32(idx->meta) == TBX_VCF) {
2718
0
        for (i = nids = 0; i < idx->n; ++i) {
2719
0
            if (idx->bidx[i])
2720
0
                nids++;
2721
0
        }
2722
0
    }
2723
0
    check(idx_write_int32(fp, nids));
2724
0
    if (fmt == HTS_FMT_TBI && idx->l_meta)
2725
0
        check(bgzf_write(fp, idx->meta, idx->l_meta));
2726
2727
0
    for (i = 0; i < idx->n; ++i) {
2728
0
        khint_t k;
2729
0
        bidx_t *bidx = idx->bidx[i];
2730
0
        lidx_t *lidx = &idx->lidx[i];
2731
2732
        // write binning index
2733
0
        if (nids == idx->n || bidx)
2734
0
            check(idx_write_int32(fp, bidx? kh_size(bidx) : 0));
2735
0
        if (bidx)
2736
0
            for (k = kh_begin(bidx); k != kh_end(bidx); ++k)
2737
0
                if (kh_exist(bidx, k)) {
2738
0
                    bins_t *p = &kh_value(bidx, k);
2739
0
                    check(idx_write_uint32(fp, kh_key(bidx, k)));
2740
0
                    if (fmt == HTS_FMT_CSI) check(idx_write_uint64(fp, p->loff));
2741
                    //int j;for(j=0;j<p->n;++j)fprintf(stderr,"%d,%llx,%d,%llx:%llx\n",kh_key(bidx,k),kh_val(bidx, k).loff,j,p->list[j].u,p->list[j].v);
2742
0
                    check(idx_write_int32(fp, p->n));
2743
0
                    for (j = 0; j < p->n; ++j) {
2744
                        //fprintf(stderr, "\t%ld\t%ld\n", p->list[j].u, p->list[j].v);
2745
0
                        check(idx_write_uint64(fp, p->list[j].u));
2746
0
                        check(idx_write_uint64(fp, p->list[j].v));
2747
0
                    }
2748
0
                }
2749
2750
        // write linear index
2751
0
        if (fmt != HTS_FMT_CSI) {
2752
0
            check(idx_write_int32(fp, lidx->n));
2753
0
            for (j = 0; j < lidx->n; ++j)
2754
0
                check(idx_write_uint64(fp, lidx->offset[j]));
2755
0
        }
2756
0
    }
2757
2758
0
    if (!need_idx_ugly_delay_hack(idx)) {
2759
        // Write this for compressed (CSI) indexes, but for BAI we
2760
        // need to save a bit for later.  See hts_idx_close_otf_fp()
2761
0
        check(idx_write_uint64(fp, idx->n_no_coor));
2762
0
    }
2763
2764
#ifdef DEBUG_INDEX
2765
    idx_dump(idx);
2766
#endif
2767
2768
0
    return 0;
2769
0
    #undef check
2770
0
}
2771
2772
int hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt)
2773
0
{
2774
0
    int ret, save;
2775
0
    if (idx == NULL || fn == NULL) { errno = EINVAL; return -1; }
2776
0
    char *fnidx = (char*)calloc(1, strlen(fn) + 5);
2777
0
    if (fnidx == NULL) return -1;
2778
2779
0
    strcpy(fnidx, fn);
2780
0
    switch (fmt) {
2781
0
    case HTS_FMT_BAI: strcat(fnidx, ".bai"); break;
2782
0
    case HTS_FMT_CSI: strcat(fnidx, ".csi"); break;
2783
0
    case HTS_FMT_TBI: strcat(fnidx, ".tbi"); break;
2784
0
    default: abort();
2785
0
    }
2786
2787
0
    ret = hts_idx_save_as(idx, fn, fnidx, fmt);
2788
0
    save = errno;
2789
0
    free(fnidx);
2790
0
    errno = save;
2791
0
    return ret;
2792
0
}
2793
2794
static int hts_idx_write_out(const hts_idx_t *idx, BGZF *fp, int fmt)
2795
0
{
2796
0
    #define check(ret) if ((ret) < 0) return -1
2797
2798
0
    if (fmt == HTS_FMT_CSI) {
2799
0
        check(bgzf_write(fp, "CSI\1", 4));
2800
0
        check(idx_write_int32(fp, idx->min_shift));
2801
0
        check(idx_write_int32(fp, idx->n_lvls));
2802
0
        check(idx_write_uint32(fp, idx->l_meta));
2803
0
        if (idx->l_meta) check(bgzf_write(fp, idx->meta, idx->l_meta));
2804
0
    } else if (fmt == HTS_FMT_TBI) {
2805
0
        check(bgzf_write(fp, "TBI\1", 4));
2806
0
    } else if (fmt == HTS_FMT_BAI) {
2807
0
        check(bgzf_write(fp, "BAI\1", 4));
2808
0
    } else abort();
2809
2810
0
    check(idx_save_core(idx, fp, fmt));
2811
2812
0
    #undef check
2813
0
    return 0;
2814
0
}
2815
2816
int hts_idx_save_as(const hts_idx_t *idx, const char *fn, const char *fnidx, int fmt)
2817
0
{
2818
0
    BGZF *fp;
2819
2820
0
    if (fnidx == NULL)
2821
0
        return hts_idx_save(idx, fn, fmt);
2822
2823
0
    fp = bgzf_open(fnidx, (fmt == HTS_FMT_BAI)? "wu" : "w");
2824
0
    if (fp == NULL) return -1;
2825
2826
0
    if (hts_idx_write_out(idx, fp, fmt) < 0) {
2827
0
        int save_errno = errno;
2828
0
        bgzf_close(fp);
2829
0
        errno = save_errno;
2830
0
        return -1;
2831
0
    }
2832
2833
0
    return bgzf_close(fp);
2834
0
}
2835
2836
// idx_save for on-the-fly indexes.  Mostly duplicated from above, except
2837
// idx is not const because we want to store the file handle in it, and
2838
// the index file handle is not closed.  This allows the index file to be
2839
// closed after the EOF block on the indexed file has been written out,
2840
// so the modification times on the two files will be in the correct order.
2841
int hts_idx_save_but_not_close(hts_idx_t *idx, const char *fnidx, int fmt)
2842
0
{
2843
0
    idx->otf_fp = bgzf_open(fnidx, (fmt == HTS_FMT_BAI)? "wu" : "w");
2844
0
    if (idx->otf_fp == NULL) return -1;
2845
2846
0
    if (hts_idx_write_out(idx, idx->otf_fp, fmt) < 0) {
2847
0
        int save_errno = errno;
2848
0
        bgzf_close(idx->otf_fp);
2849
0
        idx->otf_fp = NULL;
2850
0
        errno = save_errno;
2851
0
        return -1;
2852
0
    }
2853
2854
0
    return bgzf_flush(idx->otf_fp);
2855
0
}
2856
2857
static int hts_idx_close_otf_fp(hts_idx_t *idx)
2858
0
{
2859
0
    if (idx && idx->otf_fp) {
2860
0
        int ret = 0;
2861
0
        if (need_idx_ugly_delay_hack(idx)) {
2862
            // BAI index - write out the bytes we deferred earlier
2863
0
            ret = idx_write_uint64(idx->otf_fp, idx->n_no_coor) < 0;
2864
0
        }
2865
0
        ret |= bgzf_close(idx->otf_fp) < 0;
2866
0
        idx->otf_fp = NULL;
2867
0
        return ret == 0 ? 0 : -1;
2868
0
    }
2869
0
    return 0;
2870
0
}
2871
2872
static int idx_read_core(hts_idx_t *idx, BGZF *fp, int fmt)
2873
0
{
2874
0
    int32_t i, n, is_be;
2875
0
    is_be = ed_is_big();
2876
0
    if (idx == NULL) return -4;
2877
0
    for (i = 0; i < idx->n; ++i) {
2878
0
        bidx_t *h;
2879
0
        lidx_t *l = &idx->lidx[i];
2880
0
        uint32_t key;
2881
0
        int j, absent;
2882
0
        bins_t *p;
2883
0
        h = idx->bidx[i] = kh_init(bin);
2884
0
        if (bgzf_read(fp, &n, 4) != 4) return -1;
2885
0
        if (is_be) ed_swap_4p(&n);
2886
0
        if (n < 0) return -3;
2887
0
        for (j = 0; j < n; ++j) {
2888
0
            khint_t k;
2889
0
            if (bgzf_read(fp, &key, 4) != 4) return -1;
2890
0
            if (is_be) ed_swap_4p(&key);
2891
0
            k = kh_put(bin, h, key, &absent);
2892
0
            if (absent <  0) return -2; // No memory
2893
0
            if (absent == 0) return -3; // Duplicate bin number
2894
0
            p = &kh_val(h, k);
2895
0
            if (fmt == HTS_FMT_CSI) {
2896
0
                if (bgzf_read(fp, &p->loff, 8) != 8) return -1;
2897
0
                if (is_be) ed_swap_8p(&p->loff);
2898
0
            } else p->loff = 0;
2899
0
            if (bgzf_read(fp, &p->n, 4) != 4) return -1;
2900
0
            if (is_be) ed_swap_4p(&p->n);
2901
0
            if (p->n < 0) return -3;
2902
0
            if ((size_t) p->n > SIZE_MAX / sizeof(hts_pair64_t)) return -2;
2903
0
            p->m = p->n;
2904
0
            p->list = (hts_pair64_t*)malloc(p->m * sizeof(hts_pair64_t));
2905
0
            if (p->list == NULL) return -2;
2906
0
            if (bgzf_read(fp, p->list, ((size_t) p->n)<<4) != ((size_t) p->n)<<4) return -1;
2907
0
            if (is_be) swap_bins(p);
2908
0
        }
2909
0
        if (fmt != HTS_FMT_CSI) { // load linear index
2910
0
            int j, k;
2911
0
            uint32_t x;
2912
0
            if (bgzf_read(fp, &x, 4) != 4) return -1;
2913
0
            if (is_be) ed_swap_4p(&x);
2914
0
            l->n = x;
2915
0
            if (l->n < 0) return -3;
2916
0
            if ((size_t) l->n > SIZE_MAX / sizeof(uint64_t)) return -2;
2917
0
            l->m = l->n;
2918
0
            l->offset = (uint64_t*)malloc(l->n * sizeof(uint64_t));
2919
0
            if (l->offset == NULL) return -2;
2920
0
            if (bgzf_read(fp, l->offset, l->n << 3) != l->n << 3) return -1;
2921
0
            if (is_be) for (j = 0; j < l->n; ++j) ed_swap_8p(&l->offset[j]);
2922
0
            for (k = j = 0; j < l->n && l->offset[j] == 0; k = ++j); // stop at the first non-zero entry
2923
0
            for (j = l->n-1; j > k; j--) // fill missing values; may happen given older samtools and tabix
2924
0
                if (l->offset[j-1] == 0) l->offset[j-1] = l->offset[j];
2925
0
            update_loff(idx, i, 0);
2926
0
        }
2927
0
    }
2928
0
    if (bgzf_read(fp, &idx->n_no_coor, 8) != 8) idx->n_no_coor = 0;
2929
0
    if (is_be) ed_swap_8p(&idx->n_no_coor);
2930
#ifdef DEBUG_INDEX
2931
    idx_dump(idx);
2932
#endif
2933
2934
0
    return 0;
2935
0
}
2936
2937
static hts_idx_t *idx_read(const char *fn)
2938
0
{
2939
0
    uint8_t magic[4];
2940
0
    int i, is_be;
2941
0
    hts_idx_t *idx = NULL;
2942
0
    uint8_t *meta = NULL;
2943
0
    BGZF *fp = bgzf_open(fn, "r");
2944
0
    if (fp == NULL) return NULL;
2945
0
    is_be = ed_is_big();
2946
0
    if (bgzf_read(fp, magic, 4) != 4) goto fail;
2947
2948
0
    if (memcmp(magic, "CSI\1", 4) == 0) {
2949
0
        uint32_t x[3], n;
2950
0
        if (bgzf_read(fp, x, 12) != 12) goto fail;
2951
0
        if (is_be) for (i = 0; i < 3; ++i) ed_swap_4p(&x[i]);
2952
0
        if (x[2]) {
2953
0
            if (SIZE_MAX - x[2] < 1) goto fail; // Prevent possible overflow
2954
0
            if ((meta = (uint8_t*)malloc((size_t) x[2] + 1)) == NULL) goto fail;
2955
0
            if (bgzf_read(fp, meta, x[2]) != x[2]) goto fail;
2956
            // Prevent possible strlen past the end in tbx_index_load2
2957
0
            meta[x[2]] = '\0';
2958
0
        }
2959
0
        if (bgzf_read(fp, &n, 4) != 4) goto fail;
2960
0
        if (is_be) ed_swap_4p(&n);
2961
0
        if (n > INT32_MAX) goto fail;
2962
0
        if ((idx = hts_idx_init(n, HTS_FMT_CSI, 0, x[0], x[1])) == NULL) goto fail;
2963
0
        idx->l_meta = x[2];
2964
0
        idx->meta = meta;
2965
0
        meta = NULL;
2966
0
        if (idx_read_core(idx, fp, HTS_FMT_CSI) < 0) goto fail;
2967
0
    }
2968
0
    else if (memcmp(magic, "TBI\1", 4) == 0) {
2969
0
        uint8_t x[8 * 4];
2970
0
        uint32_t n;
2971
        // Read file header
2972
0
        if (bgzf_read(fp, x, sizeof(x)) != sizeof(x)) goto fail;
2973
0
        n = le_to_u32(&x[0]); // location of n_ref
2974
0
        if (n > INT32_MAX) goto fail;
2975
0
        if ((idx = hts_idx_init(n, HTS_FMT_TBI, 0, 14, 5)) == NULL) goto fail;
2976
0
        n = le_to_u32(&x[7*4]); // location of l_nm
2977
0
        if (n > UINT32_MAX - 29) goto fail; // Prevent possible overflow
2978
0
        idx->l_meta = 28 + n;
2979
0
        if ((idx->meta = (uint8_t*)malloc(idx->l_meta + 1)) == NULL) goto fail;
2980
        // copy format, col_seq, col_beg, col_end, meta, skip, l_nm
2981
        // N.B. left in little-endian byte order.
2982
0
        memcpy(idx->meta, &x[1*4], 28);
2983
        // Read in sequence names.
2984
0
        if (bgzf_read(fp, idx->meta + 28, n) != n) goto fail;
2985
        // Prevent possible strlen past the end in tbx_index_load2
2986
0
        idx->meta[idx->l_meta] = '\0';
2987
0
        if (idx_read_core(idx, fp, HTS_FMT_TBI) < 0) goto fail;
2988
0
    }
2989
0
    else if (memcmp(magic, "BAI\1", 4) == 0) {
2990
0
        uint32_t n;
2991
0
        if (bgzf_read(fp, &n, 4) != 4) goto fail;
2992
0
        if (is_be) ed_swap_4p(&n);
2993
0
        if (n > INT32_MAX) goto fail;
2994
0
        if ((idx = hts_idx_init(n, HTS_FMT_BAI, 0, 14, 5)) == NULL) goto fail;
2995
0
        if (idx_read_core(idx, fp, HTS_FMT_BAI) < 0) goto fail;
2996
0
    }
2997
0
    else { errno = EINVAL; goto fail; }
2998
2999
0
    bgzf_close(fp);
3000
0
    return idx;
3001
3002
0
fail:
3003
0
    bgzf_close(fp);
3004
0
    hts_idx_destroy(idx);
3005
0
    free(meta);
3006
0
    return NULL;
3007
0
}
3008
3009
int hts_idx_set_meta(hts_idx_t *idx, uint32_t l_meta, uint8_t *meta,
3010
                      int is_copy)
3011
0
{
3012
0
    uint8_t *new_meta = meta;
3013
0
    if (is_copy) {
3014
0
        size_t l = l_meta;
3015
0
        if (l > SIZE_MAX - 1) {
3016
0
            errno = ENOMEM;
3017
0
            return -1;
3018
0
        }
3019
0
        new_meta = malloc(l + 1);
3020
0
        if (!new_meta) return -1;
3021
0
        memcpy(new_meta, meta, l);
3022
        // Prevent possible strlen past the end in tbx_index_load2
3023
0
        new_meta[l] = '\0';
3024
0
    }
3025
0
    if (idx->meta) free(idx->meta);
3026
0
    idx->l_meta = l_meta;
3027
0
    idx->meta = new_meta;
3028
0
    return 0;
3029
0
}
3030
3031
uint8_t *hts_idx_get_meta(hts_idx_t *idx, uint32_t *l_meta)
3032
0
{
3033
0
    *l_meta = idx->l_meta;
3034
0
    return idx->meta;
3035
0
}
3036
3037
const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr)
3038
0
{
3039
0
    if ( !idx || !idx->n )
3040
0
    {
3041
0
        *n = 0;
3042
0
        return NULL;
3043
0
    }
3044
3045
0
    int tid = 0, i;
3046
0
    const char **names = (const char**) calloc(idx->n,sizeof(const char*));
3047
0
    for (i=0; i<idx->n; i++)
3048
0
    {
3049
0
        bidx_t *bidx = idx->bidx[i];
3050
0
        if ( !bidx ) continue;
3051
0
        names[tid++] = getid(hdr,i);
3052
0
    }
3053
0
    *n = tid;
3054
0
    return names;
3055
0
}
3056
3057
0
int hts_idx_nseq(const hts_idx_t *idx) {
3058
0
    if (!idx) return -1;
3059
0
    return idx->n;
3060
0
}
3061
3062
int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped, uint64_t* unmapped)
3063
0
{
3064
0
    if (!idx) return -1;
3065
0
    if ( idx->fmt == HTS_FMT_CRAI ) {
3066
0
        *mapped = 0; *unmapped = 0;
3067
0
        return -1;
3068
0
    }
3069
3070
0
    bidx_t *h = idx->bidx[tid];
3071
0
    if (!h) return -1;
3072
0
    khint_t k = kh_get(bin, h, META_BIN(idx));
3073
0
    if (k != kh_end(h)) {
3074
0
        *mapped = kh_val(h, k).list[1].u;
3075
0
        *unmapped = kh_val(h, k).list[1].v;
3076
0
        return 0;
3077
0
    } else {
3078
0
        *mapped = 0; *unmapped = 0;
3079
0
        return -1;
3080
0
    }
3081
0
}
3082
3083
uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx)
3084
0
{
3085
0
    if (idx->fmt == HTS_FMT_CRAI) return 0;
3086
0
    return idx->n_no_coor;
3087
0
}
3088
3089
/****************
3090
 *** Iterator ***
3091
 ****************/
3092
3093
// Note: even with 32-bit hts_pos_t, end needs to be 64-bit here due to 1LL<<s.
3094
static inline int reg2bins_narrow(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls, bidx_t *bidx)
3095
0
{
3096
0
    int l, t, s = min_shift + (n_lvls<<1) + n_lvls;
3097
0
    for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
3098
0
        hts_pos_t b, e;
3099
0
        int i;
3100
0
        b = t + (beg>>s); e = t + (end>>s);
3101
0
        for (i = b; i <= e; ++i) {
3102
0
            if (kh_get(bin, bidx, i) != kh_end(bidx)) {
3103
0
                assert(itr->bins.n < itr->bins.m);
3104
0
                itr->bins.a[itr->bins.n++] = i;
3105
0
            }
3106
0
        }
3107
0
    }
3108
0
    return itr->bins.n;
3109
0
}
3110
3111
static inline int reg2bins_wide(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls, bidx_t *bidx)
3112
0
{
3113
0
    khint_t i;
3114
0
    hts_pos_t max_shift = 3 * n_lvls + min_shift;
3115
0
    --end;
3116
0
    if (beg < 0) beg = 0;
3117
0
    for (i = kh_begin(bidx); i != kh_end(bidx); i++) {
3118
0
        if (!kh_exist(bidx, i)) continue;
3119
0
        hts_pos_t bin = (hts_pos_t) kh_key(bidx, i);
3120
0
        int level = hts_bin_level(bin);
3121
0
        if (level > n_lvls) continue; // Dodgy index?
3122
0
        hts_pos_t first = hts_bin_first(level);
3123
0
        hts_pos_t beg_at_level = first + (beg >> (max_shift - 3 * level));
3124
0
        hts_pos_t end_at_level = first + (end >> (max_shift - 3 * level));
3125
0
        if (beg_at_level <= bin && bin <= end_at_level) {
3126
0
            assert(itr->bins.n < itr->bins.m);
3127
0
            itr->bins.a[itr->bins.n++] = bin;
3128
0
        }
3129
0
    }
3130
0
    return itr->bins.n;
3131
0
}
3132
3133
static inline int reg2bins(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls, bidx_t *bidx)
3134
0
{
3135
0
    int l, t, s = min_shift + (n_lvls<<1) + n_lvls;
3136
0
    size_t reg_bin_count = 0, hash_bin_count = kh_n_buckets(bidx), max_bins;
3137
0
    hts_pos_t end1;
3138
0
    if (end >= 1LL<<s) end = 1LL<<s;
3139
0
    if (beg >= end) return 0;
3140
0
    end1 = end - 1;
3141
3142
    // Count bins to see if it's faster to iterate through the hash table
3143
    // or the set of bins covering the region
3144
0
    for (l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
3145
0
        reg_bin_count += (end1 >> s) - (beg >> s) + 1;
3146
0
    }
3147
0
    max_bins = reg_bin_count < kh_size(bidx) ? reg_bin_count : kh_size(bidx);
3148
0
    if (itr->bins.m - itr->bins.n < max_bins) {
3149
        // Worst-case memory usage.  May be wasteful on very sparse
3150
        // data, but the bin list usually won't be too big anyway.
3151
0
        size_t new_m = max_bins + itr->bins.n;
3152
0
        if (new_m > INT_MAX || new_m > SIZE_MAX / sizeof(int)) {
3153
0
            errno = ENOMEM;
3154
0
            return -1;
3155
0
        }
3156
0
        int *new_a = realloc(itr->bins.a, new_m * sizeof(*new_a));
3157
0
        if (!new_a) return -1;
3158
0
        itr->bins.a = new_a;
3159
0
        itr->bins.m = new_m;
3160
0
    }
3161
0
    if (reg_bin_count < hash_bin_count) {
3162
0
        return reg2bins_narrow(beg, end, itr, min_shift, n_lvls, bidx);
3163
0
    } else {
3164
0
        return reg2bins_wide(beg, end, itr, min_shift, n_lvls, bidx);
3165
0
    }
3166
0
}
3167
3168
static inline int add_to_interval(hts_itr_t *iter, bins_t *bin,
3169
                                  int tid, uint32_t interval,
3170
                                  uint64_t min_off, uint64_t max_off)
3171
0
{
3172
0
    hts_pair64_max_t *off;
3173
0
    int j;
3174
3175
0
    if (!bin->n)
3176
0
        return 0;
3177
0
    off = realloc(iter->off, (iter->n_off + bin->n) * sizeof(*off));
3178
0
    if (!off)
3179
0
        return -2;
3180
3181
0
    iter->off = off;
3182
0
    for (j = 0; j < bin->n; ++j) {
3183
0
        if (bin->list[j].v > min_off && bin->list[j].u < max_off) {
3184
0
            iter->off[iter->n_off].u = min_off > bin->list[j].u
3185
0
                ? min_off : bin->list[j].u;
3186
0
            iter->off[iter->n_off].v = max_off < bin->list[j].v
3187
0
                ? max_off : bin->list[j].v;
3188
            // hts_pair64_max_t::max is now used to link
3189
            // file offsets to region list entries.
3190
            // The iterator can use this to decide if it
3191
            // can skip some file regions.
3192
0
            iter->off[iter->n_off].max = ((uint64_t) tid << 32) | interval;
3193
0
            iter->n_off++;
3194
0
        }
3195
0
    }
3196
0
    return 0;
3197
0
}
3198
3199
static inline int reg2intervals_narrow(hts_itr_t *iter, const bidx_t *bidx,
3200
                                       int tid, int64_t beg, int64_t end,
3201
                                       uint32_t interval,
3202
                                       uint64_t min_off, uint64_t max_off,
3203
                                       int min_shift, int n_lvls)
3204
0
{
3205
0
    int l, t, s = min_shift + n_lvls * 3;
3206
0
    hts_pos_t b, e, i;
3207
3208
0
    for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
3209
0
        b = t + (beg>>s); e = t + (end>>s);
3210
0
        for (i = b; i <= e; ++i) {
3211
0
            khint_t k = kh_get(bin, bidx, i);
3212
0
            if (k != kh_end(bidx)) {
3213
0
                bins_t *bin = &kh_value(bidx, k);
3214
0
                int res = add_to_interval(iter, bin, tid, interval, min_off, max_off);
3215
0
                if (res < 0)
3216
0
                    return res;
3217
0
            }
3218
0
        }
3219
0
    }
3220
0
    return 0;
3221
0
}
3222
3223
static inline int reg2intervals_wide(hts_itr_t *iter, const bidx_t *bidx,
3224
                                       int tid, int64_t beg, int64_t end,
3225
                                       uint32_t interval,
3226
                                       uint64_t min_off, uint64_t max_off,
3227
                                       int min_shift, int n_lvls)
3228
0
{
3229
0
    khint_t i;
3230
0
    hts_pos_t max_shift = 3 * n_lvls + min_shift;
3231
0
    --end;
3232
0
    if (beg < 0) beg = 0;
3233
0
    for (i = kh_begin(bidx); i != kh_end(bidx); i++) {
3234
0
        if (!kh_exist(bidx, i)) continue;
3235
0
        hts_pos_t bin = (hts_pos_t) kh_key(bidx, i);
3236
0
        int level = hts_bin_level(bin);
3237
0
        if (level > n_lvls) continue; // Dodgy index?
3238
0
        hts_pos_t first = hts_bin_first(level);
3239
0
        hts_pos_t beg_at_level = first + (beg >> (max_shift - 3 * level));
3240
0
        hts_pos_t end_at_level = first + (end >> (max_shift - 3 * level));
3241
0
        if (beg_at_level <= bin && bin <= end_at_level) {
3242
0
            bins_t *bin = &kh_value(bidx, i);
3243
0
            int res = add_to_interval(iter, bin, tid, interval, min_off, max_off);
3244
0
            if (res < 0)
3245
0
                return res;
3246
0
        }
3247
0
    }
3248
0
    return 0;
3249
0
}
3250
3251
static inline int reg2intervals(hts_itr_t *iter, const hts_idx_t *idx, int tid, int64_t beg, int64_t end, uint32_t interval, uint64_t min_off, uint64_t max_off, int min_shift, int n_lvls)
3252
0
{
3253
0
    int l, t, s;
3254
0
    int i, j;
3255
0
    hts_pos_t end1;
3256
0
    bidx_t *bidx;
3257
0
    int start_n_off;
3258
0
    size_t reg_bin_count = 0, hash_bin_count;
3259
0
    int res;
3260
3261
0
    if (!iter || !idx || (bidx = idx->bidx[tid]) == NULL || beg > end)
3262
0
        return -1;
3263
3264
0
    hash_bin_count = kh_n_buckets(bidx);
3265
3266
0
    s = min_shift + (n_lvls<<1) + n_lvls;
3267
0
    if (end >= 1LL<<s)
3268
0
        end = 1LL<<s;
3269
3270
0
    end1 = end - 1;
3271
    // Count bins to see if it's faster to iterate through the hash table
3272
    // or the set of bins covering the region
3273
0
    for (l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
3274
0
        reg_bin_count += (end1 >> s) - (beg >> s) + 1;
3275
0
    }
3276
3277
0
    start_n_off = iter->n_off;
3278
3279
    // Populate iter->off with the intervals for this region
3280
0
    if (reg_bin_count < hash_bin_count) {
3281
0
        res = reg2intervals_narrow(iter, bidx, tid, beg, end, interval,
3282
0
                                   min_off, max_off, min_shift, n_lvls);
3283
0
    } else {
3284
0
        res = reg2intervals_wide(iter, bidx, tid, beg, end, interval,
3285
0
                                 min_off, max_off, min_shift, n_lvls);
3286
0
    }
3287
0
    if (res < 0)
3288
0
        return res;
3289
3290
0
    if (iter->n_off - start_n_off > 1) {
3291
0
        ks_introsort(_off_max, iter->n_off - start_n_off, iter->off + start_n_off);
3292
0
        for (i = start_n_off, j = start_n_off + 1; j < iter->n_off; j++) {
3293
0
            if (iter->off[i].v >= iter->off[j].u) {
3294
0
                if (iter->off[i].v < iter->off[j].v)
3295
0
                    iter->off[i].v = iter->off[j].v;
3296
0
            } else {
3297
0
                i++;
3298
0
                if (i < j)
3299
0
                    iter->off[i] = iter->off[j];
3300
0
            }
3301
0
        }
3302
0
        iter->n_off = i + 1;
3303
0
    }
3304
3305
0
    return iter->n_off;
3306
0
}
3307
3308
0
static int compare_regions(const void *r1, const void *r2) {
3309
0
    hts_reglist_t *reg1 = (hts_reglist_t *)r1;
3310
0
    hts_reglist_t *reg2 = (hts_reglist_t *)r2;
3311
3312
0
    if (reg1->tid < 0 && reg2->tid >= 0)
3313
0
        return 1;
3314
0
    else if (reg1->tid >= 0 && reg2->tid < 0)
3315
0
        return -1;
3316
0
    else
3317
0
        return reg1->tid - reg2->tid;
3318
0
}
3319
3320
0
uint64_t hts_itr_off(const hts_idx_t* idx, int tid) {
3321
3322
0
    int i;
3323
0
    bidx_t* bidx;
3324
0
    uint64_t off0 = (uint64_t) -1;
3325
0
    khint_t k;
3326
0
    switch (tid) {
3327
0
    case HTS_IDX_START:
3328
        // Find the smallest offset, note that sequence ids may not be ordered sequentially
3329
0
        for (i = 0; i < idx->n; i++) {
3330
0
            bidx = idx->bidx[i];
3331
0
            k = kh_get(bin, bidx, META_BIN(idx));
3332
0
            if (k == kh_end(bidx))
3333
0
                continue;
3334
3335
0
            if (off0 > kh_val(bidx, k).list[0].u)
3336
0
                off0 = kh_val(bidx, k).list[0].u;
3337
0
        }
3338
0
        if (off0 == (uint64_t) -1 && idx->n_no_coor)
3339
0
            off0 = 0;
3340
        // only no-coor reads in this bam
3341
0
        break;
3342
0
    case HTS_IDX_NOCOOR:
3343
        /* No-coor reads sort after all of the mapped reads.  The position
3344
           is not stored in the index itself, so need to find the end
3345
           offset for the last mapped read.  A loop is needed here in
3346
           case references at the end of the file have no mapped reads,
3347
           or sequence ids are not ordered sequentially.
3348
           See issue samtools#568 and commits b2aab8, 60c22d and cc207d. */
3349
0
        for (i = 0; i < idx->n; i++) {
3350
0
            bidx = idx->bidx[i];
3351
0
            k = kh_get(bin, bidx, META_BIN(idx));
3352
0
            if (k != kh_end(bidx)) {
3353
0
                if (off0 == (uint64_t) -1 || off0 < kh_val(bidx, k).list[0].v) {
3354
0
                    off0 = kh_val(bidx, k).list[0].v;
3355
0
                }
3356
0
            }
3357
0
        }
3358
0
        if (off0 == (uint64_t) -1 && idx->n_no_coor)
3359
0
            off0 = 0;
3360
        // only no-coor reads in this bam
3361
0
        break;
3362
0
    case HTS_IDX_REST:
3363
0
        off0 = 0;
3364
0
        break;
3365
0
    case HTS_IDX_NONE:
3366
0
        off0 = 0;
3367
0
        break;
3368
0
    }
3369
3370
0
    return off0;
3371
0
}
3372
3373
hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec)
3374
0
{
3375
0
    int i, n_off, l, bin;
3376
0
    hts_pair64_max_t *off;
3377
0
    khint_t k;
3378
0
    bidx_t *bidx;
3379
0
    uint64_t min_off, max_off;
3380
0
    hts_pos_t idx_maxpos;
3381
0
    hts_itr_t *iter;
3382
0
    uint32_t unmapped = 0, rel_off;
3383
3384
    // It's possible to call this function with NULL idx iff
3385
    // tid is one of the special values HTS_IDX_REST or HTS_IDX_NONE
3386
0
    if (!idx && !(tid == HTS_IDX_REST || tid == HTS_IDX_NONE)) {
3387
0
        errno = EINVAL;
3388
0
        return NULL;
3389
0
    }
3390
3391
0
    iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t));
3392
0
    if (iter) {
3393
0
        if (tid < 0) {
3394
0
            uint64_t off = hts_itr_off(idx, tid);
3395
0
            if (off != (uint64_t) -1) {
3396
0
                iter->read_rest = 1;
3397
0
                iter->curr_off = off;
3398
0
                iter->readrec = readrec;
3399
0
                if (tid == HTS_IDX_NONE)
3400
0
                    iter->finished = 1;
3401
0
            } else {
3402
0
                free(iter);
3403
0
                iter = NULL;
3404
0
            }
3405
0
        } else if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL) {
3406
0
            iter->finished = 1;
3407
0
        } else {
3408
0
            if (beg < 0) beg = 0;
3409
0
            if (end < beg) {
3410
0
              free(iter);
3411
0
              return NULL;
3412
0
            }
3413
3414
0
            k = kh_get(bin, bidx, META_BIN(idx));
3415
0
            if (k != kh_end(bidx))
3416
0
                unmapped = kh_val(bidx, k).list[1].v;
3417
0
            else
3418
0
                unmapped = 1;
3419
3420
0
            iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
3421
0
            iter->readrec = readrec;
3422
3423
0
            if ( !kh_size(bidx) ) { iter->finished = 1; return iter; }
3424
3425
0
            idx_maxpos = hts_idx_maxpos(idx);
3426
0
            if (beg >= idx_maxpos) { iter->finished = 1; return iter; }
3427
3428
0
            rel_off = beg>>idx->min_shift;
3429
            // compute min_off
3430
0
            bin = hts_bin_first(idx->n_lvls) + rel_off;
3431
0
            do {
3432
0
                int first;
3433
0
                k = kh_get(bin, bidx, bin);
3434
0
                if (k != kh_end(bidx)) break;
3435
0
                first = (hts_bin_parent(bin)<<3) + 1;
3436
0
                if (bin > first) --bin;
3437
0
                else bin = hts_bin_parent(bin);
3438
0
            } while (bin);
3439
0
            if (bin == 0) k = kh_get(bin, bidx, bin);
3440
0
            min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0;
3441
            // min_off can be calculated more accurately if the
3442
            // linear index is available
3443
0
            if (idx->lidx[tid].offset
3444
0
                && rel_off < idx->lidx[tid].n) {
3445
0
                if (min_off < idx->lidx[tid].offset[rel_off])
3446
0
                    min_off = idx->lidx[tid].offset[rel_off];
3447
0
                if (unmapped) {
3448
                    // unmapped reads are not covered by the linear index,
3449
                    // so search backwards for a smaller offset
3450
0
                    int tmp_off;
3451
0
                    for (tmp_off = rel_off-1; tmp_off >= 0; tmp_off--) {
3452
0
                        if (idx->lidx[tid].offset[tmp_off] < min_off) {
3453
0
                            min_off = idx->lidx[tid].offset[tmp_off];
3454
0
                            break;
3455
0
                        }
3456
0
                    }
3457
                    // if the search went too far back or no satisfactory entry
3458
                    // was found, revert to the bin index loff value
3459
0
                    if (k != kh_end(bidx) && (min_off < kh_val(bidx, k).loff || tmp_off < 0))
3460
0
                        min_off = kh_val(bidx, k).loff;
3461
0
                }
3462
0
            } else if (unmapped) { //CSI index
3463
0
                if (k != kh_end(bidx))
3464
0
                    min_off = kh_val(bidx, k).loff;
3465
0
            }
3466
3467
            // compute max_off: a virtual offset from a bin to the right of end
3468
            // First check if end lies within the range of the index (it won't
3469
            // if it's HTS_POS_MAX)
3470
0
            if (end <= idx_maxpos) {
3471
0
                bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1;
3472
0
                if (bin >= idx->n_bins) bin = 0;
3473
0
                while (1) {
3474
                    // search for an extant bin by moving right, but moving up to the
3475
                    // parent whenever we get to a first child (which also covers falling
3476
                    // off the RHS, which wraps around and immediately goes up to bin 0)
3477
0
                    while (bin % 8 == 1) bin = hts_bin_parent(bin);
3478
0
                    if (bin == 0) { max_off = UINT64_MAX; break; }
3479
0
                    k = kh_get(bin, bidx, bin);
3480
0
                    if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) { max_off = kh_val(bidx, k).list[0].u; break; }
3481
0
                    bin++;
3482
0
                }
3483
0
            } else {
3484
                // Searching to end of reference
3485
0
                max_off = UINT64_MAX;
3486
0
            }
3487
3488
            // retrieve bins
3489
0
            if (reg2bins(beg, end, iter, idx->min_shift, idx->n_lvls, bidx) < 0) {
3490
0
                hts_itr_destroy(iter);
3491
0
                return NULL;
3492
0
            }
3493
3494
0
            for (i = n_off = 0; i < iter->bins.n; ++i)
3495
0
                if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx))
3496
0
                    n_off += kh_value(bidx, k).n;
3497
0
            if (n_off == 0) {
3498
                // No overlapping bins means the iterator has already finished.
3499
0
                iter->finished = 1;
3500
0
                return iter;
3501
0
            }
3502
0
            off = calloc(n_off, sizeof(*off));
3503
0
            for (i = n_off = 0; i < iter->bins.n; ++i) {
3504
0
                if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx)) {
3505
0
                    int j;
3506
0
                    bins_t *p = &kh_value(bidx, k);
3507
0
                    for (j = 0; j < p->n; ++j)
3508
0
                        if (p->list[j].v > min_off && p->list[j].u < max_off) {
3509
0
                            off[n_off].u = min_off > p->list[j].u
3510
0
                                ? min_off : p->list[j].u;
3511
0
                            off[n_off].v = max_off < p->list[j].v
3512
0
                                ? max_off : p->list[j].v;
3513
                            // hts_pair64_max_t::max is now used to link
3514
                            // file offsets to region list entries.
3515
                            // The iterator can use this to decide if it
3516
                            // can skip some file regions.
3517
0
                            off[n_off].max = ((uint64_t) tid << 32) | j;
3518
0
                            n_off++;
3519
0
                        }
3520
0
                }
3521
0
            }
3522
3523
0
            if (n_off == 0) {
3524
0
                free(off);
3525
0
                iter->finished = 1;
3526
0
                return iter;
3527
0
            }
3528
0
            ks_introsort(_off_max, n_off, off);
3529
            // resolve completely contained adjacent blocks
3530
0
            for (i = 1, l = 0; i < n_off; ++i)
3531
0
                if (off[l].v < off[i].v) off[++l] = off[i];
3532
0
            n_off = l + 1;
3533
            // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
3534
0
            for (i = 1; i < n_off; ++i)
3535
0
                if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
3536
            // merge adjacent blocks
3537
0
            for (i = 1, l = 0; i < n_off; ++i) {
3538
0
                if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
3539
0
                else off[++l] = off[i];
3540
0
            }
3541
0
            n_off = l + 1;
3542
0
            iter->n_off = n_off; iter->off = off;
3543
0
        }
3544
0
    }
3545
3546
0
    return iter;
3547
0
}
3548
3549
int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter)
3550
0
{
3551
0
    int i, j, bin;
3552
0
    khint_t k;
3553
0
    bidx_t *bidx;
3554
0
    uint64_t min_off, max_off, t_off = (uint64_t)-1;
3555
0
    int tid;
3556
0
    hts_pos_t beg, end, idx_maxpos;
3557
0
    hts_reglist_t *curr_reg;
3558
0
    uint32_t unmapped = 0, rel_off;
3559
3560
0
    if (!idx || !iter || !iter->multi)
3561
0
        return -1;
3562
3563
0
    iter->i = -1;
3564
0
    for (i=0; i<iter->n_reg; i++) {
3565
3566
0
        curr_reg = &iter->reg_list[i];
3567
0
        tid = curr_reg->tid;
3568
3569
0
        if (tid < 0) {
3570
0
            t_off = hts_itr_off(idx, tid);
3571
0
            if (t_off != (uint64_t)-1) {
3572
0
                switch (tid) {
3573
0
                case HTS_IDX_NONE:
3574
0
                    iter->finished = 1;
3575
                    // fall through
3576
0
                case HTS_IDX_START:
3577
0
                case HTS_IDX_REST:
3578
0
                    iter->curr_off = t_off;
3579
0
                    iter->n_reg = 0;
3580
0
                    iter->reg_list = NULL;
3581
0
                    iter->read_rest = 1;
3582
0
                    return 0;
3583
0
                case HTS_IDX_NOCOOR:
3584
0
                    iter->nocoor = 1;
3585
0
                    iter->nocoor_off = t_off;
3586
0
                }
3587
0
            }
3588
0
        } else {
3589
0
            if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL || !kh_size(bidx))
3590
0
                continue;
3591
3592
0
            k = kh_get(bin, bidx, META_BIN(idx));
3593
0
            if (k != kh_end(bidx))
3594
0
                unmapped = kh_val(bidx, k).list[1].v;
3595
0
            else
3596
0
                unmapped = 1;
3597
3598
0
            idx_maxpos = hts_idx_maxpos(idx);
3599
3600
0
            for(j=0; j<curr_reg->count; j++) {
3601
0
                hts_pair32_t *curr_intv = &curr_reg->intervals[j];
3602
0
                if (curr_intv->end < curr_intv->beg)
3603
0
                    continue;
3604
3605
0
                beg = curr_intv->beg;
3606
0
                end = curr_intv->end;
3607
0
                if (beg >= idx_maxpos)
3608
0
                    continue;
3609
0
                rel_off = beg>>idx->min_shift;
3610
3611
                /* Compute 'min_off' by searching the lowest level bin containing 'beg'.
3612
                       If the computed bin is not in the index, try the next bin to the
3613
                       left, belonging to the same parent. If it is the first sibling bin,
3614
                       try the parent bin. */
3615
0
                bin = hts_bin_first(idx->n_lvls) + rel_off;
3616
0
                do {
3617
0
                    int first;
3618
0
                    k = kh_get(bin, bidx, bin);
3619
0
                    if (k != kh_end(bidx)) break;
3620
0
                    first = (hts_bin_parent(bin)<<3) + 1;
3621
0
                    if (bin > first) --bin;
3622
0
                    else bin = hts_bin_parent(bin);
3623
0
                } while (bin);
3624
0
                if (bin == 0)
3625
0
                    k = kh_get(bin, bidx, bin);
3626
0
                min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0;
3627
                // min_off can be calculated more accurately if the
3628
                // linear index is available
3629
0
                if (idx->lidx[tid].offset
3630
0
                    && rel_off < idx->lidx[tid].n) {
3631
0
                    if (min_off < idx->lidx[tid].offset[rel_off])
3632
0
                        min_off = idx->lidx[tid].offset[rel_off];
3633
0
                    if (unmapped) {
3634
0
                        int tmp_off;
3635
0
                        for (tmp_off = rel_off-1; tmp_off >= 0; tmp_off--) {
3636
0
                            if (idx->lidx[tid].offset[tmp_off] < min_off) {
3637
0
                                min_off = idx->lidx[tid].offset[tmp_off];
3638
0
                                break;
3639
0
                            }
3640
0
                        }
3641
3642
0
                        if (k != kh_end(bidx) && (min_off < kh_val(bidx, k).loff || tmp_off < 0))
3643
0
                            min_off = kh_val(bidx, k).loff;
3644
0
                    }
3645
0
                } else if (unmapped) { //CSI index
3646
0
                    if (k != kh_end(bidx))
3647
0
                        min_off = kh_val(bidx, k).loff;
3648
0
                }
3649
3650
                // compute max_off: a virtual offset from a bin to the right of end
3651
                // First check if end lies within the range of the index (it
3652
                // won't if it's HTS_POS_MAX)
3653
0
                if (end <= idx_maxpos) {
3654
0
                    bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1;
3655
0
                    if (bin >= idx->n_bins) bin = 0;
3656
0
                    while (1) {
3657
                        // search for an extant bin by moving right, but moving up to the
3658
                        // parent whenever we get to a first child (which also covers falling
3659
                        // off the RHS, which wraps around and immediately goes up to bin 0)
3660
0
                        while (bin % 8 == 1) bin = hts_bin_parent(bin);
3661
0
                        if (bin == 0) { max_off = UINT64_MAX; break; }
3662
0
                        k = kh_get(bin, bidx, bin);
3663
0
                        if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) {
3664
0
                            max_off = kh_val(bidx, k).list[0].u;
3665
0
                            break;
3666
0
                        }
3667
0
                        bin++;
3668
0
                    }
3669
0
                } else {
3670
                    // Searching to end of reference
3671
0
                    max_off = UINT64_MAX;
3672
0
                }
3673
3674
                //convert coordinates to file offsets
3675
0
                if (reg2intervals(iter, idx, tid, beg, end, j,
3676
0
                                  min_off, max_off,
3677
0
                                  idx->min_shift, idx->n_lvls) < 0) {
3678
0
                    return -1;
3679
0
                }
3680
0
            }
3681
0
        }
3682
0
    }
3683
3684
0
    if (iter->n_off > 1)
3685
0
        ks_introsort(_off_max, iter->n_off, iter->off);
3686
3687
0
    if(!iter->n_off && !iter->nocoor)
3688
0
        iter->finished = 1;
3689
3690
0
    return 0;
3691
0
}
3692
3693
int hts_itr_multi_cram(const hts_idx_t *idx, hts_itr_t *iter)
3694
0
{
3695
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
3696
0
    int tid, i, n_off = 0;
3697
0
    uint32_t j;
3698
0
    hts_pos_t beg, end;
3699
0
    hts_reglist_t *curr_reg;
3700
0
    hts_pair32_t *curr_intv;
3701
0
    hts_pair64_max_t *off = NULL, *tmp;
3702
0
    cram_index *e = NULL;
3703
3704
0
    if (!cidx || !iter || !iter->multi)
3705
0
        return -1;
3706
3707
0
    iter->is_cram = 1;
3708
0
    iter->read_rest = 0;
3709
0
    iter->off = NULL;
3710
0
    iter->n_off = 0;
3711
0
    iter->curr_off = 0;
3712
0
    iter->i = -1;
3713
3714
0
    for (i=0; i<iter->n_reg; i++) {
3715
3716
0
        curr_reg = &iter->reg_list[i];
3717
0
        tid = curr_reg->tid;
3718
3719
0
        if (tid >= 0) {
3720
0
            tmp = realloc(off, (n_off + curr_reg->count) * sizeof(*off));
3721
0
            if (!tmp)
3722
0
                goto err;
3723
0
            off = tmp;
3724
3725
0
            for (j=0; j < curr_reg->count; j++) {
3726
0
                curr_intv = &curr_reg->intervals[j];
3727
0
                if (curr_intv->end < curr_intv->beg)
3728
0
                    continue;
3729
3730
0
                beg = curr_intv->beg;
3731
0
                end = curr_intv->end;
3732
3733
/* First, fetch the container overlapping 'beg' and assign its file offset to u, then
3734
 * find the container overlapping 'end' and assign the relative end of the slice to v.
3735
 * The cram_ptell function will adjust with the container offset, which is not stored
3736
 * in the index.
3737
 */
3738
0
                e = cram_index_query(cidx->cram, tid, beg+1, NULL);
3739
0
                if (e) {
3740
0
                    off[n_off].u = e->offset;
3741
                    // hts_pair64_max_t::max is now used to link
3742
                    // file offsets to region list entries.
3743
                    // The iterator can use this to decide if it
3744
                    // can skip some file regions.
3745
0
                    off[n_off].max = ((uint64_t) tid << 32) | j;
3746
3747
0
                    if (end >= HTS_POS_MAX) {
3748
0
                       e = cram_index_last(cidx->cram, tid, NULL);
3749
0
                    } else {
3750
0
                       e = cram_index_query_last(cidx->cram, tid, end+1);
3751
0
                    }
3752
3753
0
                    if (e) {
3754
0
                        off[n_off++].v = e->e_next
3755
0
                            ? e->e_next->offset
3756
0
                            : e->offset + e->slice + e->len;
3757
0
                    } else {
3758
0
                        hts_log_warning("Could not set offset end for region %d:%"PRIhts_pos"-%"PRIhts_pos". Skipping", tid, beg, end);
3759
0
                    }
3760
0
                }
3761
0
            }
3762
0
        } else {
3763
0
            switch (tid) {
3764
0
                case HTS_IDX_NOCOOR:
3765
0
                    e = cram_index_query(cidx->cram, tid, 1, NULL);
3766
0
                    if (e) {
3767
0
                        iter->nocoor = 1;
3768
0
                        iter->nocoor_off = e->offset;
3769
0
                    } else {
3770
0
                        hts_log_warning("No index entry for NOCOOR region");
3771
0
                    }
3772
0
                    break;
3773
0
                case HTS_IDX_START:
3774
0
                    e = cram_index_query(cidx->cram, tid, 1, NULL);
3775
0
                    if (e) {
3776
0
                        iter->read_rest = 1;
3777
0
                        tmp = realloc(off, sizeof(*off));
3778
0
                        if (!tmp)
3779
0
                            goto err;
3780
0
                        off = tmp;
3781
0
                        off[0].u = e->offset;
3782
0
                        off[0].v = 0;
3783
0
                        n_off=1;
3784
0
                    } else {
3785
0
                        hts_log_warning("No index entries");
3786
0
                    }
3787
0
                    break;
3788
0
                case HTS_IDX_REST:
3789
0
                    break;
3790
0
                case HTS_IDX_NONE:
3791
0
                    iter->finished = 1;
3792
0
                    break;
3793
0
                default:
3794
0
                    hts_log_error("Query with tid=%d not implemented for CRAM files", tid);
3795
0
            }
3796
0
        }
3797
0
    }
3798
3799
0
    if (n_off) {
3800
0
        ks_introsort(_off_max, n_off, off);
3801
0
        iter->n_off = n_off; iter->off = off;
3802
0
    }
3803
3804
0
    if(!n_off && !iter->nocoor)
3805
0
        iter->finished = 1;
3806
3807
0
    return 0;
3808
3809
0
 err:
3810
0
    free(off);
3811
0
    return -1;
3812
0
}
3813
3814
void hts_itr_destroy(hts_itr_t *iter)
3815
0
{
3816
0
    if (iter) {
3817
0
        if (iter->multi) {
3818
0
            hts_reglist_free(iter->reg_list, iter->n_reg);
3819
0
        } else {
3820
0
            free(iter->bins.a);
3821
0
        }
3822
3823
0
        if (iter->off)
3824
0
            free(iter->off);
3825
0
        free(iter);
3826
0
    }
3827
0
}
3828
3829
static inline unsigned long long push_digit(unsigned long long i, char c)
3830
0
{
3831
    // ensure subtraction occurs first, avoiding overflow for >= MAX-48 or so
3832
0
    int digit = c - '0';
3833
0
    return 10 * i + digit;
3834
0
}
3835
3836
long long hts_parse_decimal(const char *str, char **strend, int flags)
3837
0
{
3838
0
    unsigned long long n = 0;
3839
0
    int digits = 0, decimals = 0, e = 0, lost = 0;
3840
0
    char sign = '+', esign = '+';
3841
0
    const char *s, *str_orig = str;
3842
3843
0
    while (isspace_c(*str)) str++;
3844
0
    s = str;
3845
3846
0
    if (*s == '+' || *s == '-') sign = *s++;
3847
0
    while (*s)
3848
0
        if (isdigit_c(*s)) digits++, n = push_digit(n, *s++);
3849
0
        else if (*s == ',' && (flags & HTS_PARSE_THOUSANDS_SEP)) s++;
3850
0
        else break;
3851
3852
0
    if (*s == '.') {
3853
0
        s++;
3854
0
        while (isdigit_c(*s)) decimals++, digits++, n = push_digit(n, *s++);
3855
0
    }
3856
3857
0
    switch (*s) {
3858
0
    case 'e': case 'E':
3859
0
        s++;
3860
0
        if (*s == '+' || *s == '-') esign = *s++;
3861
0
        while (isdigit_c(*s)) e = push_digit(e, *s++);
3862
0
        if (esign == '-') e = -e;
3863
0
        break;
3864
3865
0
    case 'k': case 'K': e += 3; s++; break;
3866
0
    case 'm': case 'M': e += 6; s++; break;
3867
0
    case 'g': case 'G': e += 9; s++; break;
3868
0
    }
3869
3870
0
    e -= decimals;
3871
0
    while (e > 0) n *= 10, e--;
3872
0
    while (e < 0) lost += n % 10, n /= 10, e++;
3873
3874
0
    if (lost > 0) {
3875
0
        hts_log_warning("Discarding fractional part of %.*s", (int)(s - str), str);
3876
0
    }
3877
3878
0
    if (strend) {
3879
        // Set to the original input str pointer if not valid number syntax
3880
0
        *strend = (digits > 0)? (char *)s : (char *)str_orig;
3881
0
    } else if (digits == 0) {
3882
0
        hts_log_warning("Invalid numeric value %.8s[truncated]", str);
3883
0
    } else if (*s) {
3884
0
        if ((flags & HTS_PARSE_THOUSANDS_SEP) || (!(flags & HTS_PARSE_THOUSANDS_SEP) && *s != ','))
3885
0
            hts_log_warning("Ignoring unknown characters after %.*s[%s]", (int)(s - str), str, s);
3886
0
    }
3887
3888
0
    return (sign == '+')? n : -n;
3889
0
}
3890
3891
0
static void *hts_memrchr(const void *s, int c, size_t n) {
3892
0
    size_t i;
3893
0
    unsigned char *u = (unsigned char *)s;
3894
0
    for (i = n; i > 0; i--) {
3895
0
        if (u[i-1] == c)
3896
0
            return u+i-1;
3897
0
    }
3898
3899
0
    return NULL;
3900
0
}
3901
3902
/*
3903
 * A variant of hts_parse_reg which is reference-id aware.  It uses
3904
 * the iterator name2id callbacks to validate the region tokenisation works.
3905
 *
3906
 * This is necessary due to GRCh38 HLA additions which have reference names
3907
 * like "HLA-DRB1*12:17".
3908
 *
3909
 * All parameters are mandatory.
3910
 *
3911
 * To work around ambiguous parsing issues, eg both "chr1" and "chr1:100-200"
3912
 * are reference names, we may quote using curly braces.
3913
 * Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example.
3914
 *
3915
 * Flags are used to control how parsing works, and can be one of the below.
3916
 *
3917
 * HTS_PARSE_LIST:
3918
 *     If present, the region is assmed to be a comma separated list and
3919
 *     position parsing will not contain commas (this implicitly
3920
 *     clears HTS_PARSE_THOUSANDS_SEP in the call to hts_parse_decimal).
3921
 *     On success the return pointer will be the start of the next region, ie
3922
 *     the character after the comma.  (If *ret != '\0' then the caller can
3923
 *     assume another region is present in the list.)
3924
 *
3925
 *     If not set then positions may contain commas.  In this case the return
3926
 *     value should point to the end of the string, or NULL on failure.
3927
 *
3928
 * HTS_PARSE_ONE_COORD:
3929
 *     If present, X:100 is treated as the single base pair region X:100-100.
3930
 *     In this case X:-100 is shorthand for X:1-100 and X:100- is X:100-<end>.
3931
 *     (This is the standard bcftools region convention.)
3932
 *
3933
 *     When not set X:100 is considered to be X:100-<end> where <end> is
3934
 *     the end of chromosome X (set to HTS_POS_MAX here).  X:100- and X:-100
3935
 *     are invalid.
3936
 *     (This is the standard samtools region convention.)
3937
 *
3938
 * Note the supplied string expects 1 based inclusive coordinates, but the
3939
 * returned coordinates start from 0 and are half open, so pos0 is valid
3940
 * for use in e.g. "for (pos0 = beg; pos0 < end; pos0++) {...}"
3941
 *
3942
 * On success a pointer to the byte after the end of the entire region
3943
 *            specifier is returned (plus any trailing comma), and tid,
3944
 *            beg & end will be set.
3945
 * On failure NULL is returned.
3946
 */
3947
const char *hts_parse_region(const char *s, int *tid, hts_pos_t *beg,
3948
                             hts_pos_t *end, hts_name2id_f getid, void *hdr,
3949
                             int flags)
3950
0
{
3951
0
    if (!s || !tid || !beg || !end || !getid)
3952
0
        return NULL;
3953
3954
0
    size_t s_len = strlen(s);
3955
0
    kstring_t ks = { 0, 0, NULL };
3956
3957
0
    const char *colon = NULL, *comma = NULL;
3958
0
    int quoted = 0;
3959
3960
0
    if (flags & HTS_PARSE_LIST)
3961
0
        flags &= ~HTS_PARSE_THOUSANDS_SEP;
3962
0
    else
3963
0
        flags |= HTS_PARSE_THOUSANDS_SEP;
3964
3965
0
    const char *s_end = s + s_len;
3966
3967
    // Braced quoting of references is permitted to resolve ambiguities.
3968
0
    if (*s == '{') {
3969
0
        const char *close = memchr(s, '}', s_len);
3970
0
        if (!close) {
3971
0
            hts_log_error("Mismatching braces in \"%s\"", s);
3972
0
            *tid = -1;
3973
0
            return NULL;
3974
0
        }
3975
0
        s++;
3976
0
        s_len--;
3977
0
        if (close[1] == ':')
3978
0
            colon = close+1;
3979
0
        quoted = 1; // number of trailing characters to trim
3980
3981
        // Truncate to this item only, if appropriate.
3982
0
        if (flags & HTS_PARSE_LIST) {
3983
0
            comma = strchr(close, ',');
3984
0
            if (comma) {
3985
0
                s_len = comma-s;
3986
0
                s_end = comma+1;
3987
0
            }
3988
0
        }
3989
0
    } else {
3990
        // Truncate to this item only, if appropriate.
3991
0
        if (flags & HTS_PARSE_LIST) {
3992
0
            comma = strchr(s, ',');
3993
0
            if (comma) {
3994
0
                s_len = comma-s;
3995
0
                s_end = comma+1;
3996
0
            }
3997
0
        }
3998
3999
0
        colon = hts_memrchr(s, ':', s_len);
4000
0
    }
4001
4002
    // No colon is simplest case; just check and return.
4003
0
    if (colon == NULL) {
4004
0
        *beg = 0; *end = HTS_POS_MAX;
4005
0
        kputsn(s, s_len-quoted, &ks); // convert to nul terminated string
4006
0
        if (!ks.s) {
4007
0
            *tid = -2;
4008
0
            return NULL;
4009
0
        }
4010
4011
0
        *tid = getid(hdr, ks.s);
4012
0
        free(ks.s);
4013
4014
0
        return *tid >= 0 ? s_end : NULL;
4015
0
    }
4016
4017
    // Has a colon, but check whole name first.
4018
0
    if (!quoted) {
4019
0
        *beg = 0; *end = HTS_POS_MAX;
4020
0
        kputsn(s, s_len, &ks); // convert to nul terminated string
4021
0
        if (!ks.s) {
4022
0
            *tid = -2;
4023
0
            return NULL;
4024
0
        }
4025
0
        if ((*tid = getid(hdr, ks.s)) >= 0) {
4026
            // Entire name matches, but also check this isn't
4027
            // ambiguous.  eg we have ref chr1 and ref chr1:100-200
4028
            // both present.
4029
0
            ks.l = 0;
4030
0
            kputsn(s, colon-s, &ks); // convert to nul terminated string
4031
0
            if (!ks.s) {
4032
0
                *tid = -2;
4033
0
                return NULL;
4034
0
            }
4035
0
            if (getid(hdr, ks.s) >= 0) {
4036
0
                free(ks.s);
4037
0
                *tid = -1;
4038
0
                hts_log_error("Range is ambiguous. "
4039
0
                              "Use {%s} or {%.*s}%s instead",
4040
0
                              s, (int)(colon-s), s, colon);
4041
0
                return NULL;
4042
0
            }
4043
0
            free(ks.s);
4044
4045
0
            return s_end;
4046
0
        }
4047
0
        if (*tid < -1) // Failed to parse header
4048
0
            return NULL;
4049
0
    }
4050
4051
    // Quoted, or unquoted and whole string isn't a name.
4052
    // Check the pre-colon part is valid.
4053
0
    ks.l = 0;
4054
0
    kputsn(s, colon-s-quoted, &ks); // convert to nul terminated string
4055
0
    if (!ks.s) {
4056
0
        *tid = -2;
4057
0
        return NULL;
4058
0
    }
4059
0
    *tid = getid(hdr, ks.s);
4060
0
    free(ks.s);
4061
0
    if (*tid < 0)
4062
0
        return NULL;
4063
4064
    // Finally parse the post-colon coordinates
4065
0
    char *hyphen;
4066
0
    *beg = hts_parse_decimal(colon+1, &hyphen, flags) - 1;
4067
0
    if (*beg < 0) {
4068
0
        if (*beg != -1 && *hyphen == '-' && colon[1] != '\0') {
4069
            // User specified zero, but we're 1-based.
4070
0
            hts_log_error("Coordinates must be > 0");
4071
0
            return NULL;
4072
0
        }
4073
0
        if (isdigit_c(*hyphen) || *hyphen == '\0' || *hyphen == ',') {
4074
            // interpret chr:-100 as chr:1-100
4075
0
            *end = *beg==-1 ? HTS_POS_MAX : -(*beg+1);
4076
0
            *beg = 0;
4077
0
            return s_end;
4078
0
        } else if (*beg < -1) {
4079
0
            hts_log_error("Unexpected string \"%s\" after region", hyphen);
4080
0
            return NULL;
4081
0
        }
4082
0
    }
4083
4084
0
    if (*hyphen == '\0' || ((flags & HTS_PARSE_LIST) && *hyphen == ',')) {
4085
0
        *end = flags & HTS_PARSE_ONE_COORD ? *beg+1 : HTS_POS_MAX;
4086
0
    } else if (*hyphen == '-') {
4087
0
        *end = hts_parse_decimal(hyphen+1, &hyphen, flags);
4088
0
        if (*hyphen != '\0' && *hyphen != ',') {
4089
0
            hts_log_error("Unexpected string \"%s\" after region", hyphen);
4090
0
            return NULL;
4091
0
        }
4092
0
    } else {
4093
0
        hts_log_error("Unexpected string \"%s\" after region", hyphen);
4094
0
        return NULL;
4095
0
    }
4096
4097
0
    if (*end == 0)
4098
0
        *end = HTS_POS_MAX; // interpret chr:100- as chr:100-<end>
4099
4100
0
    if (*beg >= *end) return NULL;
4101
4102
0
    return s_end;
4103
0
}
4104
4105
// Next release we should mark this as deprecated?
4106
// Use hts_parse_region above instead.
4107
const char *hts_parse_reg64(const char *s, hts_pos_t *beg, hts_pos_t *end)
4108
0
{
4109
0
    char *hyphen;
4110
0
    const char *colon = strrchr(s, ':');
4111
0
    if (colon == NULL) {
4112
0
        *beg = 0; *end = HTS_POS_MAX;
4113
0
        return s + strlen(s);
4114
0
    }
4115
4116
0
    *beg = hts_parse_decimal(colon+1, &hyphen, HTS_PARSE_THOUSANDS_SEP) - 1;
4117
0
    if (*beg < 0) *beg = 0;
4118
4119
0
    if (*hyphen == '\0') *end = HTS_POS_MAX;
4120
0
    else if (*hyphen == '-') *end = hts_parse_decimal(hyphen+1, NULL, HTS_PARSE_THOUSANDS_SEP);
4121
0
    else return NULL;
4122
4123
0
    if (*beg >= *end) return NULL;
4124
0
    return colon;
4125
0
}
4126
4127
const char *hts_parse_reg(const char *s, int *beg, int *end)
4128
0
{
4129
0
    hts_pos_t beg64 = 0, end64 = 0;
4130
0
    const char *colon = hts_parse_reg64(s, &beg64, &end64);
4131
0
    if (beg64 > INT_MAX) {
4132
0
        hts_log_error("Position %"PRId64" too large", beg64);
4133
0
        return NULL;
4134
0
    }
4135
0
    if (end64 > INT_MAX) {
4136
0
        if (end64 == HTS_POS_MAX) {
4137
0
            end64 = INT_MAX;
4138
0
        } else {
4139
0
            hts_log_error("Position %"PRId64" too large", end64);
4140
0
            return NULL;
4141
0
        }
4142
0
    }
4143
0
    *beg = beg64;
4144
0
    *end = end64;
4145
0
    return colon;
4146
0
}
4147
4148
hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec)
4149
0
{
4150
0
    int tid;
4151
0
    hts_pos_t beg, end;
4152
4153
0
    if (strcmp(reg, ".") == 0)
4154
0
        return itr_query(idx, HTS_IDX_START, 0, 0, readrec);
4155
0
    else if (strcmp(reg, "*") == 0)
4156
0
        return itr_query(idx, HTS_IDX_NOCOOR, 0, 0, readrec);
4157
4158
0
    if (!hts_parse_region(reg, &tid, &beg, &end, getid, hdr, HTS_PARSE_THOUSANDS_SEP))
4159
0
        return NULL;
4160
4161
0
    return itr_query(idx, tid, beg, end, readrec);
4162
0
}
4163
4164
0
hts_itr_t *hts_itr_regions(const hts_idx_t *idx, hts_reglist_t *reglist, int count, hts_name2id_f getid, void *hdr, hts_itr_multi_query_func *itr_specific, hts_readrec_func *readrec, hts_seek_func *seek, hts_tell_func *tell) {
4165
4166
0
    int i;
4167
4168
0
    if (!reglist)
4169
0
        return NULL;
4170
4171
0
    hts_itr_t *itr = (hts_itr_t*)calloc(1, sizeof(hts_itr_t));
4172
0
    if (itr) {
4173
0
        itr->n_reg = count;
4174
0
        itr->readrec = readrec;
4175
0
        itr->seek = seek;
4176
0
        itr->tell = tell;
4177
0
        itr->reg_list = reglist;
4178
0
        itr->finished = 0;
4179
0
        itr->nocoor = 0;
4180
0
        itr->multi = 1;
4181
4182
0
        for (i = 0; i < itr->n_reg; i++) {
4183
0
            if (itr->reg_list[i].reg) {
4184
0
                if (!strcmp(itr->reg_list[i].reg, ".")) {
4185
0
                    itr->reg_list[i].tid = HTS_IDX_START;
4186
0
                    continue;
4187
0
                }
4188
4189
0
                if (!strcmp(itr->reg_list[i].reg, "*")) {
4190
0
                    itr->reg_list[i].tid = HTS_IDX_NOCOOR;
4191
0
                    continue;
4192
0
                }
4193
4194
0
                itr->reg_list[i].tid = getid(hdr, reglist[i].reg);
4195
0
                if (itr->reg_list[i].tid < 0) {
4196
0
                    if (itr->reg_list[i].tid < -1) {
4197
0
                        hts_log_error("Failed to parse header");
4198
0
                        hts_itr_destroy(itr);
4199
0
                        return NULL;
4200
0
                    } else {
4201
0
                        hts_log_warning("Region '%s' specifies an unknown reference name. Continue anyway", reglist[i].reg);
4202
0
                    }
4203
0
                }
4204
0
            }
4205
0
        }
4206
4207
0
        qsort(itr->reg_list, itr->n_reg, sizeof(hts_reglist_t), compare_regions);
4208
0
        if (itr_specific(idx, itr) != 0) {
4209
0
            hts_log_error("Failed to create the multi-region iterator!");
4210
0
            hts_itr_destroy(itr);
4211
0
            itr = NULL;
4212
0
        }
4213
0
    }
4214
4215
0
    return itr;
4216
0
}
4217
4218
int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data)
4219
0
{
4220
0
    int ret, tid;
4221
0
    hts_pos_t beg, end;
4222
0
    if (iter == NULL || iter->finished) return -1;
4223
0
    if (iter->read_rest) {
4224
0
        if (iter->curr_off) { // seek to the start
4225
0
            if (bgzf_seek(fp, iter->curr_off, SEEK_SET) < 0) {
4226
0
                hts_log_error("Failed to seek to offset %"PRIu64"%s%s",
4227
0
                              iter->curr_off,
4228
0
                              errno ? ": " : "", strerror(errno));
4229
0
                return -2;
4230
0
            }
4231
0
            iter->curr_off = 0; // only seek once
4232
0
        }
4233
0
        ret = iter->readrec(fp, data, r, &tid, &beg, &end);
4234
0
        if (ret < 0) iter->finished = 1;
4235
0
        iter->curr_tid = tid;
4236
0
        iter->curr_beg = beg;
4237
0
        iter->curr_end = end;
4238
0
        return ret;
4239
0
    }
4240
    // A NULL iter->off should always be accompanied by iter->finished.
4241
0
    assert(iter->off != NULL);
4242
0
    for (;;) {
4243
0
        if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
4244
0
            if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks
4245
0
            if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
4246
0
                if (bgzf_seek(fp, iter->off[iter->i+1].u, SEEK_SET) < 0) {
4247
0
                    hts_log_error("Failed to seek to offset %"PRIu64"%s%s",
4248
0
                                  iter->off[iter->i+1].u,
4249
0
                                  errno ? ": " : "", strerror(errno));
4250
0
                    return -2;
4251
0
                }
4252
0
                iter->curr_off = bgzf_tell(fp);
4253
0
            }
4254
0
            ++iter->i;
4255
0
        }
4256
0
        if ((ret = iter->readrec(fp, data, r, &tid, &beg, &end)) >= 0) {
4257
0
            iter->curr_off = bgzf_tell(fp);
4258
0
            if (tid != iter->tid || beg >= iter->end) { // no need to proceed
4259
0
                ret = -1; break;
4260
0
            } else if (end > iter->beg && iter->end > beg) {
4261
0
                iter->curr_tid = tid;
4262
0
                iter->curr_beg = beg;
4263
0
                iter->curr_end = end;
4264
0
                return ret;
4265
0
            }
4266
0
        } else break; // end of file or error
4267
0
    }
4268
0
    iter->finished = 1;
4269
0
    return ret;
4270
0
}
4271
4272
int hts_itr_multi_next(htsFile *fd, hts_itr_t *iter, void *r)
4273
0
{
4274
0
    void *fp;
4275
0
    int ret, tid, i, cr, ci;
4276
0
    hts_pos_t beg, end;
4277
0
    hts_reglist_t *found_reg;
4278
4279
0
    if (iter == NULL || iter->finished) return -1;
4280
4281
0
    if (iter->is_cram) {
4282
0
        fp = fd->fp.cram;
4283
0
    } else {
4284
0
        fp = fd->fp.bgzf;
4285
0
    }
4286
4287
0
    if (iter->read_rest) {
4288
0
        if (iter->curr_off) { // seek to the start
4289
0
            if (iter->seek(fp, iter->curr_off, SEEK_SET) < 0) {
4290
0
                hts_log_error("Seek at offset %" PRIu64 " failed.", iter->curr_off);
4291
0
                return -1;
4292
0
            }
4293
0
            iter->curr_off = 0; // only seek once
4294
0
        }
4295
4296
0
        ret = iter->readrec(fp, fd, r, &tid, &beg, &end);
4297
0
        if (ret < 0)
4298
0
            iter->finished = 1;
4299
4300
0
        iter->curr_tid = tid;
4301
0
        iter->curr_beg = beg;
4302
0
        iter->curr_end = end;
4303
4304
0
        return ret;
4305
0
    }
4306
    // A NULL iter->off should always be accompanied by iter->finished.
4307
0
    assert(iter->off != NULL || iter->nocoor != 0);
4308
4309
0
    int next_range = 0;
4310
0
    for (;;) {
4311
        // Note that due to the way bam indexing works, iter->off may contain
4312
        // file chunks that are not actually needed as they contain data
4313
        // beyond the end of the requested region.  These are filtered out
4314
        // by comparing the tid and index into hts_reglist_t::intervals
4315
        // (packed for reasons of convenience into iter->off[iter->i].max)
4316
        // associated with the file region with iter->curr_tid and
4317
        // iter->curr_intv.
4318
4319
0
        if (next_range
4320
0
            || iter->curr_off == 0
4321
0
            || iter->i >= iter->n_off
4322
0
            || iter->curr_off >= iter->off[iter->i].v
4323
0
            || (iter->off[iter->i].max >> 32 == iter->curr_tid
4324
0
                && (iter->off[iter->i].max & 0xffffffff) < iter->curr_intv)) {
4325
4326
            // Jump to the next chunk.  It may be necessary to skip more
4327
            // than one as the iter->off list can include overlapping entries.
4328
0
            do {
4329
0
                iter->i++;
4330
0
            } while (iter->i < iter->n_off
4331
0
                     && (iter->curr_off >= iter->off[iter->i].v
4332
0
                         || (iter->off[iter->i].max >> 32 == iter->curr_tid
4333
0
                             && (iter->off[iter->i].max & 0xffffffff) < iter->curr_intv)));
4334
4335
0
            if (iter->is_cram && iter->i < iter->n_off) {
4336
                // Ensure iter->curr_reg is correct.
4337
                //
4338
                // We need this for CRAM as we shortcut some of the later
4339
                // logic by getting an end-of-range and continuing to the
4340
                // next offset.
4341
                //
4342
                // We cannot do this for BAM (and fortunately do not need to
4343
                // either) because in BAM world a query to genomic positions
4344
                // GX and GY leading to a seek offsets PX and PY may have
4345
                // GX > GY and PX < PY.  (This is due to the R-tree and falling
4346
                // between intervals, bumping up to a higher bin.)
4347
                // CRAM strictly follows PX >= PY if GX >= GY, so this logic
4348
                // works.
4349
0
                int want_tid = iter->off[iter->i].max >> 32;
4350
0
                if (!(iter->curr_reg < iter->n_reg &&
4351
0
                      iter->reg_list[iter->curr_reg].tid == want_tid)) {
4352
0
                    int j;
4353
0
                    for (j = 0; j < iter->n_reg; j++)
4354
0
                        if (iter->reg_list[j].tid == want_tid)
4355
0
                            break;
4356
0
                    if (j == iter->n_reg)
4357
0
                        return -1;
4358
0
                    iter->curr_reg = j;
4359
0
                    iter->curr_tid = iter->reg_list[iter->curr_reg].tid;
4360
0
                };
4361
0
                iter->curr_intv = iter->off[iter->i].max & 0xffffffff;
4362
0
            }
4363
4364
0
            if (iter->i >= iter->n_off) { // no more chunks, except NOCOORs
4365
0
                if (iter->nocoor) {
4366
0
                    next_range = 0;
4367
0
                    if (iter->seek(fp, iter->nocoor_off, SEEK_SET) < 0) {
4368
0
                        hts_log_error("Seek at offset %" PRIu64 " failed.", iter->nocoor_off);
4369
0
                        return -1;
4370
0
                    }
4371
0
                    if (iter->is_cram) {
4372
0
                        cram_range r = { HTS_IDX_NOCOOR };
4373
0
                        cram_set_option(fp, CRAM_OPT_RANGE_NOSEEK, &r);
4374
0
                    }
4375
4376
                    // The first slice covering the unmapped reads might
4377
                    // contain a few mapped reads, so scroll
4378
                    // forward until finding the first unmapped read.
4379
0
                    do {
4380
0
                        ret = iter->readrec(fp, fd, r, &tid, &beg, &end);
4381
0
                    } while (tid >= 0 && ret >=0);
4382
4383
0
                    if (ret < 0)
4384
0
                        iter->finished = 1;
4385
0
                    else
4386
0
                        iter->read_rest = 1;
4387
4388
0
                    iter->curr_off = 0; // don't seek any more
4389
0
                    iter->curr_tid = tid;
4390
0
                    iter->curr_beg = beg;
4391
0
                    iter->curr_end = end;
4392
4393
0
                    return ret;
4394
0
                } else {
4395
0
                    ret = -1; break;
4396
0
                }
4397
0
            } else if (iter->i < iter->n_off) {
4398
                // New chunk may overlap the last one, so ensure we
4399
                // only seek forwards.
4400
0
                if (iter->curr_off < iter->off[iter->i].u || next_range) {
4401
0
                    iter->curr_off = iter->off[iter->i].u;
4402
4403
                    // CRAM has the capability of setting an end location.
4404
                    // This means multi-threaded decodes can stop once they
4405
                    // reach that point, rather than pointlessly decoding
4406
                    // more slices than we'll be using.
4407
                    //
4408
                    // We have to be careful here.  Whenever we set the cram
4409
                    // range we need a corresponding seek in order to ensure
4410
                    // we can safely decode at that offset.  We use next_range
4411
                    // var to ensure this is always true; this is set on
4412
                    // end-of-range condition. It's never modified for BAM.
4413
0
                    if (iter->is_cram) {
4414
                        // Next offset.[uv] tuple, but it's already been
4415
                        // included in our cram range, so don't seek and don't
4416
                        // reset range so we can efficiently multi-thread.
4417
0
                        if (next_range || iter->curr_off >= iter->end) {
4418
0
                            if (iter->seek(fp, iter->curr_off, SEEK_SET) < 0) {
4419
0
                                hts_log_error("Seek at offset %" PRIu64
4420
0
                                        " failed.", iter->curr_off);
4421
0
                                return -1;
4422
0
                            }
4423
4424
                            // Find the genomic range matching this interval.
4425
0
                            int j;
4426
0
                            hts_reglist_t *rl = &iter->reg_list[iter->curr_reg];
4427
0
                            cram_range r = {
4428
0
                                    rl->tid,
4429
0
                                    rl->intervals[iter->curr_intv].beg,
4430
0
                                    rl->intervals[iter->curr_intv].end
4431
0
                            };
4432
4433
                            // Expand it up to cover neighbouring intervals.
4434
                            // Note we can only have a single chromosome in a
4435
                            // range, so if we detect our blocks span chromosomes
4436
                            // or we have a multi-ref mode slice, we just use
4437
                            // HTS_IDX_START refid instead.  This doesn't actually
4438
                            // seek (due to CRAM_OPT_RANGE_NOSEEK) and is simply
4439
                            // and indicator of decoding with no end limit.
4440
                            //
4441
                            // That isn't as efficient as it could be, but it's
4442
                            // no poorer than before and it works.
4443
0
                            int tid = r.refid;
4444
0
                            int64_t end = r.end;
4445
0
                            int64_t v = iter->off[iter->i].v;
4446
0
                            j = iter->i+1;
4447
0
                            while (j < iter->n_off) {
4448
0
                                if (iter->off[j].u > v)
4449
0
                                    break;
4450
4451
0
                                uint64_t max = iter->off[j].max;
4452
0
                                if ((max>>32) != tid) {
4453
0
                                    tid = HTS_IDX_START; // => no range limit
4454
0
                                } else {
4455
0
                                    if (end < rl->intervals[max & 0xffffffff].end)
4456
0
                                        end = rl->intervals[max & 0xffffffff].end;
4457
0
                                }
4458
0
                                if (v < iter->off[j].v)
4459
0
                                    v = iter->off[j].v;
4460
0
                                j++;
4461
0
                            }
4462
0
                            r.refid = tid;
4463
0
                            r.end = end;
4464
4465
                            // Remember maximum 'v' here so we don't do
4466
                            // unnecessary subsequent seeks for the next
4467
                            // regions.  We can't change curr_off, but
4468
                            // beg/end are used only by single region iterator so
4469
                            // we cache it there to avoid changing the struct.
4470
0
                            iter->end = v;
4471
4472
0
                            cram_set_option(fp, CRAM_OPT_RANGE_NOSEEK, &r);
4473
0
                            next_range = 0;
4474
0
                        }
4475
0
                    } else { // Not CRAM
4476
0
                        if (iter->seek(fp, iter->curr_off, SEEK_SET) < 0) {
4477
0
                            hts_log_error("Seek at offset %" PRIu64 " failed.",
4478
0
                                          iter->curr_off);
4479
0
                            return -1;
4480
0
                        }
4481
0
                    }
4482
0
                }
4483
0
            }
4484
0
        }
4485
4486
0
        ret = iter->readrec(fp, fd, r, &tid, &beg, &end);
4487
0
        if (ret < 0) {
4488
0
            if (iter->is_cram && cram_eof(fp)) {
4489
                // Skip to end of range
4490
                //
4491
                // We should never be adjusting curr_off manually unless
4492
                // we also can guarantee we'll be doing a seek after to
4493
                // a new location.  Otherwise we'll be reading wrong offset
4494
                // for the next container.
4495
                //
4496
                // We ensure this by adjusting our CRAM_OPT_RANGE
4497
                // accordingly above, but to double check we also
4498
                // set the skipped_block flag to enforce a seek also.
4499
0
                iter->curr_off = iter->off[iter->i].v;
4500
0
                next_range = 1;
4501
4502
                // Next region
4503
0
                if (++iter->curr_intv >= iter->reg_list[iter->curr_reg].count){
4504
0
                    if (++iter->curr_reg >= iter->n_reg)
4505
0
                        break;
4506
0
                    iter->curr_intv = 0;
4507
0
                    iter->curr_tid = iter->reg_list[iter->curr_reg].tid;
4508
0
                }
4509
0
                continue;
4510
0
            } else {
4511
0
                break;
4512
0
            }
4513
0
        }
4514
4515
0
        iter->curr_off = iter->tell(fp);
4516
4517
0
        if (tid != iter->curr_tid) {
4518
0
            hts_reglist_t key;
4519
0
            key.tid = tid;
4520
4521
0
            found_reg = (hts_reglist_t *)bsearch(&key, iter->reg_list,
4522
0
                                                 iter->n_reg,
4523
0
                                                 sizeof(hts_reglist_t),
4524
0
                                                 compare_regions);
4525
0
            if (!found_reg)
4526
0
                continue;
4527
4528
0
            iter->curr_reg = (found_reg - iter->reg_list);
4529
0
            iter->curr_tid = tid;
4530
0
            iter->curr_intv = 0;
4531
0
        }
4532
4533
0
        cr = iter->curr_reg;
4534
0
        ci = iter->curr_intv;
4535
4536
0
        for (i = ci; i < iter->reg_list[cr].count; i++) {
4537
0
            if (end > iter->reg_list[cr].intervals[i].beg &&
4538
0
                iter->reg_list[cr].intervals[i].end > beg) {
4539
0
                iter->curr_beg = beg;
4540
0
                iter->curr_end = end;
4541
0
                iter->curr_intv = i;
4542
4543
0
                return ret;
4544
0
            }
4545
4546
            // Check if the read starts beyond intervals[i].end
4547
            // If so, the interval is finished so move on to the next.
4548
0
            if (beg > iter->reg_list[cr].intervals[i].end)
4549
0
                iter->curr_intv = i + 1;
4550
4551
            // No need to keep searching if the read ends before intervals[i].beg
4552
0
            if (end < iter->reg_list[cr].intervals[i].beg)
4553
0
                break;
4554
0
        }
4555
0
    }
4556
0
    iter->finished = 1;
4557
4558
0
    return ret;
4559
0
}
4560
4561
/**********************
4562
 *** Retrieve index ***
4563
 **********************/
4564
// Local_fn and local_len will return a sub-region of 'fn'.
4565
// Eg http://elsewhere/dir/foo.bam.bai?a=b may return
4566
// foo.bam.bai via local_fn and local_len.
4567
//
4568
// Returns -1 if index couldn't be opened.
4569
//         -2 on other errors
4570
static int idx_test_and_fetch(const char *fn, const char **local_fn, int *local_len, int download)
4571
0
{
4572
0
    hFILE *remote_hfp = NULL;
4573
0
    hFILE *local_fp = NULL;
4574
0
    int save_errno;
4575
0
    htsFormat fmt;
4576
0
    kstring_t s = KS_INITIALIZE;
4577
0
    kstring_t tmps = KS_INITIALIZE;
4578
4579
0
    if (hisremote(fn)) {
4580
0
        const int buf_size = 1 * 1024 * 1024;
4581
0
        int l;
4582
0
        const char *p, *e;
4583
        // Ignore ?# params: eg any file.fmt?param=val, except for S3 URLs
4584
0
        e = fn + ((strncmp(fn, "s3://", 5) && strncmp(fn, "s3+http://", 10) && strncmp(fn, "s3+https://", 11)) ? strcspn(fn, "?#") : strcspn(fn, "?"));
4585
        // Find the previous slash from there.
4586
0
        p = e;
4587
0
        while (p > fn && *p != '/') p--;
4588
0
        if (*p == '/') p++;
4589
4590
        // Attempt to open local file first
4591
0
        kputsn(p, e-p, &s);
4592
0
        if (access(s.s, R_OK) == 0)
4593
0
        {
4594
0
            free(s.s);
4595
0
            *local_fn = p;
4596
0
            *local_len = e-p;
4597
0
            return 0;
4598
0
        }
4599
4600
        // Attempt to open remote file. Stay quiet on failure, it is OK to fail when trying first .csi then .bai or .tbi index.
4601
0
        if ((remote_hfp = hopen(fn, "r")) == 0) {
4602
0
            hts_log_info("Failed to open index file '%s'", fn);
4603
0
            free(s.s);
4604
0
            return -1;
4605
0
        }
4606
0
        if (hts_detect_format2(remote_hfp, fn, &fmt)) {
4607
0
            hts_log_error("Failed to detect format of index file '%s'", fn);
4608
0
            goto fail;
4609
0
        }
4610
0
        if (fmt.category != index_file || (fmt.format != bai &&  fmt.format != csi && fmt.format != tbi
4611
0
                && fmt.format != crai && fmt.format != fai_format)) {
4612
0
            hts_log_error("Format of index file '%s' is not supported", fn);
4613
0
            goto fail;
4614
0
        }
4615
4616
0
        if (download) {
4617
0
            if ((local_fp = hts_open_tmpfile(s.s, "wx", &tmps)) == NULL) {
4618
0
                hts_log_error("Failed to create file %s in the working directory", p);
4619
0
                goto fail;
4620
0
            }
4621
0
            hts_log_info("Downloading file %s to local directory", fn);
4622
0
            uint8_t *buf = (uint8_t*)calloc(buf_size, 1);
4623
0
            if (!buf) {
4624
0
                hts_log_error("%s", strerror(errno));
4625
0
                goto fail;
4626
0
            }
4627
0
            while ((l = hread(remote_hfp, buf, buf_size)) > 0) {
4628
0
                if (hwrite(local_fp, buf, l) != l) {
4629
0
                    hts_log_error("Failed to write data to %s : %s",
4630
0
                            fn, strerror(errno));
4631
0
                    free(buf);
4632
0
                    goto fail;
4633
0
                }
4634
0
            }
4635
0
            free(buf);
4636
0
            if (l < 0) {
4637
0
                hts_log_error("Error reading \"%s\"", fn);
4638
0
                goto fail;
4639
0
            }
4640
0
            if (hclose(local_fp) < 0) {
4641
0
                hts_log_error("Error closing %s : %s", fn, strerror(errno));
4642
0
                local_fp = NULL;
4643
0
                goto fail;
4644
0
            }
4645
0
            local_fp = NULL;
4646
0
            if (rename(tmps.s, s.s) < 0) {
4647
0
                hts_log_error("Error renaming %s : %s", tmps.s, strerror(errno));
4648
0
                goto fail;
4649
0
            }
4650
0
            ks_clear(&tmps);
4651
4652
0
            *local_fn = p;
4653
0
            *local_len = e-p;
4654
0
        } else {
4655
0
            *local_fn = fn;
4656
0
            *local_len = e-fn;
4657
0
        }
4658
4659
0
        if (hclose(remote_hfp) != 0) {
4660
0
            hts_log_error("Failed to close remote file %s", fn);
4661
0
        }
4662
4663
0
        free(tmps.s);
4664
0
        free(s.s);
4665
0
        return 0;
4666
0
    } else {
4667
0
        hFILE *local_hfp;
4668
0
        if ((local_hfp = hopen(fn, "r")) == 0) return -1;
4669
0
        hclose_abruptly(local_hfp);
4670
0
        *local_fn = fn;
4671
0
        *local_len = strlen(fn);
4672
0
        return 0;
4673
0
    }
4674
4675
0
 fail:
4676
0
    save_errno = errno;
4677
0
    if (remote_hfp) hclose_abruptly(remote_hfp);
4678
0
    if (local_fp) hclose_abruptly(local_fp);
4679
0
    if (tmps.l > 0) unlink(tmps.s);
4680
0
    free(tmps.s);
4681
0
    free(s.s);
4682
0
    errno = save_errno;
4683
0
    return -2;
4684
0
}
4685
4686
/*
4687
 * Check the existence of a local index file using part of the alignment
4688
 * file name.
4689
 *
4690
 * For a filename fn of fn.fmt (eg fn.bam or fn.cram) the order of checks is
4691
 * fn.fmt.csi,  fn.csi,
4692
 * fn.fmt.bai,  fn.bai  - if fmt is HTS_FMT_BAI
4693
 * fn.fmt.tbi,  fn.tbi  - if fmt is HTS_FMT_TBI
4694
 * fn.fmt.crai, fn.crai - if fmt is HTS_FMT_CRAI
4695
 * fn.fmt.fai           - if fmt is HTS_FMT_FAI
4696
 *   also .gzi if fmt is ".gz"
4697
 *
4698
 * @param fn    - pointer to the file name
4699
 * @param fmt   - one of the HTS_FMT index formats
4700
 * @param fnidx - pointer to the index file name placeholder
4701
 * @return        1 for success, 0 for failure
4702
 */
4703
6.27k
int hts_idx_check_local(const char *fn, int fmt, char **fnidx) {
4704
6.27k
    int i, l_fn, l_ext;
4705
6.27k
    const char *fn_tmp = NULL;
4706
6.27k
    char *fnidx_tmp;
4707
6.27k
    const char *csi_ext = ".csi";
4708
6.27k
    const char *bai_ext = ".bai";
4709
6.27k
    const char *tbi_ext = ".tbi";
4710
6.27k
    const char *crai_ext = ".crai";
4711
6.27k
    const char *fai_ext = ".fai";
4712
6.27k
    const char *gzi_ext = ".gzi";
4713
4714
6.27k
    if (!fn)
4715
0
        return 0;
4716
4717
6.27k
    if (hisremote(fn)) {
4718
0
        for (i = strlen(fn) - 1; i >= 0; --i)
4719
0
            if (fn[i] == '/') {
4720
0
                fn_tmp = (char *)&fn[i+1];
4721
0
                break;
4722
0
            }
4723
6.27k
    } else {
4724
        // Borrowed from hopen_fd_fileuri()
4725
6.27k
        if (strncmp(fn, "file://localhost/", 17) == 0) fn_tmp = fn + 16;
4726
6.27k
        else if (strncmp(fn, "file:///", 8) == 0) fn_tmp = fn + 7;
4727
6.27k
        else fn_tmp = fn;
4728
#if defined(_WIN32) || defined(__MSYS__)
4729
        // For cases like C:/foo
4730
        if (fn_tmp[0] == '/' && fn_tmp[1] && fn_tmp[2] == ':' && fn_tmp[3] == '/')
4731
            fn_tmp++;
4732
#endif
4733
6.27k
    }
4734
4735
6.27k
    if (!fn_tmp) return 0;
4736
6.27k
    hts_log_info("Using alignment file '%s'", fn_tmp);
4737
6.27k
    l_fn = strlen(fn_tmp); l_ext = 5;
4738
6.27k
    fnidx_tmp = (char*)calloc(l_fn + l_ext + 1, 1);
4739
6.27k
    if (!fnidx_tmp) return 0;
4740
4741
6.27k
    struct stat sbuf;
4742
4743
    // Try alignment.bam.csi first
4744
6.27k
    strcpy(fnidx_tmp, fn_tmp); strcpy(fnidx_tmp + l_fn, csi_ext);
4745
6.27k
    if(stat(fnidx_tmp, &sbuf) == 0) {
4746
0
        *fnidx = fnidx_tmp;
4747
0
        return 1;
4748
6.27k
    } else { // Then try alignment.csi
4749
25.1k
        for (i = l_fn - 1; i > 0; --i)
4750
18.8k
            if (fnidx_tmp[i] == '.') {
4751
0
                strcpy(fnidx_tmp + i, csi_ext);
4752
0
                if(stat(fnidx_tmp, &sbuf) == 0) {
4753
0
                    *fnidx = fnidx_tmp;
4754
0
                    return 1;
4755
0
                }
4756
0
                break;
4757
0
            }
4758
6.27k
    }
4759
6.27k
    if (fmt == HTS_FMT_BAI) {
4760
        // Next, try alignment.bam.bai
4761
0
        strcpy(fnidx_tmp, fn_tmp); strcpy(fnidx_tmp + l_fn, bai_ext);
4762
0
        if(stat(fnidx_tmp, &sbuf) == 0) {
4763
0
            *fnidx = fnidx_tmp;
4764
0
            return 1;
4765
0
        } else { // And finally, try alignment.bai
4766
0
            for (i = l_fn - 1; i > 0; --i)
4767
0
                if (fnidx_tmp[i] == '.') {
4768
0
                    strcpy(fnidx_tmp + i, bai_ext);
4769
0
                    if(stat(fnidx_tmp, &sbuf) == 0) {
4770
0
                        *fnidx = fnidx_tmp;
4771
0
                        return 1;
4772
0
                    }
4773
0
                    break;
4774
0
                }
4775
0
        }
4776
6.27k
    } else if (fmt == HTS_FMT_TBI) { // Or .tbi
4777
6.27k
        strcpy(fnidx_tmp, fn_tmp); strcpy(fnidx_tmp + l_fn, tbi_ext);
4778
6.27k
        if(stat(fnidx_tmp, &sbuf) == 0) {
4779
0
            *fnidx = fnidx_tmp;
4780
0
            return 1;
4781
6.27k
        } else {
4782
25.1k
            for (i = l_fn - 1; i > 0; --i)
4783
18.8k
                if (fnidx_tmp[i] == '.') {
4784
0
                    strcpy(fnidx_tmp + i, tbi_ext);
4785
0
                    if(stat(fnidx_tmp, &sbuf) == 0) {
4786
0
                        *fnidx = fnidx_tmp;
4787
0
                        return 1;
4788
0
                    }
4789
0
                    break;
4790
0
                }
4791
6.27k
        }
4792
6.27k
    } else if (fmt == HTS_FMT_CRAI) { // Or .crai
4793
0
        strcpy(fnidx_tmp, fn_tmp); strcpy(fnidx_tmp + l_fn, crai_ext);
4794
0
        if(stat(fnidx_tmp, &sbuf) == 0) {
4795
0
            *fnidx = fnidx_tmp;
4796
0
            return 1;
4797
0
        } else {
4798
0
            for (i = l_fn - 1; i > 0; --i)
4799
0
                if (fnidx_tmp[i] == '.') {
4800
0
                    strcpy(fnidx_tmp + i, crai_ext);
4801
0
                    if(stat(fnidx_tmp, &sbuf) == 0) {
4802
0
                        *fnidx = fnidx_tmp;
4803
0
                        return 1;
4804
0
                    }
4805
0
                    break;
4806
0
                }
4807
0
        }
4808
0
    } else if (fmt == HTS_FMT_FAI) { // Or .fai
4809
        // Check .gzi if we have a .gz file
4810
0
        strcpy(fnidx_tmp, fn_tmp);
4811
0
        int gzi_ok = 1;
4812
0
        if ((l_fn > 3 && strcmp(fn_tmp+l_fn-3, ".gz") == 0) ||
4813
0
            (l_fn > 5 && strcmp(fn_tmp+l_fn-5, ".bgzf") == 0)) {
4814
0
            strcpy(fnidx_tmp + l_fn, gzi_ext);
4815
0
            gzi_ok = stat(fnidx_tmp, &sbuf)==0;
4816
0
        }
4817
4818
        // Now check for .fai.  Occurs second as we're returning this
4819
        // in *fnidx irrespective of whether we did gzi check.
4820
0
        strcpy(fnidx_tmp + l_fn, fai_ext);
4821
0
        *fnidx = fnidx_tmp;
4822
0
        if (stat(fnidx_tmp, &sbuf) == 0)
4823
0
            return gzi_ok;
4824
0
        else
4825
0
            return 0;
4826
0
    }
4827
4828
6.27k
    free(fnidx_tmp);
4829
6.27k
    return 0;
4830
6.27k
}
4831
4832
0
static char *idx_filename(const char *fn, const char *ext, int download) {
4833
0
    int ret, local_len;
4834
0
    char *fnidx;
4835
0
    const char *local_fn = NULL;
4836
0
    kstring_t buffer = KS_INITIALIZE;
4837
4838
    // First try : append `ext` to `fn`
4839
0
    if (!(fnidx = haddextension(&buffer, fn, 0, ext))) {
4840
0
        free(buffer.s);
4841
0
        return NULL;
4842
0
    }
4843
0
    if ((ret = idx_test_and_fetch(fnidx, &local_fn, &local_len, download)) == -1) {
4844
        // Second try : replace suffix of `fn` with `ext`
4845
0
        if (!(fnidx = haddextension(&buffer, fn, 1, ext))) {
4846
0
            free(buffer.s);
4847
0
            return NULL;
4848
0
        }
4849
0
        ret = idx_test_and_fetch(fnidx, &local_fn, &local_len, download);
4850
0
    }
4851
4852
0
    if (ret < 0) {
4853
0
        free(buffer.s);
4854
0
        return NULL;
4855
0
    }
4856
4857
0
    memmove(fnidx, local_fn, local_len);
4858
0
    fnidx[local_len] = 0;
4859
0
    return fnidx;
4860
0
}
4861
4862
char *hts_idx_getfn(const char *fn, const char *ext)
4863
0
{
4864
0
    return idx_filename(fn, ext, HTS_IDX_SAVE_REMOTE);
4865
0
}
4866
4867
char *hts_idx_locatefn(const char *fn, const char *ext)
4868
0
{
4869
0
    return idx_filename(fn, ext, 0);
4870
0
}
4871
4872
static hts_idx_t *idx_find_and_load(const char *fn, int fmt, int flags)
4873
6.27k
{
4874
6.27k
    char *fnidx = strstr(fn, HTS_IDX_DELIM);
4875
6.27k
    hts_idx_t *idx;
4876
4877
6.27k
    if ( fnidx ) {
4878
0
        char *fn2 = strdup(fn);
4879
0
        if (!fn2) {
4880
0
            hts_log_error("%s", strerror(errno));
4881
0
            return NULL;
4882
0
        }
4883
0
        fn2[fnidx - fn] = '\0';
4884
0
        fnidx += strlen(HTS_IDX_DELIM);
4885
0
        idx = hts_idx_load3(fn2, fnidx, fmt, flags);
4886
0
        free(fn2);
4887
0
        return idx;
4888
0
    }
4889
4890
6.27k
    if (hts_idx_check_local(fn, fmt, &fnidx) == 0 && hisremote(fn)) {
4891
0
        if (flags & HTS_IDX_SAVE_REMOTE) {
4892
0
            fnidx = idx_filename(fn, ".csi", HTS_IDX_SAVE_REMOTE);
4893
0
            if (!fnidx) {
4894
0
                switch (fmt) {
4895
0
                case HTS_FMT_BAI: fnidx = idx_filename(fn, ".bai", HTS_IDX_SAVE_REMOTE); break;
4896
0
                case HTS_FMT_TBI: fnidx = idx_filename(fn, ".tbi", HTS_IDX_SAVE_REMOTE); break;
4897
0
                default: break;
4898
0
                }
4899
0
            }
4900
0
        } else {
4901
0
            fnidx = idx_filename(fn, ".csi", 0);
4902
0
            if (!fnidx) {
4903
0
                switch (fmt) {
4904
0
                case HTS_FMT_BAI: fnidx = idx_filename(fn, ".bai", 0); break;
4905
0
                case HTS_FMT_TBI: fnidx = idx_filename(fn, ".tbi", 0); break;
4906
0
                default: break;
4907
0
                }
4908
0
            }
4909
0
        }
4910
0
    }
4911
6.27k
    if (!fnidx) {
4912
6.27k
        if (!(flags & HTS_IDX_SILENT_FAIL))
4913
0
            hts_log_error("Could not retrieve index file for '%s'", fn);
4914
6.27k
        return 0;
4915
6.27k
    }
4916
4917
0
    if (flags & HTS_IDX_SAVE_REMOTE)
4918
0
        idx = hts_idx_load3(fn, fnidx, fmt, flags);
4919
0
    else
4920
0
        idx = idx_read(fnidx);
4921
0
    free(fnidx);
4922
0
    return idx;
4923
6.27k
}
4924
4925
0
hts_idx_t *hts_idx_load(const char *fn, int fmt) {
4926
0
    return idx_find_and_load(fn, fmt, 1);
4927
0
}
4928
4929
hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx)
4930
0
{
4931
0
    return hts_idx_load3(fn, fnidx, 0, 0);
4932
0
}
4933
4934
hts_idx_t *hts_idx_load3(const char *fn, const char *fnidx, int fmt, int flags)
4935
6.27k
{
4936
6.27k
    const char *local_fn = NULL;
4937
6.27k
    char *local_fnidx = NULL;
4938
6.27k
    int local_len;
4939
6.27k
    if (!fnidx)
4940
6.27k
        return idx_find_and_load(fn, fmt, flags);
4941
4942
    // Check that the index file is up to date, the main file might have changed
4943
0
    struct stat stat_idx,stat_main;
4944
0
    int remote_fn = hisremote(fn), remote_fnidx = hisremote(fnidx);
4945
0
    if ( !remote_fn && !remote_fnidx
4946
0
         && !stat(fn, &stat_main) && !stat(fnidx, &stat_idx) )
4947
0
    {
4948
0
        if ( stat_idx.st_mtime < stat_main.st_mtime )
4949
0
            hts_log_warning("The index file is older than the data file: %s", fnidx);
4950
0
    }
4951
4952
0
    if (remote_fnidx && (flags & HTS_IDX_SAVE_REMOTE))
4953
0
    {
4954
0
        int ret = idx_test_and_fetch(fnidx, &local_fn, &local_len, 1);
4955
0
        if (ret == 0) {
4956
0
            local_fnidx = strdup(local_fn);
4957
0
            if (local_fnidx) {
4958
0
                local_fnidx[local_len] = '\0';
4959
0
                fnidx = local_fnidx;
4960
0
            }
4961
0
        }
4962
0
    }
4963
4964
0
    hts_idx_t *idx = idx_read(fnidx);
4965
0
    if (!idx && !(flags & HTS_IDX_SILENT_FAIL))
4966
0
        hts_log_error("Could not load local index file '%s'%s%s", fnidx,
4967
0
                      errno ? " : " : "", errno ? strerror(errno) : "");
4968
4969
4970
0
    free(local_fnidx);
4971
4972
0
    return idx;
4973
6.27k
}
4974
4975
4976
4977
/**********************
4978
 ***     Memory     ***
4979
 **********************/
4980
4981
/* For use with hts_expand macros *only* */
4982
HTSLIB_EXPORT
4983
size_t hts_realloc_or_die(size_t n, size_t m, size_t m_sz, size_t size,
4984
5.75k
                          int clear, void **ptr, const char *func) {
4985
    /* If new_m and size are both below this limit, multiplying them
4986
       together can't overflow */
4987
5.75k
    const size_t safe = (size_t) 1 << (sizeof(size_t) * 4);
4988
5.75k
    void *new_ptr;
4989
5.75k
    size_t bytes, new_m;
4990
4991
5.75k
    new_m = n;
4992
5.75k
    kroundup_size_t(new_m);
4993
4994
5.75k
    bytes = size * new_m;
4995
4996
    /* Check for overflow.  Both ensure that new_m will fit in m (we make the
4997
       pessimistic assumption that m is signed), and that bytes has not
4998
       wrapped around. */
4999
5.75k
    if (new_m > (((size_t) 1 << (m_sz * 8 - 1)) - 1)
5000
5.75k
        || ((size > safe || new_m > safe)
5001
5.75k
            && bytes / new_m != size)) {
5002
0
        errno = ENOMEM;
5003
0
        goto die;
5004
0
    }
5005
5006
5.75k
    new_ptr = realloc(*ptr, bytes);
5007
5.75k
    if (new_ptr == NULL) goto die;
5008
5009
5.75k
    if (clear) {
5010
0
        if (new_m > m) {
5011
0
            memset((char *) new_ptr + m * size, 0, (new_m - m) * size);
5012
0
        }
5013
0
    }
5014
5015
5.75k
    *ptr = new_ptr;
5016
5017
5.75k
    return new_m;
5018
5019
0
 die:
5020
0
    hts_log_error("%s", strerror(errno));
5021
0
    exit(1);
5022
5.75k
}
5023
5024
/*
5025
 * Companion to hts_resize() macro that does the actual allocation.
5026
 *
5027
 * Somewhat complicated as hts_resize() needs to write the new allocated
5028
 * size back into *size_in_out, and the value pointed to may either be
5029
 * int32_t, uint32_t or size_t depending on which array is being resized.
5030
 * This is solved by making `size_in_out` a void pointer, getting the macro
5031
 * to pass in the size of the item pointed to (in `size_sz`) and then using
5032
 * an appropriate cast (based on the value of size_sz).  The function
5033
 * ensures that the maximum size will be storable in a signed type of
5034
 * the given size so storing to an int32_t should work correctly.
5035
 *
5036
 * Assumes that sizeof(uint32_t) and sizeof(int32_t) is 4,
5037
 * sizeof(uint64_t) and sizeof(int64_t) is 8 and sizeof(size_t) is
5038
 * either 4 or 8.  It also assumes casting from unsigned to signed will
5039
 * work as long as the top bit isn't set.
5040
 */
5041
5042
int hts_resize_array_(size_t item_size, size_t num, size_t size_sz,
5043
                      void *size_in_out, void **ptr_in_out, int flags,
5044
33.6k
                      const char *func) {
5045
    /* If new_size and item_size are both below this limit, multiplying them
5046
       together can't overflow */
5047
33.6k
    const size_t safe = (size_t) 1 << (sizeof(size_t) * 4);
5048
33.6k
    void *new_ptr;
5049
33.6k
    size_t bytes, new_size;
5050
5051
33.6k
    new_size = num;
5052
33.6k
    kroundup_size_t(new_size);
5053
33.6k
    bytes = item_size * new_size;
5054
5055
    /* Check for overflow.  Both ensure that alloc will fit in alloc_in_out (we
5056
       make the pessimistic assumption that *alloc_in_out is signed), and that
5057
       bytes has not wrapped around. */
5058
5059
33.6k
    if ((new_size > (((size_t) 1 << (size_sz * 8 - 1)) - 1))
5060
33.6k
        || (((item_size > safe) || (new_size > safe))
5061
33.6k
            && bytes / new_size != item_size)) {
5062
0
        hts_log(HTS_LOG_ERROR, func, "Memory allocation too large");
5063
0
        errno = ENOMEM;
5064
0
        return -1;
5065
0
    }
5066
5067
33.6k
    new_ptr = realloc(*ptr_in_out, bytes);
5068
33.6k
    if (new_ptr == NULL) {
5069
0
        int save_errno = errno;
5070
0
        hts_log(HTS_LOG_ERROR, func, "%s", strerror(errno));
5071
0
        errno = save_errno;
5072
0
        return -1;
5073
0
    }
5074
5075
33.6k
    if (flags & HTS_RESIZE_CLEAR) {
5076
33.6k
        size_t old_size;
5077
33.6k
        switch (size_sz) {
5078
33.6k
        case 4: old_size = *((uint32_t *) size_in_out); break;
5079
0
        case 8: old_size = *((uint64_t *) size_in_out); break;
5080
0
        default: abort();
5081
33.6k
        }
5082
33.6k
        if (new_size > old_size) {
5083
33.6k
            memset((char *) new_ptr + old_size * item_size, 0,
5084
33.6k
                   (new_size - old_size) * item_size);
5085
33.6k
        }
5086
33.6k
    }
5087
5088
33.6k
    switch (size_sz) {
5089
33.6k
    case 4: *((uint32_t *) size_in_out) = new_size; break;
5090
0
    case 8: *((uint64_t *) size_in_out) = new_size; break;
5091
0
    default: abort();
5092
33.6k
    }
5093
5094
33.6k
    *ptr_in_out = new_ptr;
5095
33.6k
    return 0;
5096
33.6k
}
5097
5098
void hts_lib_shutdown(void)
5099
0
{
5100
0
    hfile_shutdown(1);
5101
0
}
5102
5103
0
void hts_free(void *ptr) {
5104
0
    free(ptr);
5105
0
}
5106
5107
void hts_set_log_level(enum htsLogLevel level)
5108
0
{
5109
0
    hts_verbose = level;
5110
0
}
5111
5112
enum htsLogLevel hts_get_log_level(void)
5113
0
{
5114
0
    return hts_verbose;
5115
0
}
5116
5117
static char get_severity_tag(enum htsLogLevel severity)
5118
2.32M
{
5119
2.32M
    switch (severity) {
5120
44.6k
    case HTS_LOG_ERROR:
5121
44.6k
        return 'E';
5122
2.27M
    case HTS_LOG_WARNING:
5123
2.27M
        return 'W';
5124
0
    case HTS_LOG_INFO:
5125
0
        return 'I';
5126
0
    case HTS_LOG_DEBUG:
5127
0
        return 'D';
5128
0
    case HTS_LOG_TRACE:
5129
0
        return 'T';
5130
0
    default:
5131
0
        break;
5132
2.32M
    }
5133
5134
0
    return '*';
5135
2.32M
}
5136
5137
void hts_log(enum htsLogLevel severity, const char *context, const char *format, ...)
5138
2.78M
{
5139
2.78M
    int save_errno = errno;
5140
2.78M
    if (severity <= hts_verbose) {
5141
2.32M
        va_list argptr;
5142
5143
2.32M
        fprintf(stderr, "[%c::%s] ", get_severity_tag(severity), context);
5144
5145
2.32M
        va_start(argptr, format);
5146
2.32M
        vfprintf(stderr, format, argptr);
5147
2.32M
        va_end(argptr);
5148
5149
2.32M
        fprintf(stderr, "\n");
5150
2.32M
    }
5151
2.78M
    errno = save_errno;
5152
2.78M
}