Coverage Report

Created: 2026-03-25 06:43

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/hts.c
Line
Count
Source
1
/*  hts.c -- format-neutral I/O, indexing, and iterator API functions.
2
3
    Copyright (C) 2008, 2009, 2012-2025 Genome Research Ltd.
4
    Copyright (C) 2012, 2013 Broad Institute.
5
6
    Author: Heng Li <lh3@sanger.ac.uk>
7
8
Permission is hereby granted, free of charge, to any person obtaining a copy
9
of this software and associated documentation files (the "Software"), to deal
10
in the Software without restriction, including without limitation the rights
11
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
copies of the Software, and to permit persons to whom the Software is
13
furnished to do so, subject to the following conditions:
14
15
The above copyright notice and this permission notice shall be included in
16
all copies or substantial portions of the Software.
17
18
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24
DEALINGS IN THE SOFTWARE.  */
25
26
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
27
#include <config.h>
28
29
#include <zlib.h>
30
#include <stdio.h>
31
#include <string.h>
32
#include <strings.h>
33
#include <stdlib.h>
34
#include <unistd.h>
35
#include <inttypes.h>
36
#include <limits.h>
37
#include <stdint.h>
38
#include <fcntl.h>
39
#include <errno.h>
40
#include <time.h>
41
#include <sys/stat.h>
42
#include <assert.h>
43
44
#ifdef HAVE_LIBLZMA
45
#ifdef HAVE_LZMA_H
46
#include <lzma.h>
47
#else
48
#include "os/lzma_stub.h"
49
#endif
50
#endif
51
52
#include "htslib/hts.h"
53
#include "htslib/bgzf.h"
54
#include "cram/cram.h"
55
#include "htslib/hfile.h"
56
#include "htslib/hts_endian.h"
57
#include "version.h"
58
#include "config_vars.h"
59
#include "hts_internal.h"
60
#include "hfile_internal.h"
61
#include "sam_internal.h"
62
#include "htslib/hts_expr.h"
63
#include "htslib/hts_os.h" // drand48
64
65
#include "htslib/khash.h"
66
#include "htslib/kseq.h"
67
#include "htslib/ksort.h"
68
#include "htslib/tbx.h"
69
#if defined(HAVE_EXTERNAL_LIBHTSCODECS)
70
#include <htscodecs/htscodecs.h>
71
#else
72
#include "htscodecs/htscodecs/htscodecs.h"
73
#endif
74
75
#ifndef EFTYPE
76
0
#define EFTYPE ENOEXEC
77
#endif
78
79
3.42k
KHASH_INIT2(s2i,, kh_cstr_t, int64_t, 1, kh_str_hash_func, kh_str_hash_equal)
Unexecuted instantiation: kh_grow_to_fit_s2i
Unexecuted instantiation: kh_stats_s2i
80
81
HTSLIB_EXPORT
82
int hts_verbose = HTS_LOG_WARNING;
83
84
const char *hts_version(void)
85
2
{
86
2
    return HTS_VERSION_TEXT;
87
2
}
88
89
0
unsigned int hts_features(void) {
90
0
    unsigned int feat = HTS_FEATURE_HTSCODECS; // Always present
91
92
0
#ifdef PACKAGE_URL
93
0
    feat |= HTS_FEATURE_CONFIGURE;
94
0
#endif
95
96
#ifdef ENABLE_PLUGINS
97
    feat |= HTS_FEATURE_PLUGINS;
98
#endif
99
100
0
#ifdef HAVE_LIBCURL
101
0
    feat |= HTS_FEATURE_LIBCURL;
102
0
#endif
103
104
0
#ifdef ENABLE_S3
105
0
    feat |= HTS_FEATURE_S3;
106
0
#endif
107
108
0
#ifdef ENABLE_GCS
109
0
    feat |= HTS_FEATURE_GCS;
110
0
#endif
111
112
#ifdef HAVE_LIBDEFLATE
113
    feat |= HTS_FEATURE_LIBDEFLATE;
114
#endif
115
116
0
#ifdef HAVE_LIBLZMA
117
0
    feat |= HTS_FEATURE_LZMA;
118
0
#endif
119
120
0
#ifdef HAVE_LIBBZ2
121
0
    feat |= HTS_FEATURE_BZIP2;
122
0
#endif
123
124
0
    return feat;
125
0
}
126
127
0
const char *hts_test_feature(unsigned int id) {
128
0
    unsigned int feat = hts_features();
129
130
0
    switch (id) {
131
0
    case HTS_FEATURE_CONFIGURE:
132
0
        return feat & HTS_FEATURE_CONFIGURE ? "yes" : NULL;
133
0
    case HTS_FEATURE_PLUGINS:
134
0
        return feat & HTS_FEATURE_PLUGINS ? "yes" : NULL;
135
0
    case HTS_FEATURE_LIBCURL:
136
0
        return feat & HTS_FEATURE_LIBCURL ? "yes" : NULL;
137
0
    case HTS_FEATURE_S3:
138
0
        return feat & HTS_FEATURE_S3 ? "yes" : NULL;
139
0
    case HTS_FEATURE_GCS:
140
0
        return feat & HTS_FEATURE_GCS ? "yes" : NULL;
141
0
    case HTS_FEATURE_LIBDEFLATE:
142
0
        return feat & HTS_FEATURE_LIBDEFLATE ? "yes" : NULL;
143
0
    case HTS_FEATURE_BZIP2:
144
0
        return feat & HTS_FEATURE_BZIP2 ? "yes" : NULL;
145
0
    case HTS_FEATURE_LZMA:
146
0
        return feat & HTS_FEATURE_LZMA ? "yes" : NULL;
147
148
0
    case HTS_FEATURE_HTSCODECS:
149
0
        return htscodecs_version();
150
151
0
    case HTS_FEATURE_CC:
152
0
        return HTS_CC;
153
0
    case HTS_FEATURE_CFLAGS:
154
0
        return HTS_CFLAGS;
155
0
    case HTS_FEATURE_LDFLAGS:
156
0
        return HTS_LDFLAGS;
157
0
    case HTS_FEATURE_CPPFLAGS:
158
0
        return HTS_CPPFLAGS;
159
160
0
    default:
161
0
        fprintf(stderr, "Unknown feature code: %u\n", id);
162
0
    }
163
164
0
    return NULL;
165
0
}
166
167
// Note this implementation also means we can just "strings" the library
168
// to find the configuration parameters.
169
0
const char *hts_feature_string(void) {
170
0
    static char config[1200];
171
0
    const char *flags=
172
173
0
#ifdef PACKAGE_URL
174
0
    "build=configure "
175
#else
176
    "build=Makefile "
177
#endif
178
179
0
#ifdef HAVE_LIBCURL
180
0
    "libcurl=yes "
181
#else
182
    "libcurl=no "
183
#endif
184
185
0
#ifdef ENABLE_S3
186
0
    "S3=yes "
187
#else
188
    "S3=no "
189
#endif
190
191
0
#ifdef ENABLE_GCS
192
0
    "GCS=yes "
193
#else
194
    "GCS=no "
195
#endif
196
197
#ifdef HAVE_LIBDEFLATE
198
    "libdeflate=yes "
199
#else
200
0
    "libdeflate=no "
201
0
#endif
202
203
0
#ifdef HAVE_LIBLZMA
204
0
    "lzma=yes "
205
#else
206
    "lzma=no "
207
#endif
208
209
0
#ifdef HAVE_LIBBZ2
210
0
    "bzip2=yes "
211
#else
212
    "bzip2=no "
213
#endif
214
215
// "plugins=" must stay at the end as it is followed by "plugin-path="
216
#ifdef ENABLE_PLUGINS
217
    "plugins=yes";
218
#else
219
0
    "plugins=no";
220
0
#endif
221
222
#ifdef ENABLE_PLUGINS
223
    snprintf(config, sizeof(config),
224
             "%s plugin-path=%.1000s htscodecs=%.40s",
225
             flags, hts_plugin_path(), htscodecs_version());
226
#else
227
0
    snprintf(config, sizeof(config),
228
0
             "%s htscodecs=%.40s",
229
0
             flags, htscodecs_version());
230
0
#endif
231
0
    return config;
232
0
}
233
234
235
// Converts ASCII to BAM nibble encoding.
236
// Note 0123 is treated as ACGT (ABI colourspace encoding) and
237
// U is treated as T.
238
HTSLIB_EXPORT
239
const unsigned char seq_nt16_table[256] = {
240
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
241
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
242
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
243
     1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15,
244
    15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
245
    15,15, 5, 6,  8, 8, 7, 9, 15,10,15,15, 15,15,15,15,
246
    15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
247
    15,15, 5, 6,  8, 8, 7, 9, 15,10,15,15, 15,15,15,15,
248
249
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
250
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
251
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
252
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
253
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
254
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
255
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
256
    15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
257
};
258
259
HTSLIB_EXPORT
260
const char seq_nt16_str[] = "=ACMGRSVTWYHKDBN";
261
262
HTSLIB_EXPORT
263
const int seq_nt16_int[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
264
265
/**********************
266
 *** Basic file I/O ***
267
 **********************/
268
269
static enum htsFormatCategory format_category(enum htsExactFormat fmt)
270
7.77k
{
271
7.77k
    switch (fmt) {
272
0
    case bam:
273
0
    case sam:
274
1.93k
    case cram:
275
1.93k
    case fastq_format:
276
1.93k
    case fasta_format:
277
1.93k
        return sequence_data;
278
279
0
    case vcf:
280
0
    case bcf:
281
0
        return variant_data;
282
283
0
    case bai:
284
0
    case crai:
285
0
    case csi:
286
0
    case fai_format:
287
0
    case fqi_format:
288
0
    case gzi:
289
0
    case tbi:
290
0
        return index_file;
291
292
0
    case bed:
293
0
    case d4_format:
294
0
        return region_list;
295
296
0
    case htsget:
297
0
    case hts_crypt4gh_format:
298
0
        return unknown_category;
299
300
0
    case unknown_format:
301
2.92k
    case binary_format:
302
5.84k
    case text_format:
303
5.84k
    case empty_format:
304
5.84k
    case format_maximum:
305
5.84k
        break;
306
7.77k
    }
307
308
5.84k
    return unknown_category;
309
7.77k
}
310
311
// Decompress several hundred bytes by peeking at the file, which must be
312
// positioned at the start of a GZIP block.
313
static ssize_t
314
decompress_peek_gz(hFILE *fp, unsigned char *dest, size_t destsize)
315
1.72k
{
316
1.72k
    unsigned char buffer[2048];
317
1.72k
    z_stream zs;
318
1.72k
    ssize_t npeek = hpeek(fp, buffer, sizeof buffer);
319
320
1.72k
    if (npeek < 0) return -1;
321
322
1.72k
    zs.zalloc = NULL;
323
1.72k
    zs.zfree = NULL;
324
1.72k
    zs.next_in = buffer;
325
1.72k
    zs.avail_in = npeek;
326
1.72k
    zs.next_out = dest;
327
1.72k
    zs.avail_out = destsize;
328
1.72k
    if (inflateInit2(&zs, 31) != Z_OK) return -1;
329
330
1.72k
    int ret;
331
1.72k
    const unsigned char *last_in = buffer;
332
3.44k
    while (zs.avail_out > 0) {
333
1.73k
        ret = inflate(&zs, Z_SYNC_FLUSH);
334
1.73k
        if (ret == Z_STREAM_END) {
335
15
            if (last_in == zs.next_in)
336
0
                break; // Paranoia to avoid potential looping. Shouldn't happen
337
15
            else
338
15
                last_in = zs.next_in;
339
15
            inflateReset(&zs);
340
1.72k
        } else if (ret != Z_OK) {
341
            // eg Z_BUF_ERROR due to avail_in/out becoming zero
342
13
            break;
343
13
        }
344
1.73k
    }
345
346
    // NB: zs.total_out is changed by inflateReset, so use pointer diff instead
347
1.72k
    destsize = zs.next_out - dest;
348
1.72k
    inflateEnd(&zs);
349
350
1.72k
    return destsize;
351
1.72k
}
352
353
#ifdef HAVE_LIBLZMA
354
// Similarly decompress a portion by peeking at the file, which must be
355
// positioned at the start of the file.
356
static ssize_t
357
decompress_peek_xz(hFILE *fp, unsigned char *dest, size_t destsize)
358
0
{
359
0
    unsigned char buffer[2048];
360
0
    ssize_t npeek = hpeek(fp, buffer, sizeof buffer);
361
0
    if (npeek < 0) return -1;
362
363
0
    lzma_stream ls = LZMA_STREAM_INIT;
364
0
    if (lzma_stream_decoder(&ls, lzma_easy_decoder_memusage(9), 0) != LZMA_OK)
365
0
        return -1;
366
367
0
    ls.next_in = buffer;
368
0
    ls.avail_in = npeek;
369
0
    ls.next_out = dest;
370
0
    ls.avail_out = destsize;
371
372
0
    int r = lzma_code(&ls, LZMA_RUN);
373
0
    if (! (r == LZMA_OK || r == LZMA_STREAM_END)) {
374
0
        lzma_end(&ls);
375
0
        return -1;
376
0
    }
377
378
0
    destsize = ls.total_out;
379
0
    lzma_end(&ls);
380
381
0
    return destsize;
382
0
}
383
#endif
384
385
// Parse "x.y" text, taking care because the string is not NUL-terminated
386
// and filling in major/minor only when the digits are followed by a delimiter,
387
// so we don't misread "1.10" as "1.1" due to reaching the end of the buffer.
388
static void
389
parse_version(htsFormat *fmt, const unsigned char *u, const unsigned char *ulim)
390
93
{
391
93
    const char *s    = (const char *) u;
392
93
    const char *slim = (const char *) ulim;
393
93
    short v;
394
395
93
    fmt->version.major = fmt->version.minor = -1;
396
397
162
    for (v = 0; s < slim && isdigit_c(*s); s++)
398
69
        v = 10 * v + *s - '0';
399
400
93
    if (s < slim) {
401
93
        fmt->version.major = v;
402
93
        if (*s == '.') {
403
0
            s++;
404
0
            for (v = 0; s < slim && isdigit_c(*s); s++)
405
0
                v = 10 * v + *s - '0';
406
0
            if (s < slim)
407
0
                fmt->version.minor = v;
408
0
        }
409
93
        else
410
93
            fmt->version.minor = 0;
411
93
    }
412
93
}
413
414
static int
415
cmp_nonblank(const char *key, const unsigned char *u, const unsigned char *ulim)
416
1.67k
{
417
1.67k
    const unsigned char *ukey = (const unsigned char *) key;
418
419
1.67k
    while (*ukey)
420
1.67k
        if (u >= ulim) return +1;
421
1.67k
        else if (isspace_c(*u)) u++;
422
1.67k
        else if (*u != *ukey) return (*ukey < *u)? -1 : +1;
423
0
        else u++, ukey++;
424
425
0
    return 0;
426
1.67k
}
427
428
static int is_text_only(const unsigned char *u, const unsigned char *ulim)
429
1.66k
{
430
64.4k
    for (; u < ulim; u++)
431
62.8k
        if (! (*u >= ' ' || *u == '\t' || *u == '\r' || *u == '\n'))
432
0
            return 0;
433
434
1.66k
    return 1;
435
1.66k
}
436
437
static inline int
438
alternate_zeros(const unsigned char *u, const unsigned char *ulim)
439
0
{
440
0
    for (; u < ulim; u += 2)
441
0
        if (*u != '\0') return 0;
442
0
    return 1;
443
0
}
444
445
static int is_utf16_text(const unsigned char *u, const unsigned char *ulim)
446
0
{
447
0
    if (ulim - u >= 6 &&
448
0
        ((u[0] == 0xfe && u[1] == 0xff && alternate_zeros(u+2, ulim)) ||
449
0
         (u[0] == 0xff && u[1] == 0xfe && alternate_zeros(u+3, ulim))))
450
0
        return 2;
451
0
    else if (ulim - u >= 8 &&
452
0
             (alternate_zeros(u, ulim) || alternate_zeros(u+1, ulim)))
453
0
        return 1;
454
0
    else
455
0
        return 0;
456
0
}
457
458
static int is_fastaq(const unsigned char *u, const unsigned char *ulim)
459
1.66k
{
460
1.66k
    const unsigned char *eol = memchr(u, '\n', ulim - u);
461
462
    // Check that the first line is entirely textual
463
1.66k
    if (! is_text_only(u, eol? eol : ulim)) return 0;
464
465
    // If the first line is very long, consider the file to indeed be FASTA/Q
466
1.66k
    if (eol == NULL) return 1;
467
468
1.64k
    u = eol+1; // Now points to the first character of the second line
469
470
    // Scan over all base-encoding letters (including 'N' but not SEQ's '=')
471
1.72k
    while (u < ulim && (seq_nt16_table[*u] != 15 || toupper(*u) == 'N')) {
472
80
        if (*u == '=') return 0;
473
80
        u++;
474
80
    }
475
476
1.64k
    return (u == ulim || *u == '\r' || *u == '\n')? 1 : 0;
477
1.64k
}
478
479
// Parse tab-delimited text, filling in a string of column types and returning
480
// the number of columns spotted (within [u,ulim), and up to column_len) or -1
481
// if non-printable characters were seen.  Column types:
482
//     i: integer, s: strand sign, C: CIGAR, O: SAM optional field, Z: anything
483
static int
484
parse_tabbed_text(char *columns, int column_len,
485
                  const unsigned char *u, const unsigned char *ulim,
486
                  int *complete)
487
0
{
488
0
    const char *str  = (const char *) u;
489
0
    const char *slim = (const char *) ulim;
490
0
    const char *s;
491
0
    int ncolumns = 0;
492
493
0
    enum { digit = 1, leading_sign = 2, cigar_operator = 4, other = 8 };
494
0
    unsigned seen = 0;
495
0
    *complete = 0;
496
497
0
    for (s = str; s < slim; s++)
498
0
        if (*s >= ' ') {
499
0
            if (isdigit_c(*s))
500
0
                seen |= digit;
501
0
            else if ((*s == '+' || *s == '-') && s == str)
502
0
                seen |= leading_sign;
503
0
            else if (strchr(BAM_CIGAR_STR, *s) && s > str && isdigit_c(s[-1]))
504
0
                seen |= cigar_operator;
505
0
            else
506
0
                seen |= other;
507
0
        }
508
0
        else if (*s == '\t' || *s == '\r' || *s == '\n') {
509
0
            size_t len = s - str;
510
0
            char type;
511
512
0
            if (seen == digit || seen == (leading_sign|digit)) type = 'i';
513
0
            else if (seen == (digit|cigar_operator)) type = 'C';
514
0
            else if (len == 1)
515
0
                switch (str[0]) {
516
0
                case '*': type = 'C'; break;
517
0
                case '+': case '-': case '.': type = 's'; break;
518
0
                default: type = 'Z'; break;
519
0
                }
520
0
            else if (len >= 5 && str[2] == ':' && str[4] == ':') type = 'O';
521
0
            else type = 'Z';
522
523
0
            columns[ncolumns++] = type;
524
0
            if (*s != '\t' || ncolumns >= column_len - 1) {
525
0
                *complete = 1; // finished the line or more columns than needed
526
0
                break;
527
0
            }
528
529
0
            str = s + 1;
530
0
            seen = 0;
531
0
        }
532
0
        else return -1;
533
534
0
    columns[ncolumns] = '\0';
535
0
    return ncolumns;
536
0
}
537
538
// Match COLUMNS as a prefix against PATTERN (so COLUMNS may run out first).
539
// Returns len(COLUMNS) (modulo '+'), or 0 if there is a mismatched entry.
540
static int colmatch(const char *columns, const char *pattern)
541
0
{
542
0
    int i;
543
0
    for (i = 0; columns[i] != '\0'; i++) {
544
0
        if (pattern[i] == '+') return i;
545
0
        if (! (columns[i] == pattern[i] || pattern[i] == 'Z')) return 0;
546
0
    }
547
548
0
    return i;
549
0
}
550
551
int hts_detect_format(hFILE *hfile, htsFormat *fmt)
552
0
{
553
0
    return hts_detect_format2(hfile, NULL, fmt);
554
0
}
555
556
int hts_detect_format2(hFILE *hfile, const char *fname, htsFormat *fmt)
557
10.7k
{
558
10.7k
    char extension[HTS_MAX_EXT_LEN], columns[24];
559
10.7k
    unsigned char s[1024];
560
10.7k
    int complete = 0;
561
10.7k
    ssize_t len = hpeek(hfile, s, 18);
562
10.7k
    if (len < 0) return -1;
563
564
10.7k
    fmt->category = unknown_category;
565
10.7k
    fmt->format = unknown_format;
566
10.7k
    fmt->version.major = fmt->version.minor = -1;
567
10.7k
    fmt->compression = no_compression;
568
10.7k
    fmt->compression_level = -1;
569
10.7k
    fmt->specific = NULL;
570
571
10.7k
    if (len >= 2 && s[0] == 0x1f && s[1] == 0x8b) {
572
        // The stream is either gzip-compressed or BGZF-compressed.
573
        // Determine which, and decompress the first few records or lines.
574
1.72k
        fmt->compression = gzip;
575
1.72k
        if (len >= 18 && (s[3] & 4)) {
576
925
            if (memcmp(&s[12], "BC\2\0", 4) == 0)
577
12
                fmt->compression = bgzf;
578
913
            else if (memcmp(&s[12], "RAZF", 4) == 0)
579
1
                fmt->compression = razf_compression;
580
925
        }
581
1.72k
        if (len >= 9 && s[2] == 8)
582
1.72k
            fmt->compression_level = (s[8] == 2)? 9 : (s[8] == 4)? 1 : -1;
583
584
1.72k
        len = decompress_peek_gz(hfile, s, sizeof s);
585
1.72k
    }
586
9.01k
    else if (len >= 10 && memcmp(s, "BZh", 3) == 0 &&
587
0
             (memcmp(&s[4], "\x31\x41\x59\x26\x53\x59", 6) == 0 ||
588
0
              memcmp(&s[4], "\x17\x72\x45\x38\x50\x90", 6) == 0)) {
589
0
        fmt->compression = bzip2_compression;
590
0
        fmt->compression_level = s[3] - '0';
591
        // Decompressing via libbz2 produces no output until it has a whole
592
        // block (of size 100Kb x level), which is too large for peeking.
593
        // So unfortunately we can recognise bzip2 but not the contents,
594
        // except that \x1772... magic indicates the stream is empty.
595
0
        if (s[4] == '\x31') return 0;
596
0
        else len = 0;
597
0
    }
598
9.01k
    else if (len >= 6 && memcmp(s, "\xfd""7zXZ\0", 6) == 0) {
599
0
        fmt->compression = xz_compression;
600
0
#ifdef HAVE_LIBLZMA
601
0
        len = decompress_peek_xz(hfile, s, sizeof s);
602
#else
603
        // Without liblzma, we can't recognise the decompressed contents.
604
        return 0;
605
#endif
606
0
    }
607
9.01k
    else if (len >= 4 && memcmp(s, "\x28\xb5\x2f\xfd", 4) == 0) {
608
0
        fmt->compression = zstd_compression;
609
0
        return 0;
610
0
    }
611
9.01k
    else {
612
9.01k
        len = hpeek(hfile, s, sizeof s);
613
9.01k
    }
614
10.7k
    if (len < 0) return -1;
615
616
10.7k
    if (len == 0) {
617
0
        fmt->format = empty_format;
618
0
        return 0;
619
0
    }
620
621
    // We avoid using filename extensions wherever possible (as filenames are
622
    // not always available), but in a few cases they must be considered:
623
    //  - FASTA/Q indexes are simply tab-separated text; files that match these
624
    //    patterns but not the fai/fqi extension are usually generic BED files
625
    //  - GZI indexes have no magic numbers so can only be detected by filename
626
10.7k
    if (fname && strcmp(fname, "-") != 0) {
627
10.7k
        char *s;
628
10.7k
        if (find_file_extension(fname, extension) < 0) extension[0] = '\0';
629
10.7k
        for (s = extension; *s; s++) *s = tolower_c(*s);
630
10.7k
    }
631
0
    else extension[0] = '\0';
632
633
10.7k
    if (len >= 6 && memcmp(s,"CRAM",4) == 0 && s[4]>=1 && s[4]<=7 && s[5]<=7) {
634
1.19k
        fmt->category = sequence_data;
635
1.19k
        fmt->format = cram;
636
1.19k
        fmt->version.major = s[4], fmt->version.minor = s[5];
637
1.19k
        fmt->compression = custom;
638
1.19k
        return 0;
639
1.19k
    }
640
9.54k
    else if (len >= 4 && s[3] <= '\4') {
641
337
        if (memcmp(s, "BAM\1", 4) == 0) {
642
136
            fmt->category = sequence_data;
643
136
            fmt->format = bam;
644
            // TODO Decompress enough to pick version from @HD-VN header
645
136
            fmt->version.major = 1, fmt->version.minor = -1;
646
136
            return 0;
647
136
        }
648
201
        else if (memcmp(s, "BAI\1", 4) == 0) {
649
0
            fmt->category = index_file;
650
0
            fmt->format = bai;
651
0
            fmt->version.major = -1, fmt->version.minor = -1;
652
0
            return 0;
653
0
        }
654
201
        else if (memcmp(s, "BCF\4", 4) == 0) {
655
0
            fmt->category = variant_data;
656
0
            fmt->format = bcf;
657
0
            fmt->version.major = 1, fmt->version.minor = -1;
658
0
            return 0;
659
0
        }
660
201
        else if (memcmp(s, "BCF\2", 4) == 0) {
661
201
            fmt->category = variant_data;
662
201
            fmt->format = bcf;
663
201
            fmt->version.major = s[3];
664
201
            fmt->version.minor = (len >= 5 && s[4] <= 2)? s[4] : 0;
665
201
            return 0;
666
201
        }
667
0
        else if (memcmp(s, "CSI\1", 4) == 0) {
668
0
            fmt->category = index_file;
669
0
            fmt->format = csi;
670
0
            fmt->version.major = 1, fmt->version.minor = -1;
671
0
            return 0;
672
0
        }
673
0
        else if (memcmp(s, "TBI\1", 4) == 0) {
674
0
            fmt->category = index_file;
675
0
            fmt->format = tbi;
676
0
            return 0;
677
0
        }
678
        // GZI indexes have no magic numbers, so must be recognised solely by
679
        // filename extension.
680
0
        else if (strcmp(extension, "gzi") == 0) {
681
0
            fmt->category = index_file;
682
0
            fmt->format = gzi;
683
0
            return 0;
684
0
        }
685
337
    }
686
9.20k
    else if (len >= 16 && memcmp(s, "##fileformat=VCF", 16) == 0) {
687
2.77k
        fmt->category = variant_data;
688
2.77k
        fmt->format = vcf;
689
2.77k
        if (len >= 21 && s[16] == 'v')
690
45
            parse_version(fmt, &s[17], &s[len]);
691
2.77k
        return 0;
692
2.77k
    }
693
6.43k
    else if (len >= 4 && s[0] == '@' &&
694
4.80k
             (memcmp(s, "@HD\t", 4) == 0 || memcmp(s, "@SQ\t", 4) == 0 ||
695
808
              memcmp(s, "@RG\t", 4) == 0 || memcmp(s, "@PG\t", 4) == 0 ||
696
4.76k
              memcmp(s, "@CO\t", 4) == 0)) {
697
4.76k
        fmt->category = sequence_data;
698
4.76k
        fmt->format = sam;
699
        // @HD-VN is not guaranteed to be the first tag, but then @HD is
700
        // not guaranteed to be present at all...
701
4.76k
        if (len >= 9 && memcmp(s, "@HD\tVN:", 7) == 0)
702
48
            parse_version(fmt, &s[7], &s[len]);
703
4.71k
        else
704
4.71k
            fmt->version.major = 1, fmt->version.minor = -1;
705
4.76k
        return 0;
706
4.76k
    }
707
1.67k
    else if (len >= 8 && memcmp(s, "d4\xdd\xdd", 4) == 0) {
708
0
        fmt->category = region_list;
709
0
        fmt->format = d4_format;
710
        // How to decode the D4 Format Version bytes is not yet specified
711
        // so we don't try to set fmt->version.{major,minor}.
712
0
        return 0;
713
0
    }
714
1.67k
    else if (cmp_nonblank("{\"htsget\":", s, &s[len]) == 0) {
715
0
        fmt->category = unknown_category;
716
0
        fmt->format = htsget;
717
0
        return 0;
718
0
    }
719
1.67k
    else if (len > 8 && memcmp(s, "crypt4gh", 8) == 0) {
720
1
        fmt->category = unknown_category;
721
1
        fmt->format = hts_crypt4gh_format;
722
1
        return 0;
723
1
    }
724
1.66k
    else if (len >= 1 && s[0] == '>' && is_fastaq(s, &s[len])) {
725
1.62k
        fmt->category = sequence_data;
726
1.62k
        fmt->format = fasta_format;
727
1.62k
        return 0;
728
1.62k
    }
729
44
    else if (len >= 1 && s[0] == '@' && is_fastaq(s, &s[len])) {
730
44
        fmt->category = sequence_data;
731
44
        fmt->format = fastq_format;
732
44
        return 0;
733
44
    }
734
0
    else if (parse_tabbed_text(columns, sizeof columns, s,
735
0
                               &s[len], &complete) > 0) {
736
        // A complete SAM line is at least 11 columns.  On unmapped long reads may
737
        // be missing two.  (On mapped long reads we must have an @ header so long
738
        // CIGAR is irrelevant.)
739
0
        if (colmatch(columns, "ZiZiiCZiiZZOOOOOOOOOOOOOOOOOOOO+")
740
0
            >= 9 + 2*complete) {
741
0
            fmt->category = sequence_data;
742
0
            fmt->format = sam;
743
0
            fmt->version.major = 1, fmt->version.minor = -1;
744
0
            return 0;
745
0
        }
746
0
        else if (fmt->compression == gzip && colmatch(columns, "iiiiii") == 6) {
747
0
            fmt->category = index_file;
748
0
            fmt->format = crai;
749
0
            return 0;
750
0
        }
751
0
        else if (strstr(extension, "fqi") && colmatch(columns, "Ziiiii") == 6) {
752
0
            fmt->category = index_file;
753
0
            fmt->format = fqi_format;
754
0
            return 0;
755
0
        }
756
0
        else if (strstr(extension, "fai") && colmatch(columns, "Ziiii") == 5) {
757
0
            fmt->category = index_file;
758
0
            fmt->format = fai_format;
759
0
            return 0;
760
0
        }
761
0
        else if (colmatch(columns, "Zii+") >= 3) {
762
0
            fmt->category = region_list;
763
0
            fmt->format = bed;
764
0
            return 0;
765
0
        }
766
0
    }
767
768
    // Arbitrary text files can be read using hts_getline().
769
0
    if (is_text_only(s, &s[len])) fmt->format = text_format;
770
771
    // Nothing recognised: leave unset fmt-> fields as unknown.
772
0
    return 0;
773
10.7k
}
774
775
char *hts_format_description(const htsFormat *format)
776
0
{
777
0
    kstring_t str = { 0, 0, NULL };
778
779
0
    switch (format->format) {
780
0
    case sam:   kputs("SAM", &str); break;
781
0
    case bam:   kputs("BAM", &str); break;
782
0
    case cram:  kputs("CRAM", &str); break;
783
0
    case fasta_format:  kputs("FASTA", &str); break;
784
0
    case fastq_format:  kputs("FASTQ", &str); break;
785
0
    case vcf:   kputs("VCF", &str); break;
786
0
    case bcf:
787
0
        if (format->version.major == 1) kputs("Legacy BCF", &str);
788
0
        else kputs("BCF", &str);
789
0
        break;
790
0
    case bai:   kputs("BAI", &str); break;
791
0
    case crai:  kputs("CRAI", &str); break;
792
0
    case csi:   kputs("CSI", &str); break;
793
0
    case fai_format:    kputs("FASTA-IDX", &str); break;
794
0
    case fqi_format:    kputs("FASTQ-IDX", &str); break;
795
0
    case gzi:   kputs("GZI", &str); break;
796
0
    case tbi:   kputs("Tabix", &str); break;
797
0
    case bed:   kputs("BED", &str); break;
798
0
    case d4_format:     kputs("D4", &str); break;
799
0
    case htsget: kputs("htsget", &str); break;
800
0
    case hts_crypt4gh_format: kputs("crypt4gh", &str); break;
801
0
    case empty_format:  kputs("empty", &str); break;
802
0
    default:    kputs("unknown", &str); break;
803
0
    }
804
805
0
    if (format->version.major >= 0) {
806
0
        kputs(" version ", &str);
807
0
        kputw(format->version.major, &str);
808
0
        if (format->version.minor >= 0) {
809
0
            kputc('.', &str);
810
0
            kputw(format->version.minor, &str);
811
0
        }
812
0
    }
813
814
0
    switch (format->compression) {
815
0
    case bzip2_compression:  kputs(" bzip2-compressed", &str); break;
816
0
    case razf_compression:   kputs(" legacy-RAZF-compressed", &str); break;
817
0
    case xz_compression:     kputs(" XZ-compressed", &str); break;
818
0
    case zstd_compression:   kputs(" Zstandard-compressed", &str); break;
819
0
    case custom: kputs(" compressed", &str); break;
820
0
    case gzip:   kputs(" gzip-compressed", &str); break;
821
822
0
    case bgzf:
823
0
        switch (format->format) {
824
0
        case bam:
825
0
        case bcf:
826
0
        case csi:
827
0
        case tbi:
828
            // These are by definition BGZF, so just use the generic term
829
0
            kputs(" compressed", &str);
830
0
            break;
831
0
        default:
832
0
            kputs(" BGZF-compressed", &str);
833
0
            break;
834
0
        }
835
0
        break;
836
837
0
    case no_compression:
838
0
        switch (format->format) {
839
0
        case bam:
840
0
        case bcf:
841
0
        case cram:
842
0
        case csi:
843
0
        case tbi:
844
            // These are normally compressed, so emphasise that this one isn't
845
0
            kputs(" uncompressed", &str);
846
0
            break;
847
0
        default:
848
0
            break;
849
0
        }
850
0
        break;
851
852
0
    default: break;
853
0
    }
854
855
0
    switch (format->category) {
856
0
    case sequence_data: kputs(" sequence", &str); break;
857
0
    case variant_data:  kputs(" variant calling", &str); break;
858
0
    case index_file:    kputs(" index", &str); break;
859
0
    case region_list:   kputs(" genomic region", &str); break;
860
0
    default: break;
861
0
    }
862
863
0
    if (format->compression == no_compression)
864
0
        switch (format->format) {
865
0
        case text_format:
866
0
        case sam:
867
0
        case crai:
868
0
        case vcf:
869
0
        case bed:
870
0
        case fai_format:
871
0
        case fqi_format:
872
0
        case fasta_format:
873
0
        case fastq_format:
874
0
        case htsget:
875
0
            kputs(" text", &str);
876
0
            break;
877
878
0
        case empty_format:
879
0
            break;
880
881
0
        default:
882
0
            kputs(" data", &str);
883
0
            break;
884
0
        }
885
0
    else
886
0
        kputs(" data", &str);
887
888
0
    return ks_release(&str);
889
0
}
890
891
htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt)
892
7.77k
{
893
7.77k
    char smode[101], *cp, *cp2, *mode_c, *uncomp = NULL;
894
7.77k
    htsFile *fp = NULL;
895
7.77k
    hFILE *hfile = NULL;
896
7.77k
    char fmt_code = '\0';
897
    // see enum htsExactFormat in htslib/hts.h
898
7.77k
    const char format_to_mode[] = "\0g\0\0b\0c\0\0b\0g\0\0\0\0\0Ff\0\0";
899
900
7.77k
    strncpy(smode, mode, 99);
901
7.77k
    smode[99]=0;
902
7.77k
    if ((cp = strchr(smode, ',')))
903
0
        *cp = '\0';
904
905
    // Migrate format code (b or c) to the end of the smode buffer.
906
20.4k
    for (cp2 = cp = smode; *cp; cp++) {
907
12.6k
        if (*cp == 'b')
908
2.92k
            fmt_code = 'b';
909
9.70k
        else if (*cp == 'c')
910
1.93k
            fmt_code = 'c';
911
7.77k
        else {
912
7.77k
            *cp2++ = *cp;
913
            // Cache the uncompress flag 'u' pos if present
914
7.77k
            if (!uncomp && (*cp == 'u')) {
915
0
                uncomp = cp2 - 1;
916
0
            }
917
7.77k
        }
918
12.6k
    }
919
7.77k
    mode_c = cp2;
920
7.77k
    *cp2++ = fmt_code;
921
7.77k
    *cp2++ = 0;
922
923
    // Set or reset the format code if opts->format is used
924
7.77k
    if (fmt && fmt->format > unknown_format
925
0
        && fmt->format < sizeof(format_to_mode)) {
926
0
        *mode_c = format_to_mode[fmt->format];
927
0
    }
928
929
    // Uncompressed bam/bcf is not supported, change 'u' to '0' on write
930
7.77k
    if (uncomp && *mode_c == 'b' && (strchr(smode, 'w') || strchr(smode, 'a'))) {
931
0
        *uncomp = '0';
932
0
    }
933
934
    // If we really asked for a compressed text format then mode_c above will
935
    // point to nul.  We set to 'z' to enable bgzf.
936
7.77k
    if (strchr(mode, 'w') && fmt && fmt->compression == bgzf) {
937
0
        if (fmt->format == sam || fmt->format == vcf || fmt->format == text_format)
938
0
            *mode_c = 'z';
939
0
    }
940
941
7.77k
    char *rmme = NULL, *fnidx = strstr(fn, HTS_IDX_DELIM);
942
7.77k
    if ( fnidx ) {
943
0
        rmme = strdup(fn);
944
0
        if ( !rmme ) goto error;
945
0
        rmme[fnidx-fn] = 0;
946
0
        fn = rmme;
947
0
    }
948
949
7.77k
    hfile = hopen(fn, smode);
950
7.77k
    if (hfile == NULL) goto error;
951
952
7.77k
    fp = hts_hopen(hfile, fn, smode);
953
7.77k
    if (fp == NULL) goto error;
954
955
    // Compensate for the loss of exactness in htsExactFormat.
956
    // hts_hopen returns generics such as binary or text, but we
957
    // have been given something explicit here so use that instead.
958
7.77k
    if (fp->is_write && fmt &&
959
0
        (fmt->format == bam || fmt->format == sam ||
960
0
         fmt->format == vcf || fmt->format == bcf ||
961
0
         fmt->format == bed || fmt->format == fasta_format ||
962
0
         fmt->format == fastq_format))
963
0
        fp->format.format = fmt->format;
964
965
7.77k
    if (fmt && fmt->specific) {
966
0
        if (hts_opt_apply(fp, fmt->specific) != 0) {
967
0
            if (((hts_opt*)fmt->specific)->opt == CRAM_OPT_REFERENCE &&
968
0
                (errno == ENOENT || errno == EIO || errno == EBADF ||
969
0
                  errno == EACCES || errno == EISDIR)) {
970
                /* error during reference file operation
971
                 for these specific errors, set the error as EINVAL */
972
0
                errno = EINVAL;
973
0
            }
974
0
            goto error;
975
0
        }
976
0
    }
977
7.77k
    if ( rmme ) free(rmme);
978
7.77k
    return fp;
979
980
0
error:
981
0
    hts_log_error("Failed to open file \"%s\"%s%s", fn,
982
0
                  errno ? " : " : "", errno ? strerror(errno) : "");
983
0
    if ( rmme ) free(rmme);
984
985
0
    if (hfile)
986
0
        hclose_abruptly(hfile);
987
988
0
    return NULL;
989
7.77k
}
990
991
7.77k
htsFile *hts_open(const char *fn, const char *mode) {
992
7.77k
    return hts_open_format(fn, mode, NULL);
993
7.77k
}
994
995
/*
996
 * Splits str into a prefix, delimiter ('\0' or delim), and suffix, writing
997
 * the prefix in lowercase into buf and returning a pointer to the suffix.
998
 * On return, buf is always NUL-terminated; thus assumes that the "keyword"
999
 * prefix should be one of several known values of maximum length buflen-2.
1000
 * (If delim is not found, returns a pointer to the '\0'.)
1001
 */
1002
static const char *
1003
scan_keyword(const char *str, char delim, char *buf, size_t buflen)
1004
0
{
1005
0
    size_t i = 0;
1006
0
    while (*str && *str != delim) {
1007
0
        if (i < buflen-1) buf[i++] = tolower_c(*str);
1008
0
        str++;
1009
0
    }
1010
1011
0
    buf[i] = '\0';
1012
0
    return *str? str+1 : str;
1013
0
}
1014
1015
/*
1016
 * Parses arg and appends it to the option list.
1017
 *
1018
 * Returns 0 on success;
1019
 *        -1 on failure.
1020
 */
1021
0
int hts_opt_add(hts_opt **opts, const char *c_arg) {
1022
0
    hts_opt *o, *t;
1023
0
    char *val;
1024
1025
    /*
1026
     * IMPORTANT!!!
1027
     * If you add another string option here, don't forget to also add
1028
     * it to the case statement in hts_opt_apply.
1029
     */
1030
1031
0
    if (!c_arg)
1032
0
        return -1;
1033
1034
0
    if (!(o =  malloc(sizeof(*o))))
1035
0
        return -1;
1036
1037
0
    if (!(o->arg = strdup(c_arg))) {
1038
0
        free(o);
1039
0
        return -1;
1040
0
    }
1041
1042
0
    if (!(val = strchr(o->arg, '=')))
1043
0
        val = "1"; // assume boolean
1044
0
    else
1045
0
        *val++ = '\0';
1046
1047
0
    if (strcmp(o->arg, "decode_md") == 0 ||
1048
0
        strcmp(o->arg, "DECODE_MD") == 0)
1049
0
        o->opt = CRAM_OPT_DECODE_MD, o->val.i = atoi(val);
1050
1051
0
    else if (strcmp(o->arg, "verbosity") == 0 ||
1052
0
             strcmp(o->arg, "VERBOSITY") == 0)
1053
0
        o->opt = CRAM_OPT_VERBOSITY, o->val.i = atoi(val);
1054
1055
0
    else if (strcmp(o->arg, "seqs_per_slice") == 0 ||
1056
0
             strcmp(o->arg, "SEQS_PER_SLICE") == 0)
1057
0
        o->opt = CRAM_OPT_SEQS_PER_SLICE, o->val.i = atoi(val);
1058
1059
0
    else if (strcmp(o->arg, "bases_per_slice") == 0 ||
1060
0
             strcmp(o->arg, "BASES_PER_SLICE") == 0)
1061
0
        o->opt = CRAM_OPT_BASES_PER_SLICE, o->val.i = atoi(val);
1062
1063
0
    else if (strcmp(o->arg, "slices_per_container") == 0 ||
1064
0
             strcmp(o->arg, "SLICES_PER_CONTAINER") == 0)
1065
0
        o->opt = CRAM_OPT_SLICES_PER_CONTAINER, o->val.i = atoi(val);
1066
1067
0
    else if (strcmp(o->arg, "embed_ref") == 0 ||
1068
0
             strcmp(o->arg, "EMBED_REF") == 0)
1069
0
        o->opt = CRAM_OPT_EMBED_REF, o->val.i = atoi(val);
1070
1071
0
    else if (strcmp(o->arg, "no_ref") == 0 ||
1072
0
             strcmp(o->arg, "NO_REF") == 0)
1073
0
        o->opt = CRAM_OPT_NO_REF, o->val.i = atoi(val);
1074
1075
0
    else if (strcmp(o->arg, "pos_delta") == 0 ||
1076
0
             strcmp(o->arg, "POS_DELTA") == 0)
1077
0
        o->opt = CRAM_OPT_POS_DELTA, o->val.i = atoi(val);
1078
1079
0
    else if (strcmp(o->arg, "ignore_md5") == 0 ||
1080
0
             strcmp(o->arg, "IGNORE_MD5") == 0)
1081
0
        o->opt = CRAM_OPT_IGNORE_MD5, o->val.i = atoi(val);
1082
1083
0
    else if (strcmp(o->arg, "use_bzip2") == 0 ||
1084
0
             strcmp(o->arg, "USE_BZIP2") == 0)
1085
0
        o->opt = CRAM_OPT_USE_BZIP2, o->val.i = atoi(val);
1086
1087
0
    else if (strcmp(o->arg, "use_rans") == 0 ||
1088
0
             strcmp(o->arg, "USE_RANS") == 0)
1089
0
        o->opt = CRAM_OPT_USE_RANS, o->val.i = atoi(val);
1090
1091
0
    else if (strcmp(o->arg, "use_lzma") == 0 ||
1092
0
             strcmp(o->arg, "USE_LZMA") == 0)
1093
0
        o->opt = CRAM_OPT_USE_LZMA, o->val.i = atoi(val);
1094
1095
0
    else if (strcmp(o->arg, "use_tok") == 0 ||
1096
0
             strcmp(o->arg, "USE_TOK") == 0)
1097
0
        o->opt = CRAM_OPT_USE_TOK, o->val.i = atoi(val);
1098
1099
0
    else if (strcmp(o->arg, "use_fqz") == 0 ||
1100
0
             strcmp(o->arg, "USE_FQZ") == 0)
1101
0
        o->opt = CRAM_OPT_USE_FQZ, o->val.i = atoi(val);
1102
1103
0
    else if (strcmp(o->arg, "use_arith") == 0 ||
1104
0
             strcmp(o->arg, "USE_ARITH") == 0)
1105
0
        o->opt = CRAM_OPT_USE_ARITH, o->val.i = atoi(val);
1106
1107
0
    else if (strcmp(o->arg, "fast") == 0 ||
1108
0
             strcmp(o->arg, "FAST") == 0)
1109
0
        o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_FAST;
1110
1111
0
    else if (strcmp(o->arg, "normal") == 0 ||
1112
0
             strcmp(o->arg, "NORMAL") == 0)
1113
0
        o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_NORMAL;
1114
1115
0
    else if (strcmp(o->arg, "small") == 0 ||
1116
0
             strcmp(o->arg, "SMALL") == 0)
1117
0
        o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_SMALL;
1118
1119
0
    else if (strcmp(o->arg, "archive") == 0 ||
1120
0
             strcmp(o->arg, "ARCHIVE") == 0)
1121
0
        o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_ARCHIVE;
1122
1123
0
    else if (strcmp(o->arg, "reference") == 0 ||
1124
0
             strcmp(o->arg, "REFERENCE") == 0)
1125
0
        o->opt = CRAM_OPT_REFERENCE, o->val.s = val;
1126
1127
0
    else if (strcmp(o->arg, "version") == 0 ||
1128
0
             strcmp(o->arg, "VERSION") == 0)
1129
0
        o->opt = CRAM_OPT_VERSION, o->val.s =val;
1130
1131
0
    else if (strcmp(o->arg, "multi_seq_per_slice") == 0 ||
1132
0
             strcmp(o->arg, "MULTI_SEQ_PER_SLICE") == 0)
1133
0
        o->opt = CRAM_OPT_MULTI_SEQ_PER_SLICE, o->val.i = atoi(val);
1134
1135
0
    else if (strcmp(o->arg, "nthreads") == 0 ||
1136
0
             strcmp(o->arg, "NTHREADS") == 0)
1137
0
        o->opt = HTS_OPT_NTHREADS, o->val.i = atoi(val);
1138
1139
0
    else if (strcmp(o->arg, "cache_size") == 0 ||
1140
0
             strcmp(o->arg, "CACHE_SIZE") == 0) {
1141
0
        char *endp;
1142
0
        o->opt = HTS_OPT_CACHE_SIZE;
1143
0
        o->val.i = strtol(val, &endp, 0);
1144
        // NB: Doesn't support floats, eg 1.5g
1145
        // TODO: extend hts_parse_decimal? See also samtools sort.
1146
0
        switch (*endp) {
1147
0
        case 'g': case 'G': o->val.i *= 1024; // fall through
1148
0
        case 'm': case 'M': o->val.i *= 1024; // fall through
1149
0
        case 'k': case 'K': o->val.i *= 1024; break;
1150
0
        case '\0': break;
1151
0
        default:
1152
0
            hts_log_error("Unrecognised cache size suffix '%c'", *endp);
1153
0
            free(o->arg);
1154
0
            free(o);
1155
0
            return -1;
1156
0
        }
1157
0
    }
1158
1159
0
    else if (strcmp(o->arg, "required_fields") == 0 ||
1160
0
             strcmp(o->arg, "REQUIRED_FIELDS") == 0)
1161
0
        o->opt = CRAM_OPT_REQUIRED_FIELDS, o->val.i = strtol(val, NULL, 0);
1162
1163
0
    else if (strcmp(o->arg, "lossy_names") == 0 ||
1164
0
             strcmp(o->arg, "LOSSY_NAMES") == 0)
1165
0
        o->opt = CRAM_OPT_LOSSY_NAMES, o->val.i = strtol(val, NULL, 0);
1166
1167
0
    else if (strcmp(o->arg, "name_prefix") == 0 ||
1168
0
             strcmp(o->arg, "NAME_PREFIX") == 0)
1169
0
        o->opt = CRAM_OPT_PREFIX, o->val.s = val;
1170
1171
0
    else if (strcmp(o->arg, "store_md") == 0 ||
1172
0
             strcmp(o->arg, "store_md") == 0)
1173
0
        o->opt = CRAM_OPT_STORE_MD, o->val.i = atoi(val);
1174
1175
0
    else if (strcmp(o->arg, "store_nm") == 0 ||
1176
0
             strcmp(o->arg, "store_nm") == 0)
1177
0
        o->opt = CRAM_OPT_STORE_NM, o->val.i = atoi(val);
1178
1179
0
    else if (strcmp(o->arg, "block_size") == 0 ||
1180
0
             strcmp(o->arg, "BLOCK_SIZE") == 0)
1181
0
        o->opt = HTS_OPT_BLOCK_SIZE, o->val.i = strtol(val, NULL, 0);
1182
1183
0
    else if (strcmp(o->arg, "level") == 0 ||
1184
0
             strcmp(o->arg, "LEVEL") == 0)
1185
0
        o->opt = HTS_OPT_COMPRESSION_LEVEL, o->val.i = strtol(val, NULL, 0);
1186
1187
0
    else if (strcmp(o->arg, "filter") == 0 ||
1188
0
             strcmp(o->arg, "FILTER") == 0)
1189
0
        o->opt = HTS_OPT_FILTER, o->val.s = val;
1190
1191
0
    else if (strcmp(o->arg, "fastq_aux") == 0 ||
1192
0
        strcmp(o->arg, "FASTQ_AUX") == 0)
1193
0
        o->opt = FASTQ_OPT_AUX, o->val.s = val;
1194
1195
0
    else if (strcmp(o->arg, "fastq_barcode") == 0 ||
1196
0
        strcmp(o->arg, "FASTQ_BARCODE") == 0)
1197
0
        o->opt = FASTQ_OPT_BARCODE, o->val.s = val;
1198
1199
0
    else if (strcmp(o->arg, "fastq_rnum") == 0 ||
1200
0
        strcmp(o->arg, "FASTQ_RNUM") == 0)
1201
0
        o->opt = FASTQ_OPT_RNUM, o->val.i = 1;
1202
1203
0
    else if (strcmp(o->arg, "fastq_casava") == 0 ||
1204
0
        strcmp(o->arg, "FASTQ_CASAVA") == 0)
1205
0
        o->opt = FASTQ_OPT_CASAVA, o->val.i = 1;
1206
1207
0
    else if (strcmp(o->arg, "fastq_name2") == 0 ||
1208
0
        strcmp(o->arg, "FASTQ_NAME2") == 0)
1209
0
        o->opt = FASTQ_OPT_NAME2, o->val.i = 1;
1210
1211
0
    else if (strcmp(o->arg, "fastq_umi") == 0 ||
1212
0
        strcmp(o->arg, "FASTQ_UMI") == 0)
1213
0
        o->opt = FASTQ_OPT_UMI, o->val.s = val;
1214
1215
0
    else if (strcmp(o->arg, "fastq_umi_regex") == 0 ||
1216
0
        strcmp(o->arg, "FASTQ_UMI_REGEX") == 0)
1217
0
        o->opt = FASTQ_OPT_UMI_REGEX, o->val.s = val;
1218
1219
0
    else {
1220
0
        hts_log_error("Unknown option '%s'", o->arg);
1221
0
        free(o->arg);
1222
0
        free(o);
1223
0
        return -1;
1224
0
    }
1225
1226
0
    o->next = NULL;
1227
1228
    // Append; assumes small list.
1229
0
    if (*opts) {
1230
0
        t = *opts;
1231
0
        while (t->next)
1232
0
            t = t->next;
1233
0
        t->next = o;
1234
0
    } else {
1235
0
        *opts = o;
1236
0
    }
1237
1238
0
    return 0;
1239
0
}
1240
1241
/*
1242
 * Applies an hts_opt option list to a given htsFile.
1243
 *
1244
 * Returns 0 on success
1245
 *        -1 on failure
1246
 */
1247
0
int hts_opt_apply(htsFile *fp, hts_opt *opts) {
1248
0
    hts_opt *last = NULL;
1249
1250
0
    for (; opts;  opts = (last=opts)->next) {
1251
0
        switch (opts->opt) {
1252
0
            case CRAM_OPT_REFERENCE:
1253
0
                if (!(fp->fn_aux = strdup(opts->val.s)))
1254
0
                    return -1;
1255
                // fall through
1256
0
            case CRAM_OPT_VERSION:
1257
0
            case CRAM_OPT_PREFIX:
1258
0
            case HTS_OPT_FILTER:
1259
0
            case FASTQ_OPT_AUX:
1260
0
            case FASTQ_OPT_BARCODE:
1261
0
            case FASTQ_OPT_UMI:
1262
0
            case FASTQ_OPT_UMI_REGEX:
1263
0
                if (hts_set_opt(fp,  opts->opt,  opts->val.s) != 0)
1264
0
                    return -1;
1265
0
                break;
1266
0
            default:
1267
0
                if (hts_set_opt(fp,  opts->opt,  opts->val.i) != 0)
1268
0
                    return -1;
1269
0
                break;
1270
0
        }
1271
0
    }
1272
1273
0
    return 0;
1274
0
}
1275
1276
/*
1277
 * Frees an hts_opt list.
1278
 */
1279
0
void hts_opt_free(hts_opt *opts) {
1280
0
    hts_opt *last = NULL;
1281
0
    while (opts) {
1282
0
        opts = (last=opts)->next;
1283
0
        free(last->arg);
1284
0
        free(last);
1285
0
    }
1286
0
}
1287
1288
1289
/*
1290
 * Tokenise options as (key(=value)?,)*(key(=value)?)?
1291
 * NB: No provision for ',' appearing in the value!
1292
 * Add backslashing rules?
1293
 *
1294
 * This could be used as part of a general command line option parser or
1295
 * as a string concatenated onto the file open mode.
1296
 *
1297
 * Returns 0 on success
1298
 *        -1 on failure.
1299
 */
1300
0
int hts_parse_opt_list(htsFormat *fmt, const char *str) {
1301
0
    while (str && *str) {
1302
0
        const char *str_start;
1303
0
        int len;
1304
0
        char arg[8001];
1305
1306
0
        while (*str && *str == ',')
1307
0
            str++;
1308
1309
0
        for (str_start = str; *str && *str != ','; str++);
1310
0
        len = str - str_start;
1311
1312
        // Produce a nul terminated copy of the option
1313
0
        strncpy(arg, str_start, len < 8000 ? len : 8000);
1314
0
        arg[len < 8000 ? len : 8000] = '\0';
1315
1316
0
        if (hts_opt_add((hts_opt **)&fmt->specific, arg) != 0)
1317
0
            return -1;
1318
1319
0
        if (*str)
1320
0
            str++;
1321
0
    }
1322
1323
0
    return 0;
1324
0
}
1325
1326
/*
1327
 * Accepts a string file format (sam, bam, cram, vcf, bam) optionally
1328
 * followed by a comma separated list of key=value options and splits
1329
 * these up into the fields of htsFormat struct.
1330
 *
1331
 * format is assumed to be already initialised, either to blank
1332
 * "unknown" values or via previous hts_opt_add calls.
1333
 *
1334
 * Returns 0 on success
1335
 *        -1 on failure.
1336
 */
1337
0
int hts_parse_format(htsFormat *format, const char *str) {
1338
0
    char fmt[9];
1339
0
    const char *cp = scan_keyword(str, ',', fmt, sizeof fmt);
1340
1341
0
    format->version.minor = 0; // unknown
1342
0
    format->version.major = 0; // unknown
1343
1344
0
    if (strcmp(fmt, "sam") == 0) {
1345
0
        format->category          = sequence_data;
1346
0
        format->format            = sam;
1347
0
        format->compression       = no_compression;
1348
0
        format->compression_level = 0;
1349
0
    } else if (strcmp(fmt, "sam.gz") == 0) {
1350
0
        format->category          = sequence_data;
1351
0
        format->format            = sam;
1352
0
        format->compression       = bgzf;
1353
0
        format->compression_level = -1;
1354
0
    } else if (strcmp(fmt, "bam") == 0) {
1355
0
        format->category          = sequence_data;
1356
0
        format->format            = bam;
1357
0
        format->compression       = bgzf;
1358
0
        format->compression_level = -1;
1359
0
    } else if (strcmp(fmt, "cram") == 0) {
1360
0
        format->category          = sequence_data;
1361
0
        format->format            = cram;
1362
0
        format->compression       = custom;
1363
0
        format->compression_level = -1;
1364
0
    } else if (strcmp(fmt, "vcf") == 0) {
1365
0
        format->category          = variant_data;
1366
0
        format->format            = vcf;
1367
0
        format->compression       = no_compression;
1368
0
        format->compression_level = 0;
1369
0
    } else if (strcmp(fmt, "bcf") == 0) {
1370
0
        format->category          = variant_data;
1371
0
        format->format            = bcf;
1372
0
        format->compression       = bgzf;
1373
0
        format->compression_level = -1;
1374
0
    } else if (strcmp(fmt, "fastq") == 0 || strcmp(fmt, "fq") == 0) {
1375
0
        format->category          = sequence_data;
1376
0
        format->format            = fastq_format;
1377
0
        format->compression       = no_compression;
1378
0
        format->compression_level = 0;
1379
0
    } else if (strcmp(fmt, "fastq.gz") == 0 || strcmp(fmt, "fq.gz") == 0) {
1380
0
        format->category          = sequence_data;
1381
0
        format->format            = fastq_format;
1382
0
        format->compression       = bgzf;
1383
0
        format->compression_level = 0;
1384
0
    } else if (strcmp(fmt, "fasta") == 0 || strcmp(fmt, "fa") == 0) {
1385
0
        format->category          = sequence_data;
1386
0
        format->format            = fasta_format;
1387
0
        format->compression       = no_compression;
1388
0
        format->compression_level = 0;
1389
0
    } else if (strcmp(fmt, "fasta.gz") == 0 || strcmp(fmt, "fa.gz") == 0) {
1390
0
        format->category          = sequence_data;
1391
0
        format->format            = fasta_format;
1392
0
        format->compression       = bgzf;
1393
0
        format->compression_level = 0;
1394
0
    } else {
1395
0
        return -1;
1396
0
    }
1397
1398
0
    return hts_parse_opt_list(format, cp);
1399
0
}
1400
1401
1402
/*
1403
 * Tokenise options as (key(=value)?,)*(key(=value)?)?
1404
 * NB: No provision for ',' appearing in the value!
1405
 * Add backslashing rules?
1406
 *
1407
 * This could be used as part of a general command line option parser or
1408
 * as a string concatenated onto the file open mode.
1409
 *
1410
 * Returns 0 on success
1411
 *        -1 on failure.
1412
 */
1413
0
static int hts_process_opts(htsFile *fp, const char *opts) {
1414
0
    htsFormat fmt;
1415
1416
0
    fmt.specific = NULL;
1417
0
    if (hts_parse_opt_list(&fmt, opts) != 0)
1418
0
        return -1;
1419
1420
0
    if (hts_opt_apply(fp, fmt.specific) != 0) {
1421
0
        hts_opt_free(fmt.specific);
1422
0
        return -1;
1423
0
    }
1424
1425
0
    hts_opt_free(fmt.specific);
1426
1427
0
    return 0;
1428
0
}
1429
1430
static int hts_crypt4gh_redirect(const char *fn, const char *mode,
1431
1
                                 hFILE **hfile_ptr, htsFile *fp) {
1432
1
    hFILE *hfile1 = *hfile_ptr;
1433
1
    hFILE *hfile2 = NULL;
1434
1
    char fn_buf[512], *fn2 = fn_buf;
1435
1
    char mode2[102]; // Size set by sizeof(simple_mode) in hts_hopen()
1436
1
    const char *prefix = "crypt4gh:";
1437
1
    size_t fn2_len = strlen(prefix) + strlen(fn) + 1;
1438
1
    int ret = -1;
1439
1440
1
    if (fn2_len > sizeof(fn_buf)) {
1441
0
        if (fn2_len >= INT_MAX) // Silence gcc format-truncation warning
1442
0
            return -1;
1443
0
        fn2 = malloc(fn2_len);
1444
0
        if (!fn2) return -1;
1445
0
    }
1446
1447
    // Reopen fn using the crypt4gh plug-in (if available)
1448
1
    snprintf(fn2, fn2_len, "%s%s", prefix, fn);
1449
1
    snprintf(mode2, sizeof(mode2), "%s%s", mode, strchr(mode, ':') ? "" : ":");
1450
1
    hfile2 = hopen(fn2, mode2, "parent", hfile1, NULL);
1451
1
    if (hfile2) {
1452
        // Replace original hfile with the new one.  The original is now
1453
        // enclosed within hfile2
1454
0
        *hfile_ptr = hfile2;
1455
0
        ret = 0;
1456
0
    }
1457
1458
1
    if (fn2 != fn_buf)
1459
0
        free(fn2);
1460
1
    return ret;
1461
1
}
1462
1463
htsFile *hts_hopen(hFILE *hfile, const char *fn, const char *mode)
1464
18.5k
{
1465
18.5k
    hFILE *hfile_orig = hfile;
1466
18.5k
    hFILE *hfile_cleanup = hfile;
1467
18.5k
    htsFile *fp = (htsFile*)calloc(1, sizeof(htsFile));
1468
18.5k
    char simple_mode[101], *cp, *opts;
1469
18.5k
    simple_mode[100] = '\0';
1470
1471
18.5k
    if (fp == NULL) goto error;
1472
1473
18.5k
    fp->fn = strdup(fn);
1474
18.5k
    fp->is_be = ed_is_big();
1475
1476
    // Split mode into simple_mode,opts strings
1477
18.5k
    if ((cp = strchr(mode, ','))) {
1478
0
        strncpy(simple_mode, mode, cp-mode <= 100 ? cp-mode : 100);
1479
0
        simple_mode[cp-mode] = '\0';
1480
0
        opts = cp+1;
1481
18.5k
    } else {
1482
18.5k
        strncpy(simple_mode, mode, 100);
1483
18.5k
        opts = NULL;
1484
18.5k
    }
1485
1486
18.5k
    if (strchr(simple_mode, 'r')) {
1487
10.7k
        const int max_loops = 5; // Should be plenty
1488
10.7k
        int loops = 0;
1489
10.7k
        if (hts_detect_format2(hfile, fn, &fp->format) < 0) goto error;
1490
1491
        // Deal with formats that re-direct an underlying file via a plug-in.
1492
        // Loops as we may have crypt4gh served via htsget, or
1493
        // crypt4gh-in-crypt4gh.
1494
1495
10.7k
        while (fp->format.format == htsget ||
1496
10.7k
               fp->format.format == hts_crypt4gh_format) {
1497
            // Ensure we don't get stuck in an endless redirect loop
1498
1
            if (++loops > max_loops) {
1499
0
                errno = ELOOP;
1500
0
                goto error;
1501
0
            }
1502
1503
1
            if (fp->format.format == htsget) {
1504
0
                hFILE *hfile2 = hopen_htsget_redirect(hfile, simple_mode);
1505
0
                if (hfile2 == NULL) goto error;
1506
1507
0
                if (hfile != hfile_cleanup) {
1508
                    // Close the result of an earlier redirection
1509
0
                    hclose_abruptly(hfile);
1510
0
                }
1511
1512
0
                hfile = hfile2;
1513
0
            }
1514
1
            else if (fp->format.format == hts_crypt4gh_format) {
1515
1
                int should_preserve = (hfile == hfile_orig);
1516
1
                int update_cleanup = (hfile == hfile_cleanup);
1517
1
                if (hts_crypt4gh_redirect(fn, simple_mode, &hfile, fp) < 0)
1518
1
                    goto error;
1519
0
                if (should_preserve) {
1520
                    // The original hFILE is now contained in a crypt4gh
1521
                    // wrapper.  Should we need to close the wrapper due
1522
                    // to a later error, we need to prevent the wrapped
1523
                    // handle from being closed as the caller will see
1524
                    // this function return NULL and try to clean up itself.
1525
0
                    hfile_orig->preserve = 1;
1526
0
                }
1527
0
                if (update_cleanup) {
1528
                    // Update handle to close at the end if redirected by htsget
1529
0
                    hfile_cleanup = hfile;
1530
0
                }
1531
0
            }
1532
1533
            // Re-detect format against the result of the redirection
1534
0
            if (hts_detect_format2(hfile, fn, &fp->format) < 0) goto error;
1535
0
        }
1536
10.7k
    }
1537
7.77k
    else if (strchr(simple_mode, 'w') || strchr(simple_mode, 'a')) {
1538
7.77k
        htsFormat *fmt = &fp->format;
1539
7.77k
        fp->is_write = 1;
1540
1541
7.77k
        if (strchr(simple_mode, 'b')) fmt->format = binary_format;
1542
4.85k
        else if (strchr(simple_mode, 'c')) fmt->format = cram;
1543
2.92k
        else if (strchr(simple_mode, 'f')) fmt->format = fastq_format;
1544
2.92k
        else if (strchr(simple_mode, 'F')) fmt->format = fasta_format;
1545
2.92k
        else fmt->format = text_format;
1546
1547
7.77k
        if (strchr(simple_mode, 'z')) fmt->compression = bgzf;
1548
7.77k
        else if (strchr(simple_mode, 'g')) fmt->compression = gzip;
1549
7.77k
        else if (strchr(simple_mode, 'u')) fmt->compression = no_compression;
1550
7.77k
        else {
1551
            // No compression mode specified, set to the default for the format
1552
7.77k
            switch (fmt->format) {
1553
2.92k
            case binary_format: fmt->compression = bgzf; break;
1554
1.93k
            case cram: fmt->compression = custom; break;
1555
0
            case fastq_format: fmt->compression = no_compression; break;
1556
0
            case fasta_format: fmt->compression = no_compression; break;
1557
2.92k
            case text_format: fmt->compression = no_compression; break;
1558
0
            default: abort();
1559
7.77k
            }
1560
7.77k
        }
1561
1562
        // Fill in category (if determinable; e.g. 'b' could be BAM or BCF)
1563
7.77k
        fmt->category = format_category(fmt->format);
1564
1565
7.77k
        fmt->version.major = fmt->version.minor = -1;
1566
7.77k
        fmt->compression_level = -1;
1567
7.77k
        fmt->specific = NULL;
1568
7.77k
    }
1569
0
    else { errno = EINVAL; goto error; }
1570
1571
18.5k
    switch (fp->format.format) {
1572
2.92k
    case binary_format:
1573
3.05k
    case bam:
1574
3.26k
    case bcf:
1575
3.26k
        fp->fp.bgzf = bgzf_hopen(hfile, simple_mode);
1576
3.26k
        if (fp->fp.bgzf == NULL) goto error;
1577
3.26k
        fp->is_bin = fp->is_bgzf = 1;
1578
3.26k
        break;
1579
1580
3.12k
    case cram:
1581
3.12k
        fp->fp.cram = cram_dopen(hfile, fn, simple_mode);
1582
3.12k
        if (fp->fp.cram == NULL) goto error;
1583
3.09k
        if (!fp->is_write)
1584
1.16k
            cram_set_option(fp->fp.cram, CRAM_OPT_DECODE_MD, -1); // auto
1585
3.09k
        fp->is_cram = 1;
1586
3.09k
        break;
1587
1588
0
    case empty_format:
1589
2.92k
    case text_format:
1590
2.92k
    case bed:
1591
4.54k
    case fasta_format:
1592
4.59k
    case fastq_format:
1593
9.35k
    case sam:
1594
12.1k
    case vcf:
1595
12.1k
        if (fp->format.compression != no_compression) {
1596
1.68k
            fp->fp.bgzf = bgzf_hopen(hfile, simple_mode);
1597
1.68k
            if (fp->fp.bgzf == NULL) goto error;
1598
1.68k
            fp->is_bgzf = 1;
1599
1.68k
        }
1600
10.4k
        else
1601
10.4k
            fp->fp.hfile = hfile;
1602
12.1k
        break;
1603
1604
12.1k
    default:
1605
0
        errno = EFTYPE;
1606
0
        goto error;
1607
18.5k
    }
1608
1609
18.4k
    if (opts)
1610
0
        hts_process_opts(fp, opts);
1611
1612
    // Allow original file to close if it was preserved earlier by crypt4gh
1613
18.4k
    hfile_orig->preserve = 0;
1614
1615
    // If redirecting via htsget, close the original hFILE now (pedantically
1616
    // we would instead close it in hts_close(), but this a simplifying
1617
    // optimisation)
1618
18.4k
    if (hfile != hfile_cleanup) hclose_abruptly(hfile_cleanup);
1619
1620
18.4k
    return fp;
1621
1622
33
error:
1623
33
    hts_log_error("Failed to open file %s", fn);
1624
1625
    // If redirecting, close the failed redirection hFILE that we have opened
1626
33
    if (hfile != hfile_orig) hclose_abruptly(hfile);
1627
33
    hfile_orig->preserve = 0; // Allow caller to close the original hfile
1628
1629
33
    if (fp) {
1630
33
        free(fp->fn);
1631
33
        free(fp->fn_aux);
1632
33
        free(fp);
1633
33
    }
1634
33
    return NULL;
1635
18.5k
}
1636
1637
static int hts_idx_close_otf_fp(hts_idx_t *idx);
1638
1639
int hts_close(htsFile *fp)
1640
18.4k
{
1641
18.4k
    int ret = 0, save;
1642
18.4k
    if (!fp) {
1643
0
        errno = EINVAL;
1644
0
        return -1;
1645
0
    }
1646
1647
18.4k
    switch (fp->format.format) {
1648
244
    case binary_format:
1649
2.21k
    case bam:
1650
3.26k
    case bcf:
1651
3.26k
        ret = bgzf_close(fp->fp.bgzf);
1652
3.26k
        break;
1653
1654
3.09k
    case cram:
1655
3.09k
        if (!fp->is_write) {
1656
1.16k
            switch (cram_eof(fp->fp.cram)) {
1657
3
            case 2:
1658
3
                hts_log_warning("EOF marker is absent. The input is probably truncated");
1659
3
                break;
1660
799
            case 0:  /* not at EOF, but may not have wanted all seqs */
1661
1.15k
            default: /* case 1, expected EOF */
1662
1.15k
                break;
1663
1.16k
            }
1664
1.16k
        }
1665
3.09k
        ret = cram_close(fp->fp.cram);
1666
3.09k
        break;
1667
1668
0
    case empty_format:
1669
244
    case text_format:
1670
244
    case bed:
1671
1.86k
    case fasta_format:
1672
1.91k
    case fastq_format:
1673
8.50k
    case sam:
1674
12.1k
    case vcf:
1675
12.1k
        if (fp->format.format == sam)
1676
6.59k
            ret = sam_state_destroy(fp);
1677
5.52k
        else if (fp->format.format == fastq_format ||
1678
5.48k
                 fp->format.format == fasta_format)
1679
1.66k
            fastq_state_destroy(fp);
1680
1681
12.1k
        if (fp->format.compression != no_compression)
1682
1.68k
            ret |= bgzf_close(fp->fp.bgzf);
1683
10.4k
        else
1684
10.4k
            ret |= hclose(fp->fp.hfile);
1685
12.1k
        break;
1686
1687
0
    default:
1688
0
        ret = -1;
1689
0
        break;
1690
18.4k
    }
1691
1692
18.4k
    if (fp->idx) {
1693
        // Close deferred index file handle, if present.
1694
        // Unfortunately this means errors on the index will get mixed with
1695
        // those on the main file, but as we only have the EOF block left to
1696
        // write it hopefully won't happen that often.
1697
0
        ret |= hts_idx_close_otf_fp(fp->idx);
1698
0
    }
1699
1700
18.4k
    save = errno;
1701
18.4k
    sam_hdr_destroy(fp->bam_header);
1702
18.4k
    hts_idx_destroy(fp->idx);
1703
18.4k
    hts_filter_free(fp->filter);
1704
18.4k
    free(fp->fn);
1705
18.4k
    free(fp->fn_aux);
1706
18.4k
    free(fp->line.s);
1707
18.4k
    free(fp);
1708
18.4k
    errno = save;
1709
18.4k
    return ret;
1710
18.4k
}
1711
1712
int hts_flush(htsFile *fp)
1713
0
{
1714
0
    if (fp == NULL) return 0;
1715
1716
0
    switch (fp->format.format) {
1717
0
    case binary_format:
1718
0
    case bam:
1719
0
    case bcf:
1720
0
        return bgzf_flush(fp->fp.bgzf);
1721
1722
0
    case cram:
1723
0
        return cram_flush(fp->fp.cram);
1724
1725
0
    case empty_format:
1726
0
    case text_format:
1727
0
    case bed:
1728
0
    case fasta_format:
1729
0
    case fastq_format:
1730
0
    case sam:
1731
0
    case vcf:
1732
0
        if (fp->format.compression != no_compression)
1733
0
            return bgzf_flush(fp->fp.bgzf);
1734
0
        else
1735
0
            return hflush(fp->fp.hfile);
1736
1737
0
    default:
1738
0
        break;
1739
0
    }
1740
1741
0
    return 0;
1742
0
}
1743
1744
const htsFormat *hts_get_format(htsFile *fp)
1745
0
{
1746
0
    return fp? &fp->format : NULL;
1747
0
}
1748
1749
0
const char *hts_format_file_extension(const htsFormat *format) {
1750
0
    if (!format)
1751
0
        return "?";
1752
1753
0
    switch (format->format) {
1754
0
    case sam:  return "sam";
1755
0
    case bam:  return "bam";
1756
0
    case bai:  return "bai";
1757
0
    case cram: return "cram";
1758
0
    case crai: return "crai";
1759
0
    case vcf:  return "vcf";
1760
0
    case bcf:  return "bcf";
1761
0
    case csi:  return "csi";
1762
0
    case fai_format:   return "fai";
1763
0
    case fqi_format:   return "fqi";
1764
0
    case gzi:  return "gzi";
1765
0
    case tbi:  return "tbi";
1766
0
    case bed:  return "bed";
1767
0
    case d4_format:    return "d4";
1768
0
    case fasta_format: return "fa";
1769
0
    case fastq_format: return "fq";
1770
0
    default:   return "?";
1771
0
    }
1772
0
}
1773
1774
0
static hFILE *hts_hfile(htsFile *fp) {
1775
0
    switch (fp->format.format) {
1776
0
    case binary_format:// fall through
1777
0
    case bcf:          // fall through
1778
0
    case bam:          return bgzf_hfile(fp->fp.bgzf);
1779
0
    case cram:         return cram_hfile(fp->fp.cram);
1780
0
    case text_format:  // fall through
1781
0
    case vcf:          // fall through
1782
0
    case fastq_format: // fall through
1783
0
    case fasta_format: // fall through
1784
0
    case sam:          return fp->format.compression != no_compression
1785
0
                              ? bgzf_hfile(fp->fp.bgzf)
1786
0
                              : fp->fp.hfile;
1787
0
    default:           return NULL;
1788
0
    }
1789
0
}
1790
1791
0
int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) {
1792
0
    int r;
1793
0
    va_list args;
1794
1795
0
    switch (opt) {
1796
0
    case HTS_OPT_NTHREADS: {
1797
0
        va_start(args, opt);
1798
0
        int nthreads = va_arg(args, int);
1799
0
        va_end(args);
1800
0
        return hts_set_threads(fp, nthreads);
1801
0
    }
1802
1803
0
    case HTS_OPT_BLOCK_SIZE: {
1804
0
        hFILE *hf = hts_hfile(fp);
1805
1806
0
        if (hf) {
1807
0
            va_start(args, opt);
1808
0
            if (hfile_set_blksize(hf, va_arg(args, int)) != 0)
1809
0
                hts_log_warning("Failed to change block size");
1810
0
            va_end(args);
1811
0
        }
1812
0
        else {
1813
            // To do - implement for vcf/bcf.
1814
0
            hts_log_warning("Cannot change block size for this format");
1815
0
        }
1816
1817
0
        return 0;
1818
0
    }
1819
1820
0
    case HTS_OPT_THREAD_POOL: {
1821
0
        va_start(args, opt);
1822
0
        htsThreadPool *p = va_arg(args, htsThreadPool *);
1823
0
        va_end(args);
1824
0
        return hts_set_thread_pool(fp, p);
1825
0
    }
1826
1827
0
    case HTS_OPT_CACHE_SIZE: {
1828
0
        va_start(args, opt);
1829
0
        int cache_size = va_arg(args, int);
1830
0
        va_end(args);
1831
0
        hts_set_cache_size(fp, cache_size);
1832
0
        return 0;
1833
0
    }
1834
1835
0
    case FASTQ_OPT_CASAVA:
1836
0
    case FASTQ_OPT_RNUM:
1837
0
    case FASTQ_OPT_NAME2:
1838
0
        if (fp->format.format == fastq_format ||
1839
0
            fp->format.format == fasta_format)
1840
0
            return fastq_state_set(fp, opt);
1841
0
        return 0;
1842
1843
0
    case FASTQ_OPT_AUX:
1844
0
        if (fp->format.format == fastq_format ||
1845
0
            fp->format.format == fasta_format) {
1846
0
            va_start(args, opt);
1847
0
            char *list = va_arg(args, char *);
1848
0
            va_end(args);
1849
0
            return fastq_state_set(fp, opt, list);
1850
0
        }
1851
0
        return 0;
1852
1853
0
    case FASTQ_OPT_BARCODE:
1854
0
    case FASTQ_OPT_UMI:
1855
0
    case FASTQ_OPT_UMI_REGEX:
1856
0
        if (fp->format.format == fastq_format ||
1857
0
            fp->format.format == fasta_format) {
1858
0
            va_start(args, opt);
1859
0
            char *bc = va_arg(args, char *);
1860
0
            va_end(args);
1861
0
            return fastq_state_set(fp, opt, bc);
1862
0
        }
1863
0
        return 0;
1864
1865
    // Options below here flow through to cram_set_voption
1866
0
    case HTS_OPT_COMPRESSION_LEVEL: {
1867
0
        va_start(args, opt);
1868
0
        int level = va_arg(args, int);
1869
0
        va_end(args);
1870
0
        if (fp->is_bgzf)
1871
0
            fp->fp.bgzf->compress_level = level;
1872
0
        else if (fp->format.format == cram)
1873
0
            return cram_set_option(fp->fp.cram, opt, level);
1874
0
        return 0;
1875
0
    }
1876
1877
0
    case HTS_OPT_FILTER: {
1878
0
        va_start(args, opt);
1879
0
        char *expr = va_arg(args, char *);
1880
0
        va_end(args);
1881
0
        return hts_set_filter_expression(fp, expr);
1882
0
    }
1883
1884
0
    case HTS_OPT_PROFILE: {
1885
0
        va_start(args, opt);
1886
0
        enum hts_profile_option prof = va_arg(args, int);
1887
0
        va_end(args);
1888
0
        if (fp->is_bgzf) {
1889
0
            switch (prof) {
1890
#ifdef HAVE_LIBDEFLATE
1891
            case HTS_PROFILE_FAST:    fp->fp.bgzf->compress_level =  2; break;
1892
            case HTS_PROFILE_NORMAL:  fp->fp.bgzf->compress_level = -1; break;
1893
            case HTS_PROFILE_SMALL:   fp->fp.bgzf->compress_level = 10; break;
1894
            case HTS_PROFILE_ARCHIVE: fp->fp.bgzf->compress_level = 12; break;
1895
#else
1896
0
            case HTS_PROFILE_FAST:    fp->fp.bgzf->compress_level =  1; break;
1897
0
            case HTS_PROFILE_NORMAL:  fp->fp.bgzf->compress_level = -1; break;
1898
0
            case HTS_PROFILE_SMALL:   fp->fp.bgzf->compress_level =  8; break;
1899
0
            case HTS_PROFILE_ARCHIVE: fp->fp.bgzf->compress_level =  9; break;
1900
0
#endif
1901
0
            }
1902
0
        } // else CRAM manages this in its own way
1903
0
        break;
1904
0
    }
1905
1906
0
    default:
1907
0
        break;
1908
0
    }
1909
1910
0
    if (fp->format.format != cram)
1911
0
        return 0;
1912
1913
0
    va_start(args, opt);
1914
0
    r = cram_set_voption(fp->fp.cram, opt, args);
1915
0
    va_end(args);
1916
1917
0
    return r;
1918
0
}
1919
1920
BGZF *hts_get_bgzfp(htsFile *fp);
1921
1922
int hts_set_threads(htsFile *fp, int n)
1923
0
{
1924
0
    if (fp->format.format == sam) {
1925
0
        return sam_set_threads(fp, n);
1926
0
    } else if (fp->format.compression == bgzf) {
1927
0
        return bgzf_mt(hts_get_bgzfp(fp), n, 256/*unused*/);
1928
0
    } else if (fp->format.format == cram) {
1929
0
        return hts_set_opt(fp, CRAM_OPT_NTHREADS, n);
1930
0
    }
1931
0
    else return 0;
1932
0
}
1933
1934
0
int hts_set_thread_pool(htsFile *fp, htsThreadPool *p) {
1935
0
    if (fp->format.format == sam || fp->format.format == text_format) {
1936
0
        return sam_set_thread_pool(fp, p);
1937
0
    } else if (fp->format.compression == bgzf) {
1938
0
        return bgzf_thread_pool(hts_get_bgzfp(fp), p->pool, p->qsize);
1939
0
    } else if (fp->format.format == cram) {
1940
0
        return hts_set_opt(fp, CRAM_OPT_THREAD_POOL, p);
1941
0
    }
1942
0
    else return 0;
1943
0
}
1944
1945
void hts_set_cache_size(htsFile *fp, int n)
1946
0
{
1947
0
    if (fp->format.compression == bgzf)
1948
0
        bgzf_set_cache_size(hts_get_bgzfp(fp), n);
1949
0
}
1950
1951
int hts_set_fai_filename(htsFile *fp, const char *fn_aux)
1952
0
{
1953
0
    free(fp->fn_aux);
1954
0
    if (fn_aux) {
1955
0
        fp->fn_aux = strdup(fn_aux);
1956
0
        if (fp->fn_aux == NULL) return -1;
1957
0
    }
1958
0
    else fp->fn_aux = NULL;
1959
1960
0
    if (fp->format.format == cram)
1961
0
        if (cram_set_option(fp->fp.cram, CRAM_OPT_REFERENCE, fp->fn_aux))
1962
0
            return -1;
1963
1964
0
    return 0;
1965
0
}
1966
1967
int hts_set_filter_expression(htsFile *fp, const char *expr)
1968
0
{
1969
0
    if (fp->filter)
1970
0
        hts_filter_free(fp->filter);
1971
1972
0
    if (!expr)
1973
0
        return 0;
1974
1975
0
    return (fp->filter = hts_filter_init(expr))
1976
0
        ? 0 : -1;
1977
0
}
1978
1979
hFILE *hts_open_tmpfile(const char *fname, const char *mode, kstring_t *tmpname)
1980
0
{
1981
0
    int pid = (int) getpid();
1982
0
    unsigned ptr = (uintptr_t) tmpname;
1983
0
    int n = 0;
1984
0
    hFILE *fp = NULL;
1985
1986
0
    do {
1987
        // Attempt to further uniquify the temporary filename
1988
0
        unsigned t = ((unsigned) time(NULL)) ^ ((unsigned) clock()) ^ ptr;
1989
0
        n++;
1990
1991
0
        ks_clear(tmpname);
1992
0
        if (ksprintf(tmpname, "%s.tmp_%d_%d_%u", fname, pid, n, t) < 0) break;
1993
1994
0
        fp = hopen(tmpname->s, mode);
1995
0
    } while (fp == NULL && errno == EEXIST && n < 100);
1996
1997
0
    return fp;
1998
0
}
1999
2000
int hts_is_utf16_text(const kstring_t *str)
2001
0
{
2002
0
    const unsigned char *u = (const unsigned char *) (str->s);
2003
0
    return (str->l > 0 && str->s)? is_utf16_text(u, u + str->l) : 0;
2004
0
}
2005
2006
// For VCF/BCF backward sweeper. Not exposing these functions because their
2007
// future is uncertain. Things will probably have to change with hFILE...
2008
BGZF *hts_get_bgzfp(htsFile *fp)
2009
0
{
2010
0
    if (fp->is_bgzf)
2011
0
        return fp->fp.bgzf;
2012
0
    else
2013
0
        return NULL;
2014
0
}
2015
int hts_useek(htsFile *fp, off_t uoffset, int where)
2016
0
{
2017
0
    if (fp->is_bgzf)
2018
0
        return bgzf_useek(fp->fp.bgzf, uoffset, where);
2019
0
    else
2020
0
        return (hseek(fp->fp.hfile, uoffset, SEEK_SET) >= 0)? 0 : -1;
2021
0
}
2022
off_t hts_utell(htsFile *fp)
2023
0
{
2024
0
    if (fp->is_bgzf)
2025
0
        return bgzf_utell(fp->fp.bgzf);
2026
0
    else
2027
0
        return htell(fp->fp.hfile);
2028
0
}
2029
2030
int hts_getline(htsFile *fp, int delimiter, kstring_t *str)
2031
70.1M
{
2032
70.1M
    int ret;
2033
70.1M
    if (! (delimiter == KS_SEP_LINE || delimiter == '\n')) {
2034
0
        hts_log_error("Unexpected delimiter %d", delimiter);
2035
0
        abort();
2036
0
    }
2037
2038
70.1M
    switch (fp->format.compression) {
2039
1.40M
    case no_compression:
2040
1.40M
        str->l = 0;
2041
1.40M
        ret = kgetline2(str, (kgets_func2 *) hgetln, fp->fp.hfile);
2042
1.40M
        if (ret >= 0) ret = (str->l <= INT_MAX)? (int) str->l : INT_MAX;
2043
2.63k
        else if (herrno(fp->fp.hfile)) ret = -2, errno = herrno(fp->fp.hfile);
2044
2.63k
        else ret = -1;
2045
1.40M
        break;
2046
2047
68.7M
    case gzip:
2048
68.7M
    case bgzf:
2049
68.7M
        ret = bgzf_getline(fp->fp.bgzf, '\n', str);
2050
68.7M
        break;
2051
2052
0
    default:
2053
0
        abort();
2054
70.1M
    }
2055
2056
70.1M
    ++fp->lineno;
2057
70.1M
    return ret;
2058
70.1M
}
2059
2060
char **hts_readlist(const char *string, int is_file, int *_n)
2061
0
{
2062
0
    unsigned int m = 0, n = 0;
2063
0
    char **s = 0, **s_new;
2064
0
    if ( is_file )
2065
0
    {
2066
0
        BGZF *fp = bgzf_open(string, "r");
2067
0
        if ( !fp ) return NULL;
2068
2069
0
        kstring_t str;
2070
0
        int ret;
2071
0
        str.s = 0; str.l = str.m = 0;
2072
0
        while ((ret = bgzf_getline(fp, '\n', &str)) >= 0)
2073
0
        {
2074
0
            if (str.l == 0) continue;
2075
0
            if (n == 0 && hts_is_utf16_text(&str))
2076
0
                hts_log_warning("'%s' appears to be encoded as UTF-16", string);
2077
0
            if (hts_resize(char*, n + 1, &m, &s, 0) < 0)
2078
0
                goto err;
2079
0
            s[n] = strdup(str.s);
2080
0
            if (!s[n])
2081
0
                goto err;
2082
0
            n++;
2083
0
        }
2084
0
        if (ret < -1) // Read error
2085
0
            goto err;
2086
0
        bgzf_close(fp);
2087
0
        free(str.s);
2088
0
    }
2089
0
    else
2090
0
    {
2091
0
        const char *q = string, *p = string;
2092
0
        while ( 1 )
2093
0
        {
2094
0
            if (*p == ',' || *p == 0)
2095
0
            {
2096
0
                if (hts_resize(char*, n + 1, &m, &s, 0) < 0)
2097
0
                    goto err;
2098
0
                s[n] = (char*)calloc(p - q + 1, 1);
2099
0
                if (!s[n])
2100
0
                    goto err;
2101
0
                strncpy(s[n++], q, p - q);
2102
0
                q = p + 1;
2103
0
            }
2104
0
            if ( !*p ) break;
2105
0
            p++;
2106
0
        }
2107
0
    }
2108
    // Try to shrink s to the minimum size needed
2109
0
    s_new = (char**)realloc(s, n * sizeof(char*));
2110
0
    if (!s_new)
2111
0
        goto err;
2112
2113
0
    s = s_new;
2114
0
    assert(n < INT_MAX); // hts_resize() should ensure this
2115
0
    *_n = n;
2116
0
    return s;
2117
2118
0
 err:
2119
0
    for (m = 0; m < n; m++)
2120
0
        free(s[m]);
2121
0
    free(s);
2122
0
    return NULL;
2123
0
}
2124
2125
char **hts_readlines(const char *fn, int *_n)
2126
0
{
2127
0
    unsigned int m = 0, n = 0;
2128
0
    char **s = 0, **s_new;
2129
0
    BGZF *fp = bgzf_open(fn, "r");
2130
0
    if ( fp ) { // read from file
2131
0
        kstring_t str;
2132
0
        int ret;
2133
0
        str.s = 0; str.l = str.m = 0;
2134
0
        while ((ret = bgzf_getline(fp, '\n', &str)) >= 0) {
2135
0
            if (str.l == 0) continue;
2136
0
            if (n == 0 && hts_is_utf16_text(&str))
2137
0
                hts_log_warning("'%s' appears to be encoded as UTF-16", fn);
2138
0
            if (hts_resize(char *, n + 1, &m, &s, 0) < 0)
2139
0
                goto err;
2140
0
            s[n] = strdup(str.s);
2141
0
            if (!s[n])
2142
0
                goto err;
2143
0
            n++;
2144
0
        }
2145
0
        if (ret < -1) // Read error
2146
0
            goto err;
2147
0
        bgzf_close(fp);
2148
0
        free(str.s);
2149
0
    } else if (*fn == ':') { // read from string
2150
0
        const char *q, *p;
2151
0
        for (q = p = fn + 1;; ++p)
2152
0
            if (*p == ',' || *p == 0) {
2153
0
                if (hts_resize(char *, n + 1, &m, &s, 0) < 0)
2154
0
                    goto err;
2155
0
                s[n] = (char*)calloc(p - q + 1, 1);
2156
0
                if (!s[n])
2157
0
                    goto err;
2158
0
                strncpy(s[n++], q, p - q);
2159
0
                q = p + 1;
2160
0
                if (*p == 0) break;
2161
0
            }
2162
0
    } else return 0;
2163
    // Try to shrink s to the minimum size needed
2164
0
    s_new = (char**)realloc(s, n * sizeof(char*));
2165
0
    if (!s_new)
2166
0
        goto err;
2167
2168
0
    s = s_new;
2169
0
    assert(n < INT_MAX); // hts_resize() should ensure this
2170
0
    *_n = n;
2171
0
    return s;
2172
2173
0
 err:
2174
0
    for (m = 0; m < n; m++)
2175
0
        free(s[m]);
2176
0
    free(s);
2177
0
    return NULL;
2178
0
}
2179
2180
// DEPRECATED: To be removed in a future HTSlib release
2181
int hts_file_type(const char *fname)
2182
0
{
2183
0
    int len = strlen(fname);
2184
0
    if ( !strcasecmp(".vcf.gz",fname+len-7) ) return FT_VCF_GZ;
2185
0
    if ( !strcasecmp(".vcf",fname+len-4) ) return FT_VCF;
2186
0
    if ( !strcasecmp(".bcf",fname+len-4) ) return FT_BCF_GZ;
2187
0
    if ( !strcmp("-",fname) ) return FT_STDIN;
2188
2189
0
    hFILE *f = hopen(fname, "r");
2190
0
    if (f == NULL) return 0;
2191
2192
0
    htsFormat fmt;
2193
0
    if (hts_detect_format2(f, fname, &fmt) < 0) { hclose_abruptly(f); return 0; }
2194
0
    if (hclose(f) < 0) return 0;
2195
2196
0
    switch (fmt.format) {
2197
0
    case vcf: return (fmt.compression == no_compression)? FT_VCF : FT_VCF_GZ;
2198
0
    case bcf: return (fmt.compression == no_compression)? FT_BCF : FT_BCF_GZ;
2199
0
    default:  return 0;
2200
0
    }
2201
0
}
2202
2203
int hts_check_EOF(htsFile *fp)
2204
0
{
2205
0
    if (fp->format.compression == bgzf)
2206
0
        return bgzf_check_EOF(hts_get_bgzfp(fp));
2207
0
    else if (fp->format.format == cram)
2208
0
        return cram_check_EOF(fp->fp.cram);
2209
0
    else
2210
0
        return 3;
2211
0
}
2212
2213
2214
/****************
2215
 *** Indexing ***
2216
 ****************/
2217
2218
0
#define HTS_MIN_MARKER_DIST 0x10000
2219
2220
// Finds the special meta bin
2221
//  ((1<<(3 * n_lvls + 3)) - 1) / 7 + 1
2222
0
#define META_BIN(idx) ((idx)->n_bins + 1)
2223
2224
0
#define pair64_lt(a,b) ((a).u < (b).u)
2225
0
#define pair64max_lt(a,b) ((a).u < (b).u || \
2226
0
                           ((a).u == (b).u && (a).max < (b).max))
2227
2228
0
KSORT_INIT_STATIC(_off, hts_pair64_t, pair64_lt)
2229
0
KSORT_INIT_STATIC(_off_max, hts_pair64_max_t, pair64max_lt)
2230
2231
typedef struct {
2232
    int32_t m, n;
2233
    uint64_t loff;
2234
    hts_pair64_t *list;
2235
} bins_t;
2236
2237
KHASH_MAP_INIT_INT(bin, bins_t)
2238
typedef khash_t(bin) bidx_t;
2239
2240
typedef struct {
2241
    hts_pos_t n, m;
2242
    uint64_t *offset;
2243
} lidx_t;
2244
2245
struct hts_idx_t {
2246
    int fmt, min_shift, n_lvls, n_bins;
2247
    uint32_t l_meta;
2248
    int32_t n, m;
2249
    uint64_t n_no_coor;
2250
    bidx_t **bidx;
2251
    lidx_t *lidx;
2252
    uint8_t *meta; // MUST have a terminating NUL on the end
2253
    int tbi_n, last_tbi_tid;
2254
    struct {
2255
        uint32_t last_bin, save_bin;
2256
        hts_pos_t last_coor;
2257
        int last_tid, save_tid, finished;
2258
        uint64_t last_off, save_off;
2259
        uint64_t off_beg, off_end;
2260
        uint64_t n_mapped, n_unmapped;
2261
    } z; // keep internal states
2262
    BGZF *otf_fp;  // Index on-the-fly output file
2263
};
2264
2265
0
static char * idx_format_name(int fmt) {
2266
0
    switch (fmt) {
2267
0
        case HTS_FMT_CSI: return "csi";
2268
0
        case HTS_FMT_BAI: return "bai";
2269
0
        case HTS_FMT_TBI: return "tbi";
2270
0
        case HTS_FMT_CRAI: return "crai";
2271
0
        default: return "unknown";
2272
0
    }
2273
0
}
2274
2275
#ifdef DEBUG_INDEX
2276
static void idx_dump(const hts_idx_t *idx) {
2277
    int i;
2278
    int64_t j;
2279
2280
    if (!idx) fprintf(stderr, "Null index\n");
2281
2282
    fprintf(stderr, "format='%s', min_shift=%d, n_lvls=%d, n_bins=%d, l_meta=%u ",
2283
            idx_format_name(idx->fmt), idx->min_shift, idx->n_lvls, idx->n_bins, idx->l_meta);
2284
    fprintf(stderr, "n=%d, m=%d, n_no_coor=%"PRIu64"\n", idx->n, idx->m, idx->n_no_coor);
2285
    for (i = 0; i < idx->n; i++) {
2286
        bidx_t *bidx = idx->bidx[i];
2287
        lidx_t *lidx = &idx->lidx[i];
2288
        if (bidx) {
2289
            fprintf(stderr, "======== BIN Index - tid=%d, n_buckets=%d, size=%d\n", i, bidx->n_buckets, bidx->size);
2290
            int b;
2291
            for (b = 0; b < META_BIN(idx); b++) {
2292
                khint_t k;
2293
                if ((k = kh_get(bin, bidx, b)) != kh_end(bidx)) {
2294
                    bins_t *entries = &kh_value(bidx, k);
2295
                    int l = hts_bin_level(b);
2296
                    int64_t bin_width = 1LL << ((idx->n_lvls - l) * 3 + idx->min_shift);
2297
                    fprintf(stderr, "\tbin=%d, level=%d, parent=%d, n_chunks=%d, loff=%"PRIu64", interval=[%"PRId64" - %"PRId64"]\n",
2298
                        b, l, hts_bin_parent(b), entries->n, entries->loff, (b-hts_bin_first(l))*bin_width+1, (b+1-hts_bin_first(l))*bin_width);
2299
                    for (j = 0; j < entries->n; j++)
2300
                        fprintf(stderr, "\t\tchunk=%"PRId64", u=%"PRIu64", v=%"PRIu64"\n", j, entries->list[j].u, entries->list[j].v);
2301
                }
2302
            }
2303
        }
2304
        if (lidx) {
2305
            fprintf(stderr, "======== LINEAR Index - tid=%d, n_values=%"PRId64"\n", i, lidx->n);
2306
            for (j = 0; j < lidx->n; j++) {
2307
                fprintf(stderr, "\t\tentry=%"PRId64", offset=%"PRIu64", interval=[%"PRId64" - %"PRId64"]\n",
2308
                    j, lidx->offset[j], j*(1<<idx->min_shift)+1, (j+1)*(1<<idx->min_shift));
2309
            }
2310
        }
2311
    }
2312
}
2313
#endif
2314
2315
static inline int insert_to_b(bidx_t *b, int bin, uint64_t beg, uint64_t end)
2316
0
{
2317
0
    khint_t k;
2318
0
    bins_t *l;
2319
0
    int absent;
2320
0
    k = kh_put(bin, b, bin, &absent);
2321
0
    if (absent < 0) return -1; // Out of memory
2322
0
    l = &kh_value(b, k);
2323
0
    if (absent) {
2324
0
        l->m = 1; l->n = 0;
2325
0
        l->list = (hts_pair64_t*)calloc(l->m, sizeof(hts_pair64_t));
2326
0
        if (!l->list) {
2327
0
            kh_del(bin, b, k);
2328
0
            return -1;
2329
0
        }
2330
0
    } else if (l->n == l->m) {
2331
0
        uint32_t new_m = l->m ? l->m << 1 : 1;
2332
0
        hts_pair64_t *new_list = realloc(l->list, new_m * sizeof(hts_pair64_t));
2333
0
        if (!new_list) return -1;
2334
0
        l->list = new_list;
2335
0
        l->m = new_m;
2336
0
    }
2337
0
    l->list[l->n].u = beg;
2338
0
    l->list[l->n++].v = end;
2339
0
    return 0;
2340
0
}
2341
2342
static inline int insert_to_l(lidx_t *l, int64_t _beg, int64_t _end, uint64_t offset, int min_shift)
2343
0
{
2344
0
    int i;
2345
0
    hts_pos_t beg, end;
2346
0
    beg = _beg >> min_shift;
2347
0
    end = (_end - 1) >> min_shift;
2348
0
    if (l->m < end + 1) {
2349
0
        size_t new_m = l->m * 2 > end + 1 ? l->m * 2 : end + 1;
2350
0
        uint64_t *new_offset;
2351
2352
0
        new_offset = (uint64_t*)realloc(l->offset, new_m * sizeof(uint64_t));
2353
0
        if (!new_offset) return -1;
2354
2355
        // fill unused memory with (uint64_t)-1
2356
0
        memset(new_offset + l->m, 0xff, sizeof(uint64_t) * (new_m - l->m));
2357
0
        l->m = new_m;
2358
0
        l->offset = new_offset;
2359
0
    }
2360
0
    for (i = beg; i <= end; ++i) {
2361
0
        if (l->offset[i] == (uint64_t)-1) l->offset[i] = offset;
2362
0
    }
2363
0
    if (l->n < end + 1) l->n = end + 1;
2364
0
    return 0;
2365
0
}
2366
2367
void hts_adjust_csi_settings(int64_t max_len_in, int *min_shift_, int *n_lvls_)
2368
0
{
2369
0
    const int max_n_lvls = 9; // To prevent bin number overflow
2370
0
    int min_shift = *min_shift_;
2371
0
    int n_lvls = *n_lvls_;
2372
0
    int64_t max_len = max_len_in + 256, maxpos;
2373
2374
    // Check if we need to adjust n_lvls or min_shift to get the range needed
2375
0
    if (max_len <= hts_bin_maxpos(min_shift, max_n_lvls)) {
2376
        // Can get required range by adjusting n_lvls
2377
0
        maxpos = hts_bin_maxpos(min_shift, n_lvls);
2378
0
        while (max_len > maxpos) {
2379
0
            ++n_lvls;
2380
0
            maxpos *= 8;
2381
0
        }
2382
0
        *n_lvls_ = n_lvls;
2383
0
    } else {
2384
        // No room to change n_lvls - adjust min_shift instead
2385
        // This was likely user-supplied so warn about the change too.
2386
0
        n_lvls = max_n_lvls;
2387
0
        maxpos = hts_bin_maxpos(min_shift, n_lvls);
2388
0
        while (max_len > maxpos) {
2389
0
            ++min_shift;
2390
0
            maxpos *= 2;
2391
0
        }
2392
0
        hts_log_warning("Adjusted min_shift from %d to %d"
2393
0
                        " due to longest reference of %"PRId64" bases.",
2394
0
                        *min_shift_, min_shift, max_len_in);
2395
0
        *n_lvls_ = n_lvls;
2396
0
        *min_shift_ = min_shift;
2397
0
    }
2398
0
}
2399
2400
hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls)
2401
0
{
2402
0
    hts_idx_t *idx;
2403
0
    idx = (hts_idx_t*)calloc(1, sizeof(hts_idx_t));
2404
0
    if (idx == NULL) return NULL;
2405
0
    idx->fmt = fmt;
2406
0
    idx->min_shift = min_shift;
2407
0
    idx->n_lvls = n_lvls;
2408
0
    idx->n_bins = ((1<<(3 * n_lvls + 3)) - 1) / 7;
2409
0
    idx->z.save_tid = idx->z.last_tid = -1;
2410
0
    idx->z.save_bin = idx->z.last_bin = 0xffffffffu;
2411
0
    idx->z.save_off = idx->z.last_off = idx->z.off_beg = idx->z.off_end = offset0;
2412
0
    idx->z.last_coor = 0xffffffffu;
2413
0
    if (n) {
2414
0
        idx->n = idx->m = n;
2415
0
        idx->bidx = (bidx_t**)calloc(n, sizeof(bidx_t*));
2416
0
        if (idx->bidx == NULL) { free(idx); return NULL; }
2417
0
        idx->lidx = (lidx_t*) calloc(n, sizeof(lidx_t));
2418
0
        if (idx->lidx == NULL) { free(idx->bidx); free(idx); return NULL; }
2419
0
    }
2420
0
    idx->tbi_n = -1;
2421
0
    idx->last_tbi_tid = -1;
2422
0
    idx->otf_fp = NULL;
2423
0
    return idx;
2424
0
}
2425
2426
static void update_loff(hts_idx_t *idx, int i, int free_lidx)
2427
0
{
2428
0
    bidx_t *bidx = idx->bidx[i];
2429
0
    lidx_t *lidx = &idx->lidx[i];
2430
0
    khint_t k;
2431
0
    int l;
2432
    // the last entry is always valid
2433
0
    for (l=lidx->n-2; l >= 0; l--) {
2434
0
        if (lidx->offset[l] == (uint64_t)-1)
2435
0
            lidx->offset[l] = lidx->offset[l+1];
2436
0
    }
2437
0
    if (bidx == 0) return;
2438
0
    for (k = kh_begin(bidx); k != kh_end(bidx); ++k) // set loff
2439
0
        if (kh_exist(bidx, k))
2440
0
        {
2441
0
            if ( kh_key(bidx, k) < idx->n_bins )
2442
0
            {
2443
0
                int bot_bin = hts_bin_bot(kh_key(bidx, k), idx->n_lvls);
2444
                // disable linear index if bot_bin out of bounds
2445
0
                kh_val(bidx, k).loff = bot_bin < lidx->n ? lidx->offset[bot_bin] : 0;
2446
0
            }
2447
0
            else
2448
0
                kh_val(bidx, k).loff = 0;
2449
0
        }
2450
0
    if (free_lidx) {
2451
0
        free(lidx->offset);
2452
0
        lidx->m = lidx->n = 0;
2453
0
        lidx->offset = 0;
2454
0
    }
2455
0
}
2456
2457
static int compress_binning(hts_idx_t *idx, int i)
2458
0
{
2459
0
    bidx_t *bidx = idx->bidx[i];
2460
0
    khint_t k;
2461
0
    int l, m;
2462
0
    if (bidx == 0) return 0;
2463
    // merge a bin to its parent if the bin is too small
2464
0
    for (l = idx->n_lvls; l > 0; --l) {
2465
0
        unsigned start = hts_bin_first(l);
2466
0
        for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
2467
0
            bins_t *p, *q;
2468
0
            if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins || kh_key(bidx, k) < start) continue;
2469
0
            p = &kh_value(bidx, k);
2470
0
            if (l < idx->n_lvls && p->n > 1) ks_introsort(_off, p->n, p->list);
2471
0
            if ((p->list[p->n - 1].v>>16) - (p->list[0].u>>16) < HTS_MIN_MARKER_DIST) {
2472
0
                khint_t kp;
2473
0
                kp = kh_get(bin, bidx, hts_bin_parent(kh_key(bidx, k)));
2474
0
                if (kp == kh_end(bidx)) continue;
2475
0
                q = &kh_val(bidx, kp);
2476
0
                if (q->n + p->n > q->m) {
2477
0
                    uint32_t new_m = q->n + p->n;
2478
0
                    hts_pair64_t *new_list;
2479
0
                    kroundup32(new_m);
2480
0
                    if (new_m > INT32_MAX) return -1; // Limited by index format
2481
0
                    new_list = realloc(q->list, new_m * sizeof(*new_list));
2482
0
                    if (!new_list) return -1;
2483
0
                    q->m = new_m;
2484
0
                    q->list = new_list;
2485
0
                }
2486
0
                memcpy(q->list + q->n, p->list, p->n * sizeof(hts_pair64_t));
2487
0
                q->n += p->n;
2488
0
                free(p->list);
2489
0
                kh_del(bin, bidx, k);
2490
0
            }
2491
0
        }
2492
0
    }
2493
0
    k = kh_get(bin, bidx, 0);
2494
0
    if (k != kh_end(bidx)) ks_introsort(_off, kh_val(bidx, k).n, kh_val(bidx, k).list);
2495
    // merge adjacent chunks that start from the same BGZF block
2496
0
    for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
2497
0
        bins_t *p;
2498
0
        if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins) continue;
2499
0
        p = &kh_value(bidx, k);
2500
0
        for (l = 1, m = 0; l < p->n; ++l) {
2501
0
            if (p->list[m].v>>16 >= p->list[l].u>>16) {
2502
0
                if (p->list[m].v < p->list[l].v) p->list[m].v = p->list[l].v;
2503
0
            } else p->list[++m] = p->list[l];
2504
0
        }
2505
0
        p->n = m + 1;
2506
0
    }
2507
0
    return 0;
2508
0
}
2509
2510
int hts_idx_finish(hts_idx_t *idx, uint64_t final_offset)
2511
0
{
2512
0
    int i, ret = 0;
2513
0
    if (idx == NULL || idx->z.finished) return 0; // do not run this function on an empty index or multiple times
2514
0
    if (idx->z.save_tid >= 0) {
2515
0
        ret |= insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin, idx->z.save_off, final_offset);
2516
0
        ret |= insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.off_beg, final_offset);
2517
0
        ret |= insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.n_mapped, idx->z.n_unmapped);
2518
0
    }
2519
0
    for (i = 0; i < idx->n; ++i) {
2520
0
        update_loff(idx, i, (idx->fmt == HTS_FMT_CSI));
2521
0
        ret |= compress_binning(idx, i);
2522
0
    }
2523
0
    idx->z.finished = 1;
2524
2525
0
    return ret;
2526
0
}
2527
2528
static inline hts_pos_t hts_idx_maxpos(const hts_idx_t *idx)
2529
0
{
2530
0
    return hts_bin_maxpos(idx->min_shift, idx->n_lvls);
2531
0
}
2532
2533
int hts_idx_check_range(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end)
2534
0
{
2535
0
    hts_pos_t maxpos = hts_idx_maxpos(idx);
2536
0
    if (tid < 0 || (beg <= maxpos && end <= maxpos))
2537
0
        return 0;
2538
2539
0
    if (idx->fmt == HTS_FMT_CSI) {
2540
0
        hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos" "
2541
0
                      "cannot be stored in a csi index with these parameters. "
2542
0
                      "Please use a larger min_shift or depth",
2543
0
                      beg, end);
2544
0
    } else {
2545
0
        hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos
2546
0
                      " cannot be stored in a %s index. Try using a csi index",
2547
0
                      beg, end, idx_format_name(idx->fmt));
2548
0
    }
2549
0
    errno = ERANGE;
2550
0
    return -1;
2551
0
}
2552
2553
int hts_idx_push(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, uint64_t offset, int is_mapped)
2554
0
{
2555
0
    int bin;
2556
0
    if (tid<0) beg = -1, end = 0;
2557
0
    if (hts_idx_check_range(idx, tid, beg, end) < 0)
2558
0
        return -1;
2559
0
    if (tid >= idx->m) { // enlarge the index
2560
0
        uint32_t new_m = idx->m * 2 > tid + 1 ? idx->m * 2 : tid + 1;
2561
0
        bidx_t **new_bidx;
2562
0
        lidx_t *new_lidx;
2563
2564
0
        new_bidx = (bidx_t**)realloc(idx->bidx, new_m * sizeof(bidx_t*));
2565
0
        if (!new_bidx) return -1;
2566
0
        idx->bidx = new_bidx;
2567
2568
0
        new_lidx = (lidx_t*) realloc(idx->lidx, new_m * sizeof(lidx_t));
2569
0
        if (!new_lidx) return -1;
2570
0
        idx->lidx = new_lidx;
2571
2572
0
        memset(&idx->bidx[idx->m], 0, (new_m - idx->m) * sizeof(bidx_t*));
2573
0
        memset(&idx->lidx[idx->m], 0, (new_m - idx->m) * sizeof(lidx_t));
2574
0
        idx->m = new_m;
2575
0
    }
2576
0
    if (idx->n < tid + 1) idx->n = tid + 1;
2577
0
    if (idx->z.finished) return 0;
2578
0
    if (idx->z.last_tid != tid || (idx->z.last_tid >= 0 && tid < 0)) { // change of chromosome
2579
0
        if ( tid>=0 && idx->n_no_coor )
2580
0
        {
2581
0
            hts_log_error("NO_COOR reads not in a single block at the end %d %d", tid, idx->z.last_tid);
2582
0
            return -1;
2583
0
        }
2584
0
        if (tid>=0 && idx->bidx[tid] != 0)
2585
0
        {
2586
0
            hts_log_error("Chromosome blocks not continuous");
2587
0
            return -1;
2588
0
        }
2589
0
        idx->z.last_tid = tid;
2590
0
        idx->z.last_bin = 0xffffffffu;
2591
0
    } else if (tid >= 0 && idx->z.last_coor > beg) { // test if positions are out of order
2592
0
        hts_log_error("Unsorted positions on sequence #%d: %"PRIhts_pos" followed by %"PRIhts_pos, tid+1, idx->z.last_coor+1, beg+1);
2593
0
        return -1;
2594
0
    }
2595
0
    if (end < beg) {
2596
        // Malformed ranges are errors. (Empty ranges (beg==end) are unusual but acceptable.)
2597
0
        hts_log_error("Invalid record on sequence #%d: end %"PRId64" < begin %"PRId64, tid+1, end, beg+1);
2598
0
        return -1;
2599
0
    }
2600
0
    if ( tid>=0 )
2601
0
    {
2602
0
        if (idx->bidx[tid] == 0) idx->bidx[tid] = kh_init(bin);
2603
        // shoehorn [-1,0) (VCF POS=0) into the leftmost bottom-level bin
2604
0
        if (beg < 0)  beg = 0;
2605
0
        if (end <= 0) end = 1;
2606
        // idx->z.last_off points to the start of the current record
2607
0
        if (insert_to_l(&idx->lidx[tid], beg, end,
2608
0
                        idx->z.last_off, idx->min_shift) < 0) return -1;
2609
0
    }
2610
0
    else idx->n_no_coor++;
2611
0
    bin = hts_reg2bin(beg, end, idx->min_shift, idx->n_lvls);
2612
0
    if ((int)idx->z.last_bin != bin) { // then possibly write the binning index
2613
0
        if (idx->z.save_bin != 0xffffffffu) { // save_bin==0xffffffffu only happens to the first record
2614
0
            if (insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin,
2615
0
                            idx->z.save_off, idx->z.last_off) < 0) return -1;
2616
0
        }
2617
0
        if (idx->z.last_bin == 0xffffffffu && idx->z.save_bin != 0xffffffffu) { // change of chr; keep meta information
2618
0
            idx->z.off_end = idx->z.last_off;
2619
0
            if (insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx),
2620
0
                            idx->z.off_beg, idx->z.off_end) < 0) return -1;
2621
0
            if (insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx),
2622
0
                            idx->z.n_mapped, idx->z.n_unmapped) < 0) return -1;
2623
0
            idx->z.n_mapped = idx->z.n_unmapped = 0;
2624
0
            idx->z.off_beg = idx->z.off_end;
2625
0
        }
2626
0
        idx->z.save_off = idx->z.last_off;
2627
0
        idx->z.save_bin = idx->z.last_bin = bin;
2628
0
        idx->z.save_tid = tid;
2629
0
    }
2630
0
    if (is_mapped) ++idx->z.n_mapped;
2631
0
    else ++idx->z.n_unmapped;
2632
0
    idx->z.last_off = offset;
2633
0
    idx->z.last_coor = beg;
2634
0
    return 0;
2635
0
}
2636
2637
// Needed for TBI only.  Ensure 'tid' with 'name' is in the index meta data.
2638
// idx->meta needs to have been initialised first with an appropriate Tabix
2639
// configuration via hts_idx_set_meta.
2640
//
2641
// NB number of references (first 4 bytes of tabix header) aren't in
2642
// idx->meta, but held in idx->n instead.
2643
0
int hts_idx_tbi_name(hts_idx_t *idx, int tid, const char *name) {
2644
    // Horrid - we have to map incoming tid to a tbi alternative tid.
2645
    // This is because TBI counts tids by "covered" refs while everything
2646
    // else counts by Nth SQ/contig record in header.
2647
0
    if (tid == idx->last_tbi_tid || tid < 0 || !name)
2648
0
        return idx->tbi_n;
2649
2650
0
    uint32_t len = strlen(name)+1;
2651
0
    uint8_t *tmp = (uint8_t *)realloc(idx->meta, idx->l_meta + len);
2652
0
    if (!tmp)
2653
0
        return -1;
2654
2655
    // Append name
2656
0
    idx->meta = tmp;
2657
0
    strcpy((char *)idx->meta + idx->l_meta, name);
2658
0
    idx->l_meta += len;
2659
2660
    // Update seq length
2661
0
    u32_to_le(le_to_u32(idx->meta+24)+len, idx->meta+24);
2662
2663
0
    idx->last_tbi_tid = tid;
2664
0
    return ++idx->tbi_n;
2665
0
}
2666
2667
// When doing samtools index we have a read_bam / hts_idx_push(bgzf_tell())
2668
// loop.  idx->z.last_off is the previous bzgf_tell location, so we know
2669
// the location the current bam record started at as well as where it ends.
2670
//
2671
// When building an index on the fly via a write_bam / hts_idx_push loop,
2672
// this isn't quite identical as we may amend the virtual coord returned
2673
// by bgzf_tell to the start of a new block if the next bam struct doesn't
2674
// fit.  It's essentially the same thing, but for bit-identical indices
2675
// we need to amend the idx->z.last_off when we know we're starting a new
2676
// block.
2677
void hts_idx_amend_last(hts_idx_t *idx, uint64_t offset)
2678
0
{
2679
0
    idx->z.last_off = offset;
2680
0
}
2681
2682
void hts_idx_destroy(hts_idx_t *idx)
2683
18.4k
{
2684
18.4k
    khint_t k;
2685
18.4k
    int i;
2686
18.4k
    if (idx == 0) return;
2687
2688
    // For HTS_FMT_CRAI, idx actually points to a different type -- see sam.c
2689
0
    if (idx->fmt == HTS_FMT_CRAI) {
2690
0
        hts_cram_idx_t *cidx = (hts_cram_idx_t *) idx;
2691
0
        cram_index_free(cidx->cram);
2692
0
        free(cidx);
2693
0
        return;
2694
0
    }
2695
2696
0
    for (i = 0; i < idx->m; ++i) {
2697
0
        bidx_t *bidx = idx->bidx[i];
2698
0
        free(idx->lidx[i].offset);
2699
0
        if (bidx == 0) continue;
2700
0
        for (k = kh_begin(bidx); k != kh_end(bidx); ++k)
2701
0
            if (kh_exist(bidx, k))
2702
0
                free(kh_value(bidx, k).list);
2703
0
        kh_destroy(bin, bidx);
2704
0
    }
2705
0
    free(idx->bidx); free(idx->lidx); free(idx->meta);
2706
0
    free(idx);
2707
0
}
2708
2709
0
int hts_idx_fmt(hts_idx_t *idx) {
2710
0
    return idx->fmt;
2711
0
}
2712
2713
// The optimizer eliminates these ed_is_big() calls; still it would be good to
2714
// TODO Determine endianness at configure- or compile-time
2715
2716
static inline ssize_t HTS_RESULT_USED idx_write_int32(BGZF *fp, int32_t x)
2717
0
{
2718
0
    if (ed_is_big()) x = ed_swap_4(x);
2719
0
    return bgzf_write(fp, &x, sizeof x);
2720
0
}
2721
2722
static inline ssize_t HTS_RESULT_USED idx_write_uint32(BGZF *fp, uint32_t x)
2723
0
{
2724
0
    if (ed_is_big()) x = ed_swap_4(x);
2725
0
    return bgzf_write(fp, &x, sizeof x);
2726
0
}
2727
2728
static inline ssize_t HTS_RESULT_USED idx_write_uint64(BGZF *fp, uint64_t x)
2729
0
{
2730
0
    if (ed_is_big()) x = ed_swap_8(x);
2731
0
    return bgzf_write(fp, &x, sizeof x);
2732
0
}
2733
2734
static inline void swap_bins(bins_t *p)
2735
0
{
2736
0
    int i;
2737
0
    for (i = 0; i < p->n; ++i) {
2738
0
        ed_swap_8p(&p->list[i].u);
2739
0
        ed_swap_8p(&p->list[i].v);
2740
0
    }
2741
0
}
2742
2743
static int need_idx_ugly_delay_hack(const hts_idx_t *idx)
2744
0
{
2745
    // Ugly hack for on-the-fly BAI indexes.  As these are uncompressed,
2746
    // we need to delay writing a few bytes of data until file close
2747
    // so that we have something to force a modification time update.
2748
    //
2749
    // (For compressed indexes like CSI, the BGZF EOF block serves the same
2750
    // purpose).
2751
0
    return idx->otf_fp && !idx->otf_fp->is_compressed;
2752
0
}
2753
2754
static int idx_save_core(const hts_idx_t *idx, BGZF *fp, int fmt)
2755
0
{
2756
0
    int32_t i, j;
2757
2758
0
    #define check(ret) if ((ret) < 0) return -1
2759
2760
    // VCF TBI/CSI only writes IDs for non-empty bins (ie covered references)
2761
    //
2762
    // NOTE: CSI meta is undefined in spec, so this code has an assumption
2763
    // that we're only using it for Tabix data.
2764
0
    int nids = idx->n;
2765
0
    if (idx->meta && idx->l_meta >= 4 && le_to_u32(idx->meta) == TBX_VCF) {
2766
0
        for (i = nids = 0; i < idx->n; ++i) {
2767
0
            if (idx->bidx[i])
2768
0
                nids++;
2769
0
        }
2770
0
    }
2771
0
    check(idx_write_int32(fp, nids));
2772
0
    if (fmt == HTS_FMT_TBI && idx->l_meta)
2773
0
        check(bgzf_write(fp, idx->meta, idx->l_meta));
2774
2775
0
    for (i = 0; i < idx->n; ++i) {
2776
0
        khint_t k;
2777
0
        bidx_t *bidx = idx->bidx[i];
2778
0
        lidx_t *lidx = &idx->lidx[i];
2779
2780
        // write binning index
2781
0
        if (nids == idx->n || bidx)
2782
0
            check(idx_write_int32(fp, bidx? kh_size(bidx) : 0));
2783
0
        if (bidx)
2784
0
            for (k = kh_begin(bidx); k != kh_end(bidx); ++k)
2785
0
                if (kh_exist(bidx, k)) {
2786
0
                    bins_t *p = &kh_value(bidx, k);
2787
0
                    check(idx_write_uint32(fp, kh_key(bidx, k)));
2788
0
                    if (fmt == HTS_FMT_CSI) check(idx_write_uint64(fp, p->loff));
2789
                    //int j;for(j=0;j<p->n;++j)fprintf(stderr,"%d,%llx,%d,%llx:%llx\n",kh_key(bidx,k),kh_val(bidx, k).loff,j,p->list[j].u,p->list[j].v);
2790
0
                    check(idx_write_int32(fp, p->n));
2791
0
                    for (j = 0; j < p->n; ++j) {
2792
                        //fprintf(stderr, "\t%ld\t%ld\n", p->list[j].u, p->list[j].v);
2793
0
                        check(idx_write_uint64(fp, p->list[j].u));
2794
0
                        check(idx_write_uint64(fp, p->list[j].v));
2795
0
                    }
2796
0
                }
2797
2798
        // write linear index
2799
0
        if (fmt != HTS_FMT_CSI) {
2800
0
            check(idx_write_int32(fp, lidx->n));
2801
0
            for (j = 0; j < lidx->n; ++j)
2802
0
                check(idx_write_uint64(fp, lidx->offset[j]));
2803
0
        }
2804
0
    }
2805
2806
0
    if (!need_idx_ugly_delay_hack(idx)) {
2807
        // Write this for compressed (CSI) indexes, but for BAI we
2808
        // need to save a bit for later.  See hts_idx_close_otf_fp()
2809
0
        check(idx_write_uint64(fp, idx->n_no_coor));
2810
0
    }
2811
2812
#ifdef DEBUG_INDEX
2813
    idx_dump(idx);
2814
#endif
2815
2816
0
    return 0;
2817
0
    #undef check
2818
0
}
2819
2820
int hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt)
2821
0
{
2822
0
    int ret, save;
2823
0
    if (idx == NULL || fn == NULL) { errno = EINVAL; return -1; }
2824
0
    char *fnidx = (char*)calloc(1, strlen(fn) + 5);
2825
0
    if (fnidx == NULL) return -1;
2826
2827
0
    strcpy(fnidx, fn);
2828
0
    switch (fmt) {
2829
0
    case HTS_FMT_BAI: strcat(fnidx, ".bai"); break;
2830
0
    case HTS_FMT_CSI: strcat(fnidx, ".csi"); break;
2831
0
    case HTS_FMT_TBI: strcat(fnidx, ".tbi"); break;
2832
0
    default: abort();
2833
0
    }
2834
2835
0
    ret = hts_idx_save_as(idx, fn, fnidx, fmt);
2836
0
    save = errno;
2837
0
    free(fnidx);
2838
0
    errno = save;
2839
0
    return ret;
2840
0
}
2841
2842
static int hts_idx_write_out(const hts_idx_t *idx, BGZF *fp, int fmt)
2843
0
{
2844
0
    #define check(ret) if ((ret) < 0) return -1
2845
2846
0
    if (fmt == HTS_FMT_CSI) {
2847
0
        check(bgzf_write(fp, "CSI\1", 4));
2848
0
        check(idx_write_int32(fp, idx->min_shift));
2849
0
        check(idx_write_int32(fp, idx->n_lvls));
2850
0
        check(idx_write_uint32(fp, idx->l_meta));
2851
0
        if (idx->l_meta) check(bgzf_write(fp, idx->meta, idx->l_meta));
2852
0
    } else if (fmt == HTS_FMT_TBI) {
2853
0
        check(bgzf_write(fp, "TBI\1", 4));
2854
0
    } else if (fmt == HTS_FMT_BAI) {
2855
0
        check(bgzf_write(fp, "BAI\1", 4));
2856
0
    } else abort();
2857
2858
0
    check(idx_save_core(idx, fp, fmt));
2859
2860
0
    #undef check
2861
0
    return 0;
2862
0
}
2863
2864
int hts_idx_save_as(const hts_idx_t *idx, const char *fn, const char *fnidx, int fmt)
2865
0
{
2866
0
    BGZF *fp;
2867
2868
0
    if (fnidx == NULL)
2869
0
        return hts_idx_save(idx, fn, fmt);
2870
2871
0
    fp = bgzf_open(fnidx, (fmt == HTS_FMT_BAI)? "wu" : "w");
2872
0
    if (fp == NULL) return -1;
2873
2874
0
    if (hts_idx_write_out(idx, fp, fmt) < 0) {
2875
0
        int save_errno = errno;
2876
0
        bgzf_close(fp);
2877
0
        errno = save_errno;
2878
0
        return -1;
2879
0
    }
2880
2881
0
    return bgzf_close(fp);
2882
0
}
2883
2884
// idx_save for on-the-fly indexes.  Mostly duplicated from above, except
2885
// idx is not const because we want to store the file handle in it, and
2886
// the index file handle is not closed.  This allows the index file to be
2887
// closed after the EOF block on the indexed file has been written out,
2888
// so the modification times on the two files will be in the correct order.
2889
int hts_idx_save_but_not_close(hts_idx_t *idx, const char *fnidx, int fmt)
2890
0
{
2891
0
    idx->otf_fp = bgzf_open(fnidx, (fmt == HTS_FMT_BAI)? "wu" : "w");
2892
0
    if (idx->otf_fp == NULL) return -1;
2893
2894
0
    if (hts_idx_write_out(idx, idx->otf_fp, fmt) < 0) {
2895
0
        int save_errno = errno;
2896
0
        bgzf_close(idx->otf_fp);
2897
0
        idx->otf_fp = NULL;
2898
0
        errno = save_errno;
2899
0
        return -1;
2900
0
    }
2901
2902
0
    return bgzf_flush(idx->otf_fp);
2903
0
}
2904
2905
static int hts_idx_close_otf_fp(hts_idx_t *idx)
2906
0
{
2907
0
    if (idx && idx->otf_fp) {
2908
0
        int ret = 0;
2909
0
        if (need_idx_ugly_delay_hack(idx)) {
2910
            // BAI index - write out the bytes we deferred earlier
2911
0
            ret = idx_write_uint64(idx->otf_fp, idx->n_no_coor) < 0;
2912
0
        }
2913
0
        ret |= bgzf_close(idx->otf_fp) < 0;
2914
0
        idx->otf_fp = NULL;
2915
0
        return ret == 0 ? 0 : -1;
2916
0
    }
2917
0
    return 0;
2918
0
}
2919
2920
static int idx_read_core(hts_idx_t *idx, BGZF *fp, int fmt)
2921
0
{
2922
0
    int32_t i, n, is_be;
2923
0
    is_be = ed_is_big();
2924
0
    if (idx == NULL) return -4;
2925
0
    for (i = 0; i < idx->n; ++i) {
2926
0
        bidx_t *h;
2927
0
        lidx_t *l = &idx->lidx[i];
2928
0
        uint32_t key;
2929
0
        int j, absent;
2930
0
        bins_t *p;
2931
0
        h = idx->bidx[i] = kh_init(bin);
2932
0
        if (bgzf_read(fp, &n, 4) != 4) return -1;
2933
0
        if (is_be) ed_swap_4p(&n);
2934
0
        if (n < 0) return -3;
2935
0
        for (j = 0; j < n; ++j) {
2936
0
            khint_t k;
2937
0
            if (bgzf_read(fp, &key, 4) != 4) return -1;
2938
0
            if (is_be) ed_swap_4p(&key);
2939
0
            k = kh_put(bin, h, key, &absent);
2940
0
            if (absent <  0) return -2; // No memory
2941
0
            if (absent == 0) return -3; // Duplicate bin number
2942
0
            p = &kh_val(h, k);
2943
0
            if (fmt == HTS_FMT_CSI) {
2944
0
                if (bgzf_read(fp, &p->loff, 8) != 8) return -1;
2945
0
                if (is_be) ed_swap_8p(&p->loff);
2946
0
            } else p->loff = 0;
2947
0
            if (bgzf_read(fp, &p->n, 4) != 4) return -1;
2948
0
            if (is_be) ed_swap_4p(&p->n);
2949
0
            if (p->n < 0) return -3;
2950
0
            if ((size_t) p->n > SIZE_MAX / sizeof(hts_pair64_t)) return -2;
2951
0
            p->m = p->n;
2952
0
            p->list = (hts_pair64_t*)malloc(p->m * sizeof(hts_pair64_t));
2953
0
            if (p->list == NULL) return -2;
2954
0
            if (bgzf_read(fp, p->list, ((size_t) p->n)<<4) != ((size_t) p->n)<<4) return -1;
2955
0
            if (is_be) swap_bins(p);
2956
0
        }
2957
0
        if (fmt != HTS_FMT_CSI) { // load linear index
2958
0
            int j, k;
2959
0
            uint32_t x;
2960
0
            if (bgzf_read(fp, &x, 4) != 4) return -1;
2961
0
            if (is_be) ed_swap_4p(&x);
2962
0
            l->n = x;
2963
0
            if (l->n < 0) return -3;
2964
0
            if ((size_t) l->n > SIZE_MAX / sizeof(uint64_t)) return -2;
2965
0
            l->m = l->n;
2966
0
            l->offset = (uint64_t*)malloc(l->n * sizeof(uint64_t));
2967
0
            if (l->offset == NULL) return -2;
2968
0
            if (bgzf_read(fp, l->offset, l->n << 3) != l->n << 3) return -1;
2969
0
            if (is_be) for (j = 0; j < l->n; ++j) ed_swap_8p(&l->offset[j]);
2970
0
            for (k = j = 0; j < l->n && l->offset[j] == 0; k = ++j); // stop at the first non-zero entry
2971
0
            for (j = l->n-1; j > k; j--) // fill missing values; may happen given older samtools and tabix
2972
0
                if (l->offset[j-1] == 0) l->offset[j-1] = l->offset[j];
2973
0
            update_loff(idx, i, 0);
2974
0
        }
2975
0
    }
2976
0
    if (bgzf_read(fp, &idx->n_no_coor, 8) != 8) idx->n_no_coor = 0;
2977
0
    if (is_be) ed_swap_8p(&idx->n_no_coor);
2978
#ifdef DEBUG_INDEX
2979
    idx_dump(idx);
2980
#endif
2981
2982
0
    return 0;
2983
0
}
2984
2985
static hts_idx_t *idx_read(const char *fn)
2986
0
{
2987
0
    uint8_t magic[4];
2988
0
    int i, is_be;
2989
0
    hts_idx_t *idx = NULL;
2990
0
    uint8_t *meta = NULL;
2991
0
    BGZF *fp = bgzf_open(fn, "r");
2992
0
    if (fp == NULL) return NULL;
2993
0
    is_be = ed_is_big();
2994
0
    if (bgzf_read(fp, magic, 4) != 4) goto fail;
2995
2996
0
    if (memcmp(magic, "CSI\1", 4) == 0) {
2997
0
        uint32_t x[3], n;
2998
0
        if (bgzf_read(fp, x, 12) != 12) goto fail;
2999
0
        if (is_be) for (i = 0; i < 3; ++i) ed_swap_4p(&x[i]);
3000
0
        if (x[2]) {
3001
0
            if (SIZE_MAX - x[2] < 1) goto fail; // Prevent possible overflow
3002
0
            if ((meta = (uint8_t*)malloc((size_t) x[2] + 1)) == NULL) goto fail;
3003
0
            if (bgzf_read(fp, meta, x[2]) != x[2]) goto fail;
3004
            // Prevent possible strlen past the end in tbx_index_load2
3005
0
            meta[x[2]] = '\0';
3006
0
        }
3007
0
        if (bgzf_read(fp, &n, 4) != 4) goto fail;
3008
0
        if (is_be) ed_swap_4p(&n);
3009
0
        if (n > INT32_MAX) goto fail;
3010
0
        if ((idx = hts_idx_init(n, HTS_FMT_CSI, 0, x[0], x[1])) == NULL) goto fail;
3011
0
        idx->l_meta = x[2];
3012
0
        idx->meta = meta;
3013
0
        meta = NULL;
3014
0
        if (idx_read_core(idx, fp, HTS_FMT_CSI) < 0) goto fail;
3015
0
    }
3016
0
    else if (memcmp(magic, "TBI\1", 4) == 0) {
3017
0
        uint8_t x[8 * 4];
3018
0
        uint32_t n;
3019
        // Read file header
3020
0
        if (bgzf_read(fp, x, sizeof(x)) != sizeof(x)) goto fail;
3021
0
        n = le_to_u32(&x[0]); // location of n_ref
3022
0
        if (n > INT32_MAX) goto fail;
3023
0
        if ((idx = hts_idx_init(n, HTS_FMT_TBI, 0, 14, 5)) == NULL) goto fail;
3024
0
        n = le_to_u32(&x[7*4]); // location of l_nm
3025
0
        if (n > UINT32_MAX - 29) goto fail; // Prevent possible overflow
3026
0
        idx->l_meta = 28 + n;
3027
0
        if ((idx->meta = (uint8_t*)malloc(idx->l_meta + 1)) == NULL) goto fail;
3028
        // copy format, col_seq, col_beg, col_end, meta, skip, l_nm
3029
        // N.B. left in little-endian byte order.
3030
0
        memcpy(idx->meta, &x[1*4], 28);
3031
        // Read in sequence names.
3032
0
        if (bgzf_read(fp, idx->meta + 28, n) != n) goto fail;
3033
        // Prevent possible strlen past the end in tbx_index_load2
3034
0
        idx->meta[idx->l_meta] = '\0';
3035
0
        if (idx_read_core(idx, fp, HTS_FMT_TBI) < 0) goto fail;
3036
0
    }
3037
0
    else if (memcmp(magic, "BAI\1", 4) == 0) {
3038
0
        uint32_t n;
3039
0
        if (bgzf_read(fp, &n, 4) != 4) goto fail;
3040
0
        if (is_be) ed_swap_4p(&n);
3041
0
        if (n > INT32_MAX) goto fail;
3042
0
        if ((idx = hts_idx_init(n, HTS_FMT_BAI, 0, 14, 5)) == NULL) goto fail;
3043
0
        if (idx_read_core(idx, fp, HTS_FMT_BAI) < 0) goto fail;
3044
0
    }
3045
0
    else { errno = EINVAL; goto fail; }
3046
3047
0
    bgzf_close(fp);
3048
0
    return idx;
3049
3050
0
fail:
3051
0
    bgzf_close(fp);
3052
0
    hts_idx_destroy(idx);
3053
0
    free(meta);
3054
0
    return NULL;
3055
0
}
3056
3057
int hts_idx_set_meta(hts_idx_t *idx, uint32_t l_meta, uint8_t *meta,
3058
                      int is_copy)
3059
0
{
3060
0
    uint8_t *new_meta = meta;
3061
0
    if (is_copy) {
3062
0
        size_t l = l_meta;
3063
0
        if (l > SIZE_MAX - 1) {
3064
0
            errno = ENOMEM;
3065
0
            return -1;
3066
0
        }
3067
0
        new_meta = malloc(l + 1);
3068
0
        if (!new_meta) return -1;
3069
0
        memcpy(new_meta, meta, l);
3070
        // Prevent possible strlen past the end in tbx_index_load2
3071
0
        new_meta[l] = '\0';
3072
0
    }
3073
0
    if (idx->meta) free(idx->meta);
3074
0
    idx->l_meta = l_meta;
3075
0
    idx->meta = new_meta;
3076
0
    return 0;
3077
0
}
3078
3079
uint8_t *hts_idx_get_meta(hts_idx_t *idx, uint32_t *l_meta)
3080
0
{
3081
0
    *l_meta = idx->l_meta;
3082
0
    return idx->meta;
3083
0
}
3084
3085
const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr)
3086
0
{
3087
0
    if ( !idx || !idx->n )
3088
0
    {
3089
0
        *n = 0;
3090
0
        return NULL;
3091
0
    }
3092
3093
0
    int tid = 0, i;
3094
0
    const char **names = (const char**) calloc(idx->n,sizeof(const char*));
3095
0
    for (i=0; i<idx->n; i++)
3096
0
    {
3097
0
        bidx_t *bidx = idx->bidx[i];
3098
0
        if ( !bidx ) continue;
3099
0
        names[tid++] = getid(hdr,i);
3100
0
    }
3101
0
    *n = tid;
3102
0
    return names;
3103
0
}
3104
3105
0
int hts_idx_nseq(const hts_idx_t *idx) {
3106
0
    if (!idx) return -1;
3107
0
    return idx->n;
3108
0
}
3109
3110
int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped, uint64_t* unmapped)
3111
0
{
3112
0
    if (!idx) return -1;
3113
0
    if ( idx->fmt == HTS_FMT_CRAI ) {
3114
0
        *mapped = 0; *unmapped = 0;
3115
0
        return -1;
3116
0
    }
3117
3118
0
    bidx_t *h = idx->bidx[tid];
3119
0
    if (!h) return -1;
3120
0
    khint_t k = kh_get(bin, h, META_BIN(idx));
3121
0
    if (k != kh_end(h)) {
3122
0
        *mapped = kh_val(h, k).list[1].u;
3123
0
        *unmapped = kh_val(h, k).list[1].v;
3124
0
        return 0;
3125
0
    } else {
3126
0
        *mapped = 0; *unmapped = 0;
3127
0
        return -1;
3128
0
    }
3129
0
}
3130
3131
uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx)
3132
0
{
3133
0
    if (idx->fmt == HTS_FMT_CRAI) return 0;
3134
0
    return idx->n_no_coor;
3135
0
}
3136
3137
/****************
3138
 *** Iterator ***
3139
 ****************/
3140
3141
// Note: even with 32-bit hts_pos_t, end needs to be 64-bit here due to 1LL<<s.
3142
static inline int reg2bins_narrow(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls, bidx_t *bidx)
3143
0
{
3144
0
    int l, t, s = min_shift + (n_lvls<<1) + n_lvls;
3145
0
    for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
3146
0
        hts_pos_t b, e;
3147
0
        int i;
3148
0
        b = t + (beg>>s); e = t + (end>>s);
3149
0
        for (i = b; i <= e; ++i) {
3150
0
            if (kh_get(bin, bidx, i) != kh_end(bidx)) {
3151
0
                assert(itr->bins.n < itr->bins.m);
3152
0
                itr->bins.a[itr->bins.n++] = i;
3153
0
            }
3154
0
        }
3155
0
    }
3156
0
    return itr->bins.n;
3157
0
}
3158
3159
static inline int reg2bins_wide(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls, bidx_t *bidx)
3160
0
{
3161
0
    khint_t i;
3162
0
    hts_pos_t max_shift = 3 * n_lvls + min_shift;
3163
0
    --end;
3164
0
    if (beg < 0) beg = 0;
3165
0
    for (i = kh_begin(bidx); i != kh_end(bidx); i++) {
3166
0
        if (!kh_exist(bidx, i)) continue;
3167
0
        hts_pos_t bin = (hts_pos_t) kh_key(bidx, i);
3168
0
        int level = hts_bin_level(bin);
3169
0
        if (level > n_lvls) continue; // Dodgy index?
3170
0
        hts_pos_t first = hts_bin_first(level);
3171
0
        hts_pos_t beg_at_level = first + (beg >> (max_shift - 3 * level));
3172
0
        hts_pos_t end_at_level = first + (end >> (max_shift - 3 * level));
3173
0
        if (beg_at_level <= bin && bin <= end_at_level) {
3174
0
            assert(itr->bins.n < itr->bins.m);
3175
0
            itr->bins.a[itr->bins.n++] = bin;
3176
0
        }
3177
0
    }
3178
0
    return itr->bins.n;
3179
0
}
3180
3181
static inline int reg2bins(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls, bidx_t *bidx)
3182
0
{
3183
0
    int l, t, s = min_shift + (n_lvls<<1) + n_lvls;
3184
0
    size_t reg_bin_count = 0, hash_bin_count = kh_n_buckets(bidx), max_bins;
3185
0
    hts_pos_t end1;
3186
0
    if (end >= 1LL<<s) end = 1LL<<s;
3187
0
    if (beg >= end) return 0;
3188
0
    end1 = end - 1;
3189
3190
    // Count bins to see if it's faster to iterate through the hash table
3191
    // or the set of bins covering the region
3192
0
    for (l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
3193
0
        reg_bin_count += (end1 >> s) - (beg >> s) + 1;
3194
0
    }
3195
0
    max_bins = reg_bin_count < kh_size(bidx) ? reg_bin_count : kh_size(bidx);
3196
0
    if (itr->bins.m - itr->bins.n < max_bins) {
3197
        // Worst-case memory usage.  May be wasteful on very sparse
3198
        // data, but the bin list usually won't be too big anyway.
3199
0
        size_t new_m = max_bins + itr->bins.n;
3200
0
        if (new_m > INT_MAX || new_m > SIZE_MAX / sizeof(int)) {
3201
0
            errno = ENOMEM;
3202
0
            return -1;
3203
0
        }
3204
0
        int *new_a = realloc(itr->bins.a, new_m * sizeof(*new_a));
3205
0
        if (!new_a) return -1;
3206
0
        itr->bins.a = new_a;
3207
0
        itr->bins.m = new_m;
3208
0
    }
3209
0
    if (reg_bin_count < hash_bin_count) {
3210
0
        return reg2bins_narrow(beg, end, itr, min_shift, n_lvls, bidx);
3211
0
    } else {
3212
0
        return reg2bins_wide(beg, end, itr, min_shift, n_lvls, bidx);
3213
0
    }
3214
0
}
3215
3216
static inline int add_to_interval(hts_itr_t *iter, bins_t *bin,
3217
                                  int tid, uint32_t interval,
3218
                                  uint64_t min_off, uint64_t max_off)
3219
0
{
3220
0
    hts_pair64_max_t *off;
3221
0
    int j;
3222
3223
0
    if (!bin->n)
3224
0
        return 0;
3225
0
    off = realloc(iter->off, (iter->n_off + bin->n) * sizeof(*off));
3226
0
    if (!off)
3227
0
        return -2;
3228
3229
0
    iter->off = off;
3230
0
    for (j = 0; j < bin->n; ++j) {
3231
0
        if (bin->list[j].v > min_off && bin->list[j].u < max_off) {
3232
0
            iter->off[iter->n_off].u = min_off > bin->list[j].u
3233
0
                ? min_off : bin->list[j].u;
3234
0
            iter->off[iter->n_off].v = max_off < bin->list[j].v
3235
0
                ? max_off : bin->list[j].v;
3236
            // hts_pair64_max_t::max is now used to link
3237
            // file offsets to region list entries.
3238
            // The iterator can use this to decide if it
3239
            // can skip some file regions.
3240
0
            iter->off[iter->n_off].max = ((uint64_t) tid << 32) | interval;
3241
0
            iter->n_off++;
3242
0
        }
3243
0
    }
3244
0
    return 0;
3245
0
}
3246
3247
static inline int reg2intervals_narrow(hts_itr_t *iter, const bidx_t *bidx,
3248
                                       int tid, int64_t beg, int64_t end,
3249
                                       uint32_t interval,
3250
                                       uint64_t min_off, uint64_t max_off,
3251
                                       int min_shift, int n_lvls)
3252
0
{
3253
0
    int l, t, s = min_shift + n_lvls * 3;
3254
0
    hts_pos_t b, e, i;
3255
3256
0
    for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
3257
0
        b = t + (beg>>s); e = t + (end>>s);
3258
0
        for (i = b; i <= e; ++i) {
3259
0
            khint_t k = kh_get(bin, bidx, i);
3260
0
            if (k != kh_end(bidx)) {
3261
0
                bins_t *bin = &kh_value(bidx, k);
3262
0
                int res = add_to_interval(iter, bin, tid, interval, min_off, max_off);
3263
0
                if (res < 0)
3264
0
                    return res;
3265
0
            }
3266
0
        }
3267
0
    }
3268
0
    return 0;
3269
0
}
3270
3271
static inline int reg2intervals_wide(hts_itr_t *iter, const bidx_t *bidx,
3272
                                       int tid, int64_t beg, int64_t end,
3273
                                       uint32_t interval,
3274
                                       uint64_t min_off, uint64_t max_off,
3275
                                       int min_shift, int n_lvls)
3276
0
{
3277
0
    khint_t i;
3278
0
    hts_pos_t max_shift = 3 * n_lvls + min_shift;
3279
0
    --end;
3280
0
    if (beg < 0) beg = 0;
3281
0
    for (i = kh_begin(bidx); i != kh_end(bidx); i++) {
3282
0
        if (!kh_exist(bidx, i)) continue;
3283
0
        hts_pos_t bin = (hts_pos_t) kh_key(bidx, i);
3284
0
        int level = hts_bin_level(bin);
3285
0
        if (level > n_lvls) continue; // Dodgy index?
3286
0
        hts_pos_t first = hts_bin_first(level);
3287
0
        hts_pos_t beg_at_level = first + (beg >> (max_shift - 3 * level));
3288
0
        hts_pos_t end_at_level = first + (end >> (max_shift - 3 * level));
3289
0
        if (beg_at_level <= bin && bin <= end_at_level) {
3290
0
            bins_t *bin = &kh_value(bidx, i);
3291
0
            int res = add_to_interval(iter, bin, tid, interval, min_off, max_off);
3292
0
            if (res < 0)
3293
0
                return res;
3294
0
        }
3295
0
    }
3296
0
    return 0;
3297
0
}
3298
3299
static inline int reg2intervals(hts_itr_t *iter, const hts_idx_t *idx, int tid, int64_t beg, int64_t end, uint32_t interval, uint64_t min_off, uint64_t max_off, int min_shift, int n_lvls)
3300
0
{
3301
0
    int l, t, s;
3302
0
    int i, j;
3303
0
    hts_pos_t end1;
3304
0
    bidx_t *bidx;
3305
0
    int start_n_off;
3306
0
    size_t reg_bin_count = 0, hash_bin_count;
3307
0
    int res;
3308
3309
0
    if (!iter || !idx || (bidx = idx->bidx[tid]) == NULL || beg > end)
3310
0
        return -1;
3311
3312
0
    hash_bin_count = kh_n_buckets(bidx);
3313
3314
0
    s = min_shift + (n_lvls<<1) + n_lvls;
3315
0
    if (end >= 1LL<<s)
3316
0
        end = 1LL<<s;
3317
3318
0
    end1 = end - 1;
3319
    // Count bins to see if it's faster to iterate through the hash table
3320
    // or the set of bins covering the region
3321
0
    for (l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
3322
0
        reg_bin_count += (end1 >> s) - (beg >> s) + 1;
3323
0
    }
3324
3325
0
    start_n_off = iter->n_off;
3326
3327
    // Populate iter->off with the intervals for this region
3328
0
    if (reg_bin_count < hash_bin_count) {
3329
0
        res = reg2intervals_narrow(iter, bidx, tid, beg, end, interval,
3330
0
                                   min_off, max_off, min_shift, n_lvls);
3331
0
    } else {
3332
0
        res = reg2intervals_wide(iter, bidx, tid, beg, end, interval,
3333
0
                                 min_off, max_off, min_shift, n_lvls);
3334
0
    }
3335
0
    if (res < 0)
3336
0
        return res;
3337
3338
0
    if (iter->n_off - start_n_off > 1) {
3339
0
        ks_introsort(_off_max, iter->n_off - start_n_off, iter->off + start_n_off);
3340
0
        for (i = start_n_off, j = start_n_off + 1; j < iter->n_off; j++) {
3341
0
            if (iter->off[i].v >= iter->off[j].u) {
3342
0
                if (iter->off[i].v < iter->off[j].v)
3343
0
                    iter->off[i].v = iter->off[j].v;
3344
0
            } else {
3345
0
                i++;
3346
0
                if (i < j)
3347
0
                    iter->off[i] = iter->off[j];
3348
0
            }
3349
0
        }
3350
0
        iter->n_off = i + 1;
3351
0
    }
3352
3353
0
    return iter->n_off;
3354
0
}
3355
3356
0
static int compare_regions(const void *r1, const void *r2) {
3357
0
    hts_reglist_t *reg1 = (hts_reglist_t *)r1;
3358
0
    hts_reglist_t *reg2 = (hts_reglist_t *)r2;
3359
3360
0
    if (reg1->tid < 0 && reg2->tid >= 0)
3361
0
        return 1;
3362
0
    else if (reg1->tid >= 0 && reg2->tid < 0)
3363
0
        return -1;
3364
0
    else
3365
0
        return reg1->tid - reg2->tid;
3366
0
}
3367
3368
0
uint64_t hts_itr_off(const hts_idx_t* idx, int tid) {
3369
3370
0
    int i;
3371
0
    bidx_t* bidx;
3372
0
    uint64_t off0 = (uint64_t) -1;
3373
0
    khint_t k;
3374
0
    switch (tid) {
3375
0
    case HTS_IDX_START:
3376
        // Find the smallest offset, note that sequence ids may not be ordered sequentially
3377
0
        for (i = 0; i < idx->n; i++) {
3378
0
            bidx = idx->bidx[i];
3379
0
            k = kh_get(bin, bidx, META_BIN(idx));
3380
0
            if (k == kh_end(bidx))
3381
0
                continue;
3382
3383
0
            if (off0 > kh_val(bidx, k).list[0].u)
3384
0
                off0 = kh_val(bidx, k).list[0].u;
3385
0
        }
3386
0
        if (off0 == (uint64_t) -1 && idx->n_no_coor)
3387
0
            off0 = 0;
3388
        // only no-coor reads in this bam
3389
0
        break;
3390
0
    case HTS_IDX_NOCOOR:
3391
        /* No-coor reads sort after all of the mapped reads.  The position
3392
           is not stored in the index itself, so need to find the end
3393
           offset for the last mapped read.  A loop is needed here in
3394
           case references at the end of the file have no mapped reads,
3395
           or sequence ids are not ordered sequentially.
3396
           See issue samtools#568 and commits b2aab8, 60c22d and cc207d. */
3397
0
        for (i = 0; i < idx->n; i++) {
3398
0
            bidx = idx->bidx[i];
3399
0
            k = kh_get(bin, bidx, META_BIN(idx));
3400
0
            if (k != kh_end(bidx)) {
3401
0
                if (off0 == (uint64_t) -1 || off0 < kh_val(bidx, k).list[0].v) {
3402
0
                    off0 = kh_val(bidx, k).list[0].v;
3403
0
                }
3404
0
            }
3405
0
        }
3406
0
        if (off0 == (uint64_t) -1 && idx->n_no_coor)
3407
0
            off0 = 0;
3408
        // only no-coor reads in this bam
3409
0
        break;
3410
0
    case HTS_IDX_REST:
3411
0
        off0 = 0;
3412
0
        break;
3413
0
    case HTS_IDX_NONE:
3414
0
        off0 = 0;
3415
0
        break;
3416
0
    }
3417
3418
0
    return off0;
3419
0
}
3420
3421
hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec)
3422
0
{
3423
0
    int i, n_off, l, bin;
3424
0
    hts_pair64_max_t *off;
3425
0
    khint_t k;
3426
0
    bidx_t *bidx;
3427
0
    uint64_t min_off, max_off;
3428
0
    hts_pos_t idx_maxpos;
3429
0
    hts_itr_t *iter;
3430
0
    uint32_t unmapped = 0, rel_off;
3431
3432
    // It's possible to call this function with NULL idx iff
3433
    // tid is one of the special values HTS_IDX_REST or HTS_IDX_NONE
3434
0
    if (!idx && !(tid == HTS_IDX_REST || tid == HTS_IDX_NONE)) {
3435
0
        errno = EINVAL;
3436
0
        return NULL;
3437
0
    }
3438
3439
0
    iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t));
3440
0
    if (iter) {
3441
0
        if (tid < 0) {
3442
0
            uint64_t off = hts_itr_off(idx, tid);
3443
0
            if (off != (uint64_t) -1) {
3444
0
                iter->read_rest = 1;
3445
0
                iter->curr_off = off;
3446
0
                iter->readrec = readrec;
3447
0
                if (tid == HTS_IDX_NONE)
3448
0
                    iter->finished = 1;
3449
0
            } else {
3450
0
                free(iter);
3451
0
                iter = NULL;
3452
0
            }
3453
0
        } else if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL) {
3454
0
            iter->finished = 1;
3455
0
        } else {
3456
0
            if (beg < 0) beg = 0;
3457
0
            if (end < beg) {
3458
0
              free(iter);
3459
0
              return NULL;
3460
0
            }
3461
3462
0
            k = kh_get(bin, bidx, META_BIN(idx));
3463
0
            if (k != kh_end(bidx))
3464
0
                unmapped = kh_val(bidx, k).list[1].v;
3465
0
            else
3466
0
                unmapped = 1;
3467
3468
0
            iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
3469
0
            iter->readrec = readrec;
3470
3471
0
            if ( !kh_size(bidx) ) { iter->finished = 1; return iter; }
3472
3473
0
            idx_maxpos = hts_idx_maxpos(idx);
3474
0
            if (beg >= idx_maxpos) { iter->finished = 1; return iter; }
3475
3476
0
            rel_off = beg>>idx->min_shift;
3477
            // compute min_off
3478
0
            bin = hts_bin_first(idx->n_lvls) + rel_off;
3479
0
            do {
3480
0
                int first;
3481
0
                k = kh_get(bin, bidx, bin);
3482
0
                if (k != kh_end(bidx)) break;
3483
0
                first = (hts_bin_parent(bin)<<3) + 1;
3484
0
                if (bin > first) --bin;
3485
0
                else bin = hts_bin_parent(bin);
3486
0
            } while (bin);
3487
0
            if (bin == 0) k = kh_get(bin, bidx, bin);
3488
0
            min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0;
3489
            // min_off can be calculated more accurately if the
3490
            // linear index is available
3491
0
            if (idx->lidx[tid].offset
3492
0
                && rel_off < idx->lidx[tid].n) {
3493
0
                if (min_off < idx->lidx[tid].offset[rel_off])
3494
0
                    min_off = idx->lidx[tid].offset[rel_off];
3495
0
                if (unmapped) {
3496
                    // unmapped reads are not covered by the linear index,
3497
                    // so search backwards for a smaller offset
3498
0
                    int tmp_off;
3499
0
                    for (tmp_off = rel_off-1; tmp_off >= 0; tmp_off--) {
3500
0
                        if (idx->lidx[tid].offset[tmp_off] < min_off) {
3501
0
                            min_off = idx->lidx[tid].offset[tmp_off];
3502
0
                            break;
3503
0
                        }
3504
0
                    }
3505
                    // if the search went too far back or no satisfactory entry
3506
                    // was found, revert to the bin index loff value
3507
0
                    if (k != kh_end(bidx) && (min_off < kh_val(bidx, k).loff || tmp_off < 0))
3508
0
                        min_off = kh_val(bidx, k).loff;
3509
0
                }
3510
0
            } else if (unmapped) { //CSI index
3511
0
                if (k != kh_end(bidx))
3512
0
                    min_off = kh_val(bidx, k).loff;
3513
0
            }
3514
3515
            // compute max_off: a virtual offset from a bin to the right of end
3516
            // First check if end lies within the range of the index (it won't
3517
            // if it's HTS_POS_MAX)
3518
0
            if (end <= idx_maxpos) {
3519
0
                bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1;
3520
0
                if (bin >= idx->n_bins) bin = 0;
3521
0
                while (1) {
3522
                    // search for an extant bin by moving right, but moving up to the
3523
                    // parent whenever we get to a first child (which also covers falling
3524
                    // off the RHS, which wraps around and immediately goes up to bin 0)
3525
0
                    while (bin % 8 == 1) bin = hts_bin_parent(bin);
3526
0
                    if (bin == 0) { max_off = UINT64_MAX; break; }
3527
0
                    k = kh_get(bin, bidx, bin);
3528
0
                    if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) { max_off = kh_val(bidx, k).list[0].u; break; }
3529
0
                    bin++;
3530
0
                }
3531
0
            } else {
3532
                // Searching to end of reference
3533
0
                max_off = UINT64_MAX;
3534
0
            }
3535
3536
            // retrieve bins
3537
0
            if (reg2bins(beg, end, iter, idx->min_shift, idx->n_lvls, bidx) < 0) {
3538
0
                hts_itr_destroy(iter);
3539
0
                return NULL;
3540
0
            }
3541
3542
0
            for (i = n_off = 0; i < iter->bins.n; ++i)
3543
0
                if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx))
3544
0
                    n_off += kh_value(bidx, k).n;
3545
0
            if (n_off == 0) {
3546
                // No overlapping bins means the iterator has already finished.
3547
0
                iter->finished = 1;
3548
0
                return iter;
3549
0
            }
3550
0
            off = calloc(n_off, sizeof(*off));
3551
0
            for (i = n_off = 0; i < iter->bins.n; ++i) {
3552
0
                if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx)) {
3553
0
                    int j;
3554
0
                    bins_t *p = &kh_value(bidx, k);
3555
0
                    for (j = 0; j < p->n; ++j)
3556
0
                        if (p->list[j].v > min_off && p->list[j].u < max_off) {
3557
0
                            off[n_off].u = min_off > p->list[j].u
3558
0
                                ? min_off : p->list[j].u;
3559
0
                            off[n_off].v = max_off < p->list[j].v
3560
0
                                ? max_off : p->list[j].v;
3561
                            // hts_pair64_max_t::max is now used to link
3562
                            // file offsets to region list entries.
3563
                            // The iterator can use this to decide if it
3564
                            // can skip some file regions.
3565
0
                            off[n_off].max = ((uint64_t) tid << 32) | j;
3566
0
                            n_off++;
3567
0
                        }
3568
0
                }
3569
0
            }
3570
3571
0
            if (n_off == 0) {
3572
0
                free(off);
3573
0
                iter->finished = 1;
3574
0
                return iter;
3575
0
            }
3576
0
            ks_introsort(_off_max, n_off, off);
3577
            // resolve completely contained adjacent blocks
3578
0
            for (i = 1, l = 0; i < n_off; ++i)
3579
0
                if (off[l].v < off[i].v) off[++l] = off[i];
3580
0
            n_off = l + 1;
3581
            // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
3582
0
            for (i = 1; i < n_off; ++i)
3583
0
                if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
3584
            // merge adjacent blocks
3585
0
            for (i = 1, l = 0; i < n_off; ++i) {
3586
0
                if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
3587
0
                else off[++l] = off[i];
3588
0
            }
3589
0
            n_off = l + 1;
3590
0
            iter->n_off = n_off; iter->off = off;
3591
0
        }
3592
0
    }
3593
3594
0
    return iter;
3595
0
}
3596
3597
int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter)
3598
0
{
3599
0
    int i, j, bin;
3600
0
    khint_t k;
3601
0
    bidx_t *bidx;
3602
0
    uint64_t min_off, max_off, t_off = (uint64_t)-1;
3603
0
    int tid;
3604
0
    hts_pos_t beg, end, idx_maxpos;
3605
0
    hts_reglist_t *curr_reg;
3606
0
    uint32_t unmapped = 0, rel_off;
3607
3608
0
    if (!idx || !iter || !iter->multi)
3609
0
        return -1;
3610
3611
0
    iter->i = -1;
3612
0
    for (i=0; i<iter->n_reg; i++) {
3613
3614
0
        curr_reg = &iter->reg_list[i];
3615
0
        tid = curr_reg->tid;
3616
3617
0
        if (tid < 0) {
3618
0
            t_off = hts_itr_off(idx, tid);
3619
0
            if (t_off != (uint64_t)-1) {
3620
0
                switch (tid) {
3621
0
                case HTS_IDX_NONE:
3622
0
                    iter->finished = 1;
3623
                    // fall through
3624
0
                case HTS_IDX_START:
3625
0
                case HTS_IDX_REST:
3626
0
                    iter->curr_off = t_off;
3627
0
                    iter->n_reg = 0;
3628
0
                    iter->reg_list = NULL;
3629
0
                    iter->read_rest = 1;
3630
0
                    return 0;
3631
0
                case HTS_IDX_NOCOOR:
3632
0
                    iter->nocoor = 1;
3633
0
                    iter->nocoor_off = t_off;
3634
0
                }
3635
0
            }
3636
0
        } else {
3637
0
            if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL || !kh_size(bidx))
3638
0
                continue;
3639
3640
0
            k = kh_get(bin, bidx, META_BIN(idx));
3641
0
            if (k != kh_end(bidx))
3642
0
                unmapped = kh_val(bidx, k).list[1].v;
3643
0
            else
3644
0
                unmapped = 1;
3645
3646
0
            idx_maxpos = hts_idx_maxpos(idx);
3647
3648
0
            for(j=0; j<curr_reg->count; j++) {
3649
0
                hts_pair32_t *curr_intv = &curr_reg->intervals[j];
3650
0
                if (curr_intv->end < curr_intv->beg)
3651
0
                    continue;
3652
3653
0
                beg = curr_intv->beg;
3654
0
                end = curr_intv->end;
3655
0
                if (beg >= idx_maxpos)
3656
0
                    continue;
3657
0
                rel_off = beg>>idx->min_shift;
3658
3659
                /* Compute 'min_off' by searching the lowest level bin containing 'beg'.
3660
                       If the computed bin is not in the index, try the next bin to the
3661
                       left, belonging to the same parent. If it is the first sibling bin,
3662
                       try the parent bin. */
3663
0
                bin = hts_bin_first(idx->n_lvls) + rel_off;
3664
0
                do {
3665
0
                    int first;
3666
0
                    k = kh_get(bin, bidx, bin);
3667
0
                    if (k != kh_end(bidx)) break;
3668
0
                    first = (hts_bin_parent(bin)<<3) + 1;
3669
0
                    if (bin > first) --bin;
3670
0
                    else bin = hts_bin_parent(bin);
3671
0
                } while (bin);
3672
0
                if (bin == 0)
3673
0
                    k = kh_get(bin, bidx, bin);
3674
0
                min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0;
3675
                // min_off can be calculated more accurately if the
3676
                // linear index is available
3677
0
                if (idx->lidx[tid].offset
3678
0
                    && rel_off < idx->lidx[tid].n) {
3679
0
                    if (min_off < idx->lidx[tid].offset[rel_off])
3680
0
                        min_off = idx->lidx[tid].offset[rel_off];
3681
0
                    if (unmapped) {
3682
0
                        int tmp_off;
3683
0
                        for (tmp_off = rel_off-1; tmp_off >= 0; tmp_off--) {
3684
0
                            if (idx->lidx[tid].offset[tmp_off] < min_off) {
3685
0
                                min_off = idx->lidx[tid].offset[tmp_off];
3686
0
                                break;
3687
0
                            }
3688
0
                        }
3689
3690
0
                        if (k != kh_end(bidx) && (min_off < kh_val(bidx, k).loff || tmp_off < 0))
3691
0
                            min_off = kh_val(bidx, k).loff;
3692
0
                    }
3693
0
                } else if (unmapped) { //CSI index
3694
0
                    if (k != kh_end(bidx))
3695
0
                        min_off = kh_val(bidx, k).loff;
3696
0
                }
3697
3698
                // compute max_off: a virtual offset from a bin to the right of end
3699
                // First check if end lies within the range of the index (it
3700
                // won't if it's HTS_POS_MAX)
3701
0
                if (end <= idx_maxpos) {
3702
0
                    bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1;
3703
0
                    if (bin >= idx->n_bins) bin = 0;
3704
0
                    while (1) {
3705
                        // search for an extant bin by moving right, but moving up to the
3706
                        // parent whenever we get to a first child (which also covers falling
3707
                        // off the RHS, which wraps around and immediately goes up to bin 0)
3708
0
                        while (bin % 8 == 1) bin = hts_bin_parent(bin);
3709
0
                        if (bin == 0) { max_off = UINT64_MAX; break; }
3710
0
                        k = kh_get(bin, bidx, bin);
3711
0
                        if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) {
3712
0
                            max_off = kh_val(bidx, k).list[0].u;
3713
0
                            break;
3714
0
                        }
3715
0
                        bin++;
3716
0
                    }
3717
0
                } else {
3718
                    // Searching to end of reference
3719
0
                    max_off = UINT64_MAX;
3720
0
                }
3721
3722
                //convert coordinates to file offsets
3723
0
                if (reg2intervals(iter, idx, tid, beg, end, j,
3724
0
                                  min_off, max_off,
3725
0
                                  idx->min_shift, idx->n_lvls) < 0) {
3726
0
                    return -1;
3727
0
                }
3728
0
            }
3729
0
        }
3730
0
    }
3731
3732
0
    if (iter->n_off > 1)
3733
0
        ks_introsort(_off_max, iter->n_off, iter->off);
3734
3735
0
    if(!iter->n_off && !iter->nocoor)
3736
0
        iter->finished = 1;
3737
3738
0
    return 0;
3739
0
}
3740
3741
int hts_itr_multi_cram(const hts_idx_t *idx, hts_itr_t *iter)
3742
0
{
3743
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
3744
0
    int tid, i, n_off = 0;
3745
0
    uint32_t j;
3746
0
    hts_pos_t beg, end;
3747
0
    hts_reglist_t *curr_reg;
3748
0
    hts_pair32_t *curr_intv;
3749
0
    hts_pair64_max_t *off = NULL, *tmp;
3750
0
    cram_index *e = NULL;
3751
3752
0
    if (!cidx || !iter || !iter->multi)
3753
0
        return -1;
3754
3755
0
    iter->is_cram = 1;
3756
0
    iter->read_rest = 0;
3757
0
    iter->off = NULL;
3758
0
    iter->n_off = 0;
3759
0
    iter->curr_off = 0;
3760
0
    iter->i = -1;
3761
3762
0
    for (i=0; i<iter->n_reg; i++) {
3763
3764
0
        curr_reg = &iter->reg_list[i];
3765
0
        tid = curr_reg->tid;
3766
3767
0
        if (tid >= 0) {
3768
0
            tmp = realloc(off, (n_off + curr_reg->count) * sizeof(*off));
3769
0
            if (!tmp)
3770
0
                goto err;
3771
0
            off = tmp;
3772
3773
0
            for (j=0; j < curr_reg->count; j++) {
3774
0
                curr_intv = &curr_reg->intervals[j];
3775
0
                if (curr_intv->end < curr_intv->beg)
3776
0
                    continue;
3777
3778
0
                beg = curr_intv->beg;
3779
0
                end = curr_intv->end;
3780
3781
/* First, fetch the container overlapping 'beg' and assign its file offset to u, then
3782
 * find the container overlapping 'end' and assign the relative end of the slice to v.
3783
 * The cram_ptell function will adjust with the container offset, which is not stored
3784
 * in the index.
3785
 */
3786
0
                e = cram_index_query(cidx->cram, tid, beg+1, NULL);
3787
0
                if (e) {
3788
0
                    off[n_off].u = e->offset;
3789
                    // hts_pair64_max_t::max is now used to link
3790
                    // file offsets to region list entries.
3791
                    // The iterator can use this to decide if it
3792
                    // can skip some file regions.
3793
0
                    off[n_off].max = ((uint64_t) tid << 32) | j;
3794
3795
0
                    if (end >= HTS_POS_MAX) {
3796
0
                       e = cram_index_last(cidx->cram, tid, NULL);
3797
0
                    } else {
3798
0
                       e = cram_index_query_last(cidx->cram, tid, end+1);
3799
0
                    }
3800
3801
0
                    if (e) {
3802
0
                        off[n_off++].v = e->e_next
3803
0
                            ? e->e_next->offset
3804
0
                            : e->offset + e->slice + e->len;
3805
0
                    } else {
3806
0
                        hts_log_warning("Could not set offset end for region %d:%"PRIhts_pos"-%"PRIhts_pos". Skipping", tid, beg, end);
3807
0
                    }
3808
0
                }
3809
0
            }
3810
0
        } else {
3811
0
            switch (tid) {
3812
0
                case HTS_IDX_NOCOOR:
3813
0
                    e = cram_index_query(cidx->cram, tid, 1, NULL);
3814
0
                    if (e) {
3815
0
                        iter->nocoor = 1;
3816
0
                        iter->nocoor_off = e->offset;
3817
0
                    } else {
3818
0
                        hts_log_warning("No index entry for NOCOOR region");
3819
0
                    }
3820
0
                    break;
3821
0
                case HTS_IDX_START:
3822
0
                    e = cram_index_query(cidx->cram, tid, 1, NULL);
3823
0
                    if (e) {
3824
0
                        iter->read_rest = 1;
3825
0
                        tmp = realloc(off, sizeof(*off));
3826
0
                        if (!tmp)
3827
0
                            goto err;
3828
0
                        off = tmp;
3829
0
                        off[0].u = e->offset;
3830
0
                        off[0].v = 0;
3831
0
                        n_off=1;
3832
0
                    } else {
3833
0
                        hts_log_warning("No index entries");
3834
0
                    }
3835
0
                    break;
3836
0
                case HTS_IDX_REST:
3837
0
                    break;
3838
0
                case HTS_IDX_NONE:
3839
0
                    iter->finished = 1;
3840
0
                    break;
3841
0
                default:
3842
0
                    hts_log_error("Query with tid=%d not implemented for CRAM files", tid);
3843
0
            }
3844
0
        }
3845
0
    }
3846
3847
0
    if (n_off) {
3848
0
        ks_introsort(_off_max, n_off, off);
3849
0
        iter->n_off = n_off; iter->off = off;
3850
0
    }
3851
3852
0
    if(!n_off && !iter->nocoor)
3853
0
        iter->finished = 1;
3854
3855
0
    return 0;
3856
3857
0
 err:
3858
0
    free(off);
3859
0
    return -1;
3860
0
}
3861
3862
void hts_itr_destroy(hts_itr_t *iter)
3863
0
{
3864
0
    if (iter) {
3865
0
        if (iter->multi) {
3866
0
            hts_reglist_free(iter->reg_list, iter->n_reg);
3867
0
        } else {
3868
0
            free(iter->bins.a);
3869
0
        }
3870
3871
0
        if (iter->off)
3872
0
            free(iter->off);
3873
0
        free(iter);
3874
0
    }
3875
0
}
3876
3877
static inline unsigned long long push_digit(unsigned long long i, char c)
3878
0
{
3879
    // ensure subtraction occurs first, avoiding overflow for >= MAX-48 or so
3880
0
    int digit = c - '0';
3881
0
    return 10 * i + digit;
3882
0
}
3883
3884
long long hts_parse_decimal(const char *str, char **strend, int flags)
3885
0
{
3886
0
    unsigned long long n = 0;
3887
0
    int digits = 0, decimals = 0, e = 0, lost = 0;
3888
0
    char sign = '+', esign = '+';
3889
0
    const char *s, *str_orig = str;
3890
3891
0
    while (isspace_c(*str)) str++;
3892
0
    s = str;
3893
3894
0
    if (*s == '+' || *s == '-') sign = *s++;
3895
0
    while (*s)
3896
0
        if (isdigit_c(*s)) digits++, n = push_digit(n, *s++);
3897
0
        else if (*s == ',' && (flags & HTS_PARSE_THOUSANDS_SEP)) s++;
3898
0
        else break;
3899
3900
0
    if (*s == '.') {
3901
0
        s++;
3902
0
        while (isdigit_c(*s)) decimals++, digits++, n = push_digit(n, *s++);
3903
0
    }
3904
3905
0
    switch (*s) {
3906
0
    case 'e': case 'E':
3907
0
        s++;
3908
0
        if (*s == '+' || *s == '-') esign = *s++;
3909
0
        while (isdigit_c(*s)) e = push_digit(e, *s++);
3910
0
        if (esign == '-') e = -e;
3911
0
        break;
3912
3913
0
    case 'k': case 'K': e += 3; s++; break;
3914
0
    case 'm': case 'M': e += 6; s++; break;
3915
0
    case 'g': case 'G': e += 9; s++; break;
3916
0
    }
3917
3918
0
    e -= decimals;
3919
0
    while (e > 0) n *= 10, e--;
3920
0
    while (e < 0) lost += n % 10, n /= 10, e++;
3921
3922
0
    if (lost > 0) {
3923
0
        hts_log_warning("Discarding fractional part of %.*s", (int)(s - str), str);
3924
0
    }
3925
3926
0
    if (strend) {
3927
        // Set to the original input str pointer if not valid number syntax
3928
0
        *strend = (digits > 0)? (char *)s : (char *)str_orig;
3929
0
    } else if (digits == 0) {
3930
0
        hts_log_warning("Invalid numeric value %.8s[truncated]", str);
3931
0
    } else if (*s) {
3932
0
        if ((flags & HTS_PARSE_THOUSANDS_SEP) || (!(flags & HTS_PARSE_THOUSANDS_SEP) && *s != ','))
3933
0
            hts_log_warning("Ignoring unknown characters after %.*s[%s]", (int)(s - str), str, s);
3934
0
    }
3935
3936
0
    return (sign == '+')? n : -n;
3937
0
}
3938
3939
0
static void *hts_memrchr(const void *s, int c, size_t n) {
3940
0
    size_t i;
3941
0
    unsigned char *u = (unsigned char *)s;
3942
0
    for (i = n; i > 0; i--) {
3943
0
        if (u[i-1] == c)
3944
0
            return u+i-1;
3945
0
    }
3946
3947
0
    return NULL;
3948
0
}
3949
3950
/*
3951
 * A variant of hts_parse_reg which is reference-id aware.  It uses
3952
 * the iterator name2id callbacks to validate the region tokenisation works.
3953
 *
3954
 * This is necessary due to GRCh38 HLA additions which have reference names
3955
 * like "HLA-DRB1*12:17".
3956
 *
3957
 * All parameters are mandatory.
3958
 *
3959
 * To work around ambiguous parsing issues, eg both "chr1" and "chr1:100-200"
3960
 * are reference names, we may quote using curly braces.
3961
 * Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example.
3962
 *
3963
 * Flags are used to control how parsing works, and can be one of the below.
3964
 *
3965
 * HTS_PARSE_LIST:
3966
 *     If present, the region is assmed to be a comma separated list and
3967
 *     position parsing will not contain commas (this implicitly
3968
 *     clears HTS_PARSE_THOUSANDS_SEP in the call to hts_parse_decimal).
3969
 *     On success the return pointer will be the start of the next region, ie
3970
 *     the character after the comma.  (If *ret != '\0' then the caller can
3971
 *     assume another region is present in the list.)
3972
 *
3973
 *     If not set then positions may contain commas.  In this case the return
3974
 *     value should point to the end of the string, or NULL on failure.
3975
 *
3976
 * HTS_PARSE_ONE_COORD:
3977
 *     If present, X:100 is treated as the single base pair region X:100-100.
3978
 *     In this case X:-100 is shorthand for X:1-100 and X:100- is X:100-<end>.
3979
 *     (This is the standard bcftools region convention.)
3980
 *
3981
 *     When not set X:100 is considered to be X:100-<end> where <end> is
3982
 *     the end of chromosome X (set to HTS_POS_MAX here).  X:100- and X:-100
3983
 *     are invalid.
3984
 *     (This is the standard samtools region convention.)
3985
 *
3986
 * Note the supplied string expects 1 based inclusive coordinates, but the
3987
 * returned coordinates start from 0 and are half open, so pos0 is valid
3988
 * for use in e.g. "for (pos0 = beg; pos0 < end; pos0++) {...}"
3989
 *
3990
 * On success a pointer to the byte after the end of the entire region
3991
 *            specifier is returned (plus any trailing comma), and tid,
3992
 *            beg & end will be set.
3993
 * On failure NULL is returned.
3994
 */
3995
const char *hts_parse_region(const char *s, int *tid, hts_pos_t *beg,
3996
                             hts_pos_t *end, hts_name2id_f getid, void *hdr,
3997
                             int flags)
3998
0
{
3999
0
    if (!s || !tid || !beg || !end || !getid)
4000
0
        return NULL;
4001
4002
0
    size_t s_len = strlen(s);
4003
0
    kstring_t ks = { 0, 0, NULL };
4004
4005
0
    const char *colon = NULL, *comma = NULL;
4006
0
    int quoted = 0;
4007
4008
0
    if (flags & HTS_PARSE_LIST)
4009
0
        flags &= ~HTS_PARSE_THOUSANDS_SEP;
4010
0
    else
4011
0
        flags |= HTS_PARSE_THOUSANDS_SEP;
4012
4013
0
    const char *s_end = s + s_len;
4014
4015
    // Braced quoting of references is permitted to resolve ambiguities.
4016
0
    if (*s == '{') {
4017
0
        const char *close = memchr(s, '}', s_len);
4018
0
        if (!close) {
4019
0
            hts_log_error("Mismatching braces in \"%s\"", s);
4020
0
            *tid = -1;
4021
0
            return NULL;
4022
0
        }
4023
0
        s++;
4024
0
        s_len--;
4025
0
        if (close[1] == ':')
4026
0
            colon = close+1;
4027
0
        quoted = 1; // number of trailing characters to trim
4028
4029
        // Truncate to this item only, if appropriate.
4030
0
        if (flags & HTS_PARSE_LIST) {
4031
0
            comma = strchr(close, ',');
4032
0
            if (comma) {
4033
0
                s_len = comma-s;
4034
0
                s_end = comma+1;
4035
0
            }
4036
0
        }
4037
0
    } else {
4038
        // Truncate to this item only, if appropriate.
4039
0
        if (flags & HTS_PARSE_LIST) {
4040
0
            comma = strchr(s, ',');
4041
0
            if (comma) {
4042
0
                s_len = comma-s;
4043
0
                s_end = comma+1;
4044
0
            }
4045
0
        }
4046
4047
0
        colon = hts_memrchr(s, ':', s_len);
4048
0
    }
4049
4050
    // No colon is simplest case; just check and return.
4051
0
    if (colon == NULL) {
4052
0
        *beg = 0; *end = HTS_POS_MAX;
4053
0
        kputsn(s, s_len-quoted, &ks); // convert to nul terminated string
4054
0
        if (!ks.s) {
4055
0
            *tid = -2;
4056
0
            return NULL;
4057
0
        }
4058
4059
0
        *tid = getid(hdr, ks.s);
4060
0
        free(ks.s);
4061
4062
0
        return *tid >= 0 ? s_end : NULL;
4063
0
    }
4064
4065
    // Has a colon, but check whole name first.
4066
0
    if (!quoted) {
4067
0
        *beg = 0; *end = HTS_POS_MAX;
4068
0
        kputsn(s, s_len, &ks); // convert to nul terminated string
4069
0
        if (!ks.s) {
4070
0
            *tid = -2;
4071
0
            return NULL;
4072
0
        }
4073
0
        if ((*tid = getid(hdr, ks.s)) >= 0) {
4074
            // Entire name matches, but also check this isn't
4075
            // ambiguous.  eg we have ref chr1 and ref chr1:100-200
4076
            // both present.
4077
0
            ks.l = 0;
4078
0
            kputsn(s, colon-s, &ks); // convert to nul terminated string
4079
0
            if (!ks.s) {
4080
0
                *tid = -2;
4081
0
                return NULL;
4082
0
            }
4083
0
            if (getid(hdr, ks.s) >= 0) {
4084
0
                free(ks.s);
4085
0
                *tid = -1;
4086
0
                hts_log_error("Range is ambiguous. "
4087
0
                              "Use {%s} or {%.*s}%s instead",
4088
0
                              s, (int)(colon-s), s, colon);
4089
0
                return NULL;
4090
0
            }
4091
0
            free(ks.s);
4092
4093
0
            return s_end;
4094
0
        }
4095
0
        if (*tid < -1) // Failed to parse header
4096
0
            return NULL;
4097
0
    }
4098
4099
    // Quoted, or unquoted and whole string isn't a name.
4100
    // Check the pre-colon part is valid.
4101
0
    ks.l = 0;
4102
0
    kputsn(s, colon-s-quoted, &ks); // convert to nul terminated string
4103
0
    if (!ks.s) {
4104
0
        *tid = -2;
4105
0
        return NULL;
4106
0
    }
4107
0
    *tid = getid(hdr, ks.s);
4108
0
    free(ks.s);
4109
0
    if (*tid < 0)
4110
0
        return NULL;
4111
4112
    // Finally parse the post-colon coordinates
4113
0
    char *hyphen;
4114
0
    *beg = hts_parse_decimal(colon+1, &hyphen, flags) - 1;
4115
0
    if (*beg < 0) {
4116
0
        if (*beg != -1 && *hyphen == '-' && colon[1] != '\0') {
4117
            // User specified zero, but we're 1-based.
4118
0
            hts_log_error("Coordinates must be > 0");
4119
0
            return NULL;
4120
0
        }
4121
0
        if (isdigit_c(*hyphen) || *hyphen == '\0' || *hyphen == ',') {
4122
            // interpret chr:-100 as chr:1-100
4123
0
            *end = *beg==-1 ? HTS_POS_MAX : -(*beg+1);
4124
0
            *beg = 0;
4125
0
            return s_end;
4126
0
        } else if (*beg < -1) {
4127
0
            hts_log_error("Unexpected string \"%s\" after region", hyphen);
4128
0
            return NULL;
4129
0
        }
4130
0
    }
4131
4132
0
    if (*hyphen == '\0' || ((flags & HTS_PARSE_LIST) && *hyphen == ',')) {
4133
0
        *end = flags & HTS_PARSE_ONE_COORD ? *beg+1 : HTS_POS_MAX;
4134
0
    } else if (*hyphen == '-') {
4135
0
        *end = hts_parse_decimal(hyphen+1, &hyphen, flags);
4136
0
        if (*hyphen != '\0' && *hyphen != ',') {
4137
0
            hts_log_error("Unexpected string \"%s\" after region", hyphen);
4138
0
            return NULL;
4139
0
        }
4140
0
    } else {
4141
0
        hts_log_error("Unexpected string \"%s\" after region", hyphen);
4142
0
        return NULL;
4143
0
    }
4144
4145
0
    if (*end == 0)
4146
0
        *end = HTS_POS_MAX; // interpret chr:100- as chr:100-<end>
4147
4148
0
    if (*beg >= *end) return NULL;
4149
4150
0
    return s_end;
4151
0
}
4152
4153
// Next release we should mark this as deprecated?
4154
// Use hts_parse_region above instead.
4155
const char *hts_parse_reg64(const char *s, hts_pos_t *beg, hts_pos_t *end)
4156
0
{
4157
0
    char *hyphen;
4158
0
    const char *colon = strrchr(s, ':');
4159
0
    if (colon == NULL) {
4160
0
        *beg = 0; *end = HTS_POS_MAX;
4161
0
        return s + strlen(s);
4162
0
    }
4163
4164
0
    *beg = hts_parse_decimal(colon+1, &hyphen, HTS_PARSE_THOUSANDS_SEP) - 1;
4165
0
    if (*beg < 0) *beg = 0;
4166
4167
0
    if (*hyphen == '\0') *end = HTS_POS_MAX;
4168
0
    else if (*hyphen == '-') *end = hts_parse_decimal(hyphen+1, NULL, HTS_PARSE_THOUSANDS_SEP);
4169
0
    else return NULL;
4170
4171
0
    if (*beg >= *end) return NULL;
4172
0
    return colon;
4173
0
}
4174
4175
const char *hts_parse_reg(const char *s, int *beg, int *end)
4176
0
{
4177
0
    hts_pos_t beg64 = 0, end64 = 0;
4178
0
    const char *colon = hts_parse_reg64(s, &beg64, &end64);
4179
0
    if (beg64 > INT_MAX) {
4180
0
        hts_log_error("Position %"PRId64" too large", beg64);
4181
0
        return NULL;
4182
0
    }
4183
0
    if (end64 > INT_MAX) {
4184
0
        if (end64 == HTS_POS_MAX) {
4185
0
            end64 = INT_MAX;
4186
0
        } else {
4187
0
            hts_log_error("Position %"PRId64" too large", end64);
4188
0
            return NULL;
4189
0
        }
4190
0
    }
4191
0
    *beg = beg64;
4192
0
    *end = end64;
4193
0
    return colon;
4194
0
}
4195
4196
hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec)
4197
0
{
4198
0
    int tid;
4199
0
    hts_pos_t beg, end;
4200
4201
0
    if (strcmp(reg, ".") == 0)
4202
0
        return itr_query(idx, HTS_IDX_START, 0, 0, readrec);
4203
0
    else if (strcmp(reg, "*") == 0)
4204
0
        return itr_query(idx, HTS_IDX_NOCOOR, 0, 0, readrec);
4205
4206
0
    if (!hts_parse_region(reg, &tid, &beg, &end, getid, hdr, HTS_PARSE_THOUSANDS_SEP))
4207
0
        return NULL;
4208
4209
0
    return itr_query(idx, tid, beg, end, readrec);
4210
0
}
4211
4212
0
hts_itr_t *hts_itr_regions(const hts_idx_t *idx, hts_reglist_t *reglist, int count, hts_name2id_f getid, void *hdr, hts_itr_multi_query_func *itr_specific, hts_readrec_func *readrec, hts_seek_func *seek, hts_tell_func *tell) {
4213
4214
0
    int i;
4215
4216
0
    if (!reglist)
4217
0
        return NULL;
4218
4219
0
    hts_itr_t *itr = (hts_itr_t*)calloc(1, sizeof(hts_itr_t));
4220
0
    if (itr) {
4221
0
        itr->n_reg = count;
4222
0
        itr->readrec = readrec;
4223
0
        itr->seek = seek;
4224
0
        itr->tell = tell;
4225
0
        itr->reg_list = reglist;
4226
0
        itr->finished = 0;
4227
0
        itr->nocoor = 0;
4228
0
        itr->multi = 1;
4229
4230
0
        for (i = 0; i < itr->n_reg; i++) {
4231
0
            if (itr->reg_list[i].reg) {
4232
0
                if (!strcmp(itr->reg_list[i].reg, ".")) {
4233
0
                    itr->reg_list[i].tid = HTS_IDX_START;
4234
0
                    continue;
4235
0
                }
4236
4237
0
                if (!strcmp(itr->reg_list[i].reg, "*")) {
4238
0
                    itr->reg_list[i].tid = HTS_IDX_NOCOOR;
4239
0
                    continue;
4240
0
                }
4241
4242
0
                itr->reg_list[i].tid = getid(hdr, reglist[i].reg);
4243
0
                if (itr->reg_list[i].tid < 0) {
4244
0
                    if (itr->reg_list[i].tid < -1) {
4245
0
                        hts_log_error("Failed to parse header");
4246
0
                        hts_itr_destroy(itr);
4247
0
                        return NULL;
4248
0
                    } else {
4249
0
                        hts_log_warning("Region '%s' specifies an unknown reference name. Continue anyway", reglist[i].reg);
4250
0
                    }
4251
0
                }
4252
0
            }
4253
0
        }
4254
4255
0
        qsort(itr->reg_list, itr->n_reg, sizeof(hts_reglist_t), compare_regions);
4256
0
        if (itr_specific(idx, itr) != 0) {
4257
0
            hts_log_error("Failed to create the multi-region iterator!");
4258
0
            hts_itr_destroy(itr);
4259
0
            itr = NULL;
4260
0
        }
4261
0
    }
4262
4263
0
    return itr;
4264
0
}
4265
4266
int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data)
4267
0
{
4268
0
    int ret, tid;
4269
0
    hts_pos_t beg, end;
4270
0
    if (iter == NULL || iter->finished) return -1;
4271
0
    if (iter->read_rest) {
4272
0
        if (iter->curr_off) { // seek to the start
4273
0
            if (bgzf_seek(fp, iter->curr_off, SEEK_SET) < 0) {
4274
0
                hts_log_error("Failed to seek to offset %"PRIu64"%s%s",
4275
0
                              iter->curr_off,
4276
0
                              errno ? ": " : "", strerror(errno));
4277
0
                return -2;
4278
0
            }
4279
0
            iter->curr_off = 0; // only seek once
4280
0
        }
4281
0
        ret = iter->readrec(fp, data, r, &tid, &beg, &end);
4282
0
        if (ret < 0) iter->finished = 1;
4283
0
        iter->curr_tid = tid;
4284
0
        iter->curr_beg = beg;
4285
0
        iter->curr_end = end;
4286
0
        return ret;
4287
0
    }
4288
    // A NULL iter->off should always be accompanied by iter->finished.
4289
0
    assert(iter->off != NULL);
4290
0
    for (;;) {
4291
0
        if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
4292
0
            if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks
4293
0
            if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
4294
0
                if (bgzf_seek(fp, iter->off[iter->i+1].u, SEEK_SET) < 0) {
4295
0
                    hts_log_error("Failed to seek to offset %"PRIu64"%s%s",
4296
0
                                  iter->off[iter->i+1].u,
4297
0
                                  errno ? ": " : "", strerror(errno));
4298
0
                    return -2;
4299
0
                }
4300
0
                iter->curr_off = bgzf_tell(fp);
4301
0
            }
4302
0
            ++iter->i;
4303
0
        }
4304
0
        if ((ret = iter->readrec(fp, data, r, &tid, &beg, &end)) >= 0) {
4305
0
            iter->curr_off = bgzf_tell(fp);
4306
0
            if (tid != iter->tid || beg >= iter->end) { // no need to proceed
4307
0
                ret = -1; break;
4308
0
            } else if (end > iter->beg && iter->end > beg) {
4309
0
                iter->curr_tid = tid;
4310
0
                iter->curr_beg = beg;
4311
0
                iter->curr_end = end;
4312
0
                return ret;
4313
0
            }
4314
0
        } else break; // end of file or error
4315
0
    }
4316
0
    iter->finished = 1;
4317
0
    return ret;
4318
0
}
4319
4320
int hts_itr_multi_next(htsFile *fd, hts_itr_t *iter, void *r)
4321
0
{
4322
0
    void *fp;
4323
0
    int ret, tid, i, cr, ci;
4324
0
    hts_pos_t beg, end;
4325
0
    hts_reglist_t *found_reg;
4326
4327
0
    if (iter == NULL || iter->finished) return -1;
4328
4329
0
    if (iter->is_cram) {
4330
0
        fp = fd->fp.cram;
4331
0
    } else {
4332
0
        fp = fd->fp.bgzf;
4333
0
    }
4334
4335
0
    if (iter->read_rest) {
4336
0
        if (iter->curr_off) { // seek to the start
4337
0
            if (iter->seek(fp, iter->curr_off, SEEK_SET) < 0) {
4338
0
                hts_log_error("Seek at offset %" PRIu64 " failed.", iter->curr_off);
4339
0
                return -2;
4340
0
            }
4341
0
            iter->curr_off = 0; // only seek once
4342
0
        }
4343
4344
0
        ret = iter->readrec(fp, fd, r, &tid, &beg, &end);
4345
0
        if (ret < 0)
4346
0
            iter->finished = 1;
4347
4348
0
        iter->curr_tid = tid;
4349
0
        iter->curr_beg = beg;
4350
0
        iter->curr_end = end;
4351
4352
0
        return ret;
4353
0
    }
4354
    // A NULL iter->off should always be accompanied by iter->finished.
4355
0
    assert(iter->off != NULL || iter->nocoor != 0);
4356
4357
0
    int next_range = 0;
4358
0
    for (;;) {
4359
        // Note that due to the way bam indexing works, iter->off may contain
4360
        // file chunks that are not actually needed as they contain data
4361
        // beyond the end of the requested region.  These are filtered out
4362
        // by comparing the tid and index into hts_reglist_t::intervals
4363
        // (packed for reasons of convenience into iter->off[iter->i].max)
4364
        // associated with the file region with iter->curr_tid and
4365
        // iter->curr_intv.
4366
4367
0
        if (next_range
4368
0
            || iter->curr_off == 0
4369
0
            || iter->i >= iter->n_off
4370
0
            || iter->curr_off >= iter->off[iter->i].v
4371
0
            || (iter->off[iter->i].max >> 32 == iter->curr_tid
4372
0
                && (iter->off[iter->i].max & 0xffffffff) < iter->curr_intv)) {
4373
4374
            // Jump to the next chunk.  It may be necessary to skip more
4375
            // than one as the iter->off list can include overlapping entries.
4376
0
            do {
4377
0
                iter->i++;
4378
0
            } while (iter->i < iter->n_off
4379
0
                     && (iter->curr_off >= iter->off[iter->i].v
4380
0
                         || (iter->off[iter->i].max >> 32 == iter->curr_tid
4381
0
                             && (iter->off[iter->i].max & 0xffffffff) < iter->curr_intv)));
4382
4383
0
            if (iter->is_cram && iter->i < iter->n_off) {
4384
                // Ensure iter->curr_reg is correct.
4385
                //
4386
                // We need this for CRAM as we shortcut some of the later
4387
                // logic by getting an end-of-range and continuing to the
4388
                // next offset.
4389
                //
4390
                // We cannot do this for BAM (and fortunately do not need to
4391
                // either) because in BAM world a query to genomic positions
4392
                // GX and GY leading to a seek offsets PX and PY may have
4393
                // GX > GY and PX < PY.  (This is due to the R-tree and falling
4394
                // between intervals, bumping up to a higher bin.)
4395
                // CRAM strictly follows PX >= PY if GX >= GY, so this logic
4396
                // works.
4397
0
                int want_tid = iter->off[iter->i].max >> 32;
4398
0
                if (!(iter->curr_reg < iter->n_reg &&
4399
0
                      iter->reg_list[iter->curr_reg].tid == want_tid)) {
4400
0
                    int j;
4401
0
                    for (j = 0; j < iter->n_reg; j++)
4402
0
                        if (iter->reg_list[j].tid == want_tid)
4403
0
                            break;
4404
0
                    if (j == iter->n_reg)
4405
0
                        return -1;
4406
0
                    iter->curr_reg = j;
4407
0
                    iter->curr_tid = iter->reg_list[iter->curr_reg].tid;
4408
0
                };
4409
0
                iter->curr_intv = iter->off[iter->i].max & 0xffffffff;
4410
0
            }
4411
4412
0
            if (iter->i >= iter->n_off) { // no more chunks, except NOCOORs
4413
0
                if (iter->nocoor) {
4414
0
                    next_range = 0;
4415
0
                    if (iter->seek(fp, iter->nocoor_off, SEEK_SET) < 0) {
4416
0
                        hts_log_error("Seek at offset %" PRIu64 " failed.", iter->nocoor_off);
4417
0
                        return -2;
4418
0
                    }
4419
0
                    if (iter->is_cram) {
4420
0
                        cram_range r = { HTS_IDX_NOCOOR };
4421
0
                        cram_set_option(fp, CRAM_OPT_RANGE_NOSEEK, &r);
4422
0
                    }
4423
4424
                    // The first slice covering the unmapped reads might
4425
                    // contain a few mapped reads, so scroll
4426
                    // forward until finding the first unmapped read.
4427
0
                    do {
4428
0
                        ret = iter->readrec(fp, fd, r, &tid, &beg, &end);
4429
0
                    } while (tid >= 0 && ret >=0);
4430
4431
0
                    if (ret < 0)
4432
0
                        iter->finished = 1;
4433
0
                    else
4434
0
                        iter->read_rest = 1;
4435
4436
0
                    iter->curr_off = 0; // don't seek any more
4437
0
                    iter->curr_tid = tid;
4438
0
                    iter->curr_beg = beg;
4439
0
                    iter->curr_end = end;
4440
4441
0
                    return ret;
4442
0
                } else {
4443
0
                    ret = -1; break;
4444
0
                }
4445
0
            } else if (iter->i < iter->n_off) {
4446
                // New chunk may overlap the last one, so ensure we
4447
                // only seek forwards.
4448
0
                if (iter->curr_off < iter->off[iter->i].u || next_range) {
4449
0
                    iter->curr_off = iter->off[iter->i].u;
4450
4451
                    // CRAM has the capability of setting an end location.
4452
                    // This means multi-threaded decodes can stop once they
4453
                    // reach that point, rather than pointlessly decoding
4454
                    // more slices than we'll be using.
4455
                    //
4456
                    // We have to be careful here.  Whenever we set the cram
4457
                    // range we need a corresponding seek in order to ensure
4458
                    // we can safely decode at that offset.  We use next_range
4459
                    // var to ensure this is always true; this is set on
4460
                    // end-of-range condition. It's never modified for BAM.
4461
0
                    if (iter->is_cram) {
4462
                        // Next offset.[uv] tuple, but it's already been
4463
                        // included in our cram range, so don't seek and don't
4464
                        // reset range so we can efficiently multi-thread.
4465
0
                        if (next_range || iter->curr_off >= iter->end) {
4466
0
                            if (iter->seek(fp, iter->curr_off, SEEK_SET) < 0) {
4467
0
                                hts_log_error("Seek at offset %" PRIu64
4468
0
                                        " failed.", iter->curr_off);
4469
0
                                return -2;
4470
0
                            }
4471
4472
                            // Find the genomic range matching this interval.
4473
0
                            int j;
4474
0
                            hts_reglist_t *rl = &iter->reg_list[iter->curr_reg];
4475
0
                            cram_range r = {
4476
0
                                    rl->tid,
4477
0
                                    rl->intervals[iter->curr_intv].beg,
4478
0
                                    rl->intervals[iter->curr_intv].end
4479
0
                            };
4480
4481
                            // Expand it up to cover neighbouring intervals.
4482
                            // Note we can only have a single chromosome in a
4483
                            // range, so if we detect our blocks span chromosomes
4484
                            // or we have a multi-ref mode slice, we just use
4485
                            // HTS_IDX_START refid instead.  This doesn't actually
4486
                            // seek (due to CRAM_OPT_RANGE_NOSEEK) and is simply
4487
                            // and indicator of decoding with no end limit.
4488
                            //
4489
                            // That isn't as efficient as it could be, but it's
4490
                            // no poorer than before and it works.
4491
0
                            int tid = r.refid;
4492
0
                            int64_t end = r.end;
4493
0
                            int64_t v = iter->off[iter->i].v;
4494
0
                            j = iter->i+1;
4495
0
                            while (j < iter->n_off) {
4496
0
                                if (iter->off[j].u > v)
4497
0
                                    break;
4498
4499
0
                                uint64_t max = iter->off[j].max;
4500
0
                                if ((max>>32) != tid) {
4501
0
                                    tid = HTS_IDX_START; // => no range limit
4502
0
                                } else {
4503
0
                                    if (end < rl->intervals[max & 0xffffffff].end)
4504
0
                                        end = rl->intervals[max & 0xffffffff].end;
4505
0
                                }
4506
0
                                if (v < iter->off[j].v)
4507
0
                                    v = iter->off[j].v;
4508
0
                                j++;
4509
0
                            }
4510
0
                            r.refid = tid;
4511
0
                            r.end = end;
4512
4513
                            // Remember maximum 'v' here so we don't do
4514
                            // unnecessary subsequent seeks for the next
4515
                            // regions.  We can't change curr_off, but
4516
                            // beg/end are used only by single region iterator so
4517
                            // we cache it there to avoid changing the struct.
4518
0
                            iter->end = v;
4519
4520
0
                            cram_set_option(fp, CRAM_OPT_RANGE_NOSEEK, &r);
4521
0
                            next_range = 0;
4522
0
                        }
4523
0
                    } else { // Not CRAM
4524
0
                        if (iter->seek(fp, iter->curr_off, SEEK_SET) < 0) {
4525
0
                            hts_log_error("Seek at offset %" PRIu64 " failed.",
4526
0
                                          iter->curr_off);
4527
0
                            return -2;
4528
0
                        }
4529
0
                    }
4530
0
                }
4531
0
            }
4532
0
        }
4533
4534
0
        ret = iter->readrec(fp, fd, r, &tid, &beg, &end);
4535
0
        if (ret < 0) {
4536
0
            if (iter->is_cram && cram_eof(fp)) {
4537
                // Skip to end of range
4538
                //
4539
                // We should never be adjusting curr_off manually unless
4540
                // we also can guarantee we'll be doing a seek after to
4541
                // a new location.  Otherwise we'll be reading wrong offset
4542
                // for the next container.
4543
                //
4544
                // We ensure this by adjusting our CRAM_OPT_RANGE
4545
                // accordingly above, but to double check we also
4546
                // set the skipped_block flag to enforce a seek also.
4547
0
                iter->curr_off = iter->off[iter->i].v;
4548
0
                next_range = 1;
4549
4550
                // Next region
4551
0
                if (++iter->curr_intv >= iter->reg_list[iter->curr_reg].count){
4552
0
                    if (++iter->curr_reg >= iter->n_reg)
4553
0
                        break;
4554
0
                    iter->curr_intv = 0;
4555
0
                    iter->curr_tid = iter->reg_list[iter->curr_reg].tid;
4556
0
                }
4557
0
                continue;
4558
0
            } else {
4559
0
                break;
4560
0
            }
4561
0
        }
4562
4563
0
        iter->curr_off = iter->tell(fp);
4564
4565
0
        if (tid != iter->curr_tid) {
4566
0
            hts_reglist_t key;
4567
0
            key.tid = tid;
4568
4569
0
            found_reg = (hts_reglist_t *)bsearch(&key, iter->reg_list,
4570
0
                                                 iter->n_reg,
4571
0
                                                 sizeof(hts_reglist_t),
4572
0
                                                 compare_regions);
4573
0
            if (!found_reg)
4574
0
                continue;
4575
4576
0
            iter->curr_reg = (found_reg - iter->reg_list);
4577
0
            iter->curr_tid = tid;
4578
0
            iter->curr_intv = 0;
4579
0
        }
4580
4581
0
        cr = iter->curr_reg;
4582
0
        ci = iter->curr_intv;
4583
4584
0
        for (i = ci; i < iter->reg_list[cr].count; i++) {
4585
0
            if (end > iter->reg_list[cr].intervals[i].beg &&
4586
0
                iter->reg_list[cr].intervals[i].end > beg) {
4587
0
                iter->curr_beg = beg;
4588
0
                iter->curr_end = end;
4589
0
                iter->curr_intv = i;
4590
4591
0
                return ret;
4592
0
            }
4593
4594
            // Check if the read starts beyond intervals[i].end
4595
            // If so, the interval is finished so move on to the next.
4596
0
            if (beg > iter->reg_list[cr].intervals[i].end)
4597
0
                iter->curr_intv = i + 1;
4598
4599
            // No need to keep searching if the read ends before intervals[i].beg
4600
0
            if (end < iter->reg_list[cr].intervals[i].beg)
4601
0
                break;
4602
0
        }
4603
0
    }
4604
0
    iter->finished = 1;
4605
4606
0
    return ret;
4607
0
}
4608
4609
/**********************
4610
 *** Retrieve index ***
4611
 **********************/
4612
// Local_fn and local_len will return a sub-region of 'fn'.
4613
// Eg http://elsewhere/dir/foo.bam.bai?a=b may return
4614
// foo.bam.bai via local_fn and local_len.
4615
//
4616
// Returns -1 if index couldn't be opened.
4617
//         -2 on other errors
4618
static int idx_test_and_fetch(const char *fn, const char **local_fn, int *local_len, int download)
4619
0
{
4620
0
    hFILE *remote_hfp = NULL;
4621
0
    hFILE *local_fp = NULL;
4622
0
    int save_errno;
4623
0
    htsFormat fmt;
4624
0
    kstring_t s = KS_INITIALIZE;
4625
0
    kstring_t tmps = KS_INITIALIZE;
4626
4627
0
    if (hisremote(fn)) {
4628
0
        const int buf_size = 1 * 1024 * 1024;
4629
0
        int l;
4630
0
        const char *p, *e;
4631
        // Ignore ?# params: eg any file.fmt?param=val, except for S3 URLs
4632
0
        e = fn + ((strncmp(fn, "s3://", 5) && strncmp(fn, "s3+http://", 10) && strncmp(fn, "s3+https://", 11)) ? strcspn(fn, "?#") : strcspn(fn, "?"));
4633
        // Find the previous slash from there.
4634
0
        p = e;
4635
0
        while (p > fn && *p != '/') p--;
4636
0
        if (*p == '/') p++;
4637
4638
        // Attempt to open local file first
4639
0
        kputsn(p, e-p, &s);
4640
0
        if (access(s.s, R_OK) == 0)
4641
0
        {
4642
0
            free(s.s);
4643
0
            *local_fn = p;
4644
0
            *local_len = e-p;
4645
0
            return 0;
4646
0
        }
4647
4648
        // Attempt to open remote file. Stay quiet on failure, it is OK to fail when trying first .csi then .bai or .tbi index.
4649
0
        if ((remote_hfp = hopen(fn, "r")) == 0) {
4650
0
            hts_log_info("Failed to open index file '%s'", fn);
4651
0
            free(s.s);
4652
0
            return -1;
4653
0
        }
4654
0
        if (hts_detect_format2(remote_hfp, fn, &fmt)) {
4655
0
            hts_log_error("Failed to detect format of index file '%s'", fn);
4656
0
            goto fail;
4657
0
        }
4658
0
        if (fmt.category != index_file || (fmt.format != bai &&  fmt.format != csi && fmt.format != tbi
4659
0
                && fmt.format != crai && fmt.format != fai_format)) {
4660
0
            hts_log_error("Format of index file '%s' is not supported", fn);
4661
0
            goto fail;
4662
0
        }
4663
4664
0
        if (download) {
4665
0
            if ((local_fp = hts_open_tmpfile(s.s, "wx", &tmps)) == NULL) {
4666
0
                hts_log_error("Failed to create file %s in the working directory", p);
4667
0
                goto fail;
4668
0
            }
4669
0
            hts_log_info("Downloading file %s to local directory", fn);
4670
0
            uint8_t *buf = (uint8_t*)calloc(buf_size, 1);
4671
0
            if (!buf) {
4672
0
                hts_log_error("%s", strerror(errno));
4673
0
                goto fail;
4674
0
            }
4675
0
            while ((l = hread(remote_hfp, buf, buf_size)) > 0) {
4676
0
                if (hwrite(local_fp, buf, l) != l) {
4677
0
                    hts_log_error("Failed to write data to %s : %s",
4678
0
                            fn, strerror(errno));
4679
0
                    free(buf);
4680
0
                    goto fail;
4681
0
                }
4682
0
            }
4683
0
            free(buf);
4684
0
            if (l < 0) {
4685
0
                hts_log_error("Error reading \"%s\"", fn);
4686
0
                goto fail;
4687
0
            }
4688
0
            if (hclose(local_fp) < 0) {
4689
0
                hts_log_error("Error closing %s : %s", fn, strerror(errno));
4690
0
                local_fp = NULL;
4691
0
                goto fail;
4692
0
            }
4693
0
            local_fp = NULL;
4694
0
            if (rename(tmps.s, s.s) < 0) {
4695
0
                hts_log_error("Error renaming %s : %s", tmps.s, strerror(errno));
4696
0
                goto fail;
4697
0
            }
4698
0
            ks_clear(&tmps);
4699
4700
0
            *local_fn = p;
4701
0
            *local_len = e-p;
4702
0
        } else {
4703
0
            *local_fn = fn;
4704
0
            *local_len = e-fn;
4705
0
        }
4706
4707
0
        if (hclose(remote_hfp) != 0) {
4708
0
            hts_log_error("Failed to close remote file %s", fn);
4709
0
        }
4710
4711
0
        free(tmps.s);
4712
0
        free(s.s);
4713
0
        return 0;
4714
0
    } else {
4715
0
        hFILE *local_hfp;
4716
0
        if ((local_hfp = hopen(fn, "r")) == 0) return -1;
4717
0
        hclose_abruptly(local_hfp);
4718
0
        *local_fn = fn;
4719
0
        *local_len = strlen(fn);
4720
0
        return 0;
4721
0
    }
4722
4723
0
 fail:
4724
0
    save_errno = errno;
4725
0
    if (remote_hfp) hclose_abruptly(remote_hfp);
4726
0
    if (local_fp) hclose_abruptly(local_fp);
4727
0
    if (tmps.l > 0) unlink(tmps.s);
4728
0
    free(tmps.s);
4729
0
    free(s.s);
4730
0
    errno = save_errno;
4731
0
    return -2;
4732
0
}
4733
4734
/*
4735
 * Check the existence of a local index file using part of the alignment
4736
 * file name.
4737
 *
4738
 * For a filename fn of fn.fmt (eg fn.bam or fn.cram) the order of checks is
4739
 * fn.fmt.csi,  fn.csi,
4740
 * fn.fmt.bai,  fn.bai  - if fmt is HTS_FMT_BAI
4741
 * fn.fmt.tbi,  fn.tbi  - if fmt is HTS_FMT_TBI
4742
 * fn.fmt.crai, fn.crai - if fmt is HTS_FMT_CRAI
4743
 * fn.fmt.fai           - if fmt is HTS_FMT_FAI
4744
 *   also .gzi if fmt is ".gz"
4745
 *
4746
 * @param fn    - pointer to the file name
4747
 * @param fmt   - one of the HTS_FMT index formats
4748
 * @param fnidx - pointer to the index file name placeholder
4749
 * @return        1 for success, 0 for failure
4750
 */
4751
1.55k
int hts_idx_check_local(const char *fn, int fmt, char **fnidx) {
4752
1.55k
    int i, l_fn, l_ext;
4753
1.55k
    const char *fn_tmp = NULL;
4754
1.55k
    char *fnidx_tmp;
4755
1.55k
    const char *csi_ext = ".csi";
4756
1.55k
    const char *bai_ext = ".bai";
4757
1.55k
    const char *tbi_ext = ".tbi";
4758
1.55k
    const char *crai_ext = ".crai";
4759
1.55k
    const char *fai_ext = ".fai";
4760
1.55k
    const char *gzi_ext = ".gzi";
4761
4762
1.55k
    if (!fn)
4763
0
        return 0;
4764
4765
1.55k
    if (hisremote(fn)) {
4766
0
        for (i = strlen(fn) - 1; i >= 0; --i)
4767
0
            if (fn[i] == '/') {
4768
0
                fn_tmp = (char *)&fn[i+1];
4769
0
                break;
4770
0
            }
4771
1.55k
    } else {
4772
        // Borrowed from hopen_fd_fileuri()
4773
1.55k
        if (strncmp(fn, "file://localhost/", 17) == 0) fn_tmp = fn + 16;
4774
1.55k
        else if (strncmp(fn, "file:///", 8) == 0) fn_tmp = fn + 7;
4775
1.55k
        else fn_tmp = fn;
4776
#if defined(_WIN32) || defined(__MSYS__)
4777
        // For cases like C:/foo
4778
        if (fn_tmp[0] == '/' && fn_tmp[1] && fn_tmp[2] == ':' && fn_tmp[3] == '/')
4779
            fn_tmp++;
4780
#endif
4781
1.55k
    }
4782
4783
1.55k
    if (!fn_tmp) return 0;
4784
1.55k
    hts_log_info("Using alignment file '%s'", fn_tmp);
4785
1.55k
    l_fn = strlen(fn_tmp); l_ext = 5;
4786
1.55k
    fnidx_tmp = (char*)calloc(l_fn + l_ext + 1, 1);
4787
1.55k
    if (!fnidx_tmp) return 0;
4788
4789
1.55k
    struct stat sbuf;
4790
4791
    // Try alignment.bam.csi first
4792
1.55k
    strcpy(fnidx_tmp, fn_tmp); strcpy(fnidx_tmp + l_fn, csi_ext);
4793
1.55k
    if(stat(fnidx_tmp, &sbuf) == 0) {
4794
0
        *fnidx = fnidx_tmp;
4795
0
        return 1;
4796
1.55k
    } else { // Then try alignment.csi
4797
6.20k
        for (i = l_fn - 1; i > 0; --i)
4798
4.65k
            if (fnidx_tmp[i] == '.') {
4799
0
                strcpy(fnidx_tmp + i, csi_ext);
4800
0
                if(stat(fnidx_tmp, &sbuf) == 0) {
4801
0
                    *fnidx = fnidx_tmp;
4802
0
                    return 1;
4803
0
                }
4804
0
                break;
4805
0
            }
4806
1.55k
    }
4807
1.55k
    if (fmt == HTS_FMT_BAI) {
4808
        // Next, try alignment.bam.bai
4809
0
        strcpy(fnidx_tmp, fn_tmp); strcpy(fnidx_tmp + l_fn, bai_ext);
4810
0
        if(stat(fnidx_tmp, &sbuf) == 0) {
4811
0
            *fnidx = fnidx_tmp;
4812
0
            return 1;
4813
0
        } else { // And finally, try alignment.bai
4814
0
            for (i = l_fn - 1; i > 0; --i)
4815
0
                if (fnidx_tmp[i] == '.') {
4816
0
                    strcpy(fnidx_tmp + i, bai_ext);
4817
0
                    if(stat(fnidx_tmp, &sbuf) == 0) {
4818
0
                        *fnidx = fnidx_tmp;
4819
0
                        return 1;
4820
0
                    }
4821
0
                    break;
4822
0
                }
4823
0
        }
4824
1.55k
    } else if (fmt == HTS_FMT_TBI) { // Or .tbi
4825
1.55k
        strcpy(fnidx_tmp, fn_tmp); strcpy(fnidx_tmp + l_fn, tbi_ext);
4826
1.55k
        if(stat(fnidx_tmp, &sbuf) == 0) {
4827
0
            *fnidx = fnidx_tmp;
4828
0
            return 1;
4829
1.55k
        } else {
4830
6.20k
            for (i = l_fn - 1; i > 0; --i)
4831
4.65k
                if (fnidx_tmp[i] == '.') {
4832
0
                    strcpy(fnidx_tmp + i, tbi_ext);
4833
0
                    if(stat(fnidx_tmp, &sbuf) == 0) {
4834
0
                        *fnidx = fnidx_tmp;
4835
0
                        return 1;
4836
0
                    }
4837
0
                    break;
4838
0
                }
4839
1.55k
        }
4840
1.55k
    } else if (fmt == HTS_FMT_CRAI) { // Or .crai
4841
0
        strcpy(fnidx_tmp, fn_tmp); strcpy(fnidx_tmp + l_fn, crai_ext);
4842
0
        if(stat(fnidx_tmp, &sbuf) == 0) {
4843
0
            *fnidx = fnidx_tmp;
4844
0
            return 1;
4845
0
        } else {
4846
0
            for (i = l_fn - 1; i > 0; --i)
4847
0
                if (fnidx_tmp[i] == '.') {
4848
0
                    strcpy(fnidx_tmp + i, crai_ext);
4849
0
                    if(stat(fnidx_tmp, &sbuf) == 0) {
4850
0
                        *fnidx = fnidx_tmp;
4851
0
                        return 1;
4852
0
                    }
4853
0
                    break;
4854
0
                }
4855
0
        }
4856
0
    } else if (fmt == HTS_FMT_FAI) { // Or .fai
4857
        // Check .gzi if we have a .gz file
4858
0
        strcpy(fnidx_tmp, fn_tmp);
4859
0
        int gzi_ok = 1;
4860
0
        if ((l_fn > 3 && strcmp(fn_tmp+l_fn-3, ".gz") == 0) ||
4861
0
            (l_fn > 5 && strcmp(fn_tmp+l_fn-5, ".bgzf") == 0)) {
4862
0
            strcpy(fnidx_tmp + l_fn, gzi_ext);
4863
0
            gzi_ok = stat(fnidx_tmp, &sbuf)==0;
4864
0
        }
4865
4866
        // Now check for .fai.  Occurs second as we're returning this
4867
        // in *fnidx irrespective of whether we did gzi check.
4868
0
        strcpy(fnidx_tmp + l_fn, fai_ext);
4869
0
        *fnidx = fnidx_tmp;
4870
0
        if (stat(fnidx_tmp, &sbuf) == 0)
4871
0
            return gzi_ok;
4872
0
        else
4873
0
            return 0;
4874
0
    }
4875
4876
1.55k
    free(fnidx_tmp);
4877
1.55k
    return 0;
4878
1.55k
}
4879
4880
0
static char *idx_filename(const char *fn, const char *ext, int download) {
4881
0
    int ret, local_len;
4882
0
    char *fnidx;
4883
0
    const char *local_fn = NULL;
4884
0
    kstring_t buffer = KS_INITIALIZE;
4885
4886
    // First try : append `ext` to `fn`
4887
0
    if (!(fnidx = haddextension(&buffer, fn, 0, ext))) {
4888
0
        free(buffer.s);
4889
0
        return NULL;
4890
0
    }
4891
0
    if ((ret = idx_test_and_fetch(fnidx, &local_fn, &local_len, download)) == -1) {
4892
        // Second try : replace suffix of `fn` with `ext`
4893
0
        if (!(fnidx = haddextension(&buffer, fn, 1, ext))) {
4894
0
            free(buffer.s);
4895
0
            return NULL;
4896
0
        }
4897
0
        ret = idx_test_and_fetch(fnidx, &local_fn, &local_len, download);
4898
0
    }
4899
4900
0
    if (ret < 0) {
4901
0
        free(buffer.s);
4902
0
        return NULL;
4903
0
    }
4904
4905
0
    memmove(fnidx, local_fn, local_len);
4906
0
    fnidx[local_len] = 0;
4907
0
    return fnidx;
4908
0
}
4909
4910
char *hts_idx_getfn(const char *fn, const char *ext)
4911
0
{
4912
0
    return idx_filename(fn, ext, HTS_IDX_SAVE_REMOTE);
4913
0
}
4914
4915
char *hts_idx_locatefn(const char *fn, const char *ext)
4916
0
{
4917
0
    return idx_filename(fn, ext, 0);
4918
0
}
4919
4920
static hts_idx_t *idx_find_and_load(const char *fn, int fmt, int flags)
4921
1.55k
{
4922
1.55k
    char *fnidx = strstr(fn, HTS_IDX_DELIM);
4923
1.55k
    hts_idx_t *idx;
4924
4925
1.55k
    if ( fnidx ) {
4926
0
        char *fn2 = strdup(fn);
4927
0
        if (!fn2) {
4928
0
            hts_log_error("%s", strerror(errno));
4929
0
            return NULL;
4930
0
        }
4931
0
        fn2[fnidx - fn] = '\0';
4932
0
        fnidx += strlen(HTS_IDX_DELIM);
4933
0
        idx = hts_idx_load3(fn2, fnidx, fmt, flags);
4934
0
        free(fn2);
4935
0
        return idx;
4936
0
    }
4937
4938
1.55k
    if (hts_idx_check_local(fn, fmt, &fnidx) == 0 && hisremote(fn)) {
4939
0
        if (flags & HTS_IDX_SAVE_REMOTE) {
4940
0
            fnidx = idx_filename(fn, ".csi", HTS_IDX_SAVE_REMOTE);
4941
0
            if (!fnidx) {
4942
0
                switch (fmt) {
4943
0
                case HTS_FMT_BAI: fnidx = idx_filename(fn, ".bai", HTS_IDX_SAVE_REMOTE); break;
4944
0
                case HTS_FMT_TBI: fnidx = idx_filename(fn, ".tbi", HTS_IDX_SAVE_REMOTE); break;
4945
0
                default: break;
4946
0
                }
4947
0
            }
4948
0
        } else {
4949
0
            fnidx = idx_filename(fn, ".csi", 0);
4950
0
            if (!fnidx) {
4951
0
                switch (fmt) {
4952
0
                case HTS_FMT_BAI: fnidx = idx_filename(fn, ".bai", 0); break;
4953
0
                case HTS_FMT_TBI: fnidx = idx_filename(fn, ".tbi", 0); break;
4954
0
                default: break;
4955
0
                }
4956
0
            }
4957
0
        }
4958
0
    }
4959
1.55k
    if (!fnidx) {
4960
1.55k
        if (!(flags & HTS_IDX_SILENT_FAIL))
4961
0
            hts_log_error("Could not retrieve index file for '%s'", fn);
4962
1.55k
        return 0;
4963
1.55k
    }
4964
4965
0
    if (flags & HTS_IDX_SAVE_REMOTE)
4966
0
        idx = hts_idx_load3(fn, fnidx, fmt, flags);
4967
0
    else
4968
0
        idx = idx_read(fnidx);
4969
0
    free(fnidx);
4970
0
    return idx;
4971
1.55k
}
4972
4973
0
hts_idx_t *hts_idx_load(const char *fn, int fmt) {
4974
0
    return idx_find_and_load(fn, fmt, 1);
4975
0
}
4976
4977
hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx)
4978
0
{
4979
0
    return hts_idx_load3(fn, fnidx, 0, 0);
4980
0
}
4981
4982
hts_idx_t *hts_idx_load3(const char *fn, const char *fnidx, int fmt, int flags)
4983
1.55k
{
4984
1.55k
    const char *local_fn = NULL;
4985
1.55k
    char *local_fnidx = NULL;
4986
1.55k
    int local_len;
4987
1.55k
    if (!fnidx)
4988
1.55k
        return idx_find_and_load(fn, fmt, flags);
4989
4990
    // Check that the index file is up to date, the main file might have changed
4991
0
    struct stat stat_idx,stat_main;
4992
0
    int remote_fn = hisremote(fn), remote_fnidx = hisremote(fnidx);
4993
0
    if ( !remote_fn && !remote_fnidx
4994
0
         && !stat(fn, &stat_main) && !stat(fnidx, &stat_idx) )
4995
0
    {
4996
0
        if ( stat_idx.st_mtime < stat_main.st_mtime )
4997
0
            hts_log_warning("The index file is older than the data file: %s", fnidx);
4998
0
    }
4999
5000
0
    if (remote_fnidx && (flags & HTS_IDX_SAVE_REMOTE))
5001
0
    {
5002
0
        int ret = idx_test_and_fetch(fnidx, &local_fn, &local_len, 1);
5003
0
        if (ret == 0) {
5004
0
            local_fnidx = strdup(local_fn);
5005
0
            if (local_fnidx) {
5006
0
                local_fnidx[local_len] = '\0';
5007
0
                fnidx = local_fnidx;
5008
0
            }
5009
0
        }
5010
0
    }
5011
5012
0
    hts_idx_t *idx = idx_read(fnidx);
5013
0
    if (!idx && !(flags & HTS_IDX_SILENT_FAIL))
5014
0
        hts_log_error("Could not load local index file '%s'%s%s", fnidx,
5015
0
                      errno ? " : " : "", errno ? strerror(errno) : "");
5016
5017
5018
0
    free(local_fnidx);
5019
5020
0
    return idx;
5021
1.55k
}
5022
5023
5024
5025
/**********************
5026
 ***     Memory     ***
5027
 **********************/
5028
5029
/* For use with hts_expand macros *only* */
5030
HTSLIB_EXPORT
5031
size_t hts_realloc_or_die(size_t n, size_t m, size_t m_sz, size_t size,
5032
1.63k
                          int clear, void **ptr, const char *func) {
5033
    /* If new_m and size are both below this limit, multiplying them
5034
       together can't overflow */
5035
1.63k
    const size_t safe = (size_t) 1 << (sizeof(size_t) * 4);
5036
1.63k
    void *new_ptr;
5037
1.63k
    size_t bytes, new_m;
5038
5039
1.63k
    new_m = n;
5040
1.63k
    kroundup_size_t(new_m);
5041
5042
1.63k
    bytes = size * new_m;
5043
5044
    /* Check for overflow.  Both ensure that new_m will fit in m (we make the
5045
       pessimistic assumption that m is signed), and that bytes has not
5046
       wrapped around. */
5047
1.63k
    if (new_m > (((size_t) 1 << (m_sz * 8 - 1)) - 1)
5048
1.63k
        || ((size > safe || new_m > safe)
5049
0
            && bytes / new_m != size)) {
5050
0
        errno = ENOMEM;
5051
0
        goto die;
5052
0
    }
5053
5054
1.63k
    new_ptr = realloc(*ptr, bytes);
5055
1.63k
    if (new_ptr == NULL) goto die;
5056
5057
1.63k
    if (clear) {
5058
0
        if (new_m > m) {
5059
0
            memset((char *) new_ptr + m * size, 0, (new_m - m) * size);
5060
0
        }
5061
0
    }
5062
5063
1.63k
    *ptr = new_ptr;
5064
5065
1.63k
    return new_m;
5066
5067
0
 die:
5068
0
    hts_log_error("%s", strerror(errno));
5069
0
    exit(1);
5070
1.63k
}
5071
5072
/*
5073
 * Companion to hts_resize() macro that does the actual allocation.
5074
 *
5075
 * Somewhat complicated as hts_resize() needs to write the new allocated
5076
 * size back into *size_in_out, and the value pointed to may either be
5077
 * int32_t, uint32_t or size_t depending on which array is being resized.
5078
 * This is solved by making `size_in_out` a void pointer, getting the macro
5079
 * to pass in the size of the item pointed to (in `size_sz`) and then using
5080
 * an appropriate cast (based on the value of size_sz).  The function
5081
 * ensures that the maximum size will be storable in a signed type of
5082
 * the given size so storing to an int32_t should work correctly.
5083
 *
5084
 * Assumes that sizeof(uint32_t) and sizeof(int32_t) is 4,
5085
 * sizeof(uint64_t) and sizeof(int64_t) is 8 and sizeof(size_t) is
5086
 * either 4 or 8.  It also assumes casting from unsigned to signed will
5087
 * work as long as the top bit isn't set.
5088
 */
5089
5090
int hts_resize_array_(size_t item_size, size_t num, size_t size_sz,
5091
                      void *size_in_out, void **ptr_in_out, int flags,
5092
9.17k
                      const char *func) {
5093
    /* If new_size and item_size are both below this limit, multiplying them
5094
       together can't overflow */
5095
9.17k
    const size_t safe = (size_t) 1 << (sizeof(size_t) * 4);
5096
9.17k
    void *new_ptr;
5097
9.17k
    size_t bytes, new_size;
5098
5099
9.17k
    new_size = num;
5100
9.17k
    kroundup_size_t(new_size);
5101
9.17k
    bytes = item_size * new_size;
5102
5103
    /* Check for overflow.  Both ensure that alloc will fit in alloc_in_out (we
5104
       make the pessimistic assumption that *alloc_in_out is signed), and that
5105
       bytes has not wrapped around. */
5106
5107
9.17k
    if ((new_size > (((size_t) 1 << (size_sz * 8 - 1)) - 1))
5108
9.17k
        || (((item_size > safe) || (new_size > safe))
5109
0
            && bytes / new_size != item_size)) {
5110
0
        hts_log(HTS_LOG_ERROR, func, "Memory allocation too large");
5111
0
        errno = ENOMEM;
5112
0
        return -1;
5113
0
    }
5114
5115
9.17k
    new_ptr = realloc(*ptr_in_out, bytes);
5116
9.17k
    if (new_ptr == NULL) {
5117
0
        int save_errno = errno;
5118
0
        hts_log(HTS_LOG_ERROR, func, "%s", strerror(errno));
5119
0
        errno = save_errno;
5120
0
        return -1;
5121
0
    }
5122
5123
9.17k
    if (flags & HTS_RESIZE_CLEAR) {
5124
9.17k
        size_t old_size;
5125
9.17k
        switch (size_sz) {
5126
9.17k
        case 4: old_size = *((uint32_t *) size_in_out); break;
5127
0
        case 8: old_size = *((uint64_t *) size_in_out); break;
5128
0
        default: abort();
5129
9.17k
        }
5130
9.17k
        if (new_size > old_size) {
5131
9.17k
            memset((char *) new_ptr + old_size * item_size, 0,
5132
9.17k
                   (new_size - old_size) * item_size);
5133
9.17k
        }
5134
9.17k
    }
5135
5136
9.17k
    switch (size_sz) {
5137
9.17k
    case 4: *((uint32_t *) size_in_out) = new_size; break;
5138
0
    case 8: *((uint64_t *) size_in_out) = new_size; break;
5139
0
    default: abort();
5140
9.17k
    }
5141
5142
9.17k
    *ptr_in_out = new_ptr;
5143
9.17k
    return 0;
5144
9.17k
}
5145
5146
void hts_lib_shutdown(void)
5147
0
{
5148
0
    hfile_shutdown(1);
5149
0
}
5150
5151
0
void hts_free(void *ptr) {
5152
0
    free(ptr);
5153
0
}
5154
5155
void hts_set_log_level(enum htsLogLevel level)
5156
0
{
5157
0
    hts_verbose = level;
5158
0
}
5159
5160
enum htsLogLevel hts_get_log_level(void)
5161
0
{
5162
0
    return hts_verbose;
5163
0
}
5164
5165
static char get_severity_tag(enum htsLogLevel severity)
5166
1.42M
{
5167
1.42M
    switch (severity) {
5168
19.7k
    case HTS_LOG_ERROR:
5169
19.7k
        return 'E';
5170
1.40M
    case HTS_LOG_WARNING:
5171
1.40M
        return 'W';
5172
0
    case HTS_LOG_INFO:
5173
0
        return 'I';
5174
0
    case HTS_LOG_DEBUG:
5175
0
        return 'D';
5176
0
    case HTS_LOG_TRACE:
5177
0
        return 'T';
5178
0
    default:
5179
0
        break;
5180
1.42M
    }
5181
5182
0
    return '*';
5183
1.42M
}
5184
5185
void hts_log(enum htsLogLevel severity, const char *context, const char *format, ...)
5186
1.83M
{
5187
1.83M
    int save_errno = errno;
5188
1.83M
    if (severity <= hts_verbose) {
5189
1.42M
        va_list argptr;
5190
5191
1.42M
        fprintf(stderr, "[%c::%s] ", get_severity_tag(severity), context);
5192
5193
1.42M
        va_start(argptr, format);
5194
1.42M
        vfprintf(stderr, format, argptr);
5195
1.42M
        va_end(argptr);
5196
5197
1.42M
        fprintf(stderr, "\n");
5198
1.42M
    }
5199
    errno = save_errno;
5200
1.83M
}