Coverage Report

Created: 2026-06-10 06:30

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/cram/cram_index.c
Line
Count
Source
1
/*
2
Copyright (c) 2013-2020, 2023-2024 Genome Research Ltd.
3
Author: James Bonfield <jkb@sanger.ac.uk>
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
   1. Redistributions of source code must retain the above copyright notice,
9
this list of conditions and the following disclaimer.
10
11
   2. Redistributions in binary form must reproduce the above copyright notice,
12
this list of conditions and the following disclaimer in the documentation
13
and/or other materials provided with the distribution.
14
15
   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16
Institute nor the names of its contributors may be used to endorse or promote
17
products derived from this software without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
/*
32
 * The index is a gzipped tab-delimited text file with one line per slice.
33
 * The columns are:
34
 * 1: reference number (0 to N-1, as per BAM ref_id)
35
 * 2: reference position of 1st read in slice (1..?)
36
 * 3: number of reads in slice
37
 * 4: offset of container start (relative to end of SAM header, so 1st
38
 *    container is offset 0).
39
 * 5: slice number within container (ie which landmark).
40
 *
41
 * In memory, we hold this in a nested containment list. Each list element is
42
 * a cram_index struct. Each element in turn can contain its own list of
43
 * cram_index structs.
44
 *
45
 * Any start..end range which is entirely contained within another (and
46
 * earlier as it is sorted) range will be held within it. This ensures that
47
 * the outer list will never have containments and we can safely do a
48
 * binary search to find the first range which overlaps any given coordinate.
49
 */
50
51
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
52
#include <config.h>
53
54
#include <stdio.h>
55
#include <errno.h>
56
#include <assert.h>
57
#include <inttypes.h>
58
#include <stdlib.h>
59
#include <string.h>
60
#include <zlib.h>
61
#include <sys/types.h>
62
#include <sys/stat.h>
63
#include <math.h>
64
65
#include "../htslib/bgzf.h"
66
#include "../htslib/hfile.h"
67
#include "../htslib/hts_alloc.h"
68
#include "../hts_internal.h"
69
#include "cram.h"
70
#include "os.h"
71
72
#if 0
73
static void dump_index_(cram_index *e, int level) {
74
    int i, n;
75
    n = printf("%*s%d / %d .. %d, ", level*4, "", e->refid, e->start, e->end);
76
    printf("%*soffset %"PRId64" %p %p\n", MAX(0,50-n), "", e->offset, e, e->e_next);
77
    for (i = 0; i < e->nslice; i++) {
78
        dump_index_(&e->e[i], level+1);
79
    }
80
}
81
82
static void dump_index(cram_fd *fd) {
83
    int i;
84
    for (i = 0; i < fd->index_sz; i++) {
85
        dump_index_(&fd->index[i], 0);
86
    }
87
}
88
#endif
89
90
// Thread a linked list through the nested containment list.
91
// This makes navigating it and finding the "next" index entry
92
// trivial.
93
0
static cram_index *link_index_(cram_index *e, cram_index *e_last) {
94
0
    int i;
95
0
    if (e_last)
96
0
        e_last->e_next = e;
97
98
    // We don't want to link in the top-level cram_index with
99
    // offset=0 and start/end = INT_MIN/INT_MAX.
100
0
    if (e->offset)
101
0
        e_last = e;
102
103
0
    for (i = 0; i < e->nslice; i++)
104
0
        e_last = link_index_(&e->e[i], e_last);
105
106
0
    return e_last;
107
0
}
108
109
0
static void link_index(cram_fd *fd) {
110
0
    int i;
111
0
    cram_index *e_last = NULL;
112
113
0
    for (i = 0; i < fd->index_sz; i++) {
114
0
        e_last = link_index_(&fd->index[i], e_last);
115
0
    }
116
117
0
    if (e_last)
118
0
        e_last->e_next = NULL;
119
0
}
120
121
0
static int kget_int32(kstring_t *k, size_t *pos, int32_t *val_p) {
122
0
    int sign = 1;
123
0
    int32_t val = 0;
124
0
    size_t p = *pos;
125
126
0
    while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t'))
127
0
        p++;
128
129
0
    if (p < k->l && k->s[p] == '-')
130
0
        sign = -1, p++;
131
132
0
    if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9'))
133
0
        return -1;
134
135
0
    while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9') {
136
0
        int digit = k->s[p++]-'0';
137
0
        val = val*10 + digit;
138
0
    }
139
140
0
    *pos = p;
141
0
    *val_p = sign*val;
142
143
0
    return 0;
144
0
}
145
146
0
static int kget_int64(kstring_t *k, size_t *pos, int64_t *val_p) {
147
0
    int sign = 1;
148
0
    int64_t val = 0;
149
0
    size_t p = *pos;
150
151
0
    while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t'))
152
0
        p++;
153
154
0
    if (p < k->l && k->s[p] == '-')
155
0
        sign = -1, p++;
156
157
0
    if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9'))
158
0
        return -1;
159
160
0
    while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9') {
161
0
        int digit = k->s[p++]-'0';
162
0
        val = val*10 + digit;
163
0
    }
164
165
0
    *pos = p;
166
0
    *val_p = sign*val;
167
168
0
    return 0;
169
0
}
170
171
/*
172
 * Loads a CRAM .crai index into memory.
173
 *
174
 * Returns 0 for success
175
 *        -1 for failure
176
 */
177
0
int cram_index_load(cram_fd *fd, const char *fn, const char *fn_idx) {
178
179
0
    char *tfn_idx = NULL;
180
0
    char buf[65536];
181
0
    ssize_t len;
182
0
    kstring_t kstr = {0};
183
0
    hFILE *fp;
184
0
    cram_index *idx;
185
0
    cram_index **idx_stack = NULL, *ep, e;
186
0
    int idx_stack_alloc = 0, idx_stack_ptr = 0;
187
0
    size_t pos = 0;
188
189
    /* Check if already loaded */
190
0
    if (fd->index)
191
0
        return 0;
192
193
0
    fd->index = calloc((fd->index_sz = 1), sizeof(*fd->index));
194
0
    if (!fd->index)
195
0
        return -1;
196
197
0
    idx = &fd->index[0];
198
0
    idx->refid = -1;
199
0
    idx->start = INT_MIN;
200
0
    idx->end   = INT_MAX;
201
202
0
    idx_stack = calloc(++idx_stack_alloc, sizeof(*idx_stack));
203
0
    if (!idx_stack)
204
0
        goto fail;
205
206
0
    idx_stack[idx_stack_ptr] = idx;
207
208
    // Support pathX.cram##idx##pathY.crai
209
0
    const char *fn_delim = strstr(fn, HTS_IDX_DELIM);
210
0
    if (fn_delim && !fn_idx)
211
0
        fn_idx = fn_delim + strlen(HTS_IDX_DELIM);
212
213
0
    if (!fn_idx) {
214
0
        if (hts_idx_check_local(fn, HTS_FMT_CRAI, &tfn_idx) == 0 && hisremote(fn))
215
0
            tfn_idx = hts_idx_getfn(fn, ".crai");
216
217
0
        if (!tfn_idx) {
218
0
            hts_log_error("Could not retrieve index file for '%s'", fn);
219
0
            goto fail;
220
0
        }
221
0
        fn_idx = tfn_idx;
222
0
    }
223
224
0
    if (!(fp = hopen(fn_idx, "r"))) {
225
0
        hts_log_error("Could not open index file '%s'", fn_idx);
226
0
        goto fail;
227
0
    }
228
229
    // Load the file into memory
230
0
    while ((len = hread(fp, buf, sizeof(buf))) > 0) {
231
0
        if (kputsn(buf, len, &kstr) < 0)
232
0
            goto fail;
233
0
    }
234
235
0
    if (len < 0 || kstr.l < 2)
236
0
        goto fail;
237
238
0
    if (hclose(fp) < 0)
239
0
        goto fail;
240
241
    // Uncompress if required
242
0
    if (kstr.s[0] == 31 && (uc)kstr.s[1] == 139) {
243
0
        size_t l = 0;
244
0
        char *s = zlib_mem_inflate(kstr.s, kstr.l, &l);
245
0
        if (!s)
246
0
            goto fail;
247
248
0
        free(kstr.s);
249
0
        kstr.s = s;
250
0
        kstr.l = l;
251
0
        kstr.m = l; // conservative estimate of the size allocated
252
0
        if (kputsn("", 0, &kstr) < 0) // ensure kstr.s is NUL-terminated
253
0
            goto fail;
254
0
    }
255
256
257
    // refid indexes fd->index, so bound it to the header's reference count.
258
0
    int nref = sam_hdr_nref(fd->header);
259
260
    // Parse it line at a time
261
0
    while (pos < kstr.l) {
262
        /* 1.1 layout */
263
0
        if (kget_int32(&kstr, &pos, &e.refid) == -1)
264
0
            goto fail;
265
266
0
        if (kget_int32(&kstr, &pos, &e.start) == -1)
267
0
            goto fail;
268
269
0
        if (kget_int32(&kstr, &pos, &e.end) == -1)
270
0
            goto fail;
271
272
0
        if (kget_int64(&kstr, &pos, &e.offset) == -1)
273
0
            goto fail;
274
275
0
        if (kget_int32(&kstr, &pos, &e.slice) == -1)
276
0
            goto fail;
277
278
0
        if (kget_int32(&kstr, &pos, &e.len) == -1)
279
0
            goto fail;
280
281
0
        e.end += e.start-1;
282
        //printf("%d/%d..%d-offset=%" PRIu64 ",len=%d,slice=%d\n", e.refid, e.start, e.end, e.offset, e.len, e.slice);
283
284
0
        if (e.refid < -1 || e.refid >= nref) {
285
0
            hts_log_error("Malformed index file, refid %d", e.refid);
286
0
            goto fail;
287
0
        }
288
289
0
        if (e.refid != idx->refid) {
290
0
            if (fd->index_sz < e.refid+2) {
291
0
                cram_index *new_idx;
292
0
                int new_sz = e.refid+2;
293
0
                size_t index_end = fd->index_sz * sizeof(*fd->index);
294
0
                new_idx = hts_realloc_p(fd->index, sizeof(*fd->index),
295
0
                                        new_sz);
296
0
                if (!new_idx)
297
0
                    goto fail;
298
299
0
                fd->index = new_idx;
300
0
                fd->index_sz = new_sz;
301
0
                memset(((char *)fd->index) + index_end, 0,
302
0
                       fd->index_sz * sizeof(*fd->index) - index_end);
303
0
            }
304
0
            idx = &fd->index[e.refid+1];
305
0
            idx->refid = e.refid;
306
0
            idx->start = INT_MIN;
307
0
            idx->end   = INT_MAX;
308
0
            idx->nslice = idx->nalloc = 0;
309
0
            idx->e = NULL;
310
0
            idx_stack[(idx_stack_ptr = 0)] = idx;
311
0
        }
312
313
0
        while (!(e.start >= idx->start && e.end <= idx->end) ||
314
0
               (idx->start == 0 && idx->refid == -1)) {
315
0
            idx = idx_stack[--idx_stack_ptr];
316
0
        }
317
318
        // Now contains, so append
319
0
        if (idx->nslice+1 >= idx->nalloc) {
320
0
            cram_index *new_e;
321
0
            idx->nalloc = idx->nalloc ? idx->nalloc*2 : 16;
322
0
            new_e = hts_realloc_p(idx->e, sizeof(*idx->e), idx->nalloc);
323
0
            if (!new_e)
324
0
                goto fail;
325
326
0
            idx->e = new_e;
327
0
        }
328
329
0
        e.nalloc = e.nslice = 0; e.e = NULL;
330
0
        *(ep = &idx->e[idx->nslice++]) = e;
331
0
        idx = ep;
332
333
0
        if (++idx_stack_ptr >= idx_stack_alloc) {
334
0
            cram_index **new_stack;
335
0
            idx_stack_alloc *= 2;
336
0
            new_stack = hts_realloc_p(idx_stack, sizeof(*idx_stack), idx_stack_alloc);
337
0
            if (!new_stack)
338
0
                goto fail;
339
0
            idx_stack = new_stack;
340
0
        }
341
0
        idx_stack[idx_stack_ptr] = idx;
342
343
0
        while (pos < kstr.l && kstr.s[pos] != '\n')
344
0
            pos++;
345
0
        pos++;
346
0
    }
347
348
0
    free(idx_stack);
349
0
    free(kstr.s);
350
0
    free(tfn_idx);
351
352
    // Convert NCList to linear linked list
353
0
    link_index(fd);
354
355
    //dump_index(fd);
356
357
0
    return 0;
358
359
0
 fail:
360
0
    free(kstr.s);
361
0
    free(idx_stack);
362
0
    free(tfn_idx);
363
0
    cram_index_free(fd); // Also sets fd->index = NULL
364
0
    return -1;
365
0
}
366
367
0
static void cram_index_free_recurse(cram_index *e) {
368
0
    if (e->e) {
369
0
        int i;
370
0
        for (i = 0; i < e->nslice; i++) {
371
0
            cram_index_free_recurse(&e->e[i]);
372
0
        }
373
0
        free(e->e);
374
0
    }
375
0
}
376
377
0
void cram_index_free(cram_fd *fd) {
378
0
    int i;
379
380
0
    if (!fd->index)
381
0
        return;
382
383
0
    for (i = 0; i < fd->index_sz; i++) {
384
0
        cram_index_free_recurse(&fd->index[i]);
385
0
    }
386
0
    free(fd->index);
387
388
0
    fd->index = NULL;
389
0
}
390
391
/*
392
 * Searches the index for the first slice overlapping a reference ID
393
 * and position, or one immediately preceding it if none is found in
394
 * the index to overlap this position. (Our index may have missing
395
 * entries, but we require at least one per reference.)
396
 *
397
 * If the index finds multiple slices overlapping this position we
398
 * return the first one only. Subsequent calls should specify
399
 * "from" as the last slice we checked to find the next one. Otherwise
400
 * set "from" to be NULL to find the first one.
401
 *
402
 * Refid can also be any of the special HTS_IDX_ values.
403
 * For backwards compatibility, refid -1 is equivalent to HTS_IDX_NOCOOR.
404
 *
405
 * Returns the cram_index pointer on success
406
 *         NULL on failure
407
 */
408
cram_index *cram_index_query(cram_fd *fd, int refid, hts_pos_t pos,
409
0
                             cram_index *from) {
410
0
    int i, j, k;
411
0
    cram_index *e;
412
413
0
    if (from) {
414
        // Continue from a previous search.
415
        // We switch to just scanning the linked list, as the nested
416
        // lists are typically short.
417
0
        if (refid == HTS_IDX_NOCOOR)
418
0
            refid = -1;
419
420
0
        e = from->e_next;
421
0
        if (e && e->refid == refid && e->start <= pos)
422
0
            return e;
423
0
        else
424
0
            return NULL;
425
0
    }
426
427
0
    switch(refid) {
428
0
    case HTS_IDX_NONE:
429
0
    case HTS_IDX_REST:
430
        // fail, or already there, dealt with elsewhere.
431
0
        return NULL;
432
433
0
    case -1:
434
0
    case HTS_IDX_NOCOOR:
435
0
        refid = -1;
436
0
        pos = 0;
437
0
        break;
438
439
0
    case HTS_IDX_START: {
440
0
        int64_t min_idx = INT64_MAX;
441
0
        for (i = 0, j = -1; i < fd->index_sz; i++) {
442
0
            if (fd->index[i].e && fd->index[i].e[0].offset < min_idx) {
443
0
                min_idx = fd->index[i].e[0].offset;
444
0
                j = i;
445
0
            }
446
0
        }
447
0
        if (j < 0)
448
0
            return NULL;
449
0
        return fd->index[j].e;
450
0
    }
451
452
0
    default:
453
0
        if (refid < HTS_IDX_NONE || refid+1 >= fd->index_sz)
454
0
            return NULL;
455
0
    }
456
457
0
    from = &fd->index[refid+1];
458
459
    // Ref with nothing aligned against it.
460
0
    if (!from->e)
461
0
        return NULL;
462
463
    // This sequence is covered by the index, so binary search to find
464
    // the optimal starting block.
465
0
    i = 0, j = fd->index[refid+1].nslice-1;
466
0
    for (k = j/2; k != i; k = (j-i)/2 + i) {
467
0
        if (from->e[k].refid > refid) {
468
0
            j = k;
469
0
            continue;
470
0
        }
471
472
0
        if (from->e[k].refid < refid) {
473
0
            i = k;
474
0
            continue;
475
0
        }
476
477
0
        if (from->e[k].start >= pos) {
478
0
            j = k;
479
0
            continue;
480
0
        }
481
482
0
        if (from->e[k].start < pos) {
483
0
            i = k;
484
0
            continue;
485
0
        }
486
0
    }
487
    // i==j or i==j-1. Check if j is better.
488
0
    if (j >= 0 && from->e[j].start < pos && from->e[j].refid == refid)
489
0
        i = j;
490
491
    /* The above found *a* bin overlapping, but not necessarily the first */
492
0
    while (i > 0 && from->e[i-1].end >= pos)
493
0
        i--;
494
495
    /* We may be one bin before the optimum, so check */
496
0
    while (i+1 < from->nslice &&
497
0
           (from->e[i].refid < refid ||
498
0
            from->e[i].end < pos))
499
0
        i++;
500
501
0
    e = &from->e[i];
502
503
0
    return e;
504
0
}
505
506
// Return the index entry for last slice on a specific reference.
507
0
cram_index *cram_index_last(cram_fd *fd, int refid, cram_index *from) {
508
0
    int slice;
509
510
0
    if (refid+1 < 0 || refid+1 >= fd->index_sz)
511
0
        return NULL;
512
513
0
    if (!from)
514
0
        from = &fd->index[refid+1];
515
516
    // Ref with nothing aligned against it.
517
0
    if (!from->e)
518
0
        return NULL;
519
520
0
    slice = fd->index[refid+1].nslice - 1;
521
522
    // e is the last entry in the nested containment list, but it may
523
    // contain further slices within it.
524
0
    cram_index *e = &from->e[slice];
525
0
    while (e->e_next)
526
0
        e = e->e_next;
527
528
0
    return e;
529
0
}
530
531
/*
532
 * Find the last container overlapping pos 'end', and the file offset of
533
 * its end (equivalent to the start offset of the container following it).
534
 */
535
0
cram_index *cram_index_query_last(cram_fd *fd, int refid, hts_pos_t end) {
536
0
    cram_index *e = NULL, *prev_e;
537
0
    do {
538
0
        prev_e = e;
539
0
        e = cram_index_query(fd, refid, end, prev_e);
540
0
    } while (e);
541
542
0
    if (!prev_e)
543
0
        return NULL;
544
0
    e = prev_e;
545
546
    // Note: offset of e and e->e_next may be the same if we're using a
547
    // multi-ref container where a single container generates multiple
548
    // index entries.
549
    //
550
    // We need to keep iterating until offset differs in order to find
551
    // the genuine file offset for the end of container.
552
0
    do {
553
0
        prev_e = e;
554
0
        e = e->e_next;
555
0
    } while (e && e->offset == prev_e->offset);
556
557
0
    return prev_e;
558
0
}
559
560
/*
561
 * Skips to a container overlapping the start coordinate listed in
562
 * cram_range.
563
 *
564
 * In theory we call cram_index_query multiple times, once per slice
565
 * overlapping the range. However slices may be absent from the index
566
 * which makes this problematic. Instead we find the left-most slice
567
 * and then read from then on, skipping decoding of slices and/or
568
 * whole containers when they don't overlap the specified cram_range.
569
 *
570
 * This function also updates the cram_fd range field.
571
 *
572
 * Returns 0 on success
573
 *        -1 on general failure
574
 *        -2 on no-data (empty chromosome)
575
 */
576
0
int cram_seek_to_refpos(cram_fd *fd, cram_range *r) {
577
0
    int ret = 0;
578
0
    cram_index *e;
579
580
0
    if (r->refid == HTS_IDX_NONE) {
581
0
        ret = -2; goto err;
582
0
    }
583
584
    // Ideally use an index, so see if we have one.
585
0
    if ((e = cram_index_query(fd, r->refid, r->start, NULL))) {
586
0
        if (0 != cram_seek(fd, e->offset, SEEK_SET)) {
587
0
            ret = -1; goto err;
588
0
        }
589
0
    } else {
590
        // Absent from index, but this most likely means it simply has no data.
591
0
        ret = -2; goto err;
592
0
    }
593
594
0
    pthread_mutex_lock(&fd->range_lock);
595
0
    fd->range = *r;
596
0
    if (r->refid == HTS_IDX_NOCOOR) {
597
0
        fd->range.refid = -1;
598
0
        fd->range.start = 0;
599
0
    } else if (r->refid == HTS_IDX_START || r->refid == HTS_IDX_REST) {
600
0
        fd->range.refid = -2; // special case in cram_next_slice
601
0
    }
602
0
    pthread_mutex_unlock(&fd->range_lock);
603
604
0
    if (fd->ctr) {
605
0
        cram_free_container(fd->ctr);
606
0
        if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
607
0
            cram_free_container(fd->ctr_mt);
608
0
        fd->ctr = NULL;
609
0
        fd->ctr_mt = NULL;
610
0
        fd->ooc = 0;
611
0
        fd->eof = 0;
612
0
    }
613
614
0
    return 0;
615
616
0
 err:
617
    // It's unlikely fd->range will be accessed after EOF or error,
618
    // but this maintains identical behaviour to the previous code.
619
0
    pthread_mutex_lock(&fd->range_lock);
620
0
    fd->range = *r;
621
0
    pthread_mutex_unlock(&fd->range_lock);
622
0
    return ret;
623
0
}
624
625
626
/*
627
 * A specialised form of cram_index_build (below) that deals with slices
628
 * having multiple references in this (ref_id -2). In this scenario we
629
 * decode the slice to look at the RI data series instead.
630
 *
631
 * Returns 0 on success
632
 *        -1 on read failure
633
 *        -2 on wrong sort order
634
 *        -4 on write failure
635
 */
636
static int cram_index_build_multiref(cram_fd *fd,
637
                                     cram_container *c,
638
                                     cram_slice *s,
639
                                     BGZF *fp,
640
                                     off_t cpos,
641
                                     int32_t landmark,
642
0
                                     int sz) {
643
0
    int i, ref = -2;
644
0
    int64_t ref_start = 0, ref_end;
645
0
    char buf[1024];
646
647
0
    if (fd->mode != 'w') {
648
0
        if (0 != cram_decode_slice(fd, c, s, fd->header))
649
0
            return -1;
650
0
    }
651
652
0
    ref_end = INT_MIN;
653
654
0
    int32_t last_ref = -9;
655
0
    int32_t last_pos = -9;
656
0
    for (i = 0; i < s->hdr->num_records; i++) {
657
0
        if (s->crecs[i].ref_id == last_ref && s->crecs[i].apos < last_pos) {
658
0
            hts_log_error("CRAM file is not sorted by chromosome / position");
659
0
            return -2;
660
0
        }
661
0
        last_ref = s->crecs[i].ref_id;
662
0
        last_pos = s->crecs[i].apos;
663
664
0
        if (s->crecs[i].ref_id == ref) {
665
0
            if (ref_end < s->crecs[i].aend)
666
0
                ref_end = s->crecs[i].aend;
667
0
            continue;
668
0
        }
669
670
0
        if (ref != -2) {
671
0
            snprintf(buf, sizeof(buf),
672
0
                     "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n",
673
0
                     ref, ref_start, ref_end - ref_start + 1,
674
0
                     (int64_t)cpos, landmark, sz);
675
0
            if (bgzf_write(fp, buf, strlen(buf)) < 0)
676
0
                return -4;
677
0
        }
678
679
0
        ref = s->crecs[i].ref_id;
680
0
        ref_start = s->crecs[i].apos;
681
0
        ref_end   = s->crecs[i].aend;
682
0
    }
683
684
0
    if (ref != -2) {
685
0
        snprintf(buf, sizeof(buf),
686
0
                 "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n",
687
0
                 ref, ref_start, ref_end - ref_start + 1,
688
0
                 (int64_t)cpos, landmark, sz);
689
0
        if (bgzf_write(fp, buf, strlen(buf)) < 0)
690
0
            return -4;
691
0
    }
692
693
0
    return 0;
694
0
}
695
696
/*
697
 * Adds a single slice to the index.
698
 */
699
int cram_index_slice(cram_fd *fd,
700
                     cram_container *c,
701
                     cram_slice *s,
702
                     BGZF *fp,
703
                     off_t cpos,
704
                     off_t spos, // relative to cpos
705
0
                     off_t sz) {
706
0
    int ret;
707
0
    char buf[1024];
708
709
0
    if (sz > INT_MAX) {
710
0
        hts_log_error("CRAM slice is too big (%"PRId64" bytes)",
711
0
                      (int64_t) sz);
712
0
        return -1;
713
0
    }
714
715
0
    if (s->hdr->ref_seq_id == -2) {
716
0
        ret = cram_index_build_multiref(fd, c, s, fp, cpos, spos, sz);
717
0
    } else {
718
0
        snprintf(buf, sizeof(buf),
719
0
                 "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n",
720
0
                 s->hdr->ref_seq_id, s->hdr->ref_seq_start,
721
0
                 s->hdr->ref_seq_span, (int64_t)cpos, (int)spos, (int)sz);
722
0
        ret = (bgzf_write(fp, buf, strlen(buf)) >= 0)? 0 : -4;
723
0
    }
724
725
0
    return ret;
726
0
}
727
728
/*
729
 * Adds a single container to the index.
730
 */
731
static
732
int cram_index_container(cram_fd *fd,
733
                         cram_container *c,
734
                         BGZF *fp,
735
0
                         off_t cpos) {
736
0
    int j;
737
0
    off_t spos;
738
739
    // 2.0 format
740
0
    for (j = 0; j < c->num_landmarks; j++) {
741
0
        cram_slice *s;
742
0
        off_t sz;
743
0
        int ret;
744
745
0
        spos = htell(fd->fp);
746
0
        if (spos - cpos - (off_t) c->offset != c->landmark[j]) {
747
0
            hts_log_error("CRAM slice offset %"PRId64" does not match"
748
0
                          " landmark %d in container header (%"PRId32")",
749
0
                          (int64_t) (spos - cpos - (off_t) c->offset),
750
0
                          j, c->landmark[j]);
751
0
            return -1;
752
0
        }
753
754
0
        if (!(s = cram_read_slice(fd))) {
755
0
            return -1;
756
0
        }
757
758
0
        sz = htell(fd->fp) - spos;
759
0
        ret = cram_index_slice(fd, c, s, fp, cpos, c->landmark[j], sz);
760
761
0
        cram_free_slice(s);
762
763
0
        if (ret < 0) {
764
0
            return ret;
765
0
        }
766
0
    }
767
768
0
    return 0;
769
0
}
770
771
772
/*
773
 * Builds an index file.
774
 *
775
 * fd is a newly opened cram file that we wish to index.
776
 * fn_base is the filename of the associated CRAM file.
777
 * fn_idx is the filename of the index file to be written;
778
 * if NULL, we add ".crai" to fn_base to get the index filename.
779
 *
780
 * Returns 0 on success,
781
 *         negative on failure (-1 for read failure, -4 for write failure)
782
 */
783
0
int cram_index_build(cram_fd *fd, const char *fn_base, const char *fn_idx) {
784
0
    cram_container *c;
785
0
    off_t cpos, hpos;
786
0
    BGZF *fp;
787
0
    kstring_t fn_idx_str = {0};
788
0
    int64_t last_ref = -9, last_start = -9;
789
790
    // Useful for cram_index_build_multiref
791
0
    cram_set_option(fd, CRAM_OPT_REQUIRED_FIELDS, SAM_RNAME | SAM_POS | SAM_CIGAR);
792
793
0
    if (! fn_idx) {
794
0
        kputs(fn_base, &fn_idx_str);
795
0
        kputs(".crai", &fn_idx_str);
796
0
        fn_idx = fn_idx_str.s;
797
0
    }
798
799
0
    if (!(fp = bgzf_open(fn_idx, "wg"))) {
800
0
        perror(fn_idx);
801
0
        free(fn_idx_str.s);
802
0
        return -4;
803
0
    }
804
805
0
    free(fn_idx_str.s);
806
807
0
    cpos = htell(fd->fp);
808
0
    while ((c = cram_read_container(fd))) {
809
0
        if (fd->err) {
810
0
            perror("Cram container read");
811
0
            return -1;
812
0
        }
813
814
0
        hpos = htell(fd->fp);
815
816
0
        if (!(c->comp_hdr_block = cram_read_block(fd)))
817
0
            return -1;
818
0
        assert(c->comp_hdr_block->content_type == COMPRESSION_HEADER);
819
820
0
        c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block);
821
0
        if (!c->comp_hdr)
822
0
            return -1;
823
824
0
        if (c->ref_seq_id == last_ref && c->ref_seq_start < last_start) {
825
0
            hts_log_error("CRAM file is not sorted by chromosome / position");
826
0
            return -2;
827
0
        }
828
0
        last_ref = c->ref_seq_id;
829
0
        last_start = c->ref_seq_start;
830
831
0
        if (cram_index_container(fd, c, fp, cpos) < 0) {
832
0
            bgzf_close(fp);
833
0
            return -1;
834
0
        }
835
836
0
        off_t next_cpos = htell(fd->fp);
837
0
        if (next_cpos != hpos + c->length) {
838
0
            hts_log_error("Length %"PRId32" in container header at offset %lld does not match block lengths (%lld)",
839
0
                          c->length, (long long) cpos, (long long) next_cpos - hpos);
840
0
            return -1;
841
0
        }
842
0
        cpos = next_cpos;
843
844
0
        cram_free_container(c);
845
0
    }
846
0
    if (fd->err) {
847
0
        bgzf_close(fp);
848
0
        return -1;
849
0
    }
850
851
0
    return (bgzf_close(fp) >= 0)? 0 : -4;
852
0
}
853
854
// internal recursive step
855
static int64_t cram_num_containers_between_(cram_index *e, int64_t *last_pos,
856
                                            int64_t nct,
857
                                            off_t cstart, off_t cend,
858
0
                                            int64_t *first, int64_t *last) {
859
0
    int64_t nc = 0, i;
860
861
0
    if (e->offset) {
862
0
        if (e->offset != *last_pos) {
863
0
            if (e->offset >= cstart && (!cend || e->offset <= cend)) {
864
0
                if (first && *first < 0)
865
0
                    *first = nct;
866
0
                if (last)
867
0
                    *last = nct;
868
0
            }
869
0
            nc++;
870
0
        }
871
        // else a new multi-ref in same container
872
0
        *last_pos = e->offset;
873
0
    }
874
875
0
    for (i = 0; i < e->nslice; i++)
876
0
        nc += cram_num_containers_between_(&e->e[i], last_pos, nc + nct,
877
0
                                           cstart, cend, first, last);
878
879
0
    return nc;
880
0
}
881
882
/*! Returns the number of containers in the CRAM file within given offsets.
883
 *
884
 * The cstart and cend offsets are the locations of the start of containers
885
 * as returned by index_container_offset.
886
 *
887
 * If non-NULL, first and last will hold the inclusive range of container
888
 * numbers, counting from zero.
889
 *
890
 * @return
891
 * Returns the number of containers, equivalent to *last-*first+1.
892
 */
893
int64_t cram_num_containers_between(cram_fd *fd,
894
                                    off_t cstart, off_t cend,
895
0
                                    int64_t *first, int64_t *last) {
896
0
    int64_t nc = 0, i;
897
0
    int64_t last_pos = -99;
898
0
    int64_t l_first = -1, l_last = -1;
899
900
0
    for (i = 0; i < fd->index_sz; i++) {
901
0
        int j = i+1 == fd->index_sz ? 0 : i+1; // maps "*" to end
902
0
        nc += cram_num_containers_between_(&fd->index[j], &last_pos, nc,
903
0
                                           cstart, cend, &l_first, &l_last);
904
0
    }
905
906
0
    if (first)
907
0
        *first = l_first;
908
0
    if (last)
909
0
        *last = l_last;
910
911
0
    return l_last - l_first + 1;
912
0
}
913
914
/*
915
 * Queries the total number of distinct containers in the index.
916
 * Note there may be more containers in the file than in the index, as we
917
 * are not required to have an index entry for every one.
918
 */
919
0
int64_t cram_num_containers(cram_fd *fd) {
920
0
    return cram_num_containers_between(fd, 0, 0, NULL, NULL);
921
0
}
922
923
924
/*! Returns the byte offset for the start of the n^th container.
925
 *
926
 * The index must have previously been loaded, otherwise <0 is returned.
927
 */
928
static cram_index *cram_container_num2offset_(cram_index *e, int num,
929
0
                                              int64_t *last_pos, int *nc) {
930
0
    if (e->offset) {
931
0
        if (e->offset != *last_pos) {
932
0
            if (*nc == num)
933
0
                return e;
934
0
            (*nc)++;
935
0
        }
936
        // else a new multi-ref in same container
937
0
        *last_pos = e->offset;
938
0
    }
939
940
0
    int i;
941
0
    for (i = 0; i < e->nslice; i++) {
942
0
        cram_index *tmp = cram_container_num2offset_(&e->e[i], num,
943
0
                                                     last_pos, nc);
944
0
        if (tmp)
945
0
            return tmp;
946
0
    }
947
948
949
0
    return NULL;
950
0
}
951
952
0
off_t cram_container_num2offset(cram_fd *fd, int64_t num) {
953
0
    int nc = 0, i;
954
0
    int64_t last_pos = -9;
955
0
    cram_index *e = NULL;
956
957
0
    for (i = 0; i < fd->index_sz; i++) {
958
0
        int j = i+1 == fd->index_sz ? 0 : i+1; // maps "*" to end
959
0
        if (!fd->index[j].nslice)
960
0
            continue;
961
0
        if ((e = cram_container_num2offset_(&fd->index[j], num,
962
0
                                            &last_pos, &nc)))
963
0
            break;
964
0
    }
965
966
0
    return e ? e->offset : -1;
967
0
}
968
969
970
/*! Returns the container number for the first container at offset >= pos.
971
 *
972
 * The index must have previously been loaded, otherwise <0 is returned.
973
 */
974
static cram_index *cram_container_offset2num_(cram_index *e, off_t pos,
975
0
                                              int64_t *last_pos, int *nc) {
976
0
    if (e->offset) {
977
0
        if (e->offset != *last_pos) {
978
0
            if (e->offset >= pos)
979
0
                return e;
980
0
            (*nc)++;
981
0
        }
982
        // else a new multi-ref in same container
983
0
        *last_pos = e->offset;
984
0
    }
985
986
0
    int i;
987
0
    for (i = 0; i < e->nslice; i++) {
988
0
        cram_index *tmp = cram_container_offset2num_(&e->e[i], pos,
989
0
                                                     last_pos, nc);
990
0
        if (tmp)
991
0
            return tmp;
992
0
    }
993
994
995
0
    return NULL;
996
0
}
997
998
0
int64_t cram_container_offset2num(cram_fd *fd, off_t pos) {
999
0
    int nc = 0, i;
1000
0
    int64_t last_pos = -9;
1001
0
    cram_index *e = NULL;
1002
1003
0
    for (i = 0; i < fd->index_sz; i++) {
1004
0
        int j = i+1 == fd->index_sz ? 0 : i+1; // maps "*" to end
1005
0
        if (!fd->index[j].nslice)
1006
0
            continue;
1007
0
        if ((e = cram_container_offset2num_(&fd->index[j], pos,
1008
0
                                            &last_pos, &nc)))
1009
0
            break;
1010
0
    }
1011
1012
0
    return e ? nc : -1;
1013
0
}
1014
1015
/*!
1016
 * Returns the file offsets of CRAM containers covering a specific region
1017
 * query.  Note both offsets are the START of the container.
1018
 *
1019
 * first will point to the start of the first overlapping container
1020
 * last will point to the start of the last overlapping container
1021
 *
1022
 * Returns 0 on success
1023
 *        <0 on failure
1024
 */
1025
int cram_index_extents(cram_fd *fd, int refid, hts_pos_t start, hts_pos_t end,
1026
0
                       off_t *first, off_t *last) {
1027
0
    cram_index *ci;
1028
1029
0
    if (first) {
1030
0
        if (!(ci = cram_index_query(fd, refid, start, NULL)))
1031
0
            return -1;
1032
0
        *first = ci->offset;
1033
0
    }
1034
1035
0
    if (last) {
1036
0
        if (!(ci = cram_index_query_last(fd, refid, end)))
1037
0
            return -1;
1038
0
        *last = ci->offset;
1039
0
    }
1040
1041
0
    return 0;
1042
0
}