Coverage Report

Created: 2025-08-24 06:59

/src/htslib/cram/cram_index.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
Copyright (c) 2013-2020, 2023-2024 Genome Research Ltd.
3
Author: James Bonfield <jkb@sanger.ac.uk>
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
   1. Redistributions of source code must retain the above copyright notice,
9
this list of conditions and the following disclaimer.
10
11
   2. Redistributions in binary form must reproduce the above copyright notice,
12
this list of conditions and the following disclaimer in the documentation
13
and/or other materials provided with the distribution.
14
15
   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16
Institute nor the names of its contributors may be used to endorse or promote
17
products derived from this software without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
/*
32
 * The index is a gzipped tab-delimited text file with one line per slice.
33
 * The columns are:
34
 * 1: reference number (0 to N-1, as per BAM ref_id)
35
 * 2: reference position of 1st read in slice (1..?)
36
 * 3: number of reads in slice
37
 * 4: offset of container start (relative to end of SAM header, so 1st
38
 *    container is offset 0).
39
 * 5: slice number within container (ie which landmark).
40
 *
41
 * In memory, we hold this in a nested containment list. Each list element is
42
 * a cram_index struct. Each element in turn can contain its own list of
43
 * cram_index structs.
44
 *
45
 * Any start..end range which is entirely contained within another (and
46
 * earlier as it is sorted) range will be held within it. This ensures that
47
 * the outer list will never have containments and we can safely do a
48
 * binary search to find the first range which overlaps any given coordinate.
49
 */
50
51
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
52
#include <config.h>
53
54
#include <stdio.h>
55
#include <errno.h>
56
#include <assert.h>
57
#include <inttypes.h>
58
#include <stdlib.h>
59
#include <string.h>
60
#include <zlib.h>
61
#include <sys/types.h>
62
#include <sys/stat.h>
63
#include <math.h>
64
65
#include "../htslib/bgzf.h"
66
#include "../htslib/hfile.h"
67
#include "../hts_internal.h"
68
#include "cram.h"
69
#include "os.h"
70
71
#if 0
72
static void dump_index_(cram_index *e, int level) {
73
    int i, n;
74
    n = printf("%*s%d / %d .. %d, ", level*4, "", e->refid, e->start, e->end);
75
    printf("%*soffset %"PRId64" %p %p\n", MAX(0,50-n), "", e->offset, e, e->e_next);
76
    for (i = 0; i < e->nslice; i++) {
77
        dump_index_(&e->e[i], level+1);
78
    }
79
}
80
81
static void dump_index(cram_fd *fd) {
82
    int i;
83
    for (i = 0; i < fd->index_sz; i++) {
84
        dump_index_(&fd->index[i], 0);
85
    }
86
}
87
#endif
88
89
// Thread a linked list through the nested containment list.
90
// This makes navigating it and finding the "next" index entry
91
// trivial.
92
0
static cram_index *link_index_(cram_index *e, cram_index *e_last) {
93
0
    int i;
94
0
    if (e_last)
95
0
        e_last->e_next = e;
96
97
    // We don't want to link in the top-level cram_index with
98
    // offset=0 and start/end = INT_MIN/INT_MAX.
99
0
    if (e->offset)
100
0
        e_last = e;
101
102
0
    for (i = 0; i < e->nslice; i++)
103
0
        e_last = link_index_(&e->e[i], e_last);
104
105
0
    return e_last;
106
0
}
107
108
0
static void link_index(cram_fd *fd) {
109
0
    int i;
110
0
    cram_index *e_last = NULL;
111
112
0
    for (i = 0; i < fd->index_sz; i++) {
113
0
        e_last = link_index_(&fd->index[i], e_last);
114
0
    }
115
116
0
    if (e_last)
117
0
        e_last->e_next = NULL;
118
0
}
119
120
0
static int kget_int32(kstring_t *k, size_t *pos, int32_t *val_p) {
121
0
    int sign = 1;
122
0
    int32_t val = 0;
123
0
    size_t p = *pos;
124
125
0
    while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t'))
126
0
        p++;
127
128
0
    if (p < k->l && k->s[p] == '-')
129
0
        sign = -1, p++;
130
131
0
    if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9'))
132
0
        return -1;
133
134
0
    while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9') {
135
0
        int digit = k->s[p++]-'0';
136
0
        val = val*10 + digit;
137
0
    }
138
139
0
    *pos = p;
140
0
    *val_p = sign*val;
141
142
0
    return 0;
143
0
}
144
145
0
static int kget_int64(kstring_t *k, size_t *pos, int64_t *val_p) {
146
0
    int sign = 1;
147
0
    int64_t val = 0;
148
0
    size_t p = *pos;
149
150
0
    while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t'))
151
0
        p++;
152
153
0
    if (p < k->l && k->s[p] == '-')
154
0
        sign = -1, p++;
155
156
0
    if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9'))
157
0
        return -1;
158
159
0
    while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9') {
160
0
        int digit = k->s[p++]-'0';
161
0
        val = val*10 + digit;
162
0
    }
163
164
0
    *pos = p;
165
0
    *val_p = sign*val;
166
167
0
    return 0;
168
0
}
169
170
/*
171
 * Loads a CRAM .crai index into memory.
172
 *
173
 * Returns 0 for success
174
 *        -1 for failure
175
 */
176
0
int cram_index_load(cram_fd *fd, const char *fn, const char *fn_idx) {
177
178
0
    char *tfn_idx = NULL;
179
0
    char buf[65536];
180
0
    ssize_t len;
181
0
    kstring_t kstr = {0};
182
0
    hFILE *fp;
183
0
    cram_index *idx;
184
0
    cram_index **idx_stack = NULL, *ep, e;
185
0
    int idx_stack_alloc = 0, idx_stack_ptr = 0;
186
0
    size_t pos = 0;
187
188
    /* Check if already loaded */
189
0
    if (fd->index)
190
0
        return 0;
191
192
0
    fd->index = calloc((fd->index_sz = 1), sizeof(*fd->index));
193
0
    if (!fd->index)
194
0
        return -1;
195
196
0
    idx = &fd->index[0];
197
0
    idx->refid = -1;
198
0
    idx->start = INT_MIN;
199
0
    idx->end   = INT_MAX;
200
201
0
    idx_stack = calloc(++idx_stack_alloc, sizeof(*idx_stack));
202
0
    if (!idx_stack)
203
0
        goto fail;
204
205
0
    idx_stack[idx_stack_ptr] = idx;
206
207
    // Support pathX.cram##idx##pathY.crai
208
0
    const char *fn_delim = strstr(fn, HTS_IDX_DELIM);
209
0
    if (fn_delim && !fn_idx)
210
0
        fn_idx = fn_delim + strlen(HTS_IDX_DELIM);
211
212
0
    if (!fn_idx) {
213
0
        if (hts_idx_check_local(fn, HTS_FMT_CRAI, &tfn_idx) == 0 && hisremote(fn))
214
0
            tfn_idx = hts_idx_getfn(fn, ".crai");
215
216
0
        if (!tfn_idx) {
217
0
            hts_log_error("Could not retrieve index file for '%s'", fn);
218
0
            goto fail;
219
0
        }
220
0
        fn_idx = tfn_idx;
221
0
    }
222
223
0
    if (!(fp = hopen(fn_idx, "r"))) {
224
0
        hts_log_error("Could not open index file '%s'", fn_idx);
225
0
        goto fail;
226
0
    }
227
228
    // Load the file into memory
229
0
    while ((len = hread(fp, buf, sizeof(buf))) > 0) {
230
0
        if (kputsn(buf, len, &kstr) < 0)
231
0
            goto fail;
232
0
    }
233
234
0
    if (len < 0 || kstr.l < 2)
235
0
        goto fail;
236
237
0
    if (hclose(fp) < 0)
238
0
        goto fail;
239
240
    // Uncompress if required
241
0
    if (kstr.s[0] == 31 && (uc)kstr.s[1] == 139) {
242
0
        size_t l = 0;
243
0
        char *s = zlib_mem_inflate(kstr.s, kstr.l, &l);
244
0
        if (!s)
245
0
            goto fail;
246
247
0
        free(kstr.s);
248
0
        kstr.s = s;
249
0
        kstr.l = l;
250
0
        kstr.m = l; // conservative estimate of the size allocated
251
0
        if (kputsn("", 0, &kstr) < 0) // ensure kstr.s is NUL-terminated
252
0
            goto fail;
253
0
    }
254
255
256
    // Parse it line at a time
257
0
    while (pos < kstr.l) {
258
        /* 1.1 layout */
259
0
        if (kget_int32(&kstr, &pos, &e.refid) == -1)
260
0
            goto fail;
261
262
0
        if (kget_int32(&kstr, &pos, &e.start) == -1)
263
0
            goto fail;
264
265
0
        if (kget_int32(&kstr, &pos, &e.end) == -1)
266
0
            goto fail;
267
268
0
        if (kget_int64(&kstr, &pos, &e.offset) == -1)
269
0
            goto fail;
270
271
0
        if (kget_int32(&kstr, &pos, &e.slice) == -1)
272
0
            goto fail;
273
274
0
        if (kget_int32(&kstr, &pos, &e.len) == -1)
275
0
            goto fail;
276
277
0
        e.end += e.start-1;
278
        //printf("%d/%d..%d-offset=%" PRIu64 ",len=%d,slice=%d\n", e.refid, e.start, e.end, e.offset, e.len, e.slice);
279
280
0
        if (e.refid < -1) {
281
0
            hts_log_error("Malformed index file, refid %d", e.refid);
282
0
            goto fail;
283
0
        }
284
285
0
        if (e.refid != idx->refid) {
286
0
            if (fd->index_sz < e.refid+2) {
287
0
                cram_index *new_idx;
288
0
                int new_sz = e.refid+2;
289
0
                size_t index_end = fd->index_sz * sizeof(*fd->index);
290
0
                new_idx = realloc(fd->index,
291
0
                                  new_sz * sizeof(*fd->index));
292
0
                if (!new_idx)
293
0
                    goto fail;
294
295
0
                fd->index = new_idx;
296
0
                fd->index_sz = new_sz;
297
0
                memset(((char *)fd->index) + index_end, 0,
298
0
                       fd->index_sz * sizeof(*fd->index) - index_end);
299
0
            }
300
0
            idx = &fd->index[e.refid+1];
301
0
            idx->refid = e.refid;
302
0
            idx->start = INT_MIN;
303
0
            idx->end   = INT_MAX;
304
0
            idx->nslice = idx->nalloc = 0;
305
0
            idx->e = NULL;
306
0
            idx_stack[(idx_stack_ptr = 0)] = idx;
307
0
        }
308
309
0
        while (!(e.start >= idx->start && e.end <= idx->end) ||
310
0
               (idx->start == 0 && idx->refid == -1)) {
311
0
            idx = idx_stack[--idx_stack_ptr];
312
0
        }
313
314
        // Now contains, so append
315
0
        if (idx->nslice+1 >= idx->nalloc) {
316
0
            cram_index *new_e;
317
0
            idx->nalloc = idx->nalloc ? idx->nalloc*2 : 16;
318
0
            new_e = realloc(idx->e, idx->nalloc * sizeof(*idx->e));
319
0
            if (!new_e)
320
0
                goto fail;
321
322
0
            idx->e = new_e;
323
0
        }
324
325
0
        e.nalloc = e.nslice = 0; e.e = NULL;
326
0
        *(ep = &idx->e[idx->nslice++]) = e;
327
0
        idx = ep;
328
329
0
        if (++idx_stack_ptr >= idx_stack_alloc) {
330
0
            cram_index **new_stack;
331
0
            idx_stack_alloc *= 2;
332
0
            new_stack = realloc(idx_stack, idx_stack_alloc*sizeof(*idx_stack));
333
0
            if (!new_stack)
334
0
                goto fail;
335
0
            idx_stack = new_stack;
336
0
        }
337
0
        idx_stack[idx_stack_ptr] = idx;
338
339
0
        while (pos < kstr.l && kstr.s[pos] != '\n')
340
0
            pos++;
341
0
        pos++;
342
0
    }
343
344
0
    free(idx_stack);
345
0
    free(kstr.s);
346
0
    free(tfn_idx);
347
348
    // Convert NCList to linear linked list
349
0
    link_index(fd);
350
351
    //dump_index(fd);
352
353
0
    return 0;
354
355
0
 fail:
356
0
    free(kstr.s);
357
0
    free(idx_stack);
358
0
    free(tfn_idx);
359
0
    cram_index_free(fd); // Also sets fd->index = NULL
360
0
    return -1;
361
0
}
362
363
0
static void cram_index_free_recurse(cram_index *e) {
364
0
    if (e->e) {
365
0
        int i;
366
0
        for (i = 0; i < e->nslice; i++) {
367
0
            cram_index_free_recurse(&e->e[i]);
368
0
        }
369
0
        free(e->e);
370
0
    }
371
0
}
372
373
0
void cram_index_free(cram_fd *fd) {
374
0
    int i;
375
376
0
    if (!fd->index)
377
0
        return;
378
379
0
    for (i = 0; i < fd->index_sz; i++) {
380
0
        cram_index_free_recurse(&fd->index[i]);
381
0
    }
382
0
    free(fd->index);
383
384
0
    fd->index = NULL;
385
0
}
386
387
/*
388
 * Searches the index for the first slice overlapping a reference ID
389
 * and position, or one immediately preceding it if none is found in
390
 * the index to overlap this position. (Our index may have missing
391
 * entries, but we require at least one per reference.)
392
 *
393
 * If the index finds multiple slices overlapping this position we
394
 * return the first one only. Subsequent calls should specify
395
 * "from" as the last slice we checked to find the next one. Otherwise
396
 * set "from" to be NULL to find the first one.
397
 *
398
 * Refid can also be any of the special HTS_IDX_ values.
399
 * For backwards compatibility, refid -1 is equivalent to HTS_IDX_NOCOOR.
400
 *
401
 * Returns the cram_index pointer on success
402
 *         NULL on failure
403
 */
404
cram_index *cram_index_query(cram_fd *fd, int refid, hts_pos_t pos,
405
0
                             cram_index *from) {
406
0
    int i, j, k;
407
0
    cram_index *e;
408
409
0
    if (from) {
410
        // Continue from a previous search.
411
        // We switch to just scanning the linked list, as the nested
412
        // lists are typically short.
413
0
        if (refid == HTS_IDX_NOCOOR)
414
0
            refid = -1;
415
416
0
        e = from->e_next;
417
0
        if (e && e->refid == refid && e->start <= pos)
418
0
            return e;
419
0
        else
420
0
            return NULL;
421
0
    }
422
423
0
    switch(refid) {
424
0
    case HTS_IDX_NONE:
425
0
    case HTS_IDX_REST:
426
        // fail, or already there, dealt with elsewhere.
427
0
        return NULL;
428
429
0
    case -1:
430
0
    case HTS_IDX_NOCOOR:
431
0
        refid = -1;
432
0
        pos = 0;
433
0
        break;
434
435
0
    case HTS_IDX_START: {
436
0
        int64_t min_idx = INT64_MAX;
437
0
        for (i = 0, j = -1; i < fd->index_sz; i++) {
438
0
            if (fd->index[i].e && fd->index[i].e[0].offset < min_idx) {
439
0
                min_idx = fd->index[i].e[0].offset;
440
0
                j = i;
441
0
            }
442
0
        }
443
0
        if (j < 0)
444
0
            return NULL;
445
0
        return fd->index[j].e;
446
0
    }
447
448
0
    default:
449
0
        if (refid < HTS_IDX_NONE || refid+1 >= fd->index_sz)
450
0
            return NULL;
451
0
    }
452
453
0
    from = &fd->index[refid+1];
454
455
    // Ref with nothing aligned against it.
456
0
    if (!from->e)
457
0
        return NULL;
458
459
    // This sequence is covered by the index, so binary search to find
460
    // the optimal starting block.
461
0
    i = 0, j = fd->index[refid+1].nslice-1;
462
0
    for (k = j/2; k != i; k = (j-i)/2 + i) {
463
0
        if (from->e[k].refid > refid) {
464
0
            j = k;
465
0
            continue;
466
0
        }
467
468
0
        if (from->e[k].refid < refid) {
469
0
            i = k;
470
0
            continue;
471
0
        }
472
473
0
        if (from->e[k].start >= pos) {
474
0
            j = k;
475
0
            continue;
476
0
        }
477
478
0
        if (from->e[k].start < pos) {
479
0
            i = k;
480
0
            continue;
481
0
        }
482
0
    }
483
    // i==j or i==j-1. Check if j is better.
484
0
    if (j >= 0 && from->e[j].start < pos && from->e[j].refid == refid)
485
0
        i = j;
486
487
    /* The above found *a* bin overlapping, but not necessarily the first */
488
0
    while (i > 0 && from->e[i-1].end >= pos)
489
0
        i--;
490
491
    /* We may be one bin before the optimum, so check */
492
0
    while (i+1 < from->nslice &&
493
0
           (from->e[i].refid < refid ||
494
0
            from->e[i].end < pos))
495
0
        i++;
496
497
0
    e = &from->e[i];
498
499
0
    return e;
500
0
}
501
502
// Return the index entry for last slice on a specific reference.
503
0
cram_index *cram_index_last(cram_fd *fd, int refid, cram_index *from) {
504
0
    int slice;
505
506
0
    if (refid+1 < 0 || refid+1 >= fd->index_sz)
507
0
        return NULL;
508
509
0
    if (!from)
510
0
        from = &fd->index[refid+1];
511
512
    // Ref with nothing aligned against it.
513
0
    if (!from->e)
514
0
        return NULL;
515
516
0
    slice = fd->index[refid+1].nslice - 1;
517
518
    // e is the last entry in the nested containment list, but it may
519
    // contain further slices within it.
520
0
    cram_index *e = &from->e[slice];
521
0
    while (e->e_next)
522
0
        e = e->e_next;
523
524
0
    return e;
525
0
}
526
527
/*
528
 * Find the last container overlapping pos 'end', and the file offset of
529
 * its end (equivalent to the start offset of the container following it).
530
 */
531
0
cram_index *cram_index_query_last(cram_fd *fd, int refid, hts_pos_t end) {
532
0
    cram_index *e = NULL, *prev_e;
533
0
    do {
534
0
        prev_e = e;
535
0
        e = cram_index_query(fd, refid, end, prev_e);
536
0
    } while (e);
537
538
0
    if (!prev_e)
539
0
        return NULL;
540
0
    e = prev_e;
541
542
    // Note: offset of e and e->e_next may be the same if we're using a
543
    // multi-ref container where a single container generates multiple
544
    // index entries.
545
    //
546
    // We need to keep iterating until offset differs in order to find
547
    // the genuine file offset for the end of container.
548
0
    do {
549
0
        prev_e = e;
550
0
        e = e->e_next;
551
0
    } while (e && e->offset == prev_e->offset);
552
553
0
    return prev_e;
554
0
}
555
556
/*
557
 * Skips to a container overlapping the start coordinate listed in
558
 * cram_range.
559
 *
560
 * In theory we call cram_index_query multiple times, once per slice
561
 * overlapping the range. However slices may be absent from the index
562
 * which makes this problematic. Instead we find the left-most slice
563
 * and then read from then on, skipping decoding of slices and/or
564
 * whole containers when they don't overlap the specified cram_range.
565
 *
566
 * This function also updates the cram_fd range field.
567
 *
568
 * Returns 0 on success
569
 *        -1 on general failure
570
 *        -2 on no-data (empty chromosome)
571
 */
572
0
int cram_seek_to_refpos(cram_fd *fd, cram_range *r) {
573
0
    int ret = 0;
574
0
    cram_index *e;
575
576
0
    if (r->refid == HTS_IDX_NONE) {
577
0
        ret = -2; goto err;
578
0
    }
579
580
    // Ideally use an index, so see if we have one.
581
0
    if ((e = cram_index_query(fd, r->refid, r->start, NULL))) {
582
0
        if (0 != cram_seek(fd, e->offset, SEEK_SET)) {
583
0
            ret = -1; goto err;
584
0
        }
585
0
    } else {
586
        // Absent from index, but this most likely means it simply has no data.
587
0
        ret = -2; goto err;
588
0
    }
589
590
0
    pthread_mutex_lock(&fd->range_lock);
591
0
    fd->range = *r;
592
0
    if (r->refid == HTS_IDX_NOCOOR) {
593
0
        fd->range.refid = -1;
594
0
        fd->range.start = 0;
595
0
    } else if (r->refid == HTS_IDX_START || r->refid == HTS_IDX_REST) {
596
0
        fd->range.refid = -2; // special case in cram_next_slice
597
0
    }
598
0
    pthread_mutex_unlock(&fd->range_lock);
599
600
0
    if (fd->ctr) {
601
0
        cram_free_container(fd->ctr);
602
0
        if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
603
0
            cram_free_container(fd->ctr_mt);
604
0
        fd->ctr = NULL;
605
0
        fd->ctr_mt = NULL;
606
0
        fd->ooc = 0;
607
0
        fd->eof = 0;
608
0
    }
609
610
0
    return 0;
611
612
0
 err:
613
    // It's unlikely fd->range will be accessed after EOF or error,
614
    // but this maintains identical behaviour to the previous code.
615
0
    pthread_mutex_lock(&fd->range_lock);
616
0
    fd->range = *r;
617
0
    pthread_mutex_unlock(&fd->range_lock);
618
0
    return ret;
619
0
}
620
621
622
/*
623
 * A specialised form of cram_index_build (below) that deals with slices
624
 * having multiple references in this (ref_id -2). In this scenario we
625
 * decode the slice to look at the RI data series instead.
626
 *
627
 * Returns 0 on success
628
 *        -1 on read failure
629
 *        -2 on wrong sort order
630
 *        -4 on write failure
631
 */
632
static int cram_index_build_multiref(cram_fd *fd,
633
                                     cram_container *c,
634
                                     cram_slice *s,
635
                                     BGZF *fp,
636
                                     off_t cpos,
637
                                     int32_t landmark,
638
0
                                     int sz) {
639
0
    int i, ref = -2;
640
0
    int64_t ref_start = 0, ref_end;
641
0
    char buf[1024];
642
643
0
    if (fd->mode != 'w') {
644
0
        if (0 != cram_decode_slice(fd, c, s, fd->header))
645
0
            return -1;
646
0
    }
647
648
0
    ref_end = INT_MIN;
649
650
0
    int32_t last_ref = -9;
651
0
    int32_t last_pos = -9;
652
0
    for (i = 0; i < s->hdr->num_records; i++) {
653
0
        if (s->crecs[i].ref_id == last_ref && s->crecs[i].apos < last_pos) {
654
0
            hts_log_error("CRAM file is not sorted by chromosome / position");
655
0
            return -2;
656
0
        }
657
0
        last_ref = s->crecs[i].ref_id;
658
0
        last_pos = s->crecs[i].apos;
659
660
0
        if (s->crecs[i].ref_id == ref) {
661
0
            if (ref_end < s->crecs[i].aend)
662
0
                ref_end = s->crecs[i].aend;
663
0
            continue;
664
0
        }
665
666
0
        if (ref != -2) {
667
0
            snprintf(buf, sizeof(buf),
668
0
                     "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n",
669
0
                     ref, ref_start, ref_end - ref_start + 1,
670
0
                     (int64_t)cpos, landmark, sz);
671
0
            if (bgzf_write(fp, buf, strlen(buf)) < 0)
672
0
                return -4;
673
0
        }
674
675
0
        ref = s->crecs[i].ref_id;
676
0
        ref_start = s->crecs[i].apos;
677
0
        ref_end   = s->crecs[i].aend;
678
0
    }
679
680
0
    if (ref != -2) {
681
0
        snprintf(buf, sizeof(buf),
682
0
                 "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n",
683
0
                 ref, ref_start, ref_end - ref_start + 1,
684
0
                 (int64_t)cpos, landmark, sz);
685
0
        if (bgzf_write(fp, buf, strlen(buf)) < 0)
686
0
            return -4;
687
0
    }
688
689
0
    return 0;
690
0
}
691
692
/*
693
 * Adds a single slice to the index.
694
 */
695
int cram_index_slice(cram_fd *fd,
696
                     cram_container *c,
697
                     cram_slice *s,
698
                     BGZF *fp,
699
                     off_t cpos,
700
                     off_t spos, // relative to cpos
701
0
                     off_t sz) {
702
0
    int ret;
703
0
    char buf[1024];
704
705
0
    if (sz > INT_MAX) {
706
0
        hts_log_error("CRAM slice is too big (%"PRId64" bytes)",
707
0
                      (int64_t) sz);
708
0
        return -1;
709
0
    }
710
711
0
    if (s->hdr->ref_seq_id == -2) {
712
0
        ret = cram_index_build_multiref(fd, c, s, fp, cpos, spos, sz);
713
0
    } else {
714
0
        snprintf(buf, sizeof(buf),
715
0
                 "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n",
716
0
                 s->hdr->ref_seq_id, s->hdr->ref_seq_start,
717
0
                 s->hdr->ref_seq_span, (int64_t)cpos, (int)spos, (int)sz);
718
0
        ret = (bgzf_write(fp, buf, strlen(buf)) >= 0)? 0 : -4;
719
0
    }
720
721
0
    return ret;
722
0
}
723
724
/*
725
 * Adds a single container to the index.
726
 */
727
static
728
int cram_index_container(cram_fd *fd,
729
                         cram_container *c,
730
                         BGZF *fp,
731
0
                         off_t cpos) {
732
0
    int j;
733
0
    off_t spos;
734
735
    // 2.0 format
736
0
    for (j = 0; j < c->num_landmarks; j++) {
737
0
        cram_slice *s;
738
0
        off_t sz;
739
0
        int ret;
740
741
0
        spos = htell(fd->fp);
742
0
        if (spos - cpos - (off_t) c->offset != c->landmark[j]) {
743
0
            hts_log_error("CRAM slice offset %"PRId64" does not match"
744
0
                          " landmark %d in container header (%"PRId32")",
745
0
                          (int64_t) (spos - cpos - (off_t) c->offset),
746
0
                          j, c->landmark[j]);
747
0
            return -1;
748
0
        }
749
750
0
        if (!(s = cram_read_slice(fd))) {
751
0
            return -1;
752
0
        }
753
754
0
        sz = htell(fd->fp) - spos;
755
0
        ret = cram_index_slice(fd, c, s, fp, cpos, c->landmark[j], sz);
756
757
0
        cram_free_slice(s);
758
759
0
        if (ret < 0) {
760
0
            return ret;
761
0
        }
762
0
    }
763
764
0
    return 0;
765
0
}
766
767
768
/*
769
 * Builds an index file.
770
 *
771
 * fd is a newly opened cram file that we wish to index.
772
 * fn_base is the filename of the associated CRAM file.
773
 * fn_idx is the filename of the index file to be written;
774
 * if NULL, we add ".crai" to fn_base to get the index filename.
775
 *
776
 * Returns 0 on success,
777
 *         negative on failure (-1 for read failure, -4 for write failure)
778
 */
779
0
int cram_index_build(cram_fd *fd, const char *fn_base, const char *fn_idx) {
780
0
    cram_container *c;
781
0
    off_t cpos, hpos;
782
0
    BGZF *fp;
783
0
    kstring_t fn_idx_str = {0};
784
0
    int64_t last_ref = -9, last_start = -9;
785
786
    // Useful for cram_index_build_multiref
787
0
    cram_set_option(fd, CRAM_OPT_REQUIRED_FIELDS, SAM_RNAME | SAM_POS | SAM_CIGAR);
788
789
0
    if (! fn_idx) {
790
0
        kputs(fn_base, &fn_idx_str);
791
0
        kputs(".crai", &fn_idx_str);
792
0
        fn_idx = fn_idx_str.s;
793
0
    }
794
795
0
    if (!(fp = bgzf_open(fn_idx, "wg"))) {
796
0
        perror(fn_idx);
797
0
        free(fn_idx_str.s);
798
0
        return -4;
799
0
    }
800
801
0
    free(fn_idx_str.s);
802
803
0
    cpos = htell(fd->fp);
804
0
    while ((c = cram_read_container(fd))) {
805
0
        if (fd->err) {
806
0
            perror("Cram container read");
807
0
            return -1;
808
0
        }
809
810
0
        hpos = htell(fd->fp);
811
812
0
        if (!(c->comp_hdr_block = cram_read_block(fd)))
813
0
            return -1;
814
0
        assert(c->comp_hdr_block->content_type == COMPRESSION_HEADER);
815
816
0
        c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block);
817
0
        if (!c->comp_hdr)
818
0
            return -1;
819
820
0
        if (c->ref_seq_id == last_ref && c->ref_seq_start < last_start) {
821
0
            hts_log_error("CRAM file is not sorted by chromosome / position");
822
0
            return -2;
823
0
        }
824
0
        last_ref = c->ref_seq_id;
825
0
        last_start = c->ref_seq_start;
826
827
0
        if (cram_index_container(fd, c, fp, cpos) < 0) {
828
0
            bgzf_close(fp);
829
0
            return -1;
830
0
        }
831
832
0
        off_t next_cpos = htell(fd->fp);
833
0
        if (next_cpos != hpos + c->length) {
834
0
            hts_log_error("Length %"PRId32" in container header at offset %lld does not match block lengths (%lld)",
835
0
                          c->length, (long long) cpos, (long long) next_cpos - hpos);
836
0
            return -1;
837
0
        }
838
0
        cpos = next_cpos;
839
840
0
        cram_free_container(c);
841
0
    }
842
0
    if (fd->err) {
843
0
        bgzf_close(fp);
844
0
        return -1;
845
0
    }
846
847
0
    return (bgzf_close(fp) >= 0)? 0 : -4;
848
0
}
849
850
// internal recursive step
851
static int64_t cram_num_containers_between_(cram_index *e, int64_t *last_pos,
852
                                            int64_t nct,
853
                                            off_t cstart, off_t cend,
854
0
                                            int64_t *first, int64_t *last) {
855
0
    int64_t nc = 0, i;
856
857
0
    if (e->offset) {
858
0
        if (e->offset != *last_pos) {
859
0
            if (e->offset >= cstart && (!cend || e->offset <= cend)) {
860
0
                if (first && *first < 0)
861
0
                    *first = nct;
862
0
                if (last)
863
0
                    *last = nct;
864
0
            }
865
0
            nc++;
866
0
        }
867
        // else a new multi-ref in same container
868
0
        *last_pos = e->offset;
869
0
    }
870
871
0
    for (i = 0; i < e->nslice; i++)
872
0
        nc += cram_num_containers_between_(&e->e[i], last_pos, nc + nct,
873
0
                                           cstart, cend, first, last);
874
875
0
    return nc;
876
0
}
877
878
/*! Returns the number of containers in the CRAM file within given offsets.
879
 *
880
 * The cstart and cend offsets are the locations of the start of containers
881
 * as returned by index_container_offset.
882
 *
883
 * If non-NULL, first and last will hold the inclusive range of container
884
 * numbers, counting from zero.
885
 *
886
 * @return
887
 * Returns the number of containers, equivalent to *last-*first+1.
888
 */
889
int64_t cram_num_containers_between(cram_fd *fd,
890
                                    off_t cstart, off_t cend,
891
0
                                    int64_t *first, int64_t *last) {
892
0
    int64_t nc = 0, i;
893
0
    int64_t last_pos = -99;
894
0
    int64_t l_first = -1, l_last = -1;
895
896
0
    for (i = 0; i < fd->index_sz; i++) {
897
0
        int j = i+1 == fd->index_sz ? 0 : i+1; // maps "*" to end
898
0
        nc += cram_num_containers_between_(&fd->index[j], &last_pos, nc,
899
0
                                           cstart, cend, &l_first, &l_last);
900
0
    }
901
902
0
    if (first)
903
0
        *first = l_first;
904
0
    if (last)
905
0
        *last = l_last;
906
907
0
    return l_last - l_first + 1;
908
0
}
909
910
/*
911
 * Queries the total number of distinct containers in the index.
912
 * Note there may be more containers in the file than in the index, as we
913
 * are not required to have an index entry for every one.
914
 */
915
0
int64_t cram_num_containers(cram_fd *fd) {
916
0
    return cram_num_containers_between(fd, 0, 0, NULL, NULL);
917
0
}
918
919
920
/*! Returns the byte offset for the start of the n^th container.
921
 *
922
 * The index must have previously been loaded, otherwise <0 is returned.
923
 */
924
static cram_index *cram_container_num2offset_(cram_index *e, int num,
925
0
                                              int64_t *last_pos, int *nc) {
926
0
    if (e->offset) {
927
0
        if (e->offset != *last_pos) {
928
0
            if (*nc == num)
929
0
                return e;
930
0
            (*nc)++;
931
0
        }
932
        // else a new multi-ref in same container
933
0
        *last_pos = e->offset;
934
0
    }
935
936
0
    int i;
937
0
    for (i = 0; i < e->nslice; i++) {
938
0
        cram_index *tmp = cram_container_num2offset_(&e->e[i], num,
939
0
                                                     last_pos, nc);
940
0
        if (tmp)
941
0
            return tmp;
942
0
    }
943
944
945
0
    return NULL;
946
0
}
947
948
0
off_t cram_container_num2offset(cram_fd *fd, int64_t num) {
949
0
    int nc = 0, i;
950
0
    int64_t last_pos = -9;
951
0
    cram_index *e = NULL;
952
953
0
    for (i = 0; i < fd->index_sz; i++) {
954
0
        int j = i+1 == fd->index_sz ? 0 : i+1; // maps "*" to end
955
0
        if (!fd->index[j].nslice)
956
0
            continue;
957
0
        if ((e = cram_container_num2offset_(&fd->index[j], num,
958
0
                                            &last_pos, &nc)))
959
0
            break;
960
0
    }
961
962
0
    return e ? e->offset : -1;
963
0
}
964
965
966
/*! Returns the container number for the first container at offset >= pos.
967
 *
968
 * The index must have previously been loaded, otherwise <0 is returned.
969
 */
970
static cram_index *cram_container_offset2num_(cram_index *e, off_t pos,
971
0
                                              int64_t *last_pos, int *nc) {
972
0
    if (e->offset) {
973
0
        if (e->offset != *last_pos) {
974
0
            if (e->offset >= pos)
975
0
                return e;
976
0
            (*nc)++;
977
0
        }
978
        // else a new multi-ref in same container
979
0
        *last_pos = e->offset;
980
0
    }
981
982
0
    int i;
983
0
    for (i = 0; i < e->nslice; i++) {
984
0
        cram_index *tmp = cram_container_offset2num_(&e->e[i], pos,
985
0
                                                     last_pos, nc);
986
0
        if (tmp)
987
0
            return tmp;
988
0
    }
989
990
991
0
    return NULL;
992
0
}
993
994
0
int64_t cram_container_offset2num(cram_fd *fd, off_t pos) {
995
0
    int nc = 0, i;
996
0
    int64_t last_pos = -9;
997
0
    cram_index *e = NULL;
998
999
0
    for (i = 0; i < fd->index_sz; i++) {
1000
0
        int j = i+1 == fd->index_sz ? 0 : i+1; // maps "*" to end
1001
0
        if (!fd->index[j].nslice)
1002
0
            continue;
1003
0
        if ((e = cram_container_offset2num_(&fd->index[j], pos,
1004
0
                                            &last_pos, &nc)))
1005
0
            break;
1006
0
    }
1007
1008
0
    return e ? nc : -1;
1009
0
}
1010
1011
/*!
1012
 * Returns the file offsets of CRAM containers covering a specific region
1013
 * query.  Note both offsets are the START of the container.
1014
 *
1015
 * first will point to the start of the first overlapping container
1016
 * last will point to the start of the last overlapping container
1017
 *
1018
 * Returns 0 on success
1019
 *        <0 on failure
1020
 */
1021
int cram_index_extents(cram_fd *fd, int refid, hts_pos_t start, hts_pos_t end,
1022
0
                       off_t *first, off_t *last) {
1023
0
    cram_index *ci;
1024
1025
0
    if (first) {
1026
0
        if (!(ci = cram_index_query(fd, refid, start, NULL)))
1027
0
            return -1;
1028
0
        *first = ci->offset;
1029
0
    }
1030
1031
0
    if (last) {
1032
0
        if (!(ci = cram_index_query_last(fd, refid, end)))
1033
0
            return -1;
1034
0
        *last = ci->offset;
1035
0
    }
1036
1037
0
    return 0;
1038
0
}