Coverage Report

Created: 2023-01-17 06:24

/src/htslib/cram/cram_index.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
Copyright (c) 2013-2020 Genome Research Ltd.
3
Author: James Bonfield <jkb@sanger.ac.uk>
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
   1. Redistributions of source code must retain the above copyright notice,
9
this list of conditions and the following disclaimer.
10
11
   2. Redistributions in binary form must reproduce the above copyright notice,
12
this list of conditions and the following disclaimer in the documentation
13
and/or other materials provided with the distribution.
14
15
   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16
Institute nor the names of its contributors may be used to endorse or promote
17
products derived from this software without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
/*
32
 * The index is a gzipped tab-delimited text file with one line per slice.
33
 * The columns are:
34
 * 1: reference number (0 to N-1, as per BAM ref_id)
35
 * 2: reference position of 1st read in slice (1..?)
36
 * 3: number of reads in slice
37
 * 4: offset of container start (relative to end of SAM header, so 1st
38
 *    container is offset 0).
39
 * 5: slice number within container (ie which landmark).
40
 *
41
 * In memory, we hold this in a nested containment list. Each list element is
42
 * a cram_index struct. Each element in turn can contain its own list of
43
 * cram_index structs.
44
 *
45
 * Any start..end range which is entirely contained within another (and
46
 * earlier as it is sorted) range will be held within it. This ensures that
47
 * the outer list will never have containments and we can safely do a
48
 * binary search to find the first range which overlaps any given coordinate.
49
 */
50
51
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
52
#include <config.h>
53
54
#include <stdio.h>
55
#include <errno.h>
56
#include <assert.h>
57
#include <inttypes.h>
58
#include <stdlib.h>
59
#include <string.h>
60
#include <zlib.h>
61
#include <sys/types.h>
62
#include <sys/stat.h>
63
#include <math.h>
64
65
#include "../htslib/bgzf.h"
66
#include "../htslib/hfile.h"
67
#include "../hts_internal.h"
68
#include "cram.h"
69
#include "os.h"
70
71
#if 0
72
static void dump_index_(cram_index *e, int level) {
73
    int i, n;
74
    n = printf("%*s%d / %d .. %d, ", level*4, "", e->refid, e->start, e->end);
75
    printf("%*soffset %"PRId64"\n", MAX(0,50-n), "", e->offset);
76
    for (i = 0; i < e->nslice; i++) {
77
        dump_index_(&e->e[i], level+1);
78
    }
79
}
80
81
static void dump_index(cram_fd *fd) {
82
    int i;
83
    for (i = 0; i < fd->index_sz; i++) {
84
        dump_index_(&fd->index[i], 0);
85
    }
86
}
87
#endif
88
89
0
static int kget_int32(kstring_t *k, size_t *pos, int32_t *val_p) {
90
0
    int sign = 1;
91
0
    int32_t val = 0;
92
0
    size_t p = *pos;
93
94
0
    while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t'))
95
0
        p++;
96
97
0
    if (p < k->l && k->s[p] == '-')
98
0
        sign = -1, p++;
99
100
0
    if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9'))
101
0
        return -1;
102
103
0
    while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9') {
104
0
        int digit = k->s[p++]-'0';
105
0
        val = val*10 + digit;
106
0
    }
107
108
0
    *pos = p;
109
0
    *val_p = sign*val;
110
111
0
    return 0;
112
0
}
113
114
0
static int kget_int64(kstring_t *k, size_t *pos, int64_t *val_p) {
115
0
    int sign = 1;
116
0
    int64_t val = 0;
117
0
    size_t p = *pos;
118
119
0
    while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t'))
120
0
        p++;
121
122
0
    if (p < k->l && k->s[p] == '-')
123
0
        sign = -1, p++;
124
125
0
    if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9'))
126
0
        return -1;
127
128
0
    while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9') {
129
0
        int digit = k->s[p++]-'0';
130
0
        val = val*10 + digit;
131
0
    }
132
133
0
    *pos = p;
134
0
    *val_p = sign*val;
135
136
0
    return 0;
137
0
}
138
139
/*
140
 * Loads a CRAM .crai index into memory.
141
 *
142
 * Returns 0 for success
143
 *        -1 for failure
144
 */
145
0
int cram_index_load(cram_fd *fd, const char *fn, const char *fn_idx) {
146
147
0
    char *tfn_idx = NULL;
148
0
    char buf[65536];
149
0
    ssize_t len;
150
0
    kstring_t kstr = {0};
151
0
    hFILE *fp;
152
0
    cram_index *idx;
153
0
    cram_index **idx_stack = NULL, *ep, e;
154
0
    int idx_stack_alloc = 0, idx_stack_ptr = 0;
155
0
    size_t pos = 0;
156
157
    /* Check if already loaded */
158
0
    if (fd->index)
159
0
        return 0;
160
161
0
    fd->index = calloc((fd->index_sz = 1), sizeof(*fd->index));
162
0
    if (!fd->index)
163
0
        return -1;
164
165
0
    idx = &fd->index[0];
166
0
    idx->refid = -1;
167
0
    idx->start = INT_MIN;
168
0
    idx->end   = INT_MAX;
169
170
0
    idx_stack = calloc(++idx_stack_alloc, sizeof(*idx_stack));
171
0
    if (!idx_stack)
172
0
        goto fail;
173
174
0
    idx_stack[idx_stack_ptr] = idx;
175
176
    // Support pathX.cram##idx##pathY.crai
177
0
    const char *fn_delim = strstr(fn, HTS_IDX_DELIM);
178
0
    if (fn_delim && !fn_idx)
179
0
        fn_idx = fn_delim + strlen(HTS_IDX_DELIM);
180
181
0
    if (!fn_idx) {
182
0
        if (hts_idx_check_local(fn, HTS_FMT_CRAI, &tfn_idx) == 0 && hisremote(fn))
183
0
            tfn_idx = hts_idx_getfn(fn, ".crai");
184
185
0
        if (!tfn_idx) {
186
0
            hts_log_error("Could not retrieve index file for '%s'", fn);
187
0
            goto fail;
188
0
        }
189
0
        fn_idx = tfn_idx;
190
0
    }
191
192
0
    if (!(fp = hopen(fn_idx, "r"))) {
193
0
        hts_log_error("Could not open index file '%s'", fn_idx);
194
0
        goto fail;
195
0
    }
196
197
    // Load the file into memory
198
0
    while ((len = hread(fp, buf, sizeof(buf))) > 0) {
199
0
        if (kputsn(buf, len, &kstr) < 0)
200
0
            goto fail;
201
0
    }
202
203
0
    if (len < 0 || kstr.l < 2)
204
0
        goto fail;
205
206
0
    if (hclose(fp) < 0)
207
0
        goto fail;
208
209
    // Uncompress if required
210
0
    if (kstr.s[0] == 31 && (uc)kstr.s[1] == 139) {
211
0
        size_t l = 0;
212
0
        char *s = zlib_mem_inflate(kstr.s, kstr.l, &l);
213
0
        if (!s)
214
0
            goto fail;
215
216
0
        free(kstr.s);
217
0
        kstr.s = s;
218
0
        kstr.l = l;
219
0
        kstr.m = l; // conservative estimate of the size allocated
220
0
        if (kputsn("", 0, &kstr) < 0) // ensure kstr.s is NUL-terminated
221
0
            goto fail;
222
0
    }
223
224
225
    // Parse it line at a time
226
0
    while (pos < kstr.l) {
227
        /* 1.1 layout */
228
0
        if (kget_int32(&kstr, &pos, &e.refid) == -1)
229
0
            goto fail;
230
231
0
        if (kget_int32(&kstr, &pos, &e.start) == -1)
232
0
            goto fail;
233
234
0
        if (kget_int32(&kstr, &pos, &e.end) == -1)
235
0
            goto fail;
236
237
0
        if (kget_int64(&kstr, &pos, &e.offset) == -1)
238
0
            goto fail;
239
240
0
        if (kget_int32(&kstr, &pos, &e.slice) == -1)
241
0
            goto fail;
242
243
0
        if (kget_int32(&kstr, &pos, &e.len) == -1)
244
0
            goto fail;
245
246
0
        e.end += e.start-1;
247
        //printf("%d/%d..%d-offset=%" PRIu64 ",len=%d,slice=%d\n", e.refid, e.start, e.end, e.offset, e.len, e.slice);
248
249
0
        if (e.refid < -1) {
250
0
            hts_log_error("Malformed index file, refid %d", e.refid);
251
0
            goto fail;
252
0
        }
253
254
0
        if (e.refid != idx->refid) {
255
0
            if (fd->index_sz < e.refid+2) {
256
0
                cram_index *new_idx;
257
0
                int new_sz = e.refid+2;
258
0
                size_t index_end = fd->index_sz * sizeof(*fd->index);
259
0
                new_idx = realloc(fd->index,
260
0
                                  new_sz * sizeof(*fd->index));
261
0
                if (!new_idx)
262
0
                    goto fail;
263
264
0
                fd->index = new_idx;
265
0
                fd->index_sz = new_sz;
266
0
                memset(((char *)fd->index) + index_end, 0,
267
0
                       fd->index_sz * sizeof(*fd->index) - index_end);
268
0
            }
269
0
            idx = &fd->index[e.refid+1];
270
0
            idx->refid = e.refid;
271
0
            idx->start = INT_MIN;
272
0
            idx->end   = INT_MAX;
273
0
            idx->nslice = idx->nalloc = 0;
274
0
            idx->e = NULL;
275
0
            idx_stack[(idx_stack_ptr = 0)] = idx;
276
0
        }
277
278
0
        while (!(e.start >= idx->start && e.end <= idx->end) || idx->end == 0) {
279
0
            idx = idx_stack[--idx_stack_ptr];
280
0
        }
281
282
        // Now contains, so append
283
0
        if (idx->nslice+1 >= idx->nalloc) {
284
0
            cram_index *new_e;
285
0
            idx->nalloc = idx->nalloc ? idx->nalloc*2 : 16;
286
0
            new_e = realloc(idx->e, idx->nalloc * sizeof(*idx->e));
287
0
            if (!new_e)
288
0
                goto fail;
289
290
0
            idx->e = new_e;
291
0
        }
292
293
0
        e.nalloc = e.nslice = 0; e.e = NULL;
294
0
        *(ep = &idx->e[idx->nslice++]) = e;
295
0
        idx = ep;
296
297
0
        if (++idx_stack_ptr >= idx_stack_alloc) {
298
0
            cram_index **new_stack;
299
0
            idx_stack_alloc *= 2;
300
0
            new_stack = realloc(idx_stack, idx_stack_alloc*sizeof(*idx_stack));
301
0
            if (!new_stack)
302
0
                goto fail;
303
0
            idx_stack = new_stack;
304
0
        }
305
0
        idx_stack[idx_stack_ptr] = idx;
306
307
0
        while (pos < kstr.l && kstr.s[pos] != '\n')
308
0
            pos++;
309
0
        pos++;
310
0
    }
311
312
0
    free(idx_stack);
313
0
    free(kstr.s);
314
0
    free(tfn_idx);
315
316
    // dump_index(fd);
317
318
0
    return 0;
319
320
0
 fail:
321
0
    free(kstr.s);
322
0
    free(idx_stack);
323
0
    free(tfn_idx);
324
0
    cram_index_free(fd); // Also sets fd->index = NULL
325
0
    return -1;
326
0
}
327
328
0
static void cram_index_free_recurse(cram_index *e) {
329
0
    if (e->e) {
330
0
        int i;
331
0
        for (i = 0; i < e->nslice; i++) {
332
0
            cram_index_free_recurse(&e->e[i]);
333
0
        }
334
0
        free(e->e);
335
0
    }
336
0
}
337
338
0
void cram_index_free(cram_fd *fd) {
339
0
    int i;
340
341
0
    if (!fd->index)
342
0
        return;
343
344
0
    for (i = 0; i < fd->index_sz; i++) {
345
0
        cram_index_free_recurse(&fd->index[i]);
346
0
    }
347
0
    free(fd->index);
348
349
0
    fd->index = NULL;
350
0
}
351
352
/*
353
 * Searches the index for the first slice overlapping a reference ID
354
 * and position, or one immediately preceding it if none is found in
355
 * the index to overlap this position. (Our index may have missing
356
 * entries, but we require at least one per reference.)
357
 *
358
 * If the index finds multiple slices overlapping this position we
359
 * return the first one only. Subsequent calls should specifying
360
 * "from" as the last slice we checked to find the next one. Otherwise
361
 * set "from" to be NULL to find the first one.
362
 *
363
 * Refid can also be any of the special HTS_IDX_ values.
364
 * For backwards compatibility, refid -1 is equivalent to HTS_IDX_NOCOOR.
365
 *
366
 * Returns the cram_index pointer on success
367
 *         NULL on failure
368
 */
369
cram_index *cram_index_query(cram_fd *fd, int refid, hts_pos_t pos,
370
0
                             cram_index *from) {
371
0
    int i, j, k;
372
0
    cram_index *e;
373
374
0
    switch(refid) {
375
0
    case HTS_IDX_NONE:
376
0
    case HTS_IDX_REST:
377
        // fail, or already there, dealt with elsewhere.
378
0
        return NULL;
379
380
0
    case HTS_IDX_NOCOOR:
381
0
        refid = -1;
382
0
        pos = 0;
383
0
        break;
384
385
0
    case HTS_IDX_START: {
386
0
        int64_t min_idx = INT64_MAX;
387
0
        for (i = 0, j = -1; i < fd->index_sz; i++) {
388
0
            if (fd->index[i].e && fd->index[i].e[0].offset < min_idx) {
389
0
                min_idx = fd->index[i].e[0].offset;
390
0
                j = i;
391
0
            }
392
0
        }
393
0
        if (j < 0)
394
0
            return NULL;
395
0
        return fd->index[j].e;
396
0
    }
397
398
0
    default:
399
0
        if (refid < HTS_IDX_NONE || refid+1 >= fd->index_sz)
400
0
            return NULL;
401
0
    }
402
403
0
    if (!from)
404
0
        from = &fd->index[refid+1];
405
406
    // Ref with nothing aligned against it.
407
0
    if (!from->e)
408
0
        return NULL;
409
410
    // This sequence is covered by the index, so binary search to find
411
    // the optimal starting block.
412
0
    i = 0, j = fd->index[refid+1].nslice-1;
413
0
    for (k = j/2; k != i; k = (j-i)/2 + i) {
414
0
        if (from->e[k].refid > refid) {
415
0
            j = k;
416
0
            continue;
417
0
        }
418
419
0
        if (from->e[k].refid < refid) {
420
0
            i = k;
421
0
            continue;
422
0
        }
423
424
0
        if (from->e[k].start >= pos) {
425
0
            j = k;
426
0
            continue;
427
0
        }
428
429
0
        if (from->e[k].start < pos) {
430
0
            i = k;
431
0
            continue;
432
0
        }
433
0
    }
434
    // i==j or i==j-1. Check if j is better.
435
0
    if (j >= 0 && from->e[j].start < pos && from->e[j].refid == refid)
436
0
        i = j;
437
438
    /* The above found *a* bin overlapping, but not necessarily the first */
439
0
    while (i > 0 && from->e[i-1].end >= pos)
440
0
        i--;
441
442
    /* We may be one bin before the optimum, so check */
443
0
    while (i+1 < from->nslice &&
444
0
           (from->e[i].refid < refid ||
445
0
            from->e[i].end < pos))
446
0
        i++;
447
448
0
    e = &from->e[i];
449
450
0
    return e;
451
0
}
452
453
// Return the index entry for last slice on a specific reference.
454
0
cram_index *cram_index_last(cram_fd *fd, int refid, cram_index *from) {
455
0
    int slice;
456
457
0
    if (refid+1 < 0 || refid+1 >= fd->index_sz)
458
0
        return NULL;
459
460
0
    if (!from)
461
0
        from = &fd->index[refid+1];
462
463
    // Ref with nothing aligned against it.
464
0
    if (!from->e)
465
0
        return NULL;
466
467
0
    slice = fd->index[refid+1].nslice - 1;
468
469
0
    return &from->e[slice];
470
0
}
471
472
0
cram_index *cram_index_query_last(cram_fd *fd, int refid, hts_pos_t end) {
473
0
    cram_index *first = cram_index_query(fd, refid, end, NULL);
474
0
    cram_index *last =  cram_index_last(fd, refid, NULL);
475
0
    if (!first || !last)
476
0
        return NULL;
477
478
0
    while (first < last && (first+1)->start <= end)
479
0
        first++;
480
481
0
    while (first->e) {
482
0
        int count = 0;
483
0
        int nslices = first->nslice;
484
0
        first = first->e;
485
0
        while (++count < nslices && (first+1)->start <= end)
486
0
            first++;
487
0
    }
488
489
    // Compute the start location of next container.
490
    //
491
    // This is useful for stitching containers together in the multi-region
492
    // iterator.  Sadly we can't compute this from the single index line.
493
    //
494
    // Note we can have neighbouring index entries at the same location
495
    // for when we have multi-reference mode and/or multiple slices per
496
    // container.
497
0
    cram_index *next = first;
498
0
    do {
499
0
        if (next >= last) {
500
            // Next non-empty reference
501
0
            while (++refid+1 < fd->index_sz)
502
0
                if (fd->index[refid+1].nslice)
503
0
                    break;
504
0
            if (refid+1 >= fd->index_sz) {
505
0
                next = NULL;
506
0
            } else {
507
0
                next = fd->index[refid+1].e;
508
0
                last = fd->index[refid+1].e + fd->index[refid+1].nslice;
509
0
            }
510
0
        } else {
511
0
            next++;
512
0
        }
513
0
    } while (next && next->offset == first->offset);
514
515
0
    first->next = next ? next->offset : 0;
516
517
0
    return first;
518
0
}
519
520
/*
521
 * Skips to a container overlapping the start coordinate listed in
522
 * cram_range.
523
 *
524
 * In theory we call cram_index_query multiple times, once per slice
525
 * overlapping the range. However slices may be absent from the index
526
 * which makes this problematic. Instead we find the left-most slice
527
 * and then read from then on, skipping decoding of slices and/or
528
 * whole containers when they don't overlap the specified cram_range.
529
 *
530
 * This function also updates the cram_fd range field.
531
 *
532
 * Returns 0 on success
533
 *        -1 on general failure
534
 *        -2 on no-data (empty chromosome)
535
 */
536
0
int cram_seek_to_refpos(cram_fd *fd, cram_range *r) {
537
0
    int ret = 0;
538
0
    cram_index *e;
539
540
0
    if (r->refid == HTS_IDX_NONE) {
541
0
        ret = -2; goto err;
542
0
    }
543
544
    // Ideally use an index, so see if we have one.
545
0
    if ((e = cram_index_query(fd, r->refid, r->start, NULL))) {
546
0
        if (0 != cram_seek(fd, e->offset, SEEK_SET)) {
547
0
            if (0 != cram_seek(fd, e->offset - fd->first_container, SEEK_CUR)) {
548
0
                ret = -1; goto err;
549
0
            }
550
0
        }
551
0
    } else {
552
        // Absent from index, but this most likely means it simply has no data.
553
0
        ret = -2; goto err;
554
0
    }
555
556
0
    pthread_mutex_lock(&fd->range_lock);
557
0
    fd->range = *r;
558
0
    if (r->refid == HTS_IDX_NOCOOR) {
559
0
        fd->range.refid = -1;
560
0
        fd->range.start = 0;
561
0
    } else if (r->refid == HTS_IDX_START || r->refid == HTS_IDX_REST) {
562
0
        fd->range.refid = -2; // special case in cram_next_slice
563
0
    }
564
0
    pthread_mutex_unlock(&fd->range_lock);
565
566
0
    if (fd->ctr) {
567
0
        cram_free_container(fd->ctr);
568
0
        if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
569
0
            cram_free_container(fd->ctr_mt);
570
0
        fd->ctr = NULL;
571
0
        fd->ctr_mt = NULL;
572
0
        fd->ooc = 0;
573
0
        fd->eof = 0;
574
0
    }
575
576
0
    return 0;
577
578
0
 err:
579
    // It's unlikely fd->range will be accessed after EOF or error,
580
    // but this maintains identical behaviour to the previous code.
581
0
    pthread_mutex_lock(&fd->range_lock);
582
0
    fd->range = *r;
583
0
    pthread_mutex_unlock(&fd->range_lock);
584
0
    return ret;
585
0
}
586
587
588
/*
589
 * A specialised form of cram_index_build (below) that deals with slices
590
 * having multiple references in this (ref_id -2). In this scenario we
591
 * decode the slice to look at the RI data series instead.
592
 *
593
 * Returns 0 on success
594
 *        -1 on read failure
595
 *        -2 on wrong sort order
596
 *        -4 on write failure
597
 */
598
static int cram_index_build_multiref(cram_fd *fd,
599
                                     cram_container *c,
600
                                     cram_slice *s,
601
                                     BGZF *fp,
602
                                     off_t cpos,
603
                                     int32_t landmark,
604
0
                                     int sz) {
605
0
    int i, ref = -2;
606
0
    int64_t ref_start = 0, ref_end;
607
0
    char buf[1024];
608
609
0
    if (fd->mode != 'w') {
610
0
        if (0 != cram_decode_slice(fd, c, s, fd->header))
611
0
            return -1;
612
0
    }
613
614
0
    ref_end = INT_MIN;
615
616
0
    int32_t last_ref = -9;
617
0
    int32_t last_pos = -9;
618
0
    for (i = 0; i < s->hdr->num_records; i++) {
619
0
        if (s->crecs[i].ref_id == last_ref && s->crecs[i].apos < last_pos) {
620
0
            hts_log_error("CRAM file is not sorted by chromosome / position");
621
0
            return -2;
622
0
        }
623
0
        last_ref = s->crecs[i].ref_id;
624
0
        last_pos = s->crecs[i].apos;
625
626
0
        if (s->crecs[i].ref_id == ref) {
627
0
            if (ref_end < s->crecs[i].aend)
628
0
                ref_end = s->crecs[i].aend;
629
0
            continue;
630
0
        }
631
632
0
        if (ref != -2) {
633
0
            sprintf(buf, "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n",
634
0
                    ref, ref_start, ref_end - ref_start + 1,
635
0
                    (int64_t)cpos, landmark, sz);
636
0
            if (bgzf_write(fp, buf, strlen(buf)) < 0)
637
0
                return -4;
638
0
        }
639
640
0
        ref = s->crecs[i].ref_id;
641
0
        ref_start = s->crecs[i].apos;
642
0
        ref_end   = s->crecs[i].aend;
643
0
    }
644
645
0
    if (ref != -2) {
646
0
        sprintf(buf, "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n",
647
0
                ref, ref_start, ref_end - ref_start + 1,
648
0
                (int64_t)cpos, landmark, sz);
649
0
        if (bgzf_write(fp, buf, strlen(buf)) < 0)
650
0
            return -4;
651
0
    }
652
653
0
    return 0;
654
0
}
655
656
/*
657
 * Adds a single slice to the index.
658
 */
659
int cram_index_slice(cram_fd *fd,
660
                     cram_container *c,
661
                     cram_slice *s,
662
                     BGZF *fp,
663
                     off_t cpos,
664
                     off_t spos, // relative to cpos
665
0
                     off_t sz) {
666
0
    int ret;
667
0
    char buf[1024];
668
669
0
    if (sz > INT_MAX) {
670
0
        hts_log_error("CRAM slice is too big (%"PRId64" bytes)",
671
0
                      (int64_t) sz);
672
0
        return -1;
673
0
    }
674
675
0
    if (s->hdr->ref_seq_id == -2) {
676
0
        ret = cram_index_build_multiref(fd, c, s, fp, cpos, spos, sz);
677
0
    } else {
678
0
        sprintf(buf, "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n",
679
0
                s->hdr->ref_seq_id, s->hdr->ref_seq_start,
680
0
                s->hdr->ref_seq_span, (int64_t)cpos, (int)spos, (int)sz);
681
0
        ret = (bgzf_write(fp, buf, strlen(buf)) >= 0)? 0 : -4;
682
0
    }
683
684
0
    return ret;
685
0
}
686
687
/*
688
 * Adds a single container to the index.
689
 */
690
static
691
int cram_index_container(cram_fd *fd,
692
                         cram_container *c,
693
                         BGZF *fp,
694
0
                         off_t cpos) {
695
0
    int j;
696
0
    off_t spos;
697
698
    // 2.0 format
699
0
    for (j = 0; j < c->num_landmarks; j++) {
700
0
        cram_slice *s;
701
0
        off_t sz;
702
0
        int ret;
703
704
0
        spos = htell(fd->fp);
705
0
        if (spos - cpos - c->offset != c->landmark[j]) {
706
0
            hts_log_error("CRAM slice offset %"PRId64" does not match"
707
0
                          " landmark %d in container header (%d)",
708
0
                          spos - cpos - c->offset, j, c->landmark[j]);
709
0
            return -1;
710
0
        }
711
712
0
        if (!(s = cram_read_slice(fd))) {
713
0
            return -1;
714
0
        }
715
716
0
        sz = htell(fd->fp) - spos;
717
0
        ret = cram_index_slice(fd, c, s, fp, cpos, c->landmark[j], sz);
718
719
0
        cram_free_slice(s);
720
721
0
        if (ret < 0) {
722
0
            return ret;
723
0
        }
724
0
    }
725
726
0
    return 0;
727
0
}
728
729
730
/*
731
 * Builds an index file.
732
 *
733
 * fd is a newly opened cram file that we wish to index.
734
 * fn_base is the filename of the associated CRAM file.
735
 * fn_idx is the filename of the index file to be written;
736
 * if NULL, we add ".crai" to fn_base to get the index filename.
737
 *
738
 * Returns 0 on success,
739
 *         negative on failure (-1 for read failure, -4 for write failure)
740
 */
741
0
int cram_index_build(cram_fd *fd, const char *fn_base, const char *fn_idx) {
742
0
    cram_container *c;
743
0
    off_t cpos, hpos;
744
0
    BGZF *fp;
745
0
    kstring_t fn_idx_str = {0};
746
0
    int64_t last_ref = -9, last_start = -9;
747
748
    // Useful for cram_index_build_multiref
749
0
    cram_set_option(fd, CRAM_OPT_REQUIRED_FIELDS, SAM_RNAME | SAM_POS | SAM_CIGAR);
750
751
0
    if (! fn_idx) {
752
0
        kputs(fn_base, &fn_idx_str);
753
0
        kputs(".crai", &fn_idx_str);
754
0
        fn_idx = fn_idx_str.s;
755
0
    }
756
757
0
    if (!(fp = bgzf_open(fn_idx, "wg"))) {
758
0
        perror(fn_idx);
759
0
        free(fn_idx_str.s);
760
0
        return -4;
761
0
    }
762
763
0
    free(fn_idx_str.s);
764
765
0
    cpos = htell(fd->fp);
766
0
    while ((c = cram_read_container(fd))) {
767
0
        if (fd->err) {
768
0
            perror("Cram container read");
769
0
            return -1;
770
0
        }
771
772
0
        hpos = htell(fd->fp);
773
774
0
        if (!(c->comp_hdr_block = cram_read_block(fd)))
775
0
            return -1;
776
0
        assert(c->comp_hdr_block->content_type == COMPRESSION_HEADER);
777
778
0
        c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block);
779
0
        if (!c->comp_hdr)
780
0
            return -1;
781
782
0
        if (c->ref_seq_id == last_ref && c->ref_seq_start < last_start) {
783
0
            hts_log_error("CRAM file is not sorted by chromosome / position");
784
0
            return -2;
785
0
        }
786
0
        last_ref = c->ref_seq_id;
787
0
        last_start = c->ref_seq_start;
788
789
0
        if (cram_index_container(fd, c, fp, cpos) < 0) {
790
0
            bgzf_close(fp);
791
0
            return -1;
792
0
        }
793
794
0
        cpos = htell(fd->fp);
795
0
        assert(cpos == hpos + c->length);
796
797
0
        cram_free_container(c);
798
0
    }
799
0
    if (fd->err) {
800
0
        bgzf_close(fp);
801
0
        return -1;
802
0
    }
803
804
0
    return (bgzf_close(fp) >= 0)? 0 : -4;
805
0
}