/src/htslib/cram/cram_index.c
Line | Count | Source |
1 | | /* |
2 | | Copyright (c) 2013-2020, 2023-2024 Genome Research Ltd. |
3 | | Author: James Bonfield <jkb@sanger.ac.uk> |
4 | | |
5 | | Redistribution and use in source and binary forms, with or without |
6 | | modification, are permitted provided that the following conditions are met: |
7 | | |
8 | | 1. Redistributions of source code must retain the above copyright notice, |
9 | | this list of conditions and the following disclaimer. |
10 | | |
11 | | 2. Redistributions in binary form must reproduce the above copyright notice, |
12 | | this list of conditions and the following disclaimer in the documentation |
13 | | and/or other materials provided with the distribution. |
14 | | |
15 | | 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger |
16 | | Institute nor the names of its contributors may be used to endorse or promote |
17 | | products derived from this software without specific prior written permission. |
18 | | |
19 | | THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND |
20 | | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
21 | | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
22 | | DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE |
23 | | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
24 | | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
25 | | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
26 | | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
27 | | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
28 | | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
29 | | */ |
30 | | |
31 | | /* |
32 | | * The index is a gzipped tab-delimited text file with one line per slice. |
33 | | * The columns are: |
34 | | * 1: reference number (0 to N-1, as per BAM ref_id) |
35 | | * 2: reference position of 1st read in slice (1..?) |
36 | | * 3: number of reads in slice |
37 | | * 4: offset of container start (relative to end of SAM header, so 1st |
38 | | * container is offset 0). |
39 | | * 5: slice number within container (ie which landmark). |
40 | | * |
41 | | * In memory, we hold this in a nested containment list. Each list element is |
42 | | * a cram_index struct. Each element in turn can contain its own list of |
43 | | * cram_index structs. |
44 | | * |
45 | | * Any start..end range which is entirely contained within another (and |
46 | | * earlier as it is sorted) range will be held within it. This ensures that |
47 | | * the outer list will never have containments and we can safely do a |
48 | | * binary search to find the first range which overlaps any given coordinate. |
49 | | */ |
50 | | |
51 | | #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h |
52 | | #include <config.h> |
53 | | |
54 | | #include <stdio.h> |
55 | | #include <errno.h> |
56 | | #include <assert.h> |
57 | | #include <inttypes.h> |
58 | | #include <stdlib.h> |
59 | | #include <string.h> |
60 | | #include <zlib.h> |
61 | | #include <sys/types.h> |
62 | | #include <sys/stat.h> |
63 | | #include <math.h> |
64 | | |
65 | | #include "../htslib/bgzf.h" |
66 | | #include "../htslib/hfile.h" |
67 | | #include "../htslib/hts_alloc.h" |
68 | | #include "../hts_internal.h" |
69 | | #include "cram.h" |
70 | | #include "os.h" |
71 | | |
72 | | #if 0 |
73 | | static void dump_index_(cram_index *e, int level) { |
74 | | int i, n; |
75 | | n = printf("%*s%d / %d .. %d, ", level*4, "", e->refid, e->start, e->end); |
76 | | printf("%*soffset %"PRId64" %p %p\n", MAX(0,50-n), "", e->offset, e, e->e_next); |
77 | | for (i = 0; i < e->nslice; i++) { |
78 | | dump_index_(&e->e[i], level+1); |
79 | | } |
80 | | } |
81 | | |
82 | | static void dump_index(cram_fd *fd) { |
83 | | int i; |
84 | | for (i = 0; i < fd->index_sz; i++) { |
85 | | dump_index_(&fd->index[i], 0); |
86 | | } |
87 | | } |
88 | | #endif |
89 | | |
90 | | // Thread a linked list through the nested containment list. |
91 | | // This makes navigating it and finding the "next" index entry |
92 | | // trivial. |
93 | 0 | static cram_index *link_index_(cram_index *e, cram_index *e_last) { |
94 | 0 | int i; |
95 | 0 | if (e_last) |
96 | 0 | e_last->e_next = e; |
97 | | |
98 | | // We don't want to link in the top-level cram_index with |
99 | | // offset=0 and start/end = INT_MIN/INT_MAX. |
100 | 0 | if (e->offset) |
101 | 0 | e_last = e; |
102 | |
|
103 | 0 | for (i = 0; i < e->nslice; i++) |
104 | 0 | e_last = link_index_(&e->e[i], e_last); |
105 | |
|
106 | 0 | return e_last; |
107 | 0 | } |
108 | | |
109 | 0 | static void link_index(cram_fd *fd) { |
110 | 0 | int i; |
111 | 0 | cram_index *e_last = NULL; |
112 | |
|
113 | 0 | for (i = 0; i < fd->index_sz; i++) { |
114 | 0 | e_last = link_index_(&fd->index[i], e_last); |
115 | 0 | } |
116 | |
|
117 | 0 | if (e_last) |
118 | 0 | e_last->e_next = NULL; |
119 | 0 | } |
120 | | |
121 | 0 | static int kget_int32(kstring_t *k, size_t *pos, int32_t *val_p) { |
122 | 0 | int sign = 1; |
123 | 0 | int32_t val = 0; |
124 | 0 | size_t p = *pos; |
125 | |
|
126 | 0 | while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t')) |
127 | 0 | p++; |
128 | |
|
129 | 0 | if (p < k->l && k->s[p] == '-') |
130 | 0 | sign = -1, p++; |
131 | |
|
132 | 0 | if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9')) |
133 | 0 | return -1; |
134 | | |
135 | 0 | while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9') { |
136 | 0 | int digit = k->s[p++]-'0'; |
137 | 0 | val = val*10 + digit; |
138 | 0 | } |
139 | |
|
140 | 0 | *pos = p; |
141 | 0 | *val_p = sign*val; |
142 | |
|
143 | 0 | return 0; |
144 | 0 | } |
145 | | |
146 | 0 | static int kget_int64(kstring_t *k, size_t *pos, int64_t *val_p) { |
147 | 0 | int sign = 1; |
148 | 0 | int64_t val = 0; |
149 | 0 | size_t p = *pos; |
150 | |
|
151 | 0 | while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t')) |
152 | 0 | p++; |
153 | |
|
154 | 0 | if (p < k->l && k->s[p] == '-') |
155 | 0 | sign = -1, p++; |
156 | |
|
157 | 0 | if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9')) |
158 | 0 | return -1; |
159 | | |
160 | 0 | while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9') { |
161 | 0 | int digit = k->s[p++]-'0'; |
162 | 0 | val = val*10 + digit; |
163 | 0 | } |
164 | |
|
165 | 0 | *pos = p; |
166 | 0 | *val_p = sign*val; |
167 | |
|
168 | 0 | return 0; |
169 | 0 | } |
170 | | |
171 | | /* |
172 | | * Loads a CRAM .crai index into memory. |
173 | | * |
174 | | * Returns 0 for success |
175 | | * -1 for failure |
176 | | */ |
177 | 0 | int cram_index_load(cram_fd *fd, const char *fn, const char *fn_idx) { |
178 | |
|
179 | 0 | char *tfn_idx = NULL; |
180 | 0 | char buf[65536]; |
181 | 0 | ssize_t len; |
182 | 0 | kstring_t kstr = {0}; |
183 | 0 | hFILE *fp; |
184 | 0 | cram_index *idx; |
185 | 0 | cram_index **idx_stack = NULL, *ep, e; |
186 | 0 | int idx_stack_alloc = 0, idx_stack_ptr = 0; |
187 | 0 | size_t pos = 0; |
188 | | |
189 | | /* Check if already loaded */ |
190 | 0 | if (fd->index) |
191 | 0 | return 0; |
192 | | |
193 | 0 | fd->index = calloc((fd->index_sz = 1), sizeof(*fd->index)); |
194 | 0 | if (!fd->index) |
195 | 0 | return -1; |
196 | | |
197 | 0 | idx = &fd->index[0]; |
198 | 0 | idx->refid = -1; |
199 | 0 | idx->start = INT_MIN; |
200 | 0 | idx->end = INT_MAX; |
201 | |
|
202 | 0 | idx_stack = calloc(++idx_stack_alloc, sizeof(*idx_stack)); |
203 | 0 | if (!idx_stack) |
204 | 0 | goto fail; |
205 | | |
206 | 0 | idx_stack[idx_stack_ptr] = idx; |
207 | | |
208 | | // Support pathX.cram##idx##pathY.crai |
209 | 0 | const char *fn_delim = strstr(fn, HTS_IDX_DELIM); |
210 | 0 | if (fn_delim && !fn_idx) |
211 | 0 | fn_idx = fn_delim + strlen(HTS_IDX_DELIM); |
212 | |
|
213 | 0 | if (!fn_idx) { |
214 | 0 | if (hts_idx_check_local(fn, HTS_FMT_CRAI, &tfn_idx) == 0 && hisremote(fn)) |
215 | 0 | tfn_idx = hts_idx_getfn(fn, ".crai"); |
216 | |
|
217 | 0 | if (!tfn_idx) { |
218 | 0 | hts_log_error("Could not retrieve index file for '%s'", fn); |
219 | 0 | goto fail; |
220 | 0 | } |
221 | 0 | fn_idx = tfn_idx; |
222 | 0 | } |
223 | | |
224 | 0 | if (!(fp = hopen(fn_idx, "r"))) { |
225 | 0 | hts_log_error("Could not open index file '%s'", fn_idx); |
226 | 0 | goto fail; |
227 | 0 | } |
228 | | |
229 | | // Load the file into memory |
230 | 0 | while ((len = hread(fp, buf, sizeof(buf))) > 0) { |
231 | 0 | if (kputsn(buf, len, &kstr) < 0) |
232 | 0 | goto fail; |
233 | 0 | } |
234 | | |
235 | 0 | if (len < 0 || kstr.l < 2) |
236 | 0 | goto fail; |
237 | | |
238 | 0 | if (hclose(fp) < 0) |
239 | 0 | goto fail; |
240 | | |
241 | | // Uncompress if required |
242 | 0 | if (kstr.s[0] == 31 && (uc)kstr.s[1] == 139) { |
243 | 0 | size_t l = 0; |
244 | 0 | char *s = zlib_mem_inflate(kstr.s, kstr.l, &l); |
245 | 0 | if (!s) |
246 | 0 | goto fail; |
247 | | |
248 | 0 | free(kstr.s); |
249 | 0 | kstr.s = s; |
250 | 0 | kstr.l = l; |
251 | 0 | kstr.m = l; // conservative estimate of the size allocated |
252 | 0 | if (kputsn("", 0, &kstr) < 0) // ensure kstr.s is NUL-terminated |
253 | 0 | goto fail; |
254 | 0 | } |
255 | | |
256 | | |
257 | | // refid indexes fd->index, so bound it to the header's reference count. |
258 | 0 | int nref = sam_hdr_nref(fd->header); |
259 | | |
260 | | // Parse it line at a time |
261 | 0 | while (pos < kstr.l) { |
262 | | /* 1.1 layout */ |
263 | 0 | if (kget_int32(&kstr, &pos, &e.refid) == -1) |
264 | 0 | goto fail; |
265 | | |
266 | 0 | if (kget_int32(&kstr, &pos, &e.start) == -1) |
267 | 0 | goto fail; |
268 | | |
269 | 0 | if (kget_int32(&kstr, &pos, &e.end) == -1) |
270 | 0 | goto fail; |
271 | | |
272 | 0 | if (kget_int64(&kstr, &pos, &e.offset) == -1) |
273 | 0 | goto fail; |
274 | | |
275 | 0 | if (kget_int32(&kstr, &pos, &e.slice) == -1) |
276 | 0 | goto fail; |
277 | | |
278 | 0 | if (kget_int32(&kstr, &pos, &e.len) == -1) |
279 | 0 | goto fail; |
280 | | |
281 | 0 | e.end += e.start-1; |
282 | | //printf("%d/%d..%d-offset=%" PRIu64 ",len=%d,slice=%d\n", e.refid, e.start, e.end, e.offset, e.len, e.slice); |
283 | |
|
284 | 0 | if (e.refid < -1 || e.refid >= nref) { |
285 | 0 | hts_log_error("Malformed index file, refid %d", e.refid); |
286 | 0 | goto fail; |
287 | 0 | } |
288 | | |
289 | 0 | if (e.refid != idx->refid) { |
290 | 0 | if (fd->index_sz < e.refid+2) { |
291 | 0 | cram_index *new_idx; |
292 | 0 | int new_sz = e.refid+2; |
293 | 0 | size_t index_end = fd->index_sz * sizeof(*fd->index); |
294 | 0 | new_idx = hts_realloc_p(fd->index, sizeof(*fd->index), |
295 | 0 | new_sz); |
296 | 0 | if (!new_idx) |
297 | 0 | goto fail; |
298 | | |
299 | 0 | fd->index = new_idx; |
300 | 0 | fd->index_sz = new_sz; |
301 | 0 | memset(((char *)fd->index) + index_end, 0, |
302 | 0 | fd->index_sz * sizeof(*fd->index) - index_end); |
303 | 0 | } |
304 | 0 | idx = &fd->index[e.refid+1]; |
305 | 0 | idx->refid = e.refid; |
306 | 0 | idx->start = INT_MIN; |
307 | 0 | idx->end = INT_MAX; |
308 | 0 | idx->nslice = idx->nalloc = 0; |
309 | 0 | idx->e = NULL; |
310 | 0 | idx_stack[(idx_stack_ptr = 0)] = idx; |
311 | 0 | } |
312 | | |
313 | 0 | while (!(e.start >= idx->start && e.end <= idx->end) || |
314 | 0 | (idx->start == 0 && idx->refid == -1)) { |
315 | 0 | idx = idx_stack[--idx_stack_ptr]; |
316 | 0 | } |
317 | | |
318 | | // Now contains, so append |
319 | 0 | if (idx->nslice+1 >= idx->nalloc) { |
320 | 0 | cram_index *new_e; |
321 | 0 | idx->nalloc = idx->nalloc ? idx->nalloc*2 : 16; |
322 | 0 | new_e = hts_realloc_p(idx->e, sizeof(*idx->e), idx->nalloc); |
323 | 0 | if (!new_e) |
324 | 0 | goto fail; |
325 | | |
326 | 0 | idx->e = new_e; |
327 | 0 | } |
328 | | |
329 | 0 | e.nalloc = e.nslice = 0; e.e = NULL; |
330 | 0 | *(ep = &idx->e[idx->nslice++]) = e; |
331 | 0 | idx = ep; |
332 | |
|
333 | 0 | if (++idx_stack_ptr >= idx_stack_alloc) { |
334 | 0 | cram_index **new_stack; |
335 | 0 | idx_stack_alloc *= 2; |
336 | 0 | new_stack = hts_realloc_p(idx_stack, sizeof(*idx_stack), idx_stack_alloc); |
337 | 0 | if (!new_stack) |
338 | 0 | goto fail; |
339 | 0 | idx_stack = new_stack; |
340 | 0 | } |
341 | 0 | idx_stack[idx_stack_ptr] = idx; |
342 | |
|
343 | 0 | while (pos < kstr.l && kstr.s[pos] != '\n') |
344 | 0 | pos++; |
345 | 0 | pos++; |
346 | 0 | } |
347 | | |
348 | 0 | free(idx_stack); |
349 | 0 | free(kstr.s); |
350 | 0 | free(tfn_idx); |
351 | | |
352 | | // Convert NCList to linear linked list |
353 | 0 | link_index(fd); |
354 | | |
355 | | //dump_index(fd); |
356 | |
|
357 | 0 | return 0; |
358 | | |
359 | 0 | fail: |
360 | 0 | free(kstr.s); |
361 | 0 | free(idx_stack); |
362 | 0 | free(tfn_idx); |
363 | 0 | cram_index_free(fd); // Also sets fd->index = NULL |
364 | 0 | return -1; |
365 | 0 | } |
366 | | |
367 | 0 | static void cram_index_free_recurse(cram_index *e) { |
368 | 0 | if (e->e) { |
369 | 0 | int i; |
370 | 0 | for (i = 0; i < e->nslice; i++) { |
371 | 0 | cram_index_free_recurse(&e->e[i]); |
372 | 0 | } |
373 | 0 | free(e->e); |
374 | 0 | } |
375 | 0 | } |
376 | | |
377 | 0 | void cram_index_free(cram_fd *fd) { |
378 | 0 | int i; |
379 | |
|
380 | 0 | if (!fd->index) |
381 | 0 | return; |
382 | | |
383 | 0 | for (i = 0; i < fd->index_sz; i++) { |
384 | 0 | cram_index_free_recurse(&fd->index[i]); |
385 | 0 | } |
386 | 0 | free(fd->index); |
387 | |
|
388 | 0 | fd->index = NULL; |
389 | 0 | } |
390 | | |
391 | | /* |
392 | | * Searches the index for the first slice overlapping a reference ID |
393 | | * and position, or one immediately preceding it if none is found in |
394 | | * the index to overlap this position. (Our index may have missing |
395 | | * entries, but we require at least one per reference.) |
396 | | * |
397 | | * If the index finds multiple slices overlapping this position we |
398 | | * return the first one only. Subsequent calls should specify |
399 | | * "from" as the last slice we checked to find the next one. Otherwise |
400 | | * set "from" to be NULL to find the first one. |
401 | | * |
402 | | * Refid can also be any of the special HTS_IDX_ values. |
403 | | * For backwards compatibility, refid -1 is equivalent to HTS_IDX_NOCOOR. |
404 | | * |
405 | | * Returns the cram_index pointer on success |
406 | | * NULL on failure |
407 | | */ |
408 | | cram_index *cram_index_query(cram_fd *fd, int refid, hts_pos_t pos, |
409 | 0 | cram_index *from) { |
410 | 0 | int i, j, k; |
411 | 0 | cram_index *e; |
412 | |
|
413 | 0 | if (from) { |
414 | | // Continue from a previous search. |
415 | | // We switch to just scanning the linked list, as the nested |
416 | | // lists are typically short. |
417 | 0 | if (refid == HTS_IDX_NOCOOR) |
418 | 0 | refid = -1; |
419 | |
|
420 | 0 | e = from->e_next; |
421 | 0 | if (e && e->refid == refid && e->start <= pos) |
422 | 0 | return e; |
423 | 0 | else |
424 | 0 | return NULL; |
425 | 0 | } |
426 | | |
427 | 0 | switch(refid) { |
428 | 0 | case HTS_IDX_NONE: |
429 | 0 | case HTS_IDX_REST: |
430 | | // fail, or already there, dealt with elsewhere. |
431 | 0 | return NULL; |
432 | | |
433 | 0 | case -1: |
434 | 0 | case HTS_IDX_NOCOOR: |
435 | 0 | refid = -1; |
436 | 0 | pos = 0; |
437 | 0 | break; |
438 | | |
439 | 0 | case HTS_IDX_START: { |
440 | 0 | int64_t min_idx = INT64_MAX; |
441 | 0 | for (i = 0, j = -1; i < fd->index_sz; i++) { |
442 | 0 | if (fd->index[i].e && fd->index[i].e[0].offset < min_idx) { |
443 | 0 | min_idx = fd->index[i].e[0].offset; |
444 | 0 | j = i; |
445 | 0 | } |
446 | 0 | } |
447 | 0 | if (j < 0) |
448 | 0 | return NULL; |
449 | 0 | return fd->index[j].e; |
450 | 0 | } |
451 | | |
452 | 0 | default: |
453 | 0 | if (refid < HTS_IDX_NONE || refid+1 >= fd->index_sz) |
454 | 0 | return NULL; |
455 | 0 | } |
456 | | |
457 | 0 | from = &fd->index[refid+1]; |
458 | | |
459 | | // Ref with nothing aligned against it. |
460 | 0 | if (!from->e) |
461 | 0 | return NULL; |
462 | | |
463 | | // This sequence is covered by the index, so binary search to find |
464 | | // the optimal starting block. |
465 | 0 | i = 0, j = fd->index[refid+1].nslice-1; |
466 | 0 | for (k = j/2; k != i; k = (j-i)/2 + i) { |
467 | 0 | if (from->e[k].refid > refid) { |
468 | 0 | j = k; |
469 | 0 | continue; |
470 | 0 | } |
471 | | |
472 | 0 | if (from->e[k].refid < refid) { |
473 | 0 | i = k; |
474 | 0 | continue; |
475 | 0 | } |
476 | | |
477 | 0 | if (from->e[k].start >= pos) { |
478 | 0 | j = k; |
479 | 0 | continue; |
480 | 0 | } |
481 | | |
482 | 0 | if (from->e[k].start < pos) { |
483 | 0 | i = k; |
484 | 0 | continue; |
485 | 0 | } |
486 | 0 | } |
487 | | // i==j or i==j-1. Check if j is better. |
488 | 0 | if (j >= 0 && from->e[j].start < pos && from->e[j].refid == refid) |
489 | 0 | i = j; |
490 | | |
491 | | /* The above found *a* bin overlapping, but not necessarily the first */ |
492 | 0 | while (i > 0 && from->e[i-1].end >= pos) |
493 | 0 | i--; |
494 | | |
495 | | /* We may be one bin before the optimum, so check */ |
496 | 0 | while (i+1 < from->nslice && |
497 | 0 | (from->e[i].refid < refid || |
498 | 0 | from->e[i].end < pos)) |
499 | 0 | i++; |
500 | |
|
501 | 0 | e = &from->e[i]; |
502 | |
|
503 | 0 | return e; |
504 | 0 | } |
505 | | |
506 | | // Return the index entry for last slice on a specific reference. |
507 | 0 | cram_index *cram_index_last(cram_fd *fd, int refid, cram_index *from) { |
508 | 0 | int slice; |
509 | |
|
510 | 0 | if (refid+1 < 0 || refid+1 >= fd->index_sz) |
511 | 0 | return NULL; |
512 | | |
513 | 0 | if (!from) |
514 | 0 | from = &fd->index[refid+1]; |
515 | | |
516 | | // Ref with nothing aligned against it. |
517 | 0 | if (!from->e) |
518 | 0 | return NULL; |
519 | | |
520 | 0 | slice = fd->index[refid+1].nslice - 1; |
521 | | |
522 | | // e is the last entry in the nested containment list, but it may |
523 | | // contain further slices within it. |
524 | 0 | cram_index *e = &from->e[slice]; |
525 | 0 | while (e->e_next) |
526 | 0 | e = e->e_next; |
527 | |
|
528 | 0 | return e; |
529 | 0 | } |
530 | | |
531 | | /* |
532 | | * Find the last container overlapping pos 'end', and the file offset of |
533 | | * its end (equivalent to the start offset of the container following it). |
534 | | */ |
535 | 0 | cram_index *cram_index_query_last(cram_fd *fd, int refid, hts_pos_t end) { |
536 | 0 | cram_index *e = NULL, *prev_e; |
537 | 0 | do { |
538 | 0 | prev_e = e; |
539 | 0 | e = cram_index_query(fd, refid, end, prev_e); |
540 | 0 | } while (e); |
541 | |
|
542 | 0 | if (!prev_e) |
543 | 0 | return NULL; |
544 | 0 | e = prev_e; |
545 | | |
546 | | // Note: offset of e and e->e_next may be the same if we're using a |
547 | | // multi-ref container where a single container generates multiple |
548 | | // index entries. |
549 | | // |
550 | | // We need to keep iterating until offset differs in order to find |
551 | | // the genuine file offset for the end of container. |
552 | 0 | do { |
553 | 0 | prev_e = e; |
554 | 0 | e = e->e_next; |
555 | 0 | } while (e && e->offset == prev_e->offset); |
556 | |
|
557 | 0 | return prev_e; |
558 | 0 | } |
559 | | |
560 | | /* |
561 | | * Skips to a container overlapping the start coordinate listed in |
562 | | * cram_range. |
563 | | * |
564 | | * In theory we call cram_index_query multiple times, once per slice |
565 | | * overlapping the range. However slices may be absent from the index |
566 | | * which makes this problematic. Instead we find the left-most slice |
567 | | * and then read from then on, skipping decoding of slices and/or |
568 | | * whole containers when they don't overlap the specified cram_range. |
569 | | * |
570 | | * This function also updates the cram_fd range field. |
571 | | * |
572 | | * Returns 0 on success |
573 | | * -1 on general failure |
574 | | * -2 on no-data (empty chromosome) |
575 | | */ |
576 | 0 | int cram_seek_to_refpos(cram_fd *fd, cram_range *r) { |
577 | 0 | int ret = 0; |
578 | 0 | cram_index *e; |
579 | |
|
580 | 0 | if (r->refid == HTS_IDX_NONE) { |
581 | 0 | ret = -2; goto err; |
582 | 0 | } |
583 | | |
584 | | // Ideally use an index, so see if we have one. |
585 | 0 | if ((e = cram_index_query(fd, r->refid, r->start, NULL))) { |
586 | 0 | if (0 != cram_seek(fd, e->offset, SEEK_SET)) { |
587 | 0 | ret = -1; goto err; |
588 | 0 | } |
589 | 0 | } else { |
590 | | // Absent from index, but this most likely means it simply has no data. |
591 | 0 | ret = -2; goto err; |
592 | 0 | } |
593 | | |
594 | 0 | pthread_mutex_lock(&fd->range_lock); |
595 | 0 | fd->range = *r; |
596 | 0 | if (r->refid == HTS_IDX_NOCOOR) { |
597 | 0 | fd->range.refid = -1; |
598 | 0 | fd->range.start = 0; |
599 | 0 | } else if (r->refid == HTS_IDX_START || r->refid == HTS_IDX_REST) { |
600 | 0 | fd->range.refid = -2; // special case in cram_next_slice |
601 | 0 | } |
602 | 0 | pthread_mutex_unlock(&fd->range_lock); |
603 | |
|
604 | 0 | if (fd->ctr) { |
605 | 0 | cram_free_container(fd->ctr); |
606 | 0 | if (fd->ctr_mt && fd->ctr_mt != fd->ctr) |
607 | 0 | cram_free_container(fd->ctr_mt); |
608 | 0 | fd->ctr = NULL; |
609 | 0 | fd->ctr_mt = NULL; |
610 | 0 | fd->ooc = 0; |
611 | 0 | fd->eof = 0; |
612 | 0 | } |
613 | |
|
614 | 0 | return 0; |
615 | | |
616 | 0 | err: |
617 | | // It's unlikely fd->range will be accessed after EOF or error, |
618 | | // but this maintains identical behaviour to the previous code. |
619 | 0 | pthread_mutex_lock(&fd->range_lock); |
620 | 0 | fd->range = *r; |
621 | 0 | pthread_mutex_unlock(&fd->range_lock); |
622 | 0 | return ret; |
623 | 0 | } |
624 | | |
625 | | |
626 | | /* |
627 | | * A specialised form of cram_index_build (below) that deals with slices |
628 | | * having multiple references in this (ref_id -2). In this scenario we |
629 | | * decode the slice to look at the RI data series instead. |
630 | | * |
631 | | * Returns 0 on success |
632 | | * -1 on read failure |
633 | | * -2 on wrong sort order |
634 | | * -4 on write failure |
635 | | */ |
636 | | static int cram_index_build_multiref(cram_fd *fd, |
637 | | cram_container *c, |
638 | | cram_slice *s, |
639 | | BGZF *fp, |
640 | | off_t cpos, |
641 | | int32_t landmark, |
642 | 0 | int sz) { |
643 | 0 | int i, ref = -2; |
644 | 0 | int64_t ref_start = 0, ref_end; |
645 | 0 | char buf[1024]; |
646 | |
|
647 | 0 | if (fd->mode != 'w') { |
648 | 0 | if (0 != cram_decode_slice(fd, c, s, fd->header)) |
649 | 0 | return -1; |
650 | 0 | } |
651 | | |
652 | 0 | ref_end = INT_MIN; |
653 | |
|
654 | 0 | int32_t last_ref = -9; |
655 | 0 | int32_t last_pos = -9; |
656 | 0 | for (i = 0; i < s->hdr->num_records; i++) { |
657 | 0 | if (s->crecs[i].ref_id == last_ref && s->crecs[i].apos < last_pos) { |
658 | 0 | hts_log_error("CRAM file is not sorted by chromosome / position"); |
659 | 0 | return -2; |
660 | 0 | } |
661 | 0 | last_ref = s->crecs[i].ref_id; |
662 | 0 | last_pos = s->crecs[i].apos; |
663 | |
|
664 | 0 | if (s->crecs[i].ref_id == ref) { |
665 | 0 | if (ref_end < s->crecs[i].aend) |
666 | 0 | ref_end = s->crecs[i].aend; |
667 | 0 | continue; |
668 | 0 | } |
669 | | |
670 | 0 | if (ref != -2) { |
671 | 0 | snprintf(buf, sizeof(buf), |
672 | 0 | "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n", |
673 | 0 | ref, ref_start, ref_end - ref_start + 1, |
674 | 0 | (int64_t)cpos, landmark, sz); |
675 | 0 | if (bgzf_write(fp, buf, strlen(buf)) < 0) |
676 | 0 | return -4; |
677 | 0 | } |
678 | | |
679 | 0 | ref = s->crecs[i].ref_id; |
680 | 0 | ref_start = s->crecs[i].apos; |
681 | 0 | ref_end = s->crecs[i].aend; |
682 | 0 | } |
683 | | |
684 | 0 | if (ref != -2) { |
685 | 0 | snprintf(buf, sizeof(buf), |
686 | 0 | "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n", |
687 | 0 | ref, ref_start, ref_end - ref_start + 1, |
688 | 0 | (int64_t)cpos, landmark, sz); |
689 | 0 | if (bgzf_write(fp, buf, strlen(buf)) < 0) |
690 | 0 | return -4; |
691 | 0 | } |
692 | | |
693 | 0 | return 0; |
694 | 0 | } |
695 | | |
696 | | /* |
697 | | * Adds a single slice to the index. |
698 | | */ |
699 | | int cram_index_slice(cram_fd *fd, |
700 | | cram_container *c, |
701 | | cram_slice *s, |
702 | | BGZF *fp, |
703 | | off_t cpos, |
704 | | off_t spos, // relative to cpos |
705 | 0 | off_t sz) { |
706 | 0 | int ret; |
707 | 0 | char buf[1024]; |
708 | |
|
709 | 0 | if (sz > INT_MAX) { |
710 | 0 | hts_log_error("CRAM slice is too big (%"PRId64" bytes)", |
711 | 0 | (int64_t) sz); |
712 | 0 | return -1; |
713 | 0 | } |
714 | | |
715 | 0 | if (s->hdr->ref_seq_id == -2) { |
716 | 0 | ret = cram_index_build_multiref(fd, c, s, fp, cpos, spos, sz); |
717 | 0 | } else { |
718 | 0 | snprintf(buf, sizeof(buf), |
719 | 0 | "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n", |
720 | 0 | s->hdr->ref_seq_id, s->hdr->ref_seq_start, |
721 | 0 | s->hdr->ref_seq_span, (int64_t)cpos, (int)spos, (int)sz); |
722 | 0 | ret = (bgzf_write(fp, buf, strlen(buf)) >= 0)? 0 : -4; |
723 | 0 | } |
724 | |
|
725 | 0 | return ret; |
726 | 0 | } |
727 | | |
728 | | /* |
729 | | * Adds a single container to the index. |
730 | | */ |
731 | | static |
732 | | int cram_index_container(cram_fd *fd, |
733 | | cram_container *c, |
734 | | BGZF *fp, |
735 | 0 | off_t cpos) { |
736 | 0 | int j; |
737 | 0 | off_t spos; |
738 | | |
739 | | // 2.0 format |
740 | 0 | for (j = 0; j < c->num_landmarks; j++) { |
741 | 0 | cram_slice *s; |
742 | 0 | off_t sz; |
743 | 0 | int ret; |
744 | |
|
745 | 0 | spos = htell(fd->fp); |
746 | 0 | if (spos - cpos - (off_t) c->offset != c->landmark[j]) { |
747 | 0 | hts_log_error("CRAM slice offset %"PRId64" does not match" |
748 | 0 | " landmark %d in container header (%"PRId32")", |
749 | 0 | (int64_t) (spos - cpos - (off_t) c->offset), |
750 | 0 | j, c->landmark[j]); |
751 | 0 | return -1; |
752 | 0 | } |
753 | | |
754 | 0 | if (!(s = cram_read_slice(fd))) { |
755 | 0 | return -1; |
756 | 0 | } |
757 | | |
758 | 0 | sz = htell(fd->fp) - spos; |
759 | 0 | ret = cram_index_slice(fd, c, s, fp, cpos, c->landmark[j], sz); |
760 | |
|
761 | 0 | cram_free_slice(s); |
762 | |
|
763 | 0 | if (ret < 0) { |
764 | 0 | return ret; |
765 | 0 | } |
766 | 0 | } |
767 | | |
768 | 0 | return 0; |
769 | 0 | } |
770 | | |
771 | | |
772 | | /* |
773 | | * Builds an index file. |
774 | | * |
775 | | * fd is a newly opened cram file that we wish to index. |
776 | | * fn_base is the filename of the associated CRAM file. |
777 | | * fn_idx is the filename of the index file to be written; |
778 | | * if NULL, we add ".crai" to fn_base to get the index filename. |
779 | | * |
780 | | * Returns 0 on success, |
781 | | * negative on failure (-1 for read failure, -4 for write failure) |
782 | | */ |
783 | 0 | int cram_index_build(cram_fd *fd, const char *fn_base, const char *fn_idx) { |
784 | 0 | cram_container *c; |
785 | 0 | off_t cpos, hpos; |
786 | 0 | BGZF *fp; |
787 | 0 | kstring_t fn_idx_str = {0}; |
788 | 0 | int64_t last_ref = -9, last_start = -9; |
789 | | |
790 | | // Useful for cram_index_build_multiref |
791 | 0 | cram_set_option(fd, CRAM_OPT_REQUIRED_FIELDS, SAM_RNAME | SAM_POS | SAM_CIGAR); |
792 | |
|
793 | 0 | if (! fn_idx) { |
794 | 0 | kputs(fn_base, &fn_idx_str); |
795 | 0 | kputs(".crai", &fn_idx_str); |
796 | 0 | fn_idx = fn_idx_str.s; |
797 | 0 | } |
798 | |
|
799 | 0 | if (!(fp = bgzf_open(fn_idx, "wg"))) { |
800 | 0 | perror(fn_idx); |
801 | 0 | free(fn_idx_str.s); |
802 | 0 | return -4; |
803 | 0 | } |
804 | | |
805 | 0 | free(fn_idx_str.s); |
806 | |
|
807 | 0 | cpos = htell(fd->fp); |
808 | 0 | while ((c = cram_read_container(fd))) { |
809 | 0 | if (fd->err) { |
810 | 0 | perror("Cram container read"); |
811 | 0 | return -1; |
812 | 0 | } |
813 | | |
814 | 0 | hpos = htell(fd->fp); |
815 | |
|
816 | 0 | if (!(c->comp_hdr_block = cram_read_block(fd))) |
817 | 0 | return -1; |
818 | 0 | assert(c->comp_hdr_block->content_type == COMPRESSION_HEADER); |
819 | |
|
820 | 0 | c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block); |
821 | 0 | if (!c->comp_hdr) |
822 | 0 | return -1; |
823 | | |
824 | 0 | if (c->ref_seq_id == last_ref && c->ref_seq_start < last_start) { |
825 | 0 | hts_log_error("CRAM file is not sorted by chromosome / position"); |
826 | 0 | return -2; |
827 | 0 | } |
828 | 0 | last_ref = c->ref_seq_id; |
829 | 0 | last_start = c->ref_seq_start; |
830 | |
|
831 | 0 | if (cram_index_container(fd, c, fp, cpos) < 0) { |
832 | 0 | bgzf_close(fp); |
833 | 0 | return -1; |
834 | 0 | } |
835 | | |
836 | 0 | off_t next_cpos = htell(fd->fp); |
837 | 0 | if (next_cpos != hpos + c->length) { |
838 | 0 | hts_log_error("Length %"PRId32" in container header at offset %lld does not match block lengths (%lld)", |
839 | 0 | c->length, (long long) cpos, (long long) next_cpos - hpos); |
840 | 0 | return -1; |
841 | 0 | } |
842 | 0 | cpos = next_cpos; |
843 | |
|
844 | 0 | cram_free_container(c); |
845 | 0 | } |
846 | 0 | if (fd->err) { |
847 | 0 | bgzf_close(fp); |
848 | 0 | return -1; |
849 | 0 | } |
850 | | |
851 | 0 | return (bgzf_close(fp) >= 0)? 0 : -4; |
852 | 0 | } |
853 | | |
854 | | // internal recursive step |
855 | | static int64_t cram_num_containers_between_(cram_index *e, int64_t *last_pos, |
856 | | int64_t nct, |
857 | | off_t cstart, off_t cend, |
858 | 0 | int64_t *first, int64_t *last) { |
859 | 0 | int64_t nc = 0, i; |
860 | |
|
861 | 0 | if (e->offset) { |
862 | 0 | if (e->offset != *last_pos) { |
863 | 0 | if (e->offset >= cstart && (!cend || e->offset <= cend)) { |
864 | 0 | if (first && *first < 0) |
865 | 0 | *first = nct; |
866 | 0 | if (last) |
867 | 0 | *last = nct; |
868 | 0 | } |
869 | 0 | nc++; |
870 | 0 | } |
871 | | // else a new multi-ref in same container |
872 | 0 | *last_pos = e->offset; |
873 | 0 | } |
874 | |
|
875 | 0 | for (i = 0; i < e->nslice; i++) |
876 | 0 | nc += cram_num_containers_between_(&e->e[i], last_pos, nc + nct, |
877 | 0 | cstart, cend, first, last); |
878 | |
|
879 | 0 | return nc; |
880 | 0 | } |
881 | | |
882 | | /*! Returns the number of containers in the CRAM file within given offsets. |
883 | | * |
884 | | * The cstart and cend offsets are the locations of the start of containers |
885 | | * as returned by index_container_offset. |
886 | | * |
887 | | * If non-NULL, first and last will hold the inclusive range of container |
888 | | * numbers, counting from zero. |
889 | | * |
890 | | * @return |
891 | | * Returns the number of containers, equivalent to *last-*first+1. |
892 | | */ |
893 | | int64_t cram_num_containers_between(cram_fd *fd, |
894 | | off_t cstart, off_t cend, |
895 | 0 | int64_t *first, int64_t *last) { |
896 | 0 | int64_t nc = 0, i; |
897 | 0 | int64_t last_pos = -99; |
898 | 0 | int64_t l_first = -1, l_last = -1; |
899 | |
|
900 | 0 | for (i = 0; i < fd->index_sz; i++) { |
901 | 0 | int j = i+1 == fd->index_sz ? 0 : i+1; // maps "*" to end |
902 | 0 | nc += cram_num_containers_between_(&fd->index[j], &last_pos, nc, |
903 | 0 | cstart, cend, &l_first, &l_last); |
904 | 0 | } |
905 | |
|
906 | 0 | if (first) |
907 | 0 | *first = l_first; |
908 | 0 | if (last) |
909 | 0 | *last = l_last; |
910 | |
|
911 | 0 | return l_last - l_first + 1; |
912 | 0 | } |
913 | | |
914 | | /* |
915 | | * Queries the total number of distinct containers in the index. |
916 | | * Note there may be more containers in the file than in the index, as we |
917 | | * are not required to have an index entry for every one. |
918 | | */ |
919 | 0 | int64_t cram_num_containers(cram_fd *fd) { |
920 | 0 | return cram_num_containers_between(fd, 0, 0, NULL, NULL); |
921 | 0 | } |
922 | | |
923 | | |
924 | | /*! Returns the byte offset for the start of the n^th container. |
925 | | * |
926 | | * The index must have previously been loaded, otherwise <0 is returned. |
927 | | */ |
928 | | static cram_index *cram_container_num2offset_(cram_index *e, int num, |
929 | 0 | int64_t *last_pos, int *nc) { |
930 | 0 | if (e->offset) { |
931 | 0 | if (e->offset != *last_pos) { |
932 | 0 | if (*nc == num) |
933 | 0 | return e; |
934 | 0 | (*nc)++; |
935 | 0 | } |
936 | | // else a new multi-ref in same container |
937 | 0 | *last_pos = e->offset; |
938 | 0 | } |
939 | | |
940 | 0 | int i; |
941 | 0 | for (i = 0; i < e->nslice; i++) { |
942 | 0 | cram_index *tmp = cram_container_num2offset_(&e->e[i], num, |
943 | 0 | last_pos, nc); |
944 | 0 | if (tmp) |
945 | 0 | return tmp; |
946 | 0 | } |
947 | | |
948 | | |
949 | 0 | return NULL; |
950 | 0 | } |
951 | | |
952 | 0 | off_t cram_container_num2offset(cram_fd *fd, int64_t num) { |
953 | 0 | int nc = 0, i; |
954 | 0 | int64_t last_pos = -9; |
955 | 0 | cram_index *e = NULL; |
956 | |
|
957 | 0 | for (i = 0; i < fd->index_sz; i++) { |
958 | 0 | int j = i+1 == fd->index_sz ? 0 : i+1; // maps "*" to end |
959 | 0 | if (!fd->index[j].nslice) |
960 | 0 | continue; |
961 | 0 | if ((e = cram_container_num2offset_(&fd->index[j], num, |
962 | 0 | &last_pos, &nc))) |
963 | 0 | break; |
964 | 0 | } |
965 | |
|
966 | 0 | return e ? e->offset : -1; |
967 | 0 | } |
968 | | |
969 | | |
970 | | /*! Returns the container number for the first container at offset >= pos. |
971 | | * |
972 | | * The index must have previously been loaded, otherwise <0 is returned. |
973 | | */ |
974 | | static cram_index *cram_container_offset2num_(cram_index *e, off_t pos, |
975 | 0 | int64_t *last_pos, int *nc) { |
976 | 0 | if (e->offset) { |
977 | 0 | if (e->offset != *last_pos) { |
978 | 0 | if (e->offset >= pos) |
979 | 0 | return e; |
980 | 0 | (*nc)++; |
981 | 0 | } |
982 | | // else a new multi-ref in same container |
983 | 0 | *last_pos = e->offset; |
984 | 0 | } |
985 | | |
986 | 0 | int i; |
987 | 0 | for (i = 0; i < e->nslice; i++) { |
988 | 0 | cram_index *tmp = cram_container_offset2num_(&e->e[i], pos, |
989 | 0 | last_pos, nc); |
990 | 0 | if (tmp) |
991 | 0 | return tmp; |
992 | 0 | } |
993 | | |
994 | | |
995 | 0 | return NULL; |
996 | 0 | } |
997 | | |
998 | 0 | int64_t cram_container_offset2num(cram_fd *fd, off_t pos) { |
999 | 0 | int nc = 0, i; |
1000 | 0 | int64_t last_pos = -9; |
1001 | 0 | cram_index *e = NULL; |
1002 | |
|
1003 | 0 | for (i = 0; i < fd->index_sz; i++) { |
1004 | 0 | int j = i+1 == fd->index_sz ? 0 : i+1; // maps "*" to end |
1005 | 0 | if (!fd->index[j].nslice) |
1006 | 0 | continue; |
1007 | 0 | if ((e = cram_container_offset2num_(&fd->index[j], pos, |
1008 | 0 | &last_pos, &nc))) |
1009 | 0 | break; |
1010 | 0 | } |
1011 | |
|
1012 | 0 | return e ? nc : -1; |
1013 | 0 | } |
1014 | | |
1015 | | /*! |
1016 | | * Returns the file offsets of CRAM containers covering a specific region |
1017 | | * query. Note both offsets are the START of the container. |
1018 | | * |
1019 | | * first will point to the start of the first overlapping container |
1020 | | * last will point to the start of the last overlapping container |
1021 | | * |
1022 | | * Returns 0 on success |
1023 | | * <0 on failure |
1024 | | */ |
1025 | | int cram_index_extents(cram_fd *fd, int refid, hts_pos_t start, hts_pos_t end, |
1026 | 0 | off_t *first, off_t *last) { |
1027 | 0 | cram_index *ci; |
1028 | |
|
1029 | 0 | if (first) { |
1030 | 0 | if (!(ci = cram_index_query(fd, refid, start, NULL))) |
1031 | 0 | return -1; |
1032 | 0 | *first = ci->offset; |
1033 | 0 | } |
1034 | | |
1035 | 0 | if (last) { |
1036 | 0 | if (!(ci = cram_index_query_last(fd, refid, end))) |
1037 | 0 | return -1; |
1038 | 0 | *last = ci->offset; |
1039 | 0 | } |
1040 | | |
1041 | 0 | return 0; |
1042 | 0 | } |