/src/htslib/cram/cram_index.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | Copyright (c) 2013-2020, 2023-2024 Genome Research Ltd. |
3 | | Author: James Bonfield <jkb@sanger.ac.uk> |
4 | | |
5 | | Redistribution and use in source and binary forms, with or without |
6 | | modification, are permitted provided that the following conditions are met: |
7 | | |
8 | | 1. Redistributions of source code must retain the above copyright notice, |
9 | | this list of conditions and the following disclaimer. |
10 | | |
11 | | 2. Redistributions in binary form must reproduce the above copyright notice, |
12 | | this list of conditions and the following disclaimer in the documentation |
13 | | and/or other materials provided with the distribution. |
14 | | |
15 | | 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger |
16 | | Institute nor the names of its contributors may be used to endorse or promote |
17 | | products derived from this software without specific prior written permission. |
18 | | |
19 | | THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND |
20 | | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
21 | | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
22 | | DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE |
23 | | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
24 | | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
25 | | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
26 | | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
27 | | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
28 | | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
29 | | */ |
30 | | |
31 | | /* |
32 | | * The index is a gzipped tab-delimited text file with one line per slice. |
33 | | * The columns are: |
34 | | * 1: reference number (0 to N-1, as per BAM ref_id) |
35 | | * 2: reference position of 1st read in slice (1..?) |
36 | | * 3: number of reads in slice |
37 | | * 4: offset of container start (relative to end of SAM header, so 1st |
38 | | * container is offset 0). |
39 | | * 5: slice number within container (ie which landmark). |
40 | | * |
41 | | * In memory, we hold this in a nested containment list. Each list element is |
42 | | * a cram_index struct. Each element in turn can contain its own list of |
43 | | * cram_index structs. |
44 | | * |
45 | | * Any start..end range which is entirely contained within another (and |
46 | | * earlier as it is sorted) range will be held within it. This ensures that |
47 | | * the outer list will never have containments and we can safely do a |
48 | | * binary search to find the first range which overlaps any given coordinate. |
49 | | */ |
50 | | |
51 | | #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h |
52 | | #include <config.h> |
53 | | |
54 | | #include <stdio.h> |
55 | | #include <errno.h> |
56 | | #include <assert.h> |
57 | | #include <inttypes.h> |
58 | | #include <stdlib.h> |
59 | | #include <string.h> |
60 | | #include <zlib.h> |
61 | | #include <sys/types.h> |
62 | | #include <sys/stat.h> |
63 | | #include <math.h> |
64 | | |
65 | | #include "../htslib/bgzf.h" |
66 | | #include "../htslib/hfile.h" |
67 | | #include "../hts_internal.h" |
68 | | #include "cram.h" |
69 | | #include "os.h" |
70 | | |
71 | | #if 0 |
72 | | static void dump_index_(cram_index *e, int level) { |
73 | | int i, n; |
74 | | n = printf("%*s%d / %d .. %d, ", level*4, "", e->refid, e->start, e->end); |
75 | | printf("%*soffset %"PRId64" %p %p\n", MAX(0,50-n), "", e->offset, e, e->e_next); |
76 | | for (i = 0; i < e->nslice; i++) { |
77 | | dump_index_(&e->e[i], level+1); |
78 | | } |
79 | | } |
80 | | |
81 | | static void dump_index(cram_fd *fd) { |
82 | | int i; |
83 | | for (i = 0; i < fd->index_sz; i++) { |
84 | | dump_index_(&fd->index[i], 0); |
85 | | } |
86 | | } |
87 | | #endif |
88 | | |
89 | | // Thread a linked list through the nested containment list. |
90 | | // This makes navigating it and finding the "next" index entry |
91 | | // trivial. |
92 | 0 | static cram_index *link_index_(cram_index *e, cram_index *e_last) { |
93 | 0 | int i; |
94 | 0 | if (e_last) |
95 | 0 | e_last->e_next = e; |
96 | | |
97 | | // We don't want to link in the top-level cram_index with |
98 | | // offset=0 and start/end = INT_MIN/INT_MAX. |
99 | 0 | if (e->offset) |
100 | 0 | e_last = e; |
101 | |
|
102 | 0 | for (i = 0; i < e->nslice; i++) |
103 | 0 | e_last = link_index_(&e->e[i], e_last); |
104 | |
|
105 | 0 | return e_last; |
106 | 0 | } |
107 | | |
108 | 0 | static void link_index(cram_fd *fd) { |
109 | 0 | int i; |
110 | 0 | cram_index *e_last = NULL; |
111 | |
|
112 | 0 | for (i = 0; i < fd->index_sz; i++) { |
113 | 0 | e_last = link_index_(&fd->index[i], e_last); |
114 | 0 | } |
115 | |
|
116 | 0 | if (e_last) |
117 | 0 | e_last->e_next = NULL; |
118 | 0 | } |
119 | | |
120 | 0 | static int kget_int32(kstring_t *k, size_t *pos, int32_t *val_p) { |
121 | 0 | int sign = 1; |
122 | 0 | int32_t val = 0; |
123 | 0 | size_t p = *pos; |
124 | |
|
125 | 0 | while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t')) |
126 | 0 | p++; |
127 | |
|
128 | 0 | if (p < k->l && k->s[p] == '-') |
129 | 0 | sign = -1, p++; |
130 | |
|
131 | 0 | if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9')) |
132 | 0 | return -1; |
133 | | |
134 | 0 | while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9') { |
135 | 0 | int digit = k->s[p++]-'0'; |
136 | 0 | val = val*10 + digit; |
137 | 0 | } |
138 | |
|
139 | 0 | *pos = p; |
140 | 0 | *val_p = sign*val; |
141 | |
|
142 | 0 | return 0; |
143 | 0 | } |
144 | | |
145 | 0 | static int kget_int64(kstring_t *k, size_t *pos, int64_t *val_p) { |
146 | 0 | int sign = 1; |
147 | 0 | int64_t val = 0; |
148 | 0 | size_t p = *pos; |
149 | |
|
150 | 0 | while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t')) |
151 | 0 | p++; |
152 | |
|
153 | 0 | if (p < k->l && k->s[p] == '-') |
154 | 0 | sign = -1, p++; |
155 | |
|
156 | 0 | if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9')) |
157 | 0 | return -1; |
158 | | |
159 | 0 | while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9') { |
160 | 0 | int digit = k->s[p++]-'0'; |
161 | 0 | val = val*10 + digit; |
162 | 0 | } |
163 | |
|
164 | 0 | *pos = p; |
165 | 0 | *val_p = sign*val; |
166 | |
|
167 | 0 | return 0; |
168 | 0 | } |
169 | | |
170 | | /* |
171 | | * Loads a CRAM .crai index into memory. |
172 | | * |
173 | | * Returns 0 for success |
174 | | * -1 for failure |
175 | | */ |
176 | 0 | int cram_index_load(cram_fd *fd, const char *fn, const char *fn_idx) { |
177 | |
|
178 | 0 | char *tfn_idx = NULL; |
179 | 0 | char buf[65536]; |
180 | 0 | ssize_t len; |
181 | 0 | kstring_t kstr = {0}; |
182 | 0 | hFILE *fp; |
183 | 0 | cram_index *idx; |
184 | 0 | cram_index **idx_stack = NULL, *ep, e; |
185 | 0 | int idx_stack_alloc = 0, idx_stack_ptr = 0; |
186 | 0 | size_t pos = 0; |
187 | | |
188 | | /* Check if already loaded */ |
189 | 0 | if (fd->index) |
190 | 0 | return 0; |
191 | | |
192 | 0 | fd->index = calloc((fd->index_sz = 1), sizeof(*fd->index)); |
193 | 0 | if (!fd->index) |
194 | 0 | return -1; |
195 | | |
196 | 0 | idx = &fd->index[0]; |
197 | 0 | idx->refid = -1; |
198 | 0 | idx->start = INT_MIN; |
199 | 0 | idx->end = INT_MAX; |
200 | |
|
201 | 0 | idx_stack = calloc(++idx_stack_alloc, sizeof(*idx_stack)); |
202 | 0 | if (!idx_stack) |
203 | 0 | goto fail; |
204 | | |
205 | 0 | idx_stack[idx_stack_ptr] = idx; |
206 | | |
207 | | // Support pathX.cram##idx##pathY.crai |
208 | 0 | const char *fn_delim = strstr(fn, HTS_IDX_DELIM); |
209 | 0 | if (fn_delim && !fn_idx) |
210 | 0 | fn_idx = fn_delim + strlen(HTS_IDX_DELIM); |
211 | |
|
212 | 0 | if (!fn_idx) { |
213 | 0 | if (hts_idx_check_local(fn, HTS_FMT_CRAI, &tfn_idx) == 0 && hisremote(fn)) |
214 | 0 | tfn_idx = hts_idx_getfn(fn, ".crai"); |
215 | |
|
216 | 0 | if (!tfn_idx) { |
217 | 0 | hts_log_error("Could not retrieve index file for '%s'", fn); |
218 | 0 | goto fail; |
219 | 0 | } |
220 | 0 | fn_idx = tfn_idx; |
221 | 0 | } |
222 | | |
223 | 0 | if (!(fp = hopen(fn_idx, "r"))) { |
224 | 0 | hts_log_error("Could not open index file '%s'", fn_idx); |
225 | 0 | goto fail; |
226 | 0 | } |
227 | | |
228 | | // Load the file into memory |
229 | 0 | while ((len = hread(fp, buf, sizeof(buf))) > 0) { |
230 | 0 | if (kputsn(buf, len, &kstr) < 0) |
231 | 0 | goto fail; |
232 | 0 | } |
233 | | |
234 | 0 | if (len < 0 || kstr.l < 2) |
235 | 0 | goto fail; |
236 | | |
237 | 0 | if (hclose(fp) < 0) |
238 | 0 | goto fail; |
239 | | |
240 | | // Uncompress if required |
241 | 0 | if (kstr.s[0] == 31 && (uc)kstr.s[1] == 139) { |
242 | 0 | size_t l = 0; |
243 | 0 | char *s = zlib_mem_inflate(kstr.s, kstr.l, &l); |
244 | 0 | if (!s) |
245 | 0 | goto fail; |
246 | | |
247 | 0 | free(kstr.s); |
248 | 0 | kstr.s = s; |
249 | 0 | kstr.l = l; |
250 | 0 | kstr.m = l; // conservative estimate of the size allocated |
251 | 0 | if (kputsn("", 0, &kstr) < 0) // ensure kstr.s is NUL-terminated |
252 | 0 | goto fail; |
253 | 0 | } |
254 | | |
255 | | |
256 | | // Parse it line at a time |
257 | 0 | while (pos < kstr.l) { |
258 | | /* 1.1 layout */ |
259 | 0 | if (kget_int32(&kstr, &pos, &e.refid) == -1) |
260 | 0 | goto fail; |
261 | | |
262 | 0 | if (kget_int32(&kstr, &pos, &e.start) == -1) |
263 | 0 | goto fail; |
264 | | |
265 | 0 | if (kget_int32(&kstr, &pos, &e.end) == -1) |
266 | 0 | goto fail; |
267 | | |
268 | 0 | if (kget_int64(&kstr, &pos, &e.offset) == -1) |
269 | 0 | goto fail; |
270 | | |
271 | 0 | if (kget_int32(&kstr, &pos, &e.slice) == -1) |
272 | 0 | goto fail; |
273 | | |
274 | 0 | if (kget_int32(&kstr, &pos, &e.len) == -1) |
275 | 0 | goto fail; |
276 | | |
277 | 0 | e.end += e.start-1; |
278 | | //printf("%d/%d..%d-offset=%" PRIu64 ",len=%d,slice=%d\n", e.refid, e.start, e.end, e.offset, e.len, e.slice); |
279 | |
|
280 | 0 | if (e.refid < -1) { |
281 | 0 | hts_log_error("Malformed index file, refid %d", e.refid); |
282 | 0 | goto fail; |
283 | 0 | } |
284 | | |
285 | 0 | if (e.refid != idx->refid) { |
286 | 0 | if (fd->index_sz < e.refid+2) { |
287 | 0 | cram_index *new_idx; |
288 | 0 | int new_sz = e.refid+2; |
289 | 0 | size_t index_end = fd->index_sz * sizeof(*fd->index); |
290 | 0 | new_idx = realloc(fd->index, |
291 | 0 | new_sz * sizeof(*fd->index)); |
292 | 0 | if (!new_idx) |
293 | 0 | goto fail; |
294 | | |
295 | 0 | fd->index = new_idx; |
296 | 0 | fd->index_sz = new_sz; |
297 | 0 | memset(((char *)fd->index) + index_end, 0, |
298 | 0 | fd->index_sz * sizeof(*fd->index) - index_end); |
299 | 0 | } |
300 | 0 | idx = &fd->index[e.refid+1]; |
301 | 0 | idx->refid = e.refid; |
302 | 0 | idx->start = INT_MIN; |
303 | 0 | idx->end = INT_MAX; |
304 | 0 | idx->nslice = idx->nalloc = 0; |
305 | 0 | idx->e = NULL; |
306 | 0 | idx_stack[(idx_stack_ptr = 0)] = idx; |
307 | 0 | } |
308 | | |
309 | 0 | while (!(e.start >= idx->start && e.end <= idx->end) || |
310 | 0 | (idx->start == 0 && idx->refid == -1)) { |
311 | 0 | idx = idx_stack[--idx_stack_ptr]; |
312 | 0 | } |
313 | | |
314 | | // Now contains, so append |
315 | 0 | if (idx->nslice+1 >= idx->nalloc) { |
316 | 0 | cram_index *new_e; |
317 | 0 | idx->nalloc = idx->nalloc ? idx->nalloc*2 : 16; |
318 | 0 | new_e = realloc(idx->e, idx->nalloc * sizeof(*idx->e)); |
319 | 0 | if (!new_e) |
320 | 0 | goto fail; |
321 | | |
322 | 0 | idx->e = new_e; |
323 | 0 | } |
324 | | |
325 | 0 | e.nalloc = e.nslice = 0; e.e = NULL; |
326 | 0 | *(ep = &idx->e[idx->nslice++]) = e; |
327 | 0 | idx = ep; |
328 | |
|
329 | 0 | if (++idx_stack_ptr >= idx_stack_alloc) { |
330 | 0 | cram_index **new_stack; |
331 | 0 | idx_stack_alloc *= 2; |
332 | 0 | new_stack = realloc(idx_stack, idx_stack_alloc*sizeof(*idx_stack)); |
333 | 0 | if (!new_stack) |
334 | 0 | goto fail; |
335 | 0 | idx_stack = new_stack; |
336 | 0 | } |
337 | 0 | idx_stack[idx_stack_ptr] = idx; |
338 | |
|
339 | 0 | while (pos < kstr.l && kstr.s[pos] != '\n') |
340 | 0 | pos++; |
341 | 0 | pos++; |
342 | 0 | } |
343 | | |
344 | 0 | free(idx_stack); |
345 | 0 | free(kstr.s); |
346 | 0 | free(tfn_idx); |
347 | | |
348 | | // Convert NCList to linear linked list |
349 | 0 | link_index(fd); |
350 | | |
351 | | //dump_index(fd); |
352 | |
|
353 | 0 | return 0; |
354 | | |
355 | 0 | fail: |
356 | 0 | free(kstr.s); |
357 | 0 | free(idx_stack); |
358 | 0 | free(tfn_idx); |
359 | 0 | cram_index_free(fd); // Also sets fd->index = NULL |
360 | 0 | return -1; |
361 | 0 | } |
362 | | |
363 | 0 | static void cram_index_free_recurse(cram_index *e) { |
364 | 0 | if (e->e) { |
365 | 0 | int i; |
366 | 0 | for (i = 0; i < e->nslice; i++) { |
367 | 0 | cram_index_free_recurse(&e->e[i]); |
368 | 0 | } |
369 | 0 | free(e->e); |
370 | 0 | } |
371 | 0 | } |
372 | | |
373 | 0 | void cram_index_free(cram_fd *fd) { |
374 | 0 | int i; |
375 | |
|
376 | 0 | if (!fd->index) |
377 | 0 | return; |
378 | | |
379 | 0 | for (i = 0; i < fd->index_sz; i++) { |
380 | 0 | cram_index_free_recurse(&fd->index[i]); |
381 | 0 | } |
382 | 0 | free(fd->index); |
383 | |
|
384 | 0 | fd->index = NULL; |
385 | 0 | } |
386 | | |
387 | | /* |
388 | | * Searches the index for the first slice overlapping a reference ID |
389 | | * and position, or one immediately preceding it if none is found in |
390 | | * the index to overlap this position. (Our index may have missing |
391 | | * entries, but we require at least one per reference.) |
392 | | * |
393 | | * If the index finds multiple slices overlapping this position we |
394 | | * return the first one only. Subsequent calls should specify |
395 | | * "from" as the last slice we checked to find the next one. Otherwise |
396 | | * set "from" to be NULL to find the first one. |
397 | | * |
398 | | * Refid can also be any of the special HTS_IDX_ values. |
399 | | * For backwards compatibility, refid -1 is equivalent to HTS_IDX_NOCOOR. |
400 | | * |
401 | | * Returns the cram_index pointer on success |
402 | | * NULL on failure |
403 | | */ |
404 | | cram_index *cram_index_query(cram_fd *fd, int refid, hts_pos_t pos, |
405 | 0 | cram_index *from) { |
406 | 0 | int i, j, k; |
407 | 0 | cram_index *e; |
408 | |
|
409 | 0 | if (from) { |
410 | | // Continue from a previous search. |
411 | | // We switch to just scanning the linked list, as the nested |
412 | | // lists are typically short. |
413 | 0 | if (refid == HTS_IDX_NOCOOR) |
414 | 0 | refid = -1; |
415 | |
|
416 | 0 | e = from->e_next; |
417 | 0 | if (e && e->refid == refid && e->start <= pos) |
418 | 0 | return e; |
419 | 0 | else |
420 | 0 | return NULL; |
421 | 0 | } |
422 | | |
423 | 0 | switch(refid) { |
424 | 0 | case HTS_IDX_NONE: |
425 | 0 | case HTS_IDX_REST: |
426 | | // fail, or already there, dealt with elsewhere. |
427 | 0 | return NULL; |
428 | | |
429 | 0 | case -1: |
430 | 0 | case HTS_IDX_NOCOOR: |
431 | 0 | refid = -1; |
432 | 0 | pos = 0; |
433 | 0 | break; |
434 | | |
435 | 0 | case HTS_IDX_START: { |
436 | 0 | int64_t min_idx = INT64_MAX; |
437 | 0 | for (i = 0, j = -1; i < fd->index_sz; i++) { |
438 | 0 | if (fd->index[i].e && fd->index[i].e[0].offset < min_idx) { |
439 | 0 | min_idx = fd->index[i].e[0].offset; |
440 | 0 | j = i; |
441 | 0 | } |
442 | 0 | } |
443 | 0 | if (j < 0) |
444 | 0 | return NULL; |
445 | 0 | return fd->index[j].e; |
446 | 0 | } |
447 | | |
448 | 0 | default: |
449 | 0 | if (refid < HTS_IDX_NONE || refid+1 >= fd->index_sz) |
450 | 0 | return NULL; |
451 | 0 | } |
452 | | |
453 | 0 | from = &fd->index[refid+1]; |
454 | | |
455 | | // Ref with nothing aligned against it. |
456 | 0 | if (!from->e) |
457 | 0 | return NULL; |
458 | | |
459 | | // This sequence is covered by the index, so binary search to find |
460 | | // the optimal starting block. |
461 | 0 | i = 0, j = fd->index[refid+1].nslice-1; |
462 | 0 | for (k = j/2; k != i; k = (j-i)/2 + i) { |
463 | 0 | if (from->e[k].refid > refid) { |
464 | 0 | j = k; |
465 | 0 | continue; |
466 | 0 | } |
467 | | |
468 | 0 | if (from->e[k].refid < refid) { |
469 | 0 | i = k; |
470 | 0 | continue; |
471 | 0 | } |
472 | | |
473 | 0 | if (from->e[k].start >= pos) { |
474 | 0 | j = k; |
475 | 0 | continue; |
476 | 0 | } |
477 | | |
478 | 0 | if (from->e[k].start < pos) { |
479 | 0 | i = k; |
480 | 0 | continue; |
481 | 0 | } |
482 | 0 | } |
483 | | // i==j or i==j-1. Check if j is better. |
484 | 0 | if (j >= 0 && from->e[j].start < pos && from->e[j].refid == refid) |
485 | 0 | i = j; |
486 | | |
487 | | /* The above found *a* bin overlapping, but not necessarily the first */ |
488 | 0 | while (i > 0 && from->e[i-1].end >= pos) |
489 | 0 | i--; |
490 | | |
491 | | /* We may be one bin before the optimum, so check */ |
492 | 0 | while (i+1 < from->nslice && |
493 | 0 | (from->e[i].refid < refid || |
494 | 0 | from->e[i].end < pos)) |
495 | 0 | i++; |
496 | |
|
497 | 0 | e = &from->e[i]; |
498 | |
|
499 | 0 | return e; |
500 | 0 | } |
501 | | |
502 | | // Return the index entry for last slice on a specific reference. |
503 | 0 | cram_index *cram_index_last(cram_fd *fd, int refid, cram_index *from) { |
504 | 0 | int slice; |
505 | |
|
506 | 0 | if (refid+1 < 0 || refid+1 >= fd->index_sz) |
507 | 0 | return NULL; |
508 | | |
509 | 0 | if (!from) |
510 | 0 | from = &fd->index[refid+1]; |
511 | | |
512 | | // Ref with nothing aligned against it. |
513 | 0 | if (!from->e) |
514 | 0 | return NULL; |
515 | | |
516 | 0 | slice = fd->index[refid+1].nslice - 1; |
517 | | |
518 | | // e is the last entry in the nested containment list, but it may |
519 | | // contain further slices within it. |
520 | 0 | cram_index *e = &from->e[slice]; |
521 | 0 | while (e->e_next) |
522 | 0 | e = e->e_next; |
523 | |
|
524 | 0 | return e; |
525 | 0 | } |
526 | | |
527 | | /* |
528 | | * Find the last container overlapping pos 'end', and the file offset of |
529 | | * its end (equivalent to the start offset of the container following it). |
530 | | */ |
531 | 0 | cram_index *cram_index_query_last(cram_fd *fd, int refid, hts_pos_t end) { |
532 | 0 | cram_index *e = NULL, *prev_e; |
533 | 0 | do { |
534 | 0 | prev_e = e; |
535 | 0 | e = cram_index_query(fd, refid, end, prev_e); |
536 | 0 | } while (e); |
537 | |
|
538 | 0 | if (!prev_e) |
539 | 0 | return NULL; |
540 | 0 | e = prev_e; |
541 | | |
542 | | // Note: offset of e and e->e_next may be the same if we're using a |
543 | | // multi-ref container where a single container generates multiple |
544 | | // index entries. |
545 | | // |
546 | | // We need to keep iterating until offset differs in order to find |
547 | | // the genuine file offset for the end of container. |
548 | 0 | do { |
549 | 0 | prev_e = e; |
550 | 0 | e = e->e_next; |
551 | 0 | } while (e && e->offset == prev_e->offset); |
552 | |
|
553 | 0 | return prev_e; |
554 | 0 | } |
555 | | |
556 | | /* |
557 | | * Skips to a container overlapping the start coordinate listed in |
558 | | * cram_range. |
559 | | * |
560 | | * In theory we call cram_index_query multiple times, once per slice |
561 | | * overlapping the range. However slices may be absent from the index |
562 | | * which makes this problematic. Instead we find the left-most slice |
563 | | * and then read from then on, skipping decoding of slices and/or |
564 | | * whole containers when they don't overlap the specified cram_range. |
565 | | * |
566 | | * This function also updates the cram_fd range field. |
567 | | * |
568 | | * Returns 0 on success |
569 | | * -1 on general failure |
570 | | * -2 on no-data (empty chromosome) |
571 | | */ |
572 | 0 | int cram_seek_to_refpos(cram_fd *fd, cram_range *r) { |
573 | 0 | int ret = 0; |
574 | 0 | cram_index *e; |
575 | |
|
576 | 0 | if (r->refid == HTS_IDX_NONE) { |
577 | 0 | ret = -2; goto err; |
578 | 0 | } |
579 | | |
580 | | // Ideally use an index, so see if we have one. |
581 | 0 | if ((e = cram_index_query(fd, r->refid, r->start, NULL))) { |
582 | 0 | if (0 != cram_seek(fd, e->offset, SEEK_SET)) { |
583 | 0 | ret = -1; goto err; |
584 | 0 | } |
585 | 0 | } else { |
586 | | // Absent from index, but this most likely means it simply has no data. |
587 | 0 | ret = -2; goto err; |
588 | 0 | } |
589 | | |
590 | 0 | pthread_mutex_lock(&fd->range_lock); |
591 | 0 | fd->range = *r; |
592 | 0 | if (r->refid == HTS_IDX_NOCOOR) { |
593 | 0 | fd->range.refid = -1; |
594 | 0 | fd->range.start = 0; |
595 | 0 | } else if (r->refid == HTS_IDX_START || r->refid == HTS_IDX_REST) { |
596 | 0 | fd->range.refid = -2; // special case in cram_next_slice |
597 | 0 | } |
598 | 0 | pthread_mutex_unlock(&fd->range_lock); |
599 | |
|
600 | 0 | if (fd->ctr) { |
601 | 0 | cram_free_container(fd->ctr); |
602 | 0 | if (fd->ctr_mt && fd->ctr_mt != fd->ctr) |
603 | 0 | cram_free_container(fd->ctr_mt); |
604 | 0 | fd->ctr = NULL; |
605 | 0 | fd->ctr_mt = NULL; |
606 | 0 | fd->ooc = 0; |
607 | 0 | fd->eof = 0; |
608 | 0 | } |
609 | |
|
610 | 0 | return 0; |
611 | | |
612 | 0 | err: |
613 | | // It's unlikely fd->range will be accessed after EOF or error, |
614 | | // but this maintains identical behaviour to the previous code. |
615 | 0 | pthread_mutex_lock(&fd->range_lock); |
616 | 0 | fd->range = *r; |
617 | 0 | pthread_mutex_unlock(&fd->range_lock); |
618 | 0 | return ret; |
619 | 0 | } |
620 | | |
621 | | |
622 | | /* |
623 | | * A specialised form of cram_index_build (below) that deals with slices |
624 | | * having multiple references in this (ref_id -2). In this scenario we |
625 | | * decode the slice to look at the RI data series instead. |
626 | | * |
627 | | * Returns 0 on success |
628 | | * -1 on read failure |
629 | | * -2 on wrong sort order |
630 | | * -4 on write failure |
631 | | */ |
632 | | static int cram_index_build_multiref(cram_fd *fd, |
633 | | cram_container *c, |
634 | | cram_slice *s, |
635 | | BGZF *fp, |
636 | | off_t cpos, |
637 | | int32_t landmark, |
638 | 0 | int sz) { |
639 | 0 | int i, ref = -2; |
640 | 0 | int64_t ref_start = 0, ref_end; |
641 | 0 | char buf[1024]; |
642 | |
|
643 | 0 | if (fd->mode != 'w') { |
644 | 0 | if (0 != cram_decode_slice(fd, c, s, fd->header)) |
645 | 0 | return -1; |
646 | 0 | } |
647 | | |
648 | 0 | ref_end = INT_MIN; |
649 | |
|
650 | 0 | int32_t last_ref = -9; |
651 | 0 | int32_t last_pos = -9; |
652 | 0 | for (i = 0; i < s->hdr->num_records; i++) { |
653 | 0 | if (s->crecs[i].ref_id == last_ref && s->crecs[i].apos < last_pos) { |
654 | 0 | hts_log_error("CRAM file is not sorted by chromosome / position"); |
655 | 0 | return -2; |
656 | 0 | } |
657 | 0 | last_ref = s->crecs[i].ref_id; |
658 | 0 | last_pos = s->crecs[i].apos; |
659 | |
|
660 | 0 | if (s->crecs[i].ref_id == ref) { |
661 | 0 | if (ref_end < s->crecs[i].aend) |
662 | 0 | ref_end = s->crecs[i].aend; |
663 | 0 | continue; |
664 | 0 | } |
665 | | |
666 | 0 | if (ref != -2) { |
667 | 0 | snprintf(buf, sizeof(buf), |
668 | 0 | "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n", |
669 | 0 | ref, ref_start, ref_end - ref_start + 1, |
670 | 0 | (int64_t)cpos, landmark, sz); |
671 | 0 | if (bgzf_write(fp, buf, strlen(buf)) < 0) |
672 | 0 | return -4; |
673 | 0 | } |
674 | | |
675 | 0 | ref = s->crecs[i].ref_id; |
676 | 0 | ref_start = s->crecs[i].apos; |
677 | 0 | ref_end = s->crecs[i].aend; |
678 | 0 | } |
679 | | |
680 | 0 | if (ref != -2) { |
681 | 0 | snprintf(buf, sizeof(buf), |
682 | 0 | "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n", |
683 | 0 | ref, ref_start, ref_end - ref_start + 1, |
684 | 0 | (int64_t)cpos, landmark, sz); |
685 | 0 | if (bgzf_write(fp, buf, strlen(buf)) < 0) |
686 | 0 | return -4; |
687 | 0 | } |
688 | | |
689 | 0 | return 0; |
690 | 0 | } |
691 | | |
692 | | /* |
693 | | * Adds a single slice to the index. |
694 | | */ |
695 | | int cram_index_slice(cram_fd *fd, |
696 | | cram_container *c, |
697 | | cram_slice *s, |
698 | | BGZF *fp, |
699 | | off_t cpos, |
700 | | off_t spos, // relative to cpos |
701 | 0 | off_t sz) { |
702 | 0 | int ret; |
703 | 0 | char buf[1024]; |
704 | |
|
705 | 0 | if (sz > INT_MAX) { |
706 | 0 | hts_log_error("CRAM slice is too big (%"PRId64" bytes)", |
707 | 0 | (int64_t) sz); |
708 | 0 | return -1; |
709 | 0 | } |
710 | | |
711 | 0 | if (s->hdr->ref_seq_id == -2) { |
712 | 0 | ret = cram_index_build_multiref(fd, c, s, fp, cpos, spos, sz); |
713 | 0 | } else { |
714 | 0 | snprintf(buf, sizeof(buf), |
715 | 0 | "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n", |
716 | 0 | s->hdr->ref_seq_id, s->hdr->ref_seq_start, |
717 | 0 | s->hdr->ref_seq_span, (int64_t)cpos, (int)spos, (int)sz); |
718 | 0 | ret = (bgzf_write(fp, buf, strlen(buf)) >= 0)? 0 : -4; |
719 | 0 | } |
720 | |
|
721 | 0 | return ret; |
722 | 0 | } |
723 | | |
724 | | /* |
725 | | * Adds a single container to the index. |
726 | | */ |
727 | | static |
728 | | int cram_index_container(cram_fd *fd, |
729 | | cram_container *c, |
730 | | BGZF *fp, |
731 | 0 | off_t cpos) { |
732 | 0 | int j; |
733 | 0 | off_t spos; |
734 | | |
735 | | // 2.0 format |
736 | 0 | for (j = 0; j < c->num_landmarks; j++) { |
737 | 0 | cram_slice *s; |
738 | 0 | off_t sz; |
739 | 0 | int ret; |
740 | |
|
741 | 0 | spos = htell(fd->fp); |
742 | 0 | if (spos - cpos - (off_t) c->offset != c->landmark[j]) { |
743 | 0 | hts_log_error("CRAM slice offset %"PRId64" does not match" |
744 | 0 | " landmark %d in container header (%"PRId32")", |
745 | 0 | (int64_t) (spos - cpos - (off_t) c->offset), |
746 | 0 | j, c->landmark[j]); |
747 | 0 | return -1; |
748 | 0 | } |
749 | | |
750 | 0 | if (!(s = cram_read_slice(fd))) { |
751 | 0 | return -1; |
752 | 0 | } |
753 | | |
754 | 0 | sz = htell(fd->fp) - spos; |
755 | 0 | ret = cram_index_slice(fd, c, s, fp, cpos, c->landmark[j], sz); |
756 | |
|
757 | 0 | cram_free_slice(s); |
758 | |
|
759 | 0 | if (ret < 0) { |
760 | 0 | return ret; |
761 | 0 | } |
762 | 0 | } |
763 | | |
764 | 0 | return 0; |
765 | 0 | } |
766 | | |
767 | | |
768 | | /* |
769 | | * Builds an index file. |
770 | | * |
771 | | * fd is a newly opened cram file that we wish to index. |
772 | | * fn_base is the filename of the associated CRAM file. |
773 | | * fn_idx is the filename of the index file to be written; |
774 | | * if NULL, we add ".crai" to fn_base to get the index filename. |
775 | | * |
776 | | * Returns 0 on success, |
777 | | * negative on failure (-1 for read failure, -4 for write failure) |
778 | | */ |
779 | 0 | int cram_index_build(cram_fd *fd, const char *fn_base, const char *fn_idx) { |
780 | 0 | cram_container *c; |
781 | 0 | off_t cpos, hpos; |
782 | 0 | BGZF *fp; |
783 | 0 | kstring_t fn_idx_str = {0}; |
784 | 0 | int64_t last_ref = -9, last_start = -9; |
785 | | |
786 | | // Useful for cram_index_build_multiref |
787 | 0 | cram_set_option(fd, CRAM_OPT_REQUIRED_FIELDS, SAM_RNAME | SAM_POS | SAM_CIGAR); |
788 | |
|
789 | 0 | if (! fn_idx) { |
790 | 0 | kputs(fn_base, &fn_idx_str); |
791 | 0 | kputs(".crai", &fn_idx_str); |
792 | 0 | fn_idx = fn_idx_str.s; |
793 | 0 | } |
794 | |
|
795 | 0 | if (!(fp = bgzf_open(fn_idx, "wg"))) { |
796 | 0 | perror(fn_idx); |
797 | 0 | free(fn_idx_str.s); |
798 | 0 | return -4; |
799 | 0 | } |
800 | | |
801 | 0 | free(fn_idx_str.s); |
802 | |
|
803 | 0 | cpos = htell(fd->fp); |
804 | 0 | while ((c = cram_read_container(fd))) { |
805 | 0 | if (fd->err) { |
806 | 0 | perror("Cram container read"); |
807 | 0 | return -1; |
808 | 0 | } |
809 | | |
810 | 0 | hpos = htell(fd->fp); |
811 | |
|
812 | 0 | if (!(c->comp_hdr_block = cram_read_block(fd))) |
813 | 0 | return -1; |
814 | 0 | assert(c->comp_hdr_block->content_type == COMPRESSION_HEADER); |
815 | | |
816 | 0 | c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block); |
817 | 0 | if (!c->comp_hdr) |
818 | 0 | return -1; |
819 | | |
820 | 0 | if (c->ref_seq_id == last_ref && c->ref_seq_start < last_start) { |
821 | 0 | hts_log_error("CRAM file is not sorted by chromosome / position"); |
822 | 0 | return -2; |
823 | 0 | } |
824 | 0 | last_ref = c->ref_seq_id; |
825 | 0 | last_start = c->ref_seq_start; |
826 | |
|
827 | 0 | if (cram_index_container(fd, c, fp, cpos) < 0) { |
828 | 0 | bgzf_close(fp); |
829 | 0 | return -1; |
830 | 0 | } |
831 | | |
832 | 0 | off_t next_cpos = htell(fd->fp); |
833 | 0 | if (next_cpos != hpos + c->length) { |
834 | 0 | hts_log_error("Length %"PRId32" in container header at offset %lld does not match block lengths (%lld)", |
835 | 0 | c->length, (long long) cpos, (long long) next_cpos - hpos); |
836 | 0 | return -1; |
837 | 0 | } |
838 | 0 | cpos = next_cpos; |
839 | |
|
840 | 0 | cram_free_container(c); |
841 | 0 | } |
842 | 0 | if (fd->err) { |
843 | 0 | bgzf_close(fp); |
844 | 0 | return -1; |
845 | 0 | } |
846 | | |
847 | 0 | return (bgzf_close(fp) >= 0)? 0 : -4; |
848 | 0 | } |
849 | | |
850 | | // internal recursive step |
851 | | static int64_t cram_num_containers_between_(cram_index *e, int64_t *last_pos, |
852 | | int64_t nct, |
853 | | off_t cstart, off_t cend, |
854 | 0 | int64_t *first, int64_t *last) { |
855 | 0 | int64_t nc = 0, i; |
856 | |
|
857 | 0 | if (e->offset) { |
858 | 0 | if (e->offset != *last_pos) { |
859 | 0 | if (e->offset >= cstart && (!cend || e->offset <= cend)) { |
860 | 0 | if (first && *first < 0) |
861 | 0 | *first = nct; |
862 | 0 | if (last) |
863 | 0 | *last = nct; |
864 | 0 | } |
865 | 0 | nc++; |
866 | 0 | } |
867 | | // else a new multi-ref in same container |
868 | 0 | *last_pos = e->offset; |
869 | 0 | } |
870 | |
|
871 | 0 | for (i = 0; i < e->nslice; i++) |
872 | 0 | nc += cram_num_containers_between_(&e->e[i], last_pos, nc + nct, |
873 | 0 | cstart, cend, first, last); |
874 | |
|
875 | 0 | return nc; |
876 | 0 | } |
877 | | |
878 | | /*! Returns the number of containers in the CRAM file within given offsets. |
879 | | * |
880 | | * The cstart and cend offsets are the locations of the start of containers |
881 | | * as returned by index_container_offset. |
882 | | * |
883 | | * If non-NULL, first and last will hold the inclusive range of container |
884 | | * numbers, counting from zero. |
885 | | * |
886 | | * @return |
887 | | * Returns the number of containers, equivalent to *last-*first+1. |
888 | | */ |
889 | | int64_t cram_num_containers_between(cram_fd *fd, |
890 | | off_t cstart, off_t cend, |
891 | 0 | int64_t *first, int64_t *last) { |
892 | 0 | int64_t nc = 0, i; |
893 | 0 | int64_t last_pos = -99; |
894 | 0 | int64_t l_first = -1, l_last = -1; |
895 | |
|
896 | 0 | for (i = 0; i < fd->index_sz; i++) { |
897 | 0 | int j = i+1 == fd->index_sz ? 0 : i+1; // maps "*" to end |
898 | 0 | nc += cram_num_containers_between_(&fd->index[j], &last_pos, nc, |
899 | 0 | cstart, cend, &l_first, &l_last); |
900 | 0 | } |
901 | |
|
902 | 0 | if (first) |
903 | 0 | *first = l_first; |
904 | 0 | if (last) |
905 | 0 | *last = l_last; |
906 | |
|
907 | 0 | return l_last - l_first + 1; |
908 | 0 | } |
909 | | |
910 | | /* |
911 | | * Queries the total number of distinct containers in the index. |
912 | | * Note there may be more containers in the file than in the index, as we |
913 | | * are not required to have an index entry for every one. |
914 | | */ |
915 | 0 | int64_t cram_num_containers(cram_fd *fd) { |
916 | 0 | return cram_num_containers_between(fd, 0, 0, NULL, NULL); |
917 | 0 | } |
918 | | |
919 | | |
920 | | /*! Returns the byte offset for the start of the n^th container. |
921 | | * |
922 | | * The index must have previously been loaded, otherwise <0 is returned. |
923 | | */ |
924 | | static cram_index *cram_container_num2offset_(cram_index *e, int num, |
925 | 0 | int64_t *last_pos, int *nc) { |
926 | 0 | if (e->offset) { |
927 | 0 | if (e->offset != *last_pos) { |
928 | 0 | if (*nc == num) |
929 | 0 | return e; |
930 | 0 | (*nc)++; |
931 | 0 | } |
932 | | // else a new multi-ref in same container |
933 | 0 | *last_pos = e->offset; |
934 | 0 | } |
935 | | |
936 | 0 | int i; |
937 | 0 | for (i = 0; i < e->nslice; i++) { |
938 | 0 | cram_index *tmp = cram_container_num2offset_(&e->e[i], num, |
939 | 0 | last_pos, nc); |
940 | 0 | if (tmp) |
941 | 0 | return tmp; |
942 | 0 | } |
943 | | |
944 | | |
945 | 0 | return NULL; |
946 | 0 | } |
947 | | |
948 | 0 | off_t cram_container_num2offset(cram_fd *fd, int64_t num) { |
949 | 0 | int nc = 0, i; |
950 | 0 | int64_t last_pos = -9; |
951 | 0 | cram_index *e = NULL; |
952 | |
|
953 | 0 | for (i = 0; i < fd->index_sz; i++) { |
954 | 0 | int j = i+1 == fd->index_sz ? 0 : i+1; // maps "*" to end |
955 | 0 | if (!fd->index[j].nslice) |
956 | 0 | continue; |
957 | 0 | if ((e = cram_container_num2offset_(&fd->index[j], num, |
958 | 0 | &last_pos, &nc))) |
959 | 0 | break; |
960 | 0 | } |
961 | |
|
962 | 0 | return e ? e->offset : -1; |
963 | 0 | } |
964 | | |
965 | | |
966 | | /*! Returns the container number for the first container at offset >= pos. |
967 | | * |
968 | | * The index must have previously been loaded, otherwise <0 is returned. |
969 | | */ |
970 | | static cram_index *cram_container_offset2num_(cram_index *e, off_t pos, |
971 | 0 | int64_t *last_pos, int *nc) { |
972 | 0 | if (e->offset) { |
973 | 0 | if (e->offset != *last_pos) { |
974 | 0 | if (e->offset >= pos) |
975 | 0 | return e; |
976 | 0 | (*nc)++; |
977 | 0 | } |
978 | | // else a new multi-ref in same container |
979 | 0 | *last_pos = e->offset; |
980 | 0 | } |
981 | | |
982 | 0 | int i; |
983 | 0 | for (i = 0; i < e->nslice; i++) { |
984 | 0 | cram_index *tmp = cram_container_offset2num_(&e->e[i], pos, |
985 | 0 | last_pos, nc); |
986 | 0 | if (tmp) |
987 | 0 | return tmp; |
988 | 0 | } |
989 | | |
990 | | |
991 | 0 | return NULL; |
992 | 0 | } |
993 | | |
994 | 0 | int64_t cram_container_offset2num(cram_fd *fd, off_t pos) { |
995 | 0 | int nc = 0, i; |
996 | 0 | int64_t last_pos = -9; |
997 | 0 | cram_index *e = NULL; |
998 | |
|
999 | 0 | for (i = 0; i < fd->index_sz; i++) { |
1000 | 0 | int j = i+1 == fd->index_sz ? 0 : i+1; // maps "*" to end |
1001 | 0 | if (!fd->index[j].nslice) |
1002 | 0 | continue; |
1003 | 0 | if ((e = cram_container_offset2num_(&fd->index[j], pos, |
1004 | 0 | &last_pos, &nc))) |
1005 | 0 | break; |
1006 | 0 | } |
1007 | |
|
1008 | 0 | return e ? nc : -1; |
1009 | 0 | } |
1010 | | |
1011 | | /*! |
1012 | | * Returns the file offsets of CRAM containers covering a specific region |
1013 | | * query. Note both offsets are the START of the container. |
1014 | | * |
1015 | | * first will point to the start of the first overlapping container |
1016 | | * last will point to the start of the last overlapping container |
1017 | | * |
1018 | | * Returns 0 on success |
1019 | | * <0 on failure |
1020 | | */ |
1021 | | int cram_index_extents(cram_fd *fd, int refid, hts_pos_t start, hts_pos_t end, |
1022 | 0 | off_t *first, off_t *last) { |
1023 | 0 | cram_index *ci; |
1024 | |
|
1025 | 0 | if (first) { |
1026 | 0 | if (!(ci = cram_index_query(fd, refid, start, NULL))) |
1027 | 0 | return -1; |
1028 | 0 | *first = ci->offset; |
1029 | 0 | } |
1030 | | |
1031 | 0 | if (last) { |
1032 | 0 | if (!(ci = cram_index_query_last(fd, refid, end))) |
1033 | 0 | return -1; |
1034 | 0 | *last = ci->offset; |
1035 | 0 | } |
1036 | | |
1037 | 0 | return 0; |
1038 | 0 | } |