/src/htslib/cram/cram_decode.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | Copyright (c) 2012-2020, 2022-2024 Genome Research Ltd. |
3 | | Author: James Bonfield <jkb@sanger.ac.uk> |
4 | | |
5 | | Redistribution and use in source and binary forms, with or without |
6 | | modification, are permitted provided that the following conditions are met: |
7 | | |
8 | | 1. Redistributions of source code must retain the above copyright notice, |
9 | | this list of conditions and the following disclaimer. |
10 | | |
11 | | 2. Redistributions in binary form must reproduce the above copyright notice, |
12 | | this list of conditions and the following disclaimer in the documentation |
13 | | and/or other materials provided with the distribution. |
14 | | |
15 | | 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger |
16 | | Institute nor the names of its contributors may be used to endorse or promote |
17 | | products derived from this software without specific prior written permission. |
18 | | |
19 | | THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND |
20 | | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
21 | | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
22 | | DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE |
23 | | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
24 | | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
25 | | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
26 | | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
27 | | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
28 | | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
29 | | */ |
30 | | |
31 | | /* |
32 | | * - In-memory decoding of CRAM data structures. |
33 | | * - Iterator for reading CRAM record by record. |
34 | | */ |
35 | | |
36 | | #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h |
37 | | #include <config.h> |
38 | | |
39 | | #include <stdio.h> |
40 | | #include <errno.h> |
41 | | #include <assert.h> |
42 | | #include <stdlib.h> |
43 | | #include <string.h> |
44 | | #include <zlib.h> |
45 | | #include <sys/types.h> |
46 | | #include <sys/stat.h> |
47 | | #include <math.h> |
48 | | #include <stdint.h> |
49 | | #include <inttypes.h> |
50 | | |
51 | | #include "cram.h" |
52 | | #include "os.h" |
53 | | #include "../htslib/hts.h" |
54 | | |
55 | | //Whether CIGAR has just M or uses = and X to indicate match and mismatch |
56 | | //#define USE_X |
57 | | |
58 | | /* ---------------------------------------------------------------------- |
59 | | * CRAM compression headers |
60 | | */ |
61 | | |
62 | | /* |
63 | | * Decodes the Tag Dictionary record in the preservation map |
64 | | * Updates the cram compression header. |
65 | | * |
66 | | * Returns number of bytes decoded on success |
67 | | * -1 on failure |
68 | | */ |
69 | | int cram_decode_TD(cram_fd *fd, char *cp, const char *endp, |
70 | 729 | cram_block_compression_hdr *h) { |
71 | 729 | char *op = cp; |
72 | 729 | unsigned char *dat; |
73 | 729 | cram_block *b; |
74 | 729 | int32_t blk_size = 0; |
75 | 729 | int nTL, i, sz, err = 0; |
76 | | |
77 | 729 | if (!(b = cram_new_block(0, 0))) |
78 | 0 | return -1; |
79 | | |
80 | 729 | if (h->TD_blk || h->TL) { |
81 | 408 | hts_log_warning("More than one TD block found in compression header"); |
82 | 408 | cram_free_block(h->TD_blk); |
83 | 408 | free(h->TL); |
84 | 408 | h->TD_blk = NULL; |
85 | 408 | h->TL = NULL; |
86 | 408 | } |
87 | | |
88 | | /* Decode */ |
89 | 729 | blk_size = fd->vv.varint_get32(&cp, endp, &err); |
90 | 729 | if (!blk_size) { |
91 | 132 | h->nTL = 0; |
92 | 132 | cram_free_block(b); |
93 | 132 | return cp - op; |
94 | 132 | } |
95 | | |
96 | 597 | if (err || blk_size < 0 || endp - cp < blk_size) { |
97 | 57 | cram_free_block(b); |
98 | 57 | return -1; |
99 | 57 | } |
100 | | |
101 | 540 | BLOCK_APPEND(b, cp, blk_size); |
102 | 540 | cp += blk_size; |
103 | 540 | sz = cp - op; |
104 | | // Force nul termination if missing |
105 | 540 | if (BLOCK_DATA(b)[BLOCK_SIZE(b)-1]) |
106 | 357 | BLOCK_APPEND_CHAR(b, '\0'); |
107 | | |
108 | | /* Set up TL lookup table */ |
109 | 540 | dat = BLOCK_DATA(b); |
110 | | |
111 | | // Count |
112 | 6.29k | for (nTL = i = 0; i < BLOCK_SIZE(b); i++) { |
113 | 5.75k | nTL++; |
114 | 32.1k | while (dat[i]) |
115 | 26.3k | i++; |
116 | 5.75k | } |
117 | | |
118 | | // Copy |
119 | 540 | if (!(h->TL = calloc(nTL, sizeof(*h->TL)))) { |
120 | 0 | cram_free_block(b); |
121 | 0 | return -1; |
122 | 0 | } |
123 | 6.29k | for (nTL = i = 0; i < BLOCK_SIZE(b); i++) { |
124 | 5.75k | h->TL[nTL++] = &dat[i]; |
125 | 32.1k | while (dat[i]) |
126 | 26.3k | i++; |
127 | 5.75k | } |
128 | 540 | h->TD_blk = b; |
129 | 540 | h->nTL = nTL; |
130 | | |
131 | 540 | return sz; |
132 | | |
133 | 0 | block_err: |
134 | 0 | cram_free_block(b); |
135 | 0 | return -1; |
136 | 540 | } |
137 | | |
138 | | /* |
139 | | * Decodes a CRAM block compression header. |
140 | | * Returns header ptr on success |
141 | | * NULL on failure |
142 | | */ |
143 | | cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, |
144 | 7.80k | cram_block *b) { |
145 | 7.80k | char *cp, *endp, *cp_copy; |
146 | 7.80k | cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr)); |
147 | 7.80k | int i, err = 0; |
148 | 7.80k | int32_t map_size = 0, map_count = 0; |
149 | | |
150 | 7.80k | if (!hdr) |
151 | 0 | return NULL; |
152 | | |
153 | 7.80k | if (b->method != RAW) { |
154 | 5.36k | if (cram_uncompress_block(b)) { |
155 | 4.63k | free(hdr); |
156 | 4.63k | return NULL; |
157 | 4.63k | } |
158 | 5.36k | } |
159 | | |
160 | 3.16k | cp = (char *)b->data; |
161 | 3.16k | endp = cp + b->uncomp_size; |
162 | | |
163 | 3.16k | if (CRAM_MAJOR_VERS(fd->version) == 1) { |
164 | 2.93k | hdr->ref_seq_id = fd->vv.varint_get32(&cp, endp, &err); |
165 | 2.93k | if (CRAM_MAJOR_VERS(fd->version) >= 4) { |
166 | 0 | hdr->ref_seq_start = fd->vv.varint_get64(&cp, endp, &err); |
167 | 0 | hdr->ref_seq_span = fd->vv.varint_get64(&cp, endp, &err); |
168 | 2.93k | } else { |
169 | 2.93k | hdr->ref_seq_start = fd->vv.varint_get32(&cp, endp, &err); |
170 | 2.93k | hdr->ref_seq_span = fd->vv.varint_get32(&cp, endp, &err); |
171 | 2.93k | } |
172 | 2.93k | hdr->num_records = fd->vv.varint_get32(&cp, endp, &err); |
173 | 2.93k | hdr->num_landmarks = fd->vv.varint_get32(&cp, endp, &err); |
174 | 2.93k | if (hdr->num_landmarks < 0 || |
175 | 2.93k | hdr->num_landmarks >= SIZE_MAX / sizeof(int32_t) || |
176 | 2.93k | endp - cp < hdr->num_landmarks) { |
177 | 171 | free(hdr); |
178 | 171 | return NULL; |
179 | 171 | } |
180 | 2.76k | if (!(hdr->landmark = malloc(hdr->num_landmarks * sizeof(int32_t)))) { |
181 | 0 | free(hdr); |
182 | 0 | return NULL; |
183 | 0 | } |
184 | 112k | for (i = 0; i < hdr->num_landmarks; i++) |
185 | 110k | hdr->landmark[i] = fd->vv.varint_get32(&cp, endp, &err);; |
186 | 2.76k | } |
187 | | |
188 | 2.99k | hdr->preservation_map = kh_init(map); |
189 | | |
190 | 2.99k | memset(hdr->rec_encoding_map, 0, |
191 | 2.99k | CRAM_MAP_HASH * sizeof(hdr->rec_encoding_map[0])); |
192 | 2.99k | memset(hdr->tag_encoding_map, 0, |
193 | 2.99k | CRAM_MAP_HASH * sizeof(hdr->tag_encoding_map[0])); |
194 | | |
195 | 2.99k | if (!hdr->preservation_map) { |
196 | 0 | cram_free_compression_header(hdr); |
197 | 0 | return NULL; |
198 | 0 | } |
199 | | |
200 | | /* Initialise defaults for preservation map */ |
201 | 2.99k | hdr->read_names_included = 0; |
202 | 2.99k | hdr->AP_delta = 1; |
203 | 2.99k | hdr->qs_seq_orient = 1; |
204 | 2.99k | memcpy(hdr->substitution_matrix, "CGTNAGTNACTNACGNACGT", 20); |
205 | | |
206 | | /* Preservation map */ |
207 | 2.99k | map_size = fd->vv.varint_get32(&cp, endp, &err); cp_copy = cp; |
208 | 2.99k | map_count = fd->vv.varint_get32(&cp, endp, &err); |
209 | 89.0k | for (i = 0; i < map_count; i++) { |
210 | 86.3k | pmap_t hd; |
211 | 86.3k | khint_t k; |
212 | 86.3k | int r; |
213 | | |
214 | 86.3k | if (endp - cp < 3) { |
215 | 255 | cram_free_compression_header(hdr); |
216 | 255 | return NULL; |
217 | 255 | } |
218 | 86.0k | cp += 2; |
219 | 86.0k | switch(CRAM_KEY(cp[-2],cp[-1])) { |
220 | 27 | case CRAM_KEY('M','I'): // was mapped QS included in V1.0 |
221 | 111 | case CRAM_KEY('U','I'): // was unmapped QS included in V1.0 |
222 | 168 | case CRAM_KEY('P','I'): // was unmapped placed in V1.0 |
223 | 168 | hd.i = *cp++; |
224 | 168 | break; |
225 | | |
226 | 216 | case CRAM_KEY('R','N'): |
227 | 216 | hd.i = *cp++; |
228 | 216 | k = kh_put(map, hdr->preservation_map, "RN", &r); |
229 | 216 | if (-1 == r) { |
230 | 0 | cram_free_compression_header(hdr); |
231 | 0 | return NULL; |
232 | 0 | } |
233 | | |
234 | 216 | kh_val(hdr->preservation_map, k) = hd; |
235 | 216 | hdr->read_names_included = hd.i; |
236 | 216 | break; |
237 | | |
238 | 189 | case CRAM_KEY('A','P'): |
239 | 189 | hd.i = *cp++; |
240 | 189 | k = kh_put(map, hdr->preservation_map, "AP", &r); |
241 | 189 | if (-1 == r) { |
242 | 0 | cram_free_compression_header(hdr); |
243 | 0 | return NULL; |
244 | 0 | } |
245 | | |
246 | 189 | kh_val(hdr->preservation_map, k) = hd; |
247 | 189 | hdr->AP_delta = hd.i; |
248 | 189 | break; |
249 | | |
250 | 3.19k | case CRAM_KEY('R','R'): |
251 | 3.19k | hd.i = *cp++; |
252 | 3.19k | k = kh_put(map, hdr->preservation_map, "RR", &r); |
253 | 3.19k | if (-1 == r) { |
254 | 0 | cram_free_compression_header(hdr); |
255 | 0 | return NULL; |
256 | 0 | } |
257 | | |
258 | 3.19k | kh_val(hdr->preservation_map, k) = hd; |
259 | 3.19k | hdr->no_ref = !hd.i; |
260 | 3.19k | break; |
261 | | |
262 | 357 | case CRAM_KEY('Q','O'): |
263 | 357 | hd.i = *cp++; |
264 | 357 | k = kh_put(map, hdr->preservation_map, "QO", &r); |
265 | 357 | if (-1 == r) { |
266 | 0 | cram_free_compression_header(hdr); |
267 | 0 | return NULL; |
268 | 0 | } |
269 | | |
270 | 357 | kh_val(hdr->preservation_map, k) = hd; |
271 | 357 | hdr->qs_seq_orient = hd.i; |
272 | 357 | break; |
273 | | |
274 | 942 | case CRAM_KEY('S','M'): |
275 | 942 | if (endp - cp < 5) { |
276 | 6 | cram_free_compression_header(hdr); |
277 | 6 | return NULL; |
278 | 6 | } |
279 | 936 | hdr->substitution_matrix[0][(cp[0]>>6)&3] = 'C'; |
280 | 936 | hdr->substitution_matrix[0][(cp[0]>>4)&3] = 'G'; |
281 | 936 | hdr->substitution_matrix[0][(cp[0]>>2)&3] = 'T'; |
282 | 936 | hdr->substitution_matrix[0][(cp[0]>>0)&3] = 'N'; |
283 | | |
284 | 936 | hdr->substitution_matrix[1][(cp[1]>>6)&3] = 'A'; |
285 | 936 | hdr->substitution_matrix[1][(cp[1]>>4)&3] = 'G'; |
286 | 936 | hdr->substitution_matrix[1][(cp[1]>>2)&3] = 'T'; |
287 | 936 | hdr->substitution_matrix[1][(cp[1]>>0)&3] = 'N'; |
288 | | |
289 | 936 | hdr->substitution_matrix[2][(cp[2]>>6)&3] = 'A'; |
290 | 936 | hdr->substitution_matrix[2][(cp[2]>>4)&3] = 'C'; |
291 | 936 | hdr->substitution_matrix[2][(cp[2]>>2)&3] = 'T'; |
292 | 936 | hdr->substitution_matrix[2][(cp[2]>>0)&3] = 'N'; |
293 | | |
294 | 936 | hdr->substitution_matrix[3][(cp[3]>>6)&3] = 'A'; |
295 | 936 | hdr->substitution_matrix[3][(cp[3]>>4)&3] = 'C'; |
296 | 936 | hdr->substitution_matrix[3][(cp[3]>>2)&3] = 'G'; |
297 | 936 | hdr->substitution_matrix[3][(cp[3]>>0)&3] = 'N'; |
298 | | |
299 | 936 | hdr->substitution_matrix[4][(cp[4]>>6)&3] = 'A'; |
300 | 936 | hdr->substitution_matrix[4][(cp[4]>>4)&3] = 'C'; |
301 | 936 | hdr->substitution_matrix[4][(cp[4]>>2)&3] = 'G'; |
302 | 936 | hdr->substitution_matrix[4][(cp[4]>>0)&3] = 'T'; |
303 | | |
304 | 936 | hd.p = cp; |
305 | 936 | cp += 5; |
306 | | |
307 | 936 | k = kh_put(map, hdr->preservation_map, "SM", &r); |
308 | 936 | if (-1 == r) { |
309 | 0 | cram_free_compression_header(hdr); |
310 | 0 | return NULL; |
311 | 0 | } |
312 | 936 | kh_val(hdr->preservation_map, k) = hd; |
313 | 936 | break; |
314 | | |
315 | 729 | case CRAM_KEY('T','D'): { |
316 | 729 | int sz = cram_decode_TD(fd, cp, endp, hdr); // tag dictionary |
317 | 729 | if (sz < 0) { |
318 | 57 | cram_free_compression_header(hdr); |
319 | 57 | return NULL; |
320 | 57 | } |
321 | | |
322 | 672 | hd.p = cp; |
323 | 672 | cp += sz; |
324 | | |
325 | 672 | k = kh_put(map, hdr->preservation_map, "TD", &r); |
326 | 672 | if (-1 == r) { |
327 | 0 | cram_free_compression_header(hdr); |
328 | 0 | return NULL; |
329 | 0 | } |
330 | 672 | kh_val(hdr->preservation_map, k) = hd; |
331 | 672 | break; |
332 | 672 | } |
333 | | |
334 | 80.2k | default: |
335 | 80.2k | hts_log_warning("Unrecognised preservation map key %c%c", cp[-2], cp[-1]); |
336 | | // guess byte; |
337 | 80.2k | cp++; |
338 | 80.2k | break; |
339 | 86.0k | } |
340 | 86.0k | } |
341 | 2.67k | if (cp - cp_copy != map_size) { |
342 | 189 | cram_free_compression_header(hdr); |
343 | 189 | return NULL; |
344 | 189 | } |
345 | | |
346 | | /* Record encoding map */ |
347 | 2.48k | map_size = fd->vv.varint_get32(&cp, endp, &err); cp_copy = cp; |
348 | 2.48k | map_count = fd->vv.varint_get32(&cp, endp, &err); |
349 | 2.48k | int is_v4 = CRAM_MAJOR_VERS(fd->version) >= 4 ? 1 : 0; |
350 | 77.7k | for (i = 0; i < map_count; i++) { |
351 | 76.6k | char *key = cp; |
352 | 76.6k | int32_t encoding = E_NULL; |
353 | 76.6k | int32_t size = 0; |
354 | 76.6k | ptrdiff_t offset; |
355 | 76.6k | cram_map *m; |
356 | 76.6k | enum cram_DS_ID ds_id; |
357 | 76.6k | enum cram_external_type type; |
358 | | |
359 | 76.6k | if (endp - cp < 4) { |
360 | 129 | cram_free_compression_header(hdr); |
361 | 129 | return NULL; |
362 | 129 | } |
363 | | |
364 | 76.4k | cp += 2; |
365 | 76.4k | encoding = fd->vv.varint_get32(&cp, endp, &err); |
366 | 76.4k | size = fd->vv.varint_get32(&cp, endp, &err); |
367 | | |
368 | 76.4k | offset = cp - (char *)b->data; |
369 | | |
370 | 76.4k | if (encoding == E_NULL) |
371 | 22.3k | continue; |
372 | | |
373 | 54.1k | if (size < 0 || endp - cp < size) { |
374 | 447 | cram_free_compression_header(hdr); |
375 | 447 | return NULL; |
376 | 447 | } |
377 | | |
378 | | //printf("%s codes for %.2s\n", cram_encoding2str(encoding), key); |
379 | | |
380 | | /* |
381 | | * For CRAM1.0 CF and BF are Byte and not Int. |
382 | | * Practically speaking it makes no difference unless we have a |
383 | | * 1.0 format file that stores these in EXTERNAL as only then |
384 | | * does Byte vs Int matter. |
385 | | * |
386 | | * Neither this C code nor Java reference implementations did this, |
387 | | * so we gloss over it and treat them as int. |
388 | | */ |
389 | 53.6k | ds_id = DS_CORE; |
390 | 53.6k | if (key[0] == 'B' && key[1] == 'F') { |
391 | 165 | ds_id = DS_BF; type = E_INT; |
392 | 53.5k | } else if (key[0] == 'C' && key[1] == 'F') { |
393 | 363 | ds_id = DS_CF; type = E_INT; |
394 | 53.1k | } else if (key[0] == 'R' && key[1] == 'I') { |
395 | 177 | ds_id = DS_RI; type = E_INT; |
396 | 52.9k | } else if (key[0] == 'R' && key[1] == 'L') { |
397 | 30 | ds_id = DS_RL; type = E_INT; |
398 | 52.9k | } else if (key[0] == 'A' && key[1] == 'P') { |
399 | 1.13k | ds_id = DS_AP; |
400 | 1.13k | type = is_v4 ? E_SLONG : E_INT; |
401 | 51.8k | } else if (key[0] == 'R' && key[1] == 'G') { |
402 | 99 | ds_id = DS_RG; |
403 | 99 | type = E_INT; |
404 | 51.7k | } else if (key[0] == 'M' && key[1] == 'F') { |
405 | 60 | ds_id = DS_MF; type = E_INT; |
406 | 51.6k | } else if (key[0] == 'N' && key[1] == 'S') { |
407 | 495 | ds_id = DS_NS; type = E_INT; |
408 | 51.1k | } else if (key[0] == 'N' && key[1] == 'P') { |
409 | 2.41k | ds_id = DS_NP; |
410 | 2.41k | type = is_v4 ? E_LONG : E_INT; |
411 | 48.7k | } else if (key[0] == 'T' && key[1] == 'S') { |
412 | 69 | ds_id = DS_TS; |
413 | 69 | type = is_v4 ? E_SLONG : E_INT; |
414 | 48.6k | } else if (key[0] == 'N' && key[1] == 'F') { |
415 | 144 | ds_id = DS_NF; type = E_INT; |
416 | 48.5k | } else if (key[0] == 'T' && key[1] == 'C') { |
417 | 141 | ds_id = DS_TC; type = E_BYTE; |
418 | 48.3k | } else if (key[0] == 'T' && key[1] == 'N') { |
419 | 3 | ds_id = DS_TN; type = E_INT; |
420 | 48.3k | } else if (key[0] == 'F' && key[1] == 'N') { |
421 | 3 | ds_id = DS_FN; type = E_INT; |
422 | 48.3k | } else if (key[0] == 'F' && key[1] == 'C') { |
423 | 396 | ds_id = DS_FC; type = E_BYTE; |
424 | 47.9k | } else if (key[0] == 'F' && key[1] == 'P') { |
425 | 72 | ds_id = DS_FP; type = E_INT; |
426 | 47.9k | } else if (key[0] == 'B' && key[1] == 'S') { |
427 | 36 | ds_id = DS_BS; type = E_BYTE; |
428 | 47.8k | } else if (key[0] == 'I' && key[1] == 'N') { |
429 | 69 | ds_id = DS_IN; type = E_BYTE_ARRAY; |
430 | 47.8k | } else if (key[0] == 'S' && key[1] == 'C') { |
431 | 3 | ds_id = DS_SC; type = E_BYTE_ARRAY; |
432 | 47.8k | } else if (key[0] == 'D' && key[1] == 'L') { |
433 | 90 | ds_id = DS_DL; type = E_INT; |
434 | 47.7k | } else if (key[0] == 'B' && key[1] == 'A') { |
435 | 177 | ds_id = DS_BA; type = E_BYTE; |
436 | 47.5k | } else if (key[0] == 'B' && key[1] == 'B') { |
437 | 141 | ds_id = DS_BB; type = E_BYTE_ARRAY; |
438 | 47.4k | } else if (key[0] == 'R' && key[1] == 'S') { |
439 | 285 | ds_id = DS_RS; type = E_INT; |
440 | 47.1k | } else if (key[0] == 'P' && key[1] == 'D') { |
441 | 3 | ds_id = DS_PD; type = E_INT; |
442 | 47.1k | } else if (key[0] == 'H' && key[1] == 'C') { |
443 | 12 | ds_id = DS_HC; type = E_INT; |
444 | 47.1k | } else if (key[0] == 'M' && key[1] == 'Q') { |
445 | 6.03k | ds_id = DS_MQ; type = E_INT; |
446 | 41.0k | } else if (key[0] == 'R' && key[1] == 'N') { |
447 | 30 | ds_id = DS_RN; type = E_BYTE_ARRAY_BLOCK; |
448 | 41.0k | } else if (key[0] == 'Q' && key[1] == 'S') { |
449 | 240 | ds_id = DS_QS; type = E_BYTE; |
450 | 40.7k | } else if (key[0] == 'Q' && key[1] == 'Q') { |
451 | 9 | ds_id = DS_QQ; type = E_BYTE_ARRAY; |
452 | 40.7k | } else if (key[0] == 'T' && key[1] == 'L') { |
453 | 3 | ds_id = DS_TL; type = E_INT; |
454 | 40.7k | } else if (key[0] == 'T' && key[1] == 'M') { |
455 | 40.7k | } else if (key[0] == 'T' && key[1] == 'V') { |
456 | 40.7k | } else { |
457 | 40.7k | hts_log_warning("Unrecognised key: %.2s", key); |
458 | 40.7k | } |
459 | | |
460 | 53.6k | if (ds_id != DS_CORE) { |
461 | 12.9k | if (hdr->codecs[ds_id] != NULL) { |
462 | 11.2k | hts_log_warning("Codec for key %.2s defined more than once", |
463 | 11.2k | key); |
464 | 11.2k | hdr->codecs[ds_id]->free(hdr->codecs[ds_id]); |
465 | 11.2k | } |
466 | 12.9k | hdr->codecs[ds_id] = cram_decoder_init(hdr, encoding, cp, size, |
467 | 12.9k | type, fd->version, &fd->vv); |
468 | 12.9k | if (!hdr->codecs[ds_id]) { |
469 | 822 | cram_free_compression_header(hdr); |
470 | 822 | return NULL; |
471 | 822 | } |
472 | 12.9k | } |
473 | | |
474 | 52.8k | cp += size; |
475 | | |
476 | | // Fill out cram_map purely for cram_dump to dump out. |
477 | 52.8k | m = malloc(sizeof(*m)); |
478 | 52.8k | if (!m) { |
479 | 0 | cram_free_compression_header(hdr); |
480 | 0 | return NULL; |
481 | 0 | } |
482 | 52.8k | m->key = CRAM_KEY(key[0], key[1]); |
483 | 52.8k | m->encoding = encoding; |
484 | 52.8k | m->size = size; |
485 | 52.8k | m->offset = offset; |
486 | 52.8k | m->codec = NULL; |
487 | | |
488 | 52.8k | m->next = hdr->rec_encoding_map[CRAM_MAP(key[0], key[1])]; |
489 | 52.8k | hdr->rec_encoding_map[CRAM_MAP(key[0], key[1])] = m; |
490 | 52.8k | } |
491 | 1.09k | if (cp - cp_copy != map_size) { |
492 | 126 | cram_free_compression_header(hdr); |
493 | 126 | return NULL; |
494 | 126 | } |
495 | | |
496 | | /* Tag encoding map */ |
497 | 965 | map_size = fd->vv.varint_get32(&cp, endp, &err); cp_copy = cp; |
498 | 965 | map_count = fd->vv.varint_get32(&cp, endp, &err); |
499 | 2.93k | for (i = 0; i < map_count; i++) { |
500 | 2.13k | int32_t encoding = E_NULL; |
501 | 2.13k | int32_t size = 0; |
502 | 2.13k | cram_map *m = malloc(sizeof(*m)); // FIXME: use pooled_alloc |
503 | 2.13k | uint8_t key[3]; |
504 | | |
505 | 2.13k | if (!m || endp - cp < 6) { |
506 | 9 | free(m); |
507 | 9 | cram_free_compression_header(hdr); |
508 | 9 | return NULL; |
509 | 9 | } |
510 | | |
511 | 2.12k | m->key = fd->vv.varint_get32(&cp, endp, &err); |
512 | 2.12k | key[0] = m->key>>16; |
513 | 2.12k | key[1] = m->key>>8; |
514 | 2.12k | key[2] = m->key; |
515 | 2.12k | encoding = fd->vv.varint_get32(&cp, endp, &err); |
516 | 2.12k | size = fd->vv.varint_get32(&cp, endp, &err); |
517 | | |
518 | 2.12k | m->encoding = encoding; |
519 | 2.12k | m->size = size; |
520 | 2.12k | m->offset = cp - (char *)b->data; |
521 | 2.12k | if (size < 0 || endp - cp < size || |
522 | 2.12k | !(m->codec = cram_decoder_init(hdr, encoding, cp, size, |
523 | 2.07k | E_BYTE_ARRAY_BLOCK, fd->version, &fd->vv))) { |
524 | 159 | cram_free_compression_header(hdr); |
525 | 159 | free(m); |
526 | 159 | return NULL; |
527 | 159 | } |
528 | | |
529 | 1.96k | cp += size; |
530 | | |
531 | 1.96k | m->next = hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])]; |
532 | 1.96k | hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])] = m; |
533 | 1.96k | } |
534 | 797 | if (err || cp - cp_copy != map_size) { |
535 | 170 | cram_free_compression_header(hdr); |
536 | 170 | return NULL; |
537 | 170 | } |
538 | | |
539 | 627 | return hdr; |
540 | 797 | } |
541 | | |
542 | | /* |
543 | | * Note we also need to scan through the record encoding map to |
544 | | * see which data series share the same block, either external or |
545 | | * CORE. For example if we need the BF data series but MQ and CF |
546 | | * are also encoded in the same block then we need to add those in |
547 | | * as a dependency in order to correctly decode BF. |
548 | | * |
549 | | * Returns 0 on success |
550 | | * -1 on failure |
551 | | */ |
552 | | int cram_dependent_data_series(cram_fd *fd, |
553 | | cram_block_compression_hdr *hdr, |
554 | 480 | cram_slice *s) { |
555 | 480 | int *block_used; |
556 | 480 | int core_used = 0; |
557 | 480 | int i; |
558 | 480 | static int i_to_id[] = { |
559 | 480 | DS_BF, DS_AP, DS_FP, DS_RL, DS_DL, DS_NF, DS_BA, DS_QS, |
560 | 480 | DS_FC, DS_FN, DS_BS, DS_IN, DS_RG, DS_MQ, DS_TL, DS_RN, |
561 | 480 | DS_NS, DS_NP, DS_TS, DS_MF, DS_CF, DS_RI, DS_RS, DS_PD, |
562 | 480 | DS_HC, DS_SC, DS_BB, DS_QQ, |
563 | 480 | }; |
564 | 480 | uint32_t orig_ds; |
565 | | |
566 | | /* |
567 | | * Set the data_series bit field based on fd->required_fields |
568 | | * contents. |
569 | | */ |
570 | 480 | if (fd->required_fields && fd->required_fields != INT_MAX) { |
571 | 0 | s->data_series = 0; |
572 | |
|
573 | 0 | if (fd->required_fields & SAM_QNAME) |
574 | 0 | s->data_series |= CRAM_RN; |
575 | |
|
576 | 0 | if (fd->required_fields & SAM_FLAG) |
577 | 0 | s->data_series |= CRAM_BF; |
578 | |
|
579 | 0 | if (fd->required_fields & SAM_RNAME) |
580 | 0 | s->data_series |= CRAM_RI | CRAM_BF; |
581 | |
|
582 | 0 | if (fd->required_fields & SAM_POS) |
583 | 0 | s->data_series |= CRAM_AP | CRAM_BF; |
584 | |
|
585 | 0 | if (fd->required_fields & SAM_MAPQ) |
586 | 0 | s->data_series |= CRAM_MQ; |
587 | |
|
588 | 0 | if (fd->required_fields & SAM_CIGAR) |
589 | 0 | s->data_series |= CRAM_CIGAR; |
590 | |
|
591 | 0 | if (fd->required_fields & SAM_RNEXT) |
592 | 0 | s->data_series |= CRAM_CF | CRAM_NF | CRAM_RI | CRAM_NS |CRAM_BF; |
593 | |
|
594 | 0 | if (fd->required_fields & SAM_PNEXT) |
595 | 0 | s->data_series |= CRAM_CF | CRAM_NF | CRAM_AP | CRAM_NP | CRAM_BF; |
596 | |
|
597 | 0 | if (fd->required_fields & SAM_TLEN) |
598 | 0 | s->data_series |= CRAM_CF | CRAM_NF | CRAM_AP | CRAM_TS | |
599 | 0 | CRAM_BF | CRAM_MF | CRAM_RI | CRAM_CIGAR; |
600 | |
|
601 | 0 | if (fd->required_fields & SAM_SEQ) |
602 | 0 | s->data_series |= CRAM_SEQ; |
603 | |
|
604 | 0 | if (!(fd->required_fields & SAM_AUX)) |
605 | | // No easy way to get MD/NM without other tags at present |
606 | 0 | s->decode_md = 0; |
607 | |
|
608 | 0 | if (fd->required_fields & SAM_QUAL) |
609 | 0 | s->data_series |= CRAM_QUAL; |
610 | |
|
611 | 0 | if (fd->required_fields & SAM_AUX) |
612 | 0 | s->data_series |= CRAM_RG | CRAM_TL | CRAM_aux; |
613 | |
|
614 | 0 | if (fd->required_fields & SAM_RGAUX) |
615 | 0 | s->data_series |= CRAM_RG | CRAM_BF; |
616 | | |
617 | | // Always uncompress CORE block |
618 | 0 | if (cram_uncompress_block(s->block[0])) |
619 | 0 | return -1; |
620 | 480 | } else { |
621 | 480 | s->data_series = CRAM_ALL; |
622 | | |
623 | 2.35k | for (i = 0; i < s->hdr->num_blocks; i++) { |
624 | 1.88k | if (cram_uncompress_block(s->block[i])) |
625 | 9 | return -1; |
626 | 1.88k | } |
627 | | |
628 | 471 | return 0; |
629 | 480 | } |
630 | | |
631 | 0 | block_used = calloc(s->hdr->num_blocks+1, sizeof(int)); |
632 | 0 | if (!block_used) |
633 | 0 | return -1; |
634 | | |
635 | 0 | do { |
636 | | /* |
637 | | * Also set data_series based on code prerequisites. Eg if we need |
638 | | * CRAM_QS then we also need to know CRAM_RL so we know how long it |
639 | | * is, or if we need FC/FP then we also need FN (number of features). |
640 | | * |
641 | | * It's not reciprocal though. We may be needing to decode FN |
642 | | * but have no need to decode FC, FP and cigar ops. |
643 | | */ |
644 | 0 | if (s->data_series & CRAM_RS) s->data_series |= CRAM_FC|CRAM_FP; |
645 | 0 | if (s->data_series & CRAM_PD) s->data_series |= CRAM_FC|CRAM_FP; |
646 | 0 | if (s->data_series & CRAM_HC) s->data_series |= CRAM_FC|CRAM_FP; |
647 | 0 | if (s->data_series & CRAM_QS) s->data_series |= CRAM_FC|CRAM_FP; |
648 | 0 | if (s->data_series & CRAM_IN) s->data_series |= CRAM_FC|CRAM_FP; |
649 | 0 | if (s->data_series & CRAM_SC) s->data_series |= CRAM_FC|CRAM_FP; |
650 | 0 | if (s->data_series & CRAM_BS) s->data_series |= CRAM_FC|CRAM_FP; |
651 | 0 | if (s->data_series & CRAM_DL) s->data_series |= CRAM_FC|CRAM_FP; |
652 | 0 | if (s->data_series & CRAM_BA) s->data_series |= CRAM_FC|CRAM_FP; |
653 | 0 | if (s->data_series & CRAM_BB) s->data_series |= CRAM_FC|CRAM_FP; |
654 | 0 | if (s->data_series & CRAM_QQ) s->data_series |= CRAM_FC|CRAM_FP; |
655 | | |
656 | | // cram_decode_seq() needs seq[] array |
657 | 0 | if (s->data_series & (CRAM_SEQ|CRAM_CIGAR)) s->data_series |= CRAM_RL; |
658 | |
|
659 | 0 | if (s->data_series & CRAM_FP) s->data_series |= CRAM_FC; |
660 | 0 | if (s->data_series & CRAM_FC) s->data_series |= CRAM_FN; |
661 | 0 | if (s->data_series & CRAM_aux) s->data_series |= CRAM_TL; |
662 | 0 | if (s->data_series & CRAM_MF) s->data_series |= CRAM_CF; |
663 | 0 | if (s->data_series & CRAM_MQ) s->data_series |= CRAM_BF; |
664 | 0 | if (s->data_series & CRAM_BS) s->data_series |= CRAM_RI; |
665 | 0 | if (s->data_series & (CRAM_MF |CRAM_NS |CRAM_NP |CRAM_TS |CRAM_NF)) |
666 | 0 | s->data_series |= CRAM_CF; |
667 | 0 | if (!hdr->read_names_included && s->data_series & CRAM_RN) |
668 | 0 | s->data_series |= CRAM_CF | CRAM_NF; |
669 | 0 | if (s->data_series & (CRAM_BA | CRAM_QS | CRAM_BB | CRAM_QQ)) |
670 | 0 | s->data_series |= CRAM_BF | CRAM_CF | CRAM_RL; |
671 | 0 | if (s->data_series & CRAM_FN) { |
672 | | // The CRAM_FN loop checks for reference length boundaries, |
673 | | // which needs a working seq_pos. Some fields are fixed size |
674 | | // irrespective of if we decode (BS), but others need to know |
675 | | // the size of the string fetched back (SC, IN, BB). |
676 | 0 | s->data_series |= CRAM_SC | CRAM_IN | CRAM_BB; |
677 | 0 | } |
678 | |
|
679 | 0 | orig_ds = s->data_series; |
680 | | |
681 | | // Find which blocks are in use. |
682 | 0 | for (i = 0; i < sizeof(i_to_id)/sizeof(*i_to_id); i++) { |
683 | 0 | int bnum1, bnum2, j; |
684 | 0 | cram_codec *c = hdr->codecs[i_to_id[i]]; |
685 | |
|
686 | 0 | if (!(s->data_series & (1<<i))) |
687 | 0 | continue; |
688 | | |
689 | 0 | if (!c) |
690 | 0 | continue; |
691 | | |
692 | 0 | bnum1 = cram_codec_to_id(c, &bnum2); |
693 | |
|
694 | 0 | for (;;) { |
695 | 0 | switch (bnum1) { |
696 | 0 | case -2: |
697 | 0 | break; |
698 | | |
699 | 0 | case -1: |
700 | 0 | core_used = 1; |
701 | 0 | break; |
702 | | |
703 | 0 | default: |
704 | 0 | for (j = 0; j < s->hdr->num_blocks; j++) { |
705 | 0 | if (s->block[j]->content_type == EXTERNAL && |
706 | 0 | s->block[j]->content_id == bnum1) { |
707 | 0 | block_used[j] = 1; |
708 | 0 | if (cram_uncompress_block(s->block[j])) { |
709 | 0 | free(block_used); |
710 | 0 | return -1; |
711 | 0 | } |
712 | 0 | } |
713 | 0 | } |
714 | 0 | break; |
715 | 0 | } |
716 | | |
717 | 0 | if (bnum2 == -2 || bnum1 == bnum2) |
718 | 0 | break; |
719 | | |
720 | 0 | bnum1 = bnum2; // 2nd pass |
721 | 0 | } |
722 | 0 | } |
723 | | |
724 | | // Tags too |
725 | 0 | if ((fd->required_fields & SAM_AUX) || |
726 | 0 | (s->data_series & CRAM_aux)) { |
727 | 0 | for (i = 0; i < CRAM_MAP_HASH; i++) { |
728 | 0 | int bnum1, bnum2, j; |
729 | 0 | cram_map *m = hdr->tag_encoding_map[i]; |
730 | |
|
731 | 0 | while (m) { |
732 | 0 | cram_codec *c = m->codec; |
733 | 0 | if (!c) { |
734 | 0 | m = m->next; |
735 | 0 | continue; |
736 | 0 | } |
737 | | |
738 | 0 | bnum1 = cram_codec_to_id(c, &bnum2); |
739 | |
|
740 | 0 | for (;;) { |
741 | 0 | switch (bnum1) { |
742 | 0 | case -2: |
743 | 0 | break; |
744 | | |
745 | 0 | case -1: |
746 | 0 | core_used = 1; |
747 | 0 | break; |
748 | | |
749 | 0 | default: |
750 | 0 | for (j = 0; j < s->hdr->num_blocks; j++) { |
751 | 0 | if (s->block[j]->content_type == EXTERNAL && |
752 | 0 | s->block[j]->content_id == bnum1) { |
753 | 0 | block_used[j] = 1; |
754 | 0 | if (cram_uncompress_block(s->block[j])) { |
755 | 0 | free(block_used); |
756 | 0 | return -1; |
757 | 0 | } |
758 | 0 | } |
759 | 0 | } |
760 | 0 | break; |
761 | 0 | } |
762 | | |
763 | 0 | if (bnum2 == -2 || bnum1 == bnum2) |
764 | 0 | break; |
765 | | |
766 | 0 | bnum1 = bnum2; // 2nd pass |
767 | 0 | } |
768 | | |
769 | 0 | m = m->next; |
770 | 0 | } |
771 | 0 | } |
772 | 0 | } |
773 | | |
774 | | // We now know which blocks are in used, so repeat and find |
775 | | // which other data series need to be added. |
776 | 0 | for (i = 0; i < sizeof(i_to_id)/sizeof(*i_to_id); i++) { |
777 | 0 | int bnum1, bnum2, j; |
778 | 0 | cram_codec *c = hdr->codecs[i_to_id[i]]; |
779 | |
|
780 | 0 | if (!c) |
781 | 0 | continue; |
782 | | |
783 | 0 | bnum1 = cram_codec_to_id(c, &bnum2); |
784 | |
|
785 | 0 | for (;;) { |
786 | 0 | switch (bnum1) { |
787 | 0 | case -2: |
788 | 0 | break; |
789 | | |
790 | 0 | case -1: |
791 | 0 | if (core_used) { |
792 | | //printf(" + data series %08x:\n", 1<<i); |
793 | 0 | s->data_series |= 1<<i; |
794 | 0 | } |
795 | 0 | break; |
796 | | |
797 | 0 | default: |
798 | 0 | for (j = 0; j < s->hdr->num_blocks; j++) { |
799 | 0 | if (s->block[j]->content_type == EXTERNAL && |
800 | 0 | s->block[j]->content_id == bnum1) { |
801 | 0 | if (block_used[j]) { |
802 | | //printf(" + data series %08x:\n", 1<<i); |
803 | 0 | s->data_series |= 1<<i; |
804 | 0 | } |
805 | 0 | } |
806 | 0 | } |
807 | 0 | break; |
808 | 0 | } |
809 | | |
810 | 0 | if (bnum2 == -2 || bnum1 == bnum2) |
811 | 0 | break; |
812 | | |
813 | 0 | bnum1 = bnum2; // 2nd pass |
814 | 0 | } |
815 | 0 | } |
816 | | |
817 | | // Tags too |
818 | 0 | for (i = 0; i < CRAM_MAP_HASH; i++) { |
819 | 0 | int bnum1, bnum2, j; |
820 | 0 | cram_map *m = hdr->tag_encoding_map[i]; |
821 | |
|
822 | 0 | while (m) { |
823 | 0 | cram_codec *c = m->codec; |
824 | 0 | if (!c) { |
825 | 0 | m = m->next; |
826 | 0 | continue; |
827 | 0 | } |
828 | | |
829 | 0 | bnum1 = cram_codec_to_id(c, &bnum2); |
830 | |
|
831 | 0 | for (;;) { |
832 | 0 | switch (bnum1) { |
833 | 0 | case -2: |
834 | 0 | break; |
835 | | |
836 | 0 | case -1: |
837 | | //printf(" + data series %08x:\n", CRAM_aux); |
838 | 0 | s->data_series |= CRAM_aux; |
839 | 0 | break; |
840 | | |
841 | 0 | default: |
842 | 0 | for (j = 0; j < s->hdr->num_blocks; j++) { |
843 | 0 | if (s->block[j]->content_type == EXTERNAL && |
844 | 0 | s->block[j]->content_id == bnum1) { |
845 | 0 | if (block_used[j]) { |
846 | | //printf(" + data series %08x:\n", |
847 | | // CRAM_aux); |
848 | 0 | s->data_series |= CRAM_aux; |
849 | 0 | } |
850 | 0 | } |
851 | 0 | } |
852 | 0 | break; |
853 | 0 | } |
854 | | |
855 | 0 | if (bnum2 == -2 || bnum1 == bnum2) |
856 | 0 | break; |
857 | | |
858 | 0 | bnum1 = bnum2; // 2nd pass |
859 | 0 | } |
860 | | |
861 | 0 | m = m->next; |
862 | 0 | } |
863 | 0 | } |
864 | 0 | } while (orig_ds != s->data_series); |
865 | | |
866 | 0 | free(block_used); |
867 | 0 | return 0; |
868 | 0 | } |
869 | | |
870 | | /* |
871 | | * Checks whether an external block is used solely by a single data series. |
872 | | * Returns the codec type if so (EXTERNAL, BYTE_ARRAY_LEN, BYTE_ARRAY_STOP) |
873 | | * or 0 if not (E_NULL). |
874 | | */ |
875 | | static int cram_ds_unique(cram_block_compression_hdr *hdr, cram_codec *c, |
876 | 0 | int id) { |
877 | 0 | int i, n_id = 0; |
878 | 0 | enum cram_encoding e_type = 0; |
879 | |
|
880 | 0 | for (i = 0; i < DS_END; i++) { |
881 | 0 | cram_codec *c; |
882 | 0 | int bnum1, bnum2, old_n_id; |
883 | |
|
884 | 0 | if (!(c = hdr->codecs[i])) |
885 | 0 | continue; |
886 | | |
887 | 0 | bnum1 = cram_codec_to_id(c, &bnum2); |
888 | |
|
889 | 0 | old_n_id = n_id; |
890 | 0 | if (bnum1 == id) { |
891 | 0 | n_id++; |
892 | 0 | e_type = c->codec; |
893 | 0 | } |
894 | 0 | if (bnum2 == id) { |
895 | 0 | n_id++; |
896 | 0 | e_type = c->codec; |
897 | 0 | } |
898 | |
|
899 | 0 | if (n_id == old_n_id+2) |
900 | 0 | n_id--; // len/val in same place counts once only. |
901 | 0 | } |
902 | |
|
903 | 0 | return n_id == 1 ? e_type : 0; |
904 | 0 | } |
905 | | |
906 | | /* |
907 | | * Attempts to estimate the size of some blocks so we can preallocate them |
908 | | * before decoding. Although decoding will automatically grow the blocks, |
909 | | * it is typically more efficient to preallocate. |
910 | | */ |
911 | | void cram_decode_estimate_sizes(cram_block_compression_hdr *hdr, cram_slice *s, |
912 | | int *qual_size, int *name_size, |
913 | 471 | int *q_id) { |
914 | 471 | int bnum1, bnum2; |
915 | 471 | cram_codec *cd; |
916 | | |
917 | 471 | *qual_size = 0; |
918 | 471 | *name_size = 0; |
919 | | |
920 | | /* Qual */ |
921 | 471 | cd = hdr->codecs[DS_QS]; |
922 | 471 | if (cd == NULL) return; |
923 | 0 | bnum1 = cram_codec_to_id(cd, &bnum2); |
924 | 0 | if (bnum1 < 0 && bnum2 >= 0) bnum1 = bnum2; |
925 | 0 | if (cram_ds_unique(hdr, cd, bnum1)) { |
926 | 0 | cram_block *b = cram_get_block_by_id(s, bnum1); |
927 | 0 | if (b) *qual_size = b->uncomp_size; |
928 | 0 | if (q_id && cd->codec == E_EXTERNAL) |
929 | 0 | *q_id = bnum1; |
930 | 0 | } |
931 | | |
932 | | /* Name */ |
933 | 0 | cd = hdr->codecs[DS_RN]; |
934 | 0 | if (cd == NULL) return; |
935 | 0 | bnum1 = cram_codec_to_id(cd, &bnum2); |
936 | 0 | if (bnum1 < 0 && bnum2 >= 0) bnum1 = bnum2; |
937 | 0 | if (cram_ds_unique(hdr, cd, bnum1)) { |
938 | 0 | cram_block *b = cram_get_block_by_id(s, bnum1); |
939 | 0 | if (b) *name_size = b->uncomp_size; |
940 | 0 | } |
941 | 0 | } |
942 | | |
943 | | |
944 | | /* ---------------------------------------------------------------------- |
945 | | * CRAM slices |
946 | | */ |
947 | | |
948 | | /* |
949 | | * Decodes a CRAM (un)mapped slice header block. |
950 | | * Returns slice header ptr on success |
951 | | * NULL on failure |
952 | | */ |
953 | 633 | cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) { |
954 | 633 | cram_block_slice_hdr *hdr; |
955 | 633 | unsigned char *cp; |
956 | 633 | unsigned char *cp_end; |
957 | 633 | int i, err = 0; |
958 | | |
959 | 633 | if (b->method != RAW) { |
960 | | /* Spec. says slice header should be RAW, but we can future-proof |
961 | | by trying to decode it if it isn't. */ |
962 | 6 | if (cram_uncompress_block(b) < 0) |
963 | 3 | return NULL; |
964 | 6 | } |
965 | 630 | cp = (unsigned char *)BLOCK_DATA(b); |
966 | 630 | cp_end = cp + b->uncomp_size; |
967 | | |
968 | 630 | if (b->content_type != MAPPED_SLICE && |
969 | 630 | b->content_type != UNMAPPED_SLICE) |
970 | 0 | return NULL; |
971 | | |
972 | 630 | if (!(hdr = calloc(1, sizeof(*hdr)))) |
973 | 0 | return NULL; |
974 | | |
975 | 630 | hdr->content_type = b->content_type; |
976 | | |
977 | 630 | if (b->content_type == MAPPED_SLICE) { |
978 | 603 | hdr->ref_seq_id = fd->vv.varint_get32s((char **)&cp, (char *)cp_end, &err); |
979 | 603 | if (CRAM_MAJOR_VERS(fd->version) >= 4) { |
980 | 0 | hdr->ref_seq_start = fd->vv.varint_get64((char **)&cp, (char *)cp_end, &err); |
981 | 0 | hdr->ref_seq_span = fd->vv.varint_get64((char **)&cp, (char *)cp_end, &err); |
982 | 603 | } else { |
983 | 603 | hdr->ref_seq_start = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err); |
984 | 603 | hdr->ref_seq_span = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err); |
985 | 603 | } |
986 | 603 | if (hdr->ref_seq_start < 0 || hdr->ref_seq_span < 0) { |
987 | 45 | free(hdr); |
988 | 45 | hts_log_error("Negative values not permitted for header " |
989 | 45 | "sequence start or span fields"); |
990 | 45 | return NULL; |
991 | 45 | } |
992 | 603 | } |
993 | 585 | hdr->num_records = fd->vv.varint_get32((char **)&cp, (char *) cp_end, &err); |
994 | 585 | hdr->record_counter = 0; |
995 | 585 | if (CRAM_MAJOR_VERS(fd->version) == 2) { |
996 | 0 | hdr->record_counter = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err); |
997 | 585 | } else if (CRAM_MAJOR_VERS(fd->version) >= 3) { |
998 | 0 | hdr->record_counter = fd->vv.varint_get64((char **)&cp, (char *)cp_end, &err); |
999 | 0 | } |
1000 | 585 | hdr->num_blocks = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err); |
1001 | 585 | hdr->num_content_ids = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err); |
1002 | 585 | if (hdr->num_content_ids < 1 || |
1003 | 585 | hdr->num_content_ids >= 10000) { |
1004 | | // Slice must have at least one data block, and there is no need |
1005 | | // for more than 2 per possible aux-tag plus ancillary. |
1006 | 15 | free(hdr); |
1007 | 15 | return NULL; |
1008 | 15 | } |
1009 | 570 | hdr->block_content_ids = malloc(hdr->num_content_ids * sizeof(int32_t)); |
1010 | 570 | if (!hdr->block_content_ids) { |
1011 | 0 | free(hdr); |
1012 | 0 | return NULL; |
1013 | 0 | } |
1014 | | |
1015 | 9.50k | for (i = 0; i < hdr->num_content_ids; i++) |
1016 | 8.93k | hdr->block_content_ids[i] = fd->vv.varint_get32((char **)&cp, |
1017 | 8.93k | (char *)cp_end, |
1018 | 8.93k | &err); |
1019 | 570 | if (err) { |
1020 | 6 | free(hdr->block_content_ids); |
1021 | 6 | free(hdr); |
1022 | 6 | return NULL; |
1023 | 6 | } |
1024 | | |
1025 | 564 | if (b->content_type == MAPPED_SLICE) |
1026 | 540 | hdr->ref_base_id = fd->vv.varint_get32((char **)&cp, (char *) cp_end, &err); |
1027 | | |
1028 | 564 | if (CRAM_MAJOR_VERS(fd->version) != 1) { |
1029 | 0 | if (cp_end - cp < 16) { |
1030 | 0 | free(hdr->block_content_ids); |
1031 | 0 | free(hdr); |
1032 | 0 | return NULL; |
1033 | 0 | } |
1034 | 0 | memcpy(hdr->md5, cp, 16); |
1035 | 564 | } else { |
1036 | 564 | memset(hdr->md5, 0, 16); |
1037 | 564 | } |
1038 | | |
1039 | 564 | if (!err) |
1040 | 555 | return hdr; |
1041 | | |
1042 | 9 | free(hdr->block_content_ids); |
1043 | 9 | free(hdr); |
1044 | 9 | return NULL; |
1045 | 564 | } |
1046 | | |
1047 | | |
1048 | | #if 0 |
1049 | | /* Returns the number of bits set in val; it the highest bit used */ |
1050 | | static int nbits(int v) { |
1051 | | static const int MultiplyDeBruijnBitPosition[32] = { |
1052 | | 1, 10, 2, 11, 14, 22, 3, 30, 12, 15, 17, 19, 23, 26, 4, 31, |
1053 | | 9, 13, 21, 29, 16, 18, 25, 8, 20, 28, 24, 7, 27, 6, 5, 32 |
1054 | | }; |
1055 | | |
1056 | | v |= v >> 1; // first up to set all bits 1 after the first 1 */ |
1057 | | v |= v >> 2; |
1058 | | v |= v >> 4; |
1059 | | v |= v >> 8; |
1060 | | v |= v >> 16; |
1061 | | |
1062 | | // DeBruijn magic to find top bit |
1063 | | return MultiplyDeBruijnBitPosition[(uint32_t)(v * 0x07C4ACDDU) >> 27]; |
1064 | | } |
1065 | | #endif |
1066 | | |
1067 | | #if 0 |
1068 | | static int sort_freqs(const void *vp1, const void *vp2) { |
1069 | | const int i1 = *(const int *)vp1; |
1070 | | const int i2 = *(const int *)vp2; |
1071 | | return i1-i2; |
1072 | | } |
1073 | | #endif |
1074 | | |
1075 | | /* ---------------------------------------------------------------------- |
1076 | | * Primary CRAM sequence decoder |
1077 | | */ |
1078 | | |
1079 | 0 | static inline int add_md_char(cram_slice *s, int decode_md, char c, int32_t *md_dist) { |
1080 | 0 | if (decode_md) { |
1081 | 0 | BLOCK_APPEND_UINT(s->aux_blk, *md_dist); |
1082 | 0 | BLOCK_APPEND_CHAR(s->aux_blk, c); |
1083 | 0 | *md_dist = 0; |
1084 | 0 | } |
1085 | 0 | return 0; |
1086 | | |
1087 | 0 | block_err: |
1088 | 0 | return -1; |
1089 | 0 | } |
1090 | | |
1091 | | /* |
1092 | | * Internal part of cram_decode_slice(). |
1093 | | * Generates the sequence, quality and cigar components. |
1094 | | */ |
1095 | | static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, |
1096 | | cram_block *blk, cram_record *cr, sam_hdr_t *sh, |
1097 | | int cf, char *seq, char *qual, |
1098 | 0 | int has_MD, int has_NM) { |
1099 | 0 | int prev_pos = 0, f, r = 0, out_sz = 1; |
1100 | 0 | int seq_pos = 1; |
1101 | 0 | int cig_len = 0; |
1102 | 0 | int64_t ref_pos = cr->apos; |
1103 | 0 | int32_t fn, i32; |
1104 | 0 | enum cigar_op cig_op = BAM_CMATCH; |
1105 | 0 | uint32_t *cigar = s->cigar; |
1106 | 0 | uint32_t ncigar = s->ncigar; |
1107 | 0 | uint32_t cigar_alloc = s->cigar_alloc; |
1108 | 0 | uint32_t nm = 0; |
1109 | 0 | int32_t md_dist = 0; |
1110 | 0 | int orig_aux = 0; |
1111 | | // CRAM < 4.0 decode_md is off/on |
1112 | | // CRAM >= 4.0 decode_md is auto/on (auto=on if MD* present, off otherwise) |
1113 | 0 | int do_md = CRAM_MAJOR_VERS(fd->version) >= 4 |
1114 | 0 | ? (s->decode_md > 0) |
1115 | 0 | : (s->decode_md != 0); |
1116 | 0 | int decode_md = s->ref && cr->ref_id >= 0 && ((do_md && !has_MD) || has_MD < 0); |
1117 | 0 | int decode_nm = s->ref && cr->ref_id >= 0 && ((do_md && !has_NM) || has_NM < 0); |
1118 | 0 | uint32_t ds = s->data_series; |
1119 | 0 | sam_hrecs_t *bfd = sh->hrecs; |
1120 | |
|
1121 | 0 | cram_codec **codecs = c->comp_hdr->codecs; |
1122 | |
|
1123 | 0 | if ((ds & CRAM_QS) && !(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) { |
1124 | 0 | memset(qual, 255, cr->len); |
1125 | 0 | } |
1126 | |
|
1127 | 0 | if (cr->cram_flags & CRAM_FLAG_NO_SEQ) |
1128 | 0 | decode_md = decode_nm = 0; |
1129 | |
|
1130 | 0 | if (decode_md) { |
1131 | 0 | orig_aux = BLOCK_SIZE(s->aux_blk); |
1132 | 0 | if (has_MD == 0) |
1133 | 0 | BLOCK_APPEND(s->aux_blk, "MDZ", 3); |
1134 | 0 | } |
1135 | | |
1136 | 0 | if (ds & CRAM_FN) { |
1137 | 0 | if (!codecs[DS_FN]) return -1; |
1138 | 0 | r |= codecs[DS_FN]->decode(s,codecs[DS_FN], |
1139 | 0 | blk, (char *)&fn, &out_sz); |
1140 | 0 | if (r) return r; |
1141 | 0 | } else { |
1142 | 0 | fn = 0; |
1143 | 0 | } |
1144 | | |
1145 | 0 | ref_pos--; // count from 0 |
1146 | 0 | cr->cigar = ncigar; |
1147 | |
|
1148 | 0 | if (!(ds & (CRAM_FC | CRAM_FP))) |
1149 | 0 | goto skip_cigar; |
1150 | | |
1151 | 0 | if (fn) { |
1152 | 0 | if ((ds & CRAM_FC) && !codecs[DS_FC]) |
1153 | 0 | return -1; |
1154 | 0 | if ((ds & CRAM_FP) && !codecs[DS_FP]) |
1155 | 0 | return -1; |
1156 | 0 | } |
1157 | | |
1158 | 0 | for (f = 0; f < fn; f++) { |
1159 | 0 | int32_t pos = 0; |
1160 | 0 | char op; |
1161 | |
|
1162 | 0 | if (ncigar+2 >= cigar_alloc) { |
1163 | 0 | cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024; |
1164 | 0 | if (!(cigar = realloc(s->cigar, cigar_alloc * sizeof(*cigar)))) |
1165 | 0 | return -1; |
1166 | 0 | s->cigar = cigar; |
1167 | 0 | } |
1168 | | |
1169 | 0 | if (ds & CRAM_FC) { |
1170 | 0 | r |= codecs[DS_FC]->decode(s, |
1171 | 0 | codecs[DS_FC], |
1172 | 0 | blk, |
1173 | 0 | &op, &out_sz); |
1174 | 0 | if (r) return r; |
1175 | 0 | } |
1176 | | |
1177 | 0 | if (!(ds & CRAM_FP)) |
1178 | 0 | continue; |
1179 | | |
1180 | 0 | r |= codecs[DS_FP]->decode(s, |
1181 | 0 | codecs[DS_FP], |
1182 | 0 | blk, |
1183 | 0 | (char *)&pos, &out_sz); |
1184 | 0 | if (r) return r; |
1185 | 0 | pos += prev_pos; |
1186 | |
|
1187 | 0 | if (pos <= 0) { |
1188 | 0 | hts_log_error("Feature position %d before start of read", pos); |
1189 | 0 | return -1; |
1190 | 0 | } |
1191 | | |
1192 | 0 | if (pos > seq_pos) { |
1193 | 0 | if (pos > cr->len+1) |
1194 | 0 | return -1; |
1195 | | |
1196 | 0 | if (s->ref && cr->ref_id >= 0) { |
1197 | 0 | if (ref_pos + pos - seq_pos > bfd->ref[cr->ref_id].len) { |
1198 | 0 | static int whinged = 0; |
1199 | 0 | int rlen; |
1200 | 0 | if (!whinged) |
1201 | 0 | hts_log_warning("Ref pos outside of ref sequence boundary"); |
1202 | 0 | whinged = 1; |
1203 | 0 | rlen = bfd->ref[cr->ref_id].len - ref_pos; |
1204 | | // May miss MD/NM cases where both seq/ref are N, but this is a |
1205 | | // malformed cram file anyway. |
1206 | 0 | if (rlen > 0) { |
1207 | 0 | if (ref_pos + rlen > s->ref_end) |
1208 | 0 | goto beyond_slice; |
1209 | | |
1210 | 0 | memcpy(&seq[seq_pos-1], |
1211 | 0 | &s->ref[ref_pos - s->ref_start +1], rlen); |
1212 | 0 | if ((pos - seq_pos) - rlen > 0) |
1213 | 0 | memset(&seq[seq_pos-1+rlen], 'N', |
1214 | 0 | (pos - seq_pos) - rlen); |
1215 | 0 | } else { |
1216 | 0 | memset(&seq[seq_pos-1], 'N', cr->len - seq_pos + 1); |
1217 | 0 | } |
1218 | 0 | if (md_dist >= 0) |
1219 | 0 | md_dist += pos - seq_pos; |
1220 | 0 | } else { |
1221 | | // 'N' in both ref and seq is also mismatch for NM/MD |
1222 | 0 | if (ref_pos + pos-seq_pos > s->ref_end) |
1223 | 0 | goto beyond_slice; |
1224 | | |
1225 | 0 | const char *refp = s->ref + ref_pos - s->ref_start + 1; |
1226 | 0 | const int frag_len = pos - seq_pos; |
1227 | 0 | int do_cpy = 1; |
1228 | 0 | if (decode_md || decode_nm) { |
1229 | 0 | char *N = memchr(refp, 'N', frag_len); |
1230 | 0 | if (N) { |
1231 | 0 | int i; |
1232 | 0 | for (i = 0; i < frag_len; i++) { |
1233 | 0 | char base = refp[i]; |
1234 | 0 | if (base == 'N') { |
1235 | 0 | if (add_md_char(s, decode_md, |
1236 | 0 | 'N', &md_dist) < 0) |
1237 | 0 | return -1; |
1238 | 0 | nm++; |
1239 | 0 | } else { |
1240 | 0 | md_dist++; |
1241 | 0 | } |
1242 | 0 | seq[seq_pos-1+i] = base; |
1243 | 0 | } |
1244 | 0 | do_cpy = 0; |
1245 | 0 | } else { |
1246 | 0 | md_dist += frag_len; |
1247 | 0 | } |
1248 | 0 | } |
1249 | 0 | if (do_cpy) |
1250 | 0 | memcpy(&seq[seq_pos-1], refp, frag_len); |
1251 | 0 | } |
1252 | 0 | } |
1253 | | #ifdef USE_X |
1254 | | if (cig_len && cig_op != BAM_CBASE_MATCH) { |
1255 | | cigar[ncigar++] = (cig_len<<4) + cig_op; |
1256 | | cig_len = 0; |
1257 | | } |
1258 | | cig_op = BAM_CBASE_MATCH; |
1259 | | #else |
1260 | 0 | if (cig_len && cig_op != BAM_CMATCH) { |
1261 | 0 | cigar[ncigar++] = (cig_len<<4) + cig_op; |
1262 | 0 | cig_len = 0; |
1263 | 0 | } |
1264 | 0 | cig_op = BAM_CMATCH; |
1265 | 0 | #endif |
1266 | 0 | cig_len += pos - seq_pos; |
1267 | 0 | ref_pos += pos - seq_pos; |
1268 | 0 | seq_pos = pos; |
1269 | 0 | } |
1270 | | |
1271 | 0 | prev_pos = pos; |
1272 | |
|
1273 | 0 | if (!(ds & CRAM_FC)) |
1274 | 0 | goto skip_cigar; |
1275 | | |
1276 | 0 | switch(op) { |
1277 | 0 | case 'S': { // soft clip: IN |
1278 | 0 | int32_t out_sz2 = 1; |
1279 | 0 | int have_sc = 0; |
1280 | |
|
1281 | 0 | if (cig_len) { |
1282 | 0 | cigar[ncigar++] = (cig_len<<4) + cig_op; |
1283 | 0 | cig_len = 0; |
1284 | 0 | } |
1285 | 0 | switch (CRAM_MAJOR_VERS(fd->version)) { |
1286 | 0 | case 1: |
1287 | 0 | if (ds & CRAM_IN) { |
1288 | 0 | r |= codecs[DS_IN] |
1289 | 0 | ? codecs[DS_IN]->decode(s, codecs[DS_IN], |
1290 | 0 | blk, |
1291 | 0 | cr->len ? &seq[pos-1] : NULL, |
1292 | 0 | &out_sz2) |
1293 | 0 | : (seq[pos-1] = 'N', out_sz2 = 1, 0); |
1294 | 0 | have_sc = 1; |
1295 | 0 | } |
1296 | 0 | break; |
1297 | 0 | case 2: |
1298 | 0 | default: |
1299 | 0 | if (ds & CRAM_SC) { |
1300 | 0 | r |= codecs[DS_SC] |
1301 | 0 | ? codecs[DS_SC]->decode(s, codecs[DS_SC], |
1302 | 0 | blk, |
1303 | 0 | cr->len ? &seq[pos-1] : NULL, |
1304 | 0 | &out_sz2) |
1305 | 0 | : (seq[pos-1] = 'N', out_sz2 = 1, 0); |
1306 | 0 | have_sc = 1; |
1307 | 0 | } |
1308 | 0 | break; |
1309 | | |
1310 | | //default: |
1311 | | // r |= codecs[DS_BB] |
1312 | | // ? codecs[DS_BB]->decode(s, codecs[DS_BB], |
1313 | | // blk, &seq[pos-1], &out_sz2) |
1314 | | // : (seq[pos-1] = 'N', out_sz2 = 1, 0); |
1315 | 0 | } |
1316 | 0 | if (have_sc) { |
1317 | 0 | if (r) return r; |
1318 | 0 | cigar[ncigar++] = (out_sz2<<4) + BAM_CSOFT_CLIP; |
1319 | 0 | cig_op = BAM_CSOFT_CLIP; |
1320 | 0 | seq_pos += out_sz2; |
1321 | 0 | } |
1322 | 0 | break; |
1323 | 0 | } |
1324 | | |
1325 | 0 | case 'X': { // Substitution; BS |
1326 | 0 | unsigned char base; |
1327 | | #ifdef USE_X |
1328 | | if (cig_len && cig_op != BAM_CBASE_MISMATCH) { |
1329 | | cigar[ncigar++] = (cig_len<<4) + cig_op; |
1330 | | cig_len = 0; |
1331 | | } |
1332 | | if (ds & CRAM_BS) { |
1333 | | if (!codecs[DS_BS]) return -1; |
1334 | | r |= codecs[DS_BS]->decode(s, codecs[DS_BS], blk, |
1335 | | (char *)&base, &out_sz); |
1336 | | if (pos-1 < cr->len) |
1337 | | seq[pos-1] = 'N'; // FIXME look up BS=base value |
1338 | | } |
1339 | | cig_op = BAM_CBASE_MISMATCH; |
1340 | | #else |
1341 | 0 | int ref_base; |
1342 | 0 | if (cig_len && cig_op != BAM_CMATCH) { |
1343 | 0 | cigar[ncigar++] = (cig_len<<4) + cig_op; |
1344 | 0 | cig_len = 0; |
1345 | 0 | } |
1346 | 0 | if (ds & CRAM_BS) { |
1347 | 0 | if (!codecs[DS_BS]) return -1; |
1348 | 0 | r |= codecs[DS_BS]->decode(s, codecs[DS_BS], blk, |
1349 | 0 | (char *)&base, &out_sz); |
1350 | 0 | if (r) return -1; |
1351 | 0 | if (cr->ref_id < 0 || ref_pos >= bfd->ref[cr->ref_id].len || !s->ref) { |
1352 | 0 | if (pos-1 < cr->len) |
1353 | 0 | seq[pos-1] = c->comp_hdr-> |
1354 | 0 | substitution_matrix[fd->L1['N']][base]; |
1355 | 0 | if (decode_md || decode_nm) { |
1356 | 0 | if (md_dist >= 0 && decode_md) |
1357 | 0 | BLOCK_APPEND_UINT(s->aux_blk, md_dist); |
1358 | 0 | md_dist = -1; |
1359 | 0 | nm--; |
1360 | 0 | } |
1361 | 0 | } else { |
1362 | 0 | unsigned char ref_call = ref_pos < s->ref_end |
1363 | 0 | ? (uc)s->ref[ref_pos - s->ref_start +1] |
1364 | 0 | : 'N'; |
1365 | 0 | ref_base = fd->L1[ref_call]; |
1366 | 0 | if (pos-1 < cr->len) |
1367 | 0 | seq[pos-1] = c->comp_hdr-> |
1368 | 0 | substitution_matrix[ref_base][base]; |
1369 | 0 | if (add_md_char(s, decode_md, ref_call, &md_dist) < 0) |
1370 | 0 | return -1; |
1371 | 0 | } |
1372 | 0 | } |
1373 | 0 | cig_op = BAM_CMATCH; |
1374 | 0 | #endif |
1375 | 0 | nm++; |
1376 | 0 | cig_len++; |
1377 | 0 | seq_pos++; |
1378 | 0 | ref_pos++; |
1379 | 0 | break; |
1380 | 0 | } |
1381 | | |
1382 | 0 | case 'D': { // Deletion; DL |
1383 | 0 | if (cig_len && cig_op != BAM_CDEL) { |
1384 | 0 | cigar[ncigar++] = (cig_len<<4) + cig_op; |
1385 | 0 | cig_len = 0; |
1386 | 0 | } |
1387 | 0 | if (ds & CRAM_DL) { |
1388 | 0 | if (!codecs[DS_DL]) return -1; |
1389 | 0 | r |= codecs[DS_DL]->decode(s, codecs[DS_DL], blk, |
1390 | 0 | (char *)&i32, &out_sz); |
1391 | 0 | if (r) return r; |
1392 | 0 | if (decode_md || decode_nm) { |
1393 | 0 | if (ref_pos + i32 > s->ref_end) |
1394 | 0 | goto beyond_slice; |
1395 | 0 | if (md_dist >= 0 && decode_md) |
1396 | 0 | BLOCK_APPEND_UINT(s->aux_blk, md_dist); |
1397 | 0 | if (ref_pos + i32 <= bfd->ref[cr->ref_id].len) { |
1398 | 0 | if (decode_md) { |
1399 | 0 | BLOCK_APPEND_CHAR(s->aux_blk, '^'); |
1400 | 0 | BLOCK_APPEND(s->aux_blk, |
1401 | 0 | &s->ref[ref_pos - s->ref_start +1], |
1402 | 0 | i32); |
1403 | 0 | md_dist = 0; |
1404 | 0 | } |
1405 | 0 | nm += i32; |
1406 | 0 | } else { |
1407 | 0 | uint32_t dlen; |
1408 | 0 | if (bfd->ref[cr->ref_id].len >= ref_pos) { |
1409 | 0 | if (decode_md) { |
1410 | 0 | BLOCK_APPEND_CHAR(s->aux_blk, '^'); |
1411 | 0 | BLOCK_APPEND(s->aux_blk, |
1412 | 0 | &s->ref[ref_pos - s->ref_start+1], |
1413 | 0 | bfd->ref[cr->ref_id].len-ref_pos); |
1414 | 0 | BLOCK_APPEND_UINT(s->aux_blk, 0); |
1415 | 0 | } |
1416 | 0 | dlen = i32 - (bfd->ref[cr->ref_id].len - ref_pos); |
1417 | 0 | nm += i32 - dlen; |
1418 | 0 | } else { |
1419 | 0 | dlen = i32; |
1420 | 0 | } |
1421 | | |
1422 | 0 | md_dist = -1; |
1423 | 0 | } |
1424 | 0 | } |
1425 | 0 | cig_op = BAM_CDEL; |
1426 | 0 | cig_len += i32; |
1427 | 0 | ref_pos += i32; |
1428 | | //printf(" %d: DL = %d (ret %d)\n", f, i32, r); |
1429 | 0 | } |
1430 | 0 | break; |
1431 | 0 | } |
1432 | | |
1433 | 0 | case 'I': { // Insertion (several bases); IN |
1434 | 0 | int32_t out_sz2 = 1; |
1435 | |
|
1436 | 0 | if (cig_len && cig_op != BAM_CINS) { |
1437 | 0 | cigar[ncigar++] = (cig_len<<4) + cig_op; |
1438 | 0 | cig_len = 0; |
1439 | 0 | } |
1440 | |
|
1441 | 0 | if (ds & CRAM_IN) { |
1442 | 0 | if (!codecs[DS_IN]) return -1; |
1443 | 0 | r |= codecs[DS_IN]->decode(s, codecs[DS_IN], blk, |
1444 | 0 | cr->len ? &seq[pos-1] : NULL, |
1445 | 0 | &out_sz2); |
1446 | 0 | if (r) return r; |
1447 | 0 | cig_op = BAM_CINS; |
1448 | 0 | cig_len += out_sz2; |
1449 | 0 | seq_pos += out_sz2; |
1450 | 0 | nm += out_sz2; |
1451 | | //printf(" %d: IN(I) = %.*s (ret %d, out_sz %d)\n", f, out_sz2, dat, r, out_sz2); |
1452 | 0 | } |
1453 | 0 | break; |
1454 | 0 | } |
1455 | | |
1456 | 0 | case 'i': { // Insertion (single base); BA |
1457 | 0 | if (cig_len && cig_op != BAM_CINS) { |
1458 | 0 | cigar[ncigar++] = (cig_len<<4) + cig_op; |
1459 | 0 | cig_len = 0; |
1460 | 0 | } |
1461 | 0 | if (ds & CRAM_BA) { |
1462 | 0 | if (!codecs[DS_BA]) return -1; |
1463 | 0 | r |= codecs[DS_BA]->decode(s, codecs[DS_BA], blk, |
1464 | 0 | cr->len ? &seq[pos-1] : NULL, |
1465 | 0 | &out_sz); |
1466 | 0 | if (r) return r; |
1467 | 0 | } |
1468 | 0 | cig_op = BAM_CINS; |
1469 | 0 | cig_len++; |
1470 | 0 | seq_pos++; |
1471 | 0 | nm++; |
1472 | 0 | break; |
1473 | 0 | } |
1474 | | |
1475 | 0 | case 'b': { // Several bases |
1476 | 0 | int32_t len = 1; |
1477 | |
|
1478 | 0 | if (cig_len && cig_op != BAM_CMATCH) { |
1479 | 0 | cigar[ncigar++] = (cig_len<<4) + cig_op; |
1480 | 0 | cig_len = 0; |
1481 | 0 | } |
1482 | |
|
1483 | 0 | if (ds & CRAM_BB) { |
1484 | 0 | if (!codecs[DS_BB]) return -1; |
1485 | 0 | r |= codecs[DS_BB]->decode(s, codecs[DS_BB], blk, |
1486 | 0 | cr->len ? &seq[pos-1] : NULL, |
1487 | 0 | &len); |
1488 | 0 | if (r) return r; |
1489 | | |
1490 | 0 | if (decode_md || decode_nm) { |
1491 | 0 | int x; |
1492 | 0 | if (md_dist >= 0 && decode_md) |
1493 | 0 | BLOCK_APPEND_UINT(s->aux_blk, md_dist); |
1494 | | |
1495 | 0 | for (x = 0; x < len; x++) { |
1496 | 0 | if (x && decode_md) |
1497 | 0 | BLOCK_APPEND_UINT(s->aux_blk, 0); |
1498 | 0 | if (ref_pos+x >= bfd->ref[cr->ref_id].len || !s->ref) { |
1499 | 0 | md_dist = -1; |
1500 | 0 | break; |
1501 | 0 | } else { |
1502 | 0 | if (decode_md) { |
1503 | 0 | if (ref_pos + x > s->ref_end) |
1504 | 0 | goto beyond_slice; |
1505 | 0 | char r = s->ref[ref_pos+x-s->ref_start +1]; |
1506 | 0 | BLOCK_APPEND_CHAR(s->aux_blk, r); |
1507 | 0 | } |
1508 | 0 | } |
1509 | 0 | } |
1510 | | |
1511 | 0 | nm += x; |
1512 | 0 | md_dist = 0; |
1513 | 0 | } |
1514 | 0 | } |
1515 | | |
1516 | 0 | cig_op = BAM_CMATCH; |
1517 | |
|
1518 | 0 | cig_len+=len; |
1519 | 0 | seq_pos+=len; |
1520 | 0 | ref_pos+=len; |
1521 | | //prev_pos+=len; |
1522 | 0 | break; |
1523 | 0 | } |
1524 | | |
1525 | 0 | case 'q': { // Several quality values |
1526 | 0 | int32_t len = 1; |
1527 | |
|
1528 | 0 | if (cig_len && cig_op != BAM_CMATCH) { |
1529 | 0 | cigar[ncigar++] = (cig_len<<4) + cig_op; |
1530 | 0 | cig_len = 0; |
1531 | 0 | } |
1532 | |
|
1533 | 0 | if (ds & CRAM_QQ) { |
1534 | 0 | if (!codecs[DS_QQ]) return -1; |
1535 | 0 | if ((ds & CRAM_QS) && !(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES) |
1536 | 0 | && (unsigned char)*qual == 255) |
1537 | 0 | memset(qual, 30, cr->len); // ? |
1538 | 0 | r |= codecs[DS_QQ]->decode(s, codecs[DS_QQ], blk, |
1539 | 0 | (char *)&qual[pos-1], &len); |
1540 | 0 | if (r) return r; |
1541 | 0 | } |
1542 | | |
1543 | 0 | cig_op = BAM_CMATCH; |
1544 | | |
1545 | | //prev_pos+=len; |
1546 | 0 | break; |
1547 | 0 | } |
1548 | | |
1549 | 0 | case 'B': { // Read base; BA, QS |
1550 | | #ifdef USE_X |
1551 | | if (cig_len && cig_op != BAM_CBASE_MISMATCH) { |
1552 | | cigar[ncigar++] = (cig_len<<4) + cig_op; |
1553 | | cig_len = 0; |
1554 | | } |
1555 | | #else |
1556 | 0 | if (cig_len && cig_op != BAM_CMATCH) { |
1557 | 0 | cigar[ncigar++] = (cig_len<<4) + cig_op; |
1558 | 0 | cig_len = 0; |
1559 | 0 | } |
1560 | 0 | #endif |
1561 | 0 | if (ds & CRAM_BA) { |
1562 | 0 | if (!codecs[DS_BA]) return -1; |
1563 | 0 | r |= codecs[DS_BA]->decode(s, codecs[DS_BA], blk, |
1564 | 0 | cr->len ? &seq[pos-1] : NULL, |
1565 | 0 | &out_sz); |
1566 | |
|
1567 | 0 | if (decode_md || decode_nm) { |
1568 | 0 | if (md_dist >= 0 && decode_md) |
1569 | 0 | BLOCK_APPEND_UINT(s->aux_blk, md_dist); |
1570 | 0 | if (ref_pos >= bfd->ref[cr->ref_id].len || !s->ref) { |
1571 | 0 | md_dist = -1; |
1572 | 0 | } else { |
1573 | 0 | if (decode_md) { |
1574 | 0 | if (ref_pos > s->ref_end) |
1575 | 0 | goto beyond_slice; |
1576 | 0 | BLOCK_APPEND_CHAR(s->aux_blk, |
1577 | 0 | s->ref[ref_pos-s->ref_start +1]); |
1578 | 0 | } |
1579 | 0 | nm++; |
1580 | 0 | md_dist = 0; |
1581 | 0 | } |
1582 | 0 | } |
1583 | 0 | } |
1584 | 0 | if (ds & CRAM_QS) { |
1585 | 0 | if (!codecs[DS_QS]) return -1; |
1586 | 0 | if (!(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES) |
1587 | 0 | && (unsigned char)*qual == 255) |
1588 | 0 | memset(qual, 30, cr->len); // ASCII ?. Same as htsjdk |
1589 | 0 | r |= codecs[DS_QS]->decode(s, codecs[DS_QS], blk, |
1590 | 0 | (char *)&qual[pos-1], &out_sz); |
1591 | 0 | } |
1592 | | #ifdef USE_X |
1593 | | cig_op = BAM_CBASE_MISMATCH; |
1594 | | #else |
1595 | 0 | cig_op = BAM_CMATCH; |
1596 | 0 | #endif |
1597 | 0 | cig_len++; |
1598 | 0 | seq_pos++; |
1599 | 0 | ref_pos++; |
1600 | | //printf(" %d: BA/QS(B) = %c/%d (ret %d)\n", f, i32, qc, r); |
1601 | 0 | break; |
1602 | 0 | } |
1603 | | |
1604 | 0 | case 'Q': { // Quality score; QS |
1605 | 0 | if (ds & CRAM_QS) { |
1606 | 0 | if (!codecs[DS_QS]) return -1; |
1607 | 0 | if (!(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES) && |
1608 | 0 | (unsigned char)*qual == 255) |
1609 | 0 | memset(qual, 30, cr->len); // ? |
1610 | 0 | r |= codecs[DS_QS]->decode(s, codecs[DS_QS], blk, |
1611 | 0 | (char *)&qual[pos-1], &out_sz); |
1612 | | //printf(" %d: QS = %d (ret %d)\n", f, qc, r); |
1613 | 0 | } |
1614 | 0 | break; |
1615 | 0 | } |
1616 | | |
1617 | 0 | case 'H': { // hard clip; HC |
1618 | 0 | if (cig_len && cig_op != BAM_CHARD_CLIP) { |
1619 | 0 | cigar[ncigar++] = (cig_len<<4) + cig_op; |
1620 | 0 | cig_len = 0; |
1621 | 0 | } |
1622 | 0 | if (ds & CRAM_HC) { |
1623 | 0 | if (!codecs[DS_HC]) return -1; |
1624 | 0 | r |= codecs[DS_HC]->decode(s, codecs[DS_HC], blk, |
1625 | 0 | (char *)&i32, &out_sz); |
1626 | 0 | if (r) return r; |
1627 | 0 | cig_op = BAM_CHARD_CLIP; |
1628 | 0 | cig_len += i32; |
1629 | 0 | } |
1630 | 0 | break; |
1631 | 0 | } |
1632 | | |
1633 | 0 | case 'P': { // padding; PD |
1634 | 0 | if (cig_len && cig_op != BAM_CPAD) { |
1635 | 0 | cigar[ncigar++] = (cig_len<<4) + cig_op; |
1636 | 0 | cig_len = 0; |
1637 | 0 | } |
1638 | 0 | if (ds & CRAM_PD) { |
1639 | 0 | if (!codecs[DS_PD]) return -1; |
1640 | 0 | r |= codecs[DS_PD]->decode(s, codecs[DS_PD], blk, |
1641 | 0 | (char *)&i32, &out_sz); |
1642 | 0 | if (r) return r; |
1643 | 0 | cig_op = BAM_CPAD; |
1644 | 0 | cig_len += i32; |
1645 | 0 | } |
1646 | 0 | break; |
1647 | 0 | } |
1648 | | |
1649 | 0 | case 'N': { // Ref skip; RS |
1650 | 0 | if (cig_len && cig_op != BAM_CREF_SKIP) { |
1651 | 0 | cigar[ncigar++] = (cig_len<<4) + cig_op; |
1652 | 0 | cig_len = 0; |
1653 | 0 | } |
1654 | 0 | if (ds & CRAM_RS) { |
1655 | 0 | if (!codecs[DS_RS]) return -1; |
1656 | 0 | r |= codecs[DS_RS]->decode(s, codecs[DS_RS], blk, |
1657 | 0 | (char *)&i32, &out_sz); |
1658 | 0 | if (r) return r; |
1659 | 0 | cig_op = BAM_CREF_SKIP; |
1660 | 0 | cig_len += i32; |
1661 | 0 | ref_pos += i32; |
1662 | 0 | } |
1663 | 0 | break; |
1664 | 0 | } |
1665 | | |
1666 | 0 | default: |
1667 | 0 | hts_log_error("Unknown feature code '%c'", op); |
1668 | 0 | return -1; |
1669 | 0 | } |
1670 | 0 | } |
1671 | | |
1672 | 0 | if (!(ds & CRAM_FC)) |
1673 | 0 | goto skip_cigar; |
1674 | | |
1675 | | /* An implicit match op for any unaccounted for bases */ |
1676 | 0 | if ((ds & CRAM_FN) && cr->len >= seq_pos) { |
1677 | 0 | if (s->ref && cr->ref_id >= 0) { |
1678 | 0 | if (ref_pos + cr->len - seq_pos + 1 > bfd->ref[cr->ref_id].len) { |
1679 | 0 | static int whinged = 0; |
1680 | 0 | int rlen; |
1681 | 0 | if (!whinged) |
1682 | 0 | hts_log_warning("Ref pos outside of ref sequence boundary"); |
1683 | 0 | whinged = 1; |
1684 | 0 | rlen = bfd->ref[cr->ref_id].len - ref_pos; |
1685 | | // May miss MD/NM cases where both seq/ref are N, but this is a |
1686 | | // malformed cram file anyway. |
1687 | 0 | if (rlen > 0) { |
1688 | 0 | if (seq_pos-1 + rlen < cr->len) |
1689 | 0 | memcpy(&seq[seq_pos-1], |
1690 | 0 | &s->ref[ref_pos - s->ref_start +1], rlen); |
1691 | 0 | if ((cr->len - seq_pos + 1) - rlen > 0) |
1692 | 0 | memset(&seq[seq_pos-1+rlen], 'N', |
1693 | 0 | (cr->len - seq_pos + 1) - rlen); |
1694 | 0 | } else { |
1695 | 0 | if (cr->len - seq_pos + 1 > 0) |
1696 | 0 | memset(&seq[seq_pos-1], 'N', cr->len - seq_pos + 1); |
1697 | 0 | } |
1698 | 0 | if (md_dist >= 0) |
1699 | 0 | md_dist += cr->len - seq_pos + 1; |
1700 | 0 | } else { |
1701 | 0 | if (cr->len - seq_pos + 1 > 0) { |
1702 | 0 | if (ref_pos + cr->len-seq_pos +1 > s->ref_end) |
1703 | 0 | goto beyond_slice; |
1704 | 0 | int remainder = cr->len - (seq_pos-1); |
1705 | 0 | int j = ref_pos - s->ref_start + 1; |
1706 | 0 | if (decode_md || decode_nm) { |
1707 | 0 | int i; |
1708 | 0 | char *N = memchr(&s->ref[j], 'N', remainder); |
1709 | 0 | if (!N) { |
1710 | | // short cut the common case |
1711 | 0 | md_dist += cr->len - (seq_pos-1); |
1712 | 0 | } else { |
1713 | 0 | char *refp = &s->ref[j-(seq_pos-1)]; |
1714 | 0 | md_dist += N-&s->ref[j]; |
1715 | 0 | int i_start = seq_pos-1 + (N - &s->ref[j]); |
1716 | 0 | for (i = i_start; i < cr->len; i++) { |
1717 | 0 | char base = refp[i]; |
1718 | 0 | if (base == 'N') { |
1719 | 0 | if (add_md_char(s, decode_md, 'N', |
1720 | 0 | &md_dist) < 0) |
1721 | 0 | return -1; |
1722 | 0 | nm++; |
1723 | 0 | } else { |
1724 | 0 | md_dist++; |
1725 | 0 | } |
1726 | 0 | } |
1727 | 0 | } |
1728 | 0 | } |
1729 | 0 | memcpy(&seq[seq_pos-1], &s->ref[j], remainder); |
1730 | 0 | } |
1731 | 0 | ref_pos += cr->len - seq_pos + 1; |
1732 | 0 | } |
1733 | 0 | } else if (cr->ref_id >= 0) { |
1734 | | // So alignment end can be computed even when not decoding sequence |
1735 | 0 | ref_pos += cr->len - seq_pos + 1; |
1736 | 0 | } |
1737 | | |
1738 | 0 | if (ncigar+1 >= cigar_alloc) { |
1739 | 0 | cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024; |
1740 | 0 | if (!(cigar = realloc(s->cigar, cigar_alloc * sizeof(*cigar)))) |
1741 | 0 | return -1; |
1742 | 0 | s->cigar = cigar; |
1743 | 0 | } |
1744 | | #ifdef USE_X |
1745 | | if (cig_len && cig_op != BAM_CBASE_MATCH) { |
1746 | | cigar[ncigar++] = (cig_len<<4) + cig_op; |
1747 | | cig_len = 0; |
1748 | | } |
1749 | | cig_op = BAM_CBASE_MATCH; |
1750 | | #else |
1751 | 0 | if (cig_len && cig_op != BAM_CMATCH) { |
1752 | 0 | cigar[ncigar++] = (cig_len<<4) + cig_op; |
1753 | 0 | cig_len = 0; |
1754 | 0 | } |
1755 | 0 | cig_op = BAM_CMATCH; |
1756 | 0 | #endif |
1757 | 0 | cig_len += cr->len - seq_pos+1; |
1758 | 0 | } |
1759 | | |
1760 | 0 | skip_cigar: |
1761 | |
|
1762 | 0 | if ((ds & CRAM_FN) && decode_md) { |
1763 | 0 | if (md_dist >= 0) |
1764 | 0 | BLOCK_APPEND_UINT(s->aux_blk, md_dist); |
1765 | 0 | } |
1766 | | |
1767 | 0 | if (cig_len) { |
1768 | 0 | if (ncigar >= cigar_alloc) { |
1769 | 0 | cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024; |
1770 | 0 | if (!(cigar = realloc(s->cigar, cigar_alloc * sizeof(*cigar)))) |
1771 | 0 | return -1; |
1772 | 0 | s->cigar = cigar; |
1773 | 0 | } |
1774 | | |
1775 | 0 | cigar[ncigar++] = (cig_len<<4) + cig_op; |
1776 | 0 | } |
1777 | | |
1778 | 0 | cr->ncigar = ncigar - cr->cigar; |
1779 | 0 | cr->aend = ref_pos > cr->apos ? ref_pos : cr->apos; |
1780 | | |
1781 | | //printf("2: %.*s %d .. %d\n", cr->name_len, DSTRING_STR(name_ds) + cr->name, cr->apos, ref_pos); |
1782 | |
|
1783 | 0 | if (ds & CRAM_MQ) { |
1784 | 0 | if (!codecs[DS_MQ]) return -1; |
1785 | 0 | r |= codecs[DS_MQ]->decode(s, codecs[DS_MQ], blk, |
1786 | 0 | (char *)&cr->mqual, &out_sz); |
1787 | 0 | } else { |
1788 | 0 | cr->mqual = 40; |
1789 | 0 | } |
1790 | | |
1791 | 0 | if ((ds & CRAM_QS) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) { |
1792 | 0 | int32_t out_sz2 = cr->len; |
1793 | |
|
1794 | 0 | if (!codecs[DS_QS]) return -1; |
1795 | 0 | r |= codecs[DS_QS]->decode(s, codecs[DS_QS], blk, |
1796 | 0 | qual, &out_sz2); |
1797 | 0 | } |
1798 | | |
1799 | 0 | s->cigar = cigar; |
1800 | 0 | s->cigar_alloc = cigar_alloc; |
1801 | 0 | s->ncigar = ncigar; |
1802 | |
|
1803 | 0 | if (cr->cram_flags & CRAM_FLAG_NO_SEQ) |
1804 | 0 | cr->len = 0; |
1805 | |
|
1806 | 0 | if (decode_md) { |
1807 | 0 | BLOCK_APPEND_CHAR(s->aux_blk, '\0'); // null terminate MD:Z: |
1808 | 0 | size_t sz = BLOCK_SIZE(s->aux_blk) - orig_aux; |
1809 | 0 | if (has_MD < 0) { |
1810 | | // has_MD < 0; already have MDZ allocated in aux at -has_MD, |
1811 | | // but wrote MD to end of aux (at orig_aux). |
1812 | | // We need some memmoves to shuffle it around. |
1813 | 0 | char tmp_MD_[1024], *tmp_MD = tmp_MD_; |
1814 | 0 | unsigned char *orig_aux_p = BLOCK_DATA(s->aux_blk) + orig_aux; |
1815 | 0 | if (sz > 1024) { |
1816 | 0 | tmp_MD = malloc(sz); |
1817 | 0 | if (!tmp_MD) |
1818 | 0 | return -1; |
1819 | 0 | } |
1820 | 0 | memcpy(tmp_MD, orig_aux_p, sz); |
1821 | 0 | memmove(&BLOCK_DATA(s->aux_blk)[-has_MD] + sz, |
1822 | 0 | &BLOCK_DATA(s->aux_blk)[-has_MD], |
1823 | 0 | orig_aux_p - &BLOCK_DATA(s->aux_blk)[-has_MD]); |
1824 | 0 | memcpy(&BLOCK_DATA(s->aux_blk)[-has_MD], tmp_MD, sz); |
1825 | 0 | if (tmp_MD != tmp_MD_) |
1826 | 0 | free(tmp_MD); |
1827 | |
|
1828 | 0 | if (-has_NM > -has_MD) |
1829 | | // we inserted before NM, so move it up a bit |
1830 | 0 | has_NM -= sz; |
1831 | 0 | } |
1832 | | // else has_MD == 0 and we've already appended MD to the end. |
1833 | | |
1834 | 0 | cr->aux_size += sz; |
1835 | 0 | } |
1836 | | |
1837 | 0 | if (decode_nm) { |
1838 | 0 | if (has_NM == 0) { |
1839 | 0 | char buf[7]; |
1840 | 0 | size_t buf_size; |
1841 | 0 | buf[0] = 'N'; buf[1] = 'M'; |
1842 | 0 | if (nm <= UINT8_MAX) { |
1843 | 0 | buf_size = 4; |
1844 | 0 | buf[2] = 'C'; |
1845 | 0 | buf[3] = (nm>> 0) & 0xff; |
1846 | 0 | } else if (nm <= UINT16_MAX) { |
1847 | 0 | buf_size = 5; |
1848 | 0 | buf[2] = 'S'; |
1849 | 0 | buf[3] = (nm>> 0) & 0xff; |
1850 | 0 | buf[4] = (nm>> 8) & 0xff; |
1851 | 0 | } else { |
1852 | 0 | buf_size = 7; |
1853 | 0 | buf[2] = 'I'; |
1854 | 0 | buf[3] = (nm>> 0) & 0xff; |
1855 | 0 | buf[4] = (nm>> 8) & 0xff; |
1856 | 0 | buf[5] = (nm>>16) & 0xff; |
1857 | 0 | buf[6] = (nm>>24) & 0xff; |
1858 | 0 | } |
1859 | 0 | BLOCK_APPEND(s->aux_blk, buf, buf_size); |
1860 | 0 | cr->aux_size += buf_size; |
1861 | 0 | } else { |
1862 | | // Preallocated space for NM at -has_NM into aux block |
1863 | 0 | unsigned char *buf = BLOCK_DATA(s->aux_blk) + -has_NM; |
1864 | 0 | buf[0] = (nm>> 0) & 0xff; |
1865 | 0 | buf[1] = (nm>> 8) & 0xff; |
1866 | 0 | buf[2] = (nm>>16) & 0xff; |
1867 | 0 | buf[3] = (nm>>24) & 0xff; |
1868 | 0 | } |
1869 | 0 | } |
1870 | | |
1871 | 0 | return r; |
1872 | | |
1873 | 0 | beyond_slice: |
1874 | | // Cramtools can create CRAMs that have sequence features outside the |
1875 | | // stated range of the container & slice reference extents (start + span). |
1876 | | // We have to check for these in many places, but for brevity have the |
1877 | | // error reporting in only one. |
1878 | 0 | hts_log_error("CRAM CIGAR extends beyond slice reference extents"); |
1879 | 0 | return -1; |
1880 | | |
1881 | 0 | block_err: |
1882 | 0 | return -1; |
1883 | 0 | } |
1884 | | |
1885 | | /* |
1886 | | * Quick and simple hash lookup for cram_map arrays |
1887 | | */ |
1888 | 0 | static cram_map *map_find(cram_map **map, unsigned char *key, int id) { |
1889 | 0 | cram_map *m; |
1890 | |
|
1891 | 0 | m = map[CRAM_MAP(key[0],key[1])]; |
1892 | 0 | while (m && m->key != id) |
1893 | 0 | m= m->next; |
1894 | |
|
1895 | 0 | return m; |
1896 | 0 | } |
1897 | | |
1898 | | //#define map_find(M,K,I) M[CRAM_MAP(K[0],K[1])];while (m && m->key != I);m= m->next |
1899 | | |
1900 | | |
1901 | | static int cram_decode_aux_1_0(cram_container *c, cram_slice *s, |
1902 | 0 | cram_block *blk, cram_record *cr) { |
1903 | 0 | int i, r = 0, out_sz = 1; |
1904 | 0 | unsigned char ntags; |
1905 | |
|
1906 | 0 | if (!c->comp_hdr->codecs[DS_TC]) return -1; |
1907 | 0 | r |= c->comp_hdr->codecs[DS_TC]->decode(s, c->comp_hdr->codecs[DS_TC], blk, |
1908 | 0 | (char *)&ntags, &out_sz); |
1909 | 0 | cr->ntags = ntags; |
1910 | | |
1911 | | //printf("TC=%d\n", cr->ntags); |
1912 | 0 | cr->aux_size = 0; |
1913 | 0 | cr->aux = BLOCK_SIZE(s->aux_blk); |
1914 | |
|
1915 | 0 | for (i = 0; i < cr->ntags; i++) { |
1916 | 0 | int32_t id, out_sz = 1; |
1917 | 0 | unsigned char tag_data[3]; |
1918 | 0 | cram_map *m; |
1919 | | |
1920 | | //printf("Tag %d/%d\n", i+1, cr->ntags); |
1921 | 0 | if (!c->comp_hdr->codecs[DS_TN]) return -1; |
1922 | 0 | r |= c->comp_hdr->codecs[DS_TN]->decode(s, c->comp_hdr->codecs[DS_TN], |
1923 | 0 | blk, (char *)&id, &out_sz); |
1924 | 0 | if (out_sz == 3) { |
1925 | | // Tag name stored as 3 chars instead of an int? |
1926 | 0 | memcpy(tag_data, &id, 3); |
1927 | 0 | } else { |
1928 | 0 | tag_data[0] = (id>>16) & 0xff; |
1929 | 0 | tag_data[1] = (id>>8) & 0xff; |
1930 | 0 | tag_data[2] = id & 0xff; |
1931 | 0 | } |
1932 | |
|
1933 | 0 | m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id); |
1934 | 0 | if (!m) |
1935 | 0 | return -1; |
1936 | 0 | BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3); |
1937 | | |
1938 | 0 | if (!m->codec) return -1; |
1939 | 0 | r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz); |
1940 | |
|
1941 | 0 | cr->aux_size += out_sz + 3; |
1942 | 0 | } |
1943 | | |
1944 | 0 | return r; |
1945 | | |
1946 | 0 | block_err: |
1947 | 0 | return -1; |
1948 | 0 | } |
1949 | | |
1950 | | // has_MD and has_NM are filled out with 0 for none present, |
1951 | | // 1 for present and verbatim, and -pos for present as placeholder |
1952 | | // (MD*, NM*) to be generated and filled out at offset +pos. |
1953 | | static int cram_decode_aux(cram_fd *fd, |
1954 | | cram_container *c, cram_slice *s, |
1955 | | cram_block *blk, cram_record *cr, |
1956 | 0 | int *has_MD, int *has_NM) { |
1957 | 0 | int i, r = 0, out_sz = 1; |
1958 | 0 | int32_t TL = 0; |
1959 | 0 | unsigned char *TN; |
1960 | 0 | uint32_t ds = s->data_series; |
1961 | |
|
1962 | 0 | if (!(ds & (CRAM_TL|CRAM_aux))) { |
1963 | 0 | cr->aux = 0; |
1964 | 0 | cr->aux_size = 0; |
1965 | 0 | return 0; |
1966 | 0 | } |
1967 | | |
1968 | 0 | if (!c->comp_hdr->codecs[DS_TL]) return -1; |
1969 | 0 | r |= c->comp_hdr->codecs[DS_TL]->decode(s, c->comp_hdr->codecs[DS_TL], blk, |
1970 | 0 | (char *)&TL, &out_sz); |
1971 | 0 | if (r || TL < 0 || TL >= c->comp_hdr->nTL) |
1972 | 0 | return -1; |
1973 | | |
1974 | 0 | TN = c->comp_hdr->TL[TL]; |
1975 | 0 | cr->ntags = strlen((char *)TN)/3; // optimise to remove strlen |
1976 | | |
1977 | | //printf("TC=%d\n", cr->ntags); |
1978 | 0 | cr->aux_size = 0; |
1979 | 0 | cr->aux = BLOCK_SIZE(s->aux_blk); |
1980 | |
|
1981 | 0 | if (!(ds & CRAM_aux)) |
1982 | 0 | return 0; |
1983 | | |
1984 | 0 | for (i = 0; i < cr->ntags; i++) { |
1985 | 0 | int32_t id, out_sz = 1; |
1986 | 0 | unsigned char tag_data[7]; |
1987 | 0 | cram_map *m; |
1988 | |
|
1989 | 0 | if (TN[0] == 'M' && TN[1] == 'D' && has_MD) |
1990 | 0 | *has_MD = (BLOCK_SIZE(s->aux_blk)+3) * (TN[2] == '*' ? -1 : 1); |
1991 | 0 | if (TN[0] == 'N' && TN[1] == 'M' && has_NM) |
1992 | 0 | *has_NM = (BLOCK_SIZE(s->aux_blk)+3) * (TN[2] == '*' ? -1 : 1);; |
1993 | | |
1994 | | //printf("Tag %d/%d\n", i+1, cr->ntags); |
1995 | 0 | tag_data[0] = TN[0]; |
1996 | 0 | tag_data[1] = TN[1]; |
1997 | 0 | tag_data[2] = TN[2]; |
1998 | 0 | id = (tag_data[0]<<16) | (tag_data[1]<<8) | tag_data[2]; |
1999 | |
|
2000 | 0 | if (CRAM_MAJOR_VERS(fd->version) >= 4 && TN[2] == '*') { |
2001 | | // Place holder, fill out contents later. |
2002 | 0 | int tag_data_size; |
2003 | 0 | if (TN[0] == 'N' && TN[1] == 'M') { |
2004 | | // Use a fixed size, so we can allocate room for it now. |
2005 | 0 | memcpy(&tag_data[2], "I\0\0\0\0", 5); |
2006 | 0 | tag_data_size = 7; |
2007 | 0 | } else if (TN[0] == 'R' && TN[1] == 'G') { |
2008 | | // RG is variable size, but known already. Insert now |
2009 | 0 | TN += 3; |
2010 | | // Equiv to fd->header->hrecs->rg[cr->rg], but this is the |
2011 | | // new header API equivalent. |
2012 | 0 | const char *rg = sam_hdr_line_name(fd->header, "RG", cr->rg); |
2013 | 0 | if (!rg) |
2014 | 0 | continue; |
2015 | | |
2016 | 0 | size_t rg_len = strlen(rg); |
2017 | 0 | tag_data[2] = 'Z'; |
2018 | 0 | BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3); |
2019 | 0 | BLOCK_APPEND(s->aux_blk, rg, rg_len); |
2020 | 0 | BLOCK_APPEND_CHAR(s->aux_blk, '\0'); |
2021 | 0 | cr->aux_size += 3 + rg_len + 1; |
2022 | 0 | cr->rg = -1; // prevents auto-add later |
2023 | 0 | continue; |
2024 | 0 | } else { |
2025 | | // Unknown size. We'll insert MD into stream later. |
2026 | 0 | tag_data[2] = 'Z'; |
2027 | 0 | tag_data_size = 3; |
2028 | 0 | } |
2029 | 0 | BLOCK_APPEND(s->aux_blk, (char *)tag_data, tag_data_size); |
2030 | 0 | cr->aux_size += tag_data_size; |
2031 | 0 | TN += 3; |
2032 | 0 | } else { |
2033 | 0 | TN += 3; |
2034 | 0 | m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id); |
2035 | 0 | if (!m) |
2036 | 0 | return -1; |
2037 | | |
2038 | 0 | BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3); |
2039 | | |
2040 | 0 | if (!m->codec) return -1; |
2041 | 0 | r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz); |
2042 | 0 | if (r) break; |
2043 | 0 | cr->aux_size += out_sz + 3; |
2044 | | |
2045 | | // cF CRAM flags. |
2046 | 0 | if (TN[-3]=='c' && TN[-2]=='F' && TN[-1]=='C' && out_sz == 1) { |
2047 | | // Remove cF tag |
2048 | 0 | uint8_t cF = BLOCK_END(s->aux_blk)[-1]; |
2049 | 0 | BLOCK_SIZE(s->aux_blk) -= out_sz+3; |
2050 | 0 | cr->aux_size -= out_sz+3; |
2051 | | |
2052 | | // bit 1 => don't auto-decode MD. |
2053 | | // Pretend MD is present verbatim, so we don't auto-generate |
2054 | 0 | if ((cF & 1) && has_MD && *has_MD == 0) |
2055 | 0 | *has_MD = 1; |
2056 | | |
2057 | | // bit 1 => don't auto-decode NM |
2058 | 0 | if ((cF & 2) && has_NM && *has_NM == 0) |
2059 | 0 | *has_NM = 1; |
2060 | 0 | } |
2061 | 0 | } |
2062 | | |
2063 | | // We could go to 2^32 fine, but we shouldn't be hitting this anyway, |
2064 | | // and it's protecting against memory hogs too. |
2065 | 0 | if (BLOCK_SIZE(s->aux_blk) > (1u<<31)) { |
2066 | 0 | hts_log_error("CRAM->BAM aux block size overflow"); |
2067 | 0 | goto block_err; |
2068 | 0 | } |
2069 | 0 | } |
2070 | | |
2071 | 0 | return r; |
2072 | | |
2073 | 0 | block_err: |
2074 | 0 | return -1; |
2075 | 0 | } |
2076 | | |
2077 | | /* Resolve mate pair cross-references between recs within this slice */ |
2078 | 408 | static int cram_decode_slice_xref(cram_slice *s, int required_fields) { |
2079 | 408 | int rec; |
2080 | | |
2081 | 408 | if (!(required_fields & (SAM_RNEXT | SAM_PNEXT | SAM_TLEN))) { |
2082 | 0 | for (rec = 0; rec < s->hdr->num_records; rec++) { |
2083 | 0 | cram_record *cr = &s->crecs[rec]; |
2084 | |
|
2085 | 0 | cr->tlen = 0; |
2086 | 0 | cr->mate_pos = 0; |
2087 | 0 | cr->mate_ref_id = -1; |
2088 | 0 | } |
2089 | |
|
2090 | 0 | return 0; |
2091 | 0 | } |
2092 | | |
2093 | 408 | for (rec = 0; rec < s->hdr->num_records; rec++) { |
2094 | 0 | cram_record *cr = &s->crecs[rec]; |
2095 | |
|
2096 | 0 | if (cr->mate_line >= 0) { |
2097 | 0 | if (cr->mate_line < s->hdr->num_records) { |
2098 | | /* |
2099 | | * On the first read, loop through computing lengths. |
2100 | | * It's not perfect as we have one slice per reference so we |
2101 | | * cannot detect when TLEN should be zero due to seqs that |
2102 | | * map to multiple references. |
2103 | | * |
2104 | | * We also cannot set tlen correct when it spans a slice for |
2105 | | * other reasons. This may make tlen too small. Should we |
2106 | | * fix this by forcing TLEN to be stored verbatim in such cases? |
2107 | | * |
2108 | | * Or do we just admit defeat and output 0 for tlen? It's the |
2109 | | * safe option... |
2110 | | */ |
2111 | 0 | if (cr->tlen == INT64_MIN) { |
2112 | 0 | int id1 = rec, id2 = rec; |
2113 | 0 | int64_t aleft = cr->apos, aright = cr->aend; |
2114 | 0 | int64_t tlen; |
2115 | 0 | int ref = cr->ref_id; |
2116 | | |
2117 | | // number of segments starting at the same point. |
2118 | 0 | int left_cnt = 0; |
2119 | |
|
2120 | 0 | do { |
2121 | 0 | if (aleft > s->crecs[id2].apos) |
2122 | 0 | aleft = s->crecs[id2].apos, left_cnt = 1; |
2123 | 0 | else if (aleft == s->crecs[id2].apos) |
2124 | 0 | left_cnt++; |
2125 | 0 | if (aright < s->crecs[id2].aend) |
2126 | 0 | aright = s->crecs[id2].aend; |
2127 | 0 | if (s->crecs[id2].mate_line == -1) { |
2128 | 0 | s->crecs[id2].mate_line = rec; |
2129 | 0 | break; |
2130 | 0 | } |
2131 | 0 | if (s->crecs[id2].mate_line <= id2 || |
2132 | 0 | s->crecs[id2].mate_line >= s->hdr->num_records) |
2133 | 0 | return -1; |
2134 | 0 | id2 = s->crecs[id2].mate_line; |
2135 | |
|
2136 | 0 | if (s->crecs[id2].ref_id != ref) |
2137 | 0 | ref = -1; |
2138 | 0 | } while (id2 != id1); |
2139 | | |
2140 | 0 | if (ref != -1) { |
2141 | 0 | tlen = aright - aleft + 1; |
2142 | 0 | id1 = id2 = rec; |
2143 | | |
2144 | | /* |
2145 | | * When we have two seqs with identical start and |
2146 | | * end coordinates, set +/- tlen based on 1st/last |
2147 | | * bit flags instead, as a tie breaker. |
2148 | | */ |
2149 | 0 | if (s->crecs[id2].apos == aleft) { |
2150 | 0 | if (left_cnt == 1 || |
2151 | 0 | (s->crecs[id2].flags & BAM_FREAD1)) |
2152 | 0 | s->crecs[id2].tlen = tlen; |
2153 | 0 | else |
2154 | 0 | s->crecs[id2].tlen = -tlen; |
2155 | 0 | } else { |
2156 | 0 | s->crecs[id2].tlen = -tlen; |
2157 | 0 | } |
2158 | |
|
2159 | 0 | id2 = s->crecs[id2].mate_line; |
2160 | 0 | while (id2 != id1) { |
2161 | 0 | if (s->crecs[id2].apos == aleft) { |
2162 | 0 | if (left_cnt == 1 || |
2163 | 0 | (s->crecs[id2].flags & BAM_FREAD1)) |
2164 | 0 | s->crecs[id2].tlen = tlen; |
2165 | 0 | else |
2166 | 0 | s->crecs[id2].tlen = -tlen; |
2167 | 0 | } else { |
2168 | 0 | s->crecs[id2].tlen = -tlen; |
2169 | 0 | } |
2170 | 0 | id2 = s->crecs[id2].mate_line; |
2171 | 0 | } |
2172 | 0 | } else { |
2173 | 0 | id1 = id2 = rec; |
2174 | |
|
2175 | 0 | s->crecs[id2].tlen = 0; |
2176 | 0 | id2 = s->crecs[id2].mate_line; |
2177 | 0 | while (id2 != id1) { |
2178 | 0 | s->crecs[id2].tlen = 0; |
2179 | 0 | id2 = s->crecs[id2].mate_line; |
2180 | 0 | } |
2181 | 0 | } |
2182 | 0 | } |
2183 | | |
2184 | 0 | cr->mate_pos = s->crecs[cr->mate_line].apos; |
2185 | 0 | cr->mate_ref_id = s->crecs[cr->mate_line].ref_id; |
2186 | | |
2187 | | // paired |
2188 | 0 | cr->flags |= BAM_FPAIRED; |
2189 | | |
2190 | | // set mate unmapped if needed |
2191 | 0 | if (s->crecs[cr->mate_line].flags & BAM_FUNMAP) { |
2192 | 0 | cr->flags |= BAM_FMUNMAP; |
2193 | 0 | cr->tlen = 0; |
2194 | 0 | } |
2195 | 0 | if (cr->flags & BAM_FUNMAP) { |
2196 | 0 | cr->tlen = 0; |
2197 | 0 | } |
2198 | | |
2199 | | // set mate reversed if needed |
2200 | 0 | if (s->crecs[cr->mate_line].flags & BAM_FREVERSE) |
2201 | 0 | cr->flags |= BAM_FMREVERSE; |
2202 | 0 | } else { |
2203 | 0 | hts_log_error("Mate line out of bounds: %d vs [0, %d]", |
2204 | 0 | cr->mate_line, s->hdr->num_records-1); |
2205 | 0 | } |
2206 | | |
2207 | | /* FIXME: construct read names here too if needed */ |
2208 | 0 | } else { |
2209 | 0 | if (cr->mate_flags & CRAM_M_REVERSE) { |
2210 | 0 | cr->flags |= BAM_FPAIRED | BAM_FMREVERSE; |
2211 | 0 | } |
2212 | 0 | if (cr->mate_flags & CRAM_M_UNMAP) { |
2213 | 0 | cr->flags |= BAM_FMUNMAP; |
2214 | | //cr->mate_ref_id = -1; |
2215 | 0 | } |
2216 | 0 | if (!(cr->flags & BAM_FPAIRED)) |
2217 | 0 | cr->mate_ref_id = -1; |
2218 | 0 | } |
2219 | | |
2220 | 0 | if (cr->tlen == INT64_MIN) |
2221 | 0 | cr->tlen = 0; // Just incase |
2222 | 0 | } |
2223 | | |
2224 | 408 | for (rec = 0; rec < s->hdr->num_records; rec++) { |
2225 | 0 | cram_record *cr = &s->crecs[rec]; |
2226 | 0 | if (cr->explicit_tlen != INT64_MIN) |
2227 | 0 | cr->tlen = cr->explicit_tlen; |
2228 | 0 | } |
2229 | | |
2230 | 408 | return 0; |
2231 | 408 | } |
2232 | | |
2233 | 0 | static char *md5_print(unsigned char *md5, char *out) { |
2234 | 0 | int i; |
2235 | 0 | for (i = 0; i < 16; i++) { |
2236 | 0 | out[i*2+0] = "0123456789abcdef"[md5[i]>>4]; |
2237 | 0 | out[i*2+1] = "0123456789abcdef"[md5[i]&15]; |
2238 | 0 | } |
2239 | 0 | out[32] = 0; |
2240 | |
|
2241 | 0 | return out; |
2242 | 0 | } |
2243 | | |
2244 | | /* |
2245 | | * Utility function to decode tlen (ISIZE), as it's called |
2246 | | * in multiple places. |
2247 | | * |
2248 | | * Returns codec return value (0 on success). |
2249 | | */ |
2250 | | static int cram_decode_tlen(cram_fd *fd, cram_container *c, cram_slice *s, |
2251 | 0 | cram_block *blk, int64_t *tlen) { |
2252 | 0 | int out_sz = 1, r = 0; |
2253 | |
|
2254 | 0 | if (!c->comp_hdr->codecs[DS_TS]) return -1; |
2255 | 0 | if (CRAM_MAJOR_VERS(fd->version) < 4) { |
2256 | 0 | int32_t i32; |
2257 | 0 | r |= c->comp_hdr->codecs[DS_TS] |
2258 | 0 | ->decode(s, c->comp_hdr->codecs[DS_TS], blk, |
2259 | 0 | (char *)&i32, &out_sz); |
2260 | 0 | *tlen = i32; |
2261 | 0 | } else { |
2262 | 0 | r |= c->comp_hdr->codecs[DS_TS] |
2263 | 0 | ->decode(s, c->comp_hdr->codecs[DS_TS], blk, |
2264 | 0 | (char *)tlen, &out_sz); |
2265 | 0 | } |
2266 | 0 | return r; |
2267 | 0 | } |
2268 | | |
2269 | | /* |
2270 | | * Decode an entire slice from container blocks. Fills out s->crecs[] array. |
2271 | | * Returns 0 on success |
2272 | | * -1 on failure |
2273 | | */ |
2274 | | int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, |
2275 | 480 | sam_hdr_t *sh) { |
2276 | 480 | cram_block *blk = s->block[0]; |
2277 | 480 | int32_t bf, ref_id; |
2278 | 480 | unsigned char cf; |
2279 | 480 | int out_sz, r = 0; |
2280 | 480 | int rec; |
2281 | 480 | char *seq = NULL, *qual = NULL; |
2282 | 480 | int unknown_rg = -1; |
2283 | 480 | int embed_ref; |
2284 | 480 | char **refs = NULL; |
2285 | 480 | uint32_t ds; |
2286 | 480 | sam_hrecs_t *bfd = sh->hrecs; |
2287 | | |
2288 | 480 | if (cram_dependent_data_series(fd, c->comp_hdr, s) != 0) |
2289 | 9 | return -1; |
2290 | | |
2291 | 471 | ds = s->data_series; |
2292 | | |
2293 | 471 | blk->bit = 7; // MSB first |
2294 | | |
2295 | | // Study the blocks and estimate approx sizes to preallocate. |
2296 | | // This looks to speed up decoding by around 8-9%. |
2297 | | // We can always shrink back down at the end if we overestimated. |
2298 | | // However it's likely that this also saves memory as own growth |
2299 | | // factor (*=1.5) is never applied. |
2300 | 471 | { |
2301 | 471 | int qsize, nsize, q_id; |
2302 | 471 | cram_decode_estimate_sizes(c->comp_hdr, s, &qsize, &nsize, &q_id); |
2303 | | //fprintf(stderr, "qsize=%d nsize=%d\n", qsize, nsize); |
2304 | | |
2305 | 471 | if (qsize && (ds & CRAM_RL)) BLOCK_RESIZE_EXACT(s->seqs_blk, qsize+1); |
2306 | 471 | if (qsize && (ds & CRAM_RL)) BLOCK_RESIZE_EXACT(s->qual_blk, qsize+1); |
2307 | 471 | if (nsize && (ds & CRAM_NS)) BLOCK_RESIZE_EXACT(s->name_blk, nsize+1); |
2308 | | |
2309 | | // To do - consider using q_id here to usurp the quality block and |
2310 | | // avoid a memcpy during decode. |
2311 | | // Specifically when quality is an external block uniquely used by |
2312 | | // DS_QS only, then we can set s->qual_blk directly to this |
2313 | | // block and save the codec->decode() calls. (Approx 3% cpu saving) |
2314 | 471 | } |
2315 | | |
2316 | | /* Look for unknown RG, added as last by Java CRAM? */ |
2317 | 471 | if (bfd->nrg > 0 && |
2318 | 471 | bfd->rg[bfd->nrg-1].name != NULL && |
2319 | 471 | !strcmp(bfd->rg[bfd->nrg-1].name, "UNKNOWN")) |
2320 | 0 | unknown_rg = bfd->nrg-1; |
2321 | | |
2322 | 471 | if (blk->content_type != CORE) |
2323 | 6 | return -1; |
2324 | | |
2325 | 465 | if (s->crecs) |
2326 | 0 | free(s->crecs); |
2327 | 465 | if (!(s->crecs = malloc(s->hdr->num_records * sizeof(*s->crecs)))) |
2328 | 0 | return -1; |
2329 | | |
2330 | 465 | ref_id = s->hdr->ref_seq_id; |
2331 | 465 | if (CRAM_MAJOR_VERS(fd->version) < 4) |
2332 | 465 | embed_ref = s->hdr->ref_base_id >= 0 ? 1 : 0; |
2333 | 0 | else |
2334 | 0 | embed_ref = s->hdr->ref_base_id > 0 ? 1 : 0; |
2335 | | |
2336 | 465 | if (ref_id >= 0) { |
2337 | 405 | if (embed_ref) { |
2338 | 399 | cram_block *b; |
2339 | 399 | if (s->hdr->ref_base_id < 0) { |
2340 | 0 | hts_log_error("No reference specified and no embedded reference is available" |
2341 | 0 | " at #%d:%"PRId64"-%"PRId64, ref_id, s->hdr->ref_seq_start, |
2342 | 0 | s->hdr->ref_seq_start + s->hdr->ref_seq_span-1); |
2343 | 0 | return -1; |
2344 | 0 | } |
2345 | 399 | b = cram_get_block_by_id(s, s->hdr->ref_base_id); |
2346 | 399 | if (!b) |
2347 | 27 | return -1; |
2348 | 372 | if (cram_uncompress_block(b) != 0) |
2349 | 0 | return -1; |
2350 | 372 | s->ref = (char *)BLOCK_DATA(b); |
2351 | 372 | s->ref_start = s->hdr->ref_seq_start; |
2352 | 372 | s->ref_end = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1; |
2353 | 372 | if (s->hdr->ref_seq_span > b->uncomp_size) { |
2354 | 3 | hts_log_error("Embedded reference is too small at #%d:%"PRIhts_pos"-%"PRIhts_pos, |
2355 | 3 | ref_id, s->ref_start, s->ref_end); |
2356 | 3 | return -1; |
2357 | 3 | } |
2358 | 372 | } else if (!c->comp_hdr->no_ref) { |
2359 | | //// Avoid Java cramtools bug by loading entire reference seq |
2360 | | //s->ref = cram_get_ref(fd, s->hdr->ref_seq_id, 1, 0); |
2361 | | //s->ref_start = 1; |
2362 | | |
2363 | 6 | if (fd->required_fields & SAM_SEQ) { |
2364 | 6 | s->ref = |
2365 | 6 | cram_get_ref(fd, s->hdr->ref_seq_id, |
2366 | 6 | s->hdr->ref_seq_start, |
2367 | 6 | s->hdr->ref_seq_start + s->hdr->ref_seq_span -1); |
2368 | 6 | } |
2369 | 6 | s->ref_start = s->hdr->ref_seq_start; |
2370 | 6 | s->ref_end = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1; |
2371 | | |
2372 | | /* Sanity check */ |
2373 | 6 | if (s->ref_start < 0) { |
2374 | 0 | hts_log_warning("Slice starts before base 1" |
2375 | 0 | " at #%d:%"PRId64"-%"PRId64, ref_id, s->hdr->ref_seq_start, |
2376 | 0 | s->hdr->ref_seq_start + s->hdr->ref_seq_span-1); |
2377 | 0 | s->ref_start = 0; |
2378 | 0 | } |
2379 | 6 | pthread_mutex_lock(&fd->ref_lock); |
2380 | 6 | pthread_mutex_lock(&fd->refs->lock); |
2381 | 6 | if ((fd->required_fields & SAM_SEQ) && |
2382 | 6 | ref_id < fd->refs->nref && fd->refs->ref_id && |
2383 | 6 | s->ref_end > fd->refs->ref_id[ref_id]->length) { |
2384 | 0 | s->ref_end = fd->refs->ref_id[ref_id]->length; |
2385 | 0 | } |
2386 | 6 | pthread_mutex_unlock(&fd->refs->lock); |
2387 | 6 | pthread_mutex_unlock(&fd->ref_lock); |
2388 | 6 | } |
2389 | 405 | } |
2390 | | |
2391 | 435 | if ((fd->required_fields & SAM_SEQ) && |
2392 | 435 | s->ref == NULL && s->hdr->ref_seq_id >= 0 && !c->comp_hdr->no_ref) { |
2393 | 6 | hts_log_error("Unable to fetch reference #%d:%"PRId64"-%"PRId64"\n", |
2394 | 6 | ref_id, s->hdr->ref_seq_start, |
2395 | 6 | s->hdr->ref_seq_start + s->hdr->ref_seq_span-1); |
2396 | 6 | return -1; |
2397 | 6 | } |
2398 | | |
2399 | 429 | if (CRAM_MAJOR_VERS(fd->version) != 1 |
2400 | 429 | && (fd->required_fields & SAM_SEQ) |
2401 | 429 | && s->hdr->ref_seq_id >= 0 |
2402 | 429 | && !fd->ignore_md5 |
2403 | 429 | && memcmp(s->hdr->md5, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", 16)) { |
2404 | 0 | hts_md5_context *md5; |
2405 | 0 | unsigned char digest[16]; |
2406 | |
|
2407 | 0 | if (s->ref && s->hdr->ref_seq_id >= 0) { |
2408 | 0 | int start, len; |
2409 | |
|
2410 | 0 | if (s->hdr->ref_seq_start >= s->ref_start) { |
2411 | 0 | start = s->hdr->ref_seq_start - s->ref_start; |
2412 | 0 | } else { |
2413 | 0 | hts_log_warning("Slice starts before base 1 at #%d:%"PRIhts_pos"-%"PRIhts_pos, |
2414 | 0 | ref_id, s->ref_start, s->ref_end); |
2415 | 0 | start = 0; |
2416 | 0 | } |
2417 | |
|
2418 | 0 | if (s->hdr->ref_seq_span <= s->ref_end - s->ref_start + 1) { |
2419 | 0 | len = s->hdr->ref_seq_span; |
2420 | 0 | } else { |
2421 | 0 | hts_log_warning("Slice ends beyond reference end at #%d:%"PRIhts_pos"-%"PRIhts_pos, |
2422 | 0 | ref_id, s->ref_start, s->ref_end); |
2423 | 0 | len = s->ref_end - s->ref_start + 1; |
2424 | 0 | } |
2425 | |
|
2426 | 0 | if (!(md5 = hts_md5_init())) |
2427 | 0 | return -1; |
2428 | 0 | if (start + len > s->ref_end - s->ref_start + 1) |
2429 | 0 | len = s->ref_end - s->ref_start + 1 - start; |
2430 | 0 | if (len >= 0) |
2431 | 0 | hts_md5_update(md5, s->ref + start, len); |
2432 | 0 | hts_md5_final(digest, md5); |
2433 | 0 | hts_md5_destroy(md5); |
2434 | 0 | } else if (!s->ref && s->hdr->ref_base_id >= 0) { |
2435 | 0 | cram_block *b = cram_get_block_by_id(s, s->hdr->ref_base_id); |
2436 | 0 | if (b) { |
2437 | 0 | if (!(md5 = hts_md5_init())) |
2438 | 0 | return -1; |
2439 | 0 | hts_md5_update(md5, b->data, b->uncomp_size); |
2440 | 0 | hts_md5_final(digest, md5); |
2441 | 0 | hts_md5_destroy(md5); |
2442 | 0 | } |
2443 | 0 | } |
2444 | | |
2445 | 0 | if (!c->comp_hdr->no_ref && |
2446 | 0 | ((!s->ref && s->hdr->ref_base_id < 0) |
2447 | 0 | || memcmp(digest, s->hdr->md5, 16) != 0)) { |
2448 | 0 | char M[33]; |
2449 | 0 | const char *rname = sam_hdr_tid2name(sh, ref_id); |
2450 | 0 | if (!rname) rname="?"; // cannot happen normally |
2451 | 0 | hts_log_error("MD5 checksum reference mismatch at %s:%"PRIhts_pos"-%"PRIhts_pos, |
2452 | 0 | rname, s->ref_start, s->ref_end); |
2453 | 0 | hts_log_error("CRAM : %s", md5_print(s->hdr->md5, M)); |
2454 | 0 | hts_log_error("Ref : %s", md5_print(digest, M)); |
2455 | 0 | kstring_t ks = KS_INITIALIZE; |
2456 | 0 | if (sam_hdr_find_tag_id(sh, "SQ", "SN", rname, "M5", &ks) == 0) |
2457 | 0 | hts_log_error("@SQ M5: %s", ks.s); |
2458 | 0 | hts_log_error("Please check the reference given is correct"); |
2459 | 0 | ks_free(&ks); |
2460 | 0 | return -1; |
2461 | 0 | } |
2462 | 0 | } |
2463 | | |
2464 | 429 | if (ref_id == -2) { |
2465 | 33 | pthread_mutex_lock(&fd->ref_lock); |
2466 | 33 | pthread_mutex_lock(&fd->refs->lock); |
2467 | 33 | refs = calloc(fd->refs->nref, sizeof(char *)); |
2468 | 33 | pthread_mutex_unlock(&fd->refs->lock); |
2469 | 33 | pthread_mutex_unlock(&fd->ref_lock); |
2470 | 33 | if (!refs) |
2471 | 0 | return -1; |
2472 | 33 | } |
2473 | | |
2474 | 429 | int last_ref_id = -9; // Arbitrary -ve marker for not-yet-set |
2475 | 429 | for (rec = 0; rec < s->hdr->num_records; rec++) { |
2476 | 21 | cram_record *cr = &s->crecs[rec]; |
2477 | 21 | int has_MD, has_NM; |
2478 | | |
2479 | | //fprintf(stderr, "Decode seq %d, %d/%d\n", rec, blk->byte, blk->bit); |
2480 | | |
2481 | 21 | cr->s = s; |
2482 | | |
2483 | 21 | out_sz = 1; /* decode 1 item */ |
2484 | 21 | if (ds & CRAM_BF) { |
2485 | 21 | if (!c->comp_hdr->codecs[DS_BF]) goto block_err; |
2486 | 0 | r |= c->comp_hdr->codecs[DS_BF] |
2487 | 0 | ->decode(s, c->comp_hdr->codecs[DS_BF], blk, |
2488 | 0 | (char *)&bf, &out_sz); |
2489 | 0 | if (r || bf < 0 || |
2490 | 0 | bf >= sizeof(fd->bam_flag_swap)/sizeof(*fd->bam_flag_swap)) |
2491 | 0 | goto block_err; |
2492 | 0 | bf = fd->bam_flag_swap[bf]; |
2493 | 0 | cr->flags = bf; |
2494 | 0 | } else { |
2495 | 0 | cr->flags = bf = 0x4; // unmapped |
2496 | 0 | } |
2497 | | |
2498 | 0 | if (ds & CRAM_CF) { |
2499 | 0 | if (CRAM_MAJOR_VERS(fd->version) == 1) { |
2500 | | /* CF is byte in 1.0, int32 in 2.0 */ |
2501 | 0 | if (!c->comp_hdr->codecs[DS_CF]) goto block_err; |
2502 | 0 | r |= c->comp_hdr->codecs[DS_CF] |
2503 | 0 | ->decode(s, c->comp_hdr->codecs[DS_CF], blk, |
2504 | 0 | (char *)&cf, &out_sz); |
2505 | 0 | if (r) goto block_err; |
2506 | 0 | cr->cram_flags = cf; |
2507 | 0 | } else { |
2508 | 0 | if (!c->comp_hdr->codecs[DS_CF]) goto block_err; |
2509 | 0 | r |= c->comp_hdr->codecs[DS_CF] |
2510 | 0 | ->decode(s, c->comp_hdr->codecs[DS_CF], blk, |
2511 | 0 | (char *)&cr->cram_flags, &out_sz); |
2512 | 0 | if (r) goto block_err; |
2513 | 0 | cf = cr->cram_flags; |
2514 | 0 | } |
2515 | 0 | } else { |
2516 | 0 | cf = cr->cram_flags = 0; |
2517 | 0 | } |
2518 | | |
2519 | 0 | if (CRAM_MAJOR_VERS(fd->version) != 1 && ref_id == -2) { |
2520 | 0 | if (ds & CRAM_RI) { |
2521 | 0 | if (!c->comp_hdr->codecs[DS_RI]) goto block_err; |
2522 | 0 | r |= c->comp_hdr->codecs[DS_RI] |
2523 | 0 | ->decode(s, c->comp_hdr->codecs[DS_RI], blk, |
2524 | 0 | (char *)&cr->ref_id, &out_sz); |
2525 | 0 | if (r) goto block_err; |
2526 | 0 | if ((fd->required_fields & (SAM_SEQ|SAM_TLEN)) |
2527 | 0 | && cr->ref_id >= 0 |
2528 | 0 | && cr->ref_id != last_ref_id) { |
2529 | 0 | if (!c->comp_hdr->no_ref) { |
2530 | | // Range(fd): seq >= 0, unmapped -1, unspecified -2 |
2531 | | // Slice(s): seq >= 0, unmapped -1, multiple refs -2 |
2532 | | // Record(cr): seq >= 0, unmapped -1 |
2533 | 0 | pthread_mutex_lock(&fd->range_lock); |
2534 | 0 | int need_ref = (fd->range.refid == -2 || cr->ref_id == fd->range.refid); |
2535 | 0 | pthread_mutex_unlock(&fd->range_lock); |
2536 | 0 | if (need_ref) { |
2537 | 0 | if (!refs[cr->ref_id]) |
2538 | 0 | refs[cr->ref_id] = cram_get_ref(fd, cr->ref_id, 1, 0); |
2539 | 0 | if (!(s->ref = refs[cr->ref_id])) |
2540 | 0 | goto block_err; |
2541 | 0 | } else { |
2542 | | // For multi-ref containers, we don't need to fetch all |
2543 | | // refs if we're only querying one. |
2544 | 0 | s->ref = NULL; |
2545 | 0 | } |
2546 | | |
2547 | 0 | pthread_mutex_lock(&fd->range_lock); |
2548 | 0 | int discard_last_ref = (last_ref_id >= 0 && |
2549 | 0 | refs[last_ref_id] && |
2550 | 0 | (fd->range.refid == -2 || |
2551 | 0 | last_ref_id == fd->range.refid)); |
2552 | 0 | pthread_mutex_unlock(&fd->range_lock); |
2553 | 0 | if (discard_last_ref) { |
2554 | 0 | pthread_mutex_lock(&fd->ref_lock); |
2555 | 0 | discard_last_ref = !fd->unsorted; |
2556 | 0 | pthread_mutex_unlock(&fd->ref_lock); |
2557 | 0 | } |
2558 | 0 | if (discard_last_ref) { |
2559 | 0 | cram_ref_decr(fd->refs, last_ref_id); |
2560 | 0 | refs[last_ref_id] = NULL; |
2561 | 0 | } |
2562 | 0 | } |
2563 | 0 | s->ref_start = 1; |
2564 | 0 | pthread_mutex_lock(&fd->ref_lock); |
2565 | 0 | pthread_mutex_lock(&fd->refs->lock); |
2566 | 0 | s->ref_end = fd->refs->ref_id[cr->ref_id]->length; |
2567 | 0 | pthread_mutex_unlock(&fd->refs->lock); |
2568 | 0 | pthread_mutex_unlock(&fd->ref_lock); |
2569 | |
|
2570 | 0 | last_ref_id = cr->ref_id; |
2571 | 0 | } |
2572 | 0 | } else { |
2573 | 0 | cr->ref_id = -1; |
2574 | 0 | } |
2575 | 0 | } else { |
2576 | 0 | cr->ref_id = ref_id; // Forced constant in CRAM 1.0 |
2577 | 0 | } |
2578 | 0 | if (cr->ref_id < -1 || cr->ref_id >= bfd->nref) { |
2579 | 0 | hts_log_error("Requested unknown reference ID %d", cr->ref_id); |
2580 | 0 | goto block_err; |
2581 | 0 | } |
2582 | | |
2583 | 0 | if (ds & CRAM_RL) { |
2584 | 0 | if (!c->comp_hdr->codecs[DS_RL]) goto block_err; |
2585 | 0 | r |= c->comp_hdr->codecs[DS_RL] |
2586 | 0 | ->decode(s, c->comp_hdr->codecs[DS_RL], blk, |
2587 | 0 | (char *)&cr->len, &out_sz); |
2588 | 0 | if (r) goto block_err; |
2589 | 0 | if (cr->len < 0) { |
2590 | 0 | hts_log_error("Read has negative length"); |
2591 | 0 | goto block_err; |
2592 | 0 | } |
2593 | 0 | } |
2594 | | |
2595 | 0 | if (ds & CRAM_AP) { |
2596 | 0 | if (!c->comp_hdr->codecs[DS_AP]) goto block_err; |
2597 | 0 | if (CRAM_MAJOR_VERS(fd->version) >= 4) { |
2598 | 0 | r |= c->comp_hdr->codecs[DS_AP] |
2599 | 0 | ->decode(s, c->comp_hdr->codecs[DS_AP], blk, |
2600 | 0 | (char *)&cr->apos, &out_sz); |
2601 | 0 | } else { |
2602 | 0 | int32_t i32; |
2603 | 0 | r |= c->comp_hdr->codecs[DS_AP] |
2604 | 0 | ->decode(s, c->comp_hdr->codecs[DS_AP], blk, |
2605 | 0 | (char *)&i32, &out_sz); |
2606 | 0 | cr->apos = i32; |
2607 | 0 | } |
2608 | 0 | if (r) goto block_err;; |
2609 | 0 | if (c->comp_hdr->AP_delta) { |
2610 | 0 | if (cr->apos < 0 && c->unsorted == 0) { |
2611 | | // cache locally in c->unsorted so we don't have an |
2612 | | // excessive number of locks |
2613 | 0 | pthread_mutex_lock(&fd->ref_lock); |
2614 | 0 | c->unsorted = fd->unsorted = 1; |
2615 | 0 | pthread_mutex_unlock(&fd->ref_lock); |
2616 | 0 | } |
2617 | 0 | cr->apos += s->last_apos; |
2618 | 0 | } |
2619 | 0 | s->last_apos= cr->apos; |
2620 | 0 | } else { |
2621 | 0 | cr->apos = c->ref_seq_start; |
2622 | 0 | } |
2623 | | |
2624 | 0 | if (ds & CRAM_RG) { |
2625 | 0 | if (!c->comp_hdr->codecs[DS_RG]) goto block_err; |
2626 | 0 | r |= c->comp_hdr->codecs[DS_RG] |
2627 | 0 | ->decode(s, c->comp_hdr->codecs[DS_RG], blk, |
2628 | 0 | (char *)&cr->rg, &out_sz); |
2629 | 0 | if (r) goto block_err; |
2630 | 0 | if (cr->rg == unknown_rg) |
2631 | 0 | cr->rg = -1; |
2632 | 0 | } else { |
2633 | 0 | cr->rg = -1; |
2634 | 0 | } |
2635 | | |
2636 | 0 | cr->name_len = 0; |
2637 | |
|
2638 | 0 | if (c->comp_hdr->read_names_included) { |
2639 | 0 | int32_t out_sz2 = 1; |
2640 | | |
2641 | | // Read directly into name cram_block |
2642 | 0 | cr->name = BLOCK_SIZE(s->name_blk); |
2643 | 0 | if (ds & CRAM_RN) { |
2644 | 0 | if (!c->comp_hdr->codecs[DS_RN]) goto block_err; |
2645 | 0 | r |= c->comp_hdr->codecs[DS_RN] |
2646 | 0 | ->decode(s, c->comp_hdr->codecs[DS_RN], blk, |
2647 | 0 | (char *)s->name_blk, &out_sz2); |
2648 | 0 | if (r) goto block_err; |
2649 | 0 | cr->name_len = out_sz2; |
2650 | 0 | } |
2651 | 0 | } |
2652 | | |
2653 | 0 | cr->mate_pos = 0; |
2654 | 0 | cr->mate_line = -1; |
2655 | 0 | cr->mate_ref_id = -1; |
2656 | 0 | cr->explicit_tlen = INT64_MIN; |
2657 | 0 | if ((ds & CRAM_CF) && (cf & CRAM_FLAG_DETACHED)) { |
2658 | 0 | if (ds & CRAM_MF) { |
2659 | 0 | if (CRAM_MAJOR_VERS(fd->version) == 1) { |
2660 | | /* MF is byte in 1.0, int32 in 2.0 */ |
2661 | 0 | unsigned char mf; |
2662 | 0 | if (!c->comp_hdr->codecs[DS_MF]) goto block_err; |
2663 | 0 | r |= c->comp_hdr->codecs[DS_MF] |
2664 | 0 | ->decode(s, c->comp_hdr->codecs[DS_MF], |
2665 | 0 | blk, (char *)&mf, &out_sz); |
2666 | 0 | if (r) goto block_err; |
2667 | 0 | cr->mate_flags = mf; |
2668 | 0 | } else { |
2669 | 0 | if (!c->comp_hdr->codecs[DS_MF]) goto block_err; |
2670 | 0 | r |= c->comp_hdr->codecs[DS_MF] |
2671 | 0 | ->decode(s, c->comp_hdr->codecs[DS_MF], |
2672 | 0 | blk, |
2673 | 0 | (char *)&cr->mate_flags, |
2674 | 0 | &out_sz); |
2675 | 0 | if (r) goto block_err; |
2676 | 0 | } |
2677 | 0 | } else { |
2678 | 0 | cr->mate_flags = 0; |
2679 | 0 | } |
2680 | | |
2681 | 0 | if (!c->comp_hdr->read_names_included) { |
2682 | 0 | int32_t out_sz2 = 1; |
2683 | | |
2684 | | // Read directly into name cram_block |
2685 | 0 | cr->name = BLOCK_SIZE(s->name_blk); |
2686 | 0 | if (ds & CRAM_RN) { |
2687 | 0 | if (!c->comp_hdr->codecs[DS_RN]) goto block_err; |
2688 | 0 | r |= c->comp_hdr->codecs[DS_RN] |
2689 | 0 | ->decode(s, c->comp_hdr->codecs[DS_RN], |
2690 | 0 | blk, (char *)s->name_blk, |
2691 | 0 | &out_sz2); |
2692 | 0 | if (r) goto block_err; |
2693 | 0 | cr->name_len = out_sz2; |
2694 | 0 | } |
2695 | 0 | } |
2696 | | |
2697 | 0 | if (ds & CRAM_NS) { |
2698 | 0 | if (!c->comp_hdr->codecs[DS_NS]) goto block_err; |
2699 | 0 | r |= c->comp_hdr->codecs[DS_NS] |
2700 | 0 | ->decode(s, c->comp_hdr->codecs[DS_NS], blk, |
2701 | 0 | (char *)&cr->mate_ref_id, &out_sz); |
2702 | 0 | if (r) goto block_err; |
2703 | 0 | } |
2704 | | |
2705 | | // Skip as mate_ref of "*" is legit. It doesn't mean unmapped, just unknown. |
2706 | | // if (cr->mate_ref_id == -1 && cr->flags & 0x01) { |
2707 | | // /* Paired, but unmapped */ |
2708 | | // cr->flags |= BAM_FMUNMAP; |
2709 | | // } |
2710 | | |
2711 | 0 | if (ds & CRAM_NP) { |
2712 | 0 | if (!c->comp_hdr->codecs[DS_NP]) goto block_err;; |
2713 | 0 | if (CRAM_MAJOR_VERS(fd->version) < 4) { |
2714 | 0 | int32_t i32; |
2715 | 0 | r |= c->comp_hdr->codecs[DS_NP] |
2716 | 0 | ->decode(s, c->comp_hdr->codecs[DS_NP], blk, |
2717 | 0 | (char *)&i32, &out_sz); |
2718 | 0 | cr->mate_pos = i32; |
2719 | 0 | } else { |
2720 | 0 | r |= c->comp_hdr->codecs[DS_NP] |
2721 | 0 | ->decode(s, c->comp_hdr->codecs[DS_NP], blk, |
2722 | 0 | (char *)&cr->mate_pos, &out_sz); |
2723 | 0 | } |
2724 | 0 | if (r) goto block_err; |
2725 | 0 | } |
2726 | | |
2727 | 0 | if (ds & CRAM_TS) { |
2728 | 0 | if (!c->comp_hdr->codecs[DS_TS]) goto block_err; |
2729 | 0 | r = cram_decode_tlen(fd, c, s, blk, &cr->tlen); |
2730 | 0 | if (r) goto block_err; |
2731 | 0 | } else { |
2732 | 0 | cr->tlen = INT64_MIN; |
2733 | 0 | } |
2734 | 0 | } else if ((ds & CRAM_CF) && (cf & CRAM_FLAG_MATE_DOWNSTREAM)) { |
2735 | | // else not detached |
2736 | 0 | if (ds & CRAM_NF) { |
2737 | 0 | if (!c->comp_hdr->codecs[DS_NF]) goto block_err; |
2738 | 0 | r |= c->comp_hdr->codecs[DS_NF] |
2739 | 0 | ->decode(s, c->comp_hdr->codecs[DS_NF], blk, |
2740 | 0 | (char *)&cr->mate_line, &out_sz); |
2741 | 0 | if (r) goto block_err; |
2742 | 0 | cr->mate_line += rec + 1; |
2743 | | |
2744 | | //cr->name_len = sprintf(name, "%d", name_id++); |
2745 | | //cr->name = DSTRING_LEN(name_ds); |
2746 | | //dstring_nappend(name_ds, name, cr->name_len); |
2747 | |
|
2748 | 0 | cr->mate_ref_id = -1; |
2749 | 0 | cr->tlen = INT64_MIN; |
2750 | 0 | cr->mate_pos = 0; |
2751 | 0 | } else { |
2752 | 0 | cr->mate_flags = 0; |
2753 | 0 | cr->tlen = INT64_MIN; |
2754 | 0 | } |
2755 | 0 | if ((ds & CRAM_CF) && (cf & CRAM_FLAG_EXPLICIT_TLEN)) { |
2756 | 0 | if (ds & CRAM_TS) { |
2757 | 0 | r = cram_decode_tlen(fd, c, s, blk, &cr->explicit_tlen); |
2758 | 0 | if (r) return r; |
2759 | 0 | } else { |
2760 | 0 | cr->mate_flags = 0; |
2761 | 0 | cr->tlen = INT64_MIN; |
2762 | 0 | } |
2763 | 0 | } |
2764 | 0 | } else if ((ds & CRAM_CF) && (cf & CRAM_FLAG_EXPLICIT_TLEN)) { |
2765 | 0 | if (ds & CRAM_TS) { |
2766 | 0 | r = cram_decode_tlen(fd, c, s, blk, &cr->explicit_tlen); |
2767 | 0 | if (r) return r; |
2768 | 0 | } else { |
2769 | 0 | cr->mate_flags = 0; |
2770 | 0 | cr->tlen = INT64_MIN; |
2771 | 0 | } |
2772 | 0 | } else { |
2773 | 0 | cr->mate_flags = 0; |
2774 | 0 | cr->tlen = INT64_MIN; |
2775 | 0 | } |
2776 | | /* |
2777 | | else if (!name[0]) { |
2778 | | //name[0] = '?'; name[1] = 0; |
2779 | | //cr->name_len = 1; |
2780 | | //cr->name= DSTRING_LEN(s->name_ds); |
2781 | | //dstring_nappend(s->name_ds, "?", 1); |
2782 | | |
2783 | | cr->mate_ref_id = -1; |
2784 | | cr->tlen = 0; |
2785 | | cr->mate_pos = 0; |
2786 | | } |
2787 | | */ |
2788 | | |
2789 | | /* Auxiliary tags */ |
2790 | 0 | has_MD = has_NM = 0; |
2791 | 0 | if (CRAM_MAJOR_VERS(fd->version) == 1) |
2792 | 0 | r |= cram_decode_aux_1_0(c, s, blk, cr); |
2793 | 0 | else |
2794 | 0 | r |= cram_decode_aux(fd, c, s, blk, cr, &has_MD, &has_NM); |
2795 | 0 | if (r) goto block_err; |
2796 | | |
2797 | | /* Fake up dynamic string growth and appending */ |
2798 | 0 | if (ds & CRAM_RL) { |
2799 | 0 | cr->seq = BLOCK_SIZE(s->seqs_blk); |
2800 | 0 | BLOCK_GROW(s->seqs_blk, cr->len); |
2801 | 0 | seq = (char *)BLOCK_END(s->seqs_blk); |
2802 | 0 | BLOCK_SIZE(s->seqs_blk) += cr->len; |
2803 | |
|
2804 | 0 | if (!seq) |
2805 | 0 | goto block_err; |
2806 | | |
2807 | 0 | cr->qual = BLOCK_SIZE(s->qual_blk); |
2808 | 0 | BLOCK_GROW(s->qual_blk, cr->len); |
2809 | 0 | qual = (char *)BLOCK_END(s->qual_blk); |
2810 | 0 | BLOCK_SIZE(s->qual_blk) += cr->len; |
2811 | |
|
2812 | 0 | if (!s->ref) |
2813 | 0 | memset(seq, '=', cr->len); |
2814 | 0 | } |
2815 | | |
2816 | 0 | if (!(bf & BAM_FUNMAP)) { |
2817 | 0 | if ((ds & CRAM_AP) && cr->apos <= 0) { |
2818 | 0 | hts_log_error("Read has alignment position %"PRId64 |
2819 | 0 | " but no unmapped flag", |
2820 | 0 | cr->apos); |
2821 | 0 | goto block_err; |
2822 | 0 | } |
2823 | | /* Decode sequence and generate CIGAR */ |
2824 | 0 | if (ds & (CRAM_SEQ | CRAM_MQ)) { |
2825 | 0 | r |= cram_decode_seq(fd, c, s, blk, cr, sh, cf, seq, qual, |
2826 | 0 | has_MD, has_NM); |
2827 | 0 | if (r) goto block_err; |
2828 | 0 | } else { |
2829 | 0 | cr->cigar = 0; |
2830 | 0 | cr->ncigar = 0; |
2831 | 0 | cr->aend = cr->apos; |
2832 | 0 | cr->mqual = 0; |
2833 | 0 | } |
2834 | 0 | } else { |
2835 | 0 | int out_sz2 = cr->len; |
2836 | | |
2837 | | //puts("Unmapped"); |
2838 | 0 | cr->cigar = 0; |
2839 | 0 | cr->ncigar = 0; |
2840 | 0 | cr->aend = cr->apos; |
2841 | 0 | cr->mqual = 0; |
2842 | |
|
2843 | 0 | if (ds & CRAM_BA && cr->len) { |
2844 | 0 | if (!c->comp_hdr->codecs[DS_BA]) goto block_err; |
2845 | 0 | r |= c->comp_hdr->codecs[DS_BA] |
2846 | 0 | ->decode(s, c->comp_hdr->codecs[DS_BA], blk, |
2847 | 0 | (char *)seq, &out_sz2); |
2848 | 0 | if (r) goto block_err; |
2849 | 0 | } |
2850 | | |
2851 | 0 | if ((ds & CRAM_CF) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) { |
2852 | 0 | out_sz2 = cr->len; |
2853 | 0 | if (ds & CRAM_QS && cr->len >= 0) { |
2854 | 0 | if (!c->comp_hdr->codecs[DS_QS]) goto block_err; |
2855 | 0 | r |= c->comp_hdr->codecs[DS_QS] |
2856 | 0 | ->decode(s, c->comp_hdr->codecs[DS_QS], |
2857 | 0 | blk, qual, &out_sz2); |
2858 | 0 | if (r) goto block_err; |
2859 | 0 | } |
2860 | 0 | } else { |
2861 | 0 | if (ds & CRAM_RL) |
2862 | 0 | memset(qual, 255, cr->len); |
2863 | 0 | } |
2864 | 0 | } |
2865 | | |
2866 | 0 | if (!c->comp_hdr->qs_seq_orient && (ds & CRAM_QS) && (cr->flags & BAM_FREVERSE)) { |
2867 | 0 | int i, j; |
2868 | 0 | for (i = 0, j = cr->len-1; i < j; i++, j--) { |
2869 | 0 | unsigned char c; |
2870 | 0 | c = qual[i]; |
2871 | 0 | qual[i] = qual[j]; |
2872 | 0 | qual[j] = c; |
2873 | 0 | } |
2874 | 0 | } |
2875 | 0 | } |
2876 | | |
2877 | 408 | pthread_mutex_lock(&fd->ref_lock); |
2878 | 408 | if (refs) { |
2879 | 18 | int i; |
2880 | 27 | for (i = 0; i < fd->refs->nref; i++) { |
2881 | 9 | if (refs[i]) |
2882 | 0 | cram_ref_decr(fd->refs, i); |
2883 | 9 | } |
2884 | 18 | free(refs); |
2885 | 18 | refs = NULL; |
2886 | 390 | } else if (ref_id >= 0 && s->ref != fd->ref_free && !embed_ref) { |
2887 | 0 | cram_ref_decr(fd->refs, ref_id); |
2888 | 0 | } |
2889 | 408 | pthread_mutex_unlock(&fd->ref_lock); |
2890 | | |
2891 | | /* Resolve mate pair cross-references between recs within this slice */ |
2892 | 408 | r |= cram_decode_slice_xref(s, fd->required_fields); |
2893 | | |
2894 | | // Free the original blocks as we no longer need these. |
2895 | 408 | { |
2896 | 408 | int i; |
2897 | 2.05k | for (i = 0; i < s->hdr->num_blocks; i++) { |
2898 | 1.64k | cram_block *b = s->block[i]; |
2899 | 1.64k | cram_free_block(b); |
2900 | 1.64k | s->block[i] = NULL; |
2901 | 1.64k | } |
2902 | 408 | } |
2903 | | |
2904 | | // Also see initial BLOCK_RESIZE_EXACT at top of function. |
2905 | | // As we grow blocks we overallocate by up to 50%. So shrink |
2906 | | // back to their final sizes here. |
2907 | | // |
2908 | | //fprintf(stderr, "%d %d // %d %d // %d %d // %d %d\n", |
2909 | | // (int)s->seqs_blk->byte, (int)s->seqs_blk->alloc, |
2910 | | // (int)s->qual_blk->byte, (int)s->qual_blk->alloc, |
2911 | | // (int)s->name_blk->byte, (int)s->name_blk->alloc, |
2912 | | // (int)s->aux_blk->byte, (int)s->aux_blk->alloc); |
2913 | 408 | BLOCK_RESIZE_EXACT(s->seqs_blk, BLOCK_SIZE(s->seqs_blk)+1); |
2914 | 408 | BLOCK_RESIZE_EXACT(s->qual_blk, BLOCK_SIZE(s->qual_blk)+1); |
2915 | 408 | BLOCK_RESIZE_EXACT(s->name_blk, BLOCK_SIZE(s->name_blk)+1); |
2916 | 408 | BLOCK_RESIZE_EXACT(s->aux_blk, BLOCK_SIZE(s->aux_blk)+1); |
2917 | | |
2918 | 408 | return r; |
2919 | | |
2920 | 21 | block_err: |
2921 | 21 | if (refs) { |
2922 | 15 | int i; |
2923 | 15 | pthread_mutex_lock(&fd->ref_lock); |
2924 | 45 | for (i = 0; i < fd->refs->nref; i++) { |
2925 | 30 | if (refs[i]) |
2926 | 0 | cram_ref_decr(fd->refs, i); |
2927 | 30 | } |
2928 | 15 | free(refs); |
2929 | 15 | pthread_mutex_unlock(&fd->ref_lock); |
2930 | 15 | } |
2931 | | |
2932 | 21 | return -1; |
2933 | 408 | } |
2934 | | |
2935 | | typedef struct { |
2936 | | cram_fd *fd; |
2937 | | cram_container *c; |
2938 | | cram_slice *s; |
2939 | | sam_hdr_t *h; |
2940 | | int exit_code; |
2941 | | } cram_decode_job; |
2942 | | |
2943 | 0 | void *cram_decode_slice_thread(void *arg) { |
2944 | 0 | cram_decode_job *j = (cram_decode_job *)arg; |
2945 | |
|
2946 | 0 | j->exit_code = cram_decode_slice(j->fd, j->c, j->s, j->h); |
2947 | |
|
2948 | 0 | return j; |
2949 | 0 | } |
2950 | | |
2951 | | /* |
2952 | | * Spawn a multi-threaded version of cram_decode_slice(). |
2953 | | */ |
2954 | | int cram_decode_slice_mt(cram_fd *fd, cram_container *c, cram_slice *s, |
2955 | 480 | sam_hdr_t *bfd) { |
2956 | 480 | cram_decode_job *j; |
2957 | 480 | int nonblock; |
2958 | | |
2959 | 480 | if (!fd->pool) |
2960 | 480 | return cram_decode_slice(fd, c, s, bfd); |
2961 | | |
2962 | 0 | if (!(j = malloc(sizeof(*j)))) |
2963 | 0 | return -1; |
2964 | | |
2965 | 0 | j->fd = fd; |
2966 | 0 | j->c = c; |
2967 | 0 | j->s = s; |
2968 | 0 | j->h = bfd; |
2969 | |
|
2970 | 0 | nonblock = hts_tpool_process_sz(fd->rqueue) ? 1 : 0; |
2971 | |
|
2972 | 0 | int saved_errno = errno; |
2973 | 0 | errno = 0; |
2974 | 0 | if (-1 == hts_tpool_dispatch2(fd->pool, fd->rqueue, cram_decode_slice_thread, |
2975 | 0 | j, nonblock)) { |
2976 | | /* Would block */ |
2977 | 0 | if (errno != EAGAIN) |
2978 | 0 | return -1; |
2979 | 0 | fd->job_pending = j; |
2980 | 0 | } else { |
2981 | 0 | fd->job_pending = NULL; |
2982 | 0 | } |
2983 | 0 | errno = saved_errno; |
2984 | | |
2985 | | // flush too |
2986 | 0 | return 0; |
2987 | 0 | } |
2988 | | |
2989 | | |
2990 | | /* ---------------------------------------------------------------------- |
2991 | | * CRAM sequence iterators. |
2992 | | */ |
2993 | | |
2994 | | /* |
2995 | | * Converts a cram in-memory record into a bam in-memory record. We |
2996 | | * pass a pointer to a bam_seq_t pointer along with the a pointer to |
2997 | | * the allocated size. These can initially be pointers to NULL and zero. |
2998 | | * |
2999 | | * This function will reallocate the bam buffer as required and update |
3000 | | * (*bam)->alloc accordingly, allowing it to be used within a loop |
3001 | | * efficiently without needing to allocate new bam objects over and |
3002 | | * over again. |
3003 | | * |
3004 | | * Returns the used size of the bam record on success |
3005 | | * -1 on failure. |
3006 | | */ |
3007 | | int cram_to_bam(sam_hdr_t *sh, cram_fd *fd, cram_slice *s, |
3008 | 0 | cram_record *cr, int rec, bam_seq_t **bam) { |
3009 | 0 | int ret, rg_len; |
3010 | 0 | char name_a[1024], *name; |
3011 | 0 | int name_len; |
3012 | 0 | char *aux; |
3013 | 0 | char *seq, *qual; |
3014 | 0 | sam_hrecs_t *bfd = sh->hrecs; |
3015 | | |
3016 | | /* Assign names if not explicitly set */ |
3017 | 0 | if (fd->required_fields & SAM_QNAME) { |
3018 | 0 | if (cr->name_len) { |
3019 | 0 | name = (char *)BLOCK_DATA(s->name_blk) + cr->name; |
3020 | 0 | name_len = cr->name_len; |
3021 | 0 | } else { |
3022 | 0 | name = name_a; |
3023 | 0 | if (cr->mate_line >= 0 && cr->mate_line < s->max_rec && |
3024 | 0 | s->crecs[cr->mate_line].name_len > 0) { |
3025 | | // Copy our mate if non-zero. |
3026 | 0 | memcpy(name_a, BLOCK_DATA(s->name_blk)+s->crecs[cr->mate_line].name, |
3027 | 0 | s->crecs[cr->mate_line].name_len); |
3028 | 0 | name = name_a + s->crecs[cr->mate_line].name_len; |
3029 | 0 | } else { |
3030 | | // Otherwise generate a name based on prefix |
3031 | 0 | name_len = strlen(fd->prefix); |
3032 | 0 | memcpy(name, fd->prefix, name_len); |
3033 | 0 | name += name_len; |
3034 | 0 | *name++ = ':'; |
3035 | 0 | if (cr->mate_line >= 0 && cr->mate_line < rec) { |
3036 | 0 | name = (char *)append_uint64((unsigned char *)name, |
3037 | 0 | s->hdr->record_counter + |
3038 | 0 | cr->mate_line + 1); |
3039 | 0 | } else { |
3040 | 0 | name = (char *)append_uint64((unsigned char *)name, |
3041 | 0 | s->hdr->record_counter + |
3042 | 0 | rec + 1); |
3043 | 0 | } |
3044 | 0 | } |
3045 | 0 | name_len = name - name_a; |
3046 | 0 | name = name_a; |
3047 | 0 | } |
3048 | 0 | } else { |
3049 | 0 | name = "?"; |
3050 | 0 | name_len = 1; |
3051 | 0 | } |
3052 | | |
3053 | | /* Generate BAM record */ |
3054 | 0 | if (cr->rg < -1 || cr->rg >= bfd->nrg) |
3055 | 0 | return -1; |
3056 | 0 | rg_len = (cr->rg != -1) ? bfd->rg[cr->rg].name_len + 4 : 0; |
3057 | |
|
3058 | 0 | if (fd->required_fields & (SAM_SEQ | SAM_QUAL)) { |
3059 | 0 | if (!BLOCK_DATA(s->seqs_blk)) |
3060 | 0 | return -1; |
3061 | 0 | seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq; |
3062 | 0 | } else { |
3063 | 0 | seq = "*"; |
3064 | 0 | cr->len = 0; |
3065 | 0 | } |
3066 | | |
3067 | 0 | if (fd->required_fields & SAM_QUAL) { |
3068 | 0 | if (!BLOCK_DATA(s->qual_blk)) |
3069 | 0 | return -1; |
3070 | 0 | qual = (char *)BLOCK_DATA(s->qual_blk) + cr->qual; |
3071 | 0 | } else { |
3072 | 0 | qual = NULL; |
3073 | 0 | } |
3074 | | |
3075 | 0 | ret = bam_set1(*bam, |
3076 | 0 | name_len, name, |
3077 | 0 | cr->flags, cr->ref_id, cr->apos - 1, cr->mqual, |
3078 | 0 | cr->ncigar, &s->cigar[cr->cigar], |
3079 | 0 | cr->mate_ref_id, cr->mate_pos - 1, cr->tlen, |
3080 | 0 | cr->len, seq, qual, |
3081 | 0 | cr->aux_size + rg_len); |
3082 | 0 | if (ret < 0) { |
3083 | 0 | return ret; |
3084 | 0 | } |
3085 | | |
3086 | 0 | aux = (char *)bam_aux(*bam); |
3087 | | |
3088 | | /* Auxiliary strings */ |
3089 | 0 | if (cr->aux_size != 0) { |
3090 | 0 | memcpy(aux, BLOCK_DATA(s->aux_blk) + cr->aux, cr->aux_size); |
3091 | 0 | aux += cr->aux_size; |
3092 | 0 | (*bam)->l_data += cr->aux_size; |
3093 | 0 | } |
3094 | | |
3095 | | /* RG:Z: */ |
3096 | 0 | if (rg_len > 0) { |
3097 | 0 | *aux++ = 'R'; *aux++ = 'G'; *aux++ = 'Z'; |
3098 | 0 | int len = bfd->rg[cr->rg].name_len; |
3099 | 0 | memcpy(aux, bfd->rg[cr->rg].name, len); |
3100 | 0 | aux += len; |
3101 | 0 | *aux++ = 0; |
3102 | 0 | (*bam)->l_data += rg_len; |
3103 | 0 | } |
3104 | |
|
3105 | 0 | return (*bam)->l_data; |
3106 | 0 | } |
3107 | | |
3108 | | /* |
3109 | | * Here be dragons! The multi-threading code in this is crufty beyond belief. |
3110 | | */ |
3111 | | |
3112 | | /* |
3113 | | * Load first container. |
3114 | | * Called when fd->ctr is NULL> |
3115 | | * |
3116 | | * Returns container on success |
3117 | | * NULL on failure. |
3118 | | */ |
3119 | 8.48k | static cram_container *cram_first_slice(cram_fd *fd) { |
3120 | 8.48k | cram_container *c; |
3121 | | |
3122 | 17.2k | do { |
3123 | 17.2k | if (fd->ctr) |
3124 | 8.76k | cram_free_container(fd->ctr); |
3125 | | |
3126 | 17.2k | if (!(c = fd->ctr = cram_read_container(fd))) |
3127 | 647 | return NULL; |
3128 | 16.6k | c->curr_slice_mt = c->curr_slice; |
3129 | 16.6k | } while (c->length == 0); |
3130 | | |
3131 | | /* |
3132 | | * The first container may be a result of a sub-range query. |
3133 | | * In which case it may still not be the optimal starting point |
3134 | | * due to skipped containers/slices in the index. |
3135 | | */ |
3136 | | // No need for locks here as we're in the main thread. |
3137 | 7.84k | if (fd->range.refid != -2) { |
3138 | 0 | while (c->ref_seq_id != -2 && |
3139 | 0 | (c->ref_seq_id < fd->range.refid || |
3140 | 0 | (fd->range.refid >= 0 && c->ref_seq_id == fd->range.refid |
3141 | 0 | && c->ref_seq_start + c->ref_seq_span-1 < fd->range.start))) { |
3142 | 0 | if (0 != cram_seek(fd, c->length, SEEK_CUR)) |
3143 | 0 | return NULL; |
3144 | 0 | cram_free_container(fd->ctr); |
3145 | 0 | do { |
3146 | 0 | if (!(c = fd->ctr = cram_read_container(fd))) |
3147 | 0 | return NULL; |
3148 | 0 | } while (c->length == 0); |
3149 | 0 | } |
3150 | | |
3151 | 0 | if (c->ref_seq_id != -2 && c->ref_seq_id != fd->range.refid) { |
3152 | 0 | fd->eof = 1; |
3153 | 0 | return NULL; |
3154 | 0 | } |
3155 | 0 | } |
3156 | | |
3157 | 7.84k | if (!(c->comp_hdr_block = cram_read_block(fd))) |
3158 | 255 | return NULL; |
3159 | 7.58k | if (c->comp_hdr_block->content_type != COMPRESSION_HEADER) |
3160 | 56 | return NULL; |
3161 | | |
3162 | 7.52k | c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block); |
3163 | 7.52k | if (!c->comp_hdr) |
3164 | 7.13k | return NULL; |
3165 | 393 | if (!c->comp_hdr->AP_delta && |
3166 | 393 | sam_hrecs_sort_order(fd->header->hrecs) != ORDER_COORD) { |
3167 | 0 | pthread_mutex_lock(&fd->ref_lock); |
3168 | 0 | fd->unsorted = 1; |
3169 | 0 | pthread_mutex_unlock(&fd->ref_lock); |
3170 | 0 | } |
3171 | | |
3172 | 393 | return c; |
3173 | 7.52k | } |
3174 | | |
3175 | 8.89k | cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) { |
3176 | 8.89k | cram_container *c_curr; // container being consumed via cram_get_seq() |
3177 | 8.89k | cram_slice *s_curr = NULL; |
3178 | | |
3179 | | // Populate the first container if unknown. |
3180 | 8.89k | if (!(c_curr = fd->ctr)) { |
3181 | 8.48k | if (!(c_curr = cram_first_slice(fd))) |
3182 | 8.09k | return NULL; |
3183 | 8.48k | } |
3184 | | |
3185 | | // Discard previous slice |
3186 | 801 | if ((s_curr = c_curr->slice)) { |
3187 | 408 | c_curr->slice = NULL; |
3188 | 408 | cram_free_slice(s_curr); |
3189 | 408 | s_curr = NULL; |
3190 | 408 | } |
3191 | | |
3192 | | // If we've consumed all slices in this container, also discard |
3193 | | // the container too. |
3194 | 801 | if (c_curr->curr_slice == c_curr->max_slice) { |
3195 | 99 | if (fd->ctr == c_curr) |
3196 | 99 | fd->ctr = NULL; |
3197 | 99 | if (fd->ctr_mt == c_curr) |
3198 | 18 | fd->ctr_mt = NULL; |
3199 | 99 | cram_free_container(c_curr); |
3200 | 99 | c_curr = NULL; |
3201 | 99 | } |
3202 | | |
3203 | 801 | if (!fd->ctr_mt) |
3204 | 411 | fd->ctr_mt = c_curr; |
3205 | | |
3206 | | // Fetch the next slice (and the container if necessary). |
3207 | | // |
3208 | | // If single threaded this loop bails out as soon as it finds |
3209 | | // a slice in range. In this case c_next and c_curr end up being |
3210 | | // the same thing. |
3211 | | // |
3212 | | // If multi-threaded, we loop until we have filled out |
3213 | | // thread pool input queue. Here c_next and c_curr *may* differ, as |
3214 | | // can fd->ctr and fd->ctr_mt. |
3215 | 801 | for (;;) { |
3216 | 801 | cram_container *c_next = fd->ctr_mt; |
3217 | 801 | cram_slice *s_next = NULL; |
3218 | | |
3219 | | // Next slice; either from the last job we failed to push |
3220 | | // to the input queue or via more I/O. |
3221 | 801 | if (fd->job_pending) { |
3222 | 0 | cram_decode_job *j = (cram_decode_job *)fd->job_pending; |
3223 | 0 | c_next = j->c; |
3224 | 0 | s_next = j->s; |
3225 | 0 | free(fd->job_pending); |
3226 | 0 | fd->job_pending = NULL; |
3227 | 801 | } else if (!fd->ooc) { |
3228 | 1.05k | empty_container: |
3229 | 1.05k | if (!c_next || c_next->curr_slice_mt == c_next->max_slice) { |
3230 | | // new container |
3231 | 1.42k | for(;;) { |
3232 | 1.42k | if (!(c_next = cram_read_container(fd))) { |
3233 | 30 | if (fd->pool) { |
3234 | 0 | fd->ooc = 1; |
3235 | 0 | break; |
3236 | 0 | } |
3237 | | |
3238 | 30 | return NULL; |
3239 | 30 | } |
3240 | 1.39k | c_next->curr_slice_mt = c_next->curr_slice; |
3241 | | |
3242 | 1.39k | if (c_next->length != 0) |
3243 | 327 | break; |
3244 | | |
3245 | 1.07k | cram_free_container(c_next); |
3246 | 1.07k | } |
3247 | 327 | if (fd->ooc) |
3248 | 0 | break; |
3249 | | |
3250 | | // printf("%p %d:%ld-%ld vs %d:%ld-%ld\n", fd, |
3251 | | // c_next->ref_seq_id, c_next->ref_seq_start, c_next->ref_seq_start+c_next->ref_seq_span-1, |
3252 | | // fd->range.refid, fd->range.start, fd->range.end); |
3253 | | |
3254 | | /* Skip containers not yet spanning our range */ |
3255 | 327 | if (fd->range.refid != -2 && c_next->ref_seq_id != -2) { |
3256 | | // ref_id beyond end of range; bail out |
3257 | 0 | if (c_next->ref_seq_id != fd->range.refid) { |
3258 | 0 | cram_free_container(c_next); |
3259 | 0 | fd->ctr_mt = NULL; |
3260 | 0 | fd->ooc = 1; |
3261 | 0 | break; |
3262 | 0 | } |
3263 | | |
3264 | | // position beyond end of range; bail out |
3265 | 0 | if (fd->range.refid != -1 && |
3266 | 0 | c_next->ref_seq_start > fd->range.end) { |
3267 | 0 | cram_free_container(c_next); |
3268 | 0 | fd->ctr_mt = NULL; |
3269 | 0 | fd->ooc = 1; |
3270 | 0 | break; |
3271 | 0 | } |
3272 | | |
3273 | | // before start of range; skip to next container |
3274 | 0 | if (fd->range.refid != -1 && |
3275 | 0 | c_next->ref_seq_start + c_next->ref_seq_span-1 < |
3276 | 0 | fd->range.start) { |
3277 | 0 | c_next->curr_slice_mt = c_next->max_slice; |
3278 | 0 | cram_seek(fd, c_next->length, SEEK_CUR); |
3279 | 0 | cram_free_container(c_next); |
3280 | 0 | c_next = NULL; |
3281 | 0 | continue; |
3282 | 0 | } |
3283 | 0 | } |
3284 | | |
3285 | | // Container is valid range, so remember it for restarting |
3286 | | // this function. |
3287 | 327 | fd->ctr_mt = c_next; |
3288 | | |
3289 | 327 | if (!(c_next->comp_hdr_block = cram_read_block(fd))) |
3290 | 36 | return NULL; |
3291 | 291 | if (c_next->comp_hdr_block->content_type != COMPRESSION_HEADER) |
3292 | 18 | return NULL; |
3293 | | |
3294 | 273 | c_next->comp_hdr = |
3295 | 273 | cram_decode_compression_header(fd, c_next->comp_hdr_block); |
3296 | 273 | if (!c_next->comp_hdr) |
3297 | 39 | return NULL; |
3298 | | |
3299 | 234 | if (!c_next->comp_hdr->AP_delta && |
3300 | 234 | sam_hrecs_sort_order(fd->header->hrecs) != ORDER_COORD) { |
3301 | 0 | pthread_mutex_lock(&fd->ref_lock); |
3302 | 0 | fd->unsorted = 1; |
3303 | 0 | pthread_mutex_unlock(&fd->ref_lock); |
3304 | 0 | } |
3305 | 234 | } |
3306 | | |
3307 | 936 | if (c_next->num_records == 0) { |
3308 | 258 | if (fd->ctr == c_next) |
3309 | 36 | fd->ctr = NULL; |
3310 | 258 | if (c_curr == c_next) |
3311 | 36 | c_curr = NULL; |
3312 | 258 | if (fd->ctr_mt == c_next) |
3313 | 258 | fd->ctr_mt = NULL; |
3314 | 258 | cram_free_container(c_next); |
3315 | 258 | c_next = NULL; |
3316 | 258 | goto empty_container; |
3317 | 258 | } |
3318 | | |
3319 | 678 | if (!(s_next = c_next->slice = cram_read_slice(fd))) |
3320 | 198 | return NULL; |
3321 | | |
3322 | 480 | s_next->slice_num = ++c_next->curr_slice_mt; |
3323 | 480 | s_next->curr_rec = 0; |
3324 | 480 | s_next->max_rec = s_next->hdr->num_records; |
3325 | | |
3326 | 480 | s_next->last_apos = s_next->hdr->ref_seq_start; |
3327 | | |
3328 | | // We know the container overlaps our range, but with multi-slice |
3329 | | // containers we may have slices that do not. Skip these also. |
3330 | 480 | if (fd->range.refid != -2 && s_next->hdr->ref_seq_id != -2) { |
3331 | | // ref_id beyond end of range; bail out |
3332 | 0 | if (s_next->hdr->ref_seq_id != fd->range.refid) { |
3333 | 0 | fd->ooc = 1; |
3334 | 0 | cram_free_slice(s_next); |
3335 | 0 | c_next->slice = s_next = NULL; |
3336 | 0 | break; |
3337 | 0 | } |
3338 | | |
3339 | | // position beyond end of range; bail out |
3340 | 0 | if (fd->range.refid != -1 && |
3341 | 0 | s_next->hdr->ref_seq_start > fd->range.end) { |
3342 | 0 | fd->ooc = 1; |
3343 | 0 | cram_free_slice(s_next); |
3344 | 0 | c_next->slice = s_next = NULL; |
3345 | 0 | break; |
3346 | 0 | } |
3347 | | |
3348 | | // before start of range; skip to next slice |
3349 | 0 | if (fd->range.refid != -1 && |
3350 | 0 | s_next->hdr->ref_seq_start + s_next->hdr->ref_seq_span-1 < |
3351 | 0 | fd->range.start) { |
3352 | 0 | cram_free_slice(s_next); |
3353 | 0 | c_next->slice = s_next = NULL; |
3354 | 0 | continue; |
3355 | 0 | } |
3356 | 0 | } |
3357 | 480 | } // end: if (!fd->ooc) |
3358 | | |
3359 | 480 | if (!c_next || !s_next) |
3360 | 0 | break; |
3361 | | |
3362 | | // Decode the slice, either right now (non-threaded) or by pushing |
3363 | | // it to the a decode queue (threaded). |
3364 | 480 | if (cram_decode_slice_mt(fd, c_next, s_next, fd->header) != 0) { |
3365 | 72 | hts_log_error("Failure to decode slice"); |
3366 | 72 | cram_free_slice(s_next); |
3367 | 72 | c_next->slice = NULL; |
3368 | 72 | return NULL; |
3369 | 72 | } |
3370 | | |
3371 | | // No thread pool, so don't loop again |
3372 | 408 | if (!fd->pool) { |
3373 | 408 | c_curr = c_next; |
3374 | 408 | s_curr = s_next; |
3375 | 408 | break; |
3376 | 408 | } |
3377 | | |
3378 | | // With thread pool, but we have a job pending so our decode queue |
3379 | | // is full. |
3380 | 0 | if (fd->job_pending) |
3381 | 0 | break; |
3382 | | |
3383 | | // Otherwise we're threaded with room in the decode input queue, so |
3384 | | // keep reading slices for decode. |
3385 | | // Push it a bit far, to qsize in queue rather than pending arrival, |
3386 | | // as cram tends to be a bit bursty in decode timings. |
3387 | 0 | if (hts_tpool_process_len(fd->rqueue) > |
3388 | 0 | hts_tpool_process_qsize(fd->rqueue)) |
3389 | 0 | break; |
3390 | 0 | } // end of for(;;) |
3391 | | |
3392 | | |
3393 | | // When not threaded we've already have c_curr and s_curr. |
3394 | | // Otherwise we need get them by pulling off the decode output queue. |
3395 | 408 | if (fd->pool) { |
3396 | 0 | hts_tpool_result *res; |
3397 | 0 | cram_decode_job *j; |
3398 | |
|
3399 | 0 | if (fd->ooc && hts_tpool_process_empty(fd->rqueue)) { |
3400 | 0 | fd->eof = 1; |
3401 | 0 | return NULL; |
3402 | 0 | } |
3403 | | |
3404 | 0 | res = hts_tpool_next_result_wait(fd->rqueue); |
3405 | |
|
3406 | 0 | if (!res || !hts_tpool_result_data(res)) { |
3407 | 0 | hts_log_error("Call to hts_tpool_next_result failed"); |
3408 | 0 | return NULL; |
3409 | 0 | } |
3410 | | |
3411 | 0 | j = (cram_decode_job *)hts_tpool_result_data(res); |
3412 | 0 | c_curr = j->c; |
3413 | 0 | s_curr = j->s; |
3414 | |
|
3415 | 0 | if (j->exit_code != 0) { |
3416 | 0 | hts_log_error("Slice decode failure"); |
3417 | 0 | fd->eof = 0; |
3418 | 0 | hts_tpool_delete_result(res, 1); |
3419 | 0 | return NULL; |
3420 | 0 | } |
3421 | | |
3422 | 0 | hts_tpool_delete_result(res, 1); |
3423 | 0 | } |
3424 | | |
3425 | 408 | *cp = c_curr; |
3426 | | |
3427 | | // Update current slice being processed (as opposed to current |
3428 | | // slice in the multi-threaded reahead. |
3429 | 408 | fd->ctr = c_curr; |
3430 | 408 | if (c_curr) { |
3431 | 408 | c_curr->slice = s_curr; |
3432 | 408 | if (s_curr) |
3433 | 408 | c_curr->curr_slice = s_curr->slice_num; |
3434 | 408 | } |
3435 | 408 | if (s_curr) |
3436 | 408 | s_curr->curr_rec = 0; |
3437 | 0 | else |
3438 | 0 | fd->eof = 1; |
3439 | | |
3440 | 408 | return s_curr; |
3441 | 408 | } |
3442 | | |
3443 | | /* |
3444 | | * Read the next cram record and return it. |
3445 | | * Note that to decode cram_record the caller will need to look up some data |
3446 | | * in the current slice, pointed to by fd->ctr->slice. This is valid until |
3447 | | * the next call to cram_get_seq (which may invalidate it). |
3448 | | * |
3449 | | * Returns record pointer on success (do not free) |
3450 | | * NULL on failure |
3451 | | */ |
3452 | 8.48k | cram_record *cram_get_seq(cram_fd *fd) { |
3453 | 8.48k | cram_container *c; |
3454 | 8.48k | cram_slice *s; |
3455 | | |
3456 | 8.89k | for (;;) { |
3457 | 8.89k | c = fd->ctr; |
3458 | 8.89k | if (c && c->slice && c->slice->curr_rec < c->slice->max_rec) { |
3459 | 0 | s = c->slice; |
3460 | 8.89k | } else { |
3461 | 8.89k | if (!(s = cram_next_slice(fd, &c))) |
3462 | 8.48k | return NULL; |
3463 | 408 | continue; /* In case slice contains no records */ |
3464 | 8.89k | } |
3465 | | |
3466 | | // No need to lock here as get_seq is running in the main thread, |
3467 | | // which is also the same one that does the range modifications. |
3468 | 0 | if (fd->range.refid != -2) { |
3469 | 0 | if (fd->range.refid == -1 && s->crecs[s->curr_rec].ref_id != -1) { |
3470 | | // Special case when looking for unmapped blocks at end. |
3471 | | // If these are mixed in with mapped data (c->ref_id == -2) |
3472 | | // then we need skip until we find the unmapped data, if at all |
3473 | 0 | s->curr_rec++; |
3474 | 0 | continue; |
3475 | 0 | } |
3476 | 0 | if (s->crecs[s->curr_rec].ref_id < fd->range.refid && |
3477 | 0 | s->crecs[s->curr_rec].ref_id != -1) { |
3478 | | // Looking for a mapped read, but not there yet. Special case |
3479 | | // as -1 (unmapped) shouldn't be considered < refid. |
3480 | 0 | s->curr_rec++; |
3481 | 0 | continue; |
3482 | 0 | } |
3483 | | |
3484 | 0 | if (s->crecs[s->curr_rec].ref_id != fd->range.refid) { |
3485 | 0 | fd->eof = 1; |
3486 | 0 | cram_free_slice(s); |
3487 | 0 | c->slice = NULL; |
3488 | 0 | return NULL; |
3489 | 0 | } |
3490 | | |
3491 | 0 | if (fd->range.refid != -1 && s->crecs[s->curr_rec].apos > fd->range.end) { |
3492 | 0 | fd->eof = 1; |
3493 | 0 | cram_free_slice(s); |
3494 | 0 | c->slice = NULL; |
3495 | 0 | return NULL; |
3496 | 0 | } |
3497 | | |
3498 | 0 | if (fd->range.refid != -1 && s->crecs[s->curr_rec].aend < fd->range.start) { |
3499 | 0 | s->curr_rec++; |
3500 | 0 | continue; |
3501 | 0 | } |
3502 | 0 | } |
3503 | | |
3504 | 0 | break; |
3505 | 0 | } |
3506 | | |
3507 | 0 | fd->ctr = c; |
3508 | 0 | c->slice = s; |
3509 | 0 | return &s->crecs[s->curr_rec++]; |
3510 | 8.48k | } |
3511 | | |
3512 | | /* |
3513 | | * Read the next cram record and convert it to a bam_seq_t struct. |
3514 | | * |
3515 | | * Returns >= 0 success (number of bytes written to *bam) |
3516 | | * -1 on EOF or failure (check fd->err) |
3517 | | */ |
3518 | 8.48k | int cram_get_bam_seq(cram_fd *fd, bam_seq_t **bam) { |
3519 | 8.48k | cram_record *cr; |
3520 | 8.48k | cram_container *c; |
3521 | 8.48k | cram_slice *s; |
3522 | | |
3523 | 8.48k | if (!(cr = cram_get_seq(fd))) |
3524 | 8.48k | return -1; |
3525 | | |
3526 | 0 | c = fd->ctr; |
3527 | 0 | s = c->slice; |
3528 | |
|
3529 | 0 | return cram_to_bam(fd->header, fd, s, cr, s->curr_rec-1, bam); |
3530 | 8.48k | } |
3531 | | |
3532 | | /* |
3533 | | * Drains and frees the decode read-queue for a multi-threaded reader. |
3534 | | */ |
3535 | 11.3k | void cram_drain_rqueue(cram_fd *fd) { |
3536 | 11.3k | cram_container *lc = NULL; |
3537 | | |
3538 | 11.3k | if (!fd->pool || !fd->rqueue) |
3539 | 11.3k | return; |
3540 | | |
3541 | | // drain queue of any in-flight decode jobs |
3542 | 0 | while (!hts_tpool_process_empty(fd->rqueue)) { |
3543 | 0 | hts_tpool_result *r = hts_tpool_next_result_wait(fd->rqueue); |
3544 | 0 | if (!r) |
3545 | 0 | break; |
3546 | 0 | cram_decode_job *j = (cram_decode_job *)hts_tpool_result_data(r); |
3547 | 0 | if (j->c->slice == j->s) |
3548 | 0 | j->c->slice = NULL; |
3549 | 0 | if (j->c != lc) { |
3550 | 0 | if (lc) { |
3551 | 0 | if (fd->ctr == lc) |
3552 | 0 | fd->ctr = NULL; |
3553 | 0 | if (fd->ctr_mt == lc) |
3554 | 0 | fd->ctr_mt = NULL; |
3555 | 0 | cram_free_container(lc); |
3556 | 0 | } |
3557 | 0 | lc = j->c; |
3558 | 0 | } |
3559 | 0 | cram_free_slice(j->s); |
3560 | 0 | hts_tpool_delete_result(r, 1); |
3561 | 0 | } |
3562 | | |
3563 | | // Also tidy up any pending decode job that we didn't submit to the workers |
3564 | | // due to the input queue being full. |
3565 | 0 | if (fd->job_pending) { |
3566 | 0 | cram_decode_job *j = (cram_decode_job *)fd->job_pending; |
3567 | 0 | if (j->c->slice == j->s) |
3568 | 0 | j->c->slice = NULL; |
3569 | 0 | if (j->c != lc) { |
3570 | 0 | if (lc) { |
3571 | 0 | if (fd->ctr == lc) |
3572 | 0 | fd->ctr = NULL; |
3573 | 0 | if (fd->ctr_mt == lc) |
3574 | 0 | fd->ctr_mt = NULL; |
3575 | 0 | cram_free_container(lc); |
3576 | 0 | } |
3577 | 0 | lc = j->c; |
3578 | 0 | } |
3579 | 0 | cram_free_slice(j->s); |
3580 | 0 | free(j); |
3581 | 0 | fd->job_pending = NULL; |
3582 | 0 | } |
3583 | |
|
3584 | 0 | if (lc) { |
3585 | 0 | if (fd->ctr == lc) |
3586 | 0 | fd->ctr = NULL; |
3587 | 0 | if (fd->ctr_mt == lc) |
3588 | 0 | fd->ctr_mt = NULL; |
3589 | 0 | cram_free_container(lc); |
3590 | 0 | } |
3591 | 0 | } |