/src/htslib/cram/cram_io.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | Copyright (c) 2012-2022 Genome Research Ltd. |
3 | | Author: James Bonfield <jkb@sanger.ac.uk> |
4 | | |
5 | | Redistribution and use in source and binary forms, with or without |
6 | | modification, are permitted provided that the following conditions are met: |
7 | | |
8 | | 1. Redistributions of source code must retain the above copyright notice, |
9 | | this list of conditions and the following disclaimer. |
10 | | |
11 | | 2. Redistributions in binary form must reproduce the above copyright notice, |
12 | | this list of conditions and the following disclaimer in the documentation |
13 | | and/or other materials provided with the distribution. |
14 | | |
15 | | 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger |
16 | | Institute nor the names of its contributors may be used to endorse or promote |
17 | | products derived from this software without specific prior written permission. |
18 | | |
19 | | THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND |
20 | | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
21 | | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
22 | | DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE |
23 | | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
24 | | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
25 | | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
26 | | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
27 | | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
28 | | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
29 | | */ |
30 | | |
31 | | /* |
32 | | * CRAM I/O primitives. |
33 | | * |
34 | | * - ITF8 encoding and decoding. |
35 | | * - Block based I/O |
36 | | * - Zlib inflating and deflating (memory) |
37 | | * - CRAM basic data structure reading and writing |
38 | | * - File opening / closing |
39 | | * - Reference sequence handling |
40 | | */ |
41 | | |
42 | | #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h |
43 | | #include <config.h> |
44 | | |
45 | | #include <stdio.h> |
46 | | #include <errno.h> |
47 | | #include <assert.h> |
48 | | #include <stdlib.h> |
49 | | #include <string.h> |
50 | | #include <unistd.h> |
51 | | #include <zlib.h> |
52 | | #ifdef HAVE_LIBBZ2 |
53 | | #include <bzlib.h> |
54 | | #endif |
55 | | #ifdef HAVE_LIBLZMA |
56 | | #ifdef HAVE_LZMA_H |
57 | | #include <lzma.h> |
58 | | #else |
59 | | #include "../os/lzma_stub.h" |
60 | | #endif |
61 | | #endif |
62 | | #include <sys/types.h> |
63 | | #include <sys/stat.h> |
64 | | #include <math.h> |
65 | | #include <stdint.h> |
66 | | |
67 | | #ifdef HAVE_LIBDEFLATE |
68 | | #include <libdeflate.h> |
69 | | #define crc32(a,b,c) libdeflate_crc32((a),(b),(c)) |
70 | | #endif |
71 | | |
72 | | #include "cram.h" |
73 | | #include "os.h" |
74 | | #include "../htslib/hts.h" |
75 | | #include "open_trace_file.h" |
76 | | |
77 | | #if defined(HAVE_EXTERNAL_LIBHTSCODECS) |
78 | | #include <htscodecs/rANS_static.h> |
79 | | #include <htscodecs/rANS_static4x16.h> |
80 | | #include <htscodecs/arith_dynamic.h> |
81 | | #include <htscodecs/tokenise_name3.h> |
82 | | #include <htscodecs/fqzcomp_qual.h> |
83 | | #include <htscodecs/varint.h> // CRAM v4.0 variable-size integers |
84 | | #else |
85 | | #include "../htscodecs/htscodecs/rANS_static.h" |
86 | | #include "../htscodecs/htscodecs/rANS_static4x16.h" |
87 | | #include "../htscodecs/htscodecs/arith_dynamic.h" |
88 | | #include "../htscodecs/htscodecs/tokenise_name3.h" |
89 | | #include "../htscodecs/htscodecs/fqzcomp_qual.h" |
90 | | #include "../htscodecs/htscodecs/varint.h" |
91 | | #endif |
92 | | |
93 | | //#define REF_DEBUG |
94 | | |
95 | | #ifdef REF_DEBUG |
96 | | #include <sys/syscall.h> |
97 | | #define gettid() (int)syscall(SYS_gettid) |
98 | | |
99 | | #define RP(...) fprintf (stderr, __VA_ARGS__) |
100 | | #else |
101 | | #define RP(...) |
102 | | #endif |
103 | | |
104 | | #include "../htslib/hfile.h" |
105 | | #include "../htslib/bgzf.h" |
106 | | #include "../htslib/faidx.h" |
107 | | #include "../hts_internal.h" |
108 | | |
109 | | #ifndef PATH_MAX |
110 | | #define PATH_MAX FILENAME_MAX |
111 | | #endif |
112 | | |
113 | 11.5k | #define TRIAL_SPAN 70 |
114 | 11.5k | #define NTRIALS 3 |
115 | | |
116 | 275 | #define CRAM_DEFAULT_LEVEL 5 |
117 | | |
118 | | /* ---------------------------------------------------------------------- |
119 | | * ITF8 encoding and decoding. |
120 | | * |
121 | | * Also see the itf8_get and itf8_put macros in cram_io.h |
122 | | */ |
123 | | |
124 | | /* |
125 | | * LEGACY: consider using itf8_decode_crc. |
126 | | * |
127 | | * Reads an integer in ITF-8 encoding from 'cp' and stores it in |
128 | | * *val. |
129 | | * |
130 | | * Returns the number of bytes read on success |
131 | | * -1 on failure |
132 | | */ |
133 | 0 | int itf8_decode(cram_fd *fd, int32_t *val_p) { |
134 | 0 | static int nbytes[16] = { |
135 | 0 | 0,0,0,0, 0,0,0,0, // 0000xxxx - 0111xxxx |
136 | 0 | 1,1,1,1, // 1000xxxx - 1011xxxx |
137 | 0 | 2,2, // 1100xxxx - 1101xxxx |
138 | 0 | 3, // 1110xxxx |
139 | 0 | 4, // 1111xxxx |
140 | 0 | }; |
141 | |
|
142 | 0 | static int nbits[16] = { |
143 | 0 | 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx |
144 | 0 | 0x3f, 0x3f, 0x3f, 0x3f, // 1000xxxx - 1011xxxx |
145 | 0 | 0x1f, 0x1f, // 1100xxxx - 1101xxxx |
146 | 0 | 0x0f, // 1110xxxx |
147 | 0 | 0x0f, // 1111xxxx |
148 | 0 | }; |
149 | |
|
150 | 0 | int32_t val = hgetc(fd->fp); |
151 | 0 | if (val == -1) |
152 | 0 | return -1; |
153 | | |
154 | 0 | int i = nbytes[val>>4]; |
155 | 0 | val &= nbits[val>>4]; |
156 | |
|
157 | 0 | switch(i) { |
158 | 0 | case 0: |
159 | 0 | *val_p = val; |
160 | 0 | return 1; |
161 | | |
162 | 0 | case 1: |
163 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
164 | 0 | *val_p = val; |
165 | 0 | return 2; |
166 | | |
167 | 0 | case 2: |
168 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
169 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
170 | 0 | *val_p = val; |
171 | 0 | return 3; |
172 | | |
173 | 0 | case 3: |
174 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
175 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
176 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
177 | 0 | *val_p = val; |
178 | 0 | return 4; |
179 | | |
180 | 0 | case 4: // really 3.5 more, why make it different? |
181 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
182 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
183 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
184 | 0 | val = (val<<4) | (((unsigned char)hgetc(fd->fp)) & 0x0f); |
185 | 0 | *val_p = val; |
186 | 0 | } |
187 | | |
188 | 0 | return 5; |
189 | 0 | } |
190 | | |
191 | 540k | int itf8_decode_crc(cram_fd *fd, int32_t *val_p, uint32_t *crc) { |
192 | 540k | static int nbytes[16] = { |
193 | 540k | 0,0,0,0, 0,0,0,0, // 0000xxxx - 0111xxxx |
194 | 540k | 1,1,1,1, // 1000xxxx - 1011xxxx |
195 | 540k | 2,2, // 1100xxxx - 1101xxxx |
196 | 540k | 3, // 1110xxxx |
197 | 540k | 4, // 1111xxxx |
198 | 540k | }; |
199 | | |
200 | 540k | static int nbits[16] = { |
201 | 540k | 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx |
202 | 540k | 0x3f, 0x3f, 0x3f, 0x3f, // 1000xxxx - 1011xxxx |
203 | 540k | 0x1f, 0x1f, // 1100xxxx - 1101xxxx |
204 | 540k | 0x0f, // 1110xxxx |
205 | 540k | 0x0f, // 1111xxxx |
206 | 540k | }; |
207 | 540k | unsigned char c[5]; |
208 | | |
209 | 540k | int32_t val = hgetc(fd->fp); |
210 | 540k | if (val == -1) |
211 | 23 | return -1; |
212 | | |
213 | 540k | c[0]=val; |
214 | | |
215 | 540k | int i = nbytes[val>>4]; |
216 | 540k | val &= nbits[val>>4]; |
217 | | |
218 | 540k | if (i > 0) { |
219 | 130k | if (hread(fd->fp, &c[1], i) < i) |
220 | 3 | return -1; |
221 | 130k | } |
222 | | |
223 | 540k | switch(i) { |
224 | 410k | case 0: |
225 | 410k | *val_p = val; |
226 | 410k | *crc = crc32(*crc, c, 1); |
227 | 410k | return 1; |
228 | | |
229 | 64.5k | case 1: |
230 | 64.5k | val = (val<<8) | c[1]; |
231 | 64.5k | *val_p = val; |
232 | 64.5k | *crc = crc32(*crc, c, 2); |
233 | 64.5k | return 2; |
234 | | |
235 | 31.7k | case 2: |
236 | 31.7k | val = (val<<8) | c[1]; |
237 | 31.7k | val = (val<<8) | c[2]; |
238 | 31.7k | *val_p = val; |
239 | 31.7k | *crc = crc32(*crc, c, 3); |
240 | 31.7k | return 3; |
241 | | |
242 | 16.6k | case 3: |
243 | 16.6k | val = (val<<8) | c[1]; |
244 | 16.6k | val = (val<<8) | c[2]; |
245 | 16.6k | val = (val<<8) | c[3]; |
246 | 16.6k | *val_p = val; |
247 | 16.6k | *crc = crc32(*crc, c, 4); |
248 | 16.6k | return 4; |
249 | | |
250 | 17.4k | case 4: // really 3.5 more, why make it different? |
251 | 17.4k | { |
252 | 17.4k | uint32_t uv = val; |
253 | 17.4k | uv = (uv<<8) | c[1]; |
254 | 17.4k | uv = (uv<<8) | c[2]; |
255 | 17.4k | uv = (uv<<8) | c[3]; |
256 | 17.4k | uv = (uv<<4) | (c[4] & 0x0f); |
257 | | // Avoid implementation-defined behaviour on negative values |
258 | 17.4k | *val_p = uv < 0x80000000UL ? (int32_t) uv : -((int32_t) (0xffffffffUL - uv)) - 1; |
259 | 17.4k | *crc = crc32(*crc, c, 5); |
260 | 17.4k | } |
261 | 540k | } |
262 | | |
263 | 17.4k | return 5; |
264 | 540k | } |
265 | | |
266 | | /* |
267 | | * Stores a value to memory in ITF-8 format. |
268 | | * |
269 | | * Returns the number of bytes required to store the number. |
270 | | * This is a maximum of 5 bytes. |
271 | | */ |
272 | 0 | static inline int itf8_put(char *cp, int32_t val) { |
273 | 0 | unsigned char *up = (unsigned char *)cp; |
274 | 0 | if (!(val & ~0x00000007f)) { // 1 byte |
275 | 0 | *up = val; |
276 | 0 | return 1; |
277 | 0 | } else if (!(val & ~0x00003fff)) { // 2 byte |
278 | 0 | *up++ = (val >> 8 ) | 0x80; |
279 | 0 | *up = val & 0xff; |
280 | 0 | return 2; |
281 | 0 | } else if (!(val & ~0x01fffff)) { // 3 byte |
282 | 0 | *up++ = (val >> 16) | 0xc0; |
283 | 0 | *up++ = (val >> 8 ) & 0xff; |
284 | 0 | *up = val & 0xff; |
285 | 0 | return 3; |
286 | 0 | } else if (!(val & ~0x0fffffff)) { // 4 byte |
287 | 0 | *up++ = (val >> 24) | 0xe0; |
288 | 0 | *up++ = (val >> 16) & 0xff; |
289 | 0 | *up++ = (val >> 8 ) & 0xff; |
290 | 0 | *up = val & 0xff; |
291 | 0 | return 4; |
292 | 0 | } else { // 5 byte |
293 | 0 | *up++ = 0xf0 | ((val>>28) & 0xff); |
294 | 0 | *up++ = (val >> 20) & 0xff; |
295 | 0 | *up++ = (val >> 12) & 0xff; |
296 | 0 | *up++ = (val >> 4 ) & 0xff; |
297 | 0 | *up = val & 0x0f; |
298 | 0 | return 5; |
299 | 0 | } |
300 | 0 | } |
301 | | |
302 | | |
303 | | /* 64-bit itf8 variant */ |
304 | 0 | static inline int ltf8_put(char *cp, int64_t val) { |
305 | 0 | unsigned char *up = (unsigned char *)cp; |
306 | 0 | if (!(val & ~((1LL<<7)-1))) { |
307 | 0 | *up = val; |
308 | 0 | return 1; |
309 | 0 | } else if (!(val & ~((1LL<<(6+8))-1))) { |
310 | 0 | *up++ = (val >> 8 ) | 0x80; |
311 | 0 | *up = val & 0xff; |
312 | 0 | return 2; |
313 | 0 | } else if (!(val & ~((1LL<<(5+2*8))-1))) { |
314 | 0 | *up++ = (val >> 16) | 0xc0; |
315 | 0 | *up++ = (val >> 8 ) & 0xff; |
316 | 0 | *up = val & 0xff; |
317 | 0 | return 3; |
318 | 0 | } else if (!(val & ~((1LL<<(4+3*8))-1))) { |
319 | 0 | *up++ = (val >> 24) | 0xe0; |
320 | 0 | *up++ = (val >> 16) & 0xff; |
321 | 0 | *up++ = (val >> 8 ) & 0xff; |
322 | 0 | *up = val & 0xff; |
323 | 0 | return 4; |
324 | 0 | } else if (!(val & ~((1LL<<(3+4*8))-1))) { |
325 | 0 | *up++ = (val >> 32) | 0xf0; |
326 | 0 | *up++ = (val >> 24) & 0xff; |
327 | 0 | *up++ = (val >> 16) & 0xff; |
328 | 0 | *up++ = (val >> 8 ) & 0xff; |
329 | 0 | *up = val & 0xff; |
330 | 0 | return 5; |
331 | 0 | } else if (!(val & ~((1LL<<(2+5*8))-1))) { |
332 | 0 | *up++ = (val >> 40) | 0xf8; |
333 | 0 | *up++ = (val >> 32) & 0xff; |
334 | 0 | *up++ = (val >> 24) & 0xff; |
335 | 0 | *up++ = (val >> 16) & 0xff; |
336 | 0 | *up++ = (val >> 8 ) & 0xff; |
337 | 0 | *up = val & 0xff; |
338 | 0 | return 6; |
339 | 0 | } else if (!(val & ~((1LL<<(1+6*8))-1))) { |
340 | 0 | *up++ = (val >> 48) | 0xfc; |
341 | 0 | *up++ = (val >> 40) & 0xff; |
342 | 0 | *up++ = (val >> 32) & 0xff; |
343 | 0 | *up++ = (val >> 24) & 0xff; |
344 | 0 | *up++ = (val >> 16) & 0xff; |
345 | 0 | *up++ = (val >> 8 ) & 0xff; |
346 | 0 | *up = val & 0xff; |
347 | 0 | return 7; |
348 | 0 | } else if (!(val & ~((1LL<<(7*8))-1))) { |
349 | 0 | *up++ = (val >> 56) | 0xfe; |
350 | 0 | *up++ = (val >> 48) & 0xff; |
351 | 0 | *up++ = (val >> 40) & 0xff; |
352 | 0 | *up++ = (val >> 32) & 0xff; |
353 | 0 | *up++ = (val >> 24) & 0xff; |
354 | 0 | *up++ = (val >> 16) & 0xff; |
355 | 0 | *up++ = (val >> 8 ) & 0xff; |
356 | 0 | *up = val & 0xff; |
357 | 0 | return 8; |
358 | 0 | } else { |
359 | 0 | *up++ = 0xff; |
360 | 0 | *up++ = (val >> 56) & 0xff; |
361 | 0 | *up++ = (val >> 48) & 0xff; |
362 | 0 | *up++ = (val >> 40) & 0xff; |
363 | 0 | *up++ = (val >> 32) & 0xff; |
364 | 0 | *up++ = (val >> 24) & 0xff; |
365 | 0 | *up++ = (val >> 16) & 0xff; |
366 | 0 | *up++ = (val >> 8 ) & 0xff; |
367 | 0 | *up = val & 0xff; |
368 | 0 | return 9; |
369 | 0 | } |
370 | 0 | } |
371 | | |
372 | | /* |
373 | | * Encodes and writes a single integer in ITF-8 format. |
374 | | * Returns 0 on success |
375 | | * -1 on failure |
376 | | */ |
377 | 0 | int itf8_encode(cram_fd *fd, int32_t val) { |
378 | 0 | char buf[5]; |
379 | 0 | int len = itf8_put(buf, val); |
380 | 0 | return hwrite(fd->fp, buf, len) == len ? 0 : -1; |
381 | 0 | } |
382 | | |
383 | | const int itf8_bytes[16] = { |
384 | | 1, 1, 1, 1, 1, 1, 1, 1, |
385 | | 2, 2, 2, 2, 3, 3, 4, 5 |
386 | | }; |
387 | | |
388 | | const int ltf8_bytes[256] = { |
389 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
390 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
391 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
392 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
393 | | |
394 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
395 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
396 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
397 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
398 | | |
399 | | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
400 | | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
401 | | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
402 | | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
403 | | |
404 | | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, |
405 | | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, |
406 | | |
407 | | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, |
408 | | |
409 | | 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 8, 9 |
410 | | }; |
411 | | |
412 | | /* |
413 | | * LEGACY: consider using ltf8_decode_crc. |
414 | | */ |
415 | 0 | int ltf8_decode(cram_fd *fd, int64_t *val_p) { |
416 | 0 | int c = hgetc(fd->fp); |
417 | 0 | int64_t val = (unsigned char)c; |
418 | 0 | if (c == -1) |
419 | 0 | return -1; |
420 | | |
421 | 0 | if (val < 0x80) { |
422 | 0 | *val_p = val; |
423 | 0 | return 1; |
424 | |
|
425 | 0 | } else if (val < 0xc0) { |
426 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
427 | 0 | *val_p = val & (((1LL<<(6+8)))-1); |
428 | 0 | return 2; |
429 | |
|
430 | 0 | } else if (val < 0xe0) { |
431 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
432 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
433 | 0 | *val_p = val & ((1LL<<(5+2*8))-1); |
434 | 0 | return 3; |
435 | |
|
436 | 0 | } else if (val < 0xf0) { |
437 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
438 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
439 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
440 | 0 | *val_p = val & ((1LL<<(4+3*8))-1); |
441 | 0 | return 4; |
442 | |
|
443 | 0 | } else if (val < 0xf8) { |
444 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
445 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
446 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
447 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
448 | 0 | *val_p = val & ((1LL<<(3+4*8))-1); |
449 | 0 | return 5; |
450 | |
|
451 | 0 | } else if (val < 0xfc) { |
452 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
453 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
454 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
455 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
456 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
457 | 0 | *val_p = val & ((1LL<<(2+5*8))-1); |
458 | 0 | return 6; |
459 | |
|
460 | 0 | } else if (val < 0xfe) { |
461 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
462 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
463 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
464 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
465 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
466 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
467 | 0 | *val_p = val & ((1LL<<(1+6*8))-1); |
468 | 0 | return 7; |
469 | |
|
470 | 0 | } else if (val < 0xff) { |
471 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
472 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
473 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
474 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
475 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
476 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
477 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
478 | 0 | *val_p = val & ((1LL<<(7*8))-1); |
479 | 0 | return 8; |
480 | |
|
481 | 0 | } else { |
482 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
483 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
484 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
485 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
486 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
487 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
488 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
489 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
490 | 0 | *val_p = val; |
491 | 0 | } |
492 | | |
493 | 0 | return 9; |
494 | 0 | } |
495 | | |
496 | 1.31k | int ltf8_decode_crc(cram_fd *fd, int64_t *val_p, uint32_t *crc) { |
497 | 1.31k | unsigned char c[9]; |
498 | 1.31k | int64_t val = hgetc(fd->fp); |
499 | 1.31k | if (val < 0) |
500 | 0 | return -1; |
501 | | |
502 | 1.31k | c[0] = val; |
503 | | |
504 | 1.31k | if (val < 0x80) { |
505 | 1.03k | *val_p = val; |
506 | 1.03k | *crc = crc32(*crc, c, 1); |
507 | 1.03k | return 1; |
508 | | |
509 | 1.03k | } else if (val < 0xc0) { |
510 | 55 | int v = hgetc(fd->fp); |
511 | 55 | if (v < 0) |
512 | 0 | return -1; |
513 | 55 | val = (val<<8) | (c[1]=v); |
514 | 55 | *val_p = val & (((1LL<<(6+8)))-1); |
515 | 55 | *crc = crc32(*crc, c, 2); |
516 | 55 | return 2; |
517 | | |
518 | 224 | } else if (val < 0xe0) { |
519 | 50 | if (hread(fd->fp, &c[1], 2) < 2) |
520 | 0 | return -1; |
521 | 50 | val = (val<<8) | c[1]; |
522 | 50 | val = (val<<8) | c[2]; |
523 | 50 | *val_p = val & ((1LL<<(5+2*8))-1); |
524 | 50 | *crc = crc32(*crc, c, 3); |
525 | 50 | return 3; |
526 | | |
527 | 174 | } else if (val < 0xf0) { |
528 | 20 | if (hread(fd->fp, &c[1], 3) < 3) |
529 | 0 | return -1; |
530 | 20 | val = (val<<8) | c[1]; |
531 | 20 | val = (val<<8) | c[2]; |
532 | 20 | val = (val<<8) | c[3]; |
533 | 20 | *val_p = val & ((1LL<<(4+3*8))-1); |
534 | 20 | *crc = crc32(*crc, c, 4); |
535 | 20 | return 4; |
536 | | |
537 | 154 | } else if (val < 0xf8) { |
538 | 1 | if (hread(fd->fp, &c[1], 4) < 4) |
539 | 0 | return -1; |
540 | 1 | val = (val<<8) | c[1]; |
541 | 1 | val = (val<<8) | c[2]; |
542 | 1 | val = (val<<8) | c[3]; |
543 | 1 | val = (val<<8) | c[4]; |
544 | 1 | *val_p = val & ((1LL<<(3+4*8))-1); |
545 | 1 | *crc = crc32(*crc, c, 5); |
546 | 1 | return 5; |
547 | | |
548 | 153 | } else if (val < 0xfc) { |
549 | 79 | if (hread(fd->fp, &c[1], 5) < 5) |
550 | 0 | return -1; |
551 | 79 | val = (val<<8) | c[1]; |
552 | 79 | val = (val<<8) | c[2]; |
553 | 79 | val = (val<<8) | c[3]; |
554 | 79 | val = (val<<8) | c[4]; |
555 | 79 | val = (val<<8) | c[5]; |
556 | 79 | *val_p = val & ((1LL<<(2+5*8))-1); |
557 | 79 | *crc = crc32(*crc, c, 6); |
558 | 79 | return 6; |
559 | | |
560 | 79 | } else if (val < 0xfe) { |
561 | 32 | if (hread(fd->fp, &c[1], 6) < 6) |
562 | 0 | return -1; |
563 | 32 | val = (val<<8) | c[1]; |
564 | 32 | val = (val<<8) | c[2]; |
565 | 32 | val = (val<<8) | c[3]; |
566 | 32 | val = (val<<8) | c[4]; |
567 | 32 | val = (val<<8) | c[5]; |
568 | 32 | val = (val<<8) | c[6]; |
569 | 32 | *val_p = val & ((1LL<<(1+6*8))-1); |
570 | 32 | *crc = crc32(*crc, c, 7); |
571 | 32 | return 7; |
572 | | |
573 | 42 | } else if (val < 0xff) { |
574 | 0 | uint64_t uval = val; |
575 | 0 | if (hread(fd->fp, &c[1], 7) < 7) |
576 | 0 | return -1; |
577 | 0 | uval = (uval<<8) | c[1]; |
578 | 0 | uval = (uval<<8) | c[2]; |
579 | 0 | uval = (uval<<8) | c[3]; |
580 | 0 | uval = (uval<<8) | c[4]; |
581 | 0 | uval = (uval<<8) | c[5]; |
582 | 0 | uval = (uval<<8) | c[6]; |
583 | 0 | uval = (uval<<8) | c[7]; |
584 | 0 | *val_p = uval & ((1ULL<<(7*8))-1); |
585 | 0 | *crc = crc32(*crc, c, 8); |
586 | 0 | return 8; |
587 | |
|
588 | 42 | } else { |
589 | 42 | uint64_t uval; |
590 | 42 | if (hread(fd->fp, &c[1], 8) < 8) |
591 | 0 | return -1; |
592 | 42 | uval = c[1]; |
593 | 42 | uval = (uval<<8) | c[2]; |
594 | 42 | uval = (uval<<8) | c[3]; |
595 | 42 | uval = (uval<<8) | c[4]; |
596 | 42 | uval = (uval<<8) | c[5]; |
597 | 42 | uval = (uval<<8) | c[6]; |
598 | 42 | uval = (uval<<8) | c[7]; |
599 | 42 | uval = (uval<<8) | c[8]; |
600 | 42 | *crc = crc32(*crc, c, 9); |
601 | | // Avoid implementation-defined behaviour on negative values |
602 | 42 | *val_p = c[1] < 0x80 ? (int64_t) uval : -((int64_t) (0xffffffffffffffffULL - uval)) - 1; |
603 | 42 | } |
604 | | |
605 | 42 | return 9; |
606 | 1.31k | } |
607 | | |
608 | | /* |
609 | | * Pushes a value in ITF8 format onto the end of a block. |
610 | | * This shouldn't be used for high-volume data as it is not the fastest |
611 | | * method. |
612 | | * |
613 | | * Returns the number of bytes written |
614 | | */ |
615 | 0 | int itf8_put_blk(cram_block *blk, int32_t val) { |
616 | 0 | char buf[5]; |
617 | 0 | int sz; |
618 | |
|
619 | 0 | sz = itf8_put(buf, val); |
620 | 0 | BLOCK_APPEND(blk, buf, sz); |
621 | 0 | return sz; |
622 | | |
623 | 0 | block_err: |
624 | 0 | return -1; |
625 | 0 | } |
626 | | |
627 | 0 | int ltf8_put_blk(cram_block *blk, int64_t val) { |
628 | 0 | char buf[9]; |
629 | 0 | int sz; |
630 | |
|
631 | 0 | sz = ltf8_put(buf, val); |
632 | 0 | BLOCK_APPEND(blk, buf, sz); |
633 | 0 | return sz; |
634 | | |
635 | 0 | block_err: |
636 | 0 | return -1; |
637 | 0 | } |
638 | | |
639 | 57.1k | static int64_t safe_itf8_get(char **cp, const char *endp, int *err) { |
640 | 57.1k | const unsigned char *up = (unsigned char *)*cp; |
641 | | |
642 | 57.1k | if (endp && endp - *cp < 5 && |
643 | 57.1k | (*cp >= endp || endp - *cp < itf8_bytes[up[0]>>4])) { |
644 | 337 | if (err) *err = 1; |
645 | 337 | return 0; |
646 | 337 | } |
647 | | |
648 | 56.8k | if (up[0] < 0x80) { |
649 | 53.9k | (*cp)++; |
650 | 53.9k | return up[0]; |
651 | 53.9k | } else if (up[0] < 0xc0) { |
652 | 1.59k | (*cp)+=2; |
653 | 1.59k | return ((up[0] <<8) | up[1]) & 0x3fff; |
654 | 1.59k | } else if (up[0] < 0xe0) { |
655 | 246 | (*cp)+=3; |
656 | 246 | return ((up[0]<<16) | (up[1]<< 8) | up[2]) & 0x1fffff; |
657 | 1.08k | } else if (up[0] < 0xf0) { |
658 | 516 | (*cp)+=4; |
659 | 516 | uint32_t uv = (((uint32_t)up[0]<<24) | (up[1]<<16) | (up[2]<<8) | up[3]) & 0x0fffffff; |
660 | 516 | return (int32_t)uv; |
661 | 565 | } else { |
662 | 565 | (*cp)+=5; |
663 | 565 | uint32_t uv = (((uint32_t)up[0] & 0x0f)<<28) | (up[1]<<20) | (up[2]<<12) | (up[3]<<4) | (up[4] & 0x0f); |
664 | 565 | return (int32_t)uv; |
665 | 565 | } |
666 | 56.8k | } |
667 | | |
668 | 409 | static int64_t safe_ltf8_get(char **cp, const char *endp, int *err) { |
669 | 409 | unsigned char *up = (unsigned char *)*cp; |
670 | | |
671 | 409 | if (endp && endp - *cp < 9 && |
672 | 409 | (*cp >= endp || endp - *cp < ltf8_bytes[up[0]])) { |
673 | 238 | if (err) *err = 1; |
674 | 238 | return 0; |
675 | 238 | } |
676 | | |
677 | 171 | if (up[0] < 0x80) { |
678 | 141 | (*cp)++; |
679 | 141 | return up[0]; |
680 | 141 | } else if (up[0] < 0xc0) { |
681 | 10 | (*cp)+=2; |
682 | 10 | return (((uint64_t)up[0]<< 8) | |
683 | 10 | (uint64_t)up[1]) & (((1LL<<(6+8)))-1); |
684 | 20 | } else if (up[0] < 0xe0) { |
685 | 0 | (*cp)+=3; |
686 | 0 | return (((uint64_t)up[0]<<16) | |
687 | 0 | ((uint64_t)up[1]<< 8) | |
688 | 0 | (uint64_t)up[2]) & ((1LL<<(5+2*8))-1); |
689 | 20 | } else if (up[0] < 0xf0) { |
690 | 0 | (*cp)+=4; |
691 | 0 | return (((uint64_t)up[0]<<24) | |
692 | 0 | ((uint64_t)up[1]<<16) | |
693 | 0 | ((uint64_t)up[2]<< 8) | |
694 | 0 | (uint64_t)up[3]) & ((1LL<<(4+3*8))-1); |
695 | 20 | } else if (up[0] < 0xf8) { |
696 | 16 | (*cp)+=5; |
697 | 16 | return (((uint64_t)up[0]<<32) | |
698 | 16 | ((uint64_t)up[1]<<24) | |
699 | 16 | ((uint64_t)up[2]<<16) | |
700 | 16 | ((uint64_t)up[3]<< 8) | |
701 | 16 | (uint64_t)up[4]) & ((1LL<<(3+4*8))-1); |
702 | 16 | } else if (up[0] < 0xfc) { |
703 | 1 | (*cp)+=6; |
704 | 1 | return (((uint64_t)up[0]<<40) | |
705 | 1 | ((uint64_t)up[1]<<32) | |
706 | 1 | ((uint64_t)up[2]<<24) | |
707 | 1 | ((uint64_t)up[3]<<16) | |
708 | 1 | ((uint64_t)up[4]<< 8) | |
709 | 1 | (uint64_t)up[5]) & ((1LL<<(2+5*8))-1); |
710 | 3 | } else if (up[0] < 0xfe) { |
711 | 0 | (*cp)+=7; |
712 | 0 | return (((uint64_t)up[0]<<48) | |
713 | 0 | ((uint64_t)up[1]<<40) | |
714 | 0 | ((uint64_t)up[2]<<32) | |
715 | 0 | ((uint64_t)up[3]<<24) | |
716 | 0 | ((uint64_t)up[4]<<16) | |
717 | 0 | ((uint64_t)up[5]<< 8) | |
718 | 0 | (uint64_t)up[6]) & ((1LL<<(1+6*8))-1); |
719 | 3 | } else if (up[0] < 0xff) { |
720 | 0 | (*cp)+=8; |
721 | 0 | return (((uint64_t)up[1]<<48) | |
722 | 0 | ((uint64_t)up[2]<<40) | |
723 | 0 | ((uint64_t)up[3]<<32) | |
724 | 0 | ((uint64_t)up[4]<<24) | |
725 | 0 | ((uint64_t)up[5]<<16) | |
726 | 0 | ((uint64_t)up[6]<< 8) | |
727 | 0 | (uint64_t)up[7]) & ((1LL<<(7*8))-1); |
728 | 3 | } else { |
729 | 3 | (*cp)+=9; |
730 | 3 | return (((uint64_t)up[1]<<56) | |
731 | 3 | ((uint64_t)up[2]<<48) | |
732 | 3 | ((uint64_t)up[3]<<40) | |
733 | 3 | ((uint64_t)up[4]<<32) | |
734 | 3 | ((uint64_t)up[5]<<24) | |
735 | 3 | ((uint64_t)up[6]<<16) | |
736 | 3 | ((uint64_t)up[7]<< 8) | |
737 | 3 | (uint64_t)up[8]); |
738 | 3 | } |
739 | 171 | } |
740 | | |
741 | | // Wrapper for now |
742 | 0 | static int safe_itf8_put(char *cp, char *cp_end, int32_t val) { |
743 | 0 | return itf8_put(cp, val); |
744 | 0 | } |
745 | | |
746 | 0 | static int safe_ltf8_put(char *cp, char *cp_end, int64_t val) { |
747 | 0 | return ltf8_put(cp, val); |
748 | 0 | } |
749 | | |
750 | 5.81k | static int itf8_size(int64_t v) { |
751 | 5.81k | return ((!((v)&~0x7f))?1:(!((v)&~0x3fff))?2:(!((v)&~0x1fffff))?3:(!((v)&~0xfffffff))?4:5); |
752 | 5.81k | } |
753 | | |
754 | | //----------------------------------------------------------------------------- |
755 | | |
756 | | // CRAM v4.0 onwards uses a different variable sized integer encoding |
757 | | // that is size agnostic. |
758 | | |
759 | | // Local interface to varint.h inline version, so we can use in func ptr. |
760 | | // Note a lot of these use the unsigned interface but take signed int64_t. |
761 | | // This is because the old CRAM ITF8 inteface had signed -1 as unsigned |
762 | | // 0xffffffff. |
763 | 0 | static int uint7_size(int64_t v) { |
764 | 0 | return var_size_u64(v); |
765 | 0 | } |
766 | | |
767 | 0 | static int64_t uint7_get_32(char **cp, const char *endp, int *err) { |
768 | 0 | uint32_t val; |
769 | 0 | int nb = var_get_u32((uint8_t *)(*cp), (const uint8_t *)endp, &val); |
770 | 0 | (*cp) += nb; |
771 | 0 | if (!nb && err) *err = 1; |
772 | 0 | return val; |
773 | 0 | } |
774 | | |
775 | 0 | static int64_t sint7_get_32(char **cp, const char *endp, int *err) { |
776 | 0 | int32_t val; |
777 | 0 | int nb = var_get_s32((uint8_t *)(*cp), (const uint8_t *)endp, &val); |
778 | 0 | (*cp) += nb; |
779 | 0 | if (!nb && err) *err = 1; |
780 | 0 | return val; |
781 | 0 | } |
782 | | |
783 | 0 | static int64_t uint7_get_64(char **cp, const char *endp, int *err) { |
784 | 0 | uint64_t val; |
785 | 0 | int nb = var_get_u64((uint8_t *)(*cp), (const uint8_t *)endp, &val); |
786 | 0 | (*cp) += nb; |
787 | 0 | if (!nb && err) *err = 1; |
788 | 0 | return val; |
789 | 0 | } |
790 | | |
791 | 0 | static int64_t sint7_get_64(char **cp, const char *endp, int *err) { |
792 | 0 | int64_t val; |
793 | 0 | int nb = var_get_s64((uint8_t *)(*cp), (const uint8_t *)endp, &val); |
794 | 0 | (*cp) += nb; |
795 | 0 | if (!nb && err) *err = 1; |
796 | 0 | return val; |
797 | 0 | } |
798 | | |
799 | 0 | static int uint7_put_32(char *cp, char *endp, int32_t val) { |
800 | 0 | return var_put_u32((uint8_t *)cp, (uint8_t *)endp, val); |
801 | 0 | } |
802 | | |
803 | 0 | static int sint7_put_32(char *cp, char *endp, int32_t val) { |
804 | 0 | return var_put_s32((uint8_t *)cp, (uint8_t *)endp, val); |
805 | 0 | } |
806 | | |
807 | 0 | static int uint7_put_64(char *cp, char *endp, int64_t val) { |
808 | 0 | return var_put_u64((uint8_t *)cp, (uint8_t *)endp, val); |
809 | 0 | } |
810 | | |
811 | 0 | static int sint7_put_64(char *cp, char *endp, int64_t val) { |
812 | 0 | return var_put_s64((uint8_t *)cp, (uint8_t *)endp, val); |
813 | 0 | } |
814 | | |
815 | | // Put direct to to cram_block |
816 | 0 | static int uint7_put_blk_32(cram_block *blk, int32_t v) { |
817 | 0 | uint8_t buf[10]; |
818 | 0 | int sz = var_put_u32(buf, buf+10, v); |
819 | 0 | BLOCK_APPEND(blk, buf, sz); |
820 | 0 | return sz; |
821 | | |
822 | 0 | block_err: |
823 | 0 | return -1; |
824 | 0 | } |
825 | | |
826 | 0 | static int sint7_put_blk_32(cram_block *blk, int32_t v) { |
827 | 0 | uint8_t buf[10]; |
828 | 0 | int sz = var_put_s32(buf, buf+10, v); |
829 | 0 | BLOCK_APPEND(blk, buf, sz); |
830 | 0 | return sz; |
831 | | |
832 | 0 | block_err: |
833 | 0 | return -1; |
834 | 0 | } |
835 | | |
836 | 0 | static int uint7_put_blk_64(cram_block *blk, int64_t v) { |
837 | 0 | uint8_t buf[10]; |
838 | 0 | int sz = var_put_u64(buf, buf+10, v); |
839 | 0 | BLOCK_APPEND(blk, buf, sz); |
840 | 0 | return sz; |
841 | | |
842 | 0 | block_err: |
843 | 0 | return -1; |
844 | 0 | } |
845 | | |
846 | 0 | static int sint7_put_blk_64(cram_block *blk, int64_t v) { |
847 | 0 | uint8_t buf[10]; |
848 | 0 | int sz = var_put_s64(buf, buf+10, v); |
849 | 0 | BLOCK_APPEND(blk, buf, sz); |
850 | 0 | return sz; |
851 | | |
852 | 0 | block_err: |
853 | 0 | return -1; |
854 | 0 | } |
855 | | |
856 | | // Decode 32-bits with CRC update from cram_fd |
857 | 5.16k | static int uint7_decode_crc32(cram_fd *fd, int32_t *val_p, uint32_t *crc) { |
858 | 5.16k | uint8_t b[5], i = 0; |
859 | 5.16k | int c; |
860 | 5.16k | uint32_t v = 0; |
861 | | |
862 | | #ifdef VARINT2 |
863 | | b[0] = hgetc(fd->fp); |
864 | | if (b[0] < 177) { |
865 | | } else if (b[0] < 241) { |
866 | | b[1] = hgetc(fd->fp); |
867 | | } else if (b[0] < 249) { |
868 | | b[1] = hgetc(fd->fp); |
869 | | b[2] = hgetc(fd->fp); |
870 | | } else { |
871 | | int n = b[0]+2, z = 1; |
872 | | while (n-- >= 249) |
873 | | b[z++] = hgetc(fd->fp); |
874 | | } |
875 | | i = var_get_u32(b, NULL, &v); |
876 | | #else |
877 | | // // Little endian |
878 | | // int s = 0; |
879 | | // do { |
880 | | // b[i++] = c = hgetc(fd->fp); |
881 | | // if (c < 0) |
882 | | // return -1; |
883 | | // v |= (c & 0x7f) << s; |
884 | | // s += 7; |
885 | | // } while (i < 5 && (c & 0x80)); |
886 | | |
887 | | // Big endian, see also htscodecs/varint.h |
888 | 10.3k | do { |
889 | 10.3k | b[i++] = c = hgetc(fd->fp); |
890 | 10.3k | if (c < 0) |
891 | 1 | return -1; |
892 | 10.3k | v = (v<<7) | (c & 0x7f); |
893 | 10.3k | } while (i < 5 && (c & 0x80)); |
894 | 5.16k | #endif |
895 | 5.16k | *crc = crc32(*crc, b, i); |
896 | | |
897 | 5.16k | *val_p = v; |
898 | 5.16k | return i; |
899 | 5.16k | } |
900 | | |
901 | | // Decode 32-bits with CRC update from cram_fd |
902 | 3 | static int sint7_decode_crc32(cram_fd *fd, int32_t *val_p, uint32_t *crc) { |
903 | 3 | uint8_t b[5], i = 0; |
904 | 3 | int c; |
905 | 3 | uint32_t v = 0; |
906 | | |
907 | | #ifdef VARINT2 |
908 | | b[0] = hgetc(fd->fp); |
909 | | if (b[0] < 177) { |
910 | | } else if (b[0] < 241) { |
911 | | b[1] = hgetc(fd->fp); |
912 | | } else if (b[0] < 249) { |
913 | | b[1] = hgetc(fd->fp); |
914 | | b[2] = hgetc(fd->fp); |
915 | | } else { |
916 | | int n = b[0]+2, z = 1; |
917 | | while (n-- >= 249) |
918 | | b[z++] = hgetc(fd->fp); |
919 | | } |
920 | | i = var_get_u32(b, NULL, &v); |
921 | | #else |
922 | | // // Little endian |
923 | | // int s = 0; |
924 | | // do { |
925 | | // b[i++] = c = hgetc(fd->fp); |
926 | | // if (c < 0) |
927 | | // return -1; |
928 | | // v |= (c & 0x7f) << s; |
929 | | // s += 7; |
930 | | // } while (i < 5 && (c & 0x80)); |
931 | | |
932 | | // Big endian, see also htscodecs/varint.h |
933 | 7 | do { |
934 | 7 | b[i++] = c = hgetc(fd->fp); |
935 | 7 | if (c < 0) |
936 | 0 | return -1; |
937 | 7 | v = (v<<7) | (c & 0x7f); |
938 | 7 | } while (i < 5 && (c & 0x80)); |
939 | 3 | #endif |
940 | 3 | *crc = crc32(*crc, b, i); |
941 | | |
942 | 3 | *val_p = (v>>1) ^ -(v&1); |
943 | 3 | return i; |
944 | 3 | } |
945 | | |
946 | | |
947 | | // Decode 64-bits with CRC update from cram_fd |
948 | 12 | static int uint7_decode_crc64(cram_fd *fd, int64_t *val_p, uint32_t *crc) { |
949 | 12 | uint8_t b[10], i = 0; |
950 | 12 | int c; |
951 | 12 | uint64_t v = 0; |
952 | | |
953 | | #ifdef VARINT2 |
954 | | b[0] = hgetc(fd->fp); |
955 | | if (b[0] < 177) { |
956 | | } else if (b[0] < 241) { |
957 | | b[1] = hgetc(fd->fp); |
958 | | } else if (b[0] < 249) { |
959 | | b[1] = hgetc(fd->fp); |
960 | | b[2] = hgetc(fd->fp); |
961 | | } else { |
962 | | int n = b[0]+2, z = 1; |
963 | | while (n-- >= 249) |
964 | | b[z++] = hgetc(fd->fp); |
965 | | } |
966 | | i = var_get_u64(b, NULL, &v); |
967 | | #else |
968 | | // // Little endian |
969 | | // int s = 0; |
970 | | // do { |
971 | | // b[i++] = c = hgetc(fd->fp); |
972 | | // if (c < 0) |
973 | | // return -1; |
974 | | // v |= (c & 0x7f) << s; |
975 | | // s += 7; |
976 | | // } while (i < 10 && (c & 0x80)); |
977 | | |
978 | | // Big endian, see also htscodecs/varint.h |
979 | 20 | do { |
980 | 20 | b[i++] = c = hgetc(fd->fp); |
981 | 20 | if (c < 0) |
982 | 0 | return -1; |
983 | 20 | v = (v<<7) | (c & 0x7f); |
984 | 20 | } while (i < 5 && (c & 0x80)); |
985 | 12 | #endif |
986 | 12 | *crc = crc32(*crc, b, i); |
987 | | |
988 | 12 | *val_p = v; |
989 | 12 | return i; |
990 | 12 | } |
991 | | |
992 | | //----------------------------------------------------------------------------- |
993 | | |
994 | | /* |
995 | | * Decodes a 32-bit little endian value from fd and stores in val. |
996 | | * |
997 | | * Returns the number of bytes read on success |
998 | | * -1 on failure |
999 | | */ |
1000 | 1.56k | static int int32_decode(cram_fd *fd, int32_t *val) { |
1001 | 1.56k | int32_t i; |
1002 | 1.56k | if (4 != hread(fd->fp, &i, 4)) |
1003 | 0 | return -1; |
1004 | | |
1005 | 1.56k | *val = le_int4(i); |
1006 | 1.56k | return 4; |
1007 | 1.56k | } |
1008 | | |
1009 | | /* |
1010 | | * Encodes a 32-bit little endian value 'val' and writes to fd. |
1011 | | * |
1012 | | * Returns the number of bytes written on success |
1013 | | * -1 on failure |
1014 | | */ |
1015 | 0 | static int int32_encode(cram_fd *fd, int32_t val) { |
1016 | 0 | uint32_t v = le_int4(val); |
1017 | 0 | if (4 != hwrite(fd->fp, &v, 4)) |
1018 | 0 | return -1; |
1019 | | |
1020 | 0 | return 4; |
1021 | 0 | } |
1022 | | |
1023 | | /* As int32_decoded/encode, but from/to blocks instead of cram_fd */ |
1024 | 15 | int int32_get_blk(cram_block *b, int32_t *val) { |
1025 | 15 | if (b->uncomp_size - BLOCK_SIZE(b) < 4) |
1026 | 1 | return -1; |
1027 | | |
1028 | 14 | uint32_t v = |
1029 | 14 | ((uint32_t) b->data[b->byte ]) | |
1030 | 14 | (((uint32_t) b->data[b->byte+1]) << 8) | |
1031 | 14 | (((uint32_t) b->data[b->byte+2]) << 16) | |
1032 | 14 | (((uint32_t) b->data[b->byte+3]) << 24); |
1033 | | // Avoid implementation-defined behaviour on negative values |
1034 | 14 | *val = v < 0x80000000U ? (int32_t) v : -((int32_t) (0xffffffffU - v)) - 1; |
1035 | 14 | BLOCK_SIZE(b) += 4; |
1036 | 14 | return 4; |
1037 | 15 | } |
1038 | | |
1039 | | /* As int32_decoded/encode, but from/to blocks instead of cram_fd */ |
1040 | 0 | int int32_put_blk(cram_block *b, int32_t val) { |
1041 | 0 | unsigned char cp[4]; |
1042 | 0 | uint32_t v = val; |
1043 | 0 | cp[0] = ( v & 0xff); |
1044 | 0 | cp[1] = ((v>>8) & 0xff); |
1045 | 0 | cp[2] = ((v>>16) & 0xff); |
1046 | 0 | cp[3] = ((v>>24) & 0xff); |
1047 | |
|
1048 | 0 | BLOCK_APPEND(b, cp, 4); |
1049 | 0 | return 0; |
1050 | | |
1051 | 0 | block_err: |
1052 | 0 | return -1; |
1053 | 0 | } |
1054 | | |
1055 | | #ifdef HAVE_LIBDEFLATE |
1056 | | /* ---------------------------------------------------------------------- |
1057 | | * libdeflate compression code, with interface to match |
1058 | | * zlib_mem_{in,de}flate for simplicity elsewhere. |
1059 | | */ |
1060 | | |
1061 | | // Named the same as the version that uses zlib as we always use libdeflate for |
1062 | | // decompression when available. |
1063 | | char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) { |
1064 | | struct libdeflate_decompressor *z = libdeflate_alloc_decompressor(); |
1065 | | if (!z) { |
1066 | | hts_log_error("Call to libdeflate_alloc_decompressor failed"); |
1067 | | return NULL; |
1068 | | } |
1069 | | |
1070 | | uint8_t *data = NULL, *new_data; |
1071 | | if (!*size) |
1072 | | *size = csize*2; |
1073 | | for(;;) { |
1074 | | new_data = realloc(data, *size); |
1075 | | if (!new_data) { |
1076 | | hts_log_error("Memory allocation failure"); |
1077 | | goto fail; |
1078 | | } |
1079 | | data = new_data; |
1080 | | |
1081 | | int ret = libdeflate_gzip_decompress(z, cdata, csize, data, *size, size); |
1082 | | |
1083 | | // Auto grow output buffer size if needed and try again. |
1084 | | // Fortunately for all bar one call of this we know the size already. |
1085 | | if (ret == LIBDEFLATE_INSUFFICIENT_SPACE) { |
1086 | | (*size) *= 1.5; |
1087 | | continue; |
1088 | | } |
1089 | | |
1090 | | if (ret != LIBDEFLATE_SUCCESS) { |
1091 | | hts_log_error("Inflate operation failed: %d", ret); |
1092 | | goto fail; |
1093 | | } else { |
1094 | | break; |
1095 | | } |
1096 | | } |
1097 | | |
1098 | | libdeflate_free_decompressor(z); |
1099 | | return (char *)data; |
1100 | | |
1101 | | fail: |
1102 | | libdeflate_free_decompressor(z); |
1103 | | free(data); |
1104 | | return NULL; |
1105 | | } |
1106 | | |
1107 | | // Named differently as we use both zlib/libdeflate for compression. |
1108 | | static char *libdeflate_deflate(char *data, size_t size, size_t *cdata_size, |
1109 | | int level, int strat) { |
1110 | | level = level > 0 ? level : 6; // libdeflate doesn't honour -1 as default |
1111 | | level *= 1.23; // NB levels go up to 12 here; 5 onwards is +1 |
1112 | | level += level>=8; // 5,6,7->6,7,8 8->10 9->12 |
1113 | | if (level > 12) level = 12; |
1114 | | |
1115 | | if (strat == Z_RLE) // not supported by libdeflate |
1116 | | level = 1; |
1117 | | |
1118 | | struct libdeflate_compressor *z = libdeflate_alloc_compressor(level); |
1119 | | if (!z) { |
1120 | | hts_log_error("Call to libdeflate_alloc_compressor failed"); |
1121 | | return NULL; |
1122 | | } |
1123 | | |
1124 | | unsigned char *cdata = NULL; /* Compressed output */ |
1125 | | size_t cdata_alloc; |
1126 | | cdata = malloc(cdata_alloc = size*1.05+100); |
1127 | | if (!cdata) { |
1128 | | hts_log_error("Memory allocation failure"); |
1129 | | libdeflate_free_compressor(z); |
1130 | | return NULL; |
1131 | | } |
1132 | | |
1133 | | *cdata_size = libdeflate_gzip_compress(z, data, size, cdata, cdata_alloc); |
1134 | | libdeflate_free_compressor(z); |
1135 | | |
1136 | | if (*cdata_size == 0) { |
1137 | | hts_log_error("Call to libdeflate_gzip_compress failed"); |
1138 | | free(cdata); |
1139 | | return NULL; |
1140 | | } |
1141 | | |
1142 | | return (char *)cdata; |
1143 | | } |
1144 | | |
1145 | | #else |
1146 | | |
1147 | | /* ---------------------------------------------------------------------- |
1148 | | * zlib compression code - from Gap5's tg_iface_g.c |
1149 | | * They're static here as they're only used within the cram_compress_block |
1150 | | * and cram_uncompress_block functions, which are the external interface. |
1151 | | */ |
1152 | 0 | char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) { |
1153 | 0 | z_stream s; |
1154 | 0 | unsigned char *data = NULL; /* Uncompressed output */ |
1155 | 0 | int data_alloc = 0; |
1156 | 0 | int err; |
1157 | | |
1158 | | /* Starting point at uncompressed size, and scale after that */ |
1159 | 0 | data = malloc(data_alloc = csize*1.2+100); |
1160 | 0 | if (!data) |
1161 | 0 | return NULL; |
1162 | | |
1163 | | /* Initialise zlib stream */ |
1164 | 0 | s.zalloc = Z_NULL; /* use default allocation functions */ |
1165 | 0 | s.zfree = Z_NULL; |
1166 | 0 | s.opaque = Z_NULL; |
1167 | 0 | s.next_in = (unsigned char *)cdata; |
1168 | 0 | s.avail_in = csize; |
1169 | 0 | s.total_in = 0; |
1170 | 0 | s.next_out = data; |
1171 | 0 | s.avail_out = data_alloc; |
1172 | 0 | s.total_out = 0; |
1173 | | |
1174 | | //err = inflateInit(&s); |
1175 | 0 | err = inflateInit2(&s, 15 + 32); |
1176 | 0 | if (err != Z_OK) { |
1177 | 0 | hts_log_error("Call to zlib inflateInit failed: %s", s.msg); |
1178 | 0 | free(data); |
1179 | 0 | return NULL; |
1180 | 0 | } |
1181 | | |
1182 | | /* Decode to 'data' array */ |
1183 | 0 | for (;s.avail_in;) { |
1184 | 0 | unsigned char *data_tmp; |
1185 | 0 | int alloc_inc; |
1186 | |
|
1187 | 0 | s.next_out = &data[s.total_out]; |
1188 | 0 | err = inflate(&s, Z_NO_FLUSH); |
1189 | 0 | if (err == Z_STREAM_END) |
1190 | 0 | break; |
1191 | | |
1192 | 0 | if (err != Z_OK) { |
1193 | 0 | hts_log_error("Call to zlib inflate failed: %s", s.msg); |
1194 | 0 | free(data); |
1195 | 0 | inflateEnd(&s); |
1196 | 0 | return NULL; |
1197 | 0 | } |
1198 | | |
1199 | | /* More to come, so realloc based on growth so far */ |
1200 | 0 | alloc_inc = (double)s.avail_in/s.total_in * s.total_out + 100; |
1201 | 0 | data = realloc((data_tmp = data), data_alloc += alloc_inc); |
1202 | 0 | if (!data) { |
1203 | 0 | free(data_tmp); |
1204 | 0 | inflateEnd(&s); |
1205 | 0 | return NULL; |
1206 | 0 | } |
1207 | 0 | s.avail_out += alloc_inc; |
1208 | 0 | } |
1209 | 0 | inflateEnd(&s); |
1210 | |
|
1211 | 0 | *size = s.total_out; |
1212 | 0 | return (char *)data; |
1213 | 0 | } |
1214 | | #endif |
1215 | | |
1216 | | #if !defined(HAVE_LIBDEFLATE) || LIBDEFLATE_VERSION_MAJOR < 1 || (LIBDEFLATE_VERSION_MAJOR == 1 && LIBDEFLATE_VERSION_MINOR <= 8) |
1217 | | static char *zlib_mem_deflate(char *data, size_t size, size_t *cdata_size, |
1218 | 0 | int level, int strat) { |
1219 | 0 | z_stream s; |
1220 | 0 | unsigned char *cdata = NULL; /* Compressed output */ |
1221 | 0 | int cdata_alloc = 0; |
1222 | 0 | int cdata_pos = 0; |
1223 | 0 | int err; |
1224 | |
|
1225 | 0 | cdata = malloc(cdata_alloc = size*1.05+100); |
1226 | 0 | if (!cdata) |
1227 | 0 | return NULL; |
1228 | 0 | cdata_pos = 0; |
1229 | | |
1230 | | /* Initialise zlib stream */ |
1231 | 0 | s.zalloc = Z_NULL; /* use default allocation functions */ |
1232 | 0 | s.zfree = Z_NULL; |
1233 | 0 | s.opaque = Z_NULL; |
1234 | 0 | s.next_in = (unsigned char *)data; |
1235 | 0 | s.avail_in = size; |
1236 | 0 | s.total_in = 0; |
1237 | 0 | s.next_out = cdata; |
1238 | 0 | s.avail_out = cdata_alloc; |
1239 | 0 | s.total_out = 0; |
1240 | 0 | s.data_type = Z_BINARY; |
1241 | |
|
1242 | 0 | err = deflateInit2(&s, level, Z_DEFLATED, 15|16, 9, strat); |
1243 | 0 | if (err != Z_OK) { |
1244 | 0 | hts_log_error("Call to zlib deflateInit2 failed: %s", s.msg); |
1245 | 0 | return NULL; |
1246 | 0 | } |
1247 | | |
1248 | | /* Encode to 'cdata' array */ |
1249 | 0 | for (;s.avail_in;) { |
1250 | 0 | s.next_out = &cdata[cdata_pos]; |
1251 | 0 | s.avail_out = cdata_alloc - cdata_pos; |
1252 | 0 | if (cdata_alloc - cdata_pos <= 0) { |
1253 | 0 | hts_log_error("Deflate produced larger output than expected"); |
1254 | 0 | return NULL; |
1255 | 0 | } |
1256 | 0 | err = deflate(&s, Z_NO_FLUSH); |
1257 | 0 | cdata_pos = cdata_alloc - s.avail_out; |
1258 | 0 | if (err != Z_OK) { |
1259 | 0 | hts_log_error("Call to zlib deflate failed: %s", s.msg); |
1260 | 0 | break; |
1261 | 0 | } |
1262 | 0 | } |
1263 | 0 | if (deflate(&s, Z_FINISH) != Z_STREAM_END) { |
1264 | 0 | hts_log_error("Call to zlib deflate failed: %s", s.msg); |
1265 | 0 | } |
1266 | 0 | *cdata_size = s.total_out; |
1267 | |
|
1268 | 0 | if (deflateEnd(&s) != Z_OK) { |
1269 | 0 | hts_log_error("Call to zlib deflate failed: %s", s.msg); |
1270 | 0 | } |
1271 | 0 | return (char *)cdata; |
1272 | 0 | } |
1273 | | #endif |
1274 | | |
1275 | | #ifdef HAVE_LIBLZMA |
1276 | | /* ------------------------------------------------------------------------ */ |
1277 | | /* |
1278 | | * Data compression routines using liblzma (xz) |
1279 | | * |
1280 | | * On a test set this shrunk the main db from 136157104 bytes to 114796168, but |
1281 | | * caused tg_index to grow from 2m43.707s to 15m3.961s. Exporting as bfastq |
1282 | | * went from 18.3s to 36.3s. So decompression suffers too, but not as bad |
1283 | | * as compression times. |
1284 | | * |
1285 | | * For now we disable this functionality. If it's to be reenabled make sure you |
1286 | | * improve the mem_inflate implementation as it's just a test hack at the |
1287 | | * moment. |
1288 | | */ |
1289 | | |
1290 | | static char *lzma_mem_deflate(char *data, size_t size, size_t *cdata_size, |
1291 | 0 | int level) { |
1292 | 0 | char *out; |
1293 | 0 | size_t out_size = lzma_stream_buffer_bound(size); |
1294 | 0 | *cdata_size = 0; |
1295 | |
|
1296 | 0 | out = malloc(out_size); |
1297 | | |
1298 | | /* Single call compression */ |
1299 | 0 | if (LZMA_OK != lzma_easy_buffer_encode(level, LZMA_CHECK_CRC32, NULL, |
1300 | 0 | (uint8_t *)data, size, |
1301 | 0 | (uint8_t *)out, cdata_size, |
1302 | 0 | out_size)) |
1303 | 0 | return NULL; |
1304 | | |
1305 | 0 | return out; |
1306 | 0 | } |
1307 | | |
1308 | 0 | static char *lzma_mem_inflate(char *cdata, size_t csize, size_t *size) { |
1309 | 0 | lzma_stream strm = LZMA_STREAM_INIT; |
1310 | 0 | size_t out_size = 0, out_pos = 0; |
1311 | 0 | char *out = NULL, *new_out; |
1312 | 0 | int r; |
1313 | | |
1314 | | /* Initiate the decoder */ |
1315 | 0 | if (LZMA_OK != lzma_stream_decoder(&strm, lzma_easy_decoder_memusage(9), 0)) |
1316 | 0 | return NULL; |
1317 | | |
1318 | | /* Decode loop */ |
1319 | 0 | strm.avail_in = csize; |
1320 | 0 | strm.next_in = (uint8_t *)cdata; |
1321 | |
|
1322 | 0 | for (;strm.avail_in;) { |
1323 | 0 | if (strm.avail_in > out_size - out_pos) { |
1324 | 0 | out_size += strm.avail_in * 4 + 32768; |
1325 | 0 | new_out = realloc(out, out_size); |
1326 | 0 | if (!new_out) |
1327 | 0 | goto fail; |
1328 | 0 | out = new_out; |
1329 | 0 | } |
1330 | 0 | strm.avail_out = out_size - out_pos; |
1331 | 0 | strm.next_out = (uint8_t *)&out[out_pos]; |
1332 | |
|
1333 | 0 | r = lzma_code(&strm, LZMA_RUN); |
1334 | 0 | if (LZMA_OK != r && LZMA_STREAM_END != r) { |
1335 | 0 | hts_log_error("LZMA decode failure (error %d)", r); |
1336 | 0 | goto fail; |
1337 | 0 | } |
1338 | | |
1339 | 0 | out_pos = strm.total_out; |
1340 | |
|
1341 | 0 | if (r == LZMA_STREAM_END) |
1342 | 0 | break; |
1343 | 0 | } |
1344 | | |
1345 | | /* finish up any unflushed data; necessary? */ |
1346 | 0 | r = lzma_code(&strm, LZMA_FINISH); |
1347 | 0 | if (r != LZMA_OK && r != LZMA_STREAM_END) { |
1348 | 0 | hts_log_error("Call to lzma_code failed with error %d", r); |
1349 | 0 | goto fail; |
1350 | 0 | } |
1351 | | |
1352 | 0 | new_out = realloc(out, strm.total_out > 0 ? strm.total_out : 1); |
1353 | 0 | if (new_out) |
1354 | 0 | out = new_out; |
1355 | 0 | *size = strm.total_out; |
1356 | |
|
1357 | 0 | lzma_end(&strm); |
1358 | |
|
1359 | 0 | return out; |
1360 | | |
1361 | 0 | fail: |
1362 | 0 | lzma_end(&strm); |
1363 | 0 | free(out); |
1364 | 0 | return NULL; |
1365 | 0 | } |
1366 | | #endif |
1367 | | |
1368 | | /* ---------------------------------------------------------------------- |
1369 | | * CRAM blocks - the dynamically growable data block. We have code to |
1370 | | * create, update, (un)compress and read/write. |
1371 | | * |
1372 | | * These are derived from the deflate_interlaced.c blocks, but with the |
1373 | | * CRAM extension of content types and IDs. |
1374 | | */ |
1375 | | |
1376 | | /* |
1377 | | * Allocates a new cram_block structure with a specified content_type and |
1378 | | * id. |
1379 | | * |
1380 | | * Returns block pointer on success |
1381 | | * NULL on failure |
1382 | | */ |
1383 | | cram_block *cram_new_block(enum cram_content_type content_type, |
1384 | 633 | int content_id) { |
1385 | 633 | cram_block *b = malloc(sizeof(*b)); |
1386 | 633 | if (!b) |
1387 | 0 | return NULL; |
1388 | 633 | b->method = b->orig_method = RAW; |
1389 | 633 | b->content_type = content_type; |
1390 | 633 | b->content_id = content_id; |
1391 | 633 | b->comp_size = 0; |
1392 | 633 | b->uncomp_size = 0; |
1393 | 633 | b->data = NULL; |
1394 | 633 | b->alloc = 0; |
1395 | 633 | b->byte = 0; |
1396 | 633 | b->bit = 7; // MSB |
1397 | 633 | b->crc32 = 0; |
1398 | 633 | b->idx = 0; |
1399 | 633 | b->m = NULL; |
1400 | | |
1401 | 633 | return b; |
1402 | 633 | } |
1403 | | |
1404 | | /* |
1405 | | * Reads a block from a cram file. |
1406 | | * Returns cram_block pointer on success. |
1407 | | * NULL on failure |
1408 | | */ |
1409 | 3.33k | cram_block *cram_read_block(cram_fd *fd) { |
1410 | 3.33k | cram_block *b = malloc(sizeof(*b)); |
1411 | 3.33k | unsigned char c; |
1412 | 3.33k | uint32_t crc = 0; |
1413 | 3.33k | if (!b) |
1414 | 0 | return NULL; |
1415 | | |
1416 | | //fprintf(stderr, "Block at %d\n", (int)ftell(fd->fp)); |
1417 | | |
1418 | 3.33k | if (-1 == (b->method = hgetc(fd->fp))) { free(b); return NULL; } |
1419 | 3.33k | c = b->method; crc = crc32(crc, &c, 1); |
1420 | 3.33k | if (-1 == (b->content_type= hgetc(fd->fp))) { free(b); return NULL; } |
1421 | 3.33k | c = b->content_type; crc = crc32(crc, &c, 1); |
1422 | 3.33k | if (-1 == fd->vv.varint_decode32_crc(fd, &b->content_id, &crc)) { free(b); return NULL; } |
1423 | 3.33k | if (-1 == fd->vv.varint_decode32_crc(fd, &b->comp_size, &crc)) { free(b); return NULL; } |
1424 | 3.33k | if (-1 == fd->vv.varint_decode32_crc(fd, &b->uncomp_size, &crc)) { free(b); return NULL; } |
1425 | | |
1426 | | //fprintf(stderr, " method %d, ctype %d, cid %d, csize %d, ucsize %d\n", |
1427 | | // b->method, b->content_type, b->content_id, b->comp_size, b->uncomp_size); |
1428 | | |
1429 | 3.33k | if (b->method == RAW) { |
1430 | 673 | if (b->uncomp_size < 0 || b->comp_size != b->uncomp_size) { |
1431 | 5 | free(b); |
1432 | 5 | return NULL; |
1433 | 5 | } |
1434 | 668 | b->alloc = b->uncomp_size; |
1435 | 668 | if (!(b->data = malloc(b->uncomp_size))){ free(b); return NULL; } |
1436 | 668 | if (b->uncomp_size != hread(fd->fp, b->data, b->uncomp_size)) { |
1437 | 4 | free(b->data); |
1438 | 4 | free(b); |
1439 | 4 | return NULL; |
1440 | 4 | } |
1441 | 2.66k | } else { |
1442 | 2.66k | if (b->comp_size < 0 || b->uncomp_size < 0) { |
1443 | 3 | free(b); |
1444 | 3 | return NULL; |
1445 | 3 | } |
1446 | 2.65k | b->alloc = b->comp_size; |
1447 | 2.65k | if (!(b->data = malloc(b->comp_size))) { free(b); return NULL; } |
1448 | 2.65k | if (b->comp_size != hread(fd->fp, b->data, b->comp_size)) { |
1449 | 11 | free(b->data); |
1450 | 11 | free(b); |
1451 | 11 | return NULL; |
1452 | 11 | } |
1453 | 2.65k | } |
1454 | | |
1455 | 3.31k | if (CRAM_MAJOR_VERS(fd->version) >= 3) { |
1456 | 0 | if (-1 == int32_decode(fd, (int32_t *)&b->crc32)) { |
1457 | 0 | free(b->data); |
1458 | 0 | free(b); |
1459 | 0 | return NULL; |
1460 | 0 | } |
1461 | | |
1462 | 0 | b->crc32_checked = fd->ignore_md5; |
1463 | 0 | b->crc_part = crc; |
1464 | 3.31k | } else { |
1465 | 3.31k | b->crc32_checked = 1; // CRC not present |
1466 | 3.31k | } |
1467 | | |
1468 | 3.31k | b->orig_method = b->method; |
1469 | 3.31k | b->idx = 0; |
1470 | 3.31k | b->byte = 0; |
1471 | 3.31k | b->bit = 7; // MSB |
1472 | | |
1473 | 3.31k | return b; |
1474 | 3.31k | } |
1475 | | |
1476 | | |
1477 | | /* |
1478 | | * Computes the size of a cram block, including the block |
1479 | | * header itself. |
1480 | | */ |
1481 | 0 | uint32_t cram_block_size(cram_block *b) { |
1482 | 0 | unsigned char dat[100], *cp = dat;; |
1483 | 0 | uint32_t sz; |
1484 | |
|
1485 | 0 | *cp++ = b->method; |
1486 | 0 | *cp++ = b->content_type; |
1487 | 0 | cp += itf8_put((char*)cp, b->content_id); |
1488 | 0 | cp += itf8_put((char*)cp, b->comp_size); |
1489 | 0 | cp += itf8_put((char*)cp, b->uncomp_size); |
1490 | |
|
1491 | 0 | sz = cp-dat + 4; |
1492 | 0 | sz += b->method == RAW ? b->uncomp_size : b->comp_size; |
1493 | |
|
1494 | 0 | return sz; |
1495 | 0 | } |
1496 | | |
1497 | | /* |
1498 | | * Writes a CRAM block. |
1499 | | * Returns 0 on success |
1500 | | * -1 on failure |
1501 | | */ |
1502 | 0 | int cram_write_block(cram_fd *fd, cram_block *b) { |
1503 | 0 | char vardata[100]; |
1504 | 0 | int vardata_o = 0; |
1505 | |
|
1506 | 0 | assert(b->method != RAW || (b->comp_size == b->uncomp_size)); |
1507 | | |
1508 | 0 | if (hputc(b->method, fd->fp) == EOF) return -1; |
1509 | 0 | if (hputc(b->content_type, fd->fp) == EOF) return -1; |
1510 | 0 | vardata_o += fd->vv.varint_put32(vardata , vardata+100, b->content_id); |
1511 | 0 | vardata_o += fd->vv.varint_put32(vardata+vardata_o, vardata+100, b->comp_size); |
1512 | 0 | vardata_o += fd->vv.varint_put32(vardata+vardata_o, vardata+100, b->uncomp_size); |
1513 | 0 | if (vardata_o != hwrite(fd->fp, vardata, vardata_o)) |
1514 | 0 | return -1; |
1515 | | |
1516 | 0 | if (b->data) { |
1517 | 0 | if (b->method == RAW) { |
1518 | 0 | if (b->uncomp_size != hwrite(fd->fp, b->data, b->uncomp_size)) |
1519 | 0 | return -1; |
1520 | 0 | } else { |
1521 | 0 | if (b->comp_size != hwrite(fd->fp, b->data, b->comp_size)) |
1522 | 0 | return -1; |
1523 | 0 | } |
1524 | 0 | } else { |
1525 | | // Absent blocks should be size 0 |
1526 | 0 | assert(b->method == RAW && b->uncomp_size == 0); |
1527 | 0 | } |
1528 | | |
1529 | 0 | if (CRAM_MAJOR_VERS(fd->version) >= 3) { |
1530 | 0 | char dat[100], *cp = (char *)dat; |
1531 | 0 | uint32_t crc; |
1532 | |
|
1533 | 0 | *cp++ = b->method; |
1534 | 0 | *cp++ = b->content_type; |
1535 | 0 | cp += fd->vv.varint_put32(cp, dat+100, b->content_id); |
1536 | 0 | cp += fd->vv.varint_put32(cp, dat+100, b->comp_size); |
1537 | 0 | cp += fd->vv.varint_put32(cp, dat+100, b->uncomp_size); |
1538 | 0 | crc = crc32(0L, (uc *)dat, cp-dat); |
1539 | |
|
1540 | 0 | if (b->method == RAW) { |
1541 | 0 | b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->uncomp_size); |
1542 | 0 | } else { |
1543 | 0 | b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->comp_size); |
1544 | 0 | } |
1545 | |
|
1546 | 0 | if (-1 == int32_encode(fd, b->crc32)) |
1547 | 0 | return -1; |
1548 | 0 | } |
1549 | | |
1550 | 0 | return 0; |
1551 | 0 | } |
1552 | | |
1553 | | /* |
1554 | | * Frees a CRAM block, deallocating internal data too. |
1555 | | */ |
1556 | 4.12k | void cram_free_block(cram_block *b) { |
1557 | 4.12k | if (!b) |
1558 | 180 | return; |
1559 | 3.94k | if (b->data) |
1560 | 3.72k | free(b->data); |
1561 | 3.94k | free(b); |
1562 | 3.94k | } |
1563 | | |
1564 | | /* |
1565 | | * Uncompresses a CRAM block, if compressed. |
1566 | | */ |
1567 | 1.02k | int cram_uncompress_block(cram_block *b) { |
1568 | 1.02k | char *uncomp; |
1569 | 1.02k | size_t uncomp_size = 0; |
1570 | | |
1571 | 1.02k | if (b->crc32_checked == 0) { |
1572 | 0 | uint32_t crc = crc32(b->crc_part, b->data ? b->data : (uc *)"", b->alloc); |
1573 | 0 | b->crc32_checked = 1; |
1574 | 0 | if (crc != b->crc32) { |
1575 | 0 | hts_log_error("Block CRC32 failure"); |
1576 | 0 | return -1; |
1577 | 0 | } |
1578 | 0 | } |
1579 | | |
1580 | 1.02k | if (b->uncomp_size == 0) { |
1581 | | // blank block |
1582 | 432 | b->method = RAW; |
1583 | 432 | return 0; |
1584 | 432 | } |
1585 | 592 | assert(b->uncomp_size >= 0); // cram_read_block should ensure this |
1586 | | |
1587 | 592 | switch (b->method) { |
1588 | 10 | case RAW: |
1589 | 10 | return 0; |
1590 | | |
1591 | 0 | case GZIP: |
1592 | 0 | uncomp_size = b->uncomp_size; |
1593 | 0 | uncomp = zlib_mem_inflate((char *)b->data, b->comp_size, &uncomp_size); |
1594 | |
|
1595 | 0 | if (!uncomp) |
1596 | 0 | return -1; |
1597 | 0 | if (uncomp_size != b->uncomp_size) { |
1598 | 0 | free(uncomp); |
1599 | 0 | return -1; |
1600 | 0 | } |
1601 | 0 | free(b->data); |
1602 | 0 | b->data = (unsigned char *)uncomp; |
1603 | 0 | b->alloc = uncomp_size; |
1604 | 0 | b->method = RAW; |
1605 | 0 | break; |
1606 | | |
1607 | 0 | #ifdef HAVE_LIBBZ2 |
1608 | 0 | case BZIP2: { |
1609 | 0 | unsigned int usize = b->uncomp_size; |
1610 | 0 | if (!(uncomp = malloc(usize))) |
1611 | 0 | return -1; |
1612 | 0 | if (BZ_OK != BZ2_bzBuffToBuffDecompress(uncomp, &usize, |
1613 | 0 | (char *)b->data, b->comp_size, |
1614 | 0 | 0, 0)) { |
1615 | 0 | free(uncomp); |
1616 | 0 | return -1; |
1617 | 0 | } |
1618 | 0 | free(b->data); |
1619 | 0 | b->data = (unsigned char *)uncomp; |
1620 | 0 | b->alloc = usize; |
1621 | 0 | b->method = RAW; |
1622 | 0 | b->uncomp_size = usize; // Just in case it differs |
1623 | 0 | break; |
1624 | 0 | } |
1625 | | #else |
1626 | | case BZIP2: |
1627 | | hts_log_error("Bzip2 compression is not compiled into this version. Please rebuild and try again"); |
1628 | | return -1; |
1629 | | #endif |
1630 | | |
1631 | 0 | #ifdef HAVE_LIBLZMA |
1632 | 0 | case LZMA: |
1633 | 0 | uncomp = lzma_mem_inflate((char *)b->data, b->comp_size, &uncomp_size); |
1634 | 0 | if (!uncomp) |
1635 | 0 | return -1; |
1636 | 0 | if (uncomp_size != b->uncomp_size) { |
1637 | 0 | free(uncomp); |
1638 | 0 | return -1; |
1639 | 0 | } |
1640 | 0 | free(b->data); |
1641 | 0 | b->data = (unsigned char *)uncomp; |
1642 | 0 | b->alloc = uncomp_size; |
1643 | 0 | b->method = RAW; |
1644 | 0 | break; |
1645 | | #else |
1646 | | case LZMA: |
1647 | | hts_log_error("Lzma compression is not compiled into this version. Please rebuild and try again"); |
1648 | | return -1; |
1649 | | break; |
1650 | | #endif |
1651 | | |
1652 | 1 | case RANS: { |
1653 | 1 | unsigned int usize = b->uncomp_size, usize2; |
1654 | 1 | uncomp = (char *)rans_uncompress(b->data, b->comp_size, &usize2); |
1655 | 1 | if (!uncomp) |
1656 | 1 | return -1; |
1657 | 0 | if (usize != usize2) { |
1658 | 0 | free(uncomp); |
1659 | 0 | return -1; |
1660 | 0 | } |
1661 | 0 | free(b->data); |
1662 | 0 | b->data = (unsigned char *)uncomp; |
1663 | 0 | b->alloc = usize2; |
1664 | 0 | b->method = RAW; |
1665 | 0 | b->uncomp_size = usize2; // Just in case it differs |
1666 | | //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size); |
1667 | 0 | break; |
1668 | 0 | } |
1669 | | |
1670 | 82 | case FQZ: { |
1671 | 82 | uncomp_size = b->uncomp_size; |
1672 | 82 | uncomp = fqz_decompress((char *)b->data, b->comp_size, &uncomp_size, NULL, 0); |
1673 | 82 | if (!uncomp) |
1674 | 18 | return -1; |
1675 | 64 | free(b->data); |
1676 | 64 | b->data = (unsigned char *)uncomp; |
1677 | 64 | b->alloc = uncomp_size; |
1678 | 64 | b->method = RAW; |
1679 | 64 | b->uncomp_size = uncomp_size; |
1680 | 64 | break; |
1681 | 82 | } |
1682 | | |
1683 | 127 | case RANS_PR0: { |
1684 | 127 | unsigned int usize = b->uncomp_size, usize2; |
1685 | 127 | uncomp = (char *)rans_uncompress_4x16(b->data, b->comp_size, &usize2); |
1686 | 127 | if (!uncomp) |
1687 | 2 | return -1; |
1688 | 125 | if (usize != usize2) { |
1689 | 0 | free(uncomp); |
1690 | 0 | return -1; |
1691 | 0 | } |
1692 | 125 | b->orig_method = RANS_PR0 + (b->data[0]&1) |
1693 | 125 | + 2*((b->data[0]&0x40)>0) + 4*((b->data[0]&0x80)>0); |
1694 | 125 | free(b->data); |
1695 | 125 | b->data = (unsigned char *)uncomp; |
1696 | 125 | b->alloc = usize2; |
1697 | 125 | b->method = RAW; |
1698 | 125 | b->uncomp_size = usize2; // Just incase it differs |
1699 | | //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size); |
1700 | 125 | break; |
1701 | 125 | } |
1702 | | |
1703 | 245 | case ARITH_PR0: { |
1704 | 245 | unsigned int usize = b->uncomp_size, usize2; |
1705 | 245 | uncomp = (char *)arith_uncompress_to(b->data, b->comp_size, NULL, &usize2); |
1706 | 245 | if (!uncomp) |
1707 | 0 | return -1; |
1708 | 245 | if (usize != usize2) { |
1709 | 0 | free(uncomp); |
1710 | 0 | return -1; |
1711 | 0 | } |
1712 | 245 | b->orig_method = ARITH_PR0 + (b->data[0]&1) |
1713 | 245 | + 2*((b->data[0]&0x40)>0) + 4*((b->data[0]&0x80)>0); |
1714 | 245 | free(b->data); |
1715 | 245 | b->data = (unsigned char *)uncomp; |
1716 | 245 | b->alloc = usize2; |
1717 | 245 | b->method = RAW; |
1718 | 245 | b->uncomp_size = usize2; // Just incase it differs |
1719 | | //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size); |
1720 | 245 | break; |
1721 | 245 | } |
1722 | | |
1723 | 126 | case TOK3: { |
1724 | 126 | uint32_t out_len; |
1725 | 126 | uint8_t *cp = tok3_decode_names(b->data, b->comp_size, &out_len); |
1726 | 126 | if (!cp) |
1727 | 124 | return -1; |
1728 | 2 | b->orig_method = TOK3; |
1729 | 2 | b->method = RAW; |
1730 | 2 | free(b->data); |
1731 | 2 | b->data = cp; |
1732 | 2 | b->alloc = out_len; |
1733 | 2 | b->uncomp_size = out_len; |
1734 | 2 | break; |
1735 | 126 | } |
1736 | | |
1737 | 1 | default: |
1738 | 1 | return -1; |
1739 | 592 | } |
1740 | | |
1741 | 436 | return 0; |
1742 | 592 | } |
1743 | | |
1744 | | static char *cram_compress_by_method(cram_slice *s, char *in, size_t in_size, |
1745 | | int content_id, size_t *out_size, |
1746 | | enum cram_block_method_int method, |
1747 | 0 | int level, int strat) { |
1748 | 0 | switch (method) { |
1749 | 0 | case GZIP: |
1750 | 0 | case GZIP_RLE: |
1751 | 0 | case GZIP_1: |
1752 | | // Read names bizarrely benefit from zlib over libdeflate for |
1753 | | // mid-range compression levels. Focusing purely of ratio or |
1754 | | // speed, libdeflate still wins. It also seems to win for |
1755 | | // other data series too. |
1756 | | // |
1757 | | // Eg RN at level 5; libdeflate=55.9MB zlib=51.6MB |
1758 | | #ifdef HAVE_LIBDEFLATE |
1759 | | # if (LIBDEFLATE_VERSION_MAJOR < 1 || (LIBDEFLATE_VERSION_MAJOR == 1 && LIBDEFLATE_VERSION_MINOR <= 8)) |
1760 | | if (content_id == DS_RN && level >= 4 && level <= 7) |
1761 | | return zlib_mem_deflate(in, in_size, out_size, level, strat); |
1762 | | else |
1763 | | # endif |
1764 | | return libdeflate_deflate(in, in_size, out_size, level, strat); |
1765 | | #else |
1766 | 0 | return zlib_mem_deflate(in, in_size, out_size, level, strat); |
1767 | 0 | #endif |
1768 | | |
1769 | 0 | case BZIP2: { |
1770 | 0 | #ifdef HAVE_LIBBZ2 |
1771 | 0 | unsigned int comp_size = in_size*1.01 + 600; |
1772 | 0 | char *comp = malloc(comp_size); |
1773 | 0 | if (!comp) |
1774 | 0 | return NULL; |
1775 | | |
1776 | 0 | if (BZ_OK != BZ2_bzBuffToBuffCompress(comp, &comp_size, |
1777 | 0 | in, in_size, |
1778 | 0 | level, 0, 30)) { |
1779 | 0 | free(comp); |
1780 | 0 | return NULL; |
1781 | 0 | } |
1782 | 0 | *out_size = comp_size; |
1783 | 0 | return comp; |
1784 | | #else |
1785 | | return NULL; |
1786 | | #endif |
1787 | 0 | } |
1788 | | |
1789 | 0 | case FQZ: |
1790 | 0 | case FQZ_b: |
1791 | 0 | case FQZ_c: |
1792 | 0 | case FQZ_d: { |
1793 | | // Extract the necessary portion of the slice into an fqz_slice struct. |
1794 | | // These previously were the same thing, but this permits us to detach |
1795 | | // the codec from the rest of this CRAM implementation. |
1796 | 0 | fqz_slice *f = malloc(2*s->hdr->num_records * sizeof(uint32_t) + sizeof(fqz_slice)); |
1797 | 0 | if (!f) |
1798 | 0 | return NULL; |
1799 | 0 | f->num_records = s->hdr->num_records; |
1800 | 0 | f->len = (uint32_t *)(((char *)f) + sizeof(fqz_slice)); |
1801 | 0 | f->flags = f->len + s->hdr->num_records; |
1802 | 0 | int i; |
1803 | 0 | for (i = 0; i < s->hdr->num_records; i++) { |
1804 | 0 | f->flags[i] = s->crecs[i].flags; |
1805 | 0 | f->len[i] = (i+1 < s->hdr->num_records |
1806 | 0 | ? s->crecs[i+1].qual - s->crecs[i].qual |
1807 | 0 | : s->block[DS_QS]->uncomp_size - s->crecs[i].qual); |
1808 | 0 | } |
1809 | 0 | char *comp = fqz_compress(strat & 0xff /* cram vers */, f, |
1810 | 0 | in, in_size, out_size, strat >> 8, NULL); |
1811 | 0 | free(f); |
1812 | 0 | return comp; |
1813 | 0 | } |
1814 | | |
1815 | 0 | case LZMA: |
1816 | 0 | #ifdef HAVE_LIBLZMA |
1817 | 0 | return lzma_mem_deflate(in, in_size, out_size, level); |
1818 | | #else |
1819 | | return NULL; |
1820 | | #endif |
1821 | | |
1822 | 0 | case RANS0: |
1823 | 0 | case RANS1: { |
1824 | 0 | unsigned int out_size_i; |
1825 | 0 | unsigned char *cp; |
1826 | 0 | cp = rans_compress((unsigned char *)in, in_size, &out_size_i, |
1827 | 0 | method == RANS0 ? 0 : 1); |
1828 | 0 | *out_size = out_size_i; |
1829 | 0 | return (char *)cp; |
1830 | 0 | } |
1831 | | |
1832 | 0 | case RANS_PR0: |
1833 | 0 | case RANS_PR1: |
1834 | 0 | case RANS_PR64: |
1835 | 0 | case RANS_PR9: |
1836 | 0 | case RANS_PR128: |
1837 | 0 | case RANS_PR129: |
1838 | 0 | case RANS_PR192: |
1839 | 0 | case RANS_PR193: { |
1840 | 0 | unsigned int out_size_i; |
1841 | 0 | unsigned char *cp; |
1842 | | |
1843 | | // see enum cram_block. We map RANS_* methods to order bit-fields |
1844 | 0 | static int methmap[] = { 1, 64,9, 128,129, 192,193 }; |
1845 | |
|
1846 | 0 | cp = rans_compress_4x16((unsigned char *)in, in_size, &out_size_i, |
1847 | 0 | method == RANS_PR0 ? 0 : methmap[method - RANS_PR1]); |
1848 | 0 | *out_size = out_size_i; |
1849 | 0 | return (char *)cp; |
1850 | 0 | } |
1851 | | |
1852 | 0 | case ARITH_PR0: |
1853 | 0 | case ARITH_PR1: |
1854 | 0 | case ARITH_PR64: |
1855 | 0 | case ARITH_PR9: |
1856 | 0 | case ARITH_PR128: |
1857 | 0 | case ARITH_PR129: |
1858 | 0 | case ARITH_PR192: |
1859 | 0 | case ARITH_PR193: { |
1860 | 0 | unsigned int out_size_i; |
1861 | 0 | unsigned char *cp; |
1862 | | |
1863 | | // see enum cram_block. We map ARITH_* methods to order bit-fields |
1864 | 0 | static int methmap[] = { 1, 64,9, 128,129, 192,193 }; |
1865 | |
|
1866 | 0 | cp = arith_compress_to((unsigned char *)in, in_size, NULL, &out_size_i, |
1867 | 0 | method == ARITH_PR0 ? 0 : methmap[method - ARITH_PR1]); |
1868 | 0 | *out_size = out_size_i; |
1869 | 0 | return (char *)cp; |
1870 | 0 | } |
1871 | | |
1872 | 0 | case TOK3: |
1873 | 0 | case TOKA: { |
1874 | 0 | int out_len; |
1875 | 0 | int lev = level; |
1876 | 0 | if (method == TOK3 && lev > 3) |
1877 | 0 | lev = 3; |
1878 | 0 | uint8_t *cp = tok3_encode_names(in, in_size, lev, strat, &out_len, NULL); |
1879 | 0 | *out_size = out_len; |
1880 | 0 | return (char *)cp; |
1881 | 0 | } |
1882 | | |
1883 | 0 | case RAW: |
1884 | 0 | break; |
1885 | | |
1886 | 0 | default: |
1887 | 0 | return NULL; |
1888 | 0 | } |
1889 | | |
1890 | 0 | return NULL; |
1891 | 0 | } |
1892 | | |
1893 | | |
1894 | | /* |
1895 | | * Compresses a block using one of two different zlib strategies. If we only |
1896 | | * want one choice set strat2 to be -1. |
1897 | | * |
1898 | | * The logic here is that sometimes Z_RLE does a better job than Z_FILTERED |
1899 | | * or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is |
1900 | | * significantly faster. |
1901 | | * |
1902 | | * Method and level -1 implies defaults, as specified in cram_fd. |
1903 | | */ |
1904 | | int cram_compress_block2(cram_fd *fd, cram_slice *s, |
1905 | | cram_block *b, cram_metrics *metrics, |
1906 | 0 | int method, int level) { |
1907 | |
|
1908 | 0 | if (!b) |
1909 | 0 | return 0; |
1910 | | |
1911 | 0 | char *comp = NULL; |
1912 | 0 | size_t comp_size = 0; |
1913 | 0 | int strat; |
1914 | | |
1915 | | // Internally we have parameterised methods that externally map |
1916 | | // to the same CRAM method value. |
1917 | | // See enum_cram_block_method_int in cram_structs.h. |
1918 | 0 | int methmap[] = { |
1919 | | // Externally defined values |
1920 | 0 | RAW, GZIP, BZIP2, LZMA, RANS, RANSPR, ARITH, FQZ, TOK3, |
1921 | | |
1922 | | // Reserved for possible expansion |
1923 | 0 | 0, 0, |
1924 | | |
1925 | | // Internally parameterised versions matching back to above |
1926 | | // external values |
1927 | 0 | GZIP, GZIP, |
1928 | 0 | FQZ, FQZ, FQZ, |
1929 | 0 | RANS, |
1930 | 0 | RANSPR, RANSPR, RANSPR, RANSPR, RANSPR, RANSPR, RANSPR, |
1931 | 0 | TOK3, |
1932 | 0 | ARITH, ARITH, ARITH, ARITH, ARITH, ARITH, ARITH, |
1933 | 0 | }; |
1934 | |
|
1935 | 0 | if (b->method != RAW) { |
1936 | | // Maybe already compressed if s->block[0] was compressed and |
1937 | | // we have e.g. s->block[DS_BA] set to s->block[0] due to only |
1938 | | // one base type present and hence using E_HUFFMAN on block 0. |
1939 | | // A second explicit attempt to compress the same block then |
1940 | | // occurs. |
1941 | 0 | return 0; |
1942 | 0 | } |
1943 | | |
1944 | 0 | if (method == -1) { |
1945 | 0 | method = 1<<GZIP; |
1946 | 0 | if (fd->use_bz2) |
1947 | 0 | method |= 1<<BZIP2; |
1948 | 0 | if (fd->use_lzma) |
1949 | 0 | method |= 1<<LZMA; |
1950 | 0 | } |
1951 | |
|
1952 | 0 | if (level == -1) |
1953 | 0 | level = fd->level; |
1954 | | |
1955 | | //fprintf(stderr, "IN: block %d, sz %d\n", b->content_id, b->uncomp_size); |
1956 | |
|
1957 | 0 | if (method == RAW || level == 0 || b->uncomp_size == 0) { |
1958 | 0 | b->method = RAW; |
1959 | 0 | b->comp_size = b->uncomp_size; |
1960 | | //fprintf(stderr, "Skip block id %d\n", b->content_id); |
1961 | 0 | return 0; |
1962 | 0 | } |
1963 | | |
1964 | 0 | #ifndef ABS |
1965 | 0 | # define ABS(a) ((a)>=0?(a):-(a)) |
1966 | 0 | #endif |
1967 | | |
1968 | 0 | if (metrics) { |
1969 | 0 | pthread_mutex_lock(&fd->metrics_lock); |
1970 | | // Sudden changes in size trigger a retrial. These are mainly |
1971 | | // triggered when switching to sorted / unsorted, where the number |
1972 | | // of elements in a slice radically changes. |
1973 | | // |
1974 | | // We also get large fluctuations based on genome coordinate for |
1975 | | // e.g. SA:Z and SC series, but we consider the typical scale of |
1976 | | // delta between blocks and use this to look for abnormality. |
1977 | 0 | if (metrics->input_avg_sz && |
1978 | 0 | (b->uncomp_size + 1000 > 4*(metrics->input_avg_sz+1000) || |
1979 | 0 | b->uncomp_size + 1000 < (metrics->input_avg_sz+1000)/4) && |
1980 | 0 | ABS(b->uncomp_size-metrics->input_avg_sz) |
1981 | 0 | > 10*metrics->input_avg_delta) { |
1982 | 0 | metrics->next_trial = 0; |
1983 | 0 | } |
1984 | |
|
1985 | 0 | if (metrics->trial > 0 || --metrics->next_trial <= 0) { |
1986 | 0 | int m, unpackable = metrics->unpackable; |
1987 | 0 | size_t sz_best = b->uncomp_size; |
1988 | 0 | size_t sz[CRAM_MAX_METHOD] = {0}; |
1989 | 0 | int method_best = 0; // RAW |
1990 | 0 | char *c_best = NULL, *c = NULL; |
1991 | |
|
1992 | 0 | metrics->input_avg_delta = |
1993 | 0 | 0.9 * (metrics->input_avg_delta + |
1994 | 0 | ABS(b->uncomp_size - metrics->input_avg_sz)); |
1995 | |
|
1996 | 0 | metrics->input_avg_sz += b->uncomp_size*.2; |
1997 | 0 | metrics->input_avg_sz *= 0.8; |
1998 | |
|
1999 | 0 | if (metrics->revised_method) |
2000 | 0 | method = metrics->revised_method; |
2001 | 0 | else |
2002 | 0 | metrics->revised_method = method; |
2003 | |
|
2004 | 0 | if (metrics->next_trial <= 0) { |
2005 | 0 | metrics->next_trial = TRIAL_SPAN; |
2006 | 0 | metrics->trial = NTRIALS; |
2007 | 0 | for (m = 0; m < CRAM_MAX_METHOD; m++) |
2008 | 0 | metrics->sz[m] /= 2; |
2009 | 0 | metrics->unpackable = 0; |
2010 | 0 | } |
2011 | | |
2012 | | // Compress this block using the best method |
2013 | 0 | if (unpackable && CRAM_MAJOR_VERS(fd->version) > 3) { |
2014 | | // No point trying bit-pack if 17+ symbols. |
2015 | 0 | if (method & (1<<RANS_PR128)) |
2016 | 0 | method = (method|(1<<RANS_PR0))&~(1<<RANS_PR128); |
2017 | 0 | if (method & (1<<RANS_PR129)) |
2018 | 0 | method = (method|(1<<RANS_PR1))&~(1<<RANS_PR129); |
2019 | 0 | if (method & (1<<RANS_PR192)) |
2020 | 0 | method = (method|(1<<RANS_PR64))&~(1<<RANS_PR192); |
2021 | 0 | if (method & (1<<RANS_PR193)) |
2022 | 0 | method = (method|(1<<RANS_PR64)|(1<<RANS_PR1))&~(1<<RANS_PR193); |
2023 | |
|
2024 | 0 | if (method & (1<<ARITH_PR128)) |
2025 | 0 | method = (method|(1<<ARITH_PR0))&~(1<<ARITH_PR128); |
2026 | 0 | if (method & (1<<ARITH_PR129)) |
2027 | 0 | method = (method|(1<<ARITH_PR1))&~(1<<ARITH_PR129); |
2028 | 0 | if (method & (1<<ARITH_PR192)) |
2029 | 0 | method = (method|(1<<ARITH_PR64))&~(1<<ARITH_PR192); |
2030 | 0 | if (method & (1u<<ARITH_PR193)) |
2031 | 0 | method = (method|(1<<ARITH_PR64)|(1<<ARITH_PR1))&~(1u<<ARITH_PR193); |
2032 | 0 | } |
2033 | | |
2034 | | // Libdeflate doesn't have a Z_RLE strategy. |
2035 | | // We treat it as level 1, but iff we haven't also |
2036 | | // explicitly listed that in the method list. |
2037 | | #ifdef HAVE_LIBDEFLATE |
2038 | | if ((method & (1<<GZIP_RLE)) && (method & (1<<GZIP_1))) |
2039 | | method &= ~(1<<GZIP_RLE); |
2040 | | #endif |
2041 | |
|
2042 | 0 | pthread_mutex_unlock(&fd->metrics_lock); |
2043 | |
|
2044 | 0 | for (m = 0; m < CRAM_MAX_METHOD; m++) { |
2045 | 0 | if (method & (1u<<m)) { |
2046 | 0 | int lvl = level; |
2047 | 0 | switch (m) { |
2048 | 0 | case GZIP: strat = Z_FILTERED; break; |
2049 | 0 | case GZIP_1: strat = Z_DEFAULT_STRATEGY; lvl = 1; break; |
2050 | 0 | case GZIP_RLE: strat = Z_RLE; break; |
2051 | 0 | case FQZ: strat = CRAM_MAJOR_VERS(fd->version); break; |
2052 | 0 | case FQZ_b: strat = CRAM_MAJOR_VERS(fd->version)+256; break; |
2053 | 0 | case FQZ_c: strat = CRAM_MAJOR_VERS(fd->version)+2*256; break; |
2054 | 0 | case FQZ_d: strat = CRAM_MAJOR_VERS(fd->version)+3*256; break; |
2055 | 0 | case TOK3: strat = 0; break; |
2056 | 0 | case TOKA: strat = 1; break; |
2057 | 0 | default: strat = 0; |
2058 | 0 | } |
2059 | | |
2060 | 0 | c = cram_compress_by_method(s, (char *)b->data, b->uncomp_size, |
2061 | 0 | b->content_id, &sz[m], m, lvl, strat); |
2062 | |
|
2063 | 0 | if (c && sz_best > sz[m]) { |
2064 | 0 | sz_best = sz[m]; |
2065 | 0 | method_best = m; |
2066 | 0 | if (c_best) |
2067 | 0 | free(c_best); |
2068 | 0 | c_best = c; |
2069 | 0 | } else if (c) { |
2070 | 0 | free(c); |
2071 | 0 | } else { |
2072 | 0 | sz[m] = b->uncomp_size*2+1000; // arbitrarily worse than raw |
2073 | 0 | } |
2074 | 0 | } else { |
2075 | 0 | sz[m] = b->uncomp_size*2+1000; // arbitrarily worse than raw |
2076 | 0 | } |
2077 | 0 | } |
2078 | | |
2079 | 0 | if (c_best) { |
2080 | 0 | free(b->data); |
2081 | 0 | b->data = (unsigned char *)c_best; |
2082 | 0 | b->method = method_best; // adjusted to methmap[method_best] later |
2083 | 0 | b->comp_size = sz_best; |
2084 | 0 | } |
2085 | | |
2086 | | // Accumulate stats for all methods tried |
2087 | 0 | pthread_mutex_lock(&fd->metrics_lock); |
2088 | 0 | for (m = 0; m < CRAM_MAX_METHOD; m++) |
2089 | | // don't be overly sure on small blocks. |
2090 | | // +2000 means eg bzip2 vs gzip (1.07 to 1.04) or gz vs rans1 |
2091 | | // needs to be at least 60 bytes smaller to overcome the |
2092 | | // fixed size addition. |
2093 | 0 | metrics->sz[m] += sz[m]+2000; |
2094 | | |
2095 | | // When enough trials performed, find the best on average |
2096 | 0 | if (--metrics->trial == 0) { |
2097 | 0 | int best_method = RAW; |
2098 | 0 | int best_sz = INT_MAX; |
2099 | | |
2100 | | // Relative costs of methods. See enum_cram_block_method_int |
2101 | | // and methmap |
2102 | 0 | double meth_cost[32] = { |
2103 | | // Externally defined methods |
2104 | 0 | 1, // 0 raw |
2105 | 0 | 1.04, // 1 gzip (Z_FILTERED) |
2106 | 0 | 1.07, // 2 bzip2 |
2107 | 0 | 1.08, // 3 lzma |
2108 | 0 | 1.00, // 4 rans (O0) |
2109 | 0 | 1.00, // 5 ranspr (O0) |
2110 | 0 | 1.04, // 6 arithpr (O0) |
2111 | 0 | 1.05, // 7 fqz |
2112 | 0 | 1.05, // 8 tok3 (rans) |
2113 | 0 | 1.00, 1.00, // 9,10 reserved |
2114 | | |
2115 | | // Paramterised versions of above |
2116 | 0 | 1.01, // gzip rle |
2117 | 0 | 1.01, // gzip -1 |
2118 | |
|
2119 | 0 | 1.05, 1.05, 1.05, // FQZ_b,c,d |
2120 | |
|
2121 | 0 | 1.01, // rans O1 |
2122 | |
|
2123 | 0 | 1.01, // rans_pr1 |
2124 | 0 | 1.00, // rans_pr64; if smaller, usually fast |
2125 | 0 | 1.03, // rans_pr65/9 |
2126 | 0 | 1.00, // rans_pr128 |
2127 | 0 | 1.01, // rans_pr129 |
2128 | 0 | 1.00, // rans_pr192 |
2129 | 0 | 1.01, // rans_pr193 |
2130 | |
|
2131 | 0 | 1.07, // tok3 arith |
2132 | |
|
2133 | 0 | 1.04, // arith_pr1 |
2134 | 0 | 1.04, // arith_pr64 |
2135 | 0 | 1.04, // arith_pr9 |
2136 | 0 | 1.03, // arith_pr128 |
2137 | 0 | 1.04, // arith_pr129 |
2138 | 0 | 1.04, // arith_pr192 |
2139 | 0 | 1.04, // arith_pr193 |
2140 | 0 | }; |
2141 | | |
2142 | | // Scale methods by cost based on compression level |
2143 | 0 | if (fd->level <= 1) { |
2144 | 0 | for (m = 0; m < CRAM_MAX_METHOD; m++) |
2145 | 0 | metrics->sz[m] *= 1+(meth_cost[m]-1)*4; |
2146 | 0 | } else if (fd->level <= 3) { |
2147 | 0 | for (m = 0; m < CRAM_MAX_METHOD; m++) |
2148 | 0 | metrics->sz[m] *= 1+(meth_cost[m]-1); |
2149 | 0 | } else if (fd->level <= 6) { |
2150 | 0 | for (m = 0; m < CRAM_MAX_METHOD; m++) |
2151 | 0 | metrics->sz[m] *= 1+(meth_cost[m]-1)/2; |
2152 | 0 | } else if (fd->level <= 7) { |
2153 | 0 | for (m = 0; m < CRAM_MAX_METHOD; m++) |
2154 | 0 | metrics->sz[m] *= 1+(meth_cost[m]-1)/3; |
2155 | 0 | } // else cost is ignored |
2156 | | |
2157 | | // Ensure these are never used; BSC and ZSTD |
2158 | 0 | metrics->sz[9] = metrics->sz[10] = INT_MAX; |
2159 | |
|
2160 | 0 | for (m = 0; m < CRAM_MAX_METHOD; m++) { |
2161 | 0 | if ((!metrics->sz[m]) || (!(method & (1u<<m)))) |
2162 | 0 | continue; |
2163 | | |
2164 | 0 | if (best_sz > metrics->sz[m]) |
2165 | 0 | best_sz = metrics->sz[m], best_method = m; |
2166 | 0 | } |
2167 | |
|
2168 | 0 | if (best_method != metrics->method) { |
2169 | | //metrics->trial = (NTRIALS+1)/2; // be sure |
2170 | | //metrics->next_trial /= 1.5; |
2171 | 0 | metrics->consistency = 0; |
2172 | 0 | } else { |
2173 | 0 | metrics->next_trial *= MIN(2, 1+metrics->consistency/4.0); |
2174 | 0 | metrics->consistency++; |
2175 | 0 | } |
2176 | |
|
2177 | 0 | metrics->method = best_method; |
2178 | 0 | switch (best_method) { |
2179 | 0 | case GZIP: strat = Z_FILTERED; break; |
2180 | 0 | case GZIP_1: strat = Z_DEFAULT_STRATEGY; break; |
2181 | 0 | case GZIP_RLE: strat = Z_RLE; break; |
2182 | 0 | case FQZ: strat = CRAM_MAJOR_VERS(fd->version); break; |
2183 | 0 | case FQZ_b: strat = CRAM_MAJOR_VERS(fd->version)+256; break; |
2184 | 0 | case FQZ_c: strat = CRAM_MAJOR_VERS(fd->version)+2*256; break; |
2185 | 0 | case FQZ_d: strat = CRAM_MAJOR_VERS(fd->version)+3*256; break; |
2186 | 0 | default: strat = 0; |
2187 | 0 | } |
2188 | 0 | metrics->strat = strat; |
2189 | | |
2190 | | // If we see at least MAXFAIL trials in a row for a specific |
2191 | | // compression method with more than MAXDELTA aggregate |
2192 | | // size then we drop this from the list of methods used |
2193 | | // for this block type. |
2194 | 0 | #define MAXDELTA 0.20 |
2195 | 0 | #define MAXFAILS 4 |
2196 | 0 | for (m = 0; m < CRAM_MAX_METHOD; m++) { |
2197 | 0 | if (best_method == m) { |
2198 | 0 | metrics->cnt[m] = 0; |
2199 | 0 | metrics->extra[m] = 0; |
2200 | 0 | } else if (best_sz < metrics->sz[m]) { |
2201 | 0 | double r = (double)metrics->sz[m] / best_sz - 1; |
2202 | 0 | int mul = 1+(fd->level>=7); |
2203 | 0 | if (++metrics->cnt[m] >= MAXFAILS*mul && |
2204 | 0 | (metrics->extra[m] += r) >= MAXDELTA*mul) |
2205 | 0 | method &= ~(1u<<m); |
2206 | | |
2207 | | // Special case for fqzcomp as it rarely changes |
2208 | 0 | if (m == FQZ || m == FQZ_b || m == FQZ_c || m == FQZ_d) { |
2209 | 0 | if (metrics->sz[m] > best_sz) |
2210 | 0 | method &= ~(1u<<m); |
2211 | 0 | } |
2212 | 0 | } |
2213 | 0 | } |
2214 | | |
2215 | | //if (fd->verbose > 1 && method != metrics->revised_method) |
2216 | | // fprintf(stderr, "%d: revising method from %x to %x\n", |
2217 | | // b->content_id, metrics->revised_method, method); |
2218 | 0 | metrics->revised_method = method; |
2219 | 0 | } |
2220 | 0 | pthread_mutex_unlock(&fd->metrics_lock); |
2221 | 0 | } else { |
2222 | 0 | metrics->input_avg_delta = |
2223 | 0 | 0.9 * (metrics->input_avg_delta + |
2224 | 0 | ABS(b->uncomp_size - metrics->input_avg_sz)); |
2225 | |
|
2226 | 0 | metrics->input_avg_sz += b->uncomp_size*.2; |
2227 | 0 | metrics->input_avg_sz *= 0.8; |
2228 | |
|
2229 | 0 | strat = metrics->strat; |
2230 | 0 | method = metrics->method; |
2231 | |
|
2232 | 0 | pthread_mutex_unlock(&fd->metrics_lock); |
2233 | 0 | comp = cram_compress_by_method(s, (char *)b->data, b->uncomp_size, |
2234 | 0 | b->content_id, &comp_size, method, |
2235 | 0 | method == GZIP_1 ? 1 : level, |
2236 | 0 | strat); |
2237 | 0 | if (!comp) |
2238 | 0 | return -1; |
2239 | | |
2240 | 0 | if (comp_size < b->uncomp_size) { |
2241 | 0 | free(b->data); |
2242 | 0 | b->data = (unsigned char *)comp; |
2243 | 0 | b->comp_size = comp_size; |
2244 | 0 | b->method = method; |
2245 | 0 | } else { |
2246 | 0 | free(comp); |
2247 | 0 | } |
2248 | 0 | } |
2249 | |
|
2250 | 0 | } else { |
2251 | | // no cached metrics, so just do zlib? |
2252 | 0 | comp = cram_compress_by_method(s, (char *)b->data, b->uncomp_size, |
2253 | 0 | b->content_id, &comp_size, GZIP, level, Z_FILTERED); |
2254 | 0 | if (!comp) { |
2255 | 0 | hts_log_error("Compression failed!"); |
2256 | 0 | return -1; |
2257 | 0 | } |
2258 | | |
2259 | 0 | if (comp_size < b->uncomp_size) { |
2260 | 0 | free(b->data); |
2261 | 0 | b->data = (unsigned char *)comp; |
2262 | 0 | b->comp_size = comp_size; |
2263 | 0 | b->method = GZIP; |
2264 | 0 | } else { |
2265 | 0 | free(comp); |
2266 | 0 | } |
2267 | 0 | strat = Z_FILTERED; |
2268 | 0 | } |
2269 | | |
2270 | 0 | hts_log_info("Compressed block ID %d from %d to %d by method %s", |
2271 | 0 | b->content_id, b->uncomp_size, b->comp_size, |
2272 | 0 | cram_block_method2str(b->method)); |
2273 | |
|
2274 | 0 | b->method = methmap[b->method]; |
2275 | |
|
2276 | 0 | return 0; |
2277 | 0 | } |
2278 | | int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics, |
2279 | 0 | int method, int level) { |
2280 | 0 | return cram_compress_block2(fd, NULL, b, metrics, method, level); |
2281 | 0 | } |
2282 | | |
2283 | 11.5k | cram_metrics *cram_new_metrics(void) { |
2284 | 11.5k | cram_metrics *m = calloc(1, sizeof(*m)); |
2285 | 11.5k | if (!m) |
2286 | 0 | return NULL; |
2287 | 11.5k | m->trial = NTRIALS-1; |
2288 | 11.5k | m->next_trial = TRIAL_SPAN/2; // learn quicker at start |
2289 | 11.5k | m->method = RAW; |
2290 | 11.5k | m->strat = 0; |
2291 | 11.5k | m->revised_method = 0; |
2292 | 11.5k | m->unpackable = 0; |
2293 | | |
2294 | 11.5k | return m; |
2295 | 11.5k | } |
2296 | | |
2297 | 0 | char *cram_block_method2str(enum cram_block_method_int m) { |
2298 | 0 | switch(m) { |
2299 | 0 | case RAW: return "RAW"; |
2300 | 0 | case GZIP: return "GZIP"; |
2301 | 0 | case BZIP2: return "BZIP2"; |
2302 | 0 | case LZMA: return "LZMA"; |
2303 | 0 | case RANS0: return "RANS0"; |
2304 | 0 | case RANS1: return "RANS1"; |
2305 | 0 | case GZIP_RLE: return "GZIP_RLE"; |
2306 | 0 | case GZIP_1: return "GZIP_1"; |
2307 | 0 | case FQZ: return "FQZ"; |
2308 | 0 | case FQZ_b: return "FQZ_b"; |
2309 | 0 | case FQZ_c: return "FQZ_c"; |
2310 | 0 | case FQZ_d: return "FQZ_d"; |
2311 | 0 | case RANS_PR0: return "RANS_PR0"; |
2312 | 0 | case RANS_PR1: return "RANS_PR1"; |
2313 | 0 | case RANS_PR64: return "RANS_PR64"; |
2314 | 0 | case RANS_PR9: return "RANS_PR9"; |
2315 | 0 | case RANS_PR128: return "RANS_PR128"; |
2316 | 0 | case RANS_PR129: return "RANS_PR129"; |
2317 | 0 | case RANS_PR192: return "RANS_PR192"; |
2318 | 0 | case RANS_PR193: return "RANS_PR193"; |
2319 | 0 | case TOK3: return "TOK3_R"; |
2320 | 0 | case TOKA: return "TOK3_A"; |
2321 | 0 | case ARITH_PR0: return "ARITH_PR0"; |
2322 | 0 | case ARITH_PR1: return "ARITH_PR1"; |
2323 | 0 | case ARITH_PR64: return "ARITH_PR64"; |
2324 | 0 | case ARITH_PR9: return "ARITH_PR9"; |
2325 | 0 | case ARITH_PR128: return "ARITH_PR128"; |
2326 | 0 | case ARITH_PR129: return "ARITH_PR129"; |
2327 | 0 | case ARITH_PR192: return "ARITH_PR192"; |
2328 | 0 | case ARITH_PR193: return "ARITH_PR193"; |
2329 | 0 | case BM_ERROR: break; |
2330 | 0 | } |
2331 | 0 | return "?"; |
2332 | 0 | } |
2333 | | |
2334 | 1 | char *cram_content_type2str(enum cram_content_type t) { |
2335 | 1 | switch (t) { |
2336 | 1 | case FILE_HEADER: return "FILE_HEADER"; |
2337 | 0 | case COMPRESSION_HEADER: return "COMPRESSION_HEADER"; |
2338 | 0 | case MAPPED_SLICE: return "MAPPED_SLICE"; |
2339 | 0 | case UNMAPPED_SLICE: return "UNMAPPED_SLICE"; |
2340 | 0 | case EXTERNAL: return "EXTERNAL"; |
2341 | 0 | case CORE: return "CORE"; |
2342 | 0 | case CT_ERROR: break; |
2343 | 1 | } |
2344 | 0 | return "?"; |
2345 | 1 | } |
2346 | | |
2347 | | /* ---------------------------------------------------------------------- |
2348 | | * Reference sequence handling |
2349 | | * |
2350 | | * These revolve around the refs_t structure, which may potentially be |
2351 | | * shared between multiple cram_fd. |
2352 | | * |
2353 | | * We start with refs_create() to allocate an empty refs_t and then |
2354 | | * populate it with @SQ line data using refs_from_header(). This is done on |
2355 | | * cram_open(). Also at start up we can call cram_load_reference() which |
2356 | | * is used with "scramble -r foo.fa". This replaces the fd->refs with the |
2357 | | * new one specified. In either case refs2id() is then called which |
2358 | | * maps ref_entry names to @SQ ids (refs_t->ref_id[]). |
2359 | | * |
2360 | | * Later, possibly within a thread, we will want to know the actual ref |
2361 | | * seq itself, obtained by calling cram_get_ref(). This may use the |
2362 | | * UR: or M5: fields or the filename specified in the original |
2363 | | * cram_load_reference() call. |
2364 | | * |
2365 | | * Given the potential for multi-threaded reference usage, we have |
2366 | | * reference counting (sorry for the confusing double use of "ref") to |
2367 | | * track the number of callers interested in any specific reference. |
2368 | | */ |
2369 | | |
2370 | | /* |
2371 | | * Frees/unmaps a reference sequence and associated file handles. |
2372 | | */ |
2373 | 216 | static void ref_entry_free_seq(ref_entry *e) { |
2374 | 216 | if (e->mf) |
2375 | 0 | mfclose(e->mf); |
2376 | 216 | if (e->seq && !e->mf) |
2377 | 0 | free(e->seq); |
2378 | | |
2379 | 216 | e->seq = NULL; |
2380 | 216 | e->mf = NULL; |
2381 | 216 | } |
2382 | | |
2383 | 245 | void refs_free(refs_t *r) { |
2384 | 245 | RP("refs_free()\n"); |
2385 | | |
2386 | 245 | if (--r->count > 0) |
2387 | 0 | return; |
2388 | | |
2389 | 245 | if (!r) |
2390 | 0 | return; |
2391 | | |
2392 | 245 | if (r->pool) |
2393 | 245 | string_pool_destroy(r->pool); |
2394 | | |
2395 | 245 | if (r->h_meta) { |
2396 | 245 | khint_t k; |
2397 | | |
2398 | 653 | for (k = kh_begin(r->h_meta); k != kh_end(r->h_meta); k++) { |
2399 | 408 | ref_entry *e; |
2400 | | |
2401 | 408 | if (!kh_exist(r->h_meta, k)) |
2402 | 192 | continue; |
2403 | 216 | if (!(e = kh_val(r->h_meta, k))) |
2404 | 0 | continue; |
2405 | 216 | ref_entry_free_seq(e); |
2406 | 216 | free(e); |
2407 | 216 | } |
2408 | | |
2409 | 245 | kh_destroy(refs, r->h_meta); |
2410 | 245 | } |
2411 | | |
2412 | 245 | if (r->ref_id) |
2413 | 18 | free(r->ref_id); |
2414 | | |
2415 | 245 | if (r->fp) |
2416 | 0 | bgzf_close(r->fp); |
2417 | | |
2418 | 245 | pthread_mutex_destroy(&r->lock); |
2419 | | |
2420 | 245 | free(r); |
2421 | 245 | } |
2422 | | |
2423 | 245 | static refs_t *refs_create(void) { |
2424 | 245 | refs_t *r = calloc(1, sizeof(*r)); |
2425 | | |
2426 | 245 | RP("refs_create()\n"); |
2427 | | |
2428 | 245 | if (!r) |
2429 | 0 | return NULL; |
2430 | | |
2431 | 245 | if (!(r->pool = string_pool_create(8192))) |
2432 | 0 | goto err; |
2433 | | |
2434 | 245 | r->ref_id = NULL; // see refs2id() to populate. |
2435 | 245 | r->count = 1; |
2436 | 245 | r->last = NULL; |
2437 | 245 | r->last_id = -1; |
2438 | | |
2439 | 245 | if (!(r->h_meta = kh_init(refs))) |
2440 | 0 | goto err; |
2441 | | |
2442 | 245 | pthread_mutex_init(&r->lock, NULL); |
2443 | | |
2444 | 245 | return r; |
2445 | | |
2446 | 0 | err: |
2447 | 0 | refs_free(r); |
2448 | 0 | return NULL; |
2449 | 245 | } |
2450 | | |
2451 | | /* |
2452 | | * Opens a reference fasta file as a BGZF stream, allowing for |
2453 | | * compressed files. It automatically builds a .fai file if |
2454 | | * required and if compressed a .gzi bgzf index too. |
2455 | | * |
2456 | | * Returns a BGZF handle on success; |
2457 | | * NULL on failure. |
2458 | | */ |
2459 | 0 | static BGZF *bgzf_open_ref(char *fn, char *mode, int is_md5) { |
2460 | 0 | BGZF *fp; |
2461 | |
|
2462 | 0 | if (!is_md5 && !hisremote(fn)) { |
2463 | 0 | char fai_file[PATH_MAX]; |
2464 | |
|
2465 | 0 | snprintf(fai_file, PATH_MAX, "%s.fai", fn); |
2466 | 0 | if (access(fai_file, R_OK) != 0) |
2467 | 0 | if (fai_build(fn) != 0) |
2468 | 0 | return NULL; |
2469 | 0 | } |
2470 | | |
2471 | 0 | if (!(fp = bgzf_open(fn, mode))) { |
2472 | 0 | perror(fn); |
2473 | 0 | return NULL; |
2474 | 0 | } |
2475 | | |
2476 | 0 | if (fp->is_compressed == 1 && bgzf_index_load(fp, fn, ".gzi") < 0) { |
2477 | 0 | hts_log_error("Unable to load .gzi index '%s.gzi'", fn); |
2478 | 0 | bgzf_close(fp); |
2479 | 0 | return NULL; |
2480 | 0 | } |
2481 | | |
2482 | 0 | return fp; |
2483 | 0 | } |
2484 | | |
2485 | | /* |
2486 | | * Loads a FAI file for a reference.fasta. |
2487 | | * "is_err" indicates whether failure to load is worthy of emitting an |
2488 | | * error message. In some cases (eg with embedded references) we |
2489 | | * speculatively load, just in case, and silently ignore errors. |
2490 | | * |
2491 | | * Returns the refs_t struct on success (maybe newly allocated); |
2492 | | * NULL on failure |
2493 | | */ |
2494 | 0 | static refs_t *refs_load_fai(refs_t *r_orig, const char *fn, int is_err) { |
2495 | 0 | hFILE *fp = NULL; |
2496 | 0 | char fai_fn[PATH_MAX]; |
2497 | 0 | char line[8192]; |
2498 | 0 | refs_t *r = r_orig; |
2499 | 0 | size_t fn_l = strlen(fn); |
2500 | 0 | int id = 0, id_alloc = 0; |
2501 | |
|
2502 | 0 | RP("refs_load_fai %s\n", fn); |
2503 | |
|
2504 | 0 | if (!r) |
2505 | 0 | if (!(r = refs_create())) |
2506 | 0 | goto err; |
2507 | | |
2508 | 0 | if (r->fp) |
2509 | 0 | if (bgzf_close(r->fp) != 0) |
2510 | 0 | goto err; |
2511 | 0 | r->fp = NULL; |
2512 | | |
2513 | | /* Look for a FASTA##idx##FAI format */ |
2514 | 0 | char *fn_delim = strstr(fn, HTS_IDX_DELIM); |
2515 | 0 | if (fn_delim) { |
2516 | 0 | if (!(r->fn = string_ndup(r->pool, fn, fn_delim - fn))) |
2517 | 0 | goto err; |
2518 | 0 | fn_delim += strlen(HTS_IDX_DELIM); |
2519 | 0 | snprintf(fai_fn, PATH_MAX, "%s", fn_delim); |
2520 | 0 | } else { |
2521 | | /* An index file was provided, instead of the actual reference file */ |
2522 | 0 | if (fn_l > 4 && strcmp(&fn[fn_l-4], ".fai") == 0) { |
2523 | 0 | if (!r->fn) { |
2524 | 0 | if (!(r->fn = string_ndup(r->pool, fn, fn_l-4))) |
2525 | 0 | goto err; |
2526 | 0 | } |
2527 | 0 | snprintf(fai_fn, PATH_MAX, "%s", fn); |
2528 | 0 | } else { |
2529 | | /* Only the reference file provided. Get the index file name from it */ |
2530 | 0 | if (!(r->fn = string_dup(r->pool, fn))) |
2531 | 0 | goto err; |
2532 | 0 | sprintf(fai_fn, "%.*s.fai", PATH_MAX-5, fn); |
2533 | 0 | } |
2534 | 0 | } |
2535 | | |
2536 | 0 | if (!(r->fp = bgzf_open_ref(r->fn, "r", 0))) { |
2537 | 0 | hts_log_error("Failed to open reference file '%s'", r->fn); |
2538 | 0 | goto err; |
2539 | 0 | } |
2540 | | |
2541 | 0 | if (!(fp = hopen(fai_fn, "r"))) { |
2542 | 0 | hts_log_error("Failed to open index file '%s'", fai_fn); |
2543 | 0 | if (is_err) |
2544 | 0 | perror(fai_fn); |
2545 | 0 | goto err; |
2546 | 0 | } |
2547 | 0 | while (hgets(line, 8192, fp) != NULL) { |
2548 | 0 | ref_entry *e = malloc(sizeof(*e)); |
2549 | 0 | char *cp; |
2550 | 0 | int n; |
2551 | 0 | khint_t k; |
2552 | |
|
2553 | 0 | if (!e) |
2554 | 0 | return NULL; |
2555 | | |
2556 | | // id |
2557 | 0 | for (cp = line; *cp && !isspace_c(*cp); cp++) |
2558 | 0 | ; |
2559 | 0 | *cp++ = 0; |
2560 | 0 | e->name = string_dup(r->pool, line); |
2561 | | |
2562 | | // length |
2563 | 0 | while (*cp && isspace_c(*cp)) |
2564 | 0 | cp++; |
2565 | 0 | e->length = strtoll(cp, &cp, 10); |
2566 | | |
2567 | | // offset |
2568 | 0 | while (*cp && isspace_c(*cp)) |
2569 | 0 | cp++; |
2570 | 0 | e->offset = strtoll(cp, &cp, 10); |
2571 | | |
2572 | | // bases per line |
2573 | 0 | while (*cp && isspace_c(*cp)) |
2574 | 0 | cp++; |
2575 | 0 | e->bases_per_line = strtol(cp, &cp, 10); |
2576 | | |
2577 | | // line length |
2578 | 0 | while (*cp && isspace_c(*cp)) |
2579 | 0 | cp++; |
2580 | 0 | e->line_length = strtol(cp, &cp, 10); |
2581 | | |
2582 | | // filename |
2583 | 0 | e->fn = r->fn; |
2584 | |
|
2585 | 0 | e->count = 0; |
2586 | 0 | e->seq = NULL; |
2587 | 0 | e->mf = NULL; |
2588 | 0 | e->is_md5 = 0; |
2589 | 0 | e->validated_md5 = 0; |
2590 | |
|
2591 | 0 | k = kh_put(refs, r->h_meta, e->name, &n); |
2592 | 0 | if (-1 == n) { |
2593 | 0 | free(e); |
2594 | 0 | return NULL; |
2595 | 0 | } |
2596 | | |
2597 | 0 | if (n) { |
2598 | 0 | kh_val(r->h_meta, k) = e; |
2599 | 0 | } else { |
2600 | 0 | ref_entry *re = kh_val(r->h_meta, k); |
2601 | 0 | if (re && (re->count != 0 || re->length != 0)) { |
2602 | | /* Keep old */ |
2603 | 0 | free(e); |
2604 | 0 | } else { |
2605 | | /* Replace old */ |
2606 | 0 | if (re) |
2607 | 0 | free(re); |
2608 | 0 | kh_val(r->h_meta, k) = e; |
2609 | 0 | } |
2610 | 0 | } |
2611 | |
|
2612 | 0 | if (id >= id_alloc) { |
2613 | 0 | ref_entry **new_refs; |
2614 | 0 | int x; |
2615 | |
|
2616 | 0 | id_alloc = id_alloc ?id_alloc*2 : 16; |
2617 | 0 | new_refs = realloc(r->ref_id, id_alloc * sizeof(*r->ref_id)); |
2618 | 0 | if (!new_refs) |
2619 | 0 | goto err; |
2620 | 0 | r->ref_id = new_refs; |
2621 | |
|
2622 | 0 | for (x = id; x < id_alloc; x++) |
2623 | 0 | r->ref_id[x] = NULL; |
2624 | 0 | } |
2625 | 0 | r->ref_id[id] = e; |
2626 | 0 | r->nref = ++id; |
2627 | 0 | } |
2628 | | |
2629 | 0 | if(hclose(fp) < 0) |
2630 | 0 | goto err; |
2631 | 0 | return r; |
2632 | | |
2633 | 0 | err: |
2634 | 0 | if (fp) |
2635 | 0 | hclose_abruptly(fp); |
2636 | |
|
2637 | 0 | if (!r_orig) |
2638 | 0 | refs_free(r); |
2639 | |
|
2640 | 0 | return NULL; |
2641 | 0 | } |
2642 | | |
2643 | | /* |
2644 | | * Verifies that the CRAM @SQ lines and .fai files match. |
2645 | | */ |
2646 | 0 | static void sanitise_SQ_lines(cram_fd *fd) { |
2647 | 0 | int i; |
2648 | |
|
2649 | 0 | if (!fd->header || !fd->header->hrecs) |
2650 | 0 | return; |
2651 | | |
2652 | 0 | if (!fd->refs || !fd->refs->h_meta) |
2653 | 0 | return; |
2654 | | |
2655 | 0 | for (i = 0; i < fd->header->hrecs->nref; i++) { |
2656 | 0 | const char *name = fd->header->hrecs->ref[i].name; |
2657 | 0 | khint_t k = kh_get(refs, fd->refs->h_meta, name); |
2658 | 0 | ref_entry *r; |
2659 | | |
2660 | | // We may have @SQ lines which have no known .fai, but do not |
2661 | | // in themselves pose a problem because they are unused in the file. |
2662 | 0 | if (k == kh_end(fd->refs->h_meta)) |
2663 | 0 | continue; |
2664 | | |
2665 | 0 | if (!(r = (ref_entry *)kh_val(fd->refs->h_meta, k))) |
2666 | 0 | continue; |
2667 | | |
2668 | 0 | if (r->length && r->length != fd->header->hrecs->ref[i].len) { |
2669 | 0 | assert(strcmp(r->name, fd->header->hrecs->ref[i].name) == 0); |
2670 | | |
2671 | | // Should we also check MD5sums here to ensure the correct |
2672 | | // reference was given? |
2673 | 0 | hts_log_warning("Header @SQ length mismatch for ref %s, %"PRIhts_pos" vs %d", |
2674 | 0 | r->name, fd->header->hrecs->ref[i].len, (int)r->length); |
2675 | | |
2676 | | // Fixing the parsed @SQ header will make MD:Z: strings work |
2677 | | // and also stop it producing N for the sequence. |
2678 | 0 | fd->header->hrecs->ref[i].len = r->length; |
2679 | 0 | } |
2680 | 0 | } |
2681 | 0 | } |
2682 | | |
2683 | | /* |
2684 | | * Indexes references by the order they appear in a BAM file. This may not |
2685 | | * necessarily be the same order they appear in the fasta reference file. |
2686 | | * |
2687 | | * Returns 0 on success |
2688 | | * -1 on failure |
2689 | | */ |
2690 | 0 | int refs2id(refs_t *r, sam_hdr_t *hdr) { |
2691 | 0 | int i; |
2692 | 0 | sam_hrecs_t *h = hdr->hrecs; |
2693 | |
|
2694 | 0 | if (r->ref_id) |
2695 | 0 | free(r->ref_id); |
2696 | 0 | if (r->last) |
2697 | 0 | r->last = NULL; |
2698 | |
|
2699 | 0 | r->ref_id = calloc(h->nref, sizeof(*r->ref_id)); |
2700 | 0 | if (!r->ref_id) |
2701 | 0 | return -1; |
2702 | | |
2703 | 0 | r->nref = h->nref; |
2704 | 0 | for (i = 0; i < h->nref; i++) { |
2705 | 0 | khint_t k = kh_get(refs, r->h_meta, h->ref[i].name); |
2706 | 0 | if (k != kh_end(r->h_meta)) { |
2707 | 0 | r->ref_id[i] = kh_val(r->h_meta, k); |
2708 | 0 | } else { |
2709 | 0 | hts_log_warning("Unable to find ref name '%s'", h->ref[i].name); |
2710 | 0 | } |
2711 | 0 | } |
2712 | |
|
2713 | 0 | return 0; |
2714 | 0 | } |
2715 | | |
2716 | | /* |
2717 | | * Generates refs_t entries based on @SQ lines in the header. |
2718 | | * Returns 0 on success |
2719 | | * -1 on failure |
2720 | | */ |
2721 | 245 | static int refs_from_header(cram_fd *fd) { |
2722 | 245 | if (!fd) |
2723 | 0 | return -1; |
2724 | | |
2725 | 245 | refs_t *r = fd->refs; |
2726 | 245 | if (!r) |
2727 | 0 | return -1; |
2728 | | |
2729 | 245 | sam_hdr_t *h = fd->header; |
2730 | 245 | if (!h) |
2731 | 0 | return 0; |
2732 | | |
2733 | 245 | if (!h->hrecs) { |
2734 | 197 | if (-1 == sam_hdr_fill_hrecs(h)) |
2735 | 0 | return -1; |
2736 | 197 | } |
2737 | | |
2738 | 245 | if (h->hrecs->nref == 0) |
2739 | 227 | return 0; |
2740 | | |
2741 | | //fprintf(stderr, "refs_from_header for %p mode %c\n", fd, fd->mode); |
2742 | | |
2743 | | /* Existing refs are fine, as long as they're compatible with the hdr. */ |
2744 | 18 | ref_entry **new_ref_id = realloc(r->ref_id, (r->nref + h->hrecs->nref) * sizeof(*r->ref_id)); |
2745 | 18 | if (!new_ref_id) |
2746 | 0 | return -1; |
2747 | 18 | r->ref_id = new_ref_id; |
2748 | | |
2749 | 18 | int i, j; |
2750 | | /* Copy info from h->ref[i] over to r */ |
2751 | 234 | for (i = 0, j = r->nref; i < h->hrecs->nref; i++) { |
2752 | 216 | sam_hrec_type_t *ty; |
2753 | 216 | sam_hrec_tag_t *tag; |
2754 | 216 | khint_t k; |
2755 | 216 | int n; |
2756 | | |
2757 | 216 | k = kh_get(refs, r->h_meta, h->hrecs->ref[i].name); |
2758 | 216 | if (k != kh_end(r->h_meta)) |
2759 | | // Ref already known about |
2760 | 0 | continue; |
2761 | | |
2762 | 216 | if (!(r->ref_id[j] = calloc(1, sizeof(ref_entry)))) |
2763 | 0 | return -1; |
2764 | | |
2765 | 216 | if (!h->hrecs->ref[i].name) |
2766 | 0 | return -1; |
2767 | | |
2768 | 216 | r->ref_id[j]->name = string_dup(r->pool, h->hrecs->ref[i].name); |
2769 | 216 | if (!r->ref_id[j]->name) return -1; |
2770 | 216 | r->ref_id[j]->length = 0; // marker for not yet loaded |
2771 | | |
2772 | | /* Initialise likely filename if known */ |
2773 | 216 | if ((ty = sam_hrecs_find_type_id(h->hrecs, "SQ", "SN", h->hrecs->ref[i].name))) { |
2774 | 216 | if ((tag = sam_hrecs_find_key(ty, "M5", NULL))) { |
2775 | 0 | r->ref_id[j]->fn = string_dup(r->pool, tag->str+3); |
2776 | | //fprintf(stderr, "Tagging @SQ %s / %s\n", r->ref_id[h]->name, r->ref_id[h]->fn); |
2777 | 0 | } |
2778 | 216 | } |
2779 | | |
2780 | 216 | k = kh_put(refs, r->h_meta, r->ref_id[j]->name, &n); |
2781 | 216 | if (n <= 0) // already exists or error |
2782 | 0 | return -1; |
2783 | 216 | kh_val(r->h_meta, k) = r->ref_id[j]; |
2784 | | |
2785 | 216 | j++; |
2786 | 216 | } |
2787 | 18 | r->nref = j; |
2788 | | |
2789 | 18 | return 0; |
2790 | 18 | } |
2791 | | |
2792 | | /* |
2793 | | * Attaches a header to a cram_fd. |
2794 | | * |
2795 | | * This should be used when creating a new cram_fd for writing where |
2796 | | * we have a header already constructed (eg from a file we've read |
2797 | | * in). |
2798 | | */ |
2799 | 0 | int cram_set_header2(cram_fd *fd, const sam_hdr_t *hdr) { |
2800 | 0 | if (!fd || !hdr ) |
2801 | 0 | return -1; |
2802 | | |
2803 | 0 | if (fd->header != hdr) { |
2804 | 0 | if (fd->header) |
2805 | 0 | sam_hdr_destroy(fd->header); |
2806 | 0 | fd->header = sam_hdr_dup(hdr); |
2807 | 0 | if (!fd->header) |
2808 | 0 | return -1; |
2809 | 0 | } |
2810 | 0 | return refs_from_header(fd); |
2811 | 0 | } |
2812 | | |
2813 | 0 | int cram_set_header(cram_fd *fd, sam_hdr_t *hdr) { |
2814 | 0 | return cram_set_header2(fd, hdr); |
2815 | 0 | } |
2816 | | |
2817 | | /* |
2818 | | * Returns whether the path refers to a directory. |
2819 | | */ |
2820 | 0 | static int is_directory(char *fn) { |
2821 | 0 | struct stat buf; |
2822 | 0 | if ( stat(fn,&buf) ) return 0; |
2823 | 0 | return S_ISDIR(buf.st_mode); |
2824 | 0 | } |
2825 | | |
2826 | | /* |
2827 | | * Converts a directory and a filename into an expanded path, replacing %s |
2828 | | * in directory with the filename and %[0-9]+s with portions of the filename |
2829 | | * Any remaining parts of filename are added to the end with /%s. |
2830 | | */ |
2831 | 0 | static int expand_cache_path(char *path, char *dir, const char *fn) { |
2832 | 0 | char *cp, *start = path; |
2833 | 0 | size_t len; |
2834 | 0 | size_t sz = PATH_MAX; |
2835 | |
|
2836 | 0 | while ((cp = strchr(dir, '%'))) { |
2837 | 0 | if (cp-dir >= sz) return -1; |
2838 | 0 | strncpy(path, dir, cp-dir); |
2839 | 0 | path += cp-dir; |
2840 | 0 | sz -= cp-dir; |
2841 | |
|
2842 | 0 | if (*++cp == 's') { |
2843 | 0 | len = strlen(fn); |
2844 | 0 | if (len >= sz) return -1; |
2845 | 0 | strcpy(path, fn); |
2846 | 0 | path += len; |
2847 | 0 | sz -= len; |
2848 | 0 | fn += len; |
2849 | 0 | cp++; |
2850 | 0 | } else if (*cp >= '0' && *cp <= '9') { |
2851 | 0 | char *endp; |
2852 | 0 | long l; |
2853 | |
|
2854 | 0 | l = strtol(cp, &endp, 10); |
2855 | 0 | l = MIN(l, strlen(fn)); |
2856 | 0 | if (*endp == 's') { |
2857 | 0 | if (l >= sz) return -1; |
2858 | 0 | strncpy(path, fn, l); |
2859 | 0 | path += l; |
2860 | 0 | fn += l; |
2861 | 0 | sz -= l; |
2862 | 0 | *path = 0; |
2863 | 0 | cp = endp+1; |
2864 | 0 | } else { |
2865 | 0 | if (sz < 3) return -1; |
2866 | 0 | *path++ = '%'; |
2867 | 0 | *path++ = *cp++; |
2868 | 0 | } |
2869 | 0 | } else { |
2870 | 0 | if (sz < 3) return -1; |
2871 | 0 | *path++ = '%'; |
2872 | 0 | *path++ = *cp++; |
2873 | 0 | } |
2874 | 0 | dir = cp; |
2875 | 0 | } |
2876 | | |
2877 | 0 | len = strlen(dir); |
2878 | 0 | if (len >= sz) return -1; |
2879 | 0 | strcpy(path, dir); |
2880 | 0 | path += len; |
2881 | 0 | sz -= len; |
2882 | |
|
2883 | 0 | len = strlen(fn) + ((*fn && path > start && path[-1] != '/') ? 1 : 0); |
2884 | 0 | if (len >= sz) return -1; |
2885 | 0 | if (*fn && path > start && path[-1] != '/') |
2886 | 0 | *path++ = '/'; |
2887 | 0 | strcpy(path, fn); |
2888 | 0 | return 0; |
2889 | 0 | } |
2890 | | |
2891 | | /* |
2892 | | * Make the directory containing path and any prefix directories. |
2893 | | */ |
2894 | 0 | static void mkdir_prefix(char *path, int mode) { |
2895 | 0 | char *cp = strrchr(path, '/'); |
2896 | 0 | if (!cp) |
2897 | 0 | return; |
2898 | | |
2899 | 0 | *cp = 0; |
2900 | 0 | if (is_directory(path)) { |
2901 | 0 | *cp = '/'; |
2902 | 0 | return; |
2903 | 0 | } |
2904 | | |
2905 | 0 | if (mkdir(path, mode) == 0) { |
2906 | 0 | chmod(path, mode); |
2907 | 0 | *cp = '/'; |
2908 | 0 | return; |
2909 | 0 | } |
2910 | | |
2911 | 0 | mkdir_prefix(path, mode); |
2912 | 0 | mkdir(path, mode); |
2913 | 0 | chmod(path, mode); |
2914 | 0 | *cp = '/'; |
2915 | 0 | } |
2916 | | |
2917 | | /* |
2918 | | * Return the cache directory to use, based on the first of these |
2919 | | * environment variables to be set to a non-empty value. |
2920 | | */ |
2921 | 0 | static const char *get_cache_basedir(const char **extra) { |
2922 | 0 | char *base; |
2923 | |
|
2924 | 0 | *extra = ""; |
2925 | |
|
2926 | 0 | base = getenv("XDG_CACHE_HOME"); |
2927 | 0 | if (base && *base) return base; |
2928 | | |
2929 | 0 | base = getenv("HOME"); |
2930 | 0 | if (base && *base) { *extra = "/.cache"; return base; } |
2931 | | |
2932 | 0 | base = getenv("TMPDIR"); |
2933 | 0 | if (base && *base) return base; |
2934 | | |
2935 | 0 | base = getenv("TEMP"); |
2936 | 0 | if (base && *base) return base; |
2937 | | |
2938 | 0 | return "/tmp"; |
2939 | 0 | } |
2940 | | |
2941 | | /* |
2942 | | * Queries the M5 string from the header and attempts to populate the |
2943 | | * reference from this using the REF_PATH environment. |
2944 | | * |
2945 | | * Returns 0 on success |
2946 | | * -1 on failure |
2947 | | */ |
2948 | 0 | static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) { |
2949 | 0 | char *ref_path = getenv("REF_PATH"); |
2950 | 0 | sam_hrec_type_t *ty; |
2951 | 0 | sam_hrec_tag_t *tag; |
2952 | 0 | char path[PATH_MAX]; |
2953 | 0 | kstring_t path_tmp = KS_INITIALIZE; |
2954 | 0 | char cache[PATH_MAX], cache_root[PATH_MAX]; |
2955 | 0 | char *local_cache = getenv("REF_CACHE"); |
2956 | 0 | mFILE *mf; |
2957 | 0 | int local_path = 0; |
2958 | |
|
2959 | 0 | hts_log_info("Running cram_populate_ref on fd %p, id %d", (void *)fd, id); |
2960 | |
|
2961 | 0 | cache_root[0] = '\0'; |
2962 | |
|
2963 | 0 | if (!ref_path || *ref_path == '\0') { |
2964 | | /* |
2965 | | * If we have no ref path, we use the EBI server. |
2966 | | * However to avoid spamming it we require a local ref cache too. |
2967 | | */ |
2968 | 0 | ref_path = "https://www.ebi.ac.uk/ena/cram/md5/%s"; |
2969 | 0 | if (!local_cache || *local_cache == '\0') { |
2970 | 0 | const char *extra; |
2971 | 0 | const char *base = get_cache_basedir(&extra); |
2972 | 0 | snprintf(cache_root, PATH_MAX, "%s%s/hts-ref", base, extra); |
2973 | 0 | snprintf(cache,PATH_MAX, "%s%s/hts-ref/%%2s/%%2s/%%s", base, extra); |
2974 | 0 | local_cache = cache; |
2975 | 0 | hts_log_info("Populating local cache: %s", local_cache); |
2976 | 0 | } |
2977 | 0 | } |
2978 | |
|
2979 | 0 | if (!r->name) |
2980 | 0 | return -1; |
2981 | | |
2982 | 0 | if (!(ty = sam_hrecs_find_type_id(fd->header->hrecs, "SQ", "SN", r->name))) |
2983 | 0 | return -1; |
2984 | | |
2985 | 0 | if (!(tag = sam_hrecs_find_key(ty, "M5", NULL))) |
2986 | 0 | goto no_M5; |
2987 | | |
2988 | 0 | hts_log_info("Querying ref %s", tag->str+3); |
2989 | | |
2990 | | /* Use cache if available */ |
2991 | 0 | if (local_cache && *local_cache) { |
2992 | 0 | if (expand_cache_path(path, local_cache, tag->str+3) == 0) |
2993 | 0 | local_path = 1; |
2994 | 0 | } |
2995 | |
|
2996 | | #ifndef HAVE_MMAP |
2997 | | char *path2; |
2998 | | /* Search local files in REF_PATH; we can open them and return as above */ |
2999 | | if (!local_path && (path2 = find_path(tag->str+3, ref_path))) { |
3000 | | int len = snprintf(path, PATH_MAX, "%s", path2); |
3001 | | free(path2); |
3002 | | if (len > 0 && len < PATH_MAX) // in case it's too long |
3003 | | local_path = 1; |
3004 | | } |
3005 | | #endif |
3006 | | |
3007 | | /* Found via REF_CACHE or local REF_PATH file */ |
3008 | 0 | if (local_path) { |
3009 | 0 | struct stat sb; |
3010 | 0 | BGZF *fp; |
3011 | |
|
3012 | 0 | if (0 == stat(path, &sb) |
3013 | 0 | && S_ISREG(sb.st_mode) |
3014 | 0 | && (fp = bgzf_open(path, "r"))) { |
3015 | 0 | r->length = sb.st_size; |
3016 | 0 | r->offset = r->line_length = r->bases_per_line = 0; |
3017 | |
|
3018 | 0 | r->fn = string_dup(fd->refs->pool, path); |
3019 | |
|
3020 | 0 | if (fd->refs->fp) |
3021 | 0 | if (bgzf_close(fd->refs->fp) != 0) |
3022 | 0 | return -1; |
3023 | 0 | fd->refs->fp = fp; |
3024 | 0 | fd->refs->fn = r->fn; |
3025 | 0 | r->is_md5 = 1; |
3026 | 0 | r->validated_md5 = 1; |
3027 | | |
3028 | | // Fall back to cram_get_ref() where it'll do the actual |
3029 | | // reading of the file. |
3030 | 0 | return 0; |
3031 | 0 | } |
3032 | 0 | } |
3033 | | |
3034 | | |
3035 | | /* Otherwise search full REF_PATH; slower as loads entire file */ |
3036 | 0 | if ((mf = open_path_mfile(tag->str+3, ref_path, NULL))) { |
3037 | 0 | size_t sz; |
3038 | 0 | r->seq = mfsteal(mf, &sz); |
3039 | 0 | if (r->seq) { |
3040 | 0 | r->mf = NULL; |
3041 | 0 | } else { |
3042 | | // keep mf around as we couldn't detach |
3043 | 0 | r->seq = mf->data; |
3044 | 0 | r->mf = mf; |
3045 | 0 | } |
3046 | 0 | r->length = sz; |
3047 | 0 | r->is_md5 = 1; |
3048 | 0 | r->validated_md5 = 1; |
3049 | 0 | } else { |
3050 | 0 | refs_t *refs; |
3051 | 0 | const char *fn; |
3052 | |
|
3053 | 0 | no_M5: |
3054 | | /* Failed to find in search path or M5 cache, see if @SQ UR: tag? */ |
3055 | 0 | if (!(tag = sam_hrecs_find_key(ty, "UR", NULL))) |
3056 | 0 | return -1; |
3057 | | |
3058 | 0 | fn = (strncmp(tag->str+3, "file:", 5) == 0) |
3059 | 0 | ? tag->str+8 |
3060 | 0 | : tag->str+3; |
3061 | |
|
3062 | 0 | if (fd->refs->fp) { |
3063 | 0 | if (bgzf_close(fd->refs->fp) != 0) |
3064 | 0 | return -1; |
3065 | 0 | fd->refs->fp = NULL; |
3066 | 0 | } |
3067 | 0 | if (!(refs = refs_load_fai(fd->refs, fn, 0))) |
3068 | 0 | return -1; |
3069 | 0 | sanitise_SQ_lines(fd); |
3070 | |
|
3071 | 0 | fd->refs = refs; |
3072 | 0 | if (fd->refs->fp) { |
3073 | 0 | if (bgzf_close(fd->refs->fp) != 0) |
3074 | 0 | return -1; |
3075 | 0 | fd->refs->fp = NULL; |
3076 | 0 | } |
3077 | | |
3078 | 0 | if (!fd->refs->fn) |
3079 | 0 | return -1; |
3080 | | |
3081 | 0 | if (-1 == refs2id(fd->refs, fd->header)) |
3082 | 0 | return -1; |
3083 | 0 | if (!fd->refs->ref_id || !fd->refs->ref_id[id]) |
3084 | 0 | return -1; |
3085 | | |
3086 | | // Local copy already, so fall back to cram_get_ref(). |
3087 | 0 | return 0; |
3088 | 0 | } |
3089 | | |
3090 | | /* Populate the local disk cache if required */ |
3091 | 0 | if (local_cache && *local_cache) { |
3092 | 0 | hFILE *fp; |
3093 | |
|
3094 | 0 | if (*cache_root && !is_directory(cache_root)) { |
3095 | 0 | hts_log_warning("Creating reference cache directory %s\n" |
3096 | 0 | "This may become large; see the samtools(1) manual page REF_CACHE discussion", |
3097 | 0 | cache_root); |
3098 | 0 | } |
3099 | |
|
3100 | 0 | if (expand_cache_path(path, local_cache, tag->str+3) < 0) { |
3101 | 0 | return 0; // Not fatal - we have the data already so keep going. |
3102 | 0 | } |
3103 | 0 | hts_log_info("Writing cache file '%s'", path); |
3104 | 0 | mkdir_prefix(path, 01777); |
3105 | |
|
3106 | 0 | fp = hts_open_tmpfile(path, "wx", &path_tmp); |
3107 | 0 | if (!fp) { |
3108 | 0 | perror(path_tmp.s); |
3109 | 0 | free(path_tmp.s); |
3110 | | |
3111 | | // Not fatal - we have the data already so keep going. |
3112 | 0 | return 0; |
3113 | 0 | } |
3114 | | |
3115 | | // Check md5sum |
3116 | 0 | hts_md5_context *md5; |
3117 | 0 | char unsigned md5_buf1[16]; |
3118 | 0 | char md5_buf2[33]; |
3119 | |
|
3120 | 0 | if (!(md5 = hts_md5_init())) { |
3121 | 0 | hclose_abruptly(fp); |
3122 | 0 | unlink(path_tmp.s); |
3123 | 0 | free(path_tmp.s); |
3124 | 0 | return -1; |
3125 | 0 | } |
3126 | 0 | hts_md5_update(md5, r->seq, r->length); |
3127 | 0 | hts_md5_final(md5_buf1, md5); |
3128 | 0 | hts_md5_destroy(md5); |
3129 | 0 | hts_md5_hex(md5_buf2, md5_buf1); |
3130 | |
|
3131 | 0 | if (strncmp(tag->str+3, md5_buf2, 32) != 0) { |
3132 | 0 | hts_log_error("Mismatching md5sum for downloaded reference"); |
3133 | 0 | hclose_abruptly(fp); |
3134 | 0 | unlink(path_tmp.s); |
3135 | 0 | free(path_tmp.s); |
3136 | 0 | return -1; |
3137 | 0 | } |
3138 | | |
3139 | 0 | ssize_t length_written = hwrite(fp, r->seq, r->length); |
3140 | 0 | if (hclose(fp) < 0 || length_written != r->length || |
3141 | 0 | chmod(path_tmp.s, 0444) < 0 || |
3142 | 0 | rename(path_tmp.s, path) < 0) { |
3143 | 0 | hts_log_error("Creating reference at %s failed: %s", |
3144 | 0 | path, strerror(errno)); |
3145 | 0 | unlink(path_tmp.s); |
3146 | 0 | } |
3147 | 0 | } |
3148 | | |
3149 | 0 | free(path_tmp.s); |
3150 | 0 | return 0; |
3151 | 0 | } |
3152 | | |
3153 | 0 | static void cram_ref_incr_locked(refs_t *r, int id) { |
3154 | 0 | RP("%d INC REF %d, %d %p\n", gettid(), id, |
3155 | 0 | (int)(id>=0 && r->ref_id[id]?r->ref_id[id]->count+1:-999), |
3156 | 0 | id>=0 && r->ref_id[id]?r->ref_id[id]->seq:(char *)1); |
3157 | |
|
3158 | 0 | if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq) |
3159 | 0 | return; |
3160 | | |
3161 | 0 | if (r->last_id == id) |
3162 | 0 | r->last_id = -1; |
3163 | |
|
3164 | 0 | ++r->ref_id[id]->count; |
3165 | 0 | } |
3166 | | |
3167 | 0 | void cram_ref_incr(refs_t *r, int id) { |
3168 | 0 | pthread_mutex_lock(&r->lock); |
3169 | 0 | cram_ref_incr_locked(r, id); |
3170 | 0 | pthread_mutex_unlock(&r->lock); |
3171 | 0 | } |
3172 | | |
3173 | 0 | static void cram_ref_decr_locked(refs_t *r, int id) { |
3174 | 0 | RP("%d DEC REF %d, %d %p\n", gettid(), id, |
3175 | 0 | (int)(id>=0 && r->ref_id[id]?r->ref_id[id]->count-1:-999), |
3176 | 0 | id>=0 && r->ref_id[id]?r->ref_id[id]->seq:(char *)1); |
3177 | |
|
3178 | 0 | if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq) { |
3179 | 0 | return; |
3180 | 0 | } |
3181 | | |
3182 | 0 | if (--r->ref_id[id]->count <= 0) { |
3183 | 0 | assert(r->ref_id[id]->count == 0); |
3184 | 0 | if (r->last_id >= 0) { |
3185 | 0 | if (r->ref_id[r->last_id]->count <= 0 && |
3186 | 0 | r->ref_id[r->last_id]->seq) { |
3187 | 0 | RP("%d FREE REF %d (%p)\n", gettid(), |
3188 | 0 | r->last_id, r->ref_id[r->last_id]->seq); |
3189 | 0 | ref_entry_free_seq(r->ref_id[r->last_id]); |
3190 | 0 | if (r->ref_id[r->last_id]->is_md5) r->ref_id[r->last_id]->length = 0; |
3191 | 0 | } |
3192 | 0 | } |
3193 | 0 | r->last_id = id; |
3194 | 0 | } |
3195 | 0 | } |
3196 | | |
3197 | 0 | void cram_ref_decr(refs_t *r, int id) { |
3198 | 0 | pthread_mutex_lock(&r->lock); |
3199 | 0 | cram_ref_decr_locked(r, id); |
3200 | 0 | pthread_mutex_unlock(&r->lock); |
3201 | 0 | } |
3202 | | |
3203 | | /* |
3204 | | * Used by cram_ref_load and cram_ref_get. The file handle will have |
3205 | | * already been opened, so we can catch it. The ref_entry *e informs us |
3206 | | * of whether this is a multi-line fasta file or a raw MD5 style file. |
3207 | | * Either way we create a single contiguous sequence. |
3208 | | * |
3209 | | * Returns all or part of a reference sequence on success (malloced); |
3210 | | * NULL on failure. |
3211 | | */ |
3212 | 0 | static char *load_ref_portion(BGZF *fp, ref_entry *e, int start, int end) { |
3213 | 0 | off_t offset, len; |
3214 | 0 | char *seq; |
3215 | |
|
3216 | 0 | if (end < start) |
3217 | 0 | end = start; |
3218 | | |
3219 | | /* |
3220 | | * Compute locations in file. This is trivial for the MD5 files, but |
3221 | | * is still necessary for the fasta variants. |
3222 | | */ |
3223 | 0 | offset = e->line_length |
3224 | 0 | ? e->offset + (start-1)/e->bases_per_line * e->line_length + |
3225 | 0 | (start-1) % e->bases_per_line |
3226 | 0 | : start-1; |
3227 | |
|
3228 | 0 | len = (e->line_length |
3229 | 0 | ? e->offset + (end-1)/e->bases_per_line * e->line_length + |
3230 | 0 | (end-1) % e->bases_per_line |
3231 | 0 | : end-1) - offset + 1; |
3232 | |
|
3233 | 0 | if (bgzf_useek(fp, offset, SEEK_SET) < 0) { |
3234 | 0 | perror("bgzf_useek() on reference file"); |
3235 | 0 | return NULL; |
3236 | 0 | } |
3237 | | |
3238 | 0 | if (len == 0 || !(seq = malloc(len))) { |
3239 | 0 | return NULL; |
3240 | 0 | } |
3241 | | |
3242 | 0 | if (len != bgzf_read(fp, seq, len)) { |
3243 | 0 | perror("bgzf_read() on reference file"); |
3244 | 0 | free(seq); |
3245 | 0 | return NULL; |
3246 | 0 | } |
3247 | | |
3248 | | /* Strip white-space if required. */ |
3249 | 0 | if (len != end-start+1) { |
3250 | 0 | int i, j; |
3251 | 0 | char *cp = seq; |
3252 | 0 | char *cp_to; |
3253 | |
|
3254 | 0 | for (i = j = 0; i < len; i++) { |
3255 | 0 | if (cp[i] >= '!' && cp[i] <= '~') |
3256 | 0 | cp[j++] = toupper_c(cp[i]); |
3257 | 0 | } |
3258 | 0 | cp_to = cp+j; |
3259 | |
|
3260 | 0 | if (cp_to - seq != end-start+1) { |
3261 | 0 | hts_log_error("Malformed reference file"); |
3262 | 0 | free(seq); |
3263 | 0 | return NULL; |
3264 | 0 | } |
3265 | 0 | } else { |
3266 | 0 | int i; |
3267 | 0 | for (i = 0; i < len; i++) { |
3268 | 0 | seq[i] = toupper_c(seq[i]); |
3269 | 0 | } |
3270 | 0 | } |
3271 | | |
3272 | 0 | return seq; |
3273 | 0 | } |
3274 | | |
3275 | | /* |
3276 | | * Load the entire reference 'id'. |
3277 | | * This also increments the reference count by 1. |
3278 | | * |
3279 | | * Returns ref_entry on success; |
3280 | | * NULL on failure |
3281 | | */ |
3282 | 0 | ref_entry *cram_ref_load(refs_t *r, int id, int is_md5) { |
3283 | 0 | ref_entry *e = r->ref_id[id]; |
3284 | 0 | int start = 1, end = e->length; |
3285 | 0 | char *seq; |
3286 | |
|
3287 | 0 | if (e->seq) { |
3288 | 0 | return e; |
3289 | 0 | } |
3290 | | |
3291 | 0 | assert(e->count == 0); |
3292 | | |
3293 | 0 | if (r->last) { |
3294 | | #ifdef REF_DEBUG |
3295 | | int idx = 0; |
3296 | | for (idx = 0; idx < r->nref; idx++) |
3297 | | if (r->last == r->ref_id[idx]) |
3298 | | break; |
3299 | | RP("%d cram_ref_load DECR %d\n", gettid(), idx); |
3300 | | #endif |
3301 | 0 | assert(r->last->count > 0); |
3302 | 0 | if (--r->last->count <= 0) { |
3303 | 0 | RP("%d FREE REF %d (%p)\n", gettid(), id, r->ref_id[id]->seq); |
3304 | 0 | if (r->last->seq) |
3305 | 0 | ref_entry_free_seq(r->last); |
3306 | 0 | } |
3307 | 0 | } |
3308 | | |
3309 | 0 | if (!r->fn) |
3310 | 0 | return NULL; |
3311 | | |
3312 | | /* Open file if it's not already the current open reference */ |
3313 | 0 | if (strcmp(r->fn, e->fn) || r->fp == NULL) { |
3314 | 0 | if (r->fp) |
3315 | 0 | if (bgzf_close(r->fp) != 0) |
3316 | 0 | return NULL; |
3317 | 0 | r->fn = e->fn; |
3318 | 0 | if (!(r->fp = bgzf_open_ref(r->fn, "r", is_md5))) |
3319 | 0 | return NULL; |
3320 | 0 | } |
3321 | | |
3322 | 0 | RP("%d Loading ref %d (%d..%d)\n", gettid(), id, start, end); |
3323 | |
|
3324 | 0 | if (!(seq = load_ref_portion(r->fp, e, start, end))) { |
3325 | 0 | return NULL; |
3326 | 0 | } |
3327 | | |
3328 | 0 | RP("%d Loaded ref %d (%d..%d) = %p\n", gettid(), id, start, end, seq); |
3329 | |
|
3330 | 0 | RP("%d INC REF %d, %"PRId64"\n", gettid(), id, (e->count+1)); |
3331 | 0 | e->seq = seq; |
3332 | 0 | e->mf = NULL; |
3333 | 0 | e->count++; |
3334 | | |
3335 | | /* |
3336 | | * Also keep track of last used ref so incr/decr loops on the same |
3337 | | * sequence don't cause load/free loops. |
3338 | | */ |
3339 | 0 | RP("%d cram_ref_load INCR %d => %"PRId64"\n", gettid(), id, e->count+1); |
3340 | 0 | r->last = e; |
3341 | 0 | e->count++; |
3342 | |
|
3343 | 0 | return e; |
3344 | 0 | } |
3345 | | |
3346 | | /* |
3347 | | * Returns a portion of a reference sequence from start to end inclusive. |
3348 | | * The returned pointer is owned by either the cram_file fd or by the |
3349 | | * internal refs_t structure and should not be freed by the caller. |
3350 | | * |
3351 | | * The difference is whether or not this refs_t is in use by just the one |
3352 | | * cram_fd or by multiples, or whether we have multiple threads accessing |
3353 | | * references. In either case fd->shared will be true and we start using |
3354 | | * reference counting to track the number of users of a specific reference |
3355 | | * sequence. |
3356 | | * |
3357 | | * Otherwise the ref seq returned is allocated as part of cram_fd itself |
3358 | | * and will be freed up on the next call to cram_get_ref or cram_close. |
3359 | | * |
3360 | | * To return the entire reference sequence, specify start as 1 and end |
3361 | | * as 0. |
3362 | | * |
3363 | | * To cease using a reference, call cram_ref_decr(). |
3364 | | * |
3365 | | * Returns reference on success, |
3366 | | * NULL on failure |
3367 | | */ |
3368 | 0 | char *cram_get_ref(cram_fd *fd, int id, int start, int end) { |
3369 | 0 | ref_entry *r; |
3370 | 0 | char *seq; |
3371 | 0 | int ostart = start; |
3372 | |
|
3373 | 0 | if (id == -1 || start < 1) |
3374 | 0 | return NULL; |
3375 | | |
3376 | | /* FIXME: axiomatic query of r->seq being true? |
3377 | | * Or shortcut for unsorted data where we load once and never free? |
3378 | | */ |
3379 | | |
3380 | | //fd->shared_ref = 1; // hard code for now to simplify things |
3381 | | |
3382 | 0 | pthread_mutex_lock(&fd->ref_lock); |
3383 | |
|
3384 | 0 | RP("%d cram_get_ref on fd %p, id %d, range %d..%d\n", gettid(), fd, id, start, end); |
3385 | | |
3386 | | /* |
3387 | | * Unsorted data implies we want to fetch an entire reference at a time. |
3388 | | * We just deal with this at the moment by claiming we're sharing |
3389 | | * references instead, which has the same requirement. |
3390 | | */ |
3391 | 0 | if (fd->unsorted) |
3392 | 0 | fd->shared_ref = 1; |
3393 | | |
3394 | | |
3395 | | /* Sanity checking: does this ID exist? */ |
3396 | 0 | if (id >= fd->refs->nref) { |
3397 | 0 | hts_log_error("No reference found for id %d", id); |
3398 | 0 | pthread_mutex_unlock(&fd->ref_lock); |
3399 | 0 | return NULL; |
3400 | 0 | } |
3401 | | |
3402 | 0 | if (!fd->refs || !fd->refs->ref_id[id]) { |
3403 | 0 | hts_log_error("No reference found for id %d", id); |
3404 | 0 | pthread_mutex_unlock(&fd->ref_lock); |
3405 | 0 | return NULL; |
3406 | 0 | } |
3407 | | |
3408 | 0 | if (!(r = fd->refs->ref_id[id])) { |
3409 | 0 | hts_log_error("No reference found for id %d", id); |
3410 | 0 | pthread_mutex_unlock(&fd->ref_lock); |
3411 | 0 | return NULL; |
3412 | 0 | } |
3413 | | |
3414 | | |
3415 | | /* |
3416 | | * It has an entry, but may not have been populated yet. |
3417 | | * Any manually loaded .fai files have their lengths known. |
3418 | | * A ref entry computed from @SQ lines (M5 or UR field) will have |
3419 | | * r->length == 0 unless it's been loaded once and verified that we have |
3420 | | * an on-disk filename for it. |
3421 | | * |
3422 | | * 19 Sep 2013: Moved the lock here as the cram_populate_ref code calls |
3423 | | * open_path_mfile and libcurl, which isn't multi-thread safe unless I |
3424 | | * rewrite my code to have one curl handle per thread. |
3425 | | */ |
3426 | 0 | pthread_mutex_lock(&fd->refs->lock); |
3427 | 0 | if (r->length == 0) { |
3428 | 0 | if (fd->ref_fn) |
3429 | 0 | hts_log_warning("Reference file given, but ref '%s' not present", |
3430 | 0 | r->name); |
3431 | 0 | if (cram_populate_ref(fd, id, r) == -1) { |
3432 | 0 | hts_log_error("Failed to populate reference for id %d", id); |
3433 | 0 | pthread_mutex_unlock(&fd->refs->lock); |
3434 | 0 | pthread_mutex_unlock(&fd->ref_lock); |
3435 | 0 | return NULL; |
3436 | 0 | } |
3437 | 0 | r = fd->refs->ref_id[id]; |
3438 | 0 | if (fd->unsorted) |
3439 | 0 | cram_ref_incr_locked(fd->refs, id); |
3440 | 0 | } |
3441 | | |
3442 | | |
3443 | | /* |
3444 | | * We now know that we the filename containing the reference, so check |
3445 | | * for limits. If it's over half the reference we'll load all of it in |
3446 | | * memory as this will speed up subsequent calls. |
3447 | | */ |
3448 | 0 | if (end < 1) |
3449 | 0 | end = r->length; |
3450 | 0 | if (end >= r->length) |
3451 | 0 | end = r->length; |
3452 | |
|
3453 | 0 | if (end - start >= 0.5*r->length || fd->shared_ref) { |
3454 | 0 | start = 1; |
3455 | 0 | end = r->length; |
3456 | 0 | } |
3457 | | |
3458 | | /* |
3459 | | * Maybe we have it cached already? If so use it. |
3460 | | * |
3461 | | * Alternatively if we don't have the sequence but we're sharing |
3462 | | * references and/or are asking for the entire length of it, then |
3463 | | * load the full reference into the refs structure and return |
3464 | | * a pointer to that one instead. |
3465 | | */ |
3466 | 0 | if (fd->shared_ref || r->seq || (start == 1 && end == r->length)) { |
3467 | 0 | char *cp; |
3468 | |
|
3469 | 0 | if (id >= 0) { |
3470 | 0 | if (r->seq) { |
3471 | 0 | cram_ref_incr_locked(fd->refs, id); |
3472 | 0 | } else { |
3473 | 0 | ref_entry *e; |
3474 | 0 | if (!(e = cram_ref_load(fd->refs, id, r->is_md5))) { |
3475 | 0 | pthread_mutex_unlock(&fd->refs->lock); |
3476 | 0 | pthread_mutex_unlock(&fd->ref_lock); |
3477 | 0 | return NULL; |
3478 | 0 | } |
3479 | | |
3480 | | /* unsorted data implies cache ref indefinitely, to avoid |
3481 | | * continually loading and unloading. |
3482 | | */ |
3483 | 0 | if (fd->unsorted) |
3484 | 0 | cram_ref_incr_locked(fd->refs, id); |
3485 | 0 | } |
3486 | | |
3487 | 0 | fd->ref = NULL; /* We never access it directly */ |
3488 | 0 | fd->ref_start = 1; |
3489 | 0 | fd->ref_end = r->length; |
3490 | 0 | fd->ref_id = id; |
3491 | |
|
3492 | 0 | cp = fd->refs->ref_id[id]->seq + ostart-1; |
3493 | 0 | } else { |
3494 | 0 | fd->ref = NULL; |
3495 | 0 | cp = NULL; |
3496 | 0 | } |
3497 | | |
3498 | 0 | RP("%d cram_get_ref returning for id %d, count %d\n", gettid(), id, (int)r->count); |
3499 | |
|
3500 | 0 | pthread_mutex_unlock(&fd->refs->lock); |
3501 | 0 | pthread_mutex_unlock(&fd->ref_lock); |
3502 | 0 | return cp; |
3503 | 0 | } |
3504 | | |
3505 | | /* |
3506 | | * Otherwise we're not sharing, we don't have a copy of it already and |
3507 | | * we're only asking for a small portion of it. |
3508 | | * |
3509 | | * In this case load up just that segment ourselves, freeing any old |
3510 | | * small segments in the process. |
3511 | | */ |
3512 | | |
3513 | | /* Unmapped ref ID */ |
3514 | 0 | if (id < 0 || !fd->refs->fn) { |
3515 | 0 | if (fd->ref_free) { |
3516 | 0 | free(fd->ref_free); |
3517 | 0 | fd->ref_free = NULL; |
3518 | 0 | } |
3519 | 0 | fd->ref = NULL; |
3520 | 0 | fd->ref_id = id; |
3521 | 0 | pthread_mutex_unlock(&fd->refs->lock); |
3522 | 0 | pthread_mutex_unlock(&fd->ref_lock); |
3523 | 0 | return NULL; |
3524 | 0 | } |
3525 | | |
3526 | | /* Open file if it's not already the current open reference */ |
3527 | 0 | if (strcmp(fd->refs->fn, r->fn) || fd->refs->fp == NULL) { |
3528 | 0 | if (fd->refs->fp) |
3529 | 0 | if (bgzf_close(fd->refs->fp) != 0) |
3530 | 0 | return NULL; |
3531 | 0 | fd->refs->fn = r->fn; |
3532 | 0 | if (!(fd->refs->fp = bgzf_open_ref(fd->refs->fn, "r", r->is_md5))) { |
3533 | 0 | pthread_mutex_unlock(&fd->refs->lock); |
3534 | 0 | pthread_mutex_unlock(&fd->ref_lock); |
3535 | 0 | return NULL; |
3536 | 0 | } |
3537 | 0 | } |
3538 | | |
3539 | 0 | if (!(fd->ref = load_ref_portion(fd->refs->fp, r, start, end))) { |
3540 | 0 | pthread_mutex_unlock(&fd->refs->lock); |
3541 | 0 | pthread_mutex_unlock(&fd->ref_lock); |
3542 | 0 | return NULL; |
3543 | 0 | } |
3544 | | |
3545 | 0 | if (fd->ref_free) |
3546 | 0 | free(fd->ref_free); |
3547 | |
|
3548 | 0 | fd->ref_id = id; |
3549 | 0 | fd->ref_start = start; |
3550 | 0 | fd->ref_end = end; |
3551 | 0 | fd->ref_free = fd->ref; |
3552 | 0 | seq = fd->ref; |
3553 | |
|
3554 | 0 | pthread_mutex_unlock(&fd->refs->lock); |
3555 | 0 | pthread_mutex_unlock(&fd->ref_lock); |
3556 | |
|
3557 | 0 | return seq ? seq + ostart - start : NULL; |
3558 | 0 | } |
3559 | | |
3560 | | /* |
3561 | | * If fd has been opened for reading, it may be permitted to specify 'fn' |
3562 | | * as NULL and let the code auto-detect the reference by parsing the |
3563 | | * SAM header @SQ lines. |
3564 | | */ |
3565 | 0 | int cram_load_reference(cram_fd *fd, char *fn) { |
3566 | 0 | int ret = 0; |
3567 | |
|
3568 | 0 | if (fn) { |
3569 | 0 | fd->refs = refs_load_fai(fd->refs, fn, |
3570 | 0 | !(fd->embed_ref>0 && fd->mode == 'r')); |
3571 | 0 | fn = fd->refs ? fd->refs->fn : NULL; |
3572 | 0 | if (!fn) |
3573 | 0 | ret = -1; |
3574 | 0 | sanitise_SQ_lines(fd); |
3575 | 0 | } |
3576 | 0 | fd->ref_fn = fn; |
3577 | |
|
3578 | 0 | if ((!fd->refs || (fd->refs->nref == 0 && !fn)) && fd->header) { |
3579 | 0 | if (fd->refs) |
3580 | 0 | refs_free(fd->refs); |
3581 | 0 | if (!(fd->refs = refs_create())) |
3582 | 0 | return -1; |
3583 | 0 | if (-1 == refs_from_header(fd)) |
3584 | 0 | return -1; |
3585 | 0 | } |
3586 | | |
3587 | 0 | if (fd->header) |
3588 | 0 | if (-1 == refs2id(fd->refs, fd->header)) |
3589 | 0 | return -1; |
3590 | | |
3591 | 0 | return ret; |
3592 | 0 | } |
3593 | | |
3594 | | /* ---------------------------------------------------------------------- |
3595 | | * Containers |
3596 | | */ |
3597 | | |
3598 | | /* |
3599 | | * Creates a new container, specifying the maximum number of slices |
3600 | | * and records permitted. |
3601 | | * |
3602 | | * Returns cram_container ptr on success |
3603 | | * NULL on failure |
3604 | | */ |
3605 | 0 | cram_container *cram_new_container(int nrec, int nslice) { |
3606 | 0 | cram_container *c = calloc(1, sizeof(*c)); |
3607 | 0 | enum cram_DS_ID id; |
3608 | |
|
3609 | 0 | if (!c) |
3610 | 0 | return NULL; |
3611 | | |
3612 | 0 | c->curr_ref = -2; |
3613 | |
|
3614 | 0 | c->max_c_rec = nrec * nslice; |
3615 | 0 | c->curr_c_rec = 0; |
3616 | |
|
3617 | 0 | c->max_rec = nrec; |
3618 | 0 | c->record_counter = 0; |
3619 | 0 | c->num_bases = 0; |
3620 | 0 | c->s_num_bases = 0; |
3621 | |
|
3622 | 0 | c->max_slice = nslice; |
3623 | 0 | c->curr_slice = 0; |
3624 | |
|
3625 | 0 | c->pos_sorted = 1; |
3626 | 0 | c->max_apos = 0; |
3627 | 0 | c->multi_seq = 0; |
3628 | 0 | c->qs_seq_orient = 1; |
3629 | |
|
3630 | 0 | c->bams = NULL; |
3631 | |
|
3632 | 0 | if (!(c->slices = calloc(nslice != 0 ? nslice : 1, sizeof(cram_slice *)))) |
3633 | 0 | goto err; |
3634 | 0 | c->slice = NULL; |
3635 | |
|
3636 | 0 | if (!(c->comp_hdr = cram_new_compression_header())) |
3637 | 0 | goto err; |
3638 | 0 | c->comp_hdr_block = NULL; |
3639 | |
|
3640 | 0 | for (id = DS_RN; id < DS_TN; id++) |
3641 | 0 | if (!(c->stats[id] = cram_stats_create())) goto err; |
3642 | | |
3643 | | //c->aux_B_stats = cram_stats_create(); |
3644 | | |
3645 | 0 | if (!(c->tags_used = kh_init(m_tagmap))) |
3646 | 0 | goto err; |
3647 | 0 | c->refs_used = 0; |
3648 | 0 | c->ref_free = 0; |
3649 | |
|
3650 | 0 | return c; |
3651 | | |
3652 | 0 | err: |
3653 | 0 | if (c) { |
3654 | 0 | if (c->slices) |
3655 | 0 | free(c->slices); |
3656 | 0 | free(c); |
3657 | 0 | } |
3658 | 0 | return NULL; |
3659 | 0 | } |
3660 | | |
3661 | 3.94k | void cram_free_container(cram_container *c) { |
3662 | 3.94k | enum cram_DS_ID id; |
3663 | 3.94k | int i; |
3664 | | |
3665 | 3.94k | if (!c) |
3666 | 0 | return; |
3667 | | |
3668 | 3.94k | if (c->refs_used) |
3669 | 0 | free(c->refs_used); |
3670 | | |
3671 | 3.94k | if (c->landmark) |
3672 | 222 | free(c->landmark); |
3673 | | |
3674 | 3.94k | if (c->comp_hdr) |
3675 | 441 | cram_free_compression_header(c->comp_hdr); |
3676 | | |
3677 | 3.94k | if (c->comp_hdr_block) |
3678 | 640 | cram_free_block(c->comp_hdr_block); |
3679 | | |
3680 | | // Free the slices; filled out by encoder only |
3681 | 3.94k | if (c->slices) { |
3682 | 0 | for (i = 0; i < c->max_slice; i++) { |
3683 | 0 | if (c->slices[i]) |
3684 | 0 | cram_free_slice(c->slices[i]); |
3685 | 0 | if (c->slices[i] == c->slice) |
3686 | 0 | c->slice = NULL; |
3687 | 0 | } |
3688 | 0 | free(c->slices); |
3689 | 0 | } |
3690 | | |
3691 | | // Free the current slice; set by both encoder & decoder |
3692 | 3.94k | if (c->slice) { |
3693 | 0 | cram_free_slice(c->slice); |
3694 | 0 | c->slice = NULL; |
3695 | 0 | } |
3696 | | |
3697 | 114k | for (id = DS_RN; id < DS_TN; id++) |
3698 | 110k | if (c->stats[id]) cram_stats_free(c->stats[id]); |
3699 | | |
3700 | | //if (c->aux_B_stats) cram_stats_free(c->aux_B_stats); |
3701 | | |
3702 | 3.94k | if (c->tags_used) { |
3703 | 0 | khint_t k; |
3704 | |
|
3705 | 0 | for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) { |
3706 | 0 | if (!kh_exist(c->tags_used, k)) |
3707 | 0 | continue; |
3708 | | |
3709 | 0 | cram_tag_map *tm = (cram_tag_map *)kh_val(c->tags_used, k); |
3710 | 0 | if (tm) { |
3711 | 0 | cram_codec *c = tm->codec; |
3712 | |
|
3713 | 0 | if (c) c->free(c); |
3714 | 0 | free(tm); |
3715 | 0 | } |
3716 | 0 | } |
3717 | |
|
3718 | 0 | kh_destroy(m_tagmap, c->tags_used); |
3719 | 0 | } |
3720 | | |
3721 | 3.94k | if (c->ref_free) |
3722 | 0 | free(c->ref); |
3723 | | |
3724 | 3.94k | free(c); |
3725 | 3.94k | } |
3726 | | |
3727 | | /* |
3728 | | * Reads a container header. |
3729 | | * |
3730 | | * Returns cram_container on success |
3731 | | * NULL on failure or no container left (fd->err == 0). |
3732 | | */ |
3733 | 3.95k | cram_container *cram_read_container(cram_fd *fd) { |
3734 | 3.95k | cram_container c2, *c; |
3735 | 3.95k | int i, s; |
3736 | 3.95k | size_t rd = 0; |
3737 | 3.95k | uint32_t crc = 0; |
3738 | | |
3739 | 3.95k | fd->err = 0; |
3740 | 3.95k | fd->eof = 0; |
3741 | | |
3742 | 3.95k | memset(&c2, 0, sizeof(c2)); |
3743 | 3.95k | if (CRAM_MAJOR_VERS(fd->version) == 1) { |
3744 | 2.63k | if ((s = fd->vv.varint_decode32_crc(fd, &c2.length, &crc)) == -1) { |
3745 | 8 | fd->eof = fd->empty_container ? 1 : 2; |
3746 | 8 | return NULL; |
3747 | 2.62k | } else { |
3748 | 2.62k | rd+=s; |
3749 | 2.62k | } |
3750 | 2.63k | } else if (CRAM_MAJOR_VERS(fd->version) < 4) { |
3751 | 1.31k | uint32_t len; |
3752 | 1.31k | if ((s = int32_decode(fd, &c2.length)) == -1) { |
3753 | 0 | if (CRAM_MAJOR_VERS(fd->version) == 2 && |
3754 | 0 | CRAM_MINOR_VERS(fd->version) == 0) |
3755 | 0 | fd->eof = 1; // EOF blocks arrived in v2.1 |
3756 | 0 | else |
3757 | 0 | fd->eof = fd->empty_container ? 1 : 2; |
3758 | 0 | return NULL; |
3759 | 1.31k | } else { |
3760 | 1.31k | rd+=s; |
3761 | 1.31k | } |
3762 | 1.31k | len = le_int4(c2.length); |
3763 | 1.31k | crc = crc32(0L, (unsigned char *)&len, 4); |
3764 | 1.31k | } else { |
3765 | 3 | if ((s = fd->vv.varint_decode32_crc(fd, &c2.length, &crc)) == -1) { |
3766 | 0 | fd->eof = fd->empty_container ? 1 : 2; |
3767 | 0 | return NULL; |
3768 | 3 | } else { |
3769 | 3 | rd+=s; |
3770 | 3 | } |
3771 | 3 | } |
3772 | 3.94k | if ((s = fd->vv.varint_decode32s_crc(fd, &c2.ref_seq_id, &crc)) == -1) return NULL; else rd+=s; |
3773 | 3.94k | if (CRAM_MAJOR_VERS(fd->version) >= 4) { |
3774 | 3 | int64_t i64; |
3775 | 3 | if ((s = fd->vv.varint_decode64_crc(fd, &i64, &crc))== -1) return NULL; else rd+=s; |
3776 | 3 | c2.ref_seq_start = i64; |
3777 | 3 | if ((s = fd->vv.varint_decode64_crc(fd, &i64, &crc)) == -1) return NULL; else rd+=s; |
3778 | 3 | c2.ref_seq_span = i64; |
3779 | 3.94k | } else { |
3780 | 3.94k | int32_t i32; |
3781 | 3.94k | if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc))== -1) return NULL; else rd+=s; |
3782 | 3.94k | c2.ref_seq_start = i32; |
3783 | 3.94k | if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc)) == -1) return NULL; else rd+=s; |
3784 | 3.94k | c2.ref_seq_span = i32; |
3785 | 3.94k | } |
3786 | 3.94k | if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_records, &crc)) == -1) return NULL; else rd+=s; |
3787 | | |
3788 | 3.94k | if (CRAM_MAJOR_VERS(fd->version) == 1) { |
3789 | 2.62k | c2.record_counter = 0; |
3790 | 2.62k | c2.num_bases = 0; |
3791 | 2.62k | } else { |
3792 | 1.32k | if (CRAM_MAJOR_VERS(fd->version) >= 3) { |
3793 | 3 | if ((s = fd->vv.varint_decode64_crc(fd, &c2.record_counter, &crc)) == -1) |
3794 | 0 | return NULL; |
3795 | 3 | else |
3796 | 3 | rd += s; |
3797 | 1.31k | } else { |
3798 | 1.31k | int32_t i32; |
3799 | 1.31k | if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc)) == -1) |
3800 | 0 | return NULL; |
3801 | 1.31k | else |
3802 | 1.31k | rd += s; |
3803 | 1.31k | c2.record_counter = i32; |
3804 | 1.31k | } |
3805 | | |
3806 | 1.32k | if ((s = fd->vv.varint_decode64_crc(fd, &c2.num_bases, &crc))== -1) |
3807 | 0 | return NULL; |
3808 | 1.32k | else |
3809 | 1.32k | rd += s; |
3810 | 1.32k | } |
3811 | 3.94k | if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_blocks, &crc)) == -1) |
3812 | 2 | return NULL; |
3813 | 3.94k | else |
3814 | 3.94k | rd+=s; |
3815 | 3.94k | if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_landmarks, &crc))== -1) |
3816 | 0 | return NULL; |
3817 | 3.94k | else |
3818 | 3.94k | rd+=s; |
3819 | | |
3820 | 3.94k | if (c2.num_landmarks < 0 || c2.num_landmarks >= SIZE_MAX / sizeof(int32_t)) |
3821 | 1 | return NULL; |
3822 | | |
3823 | 3.94k | if (!(c = calloc(1, sizeof(*c)))) |
3824 | 0 | return NULL; |
3825 | | |
3826 | 3.94k | *c = c2; |
3827 | | |
3828 | 3.94k | if (c->num_landmarks && !(c->landmark = malloc(c->num_landmarks * sizeof(int32_t)))) { |
3829 | 0 | fd->err = errno; |
3830 | 0 | cram_free_container(c); |
3831 | 0 | return NULL; |
3832 | 0 | } |
3833 | 511k | for (i = 0; i < c->num_landmarks; i++) { |
3834 | 508k | if ((s = fd->vv.varint_decode32_crc(fd, &c->landmark[i], &crc)) == -1) { |
3835 | 12 | cram_free_container(c); |
3836 | 12 | return NULL; |
3837 | 508k | } else { |
3838 | 508k | rd += s; |
3839 | 508k | } |
3840 | 508k | } |
3841 | | |
3842 | 3.93k | if (CRAM_MAJOR_VERS(fd->version) >= 3) { |
3843 | 2 | if (-1 == int32_decode(fd, (int32_t *)&c->crc32)) { |
3844 | 0 | cram_free_container(c); |
3845 | 0 | return NULL; |
3846 | 2 | } else { |
3847 | 2 | rd+=4; |
3848 | 2 | } |
3849 | | |
3850 | 2 | if (crc != c->crc32) { |
3851 | 2 | hts_log_error("Container header CRC32 failure"); |
3852 | 2 | cram_free_container(c); |
3853 | 2 | return NULL; |
3854 | 2 | } |
3855 | 2 | } |
3856 | | |
3857 | 3.92k | c->offset = rd; |
3858 | 3.92k | c->slices = NULL; |
3859 | 3.92k | c->slice = NULL; |
3860 | 3.92k | c->curr_slice = 0; |
3861 | 3.92k | c->max_slice = c->num_landmarks; |
3862 | 3.92k | c->slice_rec = 0; |
3863 | 3.92k | c->curr_rec = 0; |
3864 | 3.92k | c->max_rec = 0; |
3865 | | |
3866 | 3.92k | if (c->ref_seq_id == -2) { |
3867 | 0 | c->multi_seq = 1; |
3868 | 0 | fd->multi_seq = 1; |
3869 | 0 | } |
3870 | | |
3871 | 3.92k | fd->empty_container = |
3872 | 3.92k | (c->num_records == 0 && |
3873 | 3.92k | c->ref_seq_id == -1 && |
3874 | 3.92k | c->ref_seq_start == 0x454f46 /* EOF */) ? 1 : 0; |
3875 | | |
3876 | 3.92k | return c; |
3877 | 3.93k | } |
3878 | | |
3879 | | |
3880 | | /* MAXIMUM storage size needed for the container. */ |
3881 | 0 | int cram_container_size(cram_container *c) { |
3882 | 0 | return 55 + 5*c->num_landmarks; |
3883 | 0 | } |
3884 | | |
3885 | | |
3886 | | /* |
3887 | | * Stores the container structure in dat and returns *size as the |
3888 | | * number of bytes written to dat[]. The input size of dat is also |
3889 | | * held in *size and should be initialised to cram_container_size(c). |
3890 | | * |
3891 | | * Returns 0 on success; |
3892 | | * -1 on failure |
3893 | | */ |
3894 | | int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size) |
3895 | 0 | { |
3896 | 0 | char *cp = (char *)dat; |
3897 | 0 | int i; |
3898 | | |
3899 | | // Check the input buffer is large enough according to our stated |
3900 | | // requirements. (NOTE: it may actually take less.) |
3901 | 0 | if (cram_container_size(c) > *size) |
3902 | 0 | return -1; |
3903 | | |
3904 | 0 | if (CRAM_MAJOR_VERS(fd->version) == 1) { |
3905 | 0 | cp += itf8_put(cp, c->length); |
3906 | 0 | } else { |
3907 | 0 | *(int32_t *)cp = le_int4(c->length); |
3908 | 0 | cp += 4; |
3909 | 0 | } |
3910 | 0 | if (c->multi_seq) { |
3911 | 0 | cp += fd->vv.varint_put32(cp, NULL, -2); |
3912 | 0 | cp += fd->vv.varint_put32(cp, NULL, 0); |
3913 | 0 | cp += fd->vv.varint_put32(cp, NULL, 0); |
3914 | 0 | } else { |
3915 | 0 | cp += fd->vv.varint_put32s(cp, NULL, c->ref_seq_id); |
3916 | 0 | if (CRAM_MAJOR_VERS(fd->version) >= 4) { |
3917 | 0 | cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_start); |
3918 | 0 | cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_span); |
3919 | 0 | } else { |
3920 | 0 | cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_start); |
3921 | 0 | cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_span); |
3922 | 0 | } |
3923 | 0 | } |
3924 | 0 | cp += fd->vv.varint_put32(cp, NULL, c->num_records); |
3925 | 0 | if (CRAM_MAJOR_VERS(fd->version) == 2) { |
3926 | 0 | cp += fd->vv.varint_put64(cp, NULL, c->record_counter); |
3927 | 0 | } else if (CRAM_MAJOR_VERS(fd->version) >= 3) { |
3928 | 0 | cp += fd->vv.varint_put32(cp, NULL, c->record_counter); |
3929 | 0 | } |
3930 | 0 | cp += fd->vv.varint_put64(cp, NULL, c->num_bases); |
3931 | 0 | cp += fd->vv.varint_put32(cp, NULL, c->num_blocks); |
3932 | 0 | cp += fd->vv.varint_put32(cp, NULL, c->num_landmarks); |
3933 | 0 | for (i = 0; i < c->num_landmarks; i++) |
3934 | 0 | cp += fd->vv.varint_put32(cp, NULL, c->landmark[i]); |
3935 | |
|
3936 | 0 | if (CRAM_MAJOR_VERS(fd->version) >= 3) { |
3937 | 0 | c->crc32 = crc32(0L, (uc *)dat, cp-dat); |
3938 | 0 | cp[0] = c->crc32 & 0xff; |
3939 | 0 | cp[1] = (c->crc32 >> 8) & 0xff; |
3940 | 0 | cp[2] = (c->crc32 >> 16) & 0xff; |
3941 | 0 | cp[3] = (c->crc32 >> 24) & 0xff; |
3942 | 0 | cp += 4; |
3943 | 0 | } |
3944 | |
|
3945 | 0 | *size = cp-dat; // actual used size |
3946 | |
|
3947 | 0 | return 0; |
3948 | 0 | } |
3949 | | |
3950 | | |
3951 | | /* |
3952 | | * Writes a container structure. |
3953 | | * |
3954 | | * Returns 0 on success |
3955 | | * -1 on failure |
3956 | | */ |
3957 | 0 | int cram_write_container(cram_fd *fd, cram_container *c) { |
3958 | 0 | char buf_a[1024], *buf = buf_a, *cp; |
3959 | 0 | int i; |
3960 | |
|
3961 | 0 | if (61 + c->num_landmarks * 10 >= 1024) { |
3962 | 0 | buf = malloc(61 + c->num_landmarks * 10); |
3963 | 0 | if (!buf) |
3964 | 0 | return -1; |
3965 | 0 | } |
3966 | 0 | cp = buf; |
3967 | |
|
3968 | 0 | if (CRAM_MAJOR_VERS(fd->version) == 1) { |
3969 | 0 | cp += itf8_put(cp, c->length); |
3970 | 0 | } else if (CRAM_MAJOR_VERS(fd->version) <= 3) { |
3971 | 0 | *(int32_t *)cp = le_int4(c->length); |
3972 | 0 | cp += 4; |
3973 | 0 | } else { |
3974 | 0 | cp += fd->vv.varint_put32(cp, NULL, c->length); |
3975 | 0 | } |
3976 | 0 | if (c->multi_seq) { |
3977 | 0 | cp += fd->vv.varint_put32(cp, NULL, (uint32_t)-2); |
3978 | 0 | cp += fd->vv.varint_put32(cp, NULL, 0); |
3979 | 0 | cp += fd->vv.varint_put32(cp, NULL, 0); |
3980 | 0 | } else { |
3981 | 0 | cp += fd->vv.varint_put32s(cp, NULL, c->ref_seq_id); |
3982 | 0 | if (CRAM_MAJOR_VERS(fd->version) >= 4) { |
3983 | 0 | cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_start); |
3984 | 0 | cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_span); |
3985 | 0 | } else { |
3986 | 0 | cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_start); |
3987 | 0 | cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_span); |
3988 | 0 | } |
3989 | 0 | } |
3990 | 0 | cp += fd->vv.varint_put32(cp, NULL, c->num_records); |
3991 | 0 | if (CRAM_MAJOR_VERS(fd->version) >= 3) |
3992 | 0 | cp += fd->vv.varint_put64(cp, NULL, c->record_counter); |
3993 | 0 | else |
3994 | 0 | cp += fd->vv.varint_put32(cp, NULL, c->record_counter); |
3995 | 0 | cp += fd->vv.varint_put64(cp, NULL, c->num_bases); |
3996 | 0 | cp += fd->vv.varint_put32(cp, NULL, c->num_blocks); |
3997 | 0 | cp += fd->vv.varint_put32(cp, NULL, c->num_landmarks); |
3998 | 0 | for (i = 0; i < c->num_landmarks; i++) |
3999 | 0 | cp += fd->vv.varint_put32(cp, NULL, c->landmark[i]); |
4000 | |
|
4001 | 0 | if (CRAM_MAJOR_VERS(fd->version) >= 3) { |
4002 | 0 | c->crc32 = crc32(0L, (uc *)buf, cp-buf); |
4003 | 0 | cp[0] = c->crc32 & 0xff; |
4004 | 0 | cp[1] = (c->crc32 >> 8) & 0xff; |
4005 | 0 | cp[2] = (c->crc32 >> 16) & 0xff; |
4006 | 0 | cp[3] = (c->crc32 >> 24) & 0xff; |
4007 | 0 | cp += 4; |
4008 | 0 | } |
4009 | |
|
4010 | 0 | if (cp-buf != hwrite(fd->fp, buf, cp-buf)) { |
4011 | 0 | if (buf != buf_a) |
4012 | 0 | free(buf); |
4013 | 0 | return -1; |
4014 | 0 | } |
4015 | | |
4016 | 0 | if (buf != buf_a) |
4017 | 0 | free(buf); |
4018 | |
|
4019 | 0 | return 0; |
4020 | 0 | } |
4021 | | |
4022 | | // common component shared by cram_flush_container{,_mt} |
4023 | 0 | static int cram_flush_container2(cram_fd *fd, cram_container *c) { |
4024 | 0 | int i, j; |
4025 | |
|
4026 | 0 | if (c->curr_slice > 0 && !c->slices) |
4027 | 0 | return -1; |
4028 | | |
4029 | | //fprintf(stderr, "Writing container %d, sum %u\n", c->record_counter, sum); |
4030 | | |
4031 | 0 | off_t c_offset = htell(fd->fp); // File offset of container |
4032 | | |
4033 | | /* Write the container struct itself */ |
4034 | 0 | if (0 != cram_write_container(fd, c)) |
4035 | 0 | return -1; |
4036 | | |
4037 | 0 | off_t hdr_size = htell(fd->fp) - c_offset; |
4038 | | |
4039 | | /* And the compression header */ |
4040 | 0 | if (0 != cram_write_block(fd, c->comp_hdr_block)) |
4041 | 0 | return -1; |
4042 | | |
4043 | | /* Followed by the slice blocks */ |
4044 | 0 | off_t file_offset = htell(fd->fp); |
4045 | 0 | for (i = 0; i < c->curr_slice; i++) { |
4046 | 0 | cram_slice *s = c->slices[i]; |
4047 | 0 | off_t spos = file_offset - c_offset - hdr_size; |
4048 | |
|
4049 | 0 | if (0 != cram_write_block(fd, s->hdr_block)) |
4050 | 0 | return -1; |
4051 | | |
4052 | 0 | for (j = 0; j < s->hdr->num_blocks; j++) { |
4053 | 0 | if (0 != cram_write_block(fd, s->block[j])) |
4054 | 0 | return -1; |
4055 | 0 | } |
4056 | | |
4057 | 0 | file_offset = htell(fd->fp); |
4058 | 0 | off_t sz = file_offset - c_offset - hdr_size - spos; |
4059 | |
|
4060 | 0 | if (fd->idxfp) { |
4061 | 0 | if (cram_index_slice(fd, c, s, fd->idxfp, c_offset, spos, sz) < 0) |
4062 | 0 | return -1; |
4063 | 0 | } |
4064 | 0 | } |
4065 | | |
4066 | 0 | return 0; |
4067 | 0 | } |
4068 | | |
4069 | | /* |
4070 | | * Flushes a completely or partially full container to disk, writing |
4071 | | * container structure, header and blocks. This also calls the encoder |
4072 | | * functions. |
4073 | | * |
4074 | | * Returns 0 on success |
4075 | | * -1 on failure |
4076 | | */ |
4077 | 0 | int cram_flush_container(cram_fd *fd, cram_container *c) { |
4078 | | /* Encode the container blocks and generate compression header */ |
4079 | 0 | if (0 != cram_encode_container(fd, c)) |
4080 | 0 | return -1; |
4081 | | |
4082 | 0 | return cram_flush_container2(fd, c); |
4083 | 0 | } |
4084 | | |
4085 | | typedef struct { |
4086 | | cram_fd *fd; |
4087 | | cram_container *c; |
4088 | | } cram_job; |
4089 | | |
4090 | 0 | void *cram_flush_thread(void *arg) { |
4091 | 0 | cram_job *j = (cram_job *)arg; |
4092 | | |
4093 | | /* Encode the container blocks and generate compression header */ |
4094 | 0 | if (0 != cram_encode_container(j->fd, j->c)) { |
4095 | 0 | hts_log_error("Call to cram_encode_container failed"); |
4096 | 0 | return NULL; |
4097 | 0 | } |
4098 | | |
4099 | 0 | return arg; |
4100 | 0 | } |
4101 | | |
4102 | 0 | static int cram_flush_result(cram_fd *fd) { |
4103 | 0 | int i, ret = 0; |
4104 | 0 | hts_tpool_result *r; |
4105 | 0 | cram_container *lc = NULL; |
4106 | | |
4107 | | // NB: we can have one result per slice, not per container, |
4108 | | // so we need to free the container only after all slices |
4109 | | // within it have been freed. (Automatic via reference counting.) |
4110 | 0 | while ((r = hts_tpool_next_result(fd->rqueue))) { |
4111 | 0 | cram_job *j = (cram_job *)hts_tpool_result_data(r); |
4112 | 0 | cram_container *c; |
4113 | |
|
4114 | 0 | if (!j) { |
4115 | 0 | hts_tpool_delete_result(r, 0); |
4116 | 0 | return -1; |
4117 | 0 | } |
4118 | | |
4119 | 0 | fd = j->fd; |
4120 | 0 | c = j->c; |
4121 | |
|
4122 | 0 | if (fd->mode == 'w') |
4123 | 0 | if (0 != cram_flush_container2(fd, c)) |
4124 | 0 | return -1; |
4125 | | |
4126 | | // Free the slices; filled out by encoder only |
4127 | 0 | if (c->slices) { |
4128 | 0 | for (i = 0; i < c->max_slice; i++) { |
4129 | 0 | if (c->slices[i]) |
4130 | 0 | cram_free_slice(c->slices[i]); |
4131 | 0 | if (c->slices[i] == c->slice) |
4132 | 0 | c->slice = NULL; |
4133 | 0 | c->slices[i] = NULL; |
4134 | 0 | } |
4135 | 0 | } |
4136 | | |
4137 | | // Free the current slice; set by both encoder & decoder |
4138 | 0 | if (c->slice) { |
4139 | 0 | cram_free_slice(c->slice); |
4140 | 0 | c->slice = NULL; |
4141 | 0 | } |
4142 | 0 | c->curr_slice = 0; |
4143 | | |
4144 | | // Our jobs will be in order, so we free the last |
4145 | | // container when our job has switched to a new one. |
4146 | 0 | if (c != lc) { |
4147 | 0 | if (lc) { |
4148 | 0 | if (fd->ctr == lc) |
4149 | 0 | fd->ctr = NULL; |
4150 | 0 | if (fd->ctr_mt == lc) |
4151 | 0 | fd->ctr_mt = NULL; |
4152 | 0 | cram_free_container(lc); |
4153 | 0 | } |
4154 | 0 | lc = c; |
4155 | 0 | } |
4156 | |
|
4157 | 0 | hts_tpool_delete_result(r, 1); |
4158 | 0 | } |
4159 | 0 | if (lc) { |
4160 | 0 | if (fd->ctr == lc) |
4161 | 0 | fd->ctr = NULL; |
4162 | 0 | if (fd->ctr_mt == lc) |
4163 | 0 | fd->ctr_mt = NULL; |
4164 | 0 | cram_free_container(lc); |
4165 | 0 | } |
4166 | |
|
4167 | 0 | return ret; |
4168 | 0 | } |
4169 | | |
4170 | | // Note: called while metrics_lock is held. |
4171 | | // Will be left in this state too, but may temporarily unlock. |
4172 | 0 | void reset_metrics(cram_fd *fd) { |
4173 | 0 | int i; |
4174 | |
|
4175 | 0 | if (fd->pool) { |
4176 | | // If multi-threaded we have multiple blocks being |
4177 | | // compressed already and several on the to-do list |
4178 | | // (fd->rqueue->pending). It's tricky to reset the |
4179 | | // metrics exactly the correct point, so instead we |
4180 | | // just flush the pool, reset, and then continue again. |
4181 | | |
4182 | | // Don't bother starting a new trial before then though. |
4183 | 0 | for (i = 0; i < DS_END; i++) { |
4184 | 0 | cram_metrics *m = fd->m[i]; |
4185 | 0 | if (!m) |
4186 | 0 | continue; |
4187 | 0 | m->next_trial = 999; |
4188 | 0 | } |
4189 | |
|
4190 | 0 | pthread_mutex_unlock(&fd->metrics_lock); |
4191 | 0 | hts_tpool_process_flush(fd->rqueue); |
4192 | 0 | pthread_mutex_lock(&fd->metrics_lock); |
4193 | 0 | } |
4194 | |
|
4195 | 0 | for (i = 0; i < DS_END; i++) { |
4196 | 0 | cram_metrics *m = fd->m[i]; |
4197 | 0 | if (!m) |
4198 | 0 | continue; |
4199 | | |
4200 | 0 | m->trial = NTRIALS; |
4201 | 0 | m->next_trial = TRIAL_SPAN; |
4202 | 0 | m->revised_method = 0; |
4203 | 0 | m->unpackable = 0; |
4204 | |
|
4205 | 0 | memset(m->sz, 0, sizeof(m->sz)); |
4206 | 0 | } |
4207 | 0 | } |
4208 | | |
4209 | 0 | int cram_flush_container_mt(cram_fd *fd, cram_container *c) { |
4210 | 0 | cram_job *j; |
4211 | | |
4212 | | // At the junction of mapped to unmapped data the compression |
4213 | | // methods may need to change due to very different statistical |
4214 | | // properties; particularly BA if minhash sorted. |
4215 | | // |
4216 | | // However with threading we'll have several in-flight blocks |
4217 | | // arriving out of order. |
4218 | | // |
4219 | | // So we do one trial reset of NThreads to last for NThreads |
4220 | | // duration to get us over this transition period, followed |
4221 | | // by another retrial of the usual ntrials & trial span. |
4222 | 0 | pthread_mutex_lock(&fd->metrics_lock); |
4223 | 0 | if (c->n_mapped < 0.3*c->curr_rec && |
4224 | 0 | fd->last_mapped > 0.7*c->max_rec) { |
4225 | 0 | reset_metrics(fd); |
4226 | 0 | } |
4227 | 0 | fd->last_mapped = c->n_mapped * (c->max_rec+1)/(c->curr_rec+1) ; |
4228 | 0 | pthread_mutex_unlock(&fd->metrics_lock); |
4229 | |
|
4230 | 0 | if (!fd->pool) |
4231 | 0 | return cram_flush_container(fd, c); |
4232 | | |
4233 | 0 | if (!(j = malloc(sizeof(*j)))) |
4234 | 0 | return -1; |
4235 | 0 | j->fd = fd; |
4236 | 0 | j->c = c; |
4237 | | |
4238 | | // Flush the job. Note our encoder queue may be full, so we |
4239 | | // either have to keep trying in non-blocking mode (what we do) or |
4240 | | // use a dedicated separate thread for draining the queue. |
4241 | 0 | for (;;) { |
4242 | 0 | errno = 0; |
4243 | 0 | hts_tpool_dispatch2(fd->pool, fd->rqueue, cram_flush_thread, j, 1); |
4244 | 0 | int pending = (errno == EAGAIN); |
4245 | 0 | if (cram_flush_result(fd) != 0) |
4246 | 0 | return -1; |
4247 | 0 | if (!pending) |
4248 | 0 | break; |
4249 | | |
4250 | 0 | usleep(1000); |
4251 | 0 | } |
4252 | | |
4253 | 0 | return 0; |
4254 | 0 | } |
4255 | | |
4256 | | /* ---------------------------------------------------------------------- |
4257 | | * Compression headers; the first part of the container |
4258 | | */ |
4259 | | |
4260 | | /* |
4261 | | * Creates a new blank container compression header |
4262 | | * |
4263 | | * Returns header ptr on success |
4264 | | * NULL on failure |
4265 | | */ |
4266 | 0 | cram_block_compression_hdr *cram_new_compression_header(void) { |
4267 | 0 | cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr)); |
4268 | 0 | if (!hdr) |
4269 | 0 | return NULL; |
4270 | | |
4271 | 0 | if (!(hdr->TD_blk = cram_new_block(CORE, 0))) { |
4272 | 0 | free(hdr); |
4273 | 0 | return NULL; |
4274 | 0 | } |
4275 | | |
4276 | 0 | if (!(hdr->TD_hash = kh_init(m_s2i))) { |
4277 | 0 | cram_free_block(hdr->TD_blk); |
4278 | 0 | free(hdr); |
4279 | 0 | return NULL; |
4280 | 0 | } |
4281 | | |
4282 | 0 | if (!(hdr->TD_keys = string_pool_create(8192))) { |
4283 | 0 | kh_destroy(m_s2i, hdr->TD_hash); |
4284 | 0 | cram_free_block(hdr->TD_blk); |
4285 | 0 | free(hdr); |
4286 | 0 | return NULL; |
4287 | 0 | } |
4288 | | |
4289 | 0 | return hdr; |
4290 | 0 | } |
4291 | | |
4292 | 498 | void cram_free_compression_header(cram_block_compression_hdr *hdr) { |
4293 | 498 | int i; |
4294 | | |
4295 | 498 | if (hdr->landmark) |
4296 | 498 | free(hdr->landmark); |
4297 | | |
4298 | 498 | if (hdr->preservation_map) |
4299 | 498 | kh_destroy(map, hdr->preservation_map); |
4300 | | |
4301 | 16.4k | for (i = 0; i < CRAM_MAP_HASH; i++) { |
4302 | 15.9k | cram_map *m, *m2; |
4303 | 19.7k | for (m = hdr->rec_encoding_map[i]; m; m = m2) { |
4304 | 3.78k | m2 = m->next; |
4305 | 3.78k | if (m->codec) |
4306 | 0 | m->codec->free(m->codec); |
4307 | 3.78k | free(m); |
4308 | 3.78k | } |
4309 | 15.9k | } |
4310 | | |
4311 | 16.4k | for (i = 0; i < CRAM_MAP_HASH; i++) { |
4312 | 15.9k | cram_map *m, *m2; |
4313 | 16.0k | for (m = hdr->tag_encoding_map[i]; m; m = m2) { |
4314 | 157 | m2 = m->next; |
4315 | 157 | if (m->codec) |
4316 | 157 | m->codec->free(m->codec); |
4317 | 157 | free(m); |
4318 | 157 | } |
4319 | 15.9k | } |
4320 | | |
4321 | 23.9k | for (i = 0; i < DS_END; i++) { |
4322 | 23.4k | if (hdr->codecs[i]) |
4323 | 99 | hdr->codecs[i]->free(hdr->codecs[i]); |
4324 | 23.4k | } |
4325 | | |
4326 | 498 | if (hdr->TL) |
4327 | 3 | free(hdr->TL); |
4328 | 498 | if (hdr->TD_blk) |
4329 | 3 | cram_free_block(hdr->TD_blk); |
4330 | 498 | if (hdr->TD_hash) |
4331 | 498 | kh_destroy(m_s2i, hdr->TD_hash); |
4332 | 498 | if (hdr->TD_keys) |
4333 | 0 | string_pool_destroy(hdr->TD_keys); |
4334 | | |
4335 | 498 | free(hdr); |
4336 | 498 | } |
4337 | | |
4338 | | |
4339 | | /* ---------------------------------------------------------------------- |
4340 | | * Slices and slice headers |
4341 | | */ |
4342 | | |
4343 | 110 | void cram_free_slice_header(cram_block_slice_hdr *hdr) { |
4344 | 110 | if (!hdr) |
4345 | 0 | return; |
4346 | | |
4347 | 110 | if (hdr->block_content_ids) |
4348 | 110 | free(hdr->block_content_ids); |
4349 | | |
4350 | 110 | free(hdr); |
4351 | | |
4352 | 110 | return; |
4353 | 110 | } |
4354 | | |
4355 | 118 | void cram_free_slice(cram_slice *s) { |
4356 | 118 | if (!s) |
4357 | 0 | return; |
4358 | | |
4359 | 118 | if (s->hdr_block) |
4360 | 105 | cram_free_block(s->hdr_block); |
4361 | | |
4362 | 118 | if (s->block) { |
4363 | 110 | int i; |
4364 | | |
4365 | 110 | if (s->hdr) { |
4366 | 798 | for (i = 0; i < s->hdr->num_blocks; i++) { |
4367 | 688 | if (i > 0 && s->block[i] == s->block[0]) |
4368 | 246 | continue; |
4369 | 442 | cram_free_block(s->block[i]); |
4370 | 442 | } |
4371 | 110 | } |
4372 | 110 | free(s->block); |
4373 | 110 | } |
4374 | | |
4375 | 118 | if (s->block_by_id) |
4376 | 105 | free(s->block_by_id); |
4377 | | |
4378 | 118 | if (s->hdr) |
4379 | 110 | cram_free_slice_header(s->hdr); |
4380 | | |
4381 | 118 | if (s->seqs_blk) |
4382 | 105 | cram_free_block(s->seqs_blk); |
4383 | | |
4384 | 118 | if (s->qual_blk) |
4385 | 105 | cram_free_block(s->qual_blk); |
4386 | | |
4387 | 118 | if (s->name_blk) |
4388 | 105 | cram_free_block(s->name_blk); |
4389 | | |
4390 | 118 | if (s->aux_blk) |
4391 | 105 | cram_free_block(s->aux_blk); |
4392 | | |
4393 | 118 | if (s->base_blk) |
4394 | 105 | cram_free_block(s->base_blk); |
4395 | | |
4396 | 118 | if (s->soft_blk) |
4397 | 105 | cram_free_block(s->soft_blk); |
4398 | | |
4399 | 118 | if (s->cigar) |
4400 | 105 | free(s->cigar); |
4401 | | |
4402 | 118 | if (s->crecs) |
4403 | 104 | free(s->crecs); |
4404 | | |
4405 | 118 | if (s->features) |
4406 | 0 | free(s->features); |
4407 | | |
4408 | 118 | if (s->TN) |
4409 | 0 | free(s->TN); |
4410 | | |
4411 | 118 | if (s->pair_keys) |
4412 | 0 | string_pool_destroy(s->pair_keys); |
4413 | | |
4414 | 118 | if (s->pair[0]) |
4415 | 118 | kh_destroy(m_s2i, s->pair[0]); |
4416 | 118 | if (s->pair[1]) |
4417 | 118 | kh_destroy(m_s2i, s->pair[1]); |
4418 | | |
4419 | 118 | if (s->aux_block) |
4420 | 0 | free(s->aux_block); |
4421 | | |
4422 | 118 | free(s); |
4423 | 118 | } |
4424 | | |
4425 | | /* |
4426 | | * Creates a new empty slice in memory, for subsequent writing to |
4427 | | * disk. |
4428 | | * |
4429 | | * Returns cram_slice ptr on success |
4430 | | * NULL on failure |
4431 | | */ |
4432 | 0 | cram_slice *cram_new_slice(enum cram_content_type type, int nrecs) { |
4433 | 0 | cram_slice *s = calloc(1, sizeof(*s)); |
4434 | 0 | if (!s) |
4435 | 0 | return NULL; |
4436 | | |
4437 | 0 | if (!(s->hdr = (cram_block_slice_hdr *)calloc(1, sizeof(*s->hdr)))) |
4438 | 0 | goto err; |
4439 | 0 | s->hdr->content_type = type; |
4440 | |
|
4441 | 0 | s->hdr_block = NULL; |
4442 | 0 | s->block = NULL; |
4443 | 0 | s->block_by_id = NULL; |
4444 | 0 | s->last_apos = 0; |
4445 | 0 | if (!(s->crecs = malloc(nrecs * sizeof(cram_record)))) goto err; |
4446 | 0 | s->cigar_alloc = 1024; |
4447 | 0 | if (!(s->cigar = malloc(s->cigar_alloc * sizeof(*s->cigar)))) goto err; |
4448 | 0 | s->ncigar = 0; |
4449 | |
|
4450 | 0 | if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0))) goto err; |
4451 | 0 | if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS))) goto err; |
4452 | 0 | if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN))) goto err; |
4453 | 0 | if (!(s->aux_blk = cram_new_block(EXTERNAL, DS_aux))) goto err; |
4454 | 0 | if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN))) goto err; |
4455 | 0 | if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC))) goto err; |
4456 | | |
4457 | 0 | s->features = NULL; |
4458 | 0 | s->nfeatures = s->afeatures = 0; |
4459 | |
|
4460 | 0 | #ifndef TN_external |
4461 | 0 | s->TN = NULL; |
4462 | 0 | s->nTN = s->aTN = 0; |
4463 | 0 | #endif |
4464 | | |
4465 | | // Volatile keys as we do realloc in dstring |
4466 | 0 | if (!(s->pair_keys = string_pool_create(8192))) goto err; |
4467 | 0 | if (!(s->pair[0] = kh_init(m_s2i))) goto err; |
4468 | 0 | if (!(s->pair[1] = kh_init(m_s2i))) goto err; |
4469 | | |
4470 | | #ifdef BA_external |
4471 | | s->BA_len = 0; |
4472 | | #endif |
4473 | | |
4474 | 0 | return s; |
4475 | | |
4476 | 0 | err: |
4477 | 0 | if (s) |
4478 | 0 | cram_free_slice(s); |
4479 | |
|
4480 | 0 | return NULL; |
4481 | 0 | } |
4482 | | |
4483 | | /* |
4484 | | * Loads an entire slice. |
4485 | | * FIXME: In 1.0 the native unit of slices within CRAM is broken |
4486 | | * as slices contain references to objects in other slices. |
4487 | | * To work around this while keeping the slice oriented outer loop |
4488 | | * we read all slices and stitch them together into a fake large |
4489 | | * slice instead. |
4490 | | * |
4491 | | * Returns cram_slice ptr on success |
4492 | | * NULL on failure |
4493 | | */ |
4494 | 118 | cram_slice *cram_read_slice(cram_fd *fd) { |
4495 | 118 | cram_block *b = cram_read_block(fd); |
4496 | 118 | cram_slice *s = calloc(1, sizeof(*s)); |
4497 | 118 | int i, n, max_id, min_id; |
4498 | | |
4499 | 118 | if (!b || !s) |
4500 | 4 | goto err; |
4501 | | |
4502 | 114 | s->hdr_block = b; |
4503 | 114 | switch (b->content_type) { |
4504 | 113 | case MAPPED_SLICE: |
4505 | 113 | case UNMAPPED_SLICE: |
4506 | 113 | if (!(s->hdr = cram_decode_slice_header(fd, b))) |
4507 | 3 | goto err; |
4508 | 110 | break; |
4509 | | |
4510 | 110 | default: |
4511 | 1 | hts_log_error("Unexpected block of type %s", |
4512 | 1 | cram_content_type2str(b->content_type)); |
4513 | 1 | goto err; |
4514 | 114 | } |
4515 | | |
4516 | 110 | if (s->hdr->num_blocks < 1) { |
4517 | 0 | hts_log_error("Slice does not include any data blocks"); |
4518 | 0 | goto err; |
4519 | 0 | } |
4520 | | |
4521 | 110 | s->block = calloc(n = s->hdr->num_blocks, sizeof(*s->block)); |
4522 | 110 | if (!s->block) |
4523 | 0 | goto err; |
4524 | | |
4525 | 720 | for (max_id = i = 0, min_id = INT_MAX; i < n; i++) { |
4526 | 615 | if (!(s->block[i] = cram_read_block(fd))) |
4527 | 5 | goto err; |
4528 | | |
4529 | 610 | if (s->block[i]->content_type == EXTERNAL) { |
4530 | 241 | if (max_id < s->block[i]->content_id) |
4531 | 5 | max_id = s->block[i]->content_id; |
4532 | 241 | if (min_id > s->block[i]->content_id) |
4533 | 86 | min_id = s->block[i]->content_id; |
4534 | 241 | } |
4535 | 610 | } |
4536 | | |
4537 | 105 | if (!(s->block_by_id = calloc(512, sizeof(s->block[0])))) |
4538 | 0 | goto err; |
4539 | | |
4540 | 520 | for (i = 0; i < n; i++) { |
4541 | 415 | if (s->block[i]->content_type != EXTERNAL) |
4542 | 294 | continue; |
4543 | 121 | uint32_t v = s->block[i]->content_id; |
4544 | 121 | if (v >= 256) |
4545 | 7 | v = 256 + v % 251; |
4546 | 121 | s->block_by_id[v] = s->block[i]; |
4547 | 121 | } |
4548 | | |
4549 | | /* Initialise encoding/decoding tables */ |
4550 | 105 | s->cigar_alloc = 1024; |
4551 | 105 | if (!(s->cigar = malloc(s->cigar_alloc * sizeof(*s->cigar)))) goto err; |
4552 | 105 | s->ncigar = 0; |
4553 | | |
4554 | 105 | if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0))) goto err; |
4555 | 105 | if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS))) goto err; |
4556 | 105 | if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN))) goto err; |
4557 | 105 | if (!(s->aux_blk = cram_new_block(EXTERNAL, DS_aux))) goto err; |
4558 | 105 | if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN))) goto err; |
4559 | 105 | if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC))) goto err; |
4560 | | |
4561 | 105 | s->crecs = NULL; |
4562 | | |
4563 | 105 | s->last_apos = s->hdr->ref_seq_start; |
4564 | 105 | s->decode_md = fd->decode_md; |
4565 | | |
4566 | 105 | return s; |
4567 | | |
4568 | 13 | err: |
4569 | 13 | if (b) |
4570 | 9 | cram_free_block(b); |
4571 | 13 | if (s) { |
4572 | 13 | s->hdr_block = NULL; |
4573 | 13 | cram_free_slice(s); |
4574 | 13 | } |
4575 | 13 | return NULL; |
4576 | 105 | } |
4577 | | |
4578 | | |
4579 | | /* ---------------------------------------------------------------------- |
4580 | | * CRAM file definition (header) |
4581 | | */ |
4582 | | |
4583 | | /* |
4584 | | * Reads a CRAM file definition structure. |
4585 | | * Returns file_def ptr on success |
4586 | | * NULL on failure |
4587 | | */ |
4588 | 275 | cram_file_def *cram_read_file_def(cram_fd *fd) { |
4589 | 275 | cram_file_def *def = malloc(sizeof(*def)); |
4590 | 275 | if (!def) |
4591 | 0 | return NULL; |
4592 | | |
4593 | 275 | if (26 != hread(fd->fp, &def->magic[0], 26)) { |
4594 | 0 | free(def); |
4595 | 0 | return NULL; |
4596 | 0 | } |
4597 | | |
4598 | 275 | if (memcmp(def->magic, "CRAM", 4) != 0) { |
4599 | 0 | free(def); |
4600 | 0 | return NULL; |
4601 | 0 | } |
4602 | | |
4603 | 275 | if (def->major_version > 4) { |
4604 | 0 | hts_log_error("CRAM version number mismatch. Expected 1.x, 2.x, 3.x or 4.x, got %d.%d", |
4605 | 0 | def->major_version, def->minor_version); |
4606 | 0 | free(def); |
4607 | 0 | return NULL; |
4608 | 0 | } |
4609 | | |
4610 | 275 | fd->first_container += 26; |
4611 | 275 | fd->curr_position = fd->first_container; |
4612 | 275 | fd->last_slice = 0; |
4613 | | |
4614 | 275 | return def; |
4615 | 275 | } |
4616 | | |
4617 | | /* |
4618 | | * Writes a cram_file_def structure to cram_fd. |
4619 | | * Returns 0 on success |
4620 | | * -1 on failure |
4621 | | */ |
4622 | 0 | int cram_write_file_def(cram_fd *fd, cram_file_def *def) { |
4623 | 0 | return (hwrite(fd->fp, &def->magic[0], 26) == 26) ? 0 : -1; |
4624 | 0 | } |
4625 | | |
4626 | 275 | void cram_free_file_def(cram_file_def *def) { |
4627 | 275 | if (def) free(def); |
4628 | 275 | } |
4629 | | |
4630 | | /* ---------------------------------------------------------------------- |
4631 | | * SAM header I/O |
4632 | | */ |
4633 | | |
4634 | | |
4635 | | /* |
4636 | | * Reads the SAM header from the first CRAM data block. |
4637 | | * Also performs minimal parsing to extract read-group |
4638 | | * and sample information. |
4639 | | |
4640 | | * Returns SAM hdr ptr on success |
4641 | | * NULL on failure |
4642 | | */ |
4643 | 275 | sam_hdr_t *cram_read_SAM_hdr(cram_fd *fd) { |
4644 | 275 | int32_t header_len; |
4645 | 275 | char *header; |
4646 | 275 | sam_hdr_t *hdr; |
4647 | | |
4648 | | /* 1.1 onwards stores the header in the first block of a container */ |
4649 | 275 | if (CRAM_MAJOR_VERS(fd->version) == 1) { |
4650 | | /* Length */ |
4651 | 241 | if (-1 == int32_decode(fd, &header_len)) |
4652 | 0 | return NULL; |
4653 | | |
4654 | | /* Alloc and read */ |
4655 | 241 | if (header_len < 0 || NULL == (header = malloc((size_t) header_len+1))) |
4656 | 1 | return NULL; |
4657 | | |
4658 | 240 | if (header_len != hread(fd->fp, header, header_len)) { |
4659 | 0 | free(header); |
4660 | 0 | return NULL; |
4661 | 0 | } |
4662 | 240 | header[header_len] = '\0'; |
4663 | | |
4664 | 240 | fd->first_container += 4 + header_len; |
4665 | 240 | } else { |
4666 | 34 | cram_container *c = cram_read_container(fd); |
4667 | 34 | cram_block *b; |
4668 | 34 | int i; |
4669 | 34 | int64_t len; |
4670 | | |
4671 | 34 | if (!c) |
4672 | 7 | return NULL; |
4673 | | |
4674 | 27 | fd->first_container += c->length + c->offset; |
4675 | 27 | fd->curr_position = fd->first_container; |
4676 | | |
4677 | 27 | if (c->num_blocks < 1) { |
4678 | 0 | cram_free_container(c); |
4679 | 0 | return NULL; |
4680 | 0 | } |
4681 | | |
4682 | 27 | if (!(b = cram_read_block(fd))) { |
4683 | 2 | cram_free_container(c); |
4684 | 2 | return NULL; |
4685 | 2 | } |
4686 | 25 | if (cram_uncompress_block(b) != 0) { |
4687 | 10 | cram_free_container(c); |
4688 | 10 | cram_free_block(b); |
4689 | 10 | return NULL; |
4690 | 10 | } |
4691 | | |
4692 | 15 | len = b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) + |
4693 | 15 | fd->vv.varint_size(b->content_id) + |
4694 | 15 | fd->vv.varint_size(b->uncomp_size) + |
4695 | 15 | fd->vv.varint_size(b->comp_size); |
4696 | | |
4697 | | /* Extract header from 1st block */ |
4698 | 15 | if (-1 == int32_get_blk(b, &header_len) || |
4699 | 15 | header_len < 0 || /* Spec. says signed... why? */ |
4700 | 15 | b->uncomp_size - 4 < header_len) { |
4701 | 2 | cram_free_container(c); |
4702 | 2 | cram_free_block(b); |
4703 | 2 | return NULL; |
4704 | 2 | } |
4705 | 13 | if (NULL == (header = malloc((size_t) header_len+1))) { |
4706 | 0 | cram_free_container(c); |
4707 | 0 | cram_free_block(b); |
4708 | 0 | return NULL; |
4709 | 0 | } |
4710 | 13 | memcpy(header, BLOCK_END(b), header_len); |
4711 | 13 | header[header_len] = '\0'; |
4712 | 13 | cram_free_block(b); |
4713 | | |
4714 | | /* Consume any remaining blocks */ |
4715 | 1.93k | for (i = 1; i < c->num_blocks; i++) { |
4716 | 1.92k | if (!(b = cram_read_block(fd))) { |
4717 | 6 | cram_free_container(c); |
4718 | 6 | free(header); |
4719 | 6 | return NULL; |
4720 | 6 | } |
4721 | 1.92k | len += b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) + |
4722 | 1.92k | fd->vv.varint_size(b->content_id) + |
4723 | 1.92k | fd->vv.varint_size(b->uncomp_size) + |
4724 | 1.92k | fd->vv.varint_size(b->comp_size); |
4725 | 1.92k | cram_free_block(b); |
4726 | 1.92k | } |
4727 | | |
4728 | 7 | if (c->length > 0 && len > 0 && c->length > len) { |
4729 | | // Consume padding |
4730 | 1 | char *pads = malloc(c->length - len); |
4731 | 1 | if (!pads) { |
4732 | 0 | cram_free_container(c); |
4733 | 0 | free(header); |
4734 | 0 | return NULL; |
4735 | 0 | } |
4736 | | |
4737 | 1 | if (c->length - len != hread(fd->fp, pads, c->length - len)) { |
4738 | 1 | cram_free_container(c); |
4739 | 1 | free(header); |
4740 | 1 | free(pads); |
4741 | 1 | return NULL; |
4742 | 1 | } |
4743 | 0 | free(pads); |
4744 | 0 | } |
4745 | | |
4746 | 6 | cram_free_container(c); |
4747 | 6 | } |
4748 | | |
4749 | | /* Parse */ |
4750 | 246 | hdr = sam_hdr_init(); |
4751 | 246 | if (!hdr) { |
4752 | 0 | free(header); |
4753 | 0 | return NULL; |
4754 | 0 | } |
4755 | | |
4756 | 246 | if (-1 == sam_hdr_add_lines(hdr, header, header_len)) { |
4757 | 1 | free(header); |
4758 | 1 | sam_hdr_destroy(hdr); |
4759 | 1 | return NULL; |
4760 | 1 | } |
4761 | | |
4762 | 245 | hdr->l_text = header_len; |
4763 | 245 | hdr->text = header; |
4764 | | |
4765 | 245 | return hdr; |
4766 | | |
4767 | 246 | } |
4768 | | |
4769 | | /* |
4770 | | * Converts 'in' to a full pathname to store in out. |
4771 | | * Out must be at least PATH_MAX bytes long. |
4772 | | */ |
4773 | 0 | static void full_path(char *out, char *in) { |
4774 | 0 | size_t in_l = strlen(in); |
4775 | 0 | if (hisremote(in)) { |
4776 | 0 | if (in_l > PATH_MAX) { |
4777 | 0 | hts_log_error("Reference path is longer than %d", PATH_MAX); |
4778 | 0 | return; |
4779 | 0 | } |
4780 | 0 | strncpy(out, in, PATH_MAX-1); |
4781 | 0 | out[PATH_MAX-1] = 0; |
4782 | 0 | return; |
4783 | 0 | } |
4784 | 0 | if (*in == '/' || |
4785 | | // Windows paths |
4786 | 0 | (in_l > 3 && toupper_c(*in) >= 'A' && toupper_c(*in) <= 'Z' && |
4787 | 0 | in[1] == ':' && (in[2] == '/' || in[2] == '\\'))) { |
4788 | 0 | strncpy(out, in, PATH_MAX-1); |
4789 | 0 | out[PATH_MAX-1] = 0; |
4790 | 0 | } else { |
4791 | 0 | int len; |
4792 | | |
4793 | | // unable to get dir or out+in is too long |
4794 | 0 | if (!getcwd(out, PATH_MAX) || |
4795 | 0 | (len = strlen(out))+1+strlen(in) >= PATH_MAX) { |
4796 | 0 | strncpy(out, in, PATH_MAX-1); |
4797 | 0 | out[PATH_MAX-1] = 0; |
4798 | 0 | return; |
4799 | 0 | } |
4800 | | |
4801 | 0 | sprintf(out+len, "/%.*s", PATH_MAX - 2 - len, in); |
4802 | | |
4803 | | // FIXME: cope with `pwd`/../../../foo.fa ? |
4804 | 0 | } |
4805 | 0 | } |
4806 | | |
4807 | | /* |
4808 | | * Writes a CRAM SAM header. |
4809 | | * Returns 0 on success |
4810 | | * -1 on failure |
4811 | | */ |
4812 | 0 | int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) { |
4813 | 0 | size_t header_len; |
4814 | 0 | int blank_block = (CRAM_MAJOR_VERS(fd->version) >= 3); |
4815 | | |
4816 | | /* Write CRAM MAGIC if not yet written. */ |
4817 | 0 | if (fd->file_def->major_version == 0) { |
4818 | 0 | fd->file_def->major_version = CRAM_MAJOR_VERS(fd->version); |
4819 | 0 | fd->file_def->minor_version = CRAM_MINOR_VERS(fd->version); |
4820 | 0 | if (0 != cram_write_file_def(fd, fd->file_def)) |
4821 | 0 | return -1; |
4822 | 0 | } |
4823 | | |
4824 | | /* 1.0 requires an UNKNOWN read-group */ |
4825 | 0 | if (CRAM_MAJOR_VERS(fd->version) == 1) { |
4826 | 0 | if (!sam_hrecs_find_rg(hdr->hrecs, "UNKNOWN")) |
4827 | 0 | if (sam_hdr_add_line(hdr, "RG", |
4828 | 0 | "ID", "UNKNOWN", "SM", "UNKNOWN", NULL)) |
4829 | 0 | return -1; |
4830 | 0 | } |
4831 | | |
4832 | 0 | if (-1 == refs_from_header(fd)) |
4833 | 0 | return -1; |
4834 | 0 | if (-1 == refs2id(fd->refs, fd->header)) |
4835 | 0 | return -1; |
4836 | | |
4837 | | /* Fix M5 strings */ |
4838 | 0 | if (fd->refs && !fd->no_ref && fd->embed_ref <= 1) { |
4839 | 0 | int i; |
4840 | 0 | for (i = 0; i < hdr->hrecs->nref; i++) { |
4841 | 0 | sam_hrec_type_t *ty; |
4842 | 0 | char *ref; |
4843 | |
|
4844 | 0 | if (!(ty = sam_hrecs_find_type_id(hdr->hrecs, "SQ", "SN", hdr->hrecs->ref[i].name))) |
4845 | 0 | return -1; |
4846 | | |
4847 | 0 | if (!sam_hrecs_find_key(ty, "M5", NULL)) { |
4848 | 0 | char unsigned buf[16]; |
4849 | 0 | char buf2[33]; |
4850 | 0 | int rlen; |
4851 | 0 | hts_md5_context *md5; |
4852 | |
|
4853 | 0 | if (!fd->refs || |
4854 | 0 | !fd->refs->ref_id || |
4855 | 0 | !fd->refs->ref_id[i]) { |
4856 | 0 | return -1; |
4857 | 0 | } |
4858 | 0 | rlen = fd->refs->ref_id[i]->length; |
4859 | 0 | ref = cram_get_ref(fd, i, 1, rlen); |
4860 | 0 | if (NULL == ref) { |
4861 | 0 | if (fd->embed_ref == -1) { |
4862 | | // auto embed-ref |
4863 | 0 | hts_log_warning("No M5 tags present and could not " |
4864 | 0 | "find reference"); |
4865 | 0 | hts_log_warning("Enabling embed_ref=2 option"); |
4866 | 0 | hts_log_warning("NOTE: the CRAM file will be bigger " |
4867 | 0 | "than using an external reference"); |
4868 | 0 | pthread_mutex_lock(&fd->ref_lock); |
4869 | 0 | fd->embed_ref = 2; |
4870 | 0 | pthread_mutex_unlock(&fd->ref_lock); |
4871 | 0 | break; |
4872 | 0 | } |
4873 | 0 | return -1; |
4874 | 0 | } |
4875 | 0 | rlen = fd->refs->ref_id[i]->length; /* In case it just loaded */ |
4876 | 0 | if (!(md5 = hts_md5_init())) |
4877 | 0 | return -1; |
4878 | 0 | hts_md5_update(md5, ref, rlen); |
4879 | 0 | hts_md5_final(buf, md5); |
4880 | 0 | hts_md5_destroy(md5); |
4881 | 0 | cram_ref_decr(fd->refs, i); |
4882 | |
|
4883 | 0 | hts_md5_hex(buf2, buf); |
4884 | 0 | fd->refs->ref_id[i]->validated_md5 = 1; |
4885 | 0 | if (sam_hdr_update_line(hdr, "SQ", "SN", hdr->hrecs->ref[i].name, "M5", buf2, NULL)) |
4886 | 0 | return -1; |
4887 | 0 | } |
4888 | | |
4889 | 0 | if (fd->ref_fn) { |
4890 | 0 | char ref_fn[PATH_MAX]; |
4891 | 0 | full_path(ref_fn, fd->ref_fn); |
4892 | 0 | if (sam_hdr_update_line(hdr, "SQ", "SN", hdr->hrecs->ref[i].name, "UR", ref_fn, NULL)) |
4893 | 0 | return -1; |
4894 | 0 | } |
4895 | 0 | } |
4896 | 0 | } |
4897 | | |
4898 | | /* Length */ |
4899 | 0 | header_len = sam_hdr_length(hdr); |
4900 | 0 | if (header_len > INT32_MAX) { |
4901 | 0 | hts_log_error("Header is too long for CRAM format"); |
4902 | 0 | return -1; |
4903 | 0 | } |
4904 | 0 | if (CRAM_MAJOR_VERS(fd->version) == 1) { |
4905 | 0 | if (-1 == int32_encode(fd, header_len)) |
4906 | 0 | return -1; |
4907 | | |
4908 | | /* Text data */ |
4909 | 0 | if (header_len != hwrite(fd->fp, sam_hdr_str(hdr), header_len)) |
4910 | 0 | return -1; |
4911 | 0 | } else { |
4912 | | /* Create block(s) inside a container */ |
4913 | 0 | cram_block *b = cram_new_block(FILE_HEADER, 0); |
4914 | 0 | cram_container *c = cram_new_container(0, 0); |
4915 | 0 | int padded_length; |
4916 | 0 | char *pads; |
4917 | 0 | int is_cram_3 = (CRAM_MAJOR_VERS(fd->version) >= 3); |
4918 | |
|
4919 | 0 | if (!b || !c) { |
4920 | 0 | if (b) cram_free_block(b); |
4921 | 0 | if (c) cram_free_container(c); |
4922 | 0 | return -1; |
4923 | 0 | } |
4924 | | |
4925 | 0 | if (int32_put_blk(b, header_len) < 0) |
4926 | 0 | return -1; |
4927 | 0 | if (header_len) |
4928 | 0 | BLOCK_APPEND(b, sam_hdr_str(hdr), header_len); |
4929 | 0 | BLOCK_UPLEN(b); |
4930 | | |
4931 | | // Compress header block if V3.0 and above |
4932 | 0 | if (CRAM_MAJOR_VERS(fd->version) >= 3) |
4933 | 0 | if (cram_compress_block(fd, b, NULL, -1, -1) < 0) |
4934 | 0 | return -1; |
4935 | | |
4936 | 0 | if (blank_block) { |
4937 | 0 | c->length = b->comp_size + 2 + 4*is_cram_3 + |
4938 | 0 | fd->vv.varint_size(b->content_id) + |
4939 | 0 | fd->vv.varint_size(b->uncomp_size) + |
4940 | 0 | fd->vv.varint_size(b->comp_size); |
4941 | |
|
4942 | 0 | c->num_blocks = 2; |
4943 | 0 | c->num_landmarks = 2; |
4944 | 0 | if (!(c->landmark = malloc(2*sizeof(*c->landmark)))) { |
4945 | 0 | cram_free_block(b); |
4946 | 0 | cram_free_container(c); |
4947 | 0 | return -1; |
4948 | 0 | } |
4949 | 0 | c->landmark[0] = 0; |
4950 | 0 | c->landmark[1] = c->length; |
4951 | | |
4952 | | // Plus extra storage for uncompressed secondary blank block |
4953 | 0 | padded_length = MIN(c->length*.5, 10000); |
4954 | 0 | c->length += padded_length + 2 + 4*is_cram_3 + |
4955 | 0 | fd->vv.varint_size(b->content_id) + |
4956 | 0 | fd->vv.varint_size(padded_length)*2; |
4957 | 0 | } else { |
4958 | | // Pad the block instead. |
4959 | 0 | c->num_blocks = 1; |
4960 | 0 | c->num_landmarks = 1; |
4961 | 0 | if (!(c->landmark = malloc(sizeof(*c->landmark)))) |
4962 | 0 | return -1; |
4963 | 0 | c->landmark[0] = 0; |
4964 | |
|
4965 | 0 | padded_length = MAX(c->length*1.5, 10000) - c->length; |
4966 | |
|
4967 | 0 | c->length = b->comp_size + padded_length + |
4968 | 0 | 2 + 4*is_cram_3 + |
4969 | 0 | fd->vv.varint_size(b->content_id) + |
4970 | 0 | fd->vv.varint_size(b->uncomp_size) + |
4971 | 0 | fd->vv.varint_size(b->comp_size); |
4972 | |
|
4973 | 0 | if (NULL == (pads = calloc(1, padded_length))) { |
4974 | 0 | cram_free_block(b); |
4975 | 0 | cram_free_container(c); |
4976 | 0 | return -1; |
4977 | 0 | } |
4978 | 0 | BLOCK_APPEND(b, pads, padded_length); |
4979 | 0 | BLOCK_UPLEN(b); |
4980 | 0 | free(pads); |
4981 | 0 | } |
4982 | | |
4983 | 0 | if (-1 == cram_write_container(fd, c)) { |
4984 | 0 | cram_free_block(b); |
4985 | 0 | cram_free_container(c); |
4986 | 0 | return -1; |
4987 | 0 | } |
4988 | | |
4989 | 0 | if (-1 == cram_write_block(fd, b)) { |
4990 | 0 | cram_free_block(b); |
4991 | 0 | cram_free_container(c); |
4992 | 0 | return -1; |
4993 | 0 | } |
4994 | | |
4995 | 0 | if (blank_block) { |
4996 | 0 | BLOCK_RESIZE(b, padded_length); |
4997 | 0 | memset(BLOCK_DATA(b), 0, padded_length); |
4998 | 0 | BLOCK_SIZE(b) = padded_length; |
4999 | 0 | BLOCK_UPLEN(b); |
5000 | 0 | b->method = RAW; |
5001 | 0 | if (-1 == cram_write_block(fd, b)) { |
5002 | 0 | cram_free_block(b); |
5003 | 0 | cram_free_container(c); |
5004 | 0 | return -1; |
5005 | 0 | } |
5006 | 0 | } |
5007 | | |
5008 | 0 | cram_free_block(b); |
5009 | 0 | cram_free_container(c); |
5010 | 0 | } |
5011 | | |
5012 | 0 | if (0 != hflush(fd->fp)) |
5013 | 0 | return -1; |
5014 | | |
5015 | 0 | RP("=== Finishing saving header ===\n"); |
5016 | |
|
5017 | 0 | return 0; |
5018 | | |
5019 | 0 | block_err: |
5020 | 0 | return -1; |
5021 | 0 | } |
5022 | | |
5023 | | /* ---------------------------------------------------------------------- |
5024 | | * The top-level cram opening, closing and option handling |
5025 | | */ |
5026 | | |
5027 | | /* |
5028 | | * Sets CRAM variable sized integer decode function tables. |
5029 | | * CRAM 1, 2, and 3.x all used ITF8 for uint32 and UTF8 for uint64. |
5030 | | * CRAM 4.x uses the same encoding mechanism for 32-bit and 64-bit |
5031 | | * (or anything inbetween), but also now supports signed values. |
5032 | | * |
5033 | | * Version is the CRAM major version number. |
5034 | | * vv is the vector table (probably &cram_fd->vv) |
5035 | | */ |
5036 | 275 | static void cram_init_varint(varint_vec *vv, int version) { |
5037 | 275 | if (version >= 4) { |
5038 | 3 | vv->varint_get32 = uint7_get_32; // FIXME: varint.h API should be size agnostic |
5039 | 3 | vv->varint_get32s = sint7_get_32; |
5040 | 3 | vv->varint_get64 = uint7_get_64; |
5041 | 3 | vv->varint_get64s = sint7_get_64; |
5042 | 3 | vv->varint_put32 = uint7_put_32; |
5043 | 3 | vv->varint_put32s = sint7_put_32; |
5044 | 3 | vv->varint_put64 = uint7_put_64; |
5045 | 3 | vv->varint_put64s = sint7_put_64; |
5046 | 3 | vv->varint_put32_blk = uint7_put_blk_32; |
5047 | 3 | vv->varint_put32s_blk = sint7_put_blk_32; |
5048 | 3 | vv->varint_put64_blk = uint7_put_blk_64; |
5049 | 3 | vv->varint_put64s_blk = sint7_put_blk_64; |
5050 | 3 | vv->varint_size = uint7_size; |
5051 | 3 | vv->varint_decode32_crc = uint7_decode_crc32; |
5052 | 3 | vv->varint_decode32s_crc = sint7_decode_crc32; |
5053 | 3 | vv->varint_decode64_crc = uint7_decode_crc64; |
5054 | 272 | } else { |
5055 | 272 | vv->varint_get32 = safe_itf8_get; |
5056 | 272 | vv->varint_get32s = safe_itf8_get; |
5057 | 272 | vv->varint_get64 = safe_ltf8_get; |
5058 | 272 | vv->varint_get64s = safe_ltf8_get; |
5059 | 272 | vv->varint_put32 = safe_itf8_put; |
5060 | 272 | vv->varint_put32s = safe_itf8_put; |
5061 | 272 | vv->varint_put64 = safe_ltf8_put; |
5062 | 272 | vv->varint_put64s = safe_ltf8_put; |
5063 | 272 | vv->varint_put32_blk = itf8_put_blk; |
5064 | 272 | vv->varint_put32s_blk = itf8_put_blk; |
5065 | 272 | vv->varint_put64_blk = ltf8_put_blk; |
5066 | 272 | vv->varint_put64s_blk = ltf8_put_blk; |
5067 | 272 | vv->varint_size = itf8_size; |
5068 | 272 | vv->varint_decode32_crc = itf8_decode_crc; |
5069 | 272 | vv->varint_decode32s_crc = itf8_decode_crc; |
5070 | 272 | vv->varint_decode64_crc = ltf8_decode_crc; |
5071 | 272 | } |
5072 | 275 | } |
5073 | | |
5074 | | /* |
5075 | | * Initialises the lookup tables. These could be global statics, but they're |
5076 | | * clumsy to setup in a multi-threaded environment unless we generate |
5077 | | * verbatim code and include that. |
5078 | | */ |
5079 | 275 | static void cram_init_tables(cram_fd *fd) { |
5080 | 275 | int i; |
5081 | | |
5082 | 275 | memset(fd->L1, 4, 256); |
5083 | 275 | fd->L1['A'] = 0; fd->L1['a'] = 0; |
5084 | 275 | fd->L1['C'] = 1; fd->L1['c'] = 1; |
5085 | 275 | fd->L1['G'] = 2; fd->L1['g'] = 2; |
5086 | 275 | fd->L1['T'] = 3; fd->L1['t'] = 3; |
5087 | | |
5088 | 275 | memset(fd->L2, 5, 256); |
5089 | 275 | fd->L2['A'] = 0; fd->L2['a'] = 0; |
5090 | 275 | fd->L2['C'] = 1; fd->L2['c'] = 1; |
5091 | 275 | fd->L2['G'] = 2; fd->L2['g'] = 2; |
5092 | 275 | fd->L2['T'] = 3; fd->L2['t'] = 3; |
5093 | 275 | fd->L2['N'] = 4; fd->L2['n'] = 4; |
5094 | | |
5095 | 275 | if (CRAM_MAJOR_VERS(fd->version) == 1) { |
5096 | 123k | for (i = 0; i < 0x200; i++) { |
5097 | 123k | int f = 0; |
5098 | | |
5099 | 123k | if (i & CRAM_FPAIRED) f |= BAM_FPAIRED; |
5100 | 123k | if (i & CRAM_FPROPER_PAIR) f |= BAM_FPROPER_PAIR; |
5101 | 123k | if (i & CRAM_FUNMAP) f |= BAM_FUNMAP; |
5102 | 123k | if (i & CRAM_FREVERSE) f |= BAM_FREVERSE; |
5103 | 123k | if (i & CRAM_FREAD1) f |= BAM_FREAD1; |
5104 | 123k | if (i & CRAM_FREAD2) f |= BAM_FREAD2; |
5105 | 123k | if (i & CRAM_FSECONDARY) f |= BAM_FSECONDARY; |
5106 | 123k | if (i & CRAM_FQCFAIL) f |= BAM_FQCFAIL; |
5107 | 123k | if (i & CRAM_FDUP) f |= BAM_FDUP; |
5108 | | |
5109 | 123k | fd->bam_flag_swap[i] = f; |
5110 | 123k | } |
5111 | | |
5112 | 987k | for (i = 0; i < 0x1000; i++) { |
5113 | 987k | int g = 0; |
5114 | | |
5115 | 987k | if (i & BAM_FPAIRED) g |= CRAM_FPAIRED; |
5116 | 987k | if (i & BAM_FPROPER_PAIR) g |= CRAM_FPROPER_PAIR; |
5117 | 987k | if (i & BAM_FUNMAP) g |= CRAM_FUNMAP; |
5118 | 987k | if (i & BAM_FREVERSE) g |= CRAM_FREVERSE; |
5119 | 987k | if (i & BAM_FREAD1) g |= CRAM_FREAD1; |
5120 | 987k | if (i & BAM_FREAD2) g |= CRAM_FREAD2; |
5121 | 987k | if (i & BAM_FSECONDARY) g |= CRAM_FSECONDARY; |
5122 | 987k | if (i & BAM_FQCFAIL) g |= CRAM_FQCFAIL; |
5123 | 987k | if (i & BAM_FDUP) g |= CRAM_FDUP; |
5124 | | |
5125 | 987k | fd->cram_flag_swap[i] = g; |
5126 | 987k | } |
5127 | 241 | } else { |
5128 | | /* NOP */ |
5129 | 139k | for (i = 0; i < 0x1000; i++) |
5130 | 139k | fd->bam_flag_swap[i] = i; |
5131 | 139k | for (i = 0; i < 0x1000; i++) |
5132 | 139k | fd->cram_flag_swap[i] = i; |
5133 | 34 | } |
5134 | | |
5135 | 275 | memset(fd->cram_sub_matrix, 4, 32*32); |
5136 | 9.07k | for (i = 0; i < 32; i++) { |
5137 | 8.80k | fd->cram_sub_matrix[i]['A'&0x1f]=0; |
5138 | 8.80k | fd->cram_sub_matrix[i]['C'&0x1f]=1; |
5139 | 8.80k | fd->cram_sub_matrix[i]['G'&0x1f]=2; |
5140 | 8.80k | fd->cram_sub_matrix[i]['T'&0x1f]=3; |
5141 | 8.80k | fd->cram_sub_matrix[i]['N'&0x1f]=4; |
5142 | 8.80k | } |
5143 | 1.65k | for (i = 0; i < 20; i+=4) { |
5144 | 1.37k | int j; |
5145 | 28.8k | for (j = 0; j < 20; j++) { |
5146 | 27.5k | fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3; |
5147 | 27.5k | fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3; |
5148 | 27.5k | fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3; |
5149 | 27.5k | fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3; |
5150 | 27.5k | } |
5151 | 1.37k | fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+0]&0x1f]=0; |
5152 | 1.37k | fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+1]&0x1f]=1; |
5153 | 1.37k | fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+2]&0x1f]=2; |
5154 | 1.37k | fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+3]&0x1f]=3; |
5155 | 1.37k | } |
5156 | | |
5157 | 275 | cram_init_varint(&fd->vv, CRAM_MAJOR_VERS(fd->version)); |
5158 | 275 | } |
5159 | | |
5160 | | // Default version numbers for CRAM |
5161 | | static int major_version = 3; |
5162 | | static int minor_version = 0; |
5163 | | |
5164 | | /* |
5165 | | * Opens a CRAM file for read (mode "rb") or write ("wb"). |
5166 | | * The filename may be "-" to indicate stdin or stdout. |
5167 | | * |
5168 | | * Returns file handle on success |
5169 | | * NULL on failure. |
5170 | | */ |
5171 | 0 | cram_fd *cram_open(const char *filename, const char *mode) { |
5172 | 0 | hFILE *fp; |
5173 | 0 | cram_fd *fd; |
5174 | 0 | char fmode[3]= { mode[0], '\0', '\0' }; |
5175 | |
|
5176 | 0 | if (strlen(mode) > 1 && (mode[1] == 'b' || mode[1] == 'c')) { |
5177 | 0 | fmode[1] = 'b'; |
5178 | 0 | } |
5179 | |
|
5180 | 0 | fp = hopen(filename, fmode); |
5181 | 0 | if (!fp) |
5182 | 0 | return NULL; |
5183 | | |
5184 | 0 | fd = cram_dopen(fp, filename, mode); |
5185 | 0 | if (!fd) |
5186 | 0 | hclose_abruptly(fp); |
5187 | |
|
5188 | 0 | return fd; |
5189 | 0 | } |
5190 | | |
5191 | | /* Opens an existing stream for reading or writing. |
5192 | | * |
5193 | | * Returns file handle on success; |
5194 | | * NULL on failure. |
5195 | | */ |
5196 | 275 | cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) { |
5197 | 275 | int i; |
5198 | 275 | char *cp; |
5199 | 275 | cram_fd *fd = calloc(1, sizeof(*fd)); |
5200 | 275 | if (!fd) |
5201 | 0 | return NULL; |
5202 | | |
5203 | 275 | fd->level = CRAM_DEFAULT_LEVEL; |
5204 | 825 | for (i = 0; mode[i]; i++) { |
5205 | 550 | if (mode[i] >= '0' && mode[i] <= '9') { |
5206 | 0 | fd->level = mode[i] - '0'; |
5207 | 0 | break; |
5208 | 0 | } |
5209 | 550 | } |
5210 | | |
5211 | 275 | fd->fp = fp; |
5212 | 275 | fd->mode = *mode; |
5213 | 275 | fd->first_container = 0; |
5214 | 275 | fd->curr_position = 0; |
5215 | | |
5216 | 275 | if (fd->mode == 'r') { |
5217 | | /* Reader */ |
5218 | | |
5219 | 275 | if (!(fd->file_def = cram_read_file_def(fd))) |
5220 | 0 | goto err; |
5221 | | |
5222 | 275 | fd->version = fd->file_def->major_version * 256 + |
5223 | 275 | fd->file_def->minor_version; |
5224 | | |
5225 | 275 | cram_init_tables(fd); |
5226 | | |
5227 | 275 | if (!(fd->header = cram_read_SAM_hdr(fd))) { |
5228 | 30 | cram_free_file_def(fd->file_def); |
5229 | 30 | goto err; |
5230 | 30 | } |
5231 | | |
5232 | 275 | } else { |
5233 | | /* Writer */ |
5234 | 0 | cram_file_def *def = calloc(1, sizeof(*def)); |
5235 | 0 | if (!def) |
5236 | 0 | return NULL; |
5237 | | |
5238 | 0 | fd->file_def = def; |
5239 | |
|
5240 | 0 | def->magic[0] = 'C'; |
5241 | 0 | def->magic[1] = 'R'; |
5242 | 0 | def->magic[2] = 'A'; |
5243 | 0 | def->magic[3] = 'M'; |
5244 | 0 | def->major_version = 0; // Indicator to write file def later. |
5245 | 0 | def->minor_version = 0; |
5246 | 0 | memset(def->file_id, 0, 20); |
5247 | 0 | strncpy(def->file_id, filename, 20); |
5248 | |
|
5249 | 0 | fd->version = major_version * 256 + minor_version; |
5250 | 0 | cram_init_tables(fd); |
5251 | | |
5252 | | /* SAM header written later along with this file_def */ |
5253 | 0 | } |
5254 | | |
5255 | 245 | fd->prefix = strdup((cp = strrchr(filename, '/')) ? cp+1 : filename); |
5256 | 245 | if (!fd->prefix) |
5257 | 0 | goto err; |
5258 | 245 | fd->first_base = fd->last_base = -1; |
5259 | 245 | fd->record_counter = 0; |
5260 | | |
5261 | 245 | fd->ctr = NULL; |
5262 | 245 | fd->ctr_mt = NULL; |
5263 | 245 | fd->refs = refs_create(); |
5264 | 245 | if (!fd->refs) |
5265 | 0 | goto err; |
5266 | 245 | fd->ref_id = -2; |
5267 | 245 | fd->ref = NULL; |
5268 | | |
5269 | 245 | fd->decode_md = 0; |
5270 | 245 | fd->seqs_per_slice = SEQS_PER_SLICE; |
5271 | 245 | fd->bases_per_slice = BASES_PER_SLICE; |
5272 | 245 | fd->slices_per_container = SLICE_PER_CNT; |
5273 | 245 | fd->embed_ref = -1; // automatic selection |
5274 | 245 | fd->no_ref = 0; |
5275 | 245 | fd->ap_delta = 0; |
5276 | 245 | fd->ignore_md5 = 0; |
5277 | 245 | fd->lossy_read_names = 0; |
5278 | 245 | fd->use_bz2 = 0; |
5279 | 245 | fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3); |
5280 | 245 | fd->use_tok = (CRAM_MAJOR_VERS(fd->version) >= 3) && (CRAM_MINOR_VERS(fd->version) >= 1); |
5281 | 245 | fd->use_lzma = 0; |
5282 | 245 | fd->multi_seq = -1; |
5283 | 245 | fd->multi_seq_user = -1; |
5284 | 245 | fd->unsorted = 0; |
5285 | 245 | fd->shared_ref = 0; |
5286 | 245 | fd->store_md = 0; |
5287 | 245 | fd->store_nm = 0; |
5288 | 245 | fd->last_RI_count = 0; |
5289 | | |
5290 | 245 | fd->index = NULL; |
5291 | 245 | fd->own_pool = 0; |
5292 | 245 | fd->pool = NULL; |
5293 | 245 | fd->rqueue = NULL; |
5294 | 245 | fd->job_pending = NULL; |
5295 | 245 | fd->ooc = 0; |
5296 | 245 | fd->required_fields = INT_MAX; |
5297 | | |
5298 | 11.7k | for (i = 0; i < DS_END; i++) { |
5299 | 11.5k | fd->m[i] = cram_new_metrics(); |
5300 | 11.5k | if (!fd->m[i]) |
5301 | 0 | goto err; |
5302 | 11.5k | } |
5303 | | |
5304 | 245 | if (!(fd->tags_used = kh_init(m_metrics))) |
5305 | 0 | goto err; |
5306 | | |
5307 | 245 | fd->range.refid = -2; // no ref. |
5308 | 245 | fd->eof = 1; // See samtools issue #150 |
5309 | 245 | fd->ref_fn = NULL; |
5310 | | |
5311 | 245 | fd->bl = NULL; |
5312 | | |
5313 | | /* Initialise dummy refs from the @SQ headers */ |
5314 | 245 | if (-1 == refs_from_header(fd)) |
5315 | 0 | goto err; |
5316 | | |
5317 | 245 | return fd; |
5318 | | |
5319 | 30 | err: |
5320 | 30 | if (fd) |
5321 | 30 | free(fd); |
5322 | | |
5323 | 30 | return NULL; |
5324 | 245 | } |
5325 | | |
5326 | | /* |
5327 | | * Seek within a CRAM file. |
5328 | | * |
5329 | | * Returns 0 on success |
5330 | | * -1 on failure |
5331 | | */ |
5332 | 0 | int cram_seek(cram_fd *fd, off_t offset, int whence) { |
5333 | 0 | char buf[65536]; |
5334 | |
|
5335 | 0 | fd->ooc = 0; |
5336 | |
|
5337 | 0 | cram_drain_rqueue(fd); |
5338 | |
|
5339 | 0 | if (hseek(fd->fp, offset, whence) >= 0) { |
5340 | 0 | return 0; |
5341 | 0 | } |
5342 | | |
5343 | 0 | if (!(whence == SEEK_CUR && offset >= 0)) |
5344 | 0 | return -1; |
5345 | | |
5346 | | /* Couldn't fseek, but we're in SEEK_CUR mode so read instead */ |
5347 | 0 | while (offset > 0) { |
5348 | 0 | int len = MIN(65536, offset); |
5349 | 0 | if (len != hread(fd->fp, buf, len)) |
5350 | 0 | return -1; |
5351 | 0 | offset -= len; |
5352 | 0 | } |
5353 | | |
5354 | 0 | return 0; |
5355 | 0 | } |
5356 | | |
5357 | | /* |
5358 | | * Flushes a CRAM file. |
5359 | | * Useful for when writing to stdout without wishing to close the stream. |
5360 | | * |
5361 | | * Returns 0 on success |
5362 | | * -1 on failure |
5363 | | */ |
5364 | 0 | int cram_flush(cram_fd *fd) { |
5365 | 0 | if (!fd) |
5366 | 0 | return -1; |
5367 | | |
5368 | 0 | if (fd->mode == 'w' && fd->ctr) { |
5369 | 0 | if(fd->ctr->slice) |
5370 | 0 | cram_update_curr_slice(fd->ctr, fd->version); |
5371 | |
|
5372 | 0 | if (-1 == cram_flush_container_mt(fd, fd->ctr)) |
5373 | 0 | return -1; |
5374 | 0 | } |
5375 | | |
5376 | 0 | return 0; |
5377 | 0 | } |
5378 | | |
5379 | | /* |
5380 | | * Writes an EOF block to a CRAM file. |
5381 | | * |
5382 | | * Returns 0 on success |
5383 | | * -1 on failure |
5384 | | */ |
5385 | 0 | int cram_write_eof_block(cram_fd *fd) { |
5386 | | // EOF block is a container with special values to aid detection |
5387 | 0 | if (CRAM_MAJOR_VERS(fd->version) >= 2) { |
5388 | | // Empty container with |
5389 | | // ref_seq_id -1 |
5390 | | // start pos 0x454f46 ("EOF") |
5391 | | // span 0 |
5392 | | // nrec 0 |
5393 | | // counter 0 |
5394 | | // nbases 0 |
5395 | | // 1 block (landmark 0) |
5396 | | // (CRC32) |
5397 | 0 | cram_container c; |
5398 | 0 | memset(&c, 0, sizeof(c)); |
5399 | 0 | c.ref_seq_id = -1; |
5400 | 0 | c.ref_seq_start = 0x454f46; // "EOF" |
5401 | 0 | c.ref_seq_span = 0; |
5402 | 0 | c.record_counter = 0; |
5403 | 0 | c.num_bases = 0; |
5404 | 0 | c.num_blocks = 1; |
5405 | 0 | int32_t land[1] = {0}; |
5406 | 0 | c.landmark = land; |
5407 | | |
5408 | | // An empty compression header block with |
5409 | | // method raw (0) |
5410 | | // type comp header (1) |
5411 | | // content id 0 |
5412 | | // block contents size 6 |
5413 | | // raw size 6 |
5414 | | // empty preservation map (01 00) |
5415 | | // empty data series map (01 00) |
5416 | | // empty tag map (01 00) |
5417 | | // block CRC |
5418 | 0 | cram_block_compression_hdr ch; |
5419 | 0 | memset(&ch, 0, sizeof(ch)); |
5420 | 0 | c.comp_hdr_block = cram_encode_compression_header(fd, &c, &ch, 0); |
5421 | |
|
5422 | 0 | c.length = c.comp_hdr_block->byte // Landmark[0] |
5423 | 0 | + 5 // block struct |
5424 | 0 | + 4*(CRAM_MAJOR_VERS(fd->version) >= 3); // CRC |
5425 | 0 | if (cram_write_container(fd, &c) < 0 || |
5426 | 0 | cram_write_block(fd, c.comp_hdr_block) < 0) { |
5427 | 0 | cram_close(fd); |
5428 | 0 | cram_free_block(c.comp_hdr_block); |
5429 | 0 | return -1; |
5430 | 0 | } |
5431 | 0 | if (ch.preservation_map) |
5432 | 0 | kh_destroy(map, ch.preservation_map); |
5433 | 0 | cram_free_block(c.comp_hdr_block); |
5434 | | |
5435 | | // V2.1 bytes |
5436 | | // 0b 00 00 00 ff ff ff ff 0f // Cont HDR: size, ref seq id |
5437 | | // e0 45 4f 46 00 00 00 // Cont HDR: pos, span, nrec, counter |
5438 | | // 00 01 00 // Cont HDR: nbase, nblk, landmark |
5439 | | // 00 01 00 06 06 // Comp.HDR blk |
5440 | | // 01 00 01 00 01 00 // Comp.HDR blk |
5441 | | |
5442 | | // V3.0 bytes: |
5443 | | // 0f 00 00 00 ff ff ff ff 0f // Cont HDR: size, ref seq id |
5444 | | // e0 45 4f 46 00 00 00 // Cont HDR: pos, span, nrec, counter |
5445 | | // 00 01 00 // Cont HDR: nbase, nblk, landmark |
5446 | | // 05 bd d9 4f // CRC32 |
5447 | | // 00 01 00 06 06 // Comp.HDR blk |
5448 | | // 01 00 01 00 01 00 // Comp.HDR blk |
5449 | | // ee 63 01 4b // CRC32 |
5450 | | |
5451 | | // V4.0 bytes: |
5452 | | // 0f 00 00 00 8f ff ff ff // Cont HDR: size, ref seq id |
5453 | | // 82 95 9e 46 00 00 00 // Cont HDR: pos, span, nrec, counter |
5454 | | // 00 01 00 // Cont HDR: nbase, nblk, landmark |
5455 | | // ac d6 05 bc // CRC32 |
5456 | | // 00 01 00 06 06 // Comp.HDR blk |
5457 | | // 01 00 01 00 01 00 // Comp.HDR blk |
5458 | | // ee 63 01 4b // CRC32 |
5459 | 0 | } |
5460 | | |
5461 | 0 | return 0; |
5462 | 0 | } |
5463 | | /* |
5464 | | * Closes a CRAM file. |
5465 | | * Returns 0 on success |
5466 | | * -1 on failure |
5467 | | */ |
5468 | 245 | int cram_close(cram_fd *fd) { |
5469 | 245 | spare_bams *bl, *next; |
5470 | 245 | int i; |
5471 | | |
5472 | 245 | if (!fd) |
5473 | 0 | return -1; |
5474 | | |
5475 | 245 | if (fd->mode == 'w' && fd->ctr) { |
5476 | 0 | if(fd->ctr->slice) |
5477 | 0 | cram_update_curr_slice(fd->ctr, fd->version); |
5478 | |
|
5479 | 0 | if (-1 == cram_flush_container_mt(fd, fd->ctr)) |
5480 | 0 | return -1; |
5481 | 0 | } |
5482 | | |
5483 | 245 | if (fd->mode != 'w') |
5484 | 245 | cram_drain_rqueue(fd); |
5485 | | |
5486 | 245 | if (fd->pool && fd->eof >= 0 && fd->rqueue) { |
5487 | 0 | hts_tpool_process_flush(fd->rqueue); |
5488 | |
|
5489 | 0 | if (0 != cram_flush_result(fd)) |
5490 | 0 | return -1; |
5491 | | |
5492 | 0 | if (fd->mode == 'w') |
5493 | 0 | fd->ctr = NULL; // prevent double freeing |
5494 | |
|
5495 | 0 | pthread_mutex_destroy(&fd->metrics_lock); |
5496 | 0 | pthread_mutex_destroy(&fd->ref_lock); |
5497 | 0 | pthread_mutex_destroy(&fd->bam_list_lock); |
5498 | | |
5499 | | //fprintf(stderr, "CRAM: destroy queue %p\n", fd->rqueue); |
5500 | |
|
5501 | 0 | hts_tpool_process_destroy(fd->rqueue); |
5502 | 0 | } |
5503 | | |
5504 | 245 | if (fd->mode == 'w') { |
5505 | | /* Write EOF block */ |
5506 | 0 | if (0 != cram_write_eof_block(fd)) |
5507 | 0 | return -1; |
5508 | 0 | } |
5509 | | |
5510 | 245 | for (bl = fd->bl; bl; bl = next) { |
5511 | 0 | int i, max_rec = fd->seqs_per_slice * fd->slices_per_container; |
5512 | |
|
5513 | 0 | next = bl->next; |
5514 | 0 | for (i = 0; i < max_rec; i++) { |
5515 | 0 | if (bl->bams[i]) |
5516 | 0 | bam_free(bl->bams[i]); |
5517 | 0 | } |
5518 | 0 | free(bl->bams); |
5519 | 0 | free(bl); |
5520 | 0 | } |
5521 | | |
5522 | 245 | if (hclose(fd->fp) != 0) |
5523 | 0 | return -1; |
5524 | | |
5525 | 245 | if (fd->file_def) |
5526 | 245 | cram_free_file_def(fd->file_def); |
5527 | | |
5528 | 245 | if (fd->header) |
5529 | 245 | sam_hdr_destroy(fd->header); |
5530 | | |
5531 | 245 | free(fd->prefix); |
5532 | | |
5533 | 245 | if (fd->ctr) |
5534 | 207 | cram_free_container(fd->ctr); |
5535 | | |
5536 | 245 | if (fd->ctr_mt && fd->ctr_mt != fd->ctr) |
5537 | 16 | cram_free_container(fd->ctr_mt); |
5538 | | |
5539 | 245 | if (fd->refs) |
5540 | 245 | refs_free(fd->refs); |
5541 | 245 | if (fd->ref_free) |
5542 | 0 | free(fd->ref_free); |
5543 | | |
5544 | 11.7k | for (i = 0; i < DS_END; i++) |
5545 | 11.5k | if (fd->m[i]) |
5546 | 11.5k | free(fd->m[i]); |
5547 | | |
5548 | 245 | if (fd->tags_used) { |
5549 | 245 | khint_t k; |
5550 | | |
5551 | 245 | for (k = kh_begin(fd->tags_used); k != kh_end(fd->tags_used); k++) { |
5552 | 0 | if (kh_exist(fd->tags_used, k)) |
5553 | 0 | free(kh_val(fd->tags_used, k)); |
5554 | 0 | } |
5555 | | |
5556 | 245 | kh_destroy(m_metrics, fd->tags_used); |
5557 | 245 | } |
5558 | | |
5559 | 245 | if (fd->index) |
5560 | 0 | cram_index_free(fd); |
5561 | | |
5562 | 245 | if (fd->own_pool && fd->pool) |
5563 | 0 | hts_tpool_destroy(fd->pool); |
5564 | | |
5565 | 245 | if (fd->idxfp) |
5566 | 0 | if (bgzf_close(fd->idxfp) < 0) |
5567 | 0 | return -1; |
5568 | | |
5569 | 245 | free(fd); |
5570 | 245 | return 0; |
5571 | 245 | } |
5572 | | |
5573 | | /* |
5574 | | * Returns 1 if we hit an EOF while reading. |
5575 | | */ |
5576 | 490 | int cram_eof(cram_fd *fd) { |
5577 | 490 | return fd->eof; |
5578 | 490 | } |
5579 | | |
5580 | | |
5581 | | /* |
5582 | | * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h. |
5583 | | * Use this immediately after opening. |
5584 | | * |
5585 | | * Returns 0 on success |
5586 | | * -1 on failure |
5587 | | */ |
5588 | 245 | int cram_set_option(cram_fd *fd, enum hts_fmt_option opt, ...) { |
5589 | 245 | int r; |
5590 | 245 | va_list args; |
5591 | | |
5592 | 245 | va_start(args, opt); |
5593 | 245 | r = cram_set_voption(fd, opt, args); |
5594 | 245 | va_end(args); |
5595 | | |
5596 | 245 | return r; |
5597 | 245 | } |
5598 | | |
5599 | | /* |
5600 | | * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h. |
5601 | | * Use this immediately after opening. |
5602 | | * |
5603 | | * Returns 0 on success |
5604 | | * -1 on failure |
5605 | | */ |
5606 | 245 | int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) { |
5607 | 245 | refs_t *refs; |
5608 | | |
5609 | 245 | if (!fd) { |
5610 | 0 | errno = EBADF; |
5611 | 0 | return -1; |
5612 | 0 | } |
5613 | | |
5614 | 245 | switch (opt) { |
5615 | 245 | case CRAM_OPT_DECODE_MD: |
5616 | 245 | fd->decode_md = va_arg(args, int); |
5617 | 245 | break; |
5618 | | |
5619 | 0 | case CRAM_OPT_PREFIX: |
5620 | 0 | if (fd->prefix) |
5621 | 0 | free(fd->prefix); |
5622 | 0 | if (!(fd->prefix = strdup(va_arg(args, char *)))) |
5623 | 0 | return -1; |
5624 | 0 | break; |
5625 | | |
5626 | 0 | case CRAM_OPT_VERBOSITY: |
5627 | 0 | break; |
5628 | | |
5629 | 0 | case CRAM_OPT_SEQS_PER_SLICE: |
5630 | 0 | fd->seqs_per_slice = va_arg(args, int); |
5631 | 0 | if (fd->bases_per_slice == BASES_PER_SLICE) |
5632 | 0 | fd->bases_per_slice = fd->seqs_per_slice * 500; |
5633 | 0 | break; |
5634 | | |
5635 | 0 | case CRAM_OPT_BASES_PER_SLICE: |
5636 | 0 | fd->bases_per_slice = va_arg(args, int); |
5637 | 0 | break; |
5638 | | |
5639 | 0 | case CRAM_OPT_SLICES_PER_CONTAINER: |
5640 | 0 | fd->slices_per_container = va_arg(args, int); |
5641 | 0 | break; |
5642 | | |
5643 | 0 | case CRAM_OPT_EMBED_REF: |
5644 | 0 | fd->embed_ref = va_arg(args, int); |
5645 | 0 | break; |
5646 | | |
5647 | 0 | case CRAM_OPT_NO_REF: |
5648 | 0 | fd->no_ref = va_arg(args, int); |
5649 | 0 | break; |
5650 | | |
5651 | 0 | case CRAM_OPT_POS_DELTA: |
5652 | 0 | fd->ap_delta = va_arg(args, int); |
5653 | 0 | break; |
5654 | | |
5655 | 0 | case CRAM_OPT_IGNORE_MD5: |
5656 | 0 | fd->ignore_md5 = va_arg(args, int); |
5657 | 0 | break; |
5658 | | |
5659 | 0 | case CRAM_OPT_LOSSY_NAMES: |
5660 | 0 | fd->lossy_read_names = va_arg(args, int); |
5661 | | // Currently lossy read names required paired (attached) reads. |
5662 | | // TLEN 0 or being 1 out causes read pairs to be detached, breaking |
5663 | | // the lossy read name compression, so we have extra options to |
5664 | | // slacken the exact TLEN round-trip checks. |
5665 | 0 | fd->tlen_approx = fd->lossy_read_names; |
5666 | 0 | fd->tlen_zero = fd->lossy_read_names; |
5667 | 0 | break; |
5668 | | |
5669 | 0 | case CRAM_OPT_USE_BZIP2: |
5670 | 0 | fd->use_bz2 = va_arg(args, int); |
5671 | 0 | break; |
5672 | | |
5673 | 0 | case CRAM_OPT_USE_RANS: |
5674 | 0 | fd->use_rans = va_arg(args, int); |
5675 | 0 | break; |
5676 | | |
5677 | 0 | case CRAM_OPT_USE_TOK: |
5678 | 0 | fd->use_tok = va_arg(args, int); |
5679 | 0 | break; |
5680 | | |
5681 | 0 | case CRAM_OPT_USE_FQZ: |
5682 | 0 | fd->use_fqz = va_arg(args, int); |
5683 | 0 | break; |
5684 | | |
5685 | 0 | case CRAM_OPT_USE_ARITH: |
5686 | 0 | fd->use_arith = va_arg(args, int); |
5687 | 0 | break; |
5688 | | |
5689 | 0 | case CRAM_OPT_USE_LZMA: |
5690 | 0 | fd->use_lzma = va_arg(args, int); |
5691 | 0 | break; |
5692 | | |
5693 | 0 | case CRAM_OPT_SHARED_REF: |
5694 | 0 | fd->shared_ref = 1; |
5695 | 0 | refs = va_arg(args, refs_t *); |
5696 | 0 | if (refs != fd->refs) { |
5697 | 0 | if (fd->refs) |
5698 | 0 | refs_free(fd->refs); |
5699 | 0 | fd->refs = refs; |
5700 | 0 | fd->refs->count++; |
5701 | 0 | } |
5702 | 0 | break; |
5703 | | |
5704 | 0 | case CRAM_OPT_RANGE: { |
5705 | 0 | int r = cram_seek_to_refpos(fd, va_arg(args, cram_range *)); |
5706 | 0 | pthread_mutex_lock(&fd->range_lock); |
5707 | 0 | if (fd->range.refid != -2) |
5708 | 0 | fd->required_fields |= SAM_POS; |
5709 | 0 | pthread_mutex_unlock(&fd->range_lock); |
5710 | 0 | return r; |
5711 | 0 | } |
5712 | | |
5713 | 0 | case CRAM_OPT_RANGE_NOSEEK: { |
5714 | | // As per CRAM_OPT_RANGE, but no seeking |
5715 | 0 | pthread_mutex_lock(&fd->range_lock); |
5716 | 0 | cram_range *r = va_arg(args, cram_range *); |
5717 | 0 | fd->range = *r; |
5718 | 0 | if (r->refid == HTS_IDX_NOCOOR) { |
5719 | 0 | fd->range.refid = -1; |
5720 | 0 | fd->range.start = 0; |
5721 | 0 | } else if (r->refid == HTS_IDX_START || r->refid == HTS_IDX_REST) { |
5722 | 0 | fd->range.refid = -2; // special case in cram_next_slice |
5723 | 0 | } |
5724 | 0 | if (fd->range.refid != -2) |
5725 | 0 | fd->required_fields |= SAM_POS; |
5726 | 0 | fd->ooc = 0; |
5727 | 0 | fd->eof = 0; |
5728 | 0 | pthread_mutex_unlock(&fd->range_lock); |
5729 | 0 | return 0; |
5730 | 0 | } |
5731 | | |
5732 | 0 | case CRAM_OPT_REFERENCE: |
5733 | 0 | return cram_load_reference(fd, va_arg(args, char *)); |
5734 | | |
5735 | 0 | case CRAM_OPT_VERSION: { |
5736 | 0 | int major, minor; |
5737 | 0 | char *s = va_arg(args, char *); |
5738 | 0 | if (2 != sscanf(s, "%d.%d", &major, &minor)) { |
5739 | 0 | hts_log_error("Malformed version string %s", s); |
5740 | 0 | return -1; |
5741 | 0 | } |
5742 | 0 | if (!((major == 1 && minor == 0) || |
5743 | 0 | (major == 2 && (minor == 0 || minor == 1)) || |
5744 | 0 | (major == 3 && (minor == 0 || minor == 1)) || |
5745 | 0 | (major == 4 && minor == 0))) { |
5746 | 0 | hts_log_error("Unknown version string; use 1.0, 2.0, 2.1, 3.0, 3.1 or 4.0"); |
5747 | 0 | errno = EINVAL; |
5748 | 0 | return -1; |
5749 | 0 | } |
5750 | | |
5751 | 0 | if (major > 3 || (major == 3 && minor > 0)) { |
5752 | 0 | hts_log_warning( |
5753 | 0 | "CRAM version %s is still a draft and subject to change.\n" |
5754 | 0 | "This is a technology demonstration that should not be " |
5755 | 0 | "used for archival data.", s); |
5756 | 0 | } |
5757 | |
|
5758 | 0 | fd->version = major*256 + minor; |
5759 | |
|
5760 | 0 | fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3) ? 1 : 0; |
5761 | |
|
5762 | 0 | fd->use_tok = ((CRAM_MAJOR_VERS(fd->version) == 3 && |
5763 | 0 | CRAM_MINOR_VERS(fd->version) >= 1) || |
5764 | 0 | CRAM_MAJOR_VERS(fd->version) >= 4) ? 1 : 0; |
5765 | 0 | cram_init_tables(fd); |
5766 | |
|
5767 | 0 | break; |
5768 | 0 | } |
5769 | | |
5770 | 0 | case CRAM_OPT_MULTI_SEQ_PER_SLICE: |
5771 | 0 | fd->multi_seq_user = fd->multi_seq = va_arg(args, int); |
5772 | 0 | break; |
5773 | | |
5774 | 0 | case CRAM_OPT_NTHREADS: { |
5775 | 0 | int nthreads = va_arg(args, int); |
5776 | 0 | if (nthreads >= 1) { |
5777 | 0 | if (!(fd->pool = hts_tpool_init(nthreads))) |
5778 | 0 | return -1; |
5779 | | |
5780 | 0 | fd->rqueue = hts_tpool_process_init(fd->pool, nthreads*2, 0); |
5781 | 0 | pthread_mutex_init(&fd->metrics_lock, NULL); |
5782 | 0 | pthread_mutex_init(&fd->ref_lock, NULL); |
5783 | 0 | pthread_mutex_init(&fd->range_lock, NULL); |
5784 | 0 | pthread_mutex_init(&fd->bam_list_lock, NULL); |
5785 | 0 | fd->shared_ref = 1; |
5786 | 0 | fd->own_pool = 1; |
5787 | 0 | } |
5788 | 0 | break; |
5789 | 0 | } |
5790 | | |
5791 | 0 | case CRAM_OPT_THREAD_POOL: { |
5792 | 0 | htsThreadPool *p = va_arg(args, htsThreadPool *); |
5793 | 0 | fd->pool = p ? p->pool : NULL; |
5794 | 0 | if (fd->pool) { |
5795 | 0 | fd->rqueue = hts_tpool_process_init(fd->pool, |
5796 | 0 | p->qsize ? p->qsize : hts_tpool_size(fd->pool)*2, |
5797 | 0 | 0); |
5798 | 0 | pthread_mutex_init(&fd->metrics_lock, NULL); |
5799 | 0 | pthread_mutex_init(&fd->ref_lock, NULL); |
5800 | 0 | pthread_mutex_init(&fd->range_lock, NULL); |
5801 | 0 | pthread_mutex_init(&fd->bam_list_lock, NULL); |
5802 | 0 | } |
5803 | 0 | fd->shared_ref = 1; // Needed to avoid clobbering ref between threads |
5804 | 0 | fd->own_pool = 0; |
5805 | | |
5806 | | //fd->qsize = 1; |
5807 | | //fd->decoded = calloc(fd->qsize, sizeof(cram_container *)); |
5808 | | //hts_tpool_dispatch(fd->pool, cram_decoder_thread, fd); |
5809 | 0 | break; |
5810 | 0 | } |
5811 | | |
5812 | 0 | case CRAM_OPT_REQUIRED_FIELDS: |
5813 | 0 | fd->required_fields = va_arg(args, int); |
5814 | 0 | if (fd->range.refid != -2) |
5815 | 0 | fd->required_fields |= SAM_POS; |
5816 | 0 | break; |
5817 | | |
5818 | 0 | case CRAM_OPT_STORE_MD: |
5819 | 0 | fd->store_md = va_arg(args, int); |
5820 | 0 | break; |
5821 | | |
5822 | 0 | case CRAM_OPT_STORE_NM: |
5823 | 0 | fd->store_nm = va_arg(args, int); |
5824 | 0 | break; |
5825 | | |
5826 | 0 | case HTS_OPT_COMPRESSION_LEVEL: |
5827 | 0 | fd->level = va_arg(args, int); |
5828 | 0 | break; |
5829 | | |
5830 | 0 | case HTS_OPT_PROFILE: { |
5831 | 0 | enum hts_profile_option prof = va_arg(args, int); |
5832 | 0 | switch (prof) { |
5833 | 0 | case HTS_PROFILE_FAST: |
5834 | 0 | if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 1; |
5835 | 0 | fd->use_tok = 0; |
5836 | 0 | fd->seqs_per_slice = 10000; |
5837 | 0 | break; |
5838 | | |
5839 | 0 | case HTS_PROFILE_NORMAL: |
5840 | 0 | break; |
5841 | | |
5842 | 0 | case HTS_PROFILE_SMALL: |
5843 | 0 | if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 6; |
5844 | 0 | fd->use_bz2 = 1; |
5845 | 0 | fd->use_fqz = 1; |
5846 | 0 | fd->seqs_per_slice = 25000; |
5847 | 0 | break; |
5848 | | |
5849 | 0 | case HTS_PROFILE_ARCHIVE: |
5850 | 0 | if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 7; |
5851 | 0 | fd->use_bz2 = 1; |
5852 | 0 | fd->use_fqz = 1; |
5853 | 0 | fd->use_arith = 1; |
5854 | 0 | if (fd->level > 7) |
5855 | 0 | fd->use_lzma = 1; |
5856 | 0 | fd->seqs_per_slice = 100000; |
5857 | 0 | break; |
5858 | 0 | } |
5859 | | |
5860 | 0 | if (fd->bases_per_slice == BASES_PER_SLICE) |
5861 | 0 | fd->bases_per_slice = fd->seqs_per_slice * 500; |
5862 | 0 | break; |
5863 | 0 | } |
5864 | | |
5865 | 0 | default: |
5866 | 0 | hts_log_error("Unknown CRAM option code %d", opt); |
5867 | 0 | errno = EINVAL; |
5868 | 0 | return -1; |
5869 | 245 | } |
5870 | | |
5871 | 245 | return 0; |
5872 | 245 | } |
5873 | | |
5874 | | int cram_check_EOF(cram_fd *fd) |
5875 | 0 | { |
5876 | | // Byte 9 in these templates is & with 0x0f to resolve differences |
5877 | | // between ITF-8 interpretations between early Java and C |
5878 | | // implementations of CRAM |
5879 | 0 | static const unsigned char TEMPLATE_2_1[30] = { |
5880 | 0 | 0x0b, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0f, 0xe0, |
5881 | 0 | 0x45, 0x4f, 0x46, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, |
5882 | 0 | 0x01, 0x00, 0x06, 0x06, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00 |
5883 | 0 | }; |
5884 | 0 | static const unsigned char TEMPLATE_3[38] = { |
5885 | 0 | 0x0f, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0f, 0xe0, |
5886 | 0 | 0x45, 0x4f, 0x46, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x05, |
5887 | 0 | 0xbd, 0xd9, 0x4f, 0x00, 0x01, 0x00, 0x06, 0x06, 0x01, 0x00, |
5888 | 0 | 0x01, 0x00, 0x01, 0x00, 0xee, 0x63, 0x01, 0x4b |
5889 | 0 | }; |
5890 | |
|
5891 | 0 | unsigned char buf[38]; // max(sizeof TEMPLATE_*) |
5892 | |
|
5893 | 0 | uint8_t major = CRAM_MAJOR_VERS(fd->version); |
5894 | 0 | uint8_t minor = CRAM_MINOR_VERS(fd->version); |
5895 | |
|
5896 | 0 | const unsigned char *template; |
5897 | 0 | ssize_t template_len; |
5898 | 0 | if ((major < 2) || |
5899 | 0 | (major == 2 && minor == 0)) { |
5900 | 0 | return 3; // No EOF support in cram versions less than 2.1 |
5901 | 0 | } else if (major == 2 && minor == 1) { |
5902 | 0 | template = TEMPLATE_2_1; |
5903 | 0 | template_len = sizeof TEMPLATE_2_1; |
5904 | 0 | } else { |
5905 | 0 | template = TEMPLATE_3; |
5906 | 0 | template_len = sizeof TEMPLATE_3; |
5907 | 0 | } |
5908 | | |
5909 | 0 | off_t offset = htell(fd->fp); |
5910 | 0 | if (hseek(fd->fp, -template_len, SEEK_END) < 0) { |
5911 | 0 | if (errno == ESPIPE) { |
5912 | 0 | hclearerr(fd->fp); |
5913 | 0 | return 2; |
5914 | 0 | } |
5915 | 0 | else { |
5916 | 0 | return -1; |
5917 | 0 | } |
5918 | 0 | } |
5919 | 0 | if (hread(fd->fp, buf, template_len) != template_len) return -1; |
5920 | 0 | if (hseek(fd->fp, offset, SEEK_SET) < 0) return -1; |
5921 | 0 | buf[8] &= 0x0f; |
5922 | 0 | return (memcmp(template, buf, template_len) == 0)? 1 : 0; |
5923 | 0 | } |