/src/htslib/cram/cram_io.c
Line | Count | Source |
1 | | /* |
2 | | Copyright (c) 2012-2026 Genome Research Ltd. |
3 | | Author: James Bonfield <jkb@sanger.ac.uk> |
4 | | |
5 | | Redistribution and use in source and binary forms, with or without |
6 | | modification, are permitted provided that the following conditions are met: |
7 | | |
8 | | 1. Redistributions of source code must retain the above copyright notice, |
9 | | this list of conditions and the following disclaimer. |
10 | | |
11 | | 2. Redistributions in binary form must reproduce the above copyright notice, |
12 | | this list of conditions and the following disclaimer in the documentation |
13 | | and/or other materials provided with the distribution. |
14 | | |
15 | | 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger |
16 | | Institute nor the names of its contributors may be used to endorse or promote |
17 | | products derived from this software without specific prior written permission. |
18 | | |
19 | | THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND |
20 | | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
21 | | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
22 | | DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE |
23 | | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
24 | | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
25 | | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
26 | | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
27 | | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
28 | | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
29 | | */ |
30 | | |
31 | | /* |
32 | | * CRAM I/O primitives. |
33 | | * |
34 | | * - ITF8 encoding and decoding. |
35 | | * - Block based I/O |
36 | | * - Zlib inflating and deflating (memory) |
37 | | * - CRAM basic data structure reading and writing |
38 | | * - File opening / closing |
39 | | * - Reference sequence handling |
40 | | */ |
41 | | |
42 | | #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h |
43 | | #include <config.h> |
44 | | |
45 | | #include <stdio.h> |
46 | | #include <errno.h> |
47 | | #include <assert.h> |
48 | | #include <stdlib.h> |
49 | | #include <string.h> |
50 | | #include <unistd.h> |
51 | | #include <zlib.h> |
52 | | #ifdef HAVE_LIBBZ2 |
53 | | #include <bzlib.h> |
54 | | #endif |
55 | | #ifdef HAVE_LIBLZMA |
56 | | #ifdef HAVE_LZMA_H |
57 | | #include <lzma.h> |
58 | | #else |
59 | | #include "../os/lzma_stub.h" |
60 | | #endif |
61 | | #endif |
62 | | #include <sys/types.h> |
63 | | #include <sys/stat.h> |
64 | | #include <math.h> |
65 | | #include <stdint.h> |
66 | | |
67 | | #ifdef HAVE_LIBDEFLATE |
68 | | #include <libdeflate.h> |
69 | | #define crc32(a,b,c) libdeflate_crc32((a),(b),(c)) |
70 | | #endif |
71 | | |
72 | | #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
73 | | #include "../fuzz_settings.h" |
74 | | #endif |
75 | | |
76 | | #include "cram.h" |
77 | | #include "os.h" |
78 | | #include "../htslib/hts.h" |
79 | | #include "../htslib/hts_alloc.h" |
80 | | #include "../hts_internal.h" |
81 | | #include "open_trace_file.h" |
82 | | |
83 | | #if defined(HAVE_EXTERNAL_LIBHTSCODECS) |
84 | | #include <htscodecs/rANS_static.h> |
85 | | #include <htscodecs/rANS_static4x16.h> |
86 | | #include <htscodecs/arith_dynamic.h> |
87 | | #include <htscodecs/tokenise_name3.h> |
88 | | #include <htscodecs/fqzcomp_qual.h> |
89 | | #else |
90 | | #include "../htscodecs/htscodecs/rANS_static.h" |
91 | | #include "../htscodecs/htscodecs/rANS_static4x16.h" |
92 | | #include "../htscodecs/htscodecs/arith_dynamic.h" |
93 | | #include "../htscodecs/htscodecs/tokenise_name3.h" |
94 | | #include "../htscodecs/htscodecs/fqzcomp_qual.h" |
95 | | #include "../htscodecs/htscodecs/varint.h" |
96 | | #endif |
97 | | |
98 | | //#define REF_DEBUG |
99 | | |
100 | | #ifdef REF_DEBUG |
101 | | #include <sys/syscall.h> |
102 | | #define gettid() (int)syscall(SYS_gettid) |
103 | | |
104 | | #define RP(...) fprintf (stderr, __VA_ARGS__) |
105 | | #else |
106 | | #define RP(...) |
107 | | #endif |
108 | | |
109 | | #include "../htslib/hfile.h" |
110 | | #include "../htslib/bgzf.h" |
111 | | #include "../htslib/faidx.h" |
112 | | #include "../hts_internal.h" |
113 | | |
114 | | #ifndef PATH_MAX |
115 | | #define PATH_MAX FILENAME_MAX |
116 | | #endif |
117 | | |
118 | 756k | #define TRIAL_SPAN 70 |
119 | 756k | #define NTRIALS 3 |
120 | | |
121 | 17.7k | #define CRAM_DEFAULT_LEVEL 5 |
122 | | |
123 | | /* ---------------------------------------------------------------------- |
124 | | * ITF8 encoding and decoding. |
125 | | * |
126 | | * Also see the itf8_get and itf8_put macros in cram_io.h |
127 | | */ |
128 | | |
129 | | /* |
130 | | * LEGACY: consider using itf8_decode_crc. |
131 | | * |
132 | | * Reads an integer in ITF-8 encoding from 'cp' and stores it in |
133 | | * *val. |
134 | | * |
135 | | * Returns the number of bytes read on success |
136 | | * -1 on failure |
137 | | */ |
138 | 0 | int itf8_decode(cram_fd *fd, int32_t *val_p) { |
139 | 0 | static int nbytes[16] = { |
140 | 0 | 0,0,0,0, 0,0,0,0, // 0000xxxx - 0111xxxx |
141 | 0 | 1,1,1,1, // 1000xxxx - 1011xxxx |
142 | 0 | 2,2, // 1100xxxx - 1101xxxx |
143 | 0 | 3, // 1110xxxx |
144 | 0 | 4, // 1111xxxx |
145 | 0 | }; |
146 | |
|
147 | 0 | static int nbits[16] = { |
148 | 0 | 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx |
149 | 0 | 0x3f, 0x3f, 0x3f, 0x3f, // 1000xxxx - 1011xxxx |
150 | 0 | 0x1f, 0x1f, // 1100xxxx - 1101xxxx |
151 | 0 | 0x0f, // 1110xxxx |
152 | 0 | 0x0f, // 1111xxxx |
153 | 0 | }; |
154 | |
|
155 | 0 | int32_t val = hgetc(fd->fp); |
156 | 0 | if (val == -1) |
157 | 0 | return -1; |
158 | | |
159 | 0 | int i = nbytes[val>>4]; |
160 | 0 | val &= nbits[val>>4]; |
161 | |
|
162 | 0 | switch(i) { |
163 | 0 | case 0: |
164 | 0 | *val_p = val; |
165 | 0 | return 1; |
166 | | |
167 | 0 | case 1: |
168 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
169 | 0 | *val_p = val; |
170 | 0 | return 2; |
171 | | |
172 | 0 | case 2: |
173 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
174 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
175 | 0 | *val_p = val; |
176 | 0 | return 3; |
177 | | |
178 | 0 | case 3: |
179 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
180 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
181 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
182 | 0 | *val_p = val; |
183 | 0 | return 4; |
184 | | |
185 | 0 | case 4: // really 3.5 more, why make it different? |
186 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
187 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
188 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
189 | 0 | val = (val<<4) | (((unsigned char)hgetc(fd->fp)) & 0x0f); |
190 | 0 | *val_p = val; |
191 | 0 | } |
192 | | |
193 | 0 | return 5; |
194 | 0 | } |
195 | | |
196 | 133k | int itf8_decode_crc(cram_fd *fd, int32_t *val_p, uint32_t *crc) { |
197 | 133k | static int nbytes[16] = { |
198 | 133k | 0,0,0,0, 0,0,0,0, // 0000xxxx - 0111xxxx |
199 | 133k | 1,1,1,1, // 1000xxxx - 1011xxxx |
200 | 133k | 2,2, // 1100xxxx - 1101xxxx |
201 | 133k | 3, // 1110xxxx |
202 | 133k | 4, // 1111xxxx |
203 | 133k | }; |
204 | | |
205 | 133k | static int nbits[16] = { |
206 | 133k | 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx |
207 | 133k | 0x3f, 0x3f, 0x3f, 0x3f, // 1000xxxx - 1011xxxx |
208 | 133k | 0x1f, 0x1f, // 1100xxxx - 1101xxxx |
209 | 133k | 0x0f, // 1110xxxx |
210 | 133k | 0x0f, // 1111xxxx |
211 | 133k | }; |
212 | 133k | unsigned char c[5]; |
213 | | |
214 | 133k | int32_t val = hgetc(fd->fp); |
215 | 133k | if (val == -1) |
216 | 274 | return -1; |
217 | | |
218 | 132k | c[0]=val; |
219 | | |
220 | 132k | int i = nbytes[val>>4]; |
221 | 132k | val &= nbits[val>>4]; |
222 | | |
223 | 132k | if (i > 0) { |
224 | 27.9k | if (hread(fd->fp, &c[1], i) < i) |
225 | 95 | return -1; |
226 | 27.9k | } |
227 | | |
228 | 132k | switch(i) { |
229 | 104k | case 0: |
230 | 104k | *val_p = val; |
231 | 104k | *crc = crc32(*crc, c, 1); |
232 | 104k | return 1; |
233 | | |
234 | 20.2k | case 1: |
235 | 20.2k | val = (val<<8) | c[1]; |
236 | 20.2k | *val_p = val; |
237 | 20.2k | *crc = crc32(*crc, c, 2); |
238 | 20.2k | return 2; |
239 | | |
240 | 3.86k | case 2: |
241 | 3.86k | val = (val<<8) | c[1]; |
242 | 3.86k | val = (val<<8) | c[2]; |
243 | 3.86k | *val_p = val; |
244 | 3.86k | *crc = crc32(*crc, c, 3); |
245 | 3.86k | return 3; |
246 | | |
247 | 1.01k | case 3: |
248 | 1.01k | val = (val<<8) | c[1]; |
249 | 1.01k | val = (val<<8) | c[2]; |
250 | 1.01k | val = (val<<8) | c[3]; |
251 | 1.01k | *val_p = val; |
252 | 1.01k | *crc = crc32(*crc, c, 4); |
253 | 1.01k | return 4; |
254 | | |
255 | 2.72k | case 4: // really 3.5 more, why make it different? |
256 | 2.72k | { |
257 | 2.72k | uint32_t uv = val; |
258 | 2.72k | uv = (uv<<8) | c[1]; |
259 | 2.72k | uv = (uv<<8) | c[2]; |
260 | 2.72k | uv = (uv<<8) | c[3]; |
261 | 2.72k | uv = (uv<<4) | (c[4] & 0x0f); |
262 | | // Avoid implementation-defined behaviour on negative values |
263 | 2.72k | *val_p = uv < 0x80000000UL ? (int32_t) uv : -((int32_t) (0xffffffffUL - uv)) - 1; |
264 | 2.72k | *crc = crc32(*crc, c, 5); |
265 | 2.72k | } |
266 | 132k | } |
267 | | |
268 | 2.72k | return 5; |
269 | 132k | } |
270 | | |
271 | | /* |
272 | | * Stores a value to memory in ITF-8 format. |
273 | | * |
274 | | * Returns the number of bytes required to store the number. |
275 | | * This is a maximum of 5 bytes. |
276 | | */ |
277 | 11.8M | static inline int itf8_put(char *cp, int32_t val) { |
278 | 11.8M | unsigned char *up = (unsigned char *)cp; |
279 | 11.8M | if (!(val & ~0x00000007f)) { // 1 byte |
280 | 11.7M | *up = val; |
281 | 11.7M | return 1; |
282 | 11.7M | } else if (!(val & ~0x00003fff)) { // 2 byte |
283 | 72.5k | *up++ = (val >> 8 ) | 0x80; |
284 | 72.5k | *up = val & 0xff; |
285 | 72.5k | return 2; |
286 | 72.5k | } else if (!(val & ~0x01fffff)) { // 3 byte |
287 | 2.70k | *up++ = (val >> 16) | 0xc0; |
288 | 2.70k | *up++ = (val >> 8 ) & 0xff; |
289 | 2.70k | *up = val & 0xff; |
290 | 2.70k | return 3; |
291 | 42.3k | } else if (!(val & ~0x0fffffff)) { // 4 byte |
292 | 16.4k | *up++ = (val >> 24) | 0xe0; |
293 | 16.4k | *up++ = (val >> 16) & 0xff; |
294 | 16.4k | *up++ = (val >> 8 ) & 0xff; |
295 | 16.4k | *up = val & 0xff; |
296 | 16.4k | return 4; |
297 | 25.8k | } else { // 5 byte |
298 | 25.8k | *up++ = 0xf0 | ((val>>28) & 0xff); |
299 | 25.8k | *up++ = (val >> 20) & 0xff; |
300 | 25.8k | *up++ = (val >> 12) & 0xff; |
301 | 25.8k | *up++ = (val >> 4 ) & 0xff; |
302 | 25.8k | *up = val & 0x0f; |
303 | 25.8k | return 5; |
304 | 25.8k | } |
305 | 11.8M | } |
306 | | |
307 | | |
308 | | /* 64-bit itf8 variant */ |
309 | 36.5k | static inline int ltf8_put(char *cp, int64_t val) { |
310 | 36.5k | unsigned char *up = (unsigned char *)cp; |
311 | 36.5k | if (!(val & ~((1LL<<7)-1))) { |
312 | 34.5k | *up = val; |
313 | 34.5k | return 1; |
314 | 34.5k | } else if (!(val & ~((1LL<<(6+8))-1))) { |
315 | 894 | *up++ = (val >> 8 ) | 0x80; |
316 | 894 | *up = val & 0xff; |
317 | 894 | return 2; |
318 | 1.06k | } else if (!(val & ~((1LL<<(5+2*8))-1))) { |
319 | 1.03k | *up++ = (val >> 16) | 0xc0; |
320 | 1.03k | *up++ = (val >> 8 ) & 0xff; |
321 | 1.03k | *up = val & 0xff; |
322 | 1.03k | return 3; |
323 | 1.03k | } else if (!(val & ~((1LL<<(4+3*8))-1))) { |
324 | 39 | *up++ = (val >> 24) | 0xe0; |
325 | 39 | *up++ = (val >> 16) & 0xff; |
326 | 39 | *up++ = (val >> 8 ) & 0xff; |
327 | 39 | *up = val & 0xff; |
328 | 39 | return 4; |
329 | 39 | } else if (!(val & ~((1LL<<(3+4*8))-1))) { |
330 | 0 | *up++ = (val >> 32) | 0xf0; |
331 | 0 | *up++ = (val >> 24) & 0xff; |
332 | 0 | *up++ = (val >> 16) & 0xff; |
333 | 0 | *up++ = (val >> 8 ) & 0xff; |
334 | 0 | *up = val & 0xff; |
335 | 0 | return 5; |
336 | 0 | } else if (!(val & ~((1LL<<(2+5*8))-1))) { |
337 | 0 | *up++ = (val >> 40) | 0xf8; |
338 | 0 | *up++ = (val >> 32) & 0xff; |
339 | 0 | *up++ = (val >> 24) & 0xff; |
340 | 0 | *up++ = (val >> 16) & 0xff; |
341 | 0 | *up++ = (val >> 8 ) & 0xff; |
342 | 0 | *up = val & 0xff; |
343 | 0 | return 6; |
344 | 0 | } else if (!(val & ~((1LL<<(1+6*8))-1))) { |
345 | 0 | *up++ = (val >> 48) | 0xfc; |
346 | 0 | *up++ = (val >> 40) & 0xff; |
347 | 0 | *up++ = (val >> 32) & 0xff; |
348 | 0 | *up++ = (val >> 24) & 0xff; |
349 | 0 | *up++ = (val >> 16) & 0xff; |
350 | 0 | *up++ = (val >> 8 ) & 0xff; |
351 | 0 | *up = val & 0xff; |
352 | 0 | return 7; |
353 | 0 | } else if (!(val & ~((1LL<<(7*8))-1))) { |
354 | 0 | *up++ = (val >> 56) | 0xfe; |
355 | 0 | *up++ = (val >> 48) & 0xff; |
356 | 0 | *up++ = (val >> 40) & 0xff; |
357 | 0 | *up++ = (val >> 32) & 0xff; |
358 | 0 | *up++ = (val >> 24) & 0xff; |
359 | 0 | *up++ = (val >> 16) & 0xff; |
360 | 0 | *up++ = (val >> 8 ) & 0xff; |
361 | 0 | *up = val & 0xff; |
362 | 0 | return 8; |
363 | 0 | } else { |
364 | 0 | *up++ = 0xff; |
365 | 0 | *up++ = (val >> 56) & 0xff; |
366 | 0 | *up++ = (val >> 48) & 0xff; |
367 | 0 | *up++ = (val >> 40) & 0xff; |
368 | 0 | *up++ = (val >> 32) & 0xff; |
369 | 0 | *up++ = (val >> 24) & 0xff; |
370 | 0 | *up++ = (val >> 16) & 0xff; |
371 | 0 | *up++ = (val >> 8 ) & 0xff; |
372 | 0 | *up = val & 0xff; |
373 | 0 | return 9; |
374 | 0 | } |
375 | 36.5k | } |
376 | | |
377 | | /* |
378 | | * Encodes and writes a single integer in ITF-8 format. |
379 | | * Returns 0 on success |
380 | | * -1 on failure |
381 | | */ |
382 | 0 | int itf8_encode(cram_fd *fd, int32_t val) { |
383 | 0 | char buf[5]; |
384 | 0 | int len = itf8_put(buf, val); |
385 | 0 | return hwrite(fd->fp, buf, len) == len ? 0 : -1; |
386 | 0 | } |
387 | | |
388 | | const int itf8_bytes[16] = { |
389 | | 1, 1, 1, 1, 1, 1, 1, 1, |
390 | | 2, 2, 2, 2, 3, 3, 4, 5 |
391 | | }; |
392 | | |
393 | | const int ltf8_bytes[256] = { |
394 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
395 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
396 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
397 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
398 | | |
399 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
400 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
401 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
402 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
403 | | |
404 | | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
405 | | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
406 | | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
407 | | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
408 | | |
409 | | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, |
410 | | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, |
411 | | |
412 | | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, |
413 | | |
414 | | 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 8, 9 |
415 | | }; |
416 | | |
417 | | /* |
418 | | * LEGACY: consider using ltf8_decode_crc. |
419 | | */ |
420 | 0 | int ltf8_decode(cram_fd *fd, int64_t *val_p) { |
421 | 0 | int c = hgetc(fd->fp); |
422 | 0 | int64_t val = (unsigned char)c; |
423 | 0 | if (c == -1) |
424 | 0 | return -1; |
425 | | |
426 | 0 | if (val < 0x80) { |
427 | 0 | *val_p = val; |
428 | 0 | return 1; |
429 | |
|
430 | 0 | } else if (val < 0xc0) { |
431 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
432 | 0 | *val_p = val & (((1LL<<(6+8)))-1); |
433 | 0 | return 2; |
434 | |
|
435 | 0 | } else if (val < 0xe0) { |
436 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
437 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
438 | 0 | *val_p = val & ((1LL<<(5+2*8))-1); |
439 | 0 | return 3; |
440 | |
|
441 | 0 | } else if (val < 0xf0) { |
442 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
443 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
444 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
445 | 0 | *val_p = val & ((1LL<<(4+3*8))-1); |
446 | 0 | return 4; |
447 | |
|
448 | 0 | } else if (val < 0xf8) { |
449 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
450 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
451 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
452 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
453 | 0 | *val_p = val & ((1LL<<(3+4*8))-1); |
454 | 0 | return 5; |
455 | |
|
456 | 0 | } else if (val < 0xfc) { |
457 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
458 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
459 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
460 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
461 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
462 | 0 | *val_p = val & ((1LL<<(2+5*8))-1); |
463 | 0 | return 6; |
464 | |
|
465 | 0 | } else if (val < 0xfe) { |
466 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
467 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
468 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
469 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
470 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
471 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
472 | 0 | *val_p = val & ((1LL<<(1+6*8))-1); |
473 | 0 | return 7; |
474 | |
|
475 | 0 | } else if (val < 0xff) { |
476 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
477 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
478 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
479 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
480 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
481 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
482 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
483 | 0 | *val_p = val & ((1LL<<(7*8))-1); |
484 | 0 | return 8; |
485 | |
|
486 | 0 | } else { |
487 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
488 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
489 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
490 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
491 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
492 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
493 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
494 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
495 | 0 | *val_p = val; |
496 | 0 | } |
497 | | |
498 | 0 | return 9; |
499 | 0 | } |
500 | | |
501 | 3.79k | int ltf8_decode_crc(cram_fd *fd, int64_t *val_p, uint32_t *crc) { |
502 | 3.79k | unsigned char c[9]; |
503 | 3.79k | int64_t val = hgetc(fd->fp); |
504 | 3.79k | if (val < 0) |
505 | 16 | return -1; |
506 | | |
507 | 3.77k | c[0] = val; |
508 | | |
509 | 3.77k | if (val < 0x80) { |
510 | 3.18k | *val_p = val; |
511 | 3.18k | *crc = crc32(*crc, c, 1); |
512 | 3.18k | return 1; |
513 | | |
514 | 3.18k | } else if (val < 0xc0) { |
515 | 107 | int v = hgetc(fd->fp); |
516 | 107 | if (v < 0) |
517 | 5 | return -1; |
518 | 102 | val = (val<<8) | (c[1]=v); |
519 | 102 | *val_p = val & (((1LL<<(6+8)))-1); |
520 | 102 | *crc = crc32(*crc, c, 2); |
521 | 102 | return 2; |
522 | | |
523 | 490 | } else if (val < 0xe0) { |
524 | 36 | if (hread(fd->fp, &c[1], 2) < 2) |
525 | 4 | return -1; |
526 | 32 | val = (val<<8) | c[1]; |
527 | 32 | val = (val<<8) | c[2]; |
528 | 32 | *val_p = val & ((1LL<<(5+2*8))-1); |
529 | 32 | *crc = crc32(*crc, c, 3); |
530 | 32 | return 3; |
531 | | |
532 | 454 | } else if (val < 0xf0) { |
533 | 46 | if (hread(fd->fp, &c[1], 3) < 3) |
534 | 4 | return -1; |
535 | 42 | val = (val<<8) | c[1]; |
536 | 42 | val = (val<<8) | c[2]; |
537 | 42 | val = (val<<8) | c[3]; |
538 | 42 | *val_p = val & ((1LL<<(4+3*8))-1); |
539 | 42 | *crc = crc32(*crc, c, 4); |
540 | 42 | return 4; |
541 | | |
542 | 408 | } else if (val < 0xf8) { |
543 | 37 | if (hread(fd->fp, &c[1], 4) < 4) |
544 | 7 | return -1; |
545 | 30 | val = (val<<8) | c[1]; |
546 | 30 | val = (val<<8) | c[2]; |
547 | 30 | val = (val<<8) | c[3]; |
548 | 30 | val = (val<<8) | c[4]; |
549 | 30 | *val_p = val & ((1LL<<(3+4*8))-1); |
550 | 30 | *crc = crc32(*crc, c, 5); |
551 | 30 | return 5; |
552 | | |
553 | 371 | } else if (val < 0xfc) { |
554 | 98 | if (hread(fd->fp, &c[1], 5) < 5) |
555 | 5 | return -1; |
556 | 93 | val = (val<<8) | c[1]; |
557 | 93 | val = (val<<8) | c[2]; |
558 | 93 | val = (val<<8) | c[3]; |
559 | 93 | val = (val<<8) | c[4]; |
560 | 93 | val = (val<<8) | c[5]; |
561 | 93 | *val_p = val & ((1LL<<(2+5*8))-1); |
562 | 93 | *crc = crc32(*crc, c, 6); |
563 | 93 | return 6; |
564 | | |
565 | 273 | } else if (val < 0xfe) { |
566 | 96 | if (hread(fd->fp, &c[1], 6) < 6) |
567 | 5 | return -1; |
568 | 91 | val = (val<<8) | c[1]; |
569 | 91 | val = (val<<8) | c[2]; |
570 | 91 | val = (val<<8) | c[3]; |
571 | 91 | val = (val<<8) | c[4]; |
572 | 91 | val = (val<<8) | c[5]; |
573 | 91 | val = (val<<8) | c[6]; |
574 | 91 | *val_p = val & ((1LL<<(1+6*8))-1); |
575 | 91 | *crc = crc32(*crc, c, 7); |
576 | 91 | return 7; |
577 | | |
578 | 177 | } else if (val < 0xff) { |
579 | 18 | uint64_t uval = val; |
580 | 18 | if (hread(fd->fp, &c[1], 7) < 7) |
581 | 1 | return -1; |
582 | 17 | uval = (uval<<8) | c[1]; |
583 | 17 | uval = (uval<<8) | c[2]; |
584 | 17 | uval = (uval<<8) | c[3]; |
585 | 17 | uval = (uval<<8) | c[4]; |
586 | 17 | uval = (uval<<8) | c[5]; |
587 | 17 | uval = (uval<<8) | c[6]; |
588 | 17 | uval = (uval<<8) | c[7]; |
589 | 17 | *val_p = uval & ((1ULL<<(7*8))-1); |
590 | 17 | *crc = crc32(*crc, c, 8); |
591 | 17 | return 8; |
592 | | |
593 | 159 | } else { |
594 | 159 | uint64_t uval; |
595 | 159 | if (hread(fd->fp, &c[1], 8) < 8) |
596 | 9 | return -1; |
597 | 150 | uval = c[1]; |
598 | 150 | uval = (uval<<8) | c[2]; |
599 | 150 | uval = (uval<<8) | c[3]; |
600 | 150 | uval = (uval<<8) | c[4]; |
601 | 150 | uval = (uval<<8) | c[5]; |
602 | 150 | uval = (uval<<8) | c[6]; |
603 | 150 | uval = (uval<<8) | c[7]; |
604 | 150 | uval = (uval<<8) | c[8]; |
605 | 150 | *crc = crc32(*crc, c, 9); |
606 | | // Avoid implementation-defined behaviour on negative values |
607 | 150 | *val_p = c[1] < 0x80 ? (int64_t) uval : -((int64_t) (0xffffffffffffffffULL - uval)) - 1; |
608 | 150 | } |
609 | | |
610 | 150 | return 9; |
611 | 3.77k | } |
612 | | |
613 | | /* |
614 | | * Pushes a value in ITF8 format onto the end of a block. |
615 | | * This shouldn't be used for high-volume data as it is not the fastest |
616 | | * method. |
617 | | * |
618 | | * Returns the number of bytes written |
619 | | */ |
620 | 11.2M | int itf8_put_blk(cram_block *blk, int32_t val) { |
621 | 11.2M | char buf[5]; |
622 | 11.2M | int sz; |
623 | | |
624 | 11.2M | sz = itf8_put(buf, val); |
625 | 11.2M | BLOCK_APPEND(blk, buf, sz); |
626 | 11.2M | return sz; |
627 | | |
628 | 0 | block_err: |
629 | 0 | return -1; |
630 | 11.2M | } |
631 | | |
632 | 0 | int ltf8_put_blk(cram_block *blk, int64_t val) { |
633 | 0 | char buf[9]; |
634 | 0 | int sz; |
635 | |
|
636 | 0 | sz = ltf8_put(buf, val); |
637 | 0 | BLOCK_APPEND(blk, buf, sz); |
638 | 0 | return sz; |
639 | | |
640 | 0 | block_err: |
641 | 0 | return -1; |
642 | 0 | } |
643 | | |
644 | 1.43M | static int64_t safe_itf8_get(char **cp, const char *endp, int *err) { |
645 | 1.43M | const unsigned char *up = (unsigned char *)*cp; |
646 | | |
647 | 1.43M | if (endp && endp - *cp < 5 && |
648 | 1.31M | (*cp >= endp || endp - *cp < itf8_bytes[up[0]>>4])) { |
649 | 1.31M | if (err) *err = 1; |
650 | 1.31M | return 0; |
651 | 1.31M | } |
652 | | |
653 | 117k | if (up[0] < 0x80) { |
654 | 107k | (*cp)++; |
655 | 107k | return up[0]; |
656 | 107k | } else if (up[0] < 0xc0) { |
657 | 4.33k | (*cp)+=2; |
658 | 4.33k | return ((up[0] <<8) | up[1]) & 0x3fff; |
659 | 5.46k | } else if (up[0] < 0xe0) { |
660 | 2.05k | (*cp)+=3; |
661 | 2.05k | return ((up[0]<<16) | (up[1]<< 8) | up[2]) & 0x1fffff; |
662 | 3.41k | } else if (up[0] < 0xf0) { |
663 | 894 | (*cp)+=4; |
664 | 894 | uint32_t uv = (((uint32_t)up[0]<<24) | (up[1]<<16) | (up[2]<<8) | up[3]) & 0x0fffffff; |
665 | 894 | return (int32_t)uv; |
666 | 2.52k | } else { |
667 | 2.52k | (*cp)+=5; |
668 | 2.52k | uint32_t uv = (((uint32_t)up[0] & 0x0f)<<28) | (up[1]<<20) | (up[2]<<12) | (up[3]<<4) | (up[4] & 0x0f); |
669 | 2.52k | return (int32_t)uv; |
670 | 2.52k | } |
671 | 117k | } |
672 | | |
673 | 0 | static int64_t safe_ltf8_get(char **cp, const char *endp, int *err) { |
674 | 0 | unsigned char *up = (unsigned char *)*cp; |
675 | |
|
676 | 0 | if (endp && endp - *cp < 9 && |
677 | 0 | (*cp >= endp || endp - *cp < ltf8_bytes[up[0]])) { |
678 | 0 | if (err) *err = 1; |
679 | 0 | return 0; |
680 | 0 | } |
681 | | |
682 | 0 | if (up[0] < 0x80) { |
683 | 0 | (*cp)++; |
684 | 0 | return up[0]; |
685 | 0 | } else if (up[0] < 0xc0) { |
686 | 0 | (*cp)+=2; |
687 | 0 | return (((uint64_t)up[0]<< 8) | |
688 | 0 | (uint64_t)up[1]) & (((1LL<<(6+8)))-1); |
689 | 0 | } else if (up[0] < 0xe0) { |
690 | 0 | (*cp)+=3; |
691 | 0 | return (((uint64_t)up[0]<<16) | |
692 | 0 | ((uint64_t)up[1]<< 8) | |
693 | 0 | (uint64_t)up[2]) & ((1LL<<(5+2*8))-1); |
694 | 0 | } else if (up[0] < 0xf0) { |
695 | 0 | (*cp)+=4; |
696 | 0 | return (((uint64_t)up[0]<<24) | |
697 | 0 | ((uint64_t)up[1]<<16) | |
698 | 0 | ((uint64_t)up[2]<< 8) | |
699 | 0 | (uint64_t)up[3]) & ((1LL<<(4+3*8))-1); |
700 | 0 | } else if (up[0] < 0xf8) { |
701 | 0 | (*cp)+=5; |
702 | 0 | return (((uint64_t)up[0]<<32) | |
703 | 0 | ((uint64_t)up[1]<<24) | |
704 | 0 | ((uint64_t)up[2]<<16) | |
705 | 0 | ((uint64_t)up[3]<< 8) | |
706 | 0 | (uint64_t)up[4]) & ((1LL<<(3+4*8))-1); |
707 | 0 | } else if (up[0] < 0xfc) { |
708 | 0 | (*cp)+=6; |
709 | 0 | return (((uint64_t)up[0]<<40) | |
710 | 0 | ((uint64_t)up[1]<<32) | |
711 | 0 | ((uint64_t)up[2]<<24) | |
712 | 0 | ((uint64_t)up[3]<<16) | |
713 | 0 | ((uint64_t)up[4]<< 8) | |
714 | 0 | (uint64_t)up[5]) & ((1LL<<(2+5*8))-1); |
715 | 0 | } else if (up[0] < 0xfe) { |
716 | 0 | (*cp)+=7; |
717 | 0 | return (((uint64_t)up[0]<<48) | |
718 | 0 | ((uint64_t)up[1]<<40) | |
719 | 0 | ((uint64_t)up[2]<<32) | |
720 | 0 | ((uint64_t)up[3]<<24) | |
721 | 0 | ((uint64_t)up[4]<<16) | |
722 | 0 | ((uint64_t)up[5]<< 8) | |
723 | 0 | (uint64_t)up[6]) & ((1LL<<(1+6*8))-1); |
724 | 0 | } else if (up[0] < 0xff) { |
725 | 0 | (*cp)+=8; |
726 | 0 | return (((uint64_t)up[1]<<48) | |
727 | 0 | ((uint64_t)up[2]<<40) | |
728 | 0 | ((uint64_t)up[3]<<32) | |
729 | 0 | ((uint64_t)up[4]<<24) | |
730 | 0 | ((uint64_t)up[5]<<16) | |
731 | 0 | ((uint64_t)up[6]<< 8) | |
732 | 0 | (uint64_t)up[7]) & ((1LL<<(7*8))-1); |
733 | 0 | } else { |
734 | 0 | (*cp)+=9; |
735 | 0 | return (((uint64_t)up[1]<<56) | |
736 | 0 | ((uint64_t)up[2]<<48) | |
737 | 0 | ((uint64_t)up[3]<<40) | |
738 | 0 | ((uint64_t)up[4]<<32) | |
739 | 0 | ((uint64_t)up[5]<<24) | |
740 | 0 | ((uint64_t)up[6]<<16) | |
741 | 0 | ((uint64_t)up[7]<< 8) | |
742 | 0 | (uint64_t)up[8]); |
743 | 0 | } |
744 | 0 | } |
745 | | |
746 | | // Wrapper for now |
747 | 550k | static int safe_itf8_put(char *cp, char *cp_end, int32_t val) { |
748 | 550k | return itf8_put(cp, val); |
749 | 550k | } |
750 | | |
751 | 36.5k | static int safe_ltf8_put(char *cp, char *cp_end, int64_t val) { |
752 | 36.5k | return ltf8_put(cp, val); |
753 | 36.5k | } |
754 | | |
755 | 129k | static int itf8_size(int64_t v) { |
756 | 129k | return ((!((v)&~0x7f))?1:(!((v)&~0x3fff))?2:(!((v)&~0x1fffff))?3:(!((v)&~0xfffffff))?4:5); |
757 | 129k | } |
758 | | |
759 | | //----------------------------------------------------------------------------- |
760 | | |
761 | | /* |
762 | | * Decodes a 32-bit little endian value from fd and stores in val. |
763 | | * |
764 | | * Returns the number of bytes read on success |
765 | | * -1 on failure |
766 | | */ |
767 | 11.1k | static int int32_decode(cram_fd *fd, int32_t *val) { |
768 | 11.1k | int32_t i; |
769 | 11.1k | if (4 != hread(fd->fp, &i, 4)) |
770 | 36 | return -1; |
771 | | |
772 | 11.1k | *val = le_int4(i); |
773 | 11.1k | return 4; |
774 | 11.1k | } |
775 | | |
776 | | /* |
777 | | * Encodes a 32-bit little endian value 'val' and writes to fd. |
778 | | * |
779 | | * Returns the number of bytes written on success |
780 | | * -1 on failure |
781 | | */ |
782 | 38.5k | static int int32_encode(cram_fd *fd, int32_t val) { |
783 | 38.5k | uint32_t v = le_int4(val); |
784 | 38.5k | if (4 != hwrite(fd->fp, &v, 4)) |
785 | 0 | return -1; |
786 | | |
787 | 38.5k | return 4; |
788 | 38.5k | } |
789 | | |
790 | | /* As int32_decoded/encode, but from/to blocks instead of cram_fd */ |
791 | 331 | int int32_get_blk(cram_block *b, int32_t *val) { |
792 | 331 | if (b->uncomp_size - BLOCK_SIZE(b) < 4) |
793 | 27 | return -1; |
794 | | |
795 | 304 | uint32_t v = |
796 | 304 | ((uint32_t) b->data[b->byte ]) | |
797 | 304 | (((uint32_t) b->data[b->byte+1]) << 8) | |
798 | 304 | (((uint32_t) b->data[b->byte+2]) << 16) | |
799 | 304 | (((uint32_t) b->data[b->byte+3]) << 24); |
800 | | // Avoid implementation-defined behaviour on negative values |
801 | 304 | *val = v < 0x80000000U ? (int32_t) v : -((int32_t) (0xffffffffU - v)) - 1; |
802 | 304 | BLOCK_SIZE(b) += 4; |
803 | 304 | return 4; |
804 | 331 | } |
805 | | |
806 | | /* As int32_decoded/encode, but from/to blocks instead of cram_fd */ |
807 | 5.44k | int int32_put_blk(cram_block *b, int32_t val) { |
808 | 5.44k | unsigned char cp[4]; |
809 | 5.44k | uint32_t v = val; |
810 | 5.44k | cp[0] = ( v & 0xff); |
811 | 5.44k | cp[1] = ((v>>8) & 0xff); |
812 | 5.44k | cp[2] = ((v>>16) & 0xff); |
813 | 5.44k | cp[3] = ((v>>24) & 0xff); |
814 | | |
815 | 5.44k | BLOCK_APPEND(b, cp, 4); |
816 | 5.44k | return 0; |
817 | | |
818 | 0 | block_err: |
819 | 0 | return -1; |
820 | 5.44k | } |
821 | | |
822 | | #ifdef HAVE_LIBDEFLATE |
823 | | /* ---------------------------------------------------------------------- |
824 | | * libdeflate compression code, with interface to match |
825 | | * zlib_mem_{in,de}flate for simplicity elsewhere. |
826 | | */ |
827 | | |
828 | | // Named the same as the version that uses zlib as we always use libdeflate for |
829 | | // decompression when available. |
830 | | char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) { |
831 | | struct libdeflate_decompressor *z = libdeflate_alloc_decompressor(); |
832 | | if (!z) { |
833 | | hts_log_error("Call to libdeflate_alloc_decompressor failed"); |
834 | | return NULL; |
835 | | } |
836 | | |
837 | | uint8_t *data = NULL, *new_data; |
838 | | if (!*size) |
839 | | *size = csize*2; |
840 | | for(;;) { |
841 | | new_data = realloc(data, *size); |
842 | | if (!new_data) { |
843 | | hts_log_error("Memory allocation failure"); |
844 | | goto fail; |
845 | | } |
846 | | data = new_data; |
847 | | |
848 | | int ret = libdeflate_gzip_decompress(z, cdata, csize, data, *size, size); |
849 | | |
850 | | // Auto grow output buffer size if needed and try again. |
851 | | // Fortunately for all bar one call of this we know the size already. |
852 | | if (ret == LIBDEFLATE_INSUFFICIENT_SPACE) { |
853 | | (*size) *= 1.5; |
854 | | continue; |
855 | | } |
856 | | |
857 | | if (ret != LIBDEFLATE_SUCCESS) { |
858 | | hts_log_error("Inflate operation failed: %d", ret); |
859 | | goto fail; |
860 | | } else { |
861 | | break; |
862 | | } |
863 | | } |
864 | | |
865 | | libdeflate_free_decompressor(z); |
866 | | return (char *)data; |
867 | | |
868 | | fail: |
869 | | libdeflate_free_decompressor(z); |
870 | | free(data); |
871 | | return NULL; |
872 | | } |
873 | | |
874 | | // Named differently as we use both zlib/libdeflate for compression. |
875 | | static char *libdeflate_deflate(char *data, size_t size, size_t *cdata_size, |
876 | | int level, int strat) { |
877 | | level = level > 0 ? level : 6; // libdeflate doesn't honour -1 as default |
878 | | level *= 1.23; // NB levels go up to 12 here; 5 onwards is +1 |
879 | | level += level>=8; // 5,6,7->6,7,8 8->10 9->12 |
880 | | if (level > 12) level = 12; |
881 | | |
882 | | if (strat == Z_RLE) // not supported by libdeflate |
883 | | level = 1; |
884 | | |
885 | | struct libdeflate_compressor *z = libdeflate_alloc_compressor(level); |
886 | | if (!z) { |
887 | | hts_log_error("Call to libdeflate_alloc_compressor failed"); |
888 | | return NULL; |
889 | | } |
890 | | |
891 | | unsigned char *cdata = NULL; /* Compressed output */ |
892 | | size_t cdata_alloc; |
893 | | cdata = malloc(cdata_alloc = size*1.05+100); |
894 | | if (!cdata) { |
895 | | hts_log_error("Memory allocation failure"); |
896 | | libdeflate_free_compressor(z); |
897 | | return NULL; |
898 | | } |
899 | | |
900 | | *cdata_size = libdeflate_gzip_compress(z, data, size, cdata, cdata_alloc); |
901 | | libdeflate_free_compressor(z); |
902 | | |
903 | | if (*cdata_size == 0) { |
904 | | hts_log_error("Call to libdeflate_gzip_compress failed"); |
905 | | free(cdata); |
906 | | return NULL; |
907 | | } |
908 | | |
909 | | return (char *)cdata; |
910 | | } |
911 | | |
912 | | #else |
913 | | |
914 | | /* ---------------------------------------------------------------------- |
915 | | * zlib compression code - from Gap5's tg_iface_g.c |
916 | | * They're static here as they're only used within the cram_compress_block |
917 | | * and cram_uncompress_block functions, which are the external interface. |
918 | | */ |
919 | 10 | char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) { |
920 | 10 | z_stream s; |
921 | 10 | unsigned char *data = NULL; /* Uncompressed output */ |
922 | 10 | int data_alloc = 0; |
923 | 10 | int err; |
924 | | |
925 | | /* Starting point at uncompressed size, and scale after that */ |
926 | 10 | data = malloc(data_alloc = csize*1.2+100); |
927 | 10 | if (!data) |
928 | 0 | return NULL; |
929 | | |
930 | | /* Initialise zlib stream */ |
931 | 10 | s.zalloc = Z_NULL; /* use default allocation functions */ |
932 | 10 | s.zfree = Z_NULL; |
933 | 10 | s.opaque = Z_NULL; |
934 | 10 | s.next_in = (unsigned char *)cdata; |
935 | 10 | s.avail_in = csize; |
936 | 10 | s.total_in = 0; |
937 | 10 | s.next_out = data; |
938 | 10 | s.avail_out = data_alloc; |
939 | 10 | s.total_out = 0; |
940 | | |
941 | | //err = inflateInit(&s); |
942 | 10 | err = inflateInit2(&s, 15 + 32); |
943 | 10 | if (err != Z_OK) { |
944 | 0 | hts_log_error("Call to zlib inflateInit failed: %s", s.msg); |
945 | 0 | free(data); |
946 | 0 | return NULL; |
947 | 0 | } |
948 | | |
949 | | /* Decode to 'data' array */ |
950 | 14 | for (;s.avail_in;) { |
951 | 8 | unsigned char *data_tmp; |
952 | 8 | int alloc_inc; |
953 | | |
954 | 8 | s.next_out = &data[s.total_out]; |
955 | 8 | err = inflate(&s, Z_NO_FLUSH); |
956 | 8 | if (err == Z_STREAM_END) |
957 | 0 | break; |
958 | | |
959 | 8 | if (err != Z_OK) { |
960 | 4 | hts_log_error("Call to zlib inflate failed: %s", s.msg); |
961 | 4 | free(data); |
962 | 4 | inflateEnd(&s); |
963 | 4 | return NULL; |
964 | 4 | } |
965 | | |
966 | | /* More to come, so realloc based on growth so far */ |
967 | 4 | alloc_inc = (double)s.avail_in/s.total_in * s.total_out + 100; |
968 | 4 | data = realloc((data_tmp = data), data_alloc += alloc_inc); |
969 | 4 | if (!data) { |
970 | 0 | free(data_tmp); |
971 | 0 | inflateEnd(&s); |
972 | 0 | return NULL; |
973 | 0 | } |
974 | 4 | s.avail_out += alloc_inc; |
975 | 4 | } |
976 | 6 | inflateEnd(&s); |
977 | | |
978 | 6 | *size = s.total_out; |
979 | 6 | return (char *)data; |
980 | 10 | } |
981 | | #endif |
982 | | |
983 | | #if !defined(HAVE_LIBDEFLATE) || LIBDEFLATE_VERSION_MAJOR < 1 || (LIBDEFLATE_VERSION_MAJOR == 1 && LIBDEFLATE_VERSION_MINOR <= 8) |
984 | | static char *zlib_mem_deflate(char *data, size_t size, size_t *cdata_size, |
985 | 33.9k | int level, int strat) { |
986 | 33.9k | z_stream s; |
987 | 33.9k | unsigned char *cdata = NULL; /* Compressed output */ |
988 | 33.9k | int cdata_alloc = 0; |
989 | 33.9k | int cdata_pos = 0; |
990 | 33.9k | int err; |
991 | | |
992 | 33.9k | cdata = malloc(cdata_alloc = size*1.05+100); |
993 | 33.9k | if (!cdata) |
994 | 0 | return NULL; |
995 | 33.9k | cdata_pos = 0; |
996 | | |
997 | | /* Initialise zlib stream */ |
998 | 33.9k | s.zalloc = Z_NULL; /* use default allocation functions */ |
999 | 33.9k | s.zfree = Z_NULL; |
1000 | 33.9k | s.opaque = Z_NULL; |
1001 | 33.9k | s.next_in = (unsigned char *)data; |
1002 | 33.9k | s.avail_in = size; |
1003 | 33.9k | s.total_in = 0; |
1004 | 33.9k | s.next_out = cdata; |
1005 | 33.9k | s.avail_out = cdata_alloc; |
1006 | 33.9k | s.total_out = 0; |
1007 | 33.9k | s.data_type = Z_BINARY; |
1008 | | |
1009 | 33.9k | err = deflateInit2(&s, level, Z_DEFLATED, 15|16, 9, strat); |
1010 | 33.9k | if (err != Z_OK) { |
1011 | 0 | hts_log_error("Call to zlib deflateInit2 failed: %s", s.msg); |
1012 | 0 | return NULL; |
1013 | 0 | } |
1014 | | |
1015 | | /* Encode to 'cdata' array */ |
1016 | 67.8k | for (;s.avail_in;) { |
1017 | 33.9k | s.next_out = &cdata[cdata_pos]; |
1018 | 33.9k | s.avail_out = cdata_alloc - cdata_pos; |
1019 | 33.9k | if (cdata_alloc - cdata_pos <= 0) { |
1020 | 0 | hts_log_error("Deflate produced larger output than expected"); |
1021 | 0 | return NULL; |
1022 | 0 | } |
1023 | 33.9k | err = deflate(&s, Z_NO_FLUSH); |
1024 | 33.9k | cdata_pos = cdata_alloc - s.avail_out; |
1025 | 33.9k | if (err != Z_OK) { |
1026 | 0 | hts_log_error("Call to zlib deflate failed: %s", s.msg); |
1027 | 0 | break; |
1028 | 0 | } |
1029 | 33.9k | } |
1030 | 33.9k | if (deflate(&s, Z_FINISH) != Z_STREAM_END) { |
1031 | 0 | hts_log_error("Call to zlib deflate failed: %s", s.msg); |
1032 | 0 | } |
1033 | 33.9k | *cdata_size = s.total_out; |
1034 | | |
1035 | 33.9k | if (deflateEnd(&s) != Z_OK) { |
1036 | 0 | hts_log_error("Call to zlib deflate failed: %s", s.msg); |
1037 | 0 | } |
1038 | 33.9k | return (char *)cdata; |
1039 | 33.9k | } |
1040 | | #endif |
1041 | | |
1042 | | #ifdef HAVE_LIBLZMA |
1043 | | /* ------------------------------------------------------------------------ */ |
1044 | | /* |
1045 | | * Data compression routines using liblzma (xz) |
1046 | | * |
1047 | | * On a test set this shrunk the main db from 136157104 bytes to 114796168, but |
1048 | | * caused tg_index to grow from 2m43.707s to 15m3.961s. Exporting as bfastq |
1049 | | * went from 18.3s to 36.3s. So decompression suffers too, but not as bad |
1050 | | * as compression times. |
1051 | | * |
1052 | | * For now we disable this functionality. If it's to be reenabled make sure you |
1053 | | * improve the mem_inflate implementation as it's just a test hack at the |
1054 | | * moment. |
1055 | | */ |
1056 | | |
1057 | | static char *lzma_mem_deflate(char *data, size_t size, size_t *cdata_size, |
1058 | 0 | int level) { |
1059 | 0 | char *out; |
1060 | 0 | size_t out_size = lzma_stream_buffer_bound(size); |
1061 | 0 | *cdata_size = 0; |
1062 | |
|
1063 | 0 | out = malloc(out_size); |
1064 | | |
1065 | | /* Single call compression */ |
1066 | 0 | if (LZMA_OK != lzma_easy_buffer_encode(level, LZMA_CHECK_CRC32, NULL, |
1067 | 0 | (uint8_t *)data, size, |
1068 | 0 | (uint8_t *)out, cdata_size, |
1069 | 0 | out_size)) |
1070 | 0 | return NULL; |
1071 | | |
1072 | 0 | return out; |
1073 | 0 | } |
1074 | | |
1075 | 16 | static char *lzma_mem_inflate(char *cdata, size_t csize, size_t *size) { |
1076 | 16 | lzma_stream strm = LZMA_STREAM_INIT; |
1077 | 16 | size_t out_size = 0, out_pos = 0; |
1078 | 16 | char *out = NULL, *new_out; |
1079 | 16 | int r; |
1080 | | |
1081 | | /* Initiate the decoder */ |
1082 | 16 | if (LZMA_OK != lzma_stream_decoder(&strm, lzma_easy_decoder_memusage(9), 0)) |
1083 | 0 | return NULL; |
1084 | | |
1085 | | /* Decode loop */ |
1086 | 16 | strm.avail_in = csize; |
1087 | 16 | strm.next_in = (uint8_t *)cdata; |
1088 | | |
1089 | 24 | for (;strm.avail_in;) { |
1090 | 15 | if (strm.avail_in > out_size - out_pos) { |
1091 | 15 | out_size += strm.avail_in * 4 + 32768; |
1092 | 15 | new_out = realloc(out, out_size); |
1093 | 15 | if (!new_out) |
1094 | 0 | goto fail; |
1095 | 15 | out = new_out; |
1096 | 15 | } |
1097 | 15 | strm.avail_out = out_size - out_pos; |
1098 | 15 | strm.next_out = (uint8_t *)&out[out_pos]; |
1099 | | |
1100 | 15 | r = lzma_code(&strm, LZMA_RUN); |
1101 | 15 | if (LZMA_OK != r && LZMA_STREAM_END != r) { |
1102 | 7 | hts_log_error("LZMA decode failure (error %d)", r); |
1103 | 7 | goto fail; |
1104 | 7 | } |
1105 | | |
1106 | 8 | out_pos = strm.total_out; |
1107 | | |
1108 | 8 | if (r == LZMA_STREAM_END) |
1109 | 0 | break; |
1110 | 8 | } |
1111 | | |
1112 | | /* finish up any unflushed data; necessary? */ |
1113 | 9 | r = lzma_code(&strm, LZMA_FINISH); |
1114 | 9 | if (r != LZMA_OK && r != LZMA_STREAM_END) { |
1115 | 0 | hts_log_error("Call to lzma_code failed with error %d", r); |
1116 | 0 | goto fail; |
1117 | 0 | } |
1118 | | |
1119 | 9 | new_out = realloc(out, strm.total_out > 0 ? strm.total_out : 1); |
1120 | 9 | if (new_out) |
1121 | 9 | out = new_out; |
1122 | 9 | *size = strm.total_out; |
1123 | | |
1124 | 9 | lzma_end(&strm); |
1125 | | |
1126 | 9 | return out; |
1127 | | |
1128 | 7 | fail: |
1129 | 7 | lzma_end(&strm); |
1130 | 7 | free(out); |
1131 | 7 | return NULL; |
1132 | 9 | } |
1133 | | #endif |
1134 | | |
1135 | | /* ---------------------------------------------------------------------- |
1136 | | * CRAM blocks - the dynamically growable data block. We have code to |
1137 | | * create, update, (un)compress and read/write. |
1138 | | * |
1139 | | * These are derived from the deflate_interlaced.c blocks, but with the |
1140 | | * CRAM extension of content types and IDs. |
1141 | | */ |
1142 | | |
1143 | | /* |
1144 | | * Allocates a new cram_block structure with a specified content_type and |
1145 | | * id. |
1146 | | * |
1147 | | * Returns block pointer on success |
1148 | | * NULL on failure |
1149 | | */ |
1150 | | cram_block *cram_new_block(enum cram_content_type content_type, |
1151 | 91.6k | int content_id) { |
1152 | 91.6k | cram_block *b = malloc(sizeof(*b)); |
1153 | 91.6k | if (!b) |
1154 | 0 | return NULL; |
1155 | 91.6k | b->method = b->orig_method = RAW; |
1156 | 91.6k | b->content_type = content_type; |
1157 | 91.6k | b->content_id = content_id; |
1158 | 91.6k | b->comp_size = 0; |
1159 | 91.6k | b->uncomp_size = 0; |
1160 | 91.6k | b->data = NULL; |
1161 | 91.6k | b->alloc = 0; |
1162 | 91.6k | b->byte = 0; |
1163 | 91.6k | b->bit = 7; // MSB |
1164 | 91.6k | b->crc32 = 0; |
1165 | 91.6k | b->idx = 0; |
1166 | 91.6k | b->m = NULL; |
1167 | | |
1168 | 91.6k | return b; |
1169 | 91.6k | } |
1170 | | |
1171 | | /* |
1172 | | * Reads a block from a cram file. |
1173 | | * Returns cram_block pointer on success. |
1174 | | * NULL on failure |
1175 | | */ |
1176 | 7.38k | cram_block *cram_read_block(cram_fd *fd) { |
1177 | 7.38k | cram_block *b = malloc(sizeof(*b)); |
1178 | 7.38k | unsigned char c; |
1179 | 7.38k | uint32_t crc = 0; |
1180 | 7.38k | if (!b) |
1181 | 0 | return NULL; |
1182 | | |
1183 | | //fprintf(stderr, "Block at %d\n", (int)ftell(fd->fp)); |
1184 | | |
1185 | 7.38k | if (-1 == (b->method = hgetc(fd->fp))) { free(b); return NULL; } |
1186 | 7.35k | if (b->method > TOK3) { |
1187 | 221 | hts_log_error("Unknown block compression method %d", (int) b->method); |
1188 | 221 | free(b); return NULL; |
1189 | 221 | } |
1190 | 7.13k | c = b->method; crc = crc32(crc, &c, 1); |
1191 | 7.13k | if (-1 == (b->content_type= hgetc(fd->fp))) { free(b); return NULL; } |
1192 | 7.12k | c = b->content_type; crc = crc32(crc, &c, 1); |
1193 | 7.12k | if (-1 == fd->vv.varint_decode32_crc(fd, &b->content_id, &crc)) { free(b); return NULL; } |
1194 | 7.10k | if (-1 == fd->vv.varint_decode32_crc(fd, &b->comp_size, &crc)) { free(b); return NULL; } |
1195 | 7.10k | if (-1 == fd->vv.varint_decode32_crc(fd, &b->uncomp_size, &crc)) { free(b); return NULL; } |
1196 | | |
1197 | | //fprintf(stderr, " method %d, ctype %d, cid %d, csize %d, ucsize %d\n", |
1198 | | // b->method, b->content_type, b->content_id, b->comp_size, b->uncomp_size); |
1199 | | |
1200 | 7.09k | if (b->method == RAW) { |
1201 | 2.17k | if (b->uncomp_size < 0 || b->comp_size != b->uncomp_size) { |
1202 | 46 | free(b); |
1203 | 46 | return NULL; |
1204 | 46 | } |
1205 | 2.12k | b->alloc = b->uncomp_size; |
1206 | 2.12k | if (!(b->data = malloc(b->uncomp_size))){ free(b); return NULL; } |
1207 | 2.12k | if (b->uncomp_size != hread(fd->fp, b->data, b->uncomp_size)) { |
1208 | 6 | free(b->data); |
1209 | 6 | free(b); |
1210 | 6 | return NULL; |
1211 | 6 | } |
1212 | 4.92k | } else { |
1213 | 4.92k | if (b->comp_size < 0 || b->uncomp_size < 0) { |
1214 | 17 | free(b); |
1215 | 17 | return NULL; |
1216 | 17 | } |
1217 | 4.91k | b->alloc = b->comp_size; |
1218 | 4.91k | if (!(b->data = malloc(b->comp_size))) { free(b); return NULL; } |
1219 | 4.91k | if (b->comp_size != hread(fd->fp, b->data, b->comp_size)) { |
1220 | 32 | free(b->data); |
1221 | 32 | free(b); |
1222 | 32 | return NULL; |
1223 | 32 | } |
1224 | 4.91k | } |
1225 | | |
1226 | 6.99k | if (CRAM_MAJOR_VERS(fd->version) >= 3) { |
1227 | 317 | if (-1 == int32_decode(fd, (int32_t *)&b->crc32)) { |
1228 | 8 | free(b->data); |
1229 | 8 | free(b); |
1230 | 8 | return NULL; |
1231 | 8 | } |
1232 | | |
1233 | 309 | b->crc32_checked = fd->ignore_md5; |
1234 | 309 | b->crc_part = crc; |
1235 | 6.68k | } else { |
1236 | 6.68k | b->crc32_checked = 1; // CRC not present |
1237 | 6.68k | } |
1238 | | |
1239 | 6.98k | b->orig_method = b->method; |
1240 | 6.98k | b->idx = 0; |
1241 | 6.98k | b->byte = 0; |
1242 | 6.98k | b->bit = 7; // MSB |
1243 | | |
1244 | 6.98k | return b; |
1245 | 6.99k | } |
1246 | | |
1247 | | |
1248 | | /* |
1249 | | * Computes the size of a cram block, including the block |
1250 | | * header itself. |
1251 | | */ |
1252 | 0 | uint32_t cram_block_size(cram_block *b) { |
1253 | 0 | unsigned char dat[100], *cp = dat;; |
1254 | 0 | uint32_t sz; |
1255 | |
|
1256 | 0 | *cp++ = b->method; |
1257 | 0 | *cp++ = b->content_type; |
1258 | 0 | cp += itf8_put((char*)cp, b->content_id); |
1259 | 0 | cp += itf8_put((char*)cp, b->comp_size); |
1260 | 0 | cp += itf8_put((char*)cp, b->uncomp_size); |
1261 | |
|
1262 | 0 | sz = cp-dat + 4; |
1263 | 0 | sz += b->method == RAW ? b->uncomp_size : b->comp_size; |
1264 | |
|
1265 | 0 | return sz; |
1266 | 0 | } |
1267 | | |
1268 | | /* |
1269 | | * Writes a CRAM block. |
1270 | | * Returns 0 on success |
1271 | | * -1 on failure |
1272 | | */ |
1273 | 38.5k | int cram_write_block(cram_fd *fd, cram_block *b) { |
1274 | 38.5k | char vardata[100]; |
1275 | 38.5k | int vardata_o = 0; |
1276 | | |
1277 | 38.5k | assert(b->method != RAW || (b->comp_size == b->uncomp_size)); |
1278 | | |
1279 | 38.5k | if (hputc(b->method, fd->fp) == EOF) return -1; |
1280 | 38.5k | if (hputc(b->content_type, fd->fp) == EOF) return -1; |
1281 | 38.5k | vardata_o += fd->vv.varint_put32(vardata , vardata+100, b->content_id); |
1282 | 38.5k | vardata_o += fd->vv.varint_put32(vardata+vardata_o, vardata+100, b->comp_size); |
1283 | 38.5k | vardata_o += fd->vv.varint_put32(vardata+vardata_o, vardata+100, b->uncomp_size); |
1284 | 38.5k | if (vardata_o != hwrite(fd->fp, vardata, vardata_o)) |
1285 | 0 | return -1; |
1286 | | |
1287 | 38.5k | if (b->data) { |
1288 | 35.6k | if (b->method == RAW) { |
1289 | 28.7k | if (b->uncomp_size != hwrite(fd->fp, b->data, b->uncomp_size)) |
1290 | 0 | return -1; |
1291 | 28.7k | } else { |
1292 | 6.92k | if (b->comp_size != hwrite(fd->fp, b->data, b->comp_size)) |
1293 | 0 | return -1; |
1294 | 6.92k | } |
1295 | 35.6k | } else { |
1296 | | // Absent blocks should be size 0 |
1297 | 2.85k | assert(b->method == RAW && b->uncomp_size == 0); |
1298 | 2.85k | } |
1299 | | |
1300 | 38.5k | if (CRAM_MAJOR_VERS(fd->version) >= 3) { |
1301 | 38.5k | char dat[100], *cp = (char *)dat; |
1302 | 38.5k | uint32_t crc; |
1303 | | |
1304 | 38.5k | *cp++ = b->method; |
1305 | 38.5k | *cp++ = b->content_type; |
1306 | 38.5k | cp += fd->vv.varint_put32(cp, dat+100, b->content_id); |
1307 | 38.5k | cp += fd->vv.varint_put32(cp, dat+100, b->comp_size); |
1308 | 38.5k | cp += fd->vv.varint_put32(cp, dat+100, b->uncomp_size); |
1309 | 38.5k | crc = crc32(0L, (uc *)dat, cp-dat); |
1310 | | |
1311 | 38.5k | if (b->method == RAW) { |
1312 | 31.5k | b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->uncomp_size); |
1313 | 31.5k | } else { |
1314 | 6.92k | b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->comp_size); |
1315 | 6.92k | } |
1316 | | |
1317 | 38.5k | if (-1 == int32_encode(fd, b->crc32)) |
1318 | 0 | return -1; |
1319 | 38.5k | } |
1320 | | |
1321 | 38.5k | return 0; |
1322 | 38.5k | } |
1323 | | |
1324 | | /* |
1325 | | * Frees a CRAM block, deallocating internal data too. |
1326 | | */ |
1327 | 104k | void cram_free_block(cram_block *b) { |
1328 | 104k | if (!b) |
1329 | 5.45k | return; |
1330 | 98.6k | if (b->data) |
1331 | 56.8k | free(b->data); |
1332 | 98.6k | free(b); |
1333 | 98.6k | } |
1334 | | |
1335 | | /* |
1336 | | * Uncompresses a CRAM block, if compressed. |
1337 | | */ |
1338 | 5.13k | int cram_uncompress_block(cram_block *b) { |
1339 | 5.13k | char *uncomp; |
1340 | 5.13k | size_t uncomp_size = 0; |
1341 | | |
1342 | 5.13k | #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
1343 | | // Pretend the CRC was OK so the fuzzer doesn't have to get it right |
1344 | 5.13k | b->crc32_checked = 1; |
1345 | 5.13k | #endif |
1346 | | |
1347 | 5.13k | if (b->crc32_checked == 0) { |
1348 | 0 | uint32_t crc = crc32(b->crc_part, b->data ? b->data : (uc *)"", b->alloc); |
1349 | 0 | b->crc32_checked = 1; |
1350 | 0 | if (crc != b->crc32) { |
1351 | 0 | hts_log_error("Block CRC32 failure"); |
1352 | 0 | return -1; |
1353 | 0 | } |
1354 | 0 | } |
1355 | | |
1356 | 5.13k | if (b->uncomp_size == 0) { |
1357 | | // blank block |
1358 | 38 | b->method = RAW; |
1359 | 38 | return 0; |
1360 | 38 | } |
1361 | 5.13k | assert(b->uncomp_size >= 0); // cram_read_block should ensure this |
1362 | | |
1363 | 5.10k | switch (b->method) { |
1364 | 286 | case RAW: |
1365 | 286 | return 0; |
1366 | | |
1367 | 10 | case GZIP: |
1368 | 10 | uncomp_size = b->uncomp_size; |
1369 | 10 | uncomp = zlib_mem_inflate((char *)b->data, b->comp_size, &uncomp_size); |
1370 | | |
1371 | 10 | if (!uncomp) |
1372 | 4 | return -1; |
1373 | 6 | if (uncomp_size != b->uncomp_size) { |
1374 | 6 | free(uncomp); |
1375 | 6 | return -1; |
1376 | 6 | } |
1377 | 0 | free(b->data); |
1378 | 0 | b->data = (unsigned char *)uncomp; |
1379 | 0 | b->alloc = uncomp_size; |
1380 | 0 | b->method = RAW; |
1381 | 0 | break; |
1382 | | |
1383 | 0 | #ifdef HAVE_LIBBZ2 |
1384 | 4 | case BZIP2: { |
1385 | 4 | unsigned int usize = b->uncomp_size; |
1386 | 4 | if (!(uncomp = malloc(usize))) |
1387 | 0 | return -1; |
1388 | 4 | if (BZ_OK != BZ2_bzBuffToBuffDecompress(uncomp, &usize, |
1389 | 4 | (char *)b->data, b->comp_size, |
1390 | 4 | 0, 0)) { |
1391 | 4 | free(uncomp); |
1392 | 4 | return -1; |
1393 | 4 | } |
1394 | 0 | free(b->data); |
1395 | 0 | b->data = (unsigned char *)uncomp; |
1396 | 0 | b->alloc = usize; |
1397 | 0 | b->method = RAW; |
1398 | 0 | b->uncomp_size = usize; // Just in case it differs |
1399 | 0 | break; |
1400 | 4 | } |
1401 | | #else |
1402 | | case BZIP2: |
1403 | | hts_log_error("Bzip2 compression is not compiled into this version. Please rebuild and try again"); |
1404 | | return -1; |
1405 | | #endif |
1406 | | |
1407 | 0 | #ifdef HAVE_LIBLZMA |
1408 | 16 | case LZMA: |
1409 | 16 | uncomp = lzma_mem_inflate((char *)b->data, b->comp_size, &uncomp_size); |
1410 | 16 | if (!uncomp) |
1411 | 7 | return -1; |
1412 | 9 | if (uncomp_size != b->uncomp_size) { |
1413 | 9 | free(uncomp); |
1414 | 9 | return -1; |
1415 | 9 | } |
1416 | 0 | free(b->data); |
1417 | 0 | b->data = (unsigned char *)uncomp; |
1418 | 0 | b->alloc = uncomp_size; |
1419 | 0 | b->method = RAW; |
1420 | 0 | break; |
1421 | | #else |
1422 | | case LZMA: |
1423 | | hts_log_error("Lzma compression is not compiled into this version. Please rebuild and try again"); |
1424 | | return -1; |
1425 | | break; |
1426 | | #endif |
1427 | | |
1428 | 426 | case RANS: { |
1429 | 426 | unsigned int usize = b->uncomp_size, usize2 = 0; |
1430 | 426 | uncomp = (char *)rans_uncompress(b->data, b->comp_size, &usize2); |
1431 | 426 | if (!uncomp) |
1432 | 209 | return -1; |
1433 | 217 | if (usize != usize2) { |
1434 | 201 | free(uncomp); |
1435 | 201 | return -1; |
1436 | 201 | } |
1437 | 16 | free(b->data); |
1438 | 16 | b->data = (unsigned char *)uncomp; |
1439 | 16 | b->alloc = usize2; |
1440 | 16 | b->method = RAW; |
1441 | 16 | b->uncomp_size = usize2; // Just in case it differs |
1442 | | //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size); |
1443 | 16 | break; |
1444 | 217 | } |
1445 | | |
1446 | 737 | case FQZ: { |
1447 | 737 | uncomp_size = b->uncomp_size; |
1448 | 737 | uncomp = fqz_decompress((char *)b->data, b->comp_size, &uncomp_size, NULL, 0); |
1449 | 737 | if (!uncomp) |
1450 | 570 | return -1; |
1451 | 167 | free(b->data); |
1452 | 167 | b->data = (unsigned char *)uncomp; |
1453 | 167 | b->alloc = uncomp_size; |
1454 | 167 | b->method = RAW; |
1455 | 167 | b->uncomp_size = uncomp_size; |
1456 | 167 | break; |
1457 | 737 | } |
1458 | | |
1459 | 967 | case RANS_PR0: { |
1460 | 967 | unsigned int usize = b->uncomp_size, usize2 = 0; |
1461 | 967 | uncomp = (char *)rans_uncompress_4x16(b->data, b->comp_size, &usize2); |
1462 | 967 | if (!uncomp) |
1463 | 689 | return -1; |
1464 | 278 | if (usize != usize2) { |
1465 | 182 | free(uncomp); |
1466 | 182 | return -1; |
1467 | 182 | } |
1468 | 96 | b->orig_method = RANSPR; |
1469 | 96 | free(b->data); |
1470 | 96 | b->data = (unsigned char *)uncomp; |
1471 | 96 | b->alloc = usize2; |
1472 | 96 | b->method = RAW; |
1473 | 96 | b->uncomp_size = usize2; // Just incase it differs |
1474 | | //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size); |
1475 | 96 | break; |
1476 | 278 | } |
1477 | | |
1478 | 553 | case ARITH_PR0: { |
1479 | 553 | unsigned int usize = b->uncomp_size, usize2 = 0; |
1480 | 553 | uncomp = (char *)arith_uncompress_to(b->data, b->comp_size, NULL, &usize2); |
1481 | 553 | if (!uncomp) |
1482 | 381 | return -1; |
1483 | 172 | if (usize != usize2) { |
1484 | 55 | free(uncomp); |
1485 | 55 | return -1; |
1486 | 55 | } |
1487 | 117 | b->orig_method = ARITH; |
1488 | 117 | free(b->data); |
1489 | 117 | b->data = (unsigned char *)uncomp; |
1490 | 117 | b->alloc = usize2; |
1491 | 117 | b->method = RAW; |
1492 | 117 | b->uncomp_size = usize2; // Just incase it differs |
1493 | | //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size); |
1494 | 117 | break; |
1495 | 172 | } |
1496 | | |
1497 | 2.10k | case TOK3: { |
1498 | 2.10k | uint32_t out_len; |
1499 | 2.10k | uint8_t *cp = tok3_decode_names(b->data, b->comp_size, &out_len); |
1500 | 2.10k | if (!cp) |
1501 | 1.98k | return -1; |
1502 | 115 | b->orig_method = TOK3; |
1503 | 115 | b->method = RAW; |
1504 | 115 | free(b->data); |
1505 | 115 | b->data = cp; |
1506 | 115 | b->alloc = out_len; |
1507 | 115 | b->uncomp_size = out_len; |
1508 | 115 | break; |
1509 | 2.10k | } |
1510 | | |
1511 | 0 | default: |
1512 | 0 | return -1; |
1513 | 5.10k | } |
1514 | | |
1515 | 511 | return 0; |
1516 | 5.10k | } |
1517 | | |
1518 | | static char *cram_compress_by_method(cram_slice *s, char *in, size_t in_size, |
1519 | | int content_id, size_t *out_size, |
1520 | | enum cram_block_method_int method, |
1521 | 78.4k | int level, int strat) { |
1522 | 78.4k | switch (method) { |
1523 | 16.5k | case GZIP: |
1524 | 22.9k | case GZIP_RLE: |
1525 | 33.9k | case GZIP_1: |
1526 | | // Read names bizarrely benefit from zlib over libdeflate for |
1527 | | // mid-range compression levels. Focusing purely of ratio or |
1528 | | // speed, libdeflate still wins. It also seems to win for |
1529 | | // other data series too. |
1530 | | // |
1531 | | // Eg RN at level 5; libdeflate=55.9MB zlib=51.6MB |
1532 | | #ifdef HAVE_LIBDEFLATE |
1533 | | # if (LIBDEFLATE_VERSION_MAJOR < 1 || (LIBDEFLATE_VERSION_MAJOR == 1 && LIBDEFLATE_VERSION_MINOR <= 8)) |
1534 | | if (content_id == DS_RN && level >= 4 && level <= 7) |
1535 | | return zlib_mem_deflate(in, in_size, out_size, level, strat); |
1536 | | else |
1537 | | # endif |
1538 | | return libdeflate_deflate(in, in_size, out_size, level, strat); |
1539 | | #else |
1540 | 33.9k | return zlib_mem_deflate(in, in_size, out_size, level, strat); |
1541 | 0 | #endif |
1542 | | |
1543 | 0 | case BZIP2: { |
1544 | 0 | #ifdef HAVE_LIBBZ2 |
1545 | 0 | unsigned int comp_size = in_size*1.01 + 600; |
1546 | 0 | char *comp = malloc(comp_size); |
1547 | 0 | if (!comp) |
1548 | 0 | return NULL; |
1549 | | |
1550 | 0 | if (BZ_OK != BZ2_bzBuffToBuffCompress(comp, &comp_size, |
1551 | 0 | in, in_size, |
1552 | 0 | level, 0, 30)) { |
1553 | 0 | free(comp); |
1554 | 0 | return NULL; |
1555 | 0 | } |
1556 | 0 | *out_size = comp_size; |
1557 | 0 | return comp; |
1558 | | #else |
1559 | | return NULL; |
1560 | | #endif |
1561 | 0 | } |
1562 | | |
1563 | 0 | case FQZ: |
1564 | 0 | case FQZ_b: |
1565 | 0 | case FQZ_c: |
1566 | 0 | case FQZ_d: { |
1567 | | // Extract the necessary portion of the slice into an fqz_slice struct. |
1568 | | // These previously were the same thing, but this permits us to detach |
1569 | | // the codec from the rest of this CRAM implementation. |
1570 | 0 | fqz_slice *f = malloc(2*s->hdr->num_records * sizeof(uint32_t) + sizeof(fqz_slice)); |
1571 | 0 | if (!f) |
1572 | 0 | return NULL; |
1573 | 0 | f->num_records = s->hdr->num_records; |
1574 | 0 | f->len = (uint32_t *)(((char *)f) + sizeof(fqz_slice)); |
1575 | 0 | f->flags = f->len + s->hdr->num_records; |
1576 | 0 | int i; |
1577 | 0 | for (i = 0; i < s->hdr->num_records; i++) { |
1578 | 0 | f->flags[i] = s->crecs[i].flags; |
1579 | 0 | f->len[i] = (i+1 < s->hdr->num_records |
1580 | 0 | ? s->crecs[i+1].qual - s->crecs[i].qual |
1581 | 0 | : s->block[DS_QS]->uncomp_size - s->crecs[i].qual); |
1582 | 0 | } |
1583 | 0 | char *comp = fqz_compress(strat & 0xff /* cram vers */, f, |
1584 | 0 | in, in_size, out_size, strat >> 8, NULL); |
1585 | 0 | free(f); |
1586 | 0 | return comp; |
1587 | 0 | } |
1588 | | |
1589 | 0 | case LZMA: |
1590 | 0 | #ifdef HAVE_LIBLZMA |
1591 | 0 | return lzma_mem_deflate(in, in_size, out_size, level); |
1592 | | #else |
1593 | | return NULL; |
1594 | | #endif |
1595 | | |
1596 | 0 | case RANS0: |
1597 | 0 | case RANS1: { |
1598 | 0 | unsigned int out_size_i; |
1599 | 0 | unsigned char *cp; |
1600 | 0 | cp = rans_compress((unsigned char *)in, in_size, &out_size_i, |
1601 | 0 | method == RANS0 ? 0 : 1); |
1602 | 0 | *out_size = out_size_i; |
1603 | 0 | return (char *)cp; |
1604 | 0 | } |
1605 | | |
1606 | 6.55k | case RANS_PR0: |
1607 | 13.2k | case RANS_PR1: |
1608 | 20.1k | case RANS_PR64: |
1609 | 26.5k | case RANS_PR9: |
1610 | 33.4k | case RANS_PR128: |
1611 | 33.4k | case RANS_PR129: |
1612 | 33.4k | case RANS_PR192: |
1613 | 39.8k | case RANS_PR193: { |
1614 | 39.8k | unsigned int out_size_i; |
1615 | 39.8k | unsigned char *cp; |
1616 | | |
1617 | | // see enum cram_block. We map RANS_* methods to order bit-fields |
1618 | 39.8k | static int methmap[] = { 1, 64,9, 128,129, 192,193 }; |
1619 | | |
1620 | 39.8k | int m = method == RANS_PR0 ? 0 : methmap[method - RANS_PR1]; |
1621 | 39.8k | cp = rans_compress_4x16((unsigned char *)in, in_size, &out_size_i, |
1622 | 39.8k | m | RANS_ORDER_SIMD_AUTO); |
1623 | 39.8k | *out_size = out_size_i; |
1624 | 39.8k | return (char *)cp; |
1625 | 33.4k | } |
1626 | | |
1627 | 0 | case ARITH_PR0: |
1628 | 0 | case ARITH_PR1: |
1629 | 0 | case ARITH_PR64: |
1630 | 0 | case ARITH_PR9: |
1631 | 0 | case ARITH_PR128: |
1632 | 0 | case ARITH_PR129: |
1633 | 0 | case ARITH_PR192: |
1634 | 0 | case ARITH_PR193: { |
1635 | 0 | unsigned int out_size_i; |
1636 | 0 | unsigned char *cp; |
1637 | | |
1638 | | // see enum cram_block. We map ARITH_* methods to order bit-fields |
1639 | 0 | static int methmap[] = { 1, 64,9, 128,129, 192,193 }; |
1640 | |
|
1641 | 0 | cp = arith_compress_to((unsigned char *)in, in_size, NULL, &out_size_i, |
1642 | 0 | method == ARITH_PR0 ? 0 : methmap[method - ARITH_PR1]); |
1643 | 0 | *out_size = out_size_i; |
1644 | 0 | return (char *)cp; |
1645 | 0 | } |
1646 | | |
1647 | 4.61k | case TOK3: |
1648 | 4.61k | case TOKA: { |
1649 | 4.61k | int out_len; |
1650 | 4.61k | int lev = level; |
1651 | 4.61k | if (method == TOK3 && lev > 3) |
1652 | 4.61k | lev = 3; |
1653 | 4.61k | uint8_t *cp = tok3_encode_names(in, in_size, lev, strat, &out_len, NULL); |
1654 | 4.61k | *out_size = out_len; |
1655 | 4.61k | return (char *)cp; |
1656 | 4.61k | } |
1657 | | |
1658 | 0 | case RAW: |
1659 | 0 | break; |
1660 | | |
1661 | 0 | default: |
1662 | 0 | return NULL; |
1663 | 78.4k | } |
1664 | | |
1665 | 0 | return NULL; |
1666 | 78.4k | } |
1667 | | |
1668 | | /* |
1669 | | * A copy of cram_compress_block2 with added recursion detection. |
1670 | | * This is only called for error handling where the auto-tuning has failed. |
1671 | | * The simplest way of doing this is recusion + an additional argument, but |
1672 | | * we didn't want to complicate the existing code hence this is static. |
1673 | | */ |
1674 | | static int cram_compress_block3(cram_fd *fd, cram_slice *s, |
1675 | | cram_block *b, cram_metrics *metrics, |
1676 | | int method, int level, |
1677 | 39.5k | int recurse) { |
1678 | | |
1679 | 39.5k | if (!b) |
1680 | 1.55k | return 0; |
1681 | | |
1682 | 37.9k | int orig_method = method; |
1683 | 37.9k | char *comp = NULL; |
1684 | 37.9k | size_t comp_size = 0; |
1685 | 37.9k | int strat; |
1686 | | |
1687 | | // Internally we have parameterised methods that externally map |
1688 | | // to the same CRAM method value. |
1689 | | // See enum_cram_block_method_int in cram_structs.h. |
1690 | 37.9k | int methmap[] = { |
1691 | | // Externally defined values |
1692 | 37.9k | RAW, GZIP, BZIP2, LZMA, RANS, RANSPR, ARITH, FQZ, TOK3, |
1693 | | |
1694 | | // Reserved for possible expansion |
1695 | 37.9k | 0, 0, |
1696 | | |
1697 | | // Internally parameterised versions matching back to above |
1698 | | // external values |
1699 | 37.9k | GZIP, GZIP, |
1700 | 37.9k | FQZ, FQZ, FQZ, |
1701 | 37.9k | RANS, |
1702 | 37.9k | RANSPR, RANSPR, RANSPR, RANSPR, RANSPR, RANSPR, RANSPR, |
1703 | 37.9k | TOK3, |
1704 | 37.9k | ARITH, ARITH, ARITH, ARITH, ARITH, ARITH, ARITH, |
1705 | 37.9k | }; |
1706 | | |
1707 | 37.9k | if (b->method != RAW) { |
1708 | | // Maybe already compressed if s->block[0] was compressed and |
1709 | | // we have e.g. s->block[DS_BA] set to s->block[0] due to only |
1710 | | // one base type present and hence using E_HUFFMAN on block 0. |
1711 | | // A second explicit attempt to compress the same block then |
1712 | | // occurs. |
1713 | 0 | return 0; |
1714 | 0 | } |
1715 | | |
1716 | 37.9k | if (method == -1) { |
1717 | 5.44k | method = 1<<GZIP; |
1718 | 5.44k | if (fd->use_bz2) |
1719 | 0 | method |= 1<<BZIP2; |
1720 | 5.44k | if (fd->use_lzma) |
1721 | 0 | method |= 1<<LZMA; |
1722 | 5.44k | } |
1723 | | |
1724 | 37.9k | if (level == -1) |
1725 | 5.44k | level = fd->level; |
1726 | | |
1727 | | //fprintf(stderr, "IN: block %d, sz %d\n", b->content_id, b->uncomp_size); |
1728 | | |
1729 | 37.9k | if (method == RAW || level == 0 || b->uncomp_size == 0) { |
1730 | 19.7k | b->method = RAW; |
1731 | 19.7k | b->comp_size = b->uncomp_size; |
1732 | | //fprintf(stderr, "Skip block id %d\n", b->content_id); |
1733 | 19.7k | return 0; |
1734 | 19.7k | } |
1735 | | |
1736 | 18.1k | #ifndef ABS |
1737 | 18.1k | # define ABS(a) ((a)>=0?(a):-(a)) |
1738 | 18.1k | #endif |
1739 | | |
1740 | 18.1k | if (metrics) { |
1741 | 12.6k | pthread_mutex_lock(&fd->metrics_lock); |
1742 | | // Sudden changes in size trigger a retrial. These are mainly |
1743 | | // triggered when switching to sorted / unsorted, where the number |
1744 | | // of elements in a slice radically changes. |
1745 | | // |
1746 | | // We also get large fluctuations based on genome coordinate for |
1747 | | // e.g. SA:Z and SC series, but we consider the typical scale of |
1748 | | // delta between blocks and use this to look for abnormality. |
1749 | | |
1750 | | // Equivalent to (but minus possible integer overflow) |
1751 | | // (b->uncomp_size + 1000)/4 > metrics->input_avg_sz+1000 || |
1752 | | // b->uncomp_size + 1000 < (metrics->input_avg_sz+1000)/4) |
1753 | 12.6k | if (metrics->input_avg_sz && |
1754 | 3.28k | (b->uncomp_size/4 - 750 > metrics->input_avg_sz || |
1755 | 2.97k | b->uncomp_size < metrics->input_avg_sz/4 - 750) && |
1756 | 378 | ABS(b->uncomp_size-metrics->input_avg_sz)/10 |
1757 | 378 | > metrics->input_avg_delta) { |
1758 | 19 | metrics->next_trial = 0; |
1759 | 19 | } |
1760 | | |
1761 | 12.6k | if (metrics->trial > 0 || --metrics->next_trial <= 0) { |
1762 | 10.8k | int m, unpackable = metrics->unpackable; |
1763 | 10.8k | size_t sz_best = b->uncomp_size; |
1764 | 10.8k | size_t sz[CRAM_MAX_METHOD] = {0}; |
1765 | 10.8k | int method_best = 0; // RAW |
1766 | 10.8k | char *c_best = NULL, *c = NULL; |
1767 | | |
1768 | 10.8k | metrics->input_avg_delta = |
1769 | 10.8k | 0.9 * (metrics->input_avg_delta + |
1770 | 10.8k | ABS(b->uncomp_size - metrics->input_avg_sz)); |
1771 | | |
1772 | 10.8k | metrics->input_avg_sz += b->uncomp_size*.2; |
1773 | 10.8k | metrics->input_avg_sz *= 0.8; |
1774 | | |
1775 | 10.8k | if (metrics->revised_method) |
1776 | 3.04k | method = metrics->revised_method; |
1777 | 7.81k | else |
1778 | 7.81k | metrics->revised_method = method; |
1779 | | |
1780 | 10.8k | if (metrics->next_trial <= 0) { |
1781 | 23 | metrics->next_trial = TRIAL_SPAN; |
1782 | 23 | metrics->trial = NTRIALS; |
1783 | 759 | for (m = 0; m < CRAM_MAX_METHOD; m++) |
1784 | 736 | metrics->sz[m] /= 2; |
1785 | 23 | metrics->unpackable = 0; |
1786 | 23 | } |
1787 | | |
1788 | | // Compress this block using the best method |
1789 | 10.8k | if (unpackable && CRAM_MAJOR_VERS(fd->version) > 3) { |
1790 | | // No point trying bit-pack if 17+ symbols. |
1791 | 0 | if (method & (1<<RANS_PR128)) |
1792 | 0 | method = (method|(1<<RANS_PR0))&~(1<<RANS_PR128); |
1793 | 0 | if (method & (1<<RANS_PR129)) |
1794 | 0 | method = (method|(1<<RANS_PR1))&~(1<<RANS_PR129); |
1795 | 0 | if (method & (1<<RANS_PR192)) |
1796 | 0 | method = (method|(1<<RANS_PR64))&~(1<<RANS_PR192); |
1797 | 0 | if (method & (1<<RANS_PR193)) |
1798 | 0 | method = (method|(1<<RANS_PR64)|(1<<RANS_PR1))&~(1<<RANS_PR193); |
1799 | |
|
1800 | 0 | if (method & (1<<ARITH_PR128)) |
1801 | 0 | method = (method|(1<<ARITH_PR0))&~(1<<ARITH_PR128); |
1802 | 0 | if (method & (1<<ARITH_PR129)) |
1803 | 0 | method = (method|(1<<ARITH_PR1))&~(1<<ARITH_PR129); |
1804 | 0 | if (method & (1<<ARITH_PR192)) |
1805 | 0 | method = (method|(1<<ARITH_PR64))&~(1<<ARITH_PR192); |
1806 | 0 | if (method & (1u<<ARITH_PR193)) |
1807 | 0 | method = (method|(1<<ARITH_PR64)|(1<<ARITH_PR1))&~(1u<<ARITH_PR193); |
1808 | 0 | } |
1809 | | |
1810 | | // Libdeflate doesn't have a Z_RLE strategy. |
1811 | | // We treat it as level 1, but iff we haven't also |
1812 | | // explicitly listed that in the method list. |
1813 | | #ifdef HAVE_LIBDEFLATE |
1814 | | if ((method & (1<<GZIP_RLE)) && (method & (1<<GZIP_1))) |
1815 | | method &= ~(1<<GZIP_RLE); |
1816 | | #endif |
1817 | | |
1818 | 10.8k | pthread_mutex_unlock(&fd->metrics_lock); |
1819 | | |
1820 | 358k | for (m = 0; m < CRAM_MAX_METHOD; m++) { |
1821 | 347k | if (method & (1u<<m)) { |
1822 | 71.0k | int lvl = level; |
1823 | 71.0k | switch (m) { |
1824 | 10.8k | case GZIP: strat = Z_FILTERED; break; |
1825 | 10.8k | case GZIP_1: strat = Z_DEFAULT_STRATEGY; lvl = 1; break; |
1826 | 6.41k | case GZIP_RLE: strat = Z_RLE; break; |
1827 | 0 | case FQZ: strat = CRAM_MAJOR_VERS(fd->version); break; |
1828 | 0 | case FQZ_b: strat = CRAM_MAJOR_VERS(fd->version)+256; break; |
1829 | 0 | case FQZ_c: strat = CRAM_MAJOR_VERS(fd->version)+2*256; break; |
1830 | 0 | case FQZ_d: strat = CRAM_MAJOR_VERS(fd->version)+3*256; break; |
1831 | 4.44k | case TOK3: strat = 0; break; |
1832 | 0 | case TOKA: strat = 1; break; |
1833 | 38.5k | default: strat = 0; |
1834 | 71.0k | } |
1835 | | |
1836 | 71.0k | c = cram_compress_by_method(s, (char *)b->data, b->uncomp_size, |
1837 | 71.0k | b->content_id, &sz[m], m, lvl, strat); |
1838 | | |
1839 | 71.0k | if (c && sz_best > sz[m]) { |
1840 | 10.5k | sz_best = sz[m]; |
1841 | 10.5k | method_best = m; |
1842 | 10.5k | if (c_best) |
1843 | 5.32k | free(c_best); |
1844 | 10.5k | c_best = c; |
1845 | 60.5k | } else if (c) { |
1846 | 59.2k | free(c); |
1847 | 59.2k | } else { |
1848 | 1.27k | sz[m] = UINT_MAX; // arbitrarily worse than raw |
1849 | 1.27k | } |
1850 | 276k | } else { |
1851 | 276k | sz[m] = UINT_MAX; // arbitrarily worse than raw |
1852 | 276k | } |
1853 | 347k | } |
1854 | | |
1855 | 10.8k | if (c_best) { |
1856 | 5.25k | free(b->data); |
1857 | 5.25k | b->data = (unsigned char *)c_best; |
1858 | 5.25k | b->method = method_best; // adjusted to methmap[method_best] later |
1859 | 5.25k | b->comp_size = sz_best; |
1860 | 5.25k | } |
1861 | | |
1862 | | // Accumulate stats for all methods tried |
1863 | 10.8k | pthread_mutex_lock(&fd->metrics_lock); |
1864 | 358k | for (m = 0; m < CRAM_MAX_METHOD; m++) |
1865 | | // don't be overly sure on small blocks. |
1866 | | // +2000 means eg bzip2 vs gzip (1.07 to 1.04) or gz vs rans1 |
1867 | | // needs to be at least 60 bytes smaller to overcome the |
1868 | | // fixed size addition. |
1869 | 347k | metrics->sz[m] += sz[m]+2000; |
1870 | | |
1871 | | // When enough trials performed, find the best on average |
1872 | 10.8k | if (--metrics->trial == 0) { |
1873 | 2.68k | int best_method = RAW; |
1874 | 2.68k | int best_sz = INT_MAX; |
1875 | | |
1876 | | // Relative costs of methods. See enum_cram_block_method_int |
1877 | | // and methmap |
1878 | 2.68k | double meth_cost[32] = { |
1879 | | // Externally defined methods |
1880 | 2.68k | 1, // 0 raw |
1881 | 2.68k | 1.04, // 1 gzip (Z_FILTERED) |
1882 | 2.68k | 1.07, // 2 bzip2 |
1883 | 2.68k | 1.08, // 3 lzma |
1884 | 2.68k | 1.00, // 4 rans (O0) |
1885 | 2.68k | 1.00, // 5 ranspr (O0) |
1886 | 2.68k | 1.04, // 6 arithpr (O0) |
1887 | 2.68k | 1.05, // 7 fqz |
1888 | 2.68k | 1.05, // 8 tok3 (rans) |
1889 | 2.68k | 1.00, 1.00, // 9,10 reserved |
1890 | | |
1891 | | // Paramterised versions of above |
1892 | 2.68k | 1.01, // gzip rle |
1893 | 2.68k | 1.01, // gzip -1 |
1894 | | |
1895 | 2.68k | 1.05, 1.05, 1.05, // FQZ_b,c,d |
1896 | | |
1897 | 2.68k | 1.01, // rans O1 |
1898 | | |
1899 | 2.68k | 1.01, // rans_pr1 |
1900 | 2.68k | 1.00, // rans_pr64; if smaller, usually fast |
1901 | 2.68k | 1.03, // rans_pr65/9 |
1902 | 2.68k | 1.00, // rans_pr128 |
1903 | 2.68k | 1.01, // rans_pr129 |
1904 | 2.68k | 1.00, // rans_pr192 |
1905 | 2.68k | 1.01, // rans_pr193 |
1906 | | |
1907 | 2.68k | 1.07, // tok3 arith |
1908 | | |
1909 | 2.68k | 1.04, // arith_pr1 |
1910 | 2.68k | 1.04, // arith_pr64 |
1911 | 2.68k | 1.04, // arith_pr9 |
1912 | 2.68k | 1.03, // arith_pr128 |
1913 | 2.68k | 1.04, // arith_pr129 |
1914 | 2.68k | 1.04, // arith_pr192 |
1915 | 2.68k | 1.04, // arith_pr193 |
1916 | 2.68k | }; |
1917 | | |
1918 | | // Scale methods by cost based on compression level |
1919 | 2.68k | if (fd->level <= 1) { |
1920 | 0 | for (m = 0; m < CRAM_MAX_METHOD; m++) |
1921 | 0 | metrics->sz[m] *= 1+(meth_cost[m]-1)*4; |
1922 | 2.68k | } else if (fd->level <= 3) { |
1923 | 0 | for (m = 0; m < CRAM_MAX_METHOD; m++) |
1924 | 0 | metrics->sz[m] *= 1+(meth_cost[m]-1); |
1925 | 2.68k | } else if (fd->level <= 6) { |
1926 | 88.4k | for (m = 0; m < CRAM_MAX_METHOD; m++) |
1927 | 85.7k | metrics->sz[m] *= 1+(meth_cost[m]-1)/2; |
1928 | 2.68k | } else if (fd->level <= 7) { |
1929 | 0 | for (m = 0; m < CRAM_MAX_METHOD; m++) |
1930 | 0 | metrics->sz[m] *= 1+(meth_cost[m]-1)/3; |
1931 | 0 | } // else cost is ignored |
1932 | | |
1933 | | // Ensure these are never used; BSC and ZSTD |
1934 | 2.68k | metrics->sz[9] = metrics->sz[10] = INT_MAX; |
1935 | | |
1936 | 88.4k | for (m = 0; m < CRAM_MAX_METHOD; m++) { |
1937 | 85.7k | if ((!metrics->sz[m]) || (!(method & (1u<<m)))) |
1938 | 72.3k | continue; |
1939 | | |
1940 | 13.4k | if (best_sz > metrics->sz[m]) |
1941 | 5.99k | best_sz = metrics->sz[m], best_method = m; |
1942 | 13.4k | } |
1943 | | |
1944 | 2.68k | if (best_method != metrics->method) { |
1945 | | //metrics->trial = (NTRIALS+1)/2; // be sure |
1946 | | //metrics->next_trial /= 1.5; |
1947 | 2.54k | metrics->consistency = 0; |
1948 | 2.54k | } else { |
1949 | 131 | metrics->next_trial *= MIN(2, 1+metrics->consistency/4.0); |
1950 | 131 | metrics->consistency++; |
1951 | 131 | } |
1952 | | |
1953 | 2.68k | metrics->method = best_method; |
1954 | 2.68k | switch (best_method) { |
1955 | 116 | case GZIP: strat = Z_FILTERED; break; |
1956 | 1.53k | case GZIP_1: strat = Z_DEFAULT_STRATEGY; break; |
1957 | 0 | case GZIP_RLE: strat = Z_RLE; break; |
1958 | 0 | case FQZ: strat = CRAM_MAJOR_VERS(fd->version); break; |
1959 | 0 | case FQZ_b: strat = CRAM_MAJOR_VERS(fd->version)+256; break; |
1960 | 0 | case FQZ_c: strat = CRAM_MAJOR_VERS(fd->version)+2*256; break; |
1961 | 0 | case FQZ_d: strat = CRAM_MAJOR_VERS(fd->version)+3*256; break; |
1962 | 255 | case TOK3: strat = 0; break; |
1963 | 0 | case TOKA: strat = 1; break; |
1964 | 776 | default: strat = 0; |
1965 | 2.68k | } |
1966 | 2.68k | metrics->strat = strat; |
1967 | | |
1968 | | // If we see at least MAXFAIL trials in a row for a specific |
1969 | | // compression method with more than MAXDELTA aggregate |
1970 | | // size then we drop this from the list of methods used |
1971 | | // for this block type. |
1972 | 2.68k | #define MAXDELTA 0.20 |
1973 | 45.9k | #define MAXFAILS 4 |
1974 | 88.4k | for (m = 0; m < CRAM_MAX_METHOD; m++) { |
1975 | 85.7k | if (best_method == m) { |
1976 | 2.68k | metrics->cnt[m] = 0; |
1977 | 2.68k | metrics->extra[m] = 0; |
1978 | 83.0k | } else if (best_sz < metrics->sz[m]) { |
1979 | 45.9k | double r = (double)metrics->sz[m] / best_sz - 1; |
1980 | 45.9k | int mul = 1+(fd->level>=7); |
1981 | 45.9k | if (++metrics->cnt[m] >= MAXFAILS*mul && |
1982 | 757 | (metrics->extra[m] += r) >= MAXDELTA*mul) |
1983 | 247 | method &= ~(1u<<m); |
1984 | | |
1985 | | // Special case for fqzcomp as it rarely changes |
1986 | 45.9k | if (m == FQZ || m == FQZ_b || m == FQZ_c || m == FQZ_d) { |
1987 | 8.14k | if (metrics->sz[m] > best_sz) |
1988 | 8.14k | method &= ~(1u<<m); |
1989 | 8.14k | } |
1990 | 45.9k | } |
1991 | 85.7k | } |
1992 | | |
1993 | | //if (fd->verbose > 1 && method != metrics->revised_method) |
1994 | | // fprintf(stderr, "%d: revising method from %x to %x\n", |
1995 | | // b->content_id, metrics->revised_method, method); |
1996 | 2.68k | metrics->revised_method = method; |
1997 | 2.68k | } |
1998 | 10.8k | pthread_mutex_unlock(&fd->metrics_lock); |
1999 | 10.8k | } else { |
2000 | 1.76k | metrics->input_avg_delta = |
2001 | 1.76k | 0.9 * (metrics->input_avg_delta + |
2002 | 1.76k | ABS(b->uncomp_size - metrics->input_avg_sz)); |
2003 | | |
2004 | 1.76k | metrics->input_avg_sz += b->uncomp_size*.2; |
2005 | 1.76k | metrics->input_avg_sz *= 0.8; |
2006 | | |
2007 | 1.76k | strat = metrics->strat; |
2008 | 1.76k | method = metrics->method; |
2009 | | |
2010 | 1.76k | pthread_mutex_unlock(&fd->metrics_lock); |
2011 | 1.76k | comp = cram_compress_by_method(s, (char *)b->data, b->uncomp_size, |
2012 | 1.76k | b->content_id, &comp_size, method, |
2013 | 1.76k | method == GZIP_1 ? 1 : level, |
2014 | 1.76k | strat); |
2015 | 1.76k | if (!comp) { |
2016 | | // Our cached best method failed, but maybe another works? |
2017 | | // Rerun with trial mode engaged again. |
2018 | 168 | if (!recurse) { |
2019 | 168 | hts_log_warning("Compressed block ID %d method %s failed, " |
2020 | 168 | "redoing trial", b->content_id, |
2021 | 168 | cram_block_method2str(method)); |
2022 | 168 | pthread_mutex_lock(&fd->metrics_lock); |
2023 | 168 | metrics->trial = NTRIALS; |
2024 | 168 | metrics->next_trial = TRIAL_SPAN; |
2025 | 168 | metrics->revised_method = orig_method; |
2026 | 168 | pthread_mutex_unlock(&fd->metrics_lock); |
2027 | 168 | return cram_compress_block3(fd, s, b, metrics, method, |
2028 | 168 | level, 1); |
2029 | 168 | } |
2030 | 0 | return -1; |
2031 | 168 | } |
2032 | | |
2033 | 1.59k | if (comp_size < b->uncomp_size) { |
2034 | 1.48k | free(b->data); |
2035 | 1.48k | b->data = (unsigned char *)comp; |
2036 | 1.48k | b->comp_size = comp_size; |
2037 | 1.48k | b->method = method; |
2038 | 1.48k | } else { |
2039 | 111 | free(comp); |
2040 | 111 | } |
2041 | 1.59k | } |
2042 | | |
2043 | 12.6k | } else { |
2044 | | // no cached metrics, so just do zlib? |
2045 | 5.56k | comp = cram_compress_by_method(s, (char *)b->data, b->uncomp_size, |
2046 | 5.56k | b->content_id, &comp_size, GZIP, level, Z_FILTERED); |
2047 | 5.56k | if (!comp) { |
2048 | 0 | hts_log_error("Compression failed!"); |
2049 | 0 | return -1; |
2050 | 0 | } |
2051 | | |
2052 | 5.56k | if (comp_size < b->uncomp_size) { |
2053 | 194 | free(b->data); |
2054 | 194 | b->data = (unsigned char *)comp; |
2055 | 194 | b->comp_size = comp_size; |
2056 | 194 | b->method = GZIP; |
2057 | 5.36k | } else { |
2058 | 5.36k | free(comp); |
2059 | 5.36k | } |
2060 | 5.56k | strat = Z_FILTERED; |
2061 | 5.56k | } |
2062 | | |
2063 | 18.0k | hts_log_info("Compressed block ID %d from %d to %d by method %s", |
2064 | 18.0k | b->content_id, b->uncomp_size, b->comp_size, |
2065 | 18.0k | cram_block_method2str(b->method)); |
2066 | | |
2067 | 18.0k | b->method = methmap[b->method]; |
2068 | | |
2069 | 18.0k | return 0; |
2070 | 18.1k | } |
2071 | | |
2072 | | /* |
2073 | | * Compresses a block using a selection of compression codecs and options. |
2074 | | * The best is learnt and used for subsequent slices, periodically resampling. |
2075 | | * |
2076 | | * Method and level -1 implies defaults, as specified in cram_fd. |
2077 | | */ |
2078 | | int cram_compress_block2(cram_fd *fd, cram_slice *s, |
2079 | | cram_block *b, cram_metrics *metrics, |
2080 | 39.3k | int method, int level) { |
2081 | 39.3k | return cram_compress_block3(fd, s, b, metrics, method, level, 0); |
2082 | 39.3k | } |
2083 | | |
2084 | | int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics, |
2085 | 5.44k | int method, int level) { |
2086 | 5.44k | return cram_compress_block2(fd, NULL, b, metrics, method, level); |
2087 | 5.44k | } |
2088 | | |
2089 | 755k | cram_metrics *cram_new_metrics(void) { |
2090 | 755k | cram_metrics *m = calloc(1, sizeof(*m)); |
2091 | 755k | if (!m) |
2092 | 0 | return NULL; |
2093 | 755k | m->trial = NTRIALS-1; |
2094 | 755k | m->next_trial = TRIAL_SPAN/2; // learn quicker at start |
2095 | 755k | m->method = RAW; |
2096 | 755k | m->strat = 0; |
2097 | 755k | m->revised_method = 0; |
2098 | 755k | m->unpackable = 0; |
2099 | | |
2100 | 755k | return m; |
2101 | 755k | } |
2102 | | |
2103 | 18.1k | char *cram_block_method2str(enum cram_block_method_int m) { |
2104 | 18.1k | switch(m) { |
2105 | 11.0k | case RAW: return "RAW"; |
2106 | 2.23k | case GZIP: return "GZIP"; |
2107 | 0 | case BZIP2: return "BZIP2"; |
2108 | 0 | case LZMA: return "LZMA"; |
2109 | 0 | case RANS0: return "RANS0"; |
2110 | 0 | case RANS1: return "RANS1"; |
2111 | 4 | case GZIP_RLE: return "GZIP_RLE"; |
2112 | 342 | case GZIP_1: return "GZIP_1"; |
2113 | 0 | case FQZ: return "FQZ"; |
2114 | 0 | case FQZ_b: return "FQZ_b"; |
2115 | 0 | case FQZ_c: return "FQZ_c"; |
2116 | 0 | case FQZ_d: return "FQZ_d"; |
2117 | 317 | case RANS_PR0: return "RANS_PR0"; |
2118 | 441 | case RANS_PR1: return "RANS_PR1"; |
2119 | 897 | case RANS_PR64: return "RANS_PR64"; |
2120 | 5 | case RANS_PR9: return "RANS_PR9"; |
2121 | 2.58k | case RANS_PR128: return "RANS_PR128"; |
2122 | 0 | case RANS_PR129: return "RANS_PR129"; |
2123 | 0 | case RANS_PR192: return "RANS_PR192"; |
2124 | 103 | case RANS_PR193: return "RANS_PR193"; |
2125 | 168 | case TOK3: return "TOK3_R"; |
2126 | 0 | case TOKA: return "TOK3_A"; |
2127 | 0 | case ARITH_PR0: return "ARITH_PR0"; |
2128 | 0 | case ARITH_PR1: return "ARITH_PR1"; |
2129 | 0 | case ARITH_PR64: return "ARITH_PR64"; |
2130 | 0 | case ARITH_PR9: return "ARITH_PR9"; |
2131 | 0 | case ARITH_PR128: return "ARITH_PR128"; |
2132 | 0 | case ARITH_PR129: return "ARITH_PR129"; |
2133 | 0 | case ARITH_PR192: return "ARITH_PR192"; |
2134 | 0 | case ARITH_PR193: return "ARITH_PR193"; |
2135 | 0 | case BM_ERROR: break; |
2136 | 18.1k | } |
2137 | 0 | return "?"; |
2138 | 18.1k | } |
2139 | | |
2140 | 3 | char *cram_content_type2str(enum cram_content_type t) { |
2141 | 3 | switch (t) { |
2142 | 0 | case FILE_HEADER: return "FILE_HEADER"; |
2143 | 0 | case COMPRESSION_HEADER: return "COMPRESSION_HEADER"; |
2144 | 0 | case MAPPED_SLICE: return "MAPPED_SLICE"; |
2145 | 0 | case UNMAPPED_SLICE: return "UNMAPPED_SLICE"; |
2146 | 0 | case EXTERNAL: return "EXTERNAL"; |
2147 | 3 | case CORE: return "CORE"; |
2148 | 0 | case CT_ERROR: break; |
2149 | 3 | } |
2150 | 0 | return "?"; |
2151 | 3 | } |
2152 | | |
2153 | | /* ---------------------------------------------------------------------- |
2154 | | * Reference sequence handling |
2155 | | * |
2156 | | * These revolve around the refs_t structure, which may potentially be |
2157 | | * shared between multiple cram_fd. |
2158 | | * |
2159 | | * We start with refs_create() to allocate an empty refs_t and then |
2160 | | * populate it with @SQ line data using refs_from_header(). This is done on |
2161 | | * cram_open(). Also at start up we can call cram_load_reference() which |
2162 | | * is used with "scramble -r foo.fa". This replaces the fd->refs with the |
2163 | | * new one specified. In either case refs2id() is then called which |
2164 | | * maps ref_entry names to @SQ ids (refs_t->ref_id[]). |
2165 | | * |
2166 | | * Later, possibly within a thread, we will want to know the actual ref |
2167 | | * seq itself, obtained by calling cram_get_ref(). This may use the |
2168 | | * UR: or M5: fields or the filename specified in the original |
2169 | | * cram_load_reference() call. |
2170 | | * |
2171 | | * Given the potential for multi-threaded reference usage, we have |
2172 | | * reference counting (sorry for the confusing double use of "ref") to |
2173 | | * track the number of callers interested in any specific reference. |
2174 | | */ |
2175 | | |
2176 | | /* |
2177 | | * Frees/unmaps a reference sequence and associated file handles. |
2178 | | */ |
2179 | 694 | static void ref_entry_free_seq(ref_entry *e) { |
2180 | 694 | if (e->mf) |
2181 | 0 | mfclose(e->mf); |
2182 | 694 | if (e->seq && !e->mf) |
2183 | 0 | free(e->seq); |
2184 | | |
2185 | 694 | e->seq = NULL; |
2186 | 694 | e->mf = NULL; |
2187 | 694 | } |
2188 | | |
2189 | 16.0k | void refs_free(refs_t *r) { |
2190 | 16.0k | RP("refs_free()\n"); |
2191 | | |
2192 | 16.0k | if (!r) |
2193 | 0 | return; |
2194 | | |
2195 | 16.0k | if (--r->count > 0) |
2196 | 0 | return; |
2197 | | |
2198 | 16.0k | if (r->pool) |
2199 | 16.0k | string_pool_destroy(r->pool); |
2200 | | |
2201 | 16.0k | if (r->h_meta) { |
2202 | 16.0k | khint_t k; |
2203 | | |
2204 | 17.7k | for (k = kh_begin(r->h_meta); k != kh_end(r->h_meta); k++) { |
2205 | 1.71k | ref_entry *e; |
2206 | | |
2207 | 1.71k | if (!kh_exist(r->h_meta, k)) |
2208 | 1.02k | continue; |
2209 | 694 | if (!(e = kh_val(r->h_meta, k))) |
2210 | 0 | continue; |
2211 | 694 | ref_entry_free_seq(e); |
2212 | 694 | free(e); |
2213 | 694 | } |
2214 | | |
2215 | 16.0k | kh_destroy(refs, r->h_meta); |
2216 | 16.0k | } |
2217 | | |
2218 | 16.0k | if (r->ref_id) |
2219 | 5.49k | free(r->ref_id); |
2220 | | |
2221 | 16.0k | if (r->fp) |
2222 | 0 | bgzf_close(r->fp); |
2223 | | |
2224 | 16.0k | pthread_mutex_destroy(&r->lock); |
2225 | | |
2226 | 16.0k | free(r); |
2227 | 16.0k | } |
2228 | | |
2229 | 16.0k | static refs_t *refs_create(void) { |
2230 | 16.0k | refs_t *r = calloc(1, sizeof(*r)); |
2231 | | |
2232 | 16.0k | RP("refs_create()\n"); |
2233 | | |
2234 | 16.0k | if (!r) |
2235 | 0 | return NULL; |
2236 | | |
2237 | 16.0k | if (!(r->pool = string_pool_create(8192))) |
2238 | 0 | goto err; |
2239 | | |
2240 | 16.0k | r->ref_id = NULL; // see refs2id() to populate. |
2241 | 16.0k | r->count = 1; |
2242 | 16.0k | r->last = NULL; |
2243 | 16.0k | r->last_id = -1; |
2244 | | |
2245 | 16.0k | if (!(r->h_meta = kh_init(refs))) |
2246 | 0 | goto err; |
2247 | | |
2248 | 16.0k | pthread_mutex_init(&r->lock, NULL); |
2249 | | |
2250 | 16.0k | return r; |
2251 | | |
2252 | 0 | err: |
2253 | 0 | refs_free(r); |
2254 | 0 | return NULL; |
2255 | 16.0k | } |
2256 | | |
2257 | | /* |
2258 | | * Opens a reference fasta file as a BGZF stream, allowing for |
2259 | | * compressed files. It automatically builds a .fai file if |
2260 | | * required and if compressed a .gzi bgzf index too. |
2261 | | * |
2262 | | * Returns a BGZF handle on success; |
2263 | | * NULL on failure. |
2264 | | */ |
2265 | 2 | static BGZF *bgzf_open_ref(char *fn, char *mode, int is_md5) { |
2266 | 2 | BGZF *fp; |
2267 | | |
2268 | 2 | if (strncmp(fn, "file://", 7) == 0) |
2269 | 0 | fn += 7; |
2270 | | |
2271 | 2 | if (!is_md5 && !hisremote(fn)) { |
2272 | 2 | char fai_file[PATH_MAX]; |
2273 | | |
2274 | 2 | snprintf(fai_file, PATH_MAX, "%s.fai", fn); |
2275 | 2 | if (access(fai_file, R_OK) != 0) |
2276 | 2 | if (fai_build(fn) != 0) |
2277 | 2 | return NULL; |
2278 | 2 | } |
2279 | | |
2280 | 0 | if (!(fp = bgzf_open(fn, mode))) { |
2281 | 0 | perror(fn); |
2282 | 0 | return NULL; |
2283 | 0 | } |
2284 | | |
2285 | 0 | if (fp->is_compressed == 1 && bgzf_index_load(fp, fn, ".gzi") < 0) { |
2286 | 0 | hts_log_error("Unable to load .gzi index '%s.gzi'", fn); |
2287 | 0 | bgzf_close(fp); |
2288 | 0 | return NULL; |
2289 | 0 | } |
2290 | | |
2291 | 0 | return fp; |
2292 | 0 | } |
2293 | | |
2294 | | /* |
2295 | | * Loads a FAI file for a reference.fasta. |
2296 | | * "is_err" indicates whether failure to load is worthy of emitting an |
2297 | | * error message. In some cases (eg with embedded references) we |
2298 | | * speculatively load, just in case, and silently ignore errors. |
2299 | | * |
2300 | | * Returns the refs_t struct on success (maybe newly allocated); |
2301 | | * NULL on failure |
2302 | | */ |
2303 | 2 | static refs_t *refs_load_fai(refs_t *r_orig, const char *fn, int is_err) { |
2304 | 2 | hFILE *fp = NULL; |
2305 | 2 | char fai_fn[PATH_MAX]; |
2306 | 2 | char line[8192]; |
2307 | 2 | refs_t *r = r_orig; |
2308 | 2 | size_t fn_l = strlen(fn); |
2309 | 2 | int id = 0, id_alloc = 0; |
2310 | | |
2311 | 2 | RP("refs_load_fai %s\n", fn); |
2312 | | |
2313 | 2 | if (!r) |
2314 | 0 | if (!(r = refs_create())) |
2315 | 0 | goto err; |
2316 | | |
2317 | 2 | if (r->fp) |
2318 | 0 | if (bgzf_close(r->fp) != 0) |
2319 | 0 | goto err; |
2320 | 2 | r->fp = NULL; |
2321 | | |
2322 | | /* Look for a FASTA##idx##FAI format */ |
2323 | 2 | char *fn_delim = strstr(fn, HTS_IDX_DELIM); |
2324 | 2 | if (fn_delim) { |
2325 | 0 | if (!(r->fn = string_ndup(r->pool, fn, fn_delim - fn))) |
2326 | 0 | goto err; |
2327 | 0 | fn_delim += strlen(HTS_IDX_DELIM); |
2328 | 0 | snprintf(fai_fn, PATH_MAX, "%s", fn_delim); |
2329 | 2 | } else { |
2330 | | /* An index file was provided, instead of the actual reference file */ |
2331 | 2 | if (fn_l > 4 && strcmp(&fn[fn_l-4], ".fai") == 0) { |
2332 | 0 | if (!r->fn) { |
2333 | 0 | if (!(r->fn = string_ndup(r->pool, fn, fn_l-4))) |
2334 | 0 | goto err; |
2335 | 0 | } |
2336 | 0 | snprintf(fai_fn, PATH_MAX, "%s", fn); |
2337 | 2 | } else { |
2338 | | /* Only the reference file provided. Get the index file name from it */ |
2339 | 2 | if (!(r->fn = string_dup(r->pool, fn))) |
2340 | 0 | goto err; |
2341 | 2 | snprintf(fai_fn, PATH_MAX, "%.*s.fai", PATH_MAX-5, fn); |
2342 | 2 | } |
2343 | 2 | } |
2344 | | |
2345 | 2 | if (!(r->fp = bgzf_open_ref(r->fn, "r", 0))) { |
2346 | 2 | hts_log_error("Failed to open reference file '%s'", r->fn); |
2347 | 2 | goto err; |
2348 | 2 | } |
2349 | | |
2350 | 0 | if (!(fp = hopen(fai_fn, "r"))) { |
2351 | 0 | hts_log_error("Failed to open index file '%s'", fai_fn); |
2352 | 0 | if (is_err) |
2353 | 0 | perror(fai_fn); |
2354 | 0 | goto err; |
2355 | 0 | } |
2356 | 0 | while (hgets(line, 8192, fp) != NULL) { |
2357 | 0 | ref_entry *e = malloc(sizeof(*e)); |
2358 | 0 | char *cp; |
2359 | 0 | int n; |
2360 | 0 | khint_t k; |
2361 | |
|
2362 | 0 | if (!e) |
2363 | 0 | return NULL; |
2364 | | |
2365 | | // id |
2366 | 0 | for (cp = line; *cp && !isspace_c(*cp); cp++) |
2367 | 0 | ; |
2368 | 0 | *cp++ = 0; |
2369 | 0 | e->name = string_dup(r->pool, line); |
2370 | | |
2371 | | // length |
2372 | 0 | while (*cp && isspace_c(*cp)) |
2373 | 0 | cp++; |
2374 | 0 | e->length = strtoll(cp, &cp, 10); |
2375 | | |
2376 | | // offset |
2377 | 0 | while (*cp && isspace_c(*cp)) |
2378 | 0 | cp++; |
2379 | 0 | e->offset = strtoll(cp, &cp, 10); |
2380 | | |
2381 | | // bases per line |
2382 | 0 | while (*cp && isspace_c(*cp)) |
2383 | 0 | cp++; |
2384 | 0 | e->bases_per_line = strtol(cp, &cp, 10); |
2385 | | |
2386 | | // line length |
2387 | 0 | while (*cp && isspace_c(*cp)) |
2388 | 0 | cp++; |
2389 | 0 | e->line_length = strtol(cp, &cp, 10); |
2390 | | |
2391 | | // filename |
2392 | 0 | e->fn = r->fn; |
2393 | |
|
2394 | 0 | e->count = 0; |
2395 | 0 | e->seq = NULL; |
2396 | 0 | e->mf = NULL; |
2397 | 0 | e->is_md5 = 0; |
2398 | 0 | e->validated_md5 = 0; |
2399 | |
|
2400 | 0 | k = kh_put(refs, r->h_meta, e->name, &n); |
2401 | 0 | if (-1 == n) { |
2402 | 0 | free(e); |
2403 | 0 | return NULL; |
2404 | 0 | } |
2405 | | |
2406 | 0 | if (n) { |
2407 | 0 | kh_val(r->h_meta, k) = e; |
2408 | 0 | } else { |
2409 | 0 | ref_entry *re = kh_val(r->h_meta, k); |
2410 | 0 | if (re && (re->count != 0 || re->length != 0)) { |
2411 | | /* Keep old */ |
2412 | 0 | free(e); |
2413 | 0 | } else { |
2414 | | /* Replace old */ |
2415 | 0 | if (re) |
2416 | 0 | free(re); |
2417 | 0 | kh_val(r->h_meta, k) = e; |
2418 | 0 | } |
2419 | 0 | } |
2420 | |
|
2421 | 0 | if (id >= id_alloc) { |
2422 | 0 | ref_entry **new_refs; |
2423 | 0 | int x; |
2424 | |
|
2425 | 0 | id_alloc = id_alloc ?id_alloc*2 : 16; |
2426 | 0 | new_refs = hts_realloc_p(r->ref_id, sizeof(*r->ref_id), id_alloc); |
2427 | 0 | if (!new_refs) |
2428 | 0 | goto err; |
2429 | 0 | r->ref_id = new_refs; |
2430 | |
|
2431 | 0 | for (x = id; x < id_alloc; x++) |
2432 | 0 | r->ref_id[x] = NULL; |
2433 | 0 | } |
2434 | 0 | r->ref_id[id] = e; |
2435 | 0 | r->nref = ++id; |
2436 | 0 | } |
2437 | | |
2438 | 0 | if(hclose(fp) < 0) |
2439 | 0 | goto err; |
2440 | 0 | return r; |
2441 | | |
2442 | 2 | err: |
2443 | 2 | if (fp) |
2444 | 0 | hclose_abruptly(fp); |
2445 | | |
2446 | 2 | if (!r_orig) |
2447 | 0 | refs_free(r); |
2448 | | |
2449 | 2 | return NULL; |
2450 | 0 | } |
2451 | | |
2452 | | /* |
2453 | | * Verifies that the CRAM @SQ lines and .fai files match. |
2454 | | */ |
2455 | 0 | static void sanitise_SQ_lines(cram_fd *fd) { |
2456 | 0 | int i; |
2457 | |
|
2458 | 0 | if (!fd->header || !fd->header->hrecs) |
2459 | 0 | return; |
2460 | | |
2461 | 0 | if (!fd->refs || !fd->refs->h_meta) |
2462 | 0 | return; |
2463 | | |
2464 | 0 | for (i = 0; i < fd->header->hrecs->nref; i++) { |
2465 | 0 | const char *name = fd->header->hrecs->ref[i].name; |
2466 | 0 | khint_t k = kh_get(refs, fd->refs->h_meta, name); |
2467 | 0 | ref_entry *r; |
2468 | | |
2469 | | // We may have @SQ lines which have no known .fai, but do not |
2470 | | // in themselves pose a problem because they are unused in the file. |
2471 | 0 | if (k == kh_end(fd->refs->h_meta)) |
2472 | 0 | continue; |
2473 | | |
2474 | 0 | if (!(r = (ref_entry *)kh_val(fd->refs->h_meta, k))) |
2475 | 0 | continue; |
2476 | | |
2477 | 0 | if (r->length && r->length != fd->header->hrecs->ref[i].len) { |
2478 | 0 | assert(strcmp(r->name, fd->header->hrecs->ref[i].name) == 0); |
2479 | | |
2480 | | // Should we also check MD5sums here to ensure the correct |
2481 | | // reference was given? |
2482 | 0 | hts_log_warning("Header @SQ length mismatch for ref %s, %"PRIhts_pos" vs %d", |
2483 | 0 | r->name, fd->header->hrecs->ref[i].len, (int)r->length); |
2484 | | |
2485 | | // Fixing the parsed @SQ header will make MD:Z: strings work |
2486 | | // and also stop it producing N for the sequence. |
2487 | 0 | fd->header->hrecs->ref[i].len = r->length; |
2488 | 0 | } |
2489 | 0 | } |
2490 | 0 | } |
2491 | | |
2492 | | /* |
2493 | | * Indexes references by the order they appear in a BAM file. This may not |
2494 | | * necessarily be the same order they appear in the fasta reference file. |
2495 | | * |
2496 | | * Returns 0 on success |
2497 | | * -1 on failure |
2498 | | */ |
2499 | 5.44k | int refs2id(refs_t *r, sam_hdr_t *hdr) { |
2500 | 5.44k | int i; |
2501 | 5.44k | sam_hrecs_t *h = hdr->hrecs; |
2502 | | |
2503 | 5.44k | if (r->ref_id) |
2504 | 292 | free(r->ref_id); |
2505 | 5.44k | if (r->last) |
2506 | 0 | r->last = NULL; |
2507 | | |
2508 | 5.44k | r->ref_id = calloc(h->nref, sizeof(*r->ref_id)); |
2509 | 5.44k | if (!r->ref_id) |
2510 | 0 | return -1; |
2511 | | |
2512 | 5.44k | r->nref = h->nref; |
2513 | 5.94k | for (i = 0; i < h->nref; i++) { |
2514 | 498 | khint_t k = kh_get(refs, r->h_meta, h->ref[i].name); |
2515 | 498 | if (k != kh_end(r->h_meta)) { |
2516 | 498 | r->ref_id[i] = kh_val(r->h_meta, k); |
2517 | 498 | } else { |
2518 | 0 | hts_log_warning("Unable to find ref name '%s'", h->ref[i].name); |
2519 | 0 | } |
2520 | 498 | } |
2521 | | |
2522 | 5.44k | return 0; |
2523 | 5.44k | } |
2524 | | |
2525 | | /* |
2526 | | * Generates refs_t entries based on @SQ lines in the header. |
2527 | | * Returns 0 on success |
2528 | | * -1 on failure |
2529 | | */ |
2530 | 27.0k | static int refs_from_header(cram_fd *fd) { |
2531 | 27.0k | if (!fd) |
2532 | 0 | return -1; |
2533 | | |
2534 | 27.0k | refs_t *r = fd->refs; |
2535 | 27.0k | if (!r) |
2536 | 0 | return -1; |
2537 | | |
2538 | 27.0k | sam_hdr_t *h = fd->header; |
2539 | 27.0k | if (!h) |
2540 | 8.77k | return 0; |
2541 | | |
2542 | 18.2k | if (!h->hrecs) { |
2543 | 12.3k | if (-1 == sam_hdr_fill_hrecs(h)) |
2544 | 77 | return -1; |
2545 | 12.3k | } |
2546 | | |
2547 | 18.1k | if (h->hrecs->nref == 0) |
2548 | 17.5k | return 0; |
2549 | | |
2550 | | //fprintf(stderr, "refs_from_header for %p mode %c\n", fd, fd->mode); |
2551 | | |
2552 | | /* Existing refs are fine, as long as they're compatible with the hdr. */ |
2553 | 632 | ref_entry **new_ref_id = hts_realloc_ps(r->ref_id, sizeof(*r->ref_id), |
2554 | 632 | r->nref, h->hrecs->nref); |
2555 | 632 | if (!new_ref_id) |
2556 | 0 | return -1; |
2557 | 632 | r->ref_id = new_ref_id; |
2558 | | |
2559 | 632 | int i, j; |
2560 | | /* Copy info from h->ref[i] over to r */ |
2561 | 1.82k | for (i = 0, j = r->nref; i < h->hrecs->nref; i++) { |
2562 | 1.19k | sam_hrec_type_t *ty; |
2563 | 1.19k | sam_hrec_tag_t *tag; |
2564 | 1.19k | khint_t k; |
2565 | 1.19k | int n; |
2566 | | |
2567 | 1.19k | k = kh_get(refs, r->h_meta, h->hrecs->ref[i].name); |
2568 | 1.19k | if (k != kh_end(r->h_meta)) |
2569 | | // Ref already known about |
2570 | 498 | continue; |
2571 | | |
2572 | 694 | if (!(r->ref_id[j] = calloc(1, sizeof(ref_entry)))) |
2573 | 0 | return -1; |
2574 | | |
2575 | 694 | if (!h->hrecs->ref[i].name) |
2576 | 0 | return -1; |
2577 | | |
2578 | 694 | r->ref_id[j]->name = string_dup(r->pool, h->hrecs->ref[i].name); |
2579 | 694 | if (!r->ref_id[j]->name) return -1; |
2580 | 694 | r->ref_id[j]->length = 0; // marker for not yet loaded |
2581 | | |
2582 | | /* Initialise likely filename if known */ |
2583 | 694 | if ((ty = sam_hrecs_find_type_id(h->hrecs, "SQ", "SN", h->hrecs->ref[i].name))) { |
2584 | 694 | if ((tag = sam_hrecs_find_key(ty, "M5", NULL))) |
2585 | 113 | r->ref_id[j]->fn = string_dup(r->pool, tag->str+3); |
2586 | | |
2587 | 694 | if ((tag = sam_hrecs_find_key(ty, "LN", NULL))) { |
2588 | | // LN tag used when constructing consensus reference |
2589 | 694 | r->ref_id[j]->LN_length = strtoll(tag->str+3, NULL, 0); |
2590 | | // See fuzz 382922241 |
2591 | 694 | if (r->ref_id[j]->LN_length < 0) |
2592 | 1 | r->ref_id[j]->LN_length = 0; |
2593 | 694 | } |
2594 | 694 | } |
2595 | | |
2596 | 694 | k = kh_put(refs, r->h_meta, r->ref_id[j]->name, &n); |
2597 | 694 | if (n <= 0) // already exists or error |
2598 | 0 | return -1; |
2599 | 694 | kh_val(r->h_meta, k) = r->ref_id[j]; |
2600 | | |
2601 | 694 | j++; |
2602 | 694 | } |
2603 | 632 | r->nref = j; |
2604 | | |
2605 | 632 | return 0; |
2606 | 632 | } |
2607 | | |
2608 | | /* |
2609 | | * Attaches a header to a cram_fd. |
2610 | | * |
2611 | | * This should be used when creating a new cram_fd for writing where |
2612 | | * we have a header already constructed (eg from a file we've read |
2613 | | * in). |
2614 | | */ |
2615 | 5.52k | int cram_set_header2(cram_fd *fd, const sam_hdr_t *hdr) { |
2616 | 5.52k | if (!fd || !hdr ) |
2617 | 0 | return -1; |
2618 | | |
2619 | 5.52k | if (fd->header != hdr) { |
2620 | 5.52k | if (fd->header) |
2621 | 0 | sam_hdr_destroy(fd->header); |
2622 | 5.52k | fd->header = sam_hdr_dup(hdr); |
2623 | 5.52k | if (!fd->header) |
2624 | 0 | return -1; |
2625 | 5.52k | } |
2626 | 5.52k | return refs_from_header(fd); |
2627 | 5.52k | } |
2628 | | |
2629 | 0 | int cram_set_header(cram_fd *fd, sam_hdr_t *hdr) { |
2630 | 0 | return cram_set_header2(fd, hdr); |
2631 | 0 | } |
2632 | | |
2633 | | /* |
2634 | | * Returns whether the path refers to a directory. |
2635 | | */ |
2636 | 0 | static int is_directory(char *fn) { |
2637 | 0 | struct stat buf; |
2638 | 0 | if ( stat(fn,&buf) ) return 0; |
2639 | 0 | return S_ISDIR(buf.st_mode); |
2640 | 0 | } |
2641 | | |
2642 | | /* |
2643 | | * Converts a directory and a filename into an expanded path, replacing %s |
2644 | | * in directory with the filename and %[0-9]+s with portions of the filename |
2645 | | * Any remaining parts of filename are added to the end with /%s. |
2646 | | */ |
2647 | 0 | static int expand_cache_path(char *path, char *dir, const char *fn) { |
2648 | 0 | char *cp, *start = path; |
2649 | 0 | size_t len; |
2650 | 0 | size_t sz = PATH_MAX; |
2651 | |
|
2652 | 0 | while ((cp = strchr(dir, '%'))) { |
2653 | 0 | if (cp-dir >= sz) return -1; |
2654 | 0 | strncpy(path, dir, cp-dir); |
2655 | 0 | path += cp-dir; |
2656 | 0 | sz -= cp-dir; |
2657 | |
|
2658 | 0 | if (*++cp == 's') { |
2659 | 0 | len = strlen(fn); |
2660 | 0 | if (len >= sz) return -1; |
2661 | 0 | strcpy(path, fn); |
2662 | 0 | path += len; |
2663 | 0 | sz -= len; |
2664 | 0 | fn += len; |
2665 | 0 | cp++; |
2666 | 0 | } else if (*cp >= '0' && *cp <= '9') { |
2667 | 0 | char *endp; |
2668 | 0 | long l; |
2669 | |
|
2670 | 0 | l = strtol(cp, &endp, 10); |
2671 | 0 | l = MIN(l, strlen(fn)); |
2672 | 0 | if (*endp == 's') { |
2673 | 0 | if (l >= sz) return -1; |
2674 | 0 | strncpy(path, fn, l); |
2675 | 0 | path += l; |
2676 | 0 | fn += l; |
2677 | 0 | sz -= l; |
2678 | 0 | *path = 0; |
2679 | 0 | cp = endp+1; |
2680 | 0 | } else { |
2681 | 0 | if (sz < 3) return -1; |
2682 | 0 | *path++ = '%'; |
2683 | 0 | *path++ = *cp++; |
2684 | 0 | } |
2685 | 0 | } else { |
2686 | 0 | if (sz < 3) return -1; |
2687 | 0 | *path++ = '%'; |
2688 | 0 | *path++ = *cp++; |
2689 | 0 | } |
2690 | 0 | dir = cp; |
2691 | 0 | } |
2692 | | |
2693 | 0 | len = strlen(dir); |
2694 | 0 | if (len >= sz) return -1; |
2695 | 0 | strcpy(path, dir); |
2696 | 0 | path += len; |
2697 | 0 | sz -= len; |
2698 | |
|
2699 | 0 | len = strlen(fn) + ((*fn && path > start && path[-1] != '/') ? 1 : 0); |
2700 | 0 | if (len >= sz) return -1; |
2701 | 0 | if (*fn && path > start && path[-1] != '/') |
2702 | 0 | *path++ = '/'; |
2703 | 0 | strcpy(path, fn); |
2704 | 0 | return 0; |
2705 | 0 | } |
2706 | | |
2707 | | /* |
2708 | | * Make the directory containing path and any prefix directories. |
2709 | | */ |
2710 | 0 | static void mkdir_prefix(char *path, int mode) { |
2711 | 0 | char *cp = strrchr(path, '/'); |
2712 | 0 | if (!cp) |
2713 | 0 | return; |
2714 | | |
2715 | 0 | *cp = 0; |
2716 | 0 | if (is_directory(path)) { |
2717 | 0 | *cp = '/'; |
2718 | 0 | return; |
2719 | 0 | } |
2720 | | |
2721 | 0 | if (mkdir(path, mode) == 0) { |
2722 | 0 | chmod(path, mode); |
2723 | 0 | *cp = '/'; |
2724 | 0 | return; |
2725 | 0 | } |
2726 | | |
2727 | 0 | mkdir_prefix(path, mode); |
2728 | 0 | mkdir(path, mode); |
2729 | 0 | chmod(path, mode); |
2730 | 0 | *cp = '/'; |
2731 | 0 | } |
2732 | | |
2733 | | /* |
2734 | | * Queries the M5 string from the header and attempts to populate the |
2735 | | * reference from this using the REF_PATH environment. |
2736 | | * |
2737 | | * Returns 0 on success |
2738 | | * -1 on failure |
2739 | | */ |
2740 | 278 | static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) { |
2741 | 278 | char *ref_path = getenv("REF_PATH"); |
2742 | 278 | sam_hrec_type_t *ty; |
2743 | 278 | sam_hrec_tag_t *tag; |
2744 | 278 | char path[PATH_MAX]; |
2745 | 278 | kstring_t path_tmp = KS_INITIALIZE; |
2746 | 278 | char *local_cache = getenv("REF_CACHE"); |
2747 | 278 | mFILE *mf; |
2748 | 278 | int local_path = 0; |
2749 | | |
2750 | 278 | hts_log_info("Running cram_populate_ref on fd %p, id %d", (void *)fd, id); |
2751 | | |
2752 | 278 | if (!r->name) |
2753 | 0 | return -1; |
2754 | | |
2755 | 278 | if (!(ty = sam_hrecs_find_type_id(fd->header->hrecs, "SQ", "SN", r->name))) |
2756 | 0 | return -1; |
2757 | | |
2758 | 278 | if (!(tag = sam_hrecs_find_key(ty, "M5", NULL))) |
2759 | 277 | goto no_M5; |
2760 | | |
2761 | 1 | hts_log_info("Querying ref %s", tag->str+3); |
2762 | | |
2763 | | /* Use cache if available */ |
2764 | 1 | if (local_cache && *local_cache) { |
2765 | 0 | struct stat sb; |
2766 | 0 | if (expand_cache_path(path, local_cache, tag->str+3) == 0 && |
2767 | 0 | stat(path, &sb) == 0) |
2768 | | // Found it in the local cache |
2769 | 0 | local_path = 1; |
2770 | 0 | } |
2771 | | |
2772 | | #ifndef HAVE_MMAP |
2773 | | char *path2; |
2774 | | /* Search local files in REF_PATH; we can open them and return as above */ |
2775 | | if (!local_path && (path2 = find_path(tag->str+3, ref_path))) { |
2776 | | int len = snprintf(path, PATH_MAX, "%s", path2); |
2777 | | free(path2); |
2778 | | if (len > 0 && len < PATH_MAX) // in case it's too long |
2779 | | local_path = 1; |
2780 | | } |
2781 | | #endif |
2782 | | |
2783 | | /* Found via REF_CACHE or local REF_PATH file */ |
2784 | 1 | if (local_path) { |
2785 | 0 | struct stat sb; |
2786 | 0 | BGZF *fp; |
2787 | |
|
2788 | 0 | if (0 == stat(path, &sb) |
2789 | 0 | && S_ISREG(sb.st_mode) |
2790 | 0 | && (fp = bgzf_open(path, "r"))) { |
2791 | 0 | r->length = sb.st_size; |
2792 | 0 | r->offset = r->line_length = r->bases_per_line = 0; |
2793 | |
|
2794 | 0 | r->fn = string_dup(fd->refs->pool, path); |
2795 | |
|
2796 | 0 | if (fd->refs->fp) |
2797 | 0 | if (bgzf_close(fd->refs->fp) != 0) |
2798 | 0 | return -1; |
2799 | 0 | fd->refs->fp = fp; |
2800 | 0 | fd->refs->fn = r->fn; |
2801 | 0 | r->is_md5 = 1; |
2802 | 0 | r->validated_md5 = 1; |
2803 | | |
2804 | | // Fall back to cram_get_ref() where it'll do the actual |
2805 | | // reading of the file. |
2806 | 0 | return 0; |
2807 | 0 | } |
2808 | 0 | } |
2809 | | |
2810 | | |
2811 | | /* Otherwise search full REF_PATH; slower as loads entire file */ |
2812 | 1 | int is_local = 0; |
2813 | 1 | if ((mf = open_path_mfile(tag->str+3, ref_path, NULL, &is_local))) { |
2814 | 0 | size_t sz; |
2815 | 0 | r->seq = mfsteal(mf, &sz); |
2816 | 0 | if (r->seq) { |
2817 | 0 | r->mf = NULL; |
2818 | 0 | } else { |
2819 | | // keep mf around as we couldn't detach |
2820 | 0 | r->seq = mf->data; |
2821 | 0 | r->mf = mf; |
2822 | 0 | } |
2823 | 0 | r->length = sz; |
2824 | 0 | r->is_md5 = 1; |
2825 | 0 | r->validated_md5 = 1; |
2826 | 1 | } else { |
2827 | 1 | refs_t *refs; |
2828 | 1 | const char *fn; |
2829 | 1 | sam_hrec_tag_t *UR_tag; |
2830 | | |
2831 | 278 | no_M5: |
2832 | | /* Failed to find in search path or M5 cache, see if @SQ UR: tag? */ |
2833 | 278 | if (!(UR_tag = sam_hrecs_find_key(ty, "UR", NULL))) |
2834 | 276 | return -1; |
2835 | | |
2836 | 2 | if (strstr(UR_tag->str+3, "://") && |
2837 | 0 | strncmp(UR_tag->str+3, "file:", 5) != 0) { |
2838 | | // Documented as omitted, but accidentally supported until now |
2839 | 0 | hts_log_error("UR tags pointing to remote files are not supported"); |
2840 | 0 | return -1; |
2841 | 0 | } |
2842 | | |
2843 | 2 | fn = (strncmp(UR_tag->str+3, "file:", 5) == 0) |
2844 | 2 | ? UR_tag->str+8 |
2845 | 2 | : UR_tag->str+3; |
2846 | | |
2847 | 2 | if (fd->refs->fp) { |
2848 | 0 | if (bgzf_close(fd->refs->fp) != 0) |
2849 | 0 | return -1; |
2850 | 0 | fd->refs->fp = NULL; |
2851 | 0 | } |
2852 | 2 | if (!(refs = refs_load_fai(fd->refs, fn, 0))) |
2853 | 2 | return -1; |
2854 | 0 | sanitise_SQ_lines(fd); |
2855 | |
|
2856 | 0 | fd->refs = refs; |
2857 | 0 | if (fd->refs->fp) { |
2858 | 0 | if (bgzf_close(fd->refs->fp) != 0) |
2859 | 0 | return -1; |
2860 | 0 | fd->refs->fp = NULL; |
2861 | 0 | } |
2862 | | |
2863 | 0 | if (!fd->refs->fn) |
2864 | 0 | return -1; |
2865 | | |
2866 | 0 | if (-1 == refs2id(fd->refs, fd->header)) |
2867 | 0 | return -1; |
2868 | 0 | if (!fd->refs->ref_id || !fd->refs->ref_id[id]) |
2869 | 0 | return -1; |
2870 | | |
2871 | | // Local copy already, so fall back to cram_get_ref(). |
2872 | 0 | return 0; |
2873 | 0 | } |
2874 | | |
2875 | | /* Populate the local disk cache if required */ |
2876 | 0 | if (!is_local && local_cache && *local_cache) { |
2877 | 0 | hFILE *fp; |
2878 | |
|
2879 | 0 | if (expand_cache_path(path, local_cache, tag->str+3) < 0) { |
2880 | 0 | return 0; // Not fatal - we have the data already so keep going. |
2881 | 0 | } |
2882 | 0 | hts_log_info("Writing cache file '%s'", path); |
2883 | 0 | mkdir_prefix(path, 01777); |
2884 | |
|
2885 | 0 | fp = hts_open_tmpfile(path, "wx", &path_tmp); |
2886 | 0 | if (!fp) { |
2887 | 0 | perror(path_tmp.s); |
2888 | 0 | free(path_tmp.s); |
2889 | | |
2890 | | // Not fatal - we have the data already so keep going. |
2891 | 0 | return 0; |
2892 | 0 | } |
2893 | | |
2894 | | // Check md5sum |
2895 | 0 | hts_md5_context *md5; |
2896 | 0 | char unsigned md5_buf1[16]; |
2897 | 0 | char md5_buf2[33]; |
2898 | |
|
2899 | 0 | if (!(md5 = hts_md5_init())) { |
2900 | 0 | hclose_abruptly(fp); |
2901 | 0 | unlink(path_tmp.s); |
2902 | 0 | free(path_tmp.s); |
2903 | 0 | return -1; |
2904 | 0 | } |
2905 | 0 | hts_md5_update(md5, r->seq, r->length); |
2906 | 0 | hts_md5_final(md5_buf1, md5); |
2907 | 0 | hts_md5_destroy(md5); |
2908 | 0 | hts_md5_hex(md5_buf2, md5_buf1); |
2909 | |
|
2910 | 0 | if (strncmp(tag->str+3, md5_buf2, 32) != 0) { |
2911 | 0 | hts_log_error("Mismatching md5sum for downloaded reference"); |
2912 | 0 | hclose_abruptly(fp); |
2913 | 0 | unlink(path_tmp.s); |
2914 | 0 | free(path_tmp.s); |
2915 | 0 | return -1; |
2916 | 0 | } |
2917 | | |
2918 | 0 | ssize_t length_written = hwrite(fp, r->seq, r->length); |
2919 | 0 | if (hclose(fp) < 0 || length_written != r->length || |
2920 | 0 | chmod(path_tmp.s, 0444) < 0 || |
2921 | 0 | rename(path_tmp.s, path) < 0) { |
2922 | 0 | hts_log_error("Creating reference at %s failed: %s", |
2923 | 0 | path, strerror(errno)); |
2924 | 0 | unlink(path_tmp.s); |
2925 | 0 | } |
2926 | 0 | } |
2927 | | |
2928 | 0 | free(path_tmp.s); |
2929 | 0 | return 0; |
2930 | 0 | } |
2931 | | |
2932 | 42 | static void cram_ref_incr_locked(refs_t *r, int id) { |
2933 | 42 | RP("%d INC REF %d, %d %p\n", gettid(), id, |
2934 | 42 | (int)(id>=0 && r->ref_id[id]?r->ref_id[id]->count+1:-999), |
2935 | 42 | id>=0 && r->ref_id[id]?r->ref_id[id]->seq:(char *)1); |
2936 | | |
2937 | 42 | if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq) |
2938 | 42 | return; |
2939 | | |
2940 | 0 | if (r->last_id == id) |
2941 | 0 | r->last_id = -1; |
2942 | |
|
2943 | 0 | ++r->ref_id[id]->count; |
2944 | 0 | } |
2945 | | |
2946 | 42 | void cram_ref_incr(refs_t *r, int id) { |
2947 | 42 | pthread_mutex_lock(&r->lock); |
2948 | 42 | cram_ref_incr_locked(r, id); |
2949 | 42 | pthread_mutex_unlock(&r->lock); |
2950 | 42 | } |
2951 | | |
2952 | 0 | static void cram_ref_decr_locked(refs_t *r, int id) { |
2953 | 0 | RP("%d DEC REF %d, %d %p\n", gettid(), id, |
2954 | 0 | (int)(id>=0 && r->ref_id[id]?r->ref_id[id]->count-1:-999), |
2955 | 0 | id>=0 && r->ref_id[id]?r->ref_id[id]->seq:(char *)1); |
2956 | |
|
2957 | 0 | if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq) { |
2958 | 0 | return; |
2959 | 0 | } |
2960 | | |
2961 | 0 | if (--r->ref_id[id]->count <= 0) { |
2962 | 0 | assert(r->ref_id[id]->count == 0); |
2963 | 0 | if (r->last_id >= 0) { |
2964 | 0 | if (r->ref_id[r->last_id]->count <= 0 && |
2965 | 0 | r->ref_id[r->last_id]->seq) { |
2966 | 0 | RP("%d FREE REF %d (%p)\n", gettid(), |
2967 | 0 | r->last_id, r->ref_id[r->last_id]->seq); |
2968 | 0 | ref_entry_free_seq(r->ref_id[r->last_id]); |
2969 | 0 | if (r->ref_id[r->last_id]->is_md5) r->ref_id[r->last_id]->length = 0; |
2970 | 0 | } |
2971 | 0 | } |
2972 | 0 | r->last_id = id; |
2973 | 0 | } |
2974 | 0 | } |
2975 | | |
2976 | 0 | void cram_ref_decr(refs_t *r, int id) { |
2977 | 0 | pthread_mutex_lock(&r->lock); |
2978 | 0 | cram_ref_decr_locked(r, id); |
2979 | 0 | pthread_mutex_unlock(&r->lock); |
2980 | 0 | } |
2981 | | |
2982 | | /* |
2983 | | * Used by cram_ref_load and cram_get_ref. The file handle will have |
2984 | | * already been opened, so we can catch it. The ref_entry *e informs us |
2985 | | * of whether this is a multi-line fasta file or a raw MD5 style file. |
2986 | | * Either way we create a single contiguous sequence. |
2987 | | * |
2988 | | * Returns all or part of a reference sequence on success (malloced); |
2989 | | * NULL on failure. |
2990 | | */ |
2991 | | static char *load_ref_portion(BGZF *fp, ref_entry *e, |
2992 | 0 | hts_pos_t start, hts_pos_t end) { |
2993 | 0 | off_t offset, len; |
2994 | 0 | char *seq; |
2995 | |
|
2996 | 0 | if (end < start) |
2997 | 0 | end = start; |
2998 | | |
2999 | | /* |
3000 | | * Compute locations in file. This is trivial for the MD5 files, but |
3001 | | * is still necessary for the fasta variants. |
3002 | | * |
3003 | | * Note the offset here, as with faidx, has the assumption that white- |
3004 | | * space (the diff between line_length and bases_per_line) only occurs |
3005 | | * at the end of a line of text. |
3006 | | */ |
3007 | 0 | offset = e->line_length |
3008 | 0 | ? e->offset + (start-1)/e->bases_per_line * e->line_length + |
3009 | 0 | (start-1) % e->bases_per_line |
3010 | 0 | : start-1; |
3011 | |
|
3012 | 0 | len = (e->line_length |
3013 | 0 | ? e->offset + (end-1)/e->bases_per_line * e->line_length + |
3014 | 0 | (end-1) % e->bases_per_line |
3015 | 0 | : end-1) - offset + 1; |
3016 | |
|
3017 | 0 | if (bgzf_useek(fp, offset, SEEK_SET) < 0) { |
3018 | 0 | perror("bgzf_useek() on reference file"); |
3019 | 0 | return NULL; |
3020 | 0 | } |
3021 | | |
3022 | 0 | if (len == 0 || !(seq = malloc(len))) { |
3023 | 0 | return NULL; |
3024 | 0 | } |
3025 | | |
3026 | 0 | if (len != bgzf_read(fp, seq, len)) { |
3027 | 0 | perror("bgzf_read() on reference file"); |
3028 | 0 | free(seq); |
3029 | 0 | return NULL; |
3030 | 0 | } |
3031 | | |
3032 | | /* Strip white-space if required. */ |
3033 | 0 | if (len != end-start+1) { |
3034 | 0 | hts_pos_t i, j; |
3035 | 0 | char *cp = seq; |
3036 | 0 | char *cp_to; |
3037 | | |
3038 | | // Copy up to the first white-space, and then repeatedly just copy |
3039 | | // bases_per_line verbatim, and use the slow method to end again. |
3040 | | // |
3041 | | // This may seem excessive, but this code can be a significant |
3042 | | // portion of total CRAM decode CPU time for shallow data sets. |
3043 | 0 | for (i = j = 0; i < len; i++) { |
3044 | 0 | if (!isspace_c(cp[i])) |
3045 | 0 | cp[j++] = cp[i] & ~0x20; |
3046 | 0 | else |
3047 | 0 | break; |
3048 | 0 | } |
3049 | 0 | while (i < len && isspace_c(cp[i])) |
3050 | 0 | i++; |
3051 | 0 | while (i < len - e->line_length) { |
3052 | 0 | hts_pos_t j_end = j + e->bases_per_line; |
3053 | 0 | while (j < j_end) |
3054 | 0 | cp[j++] = cp[i++] & ~0x20; // toupper equiv |
3055 | 0 | i += e->line_length - e->bases_per_line; |
3056 | 0 | } |
3057 | 0 | for (; i < len; i++) { |
3058 | 0 | if (!isspace_c(cp[i])) |
3059 | 0 | cp[j++] = cp[i] & ~0x20; |
3060 | 0 | } |
3061 | |
|
3062 | 0 | cp_to = cp+j; |
3063 | |
|
3064 | 0 | if (cp_to - seq != end-start+1) { |
3065 | 0 | hts_log_error("Malformed reference file"); |
3066 | 0 | free(seq); |
3067 | 0 | return NULL; |
3068 | 0 | } |
3069 | 0 | } else { |
3070 | 0 | int i; |
3071 | 0 | for (i = 0; i < len; i++) { |
3072 | 0 | seq[i] = toupper_c(seq[i]); |
3073 | 0 | } |
3074 | 0 | } |
3075 | | |
3076 | 0 | return seq; |
3077 | 0 | } |
3078 | | |
3079 | | /* |
3080 | | * Load the entire reference 'id'. |
3081 | | * This also increments the reference count by 1. |
3082 | | * |
3083 | | * Returns ref_entry on success; |
3084 | | * NULL on failure |
3085 | | */ |
3086 | 0 | ref_entry *cram_ref_load(refs_t *r, int id, int is_md5) { |
3087 | 0 | ref_entry *e = r->ref_id[id]; |
3088 | 0 | hts_pos_t start = 1, end = e->length; |
3089 | 0 | char *seq; |
3090 | |
|
3091 | 0 | if (e->seq) { |
3092 | 0 | return e; |
3093 | 0 | } |
3094 | | |
3095 | 0 | assert(e->count == 0); |
3096 | |
|
3097 | 0 | if (r->last) { |
3098 | | #ifdef REF_DEBUG |
3099 | | int idx = 0; |
3100 | | for (idx = 0; idx < r->nref; idx++) |
3101 | | if (r->last == r->ref_id[idx]) |
3102 | | break; |
3103 | | RP("%d cram_ref_load DECR %d\n", gettid(), idx); |
3104 | | #endif |
3105 | 0 | assert(r->last->count > 0); |
3106 | 0 | if (--r->last->count <= 0) { |
3107 | 0 | RP("%d FREE REF %d (%p)\n", gettid(), id, r->ref_id[id]->seq); |
3108 | 0 | if (r->last->seq) |
3109 | 0 | ref_entry_free_seq(r->last); |
3110 | 0 | } |
3111 | 0 | } |
3112 | |
|
3113 | 0 | if (!r->fn) |
3114 | 0 | return NULL; |
3115 | | |
3116 | | /* Open file if it's not already the current open reference */ |
3117 | 0 | if (strcmp(r->fn, e->fn) || r->fp == NULL) { |
3118 | 0 | if (r->fp) |
3119 | 0 | if (bgzf_close(r->fp) != 0) |
3120 | 0 | return NULL; |
3121 | 0 | r->fn = e->fn; |
3122 | 0 | if (!(r->fp = bgzf_open_ref(r->fn, "r", is_md5))) |
3123 | 0 | return NULL; |
3124 | 0 | } |
3125 | | |
3126 | 0 | RP("%d Loading ref %d (%d..%d)\n", gettid(), id, start, end); |
3127 | |
|
3128 | 0 | if (!(seq = load_ref_portion(r->fp, e, start, end))) { |
3129 | 0 | return NULL; |
3130 | 0 | } |
3131 | | |
3132 | 0 | RP("%d Loaded ref %d (%d..%d) = %p\n", gettid(), id, start, end, seq); |
3133 | |
|
3134 | 0 | RP("%d INC REF %d, %"PRId64"\n", gettid(), id, (e->count+1)); |
3135 | 0 | e->seq = seq; |
3136 | 0 | e->mf = NULL; |
3137 | 0 | e->count++; |
3138 | | |
3139 | | /* |
3140 | | * Also keep track of last used ref so incr/decr loops on the same |
3141 | | * sequence don't cause load/free loops. |
3142 | | */ |
3143 | 0 | RP("%d cram_ref_load INCR %d => %"PRId64"\n", gettid(), id, e->count+1); |
3144 | 0 | r->last = e; |
3145 | 0 | e->count++; |
3146 | |
|
3147 | 0 | return e; |
3148 | 0 | } |
3149 | | |
3150 | | /* |
3151 | | * Returns a portion of a reference sequence from start to end inclusive. |
3152 | | * The returned pointer is owned by either the cram_file fd or by the |
3153 | | * internal refs_t structure and should not be freed by the caller. |
3154 | | * |
3155 | | * The difference is whether or not this refs_t is in use by just the one |
3156 | | * cram_fd or by multiples, or whether we have multiple threads accessing |
3157 | | * references. In either case fd->shared will be true and we start using |
3158 | | * reference counting to track the number of users of a specific reference |
3159 | | * sequence. |
3160 | | * |
3161 | | * Otherwise the ref seq returned is allocated as part of cram_fd itself |
3162 | | * and will be freed up on the next call to cram_get_ref or cram_close. |
3163 | | * |
3164 | | * To return the entire reference sequence, specify start as 1 and end |
3165 | | * as 0. |
3166 | | * |
3167 | | * To cease using a reference, call cram_ref_decr(). |
3168 | | * |
3169 | | * Returns reference on success, |
3170 | | * NULL on failure |
3171 | | */ |
3172 | 3.24k | char *cram_get_ref(cram_fd *fd, int id, hts_pos_t start, hts_pos_t end) { |
3173 | 3.24k | ref_entry *r; |
3174 | 3.24k | char *seq; |
3175 | 3.24k | int ostart = start; |
3176 | | |
3177 | 3.24k | if (id == -1 || start < 1) |
3178 | 2.96k | return NULL; |
3179 | | |
3180 | | /* FIXME: axiomatic query of r->seq being true? |
3181 | | * Or shortcut for unsorted data where we load once and never free? |
3182 | | */ |
3183 | | |
3184 | | //fd->shared_ref = 1; // hard code for now to simplify things |
3185 | | |
3186 | 284 | pthread_mutex_lock(&fd->ref_lock); |
3187 | | |
3188 | 284 | RP("%d cram_get_ref on fd %p, id %d, range %d..%d\n", gettid(), fd, id, start, end); |
3189 | | |
3190 | | /* |
3191 | | * Unsorted data implies we want to fetch an entire reference at a time. |
3192 | | * We just deal with this at the moment by claiming we're sharing |
3193 | | * references instead, which has the same requirement. |
3194 | | */ |
3195 | 284 | if (fd->unsorted) |
3196 | 0 | fd->shared_ref = 1; |
3197 | | |
3198 | | |
3199 | | /* Sanity checking: does this ID exist? */ |
3200 | 284 | if (!fd->refs || id < 0 || id >= fd->refs->nref || !fd->refs->ref_id[id]) { |
3201 | 6 | hts_log_error("No reference found for id %d", id); |
3202 | 6 | pthread_mutex_unlock(&fd->ref_lock); |
3203 | 6 | return NULL; |
3204 | 6 | } |
3205 | | |
3206 | 278 | r = fd->refs->ref_id[id]; |
3207 | | |
3208 | | /* |
3209 | | * It has an entry, but may not have been populated yet. |
3210 | | * Any manually loaded .fai files have their lengths known. |
3211 | | * A ref entry computed from @SQ lines (M5 or UR field) will have |
3212 | | * r->length == 0 unless it's been loaded once and verified that we have |
3213 | | * an on-disk filename for it. |
3214 | | * |
3215 | | * 19 Sep 2013: Moved the lock here as the cram_populate_ref code calls |
3216 | | * open_path_mfile and libcurl, which isn't multi-thread safe unless I |
3217 | | * rewrite my code to have one curl handle per thread. |
3218 | | */ |
3219 | 278 | pthread_mutex_lock(&fd->refs->lock); |
3220 | 278 | if (r->length == 0) { |
3221 | 278 | if (fd->ref_fn) |
3222 | 0 | hts_log_warning("Reference file given, but ref '%s' not present", |
3223 | 278 | r->name); |
3224 | 278 | if (cram_populate_ref(fd, id, r) == -1) { |
3225 | 278 | hts_log_warning("Failed to populate reference \"%s\"", |
3226 | 278 | r->name); |
3227 | 278 | hts_log_warning("See https://www.htslib.org/doc/reference_seqs.html for further suggestions"); |
3228 | 278 | pthread_mutex_unlock(&fd->refs->lock); |
3229 | 278 | pthread_mutex_unlock(&fd->ref_lock); |
3230 | 278 | return NULL; |
3231 | 278 | } |
3232 | 0 | r = fd->refs->ref_id[id]; |
3233 | 0 | if (fd->unsorted) |
3234 | 0 | cram_ref_incr_locked(fd->refs, id); |
3235 | 0 | } |
3236 | | |
3237 | | |
3238 | | /* |
3239 | | * We now know that we the filename containing the reference, so check |
3240 | | * for limits. If it's over half the reference we'll load all of it in |
3241 | | * memory as this will speed up subsequent calls. |
3242 | | */ |
3243 | 0 | if (end < 1) |
3244 | 0 | end = r->length; |
3245 | 0 | if (end >= r->length) |
3246 | 0 | end = r->length; |
3247 | |
|
3248 | 0 | if (end - start >= 0.5*r->length || fd->shared_ref) { |
3249 | 0 | start = 1; |
3250 | 0 | end = r->length; |
3251 | 0 | } |
3252 | | |
3253 | | /* |
3254 | | * Maybe we have it cached already? If so use it. |
3255 | | * |
3256 | | * Alternatively if we don't have the sequence but we're sharing |
3257 | | * references and/or are asking for the entire length of it, then |
3258 | | * load the full reference into the refs structure and return |
3259 | | * a pointer to that one instead. |
3260 | | */ |
3261 | 0 | if (fd->shared_ref || r->seq || (start == 1 && end == r->length)) { |
3262 | 0 | char *cp; |
3263 | |
|
3264 | 0 | if (id >= 0) { |
3265 | 0 | if (r->seq) { |
3266 | 0 | cram_ref_incr_locked(fd->refs, id); |
3267 | 0 | } else { |
3268 | 0 | ref_entry *e; |
3269 | 0 | if (!(e = cram_ref_load(fd->refs, id, r->is_md5))) { |
3270 | 0 | pthread_mutex_unlock(&fd->refs->lock); |
3271 | 0 | pthread_mutex_unlock(&fd->ref_lock); |
3272 | 0 | return NULL; |
3273 | 0 | } |
3274 | | |
3275 | | /* unsorted data implies cache ref indefinitely, to avoid |
3276 | | * continually loading and unloading. |
3277 | | */ |
3278 | 0 | if (fd->unsorted) |
3279 | 0 | cram_ref_incr_locked(fd->refs, id); |
3280 | 0 | } |
3281 | | |
3282 | 0 | fd->ref = NULL; /* We never access it directly */ |
3283 | 0 | fd->ref_start = 1; |
3284 | 0 | fd->ref_end = r->length; |
3285 | 0 | fd->ref_id = id; |
3286 | |
|
3287 | 0 | cp = fd->refs->ref_id[id]->seq + ostart-1; |
3288 | 0 | } else { |
3289 | 0 | fd->ref = NULL; |
3290 | 0 | cp = NULL; |
3291 | 0 | } |
3292 | | |
3293 | 0 | RP("%d cram_get_ref returning for id %d, count %d\n", gettid(), id, (int)r->count); |
3294 | |
|
3295 | 0 | pthread_mutex_unlock(&fd->refs->lock); |
3296 | 0 | pthread_mutex_unlock(&fd->ref_lock); |
3297 | 0 | return cp; |
3298 | 0 | } |
3299 | | |
3300 | | /* |
3301 | | * Otherwise we're not sharing, we don't have a copy of it already and |
3302 | | * we're only asking for a small portion of it. |
3303 | | * |
3304 | | * In this case load up just that segment ourselves, freeing any old |
3305 | | * small segments in the process. |
3306 | | */ |
3307 | | |
3308 | | /* Unmapped ref ID */ |
3309 | 0 | if (id < 0 || !fd->refs->fn) { |
3310 | 0 | if (fd->ref_free) { |
3311 | 0 | free(fd->ref_free); |
3312 | 0 | fd->ref_free = NULL; |
3313 | 0 | } |
3314 | 0 | fd->ref = NULL; |
3315 | 0 | fd->ref_id = id; |
3316 | 0 | pthread_mutex_unlock(&fd->refs->lock); |
3317 | 0 | pthread_mutex_unlock(&fd->ref_lock); |
3318 | 0 | return NULL; |
3319 | 0 | } |
3320 | | |
3321 | | /* Open file if it's not already the current open reference */ |
3322 | 0 | if (strcmp(fd->refs->fn, r->fn) || fd->refs->fp == NULL) { |
3323 | 0 | if (fd->refs->fp) |
3324 | 0 | if (bgzf_close(fd->refs->fp) != 0) |
3325 | 0 | return NULL; |
3326 | 0 | fd->refs->fn = r->fn; |
3327 | 0 | if (!(fd->refs->fp = bgzf_open_ref(fd->refs->fn, "r", r->is_md5))) { |
3328 | 0 | pthread_mutex_unlock(&fd->refs->lock); |
3329 | 0 | pthread_mutex_unlock(&fd->ref_lock); |
3330 | 0 | return NULL; |
3331 | 0 | } |
3332 | 0 | } |
3333 | | |
3334 | 0 | if (!(fd->ref = load_ref_portion(fd->refs->fp, r, start, end))) { |
3335 | 0 | pthread_mutex_unlock(&fd->refs->lock); |
3336 | 0 | pthread_mutex_unlock(&fd->ref_lock); |
3337 | 0 | return NULL; |
3338 | 0 | } |
3339 | | |
3340 | 0 | if (fd->ref_free) |
3341 | 0 | free(fd->ref_free); |
3342 | |
|
3343 | 0 | fd->ref_id = id; |
3344 | 0 | fd->ref_start = start; |
3345 | 0 | fd->ref_end = end; |
3346 | 0 | fd->ref_free = fd->ref; |
3347 | 0 | seq = fd->ref; |
3348 | |
|
3349 | 0 | pthread_mutex_unlock(&fd->refs->lock); |
3350 | 0 | pthread_mutex_unlock(&fd->ref_lock); |
3351 | |
|
3352 | 0 | return seq ? seq + ostart - start : NULL; |
3353 | 0 | } |
3354 | | |
3355 | | /* |
3356 | | * If fd has been opened for reading, it may be permitted to specify 'fn' |
3357 | | * as NULL and let the code auto-detect the reference by parsing the |
3358 | | * SAM header @SQ lines. |
3359 | | */ |
3360 | 0 | int cram_load_reference(cram_fd *fd, char *fn) { |
3361 | 0 | int ret = 0; |
3362 | |
|
3363 | 0 | if (fn) { |
3364 | 0 | fd->refs = refs_load_fai(fd->refs, fn, |
3365 | 0 | !(fd->embed_ref>0 && fd->mode == 'r')); |
3366 | 0 | fn = fd->refs ? fd->refs->fn : NULL; |
3367 | 0 | if (!fn) |
3368 | 0 | ret = -1; |
3369 | 0 | sanitise_SQ_lines(fd); |
3370 | 0 | } |
3371 | 0 | fd->ref_fn = fn; |
3372 | |
|
3373 | 0 | if ((!fd->refs || (fd->refs->nref == 0 && !fn)) && fd->header) { |
3374 | 0 | if (fd->refs) |
3375 | 0 | refs_free(fd->refs); |
3376 | 0 | if (!(fd->refs = refs_create())) |
3377 | 0 | return -1; |
3378 | 0 | if (-1 == refs_from_header(fd)) |
3379 | 0 | return -1; |
3380 | 0 | } |
3381 | | |
3382 | 0 | if (fd->header) |
3383 | 0 | if (-1 == refs2id(fd->refs, fd->header)) |
3384 | 0 | return -1; |
3385 | | |
3386 | 0 | return ret; |
3387 | 0 | } |
3388 | | |
3389 | | /* ---------------------------------------------------------------------- |
3390 | | * Containers |
3391 | | */ |
3392 | | |
3393 | | /* |
3394 | | * Creates a new container, specifying the maximum number of slices |
3395 | | * and records permitted. |
3396 | | * |
3397 | | * Returns cram_container ptr on success |
3398 | | * NULL on failure |
3399 | | */ |
3400 | 8.64k | cram_container *cram_new_container(int nrec, int nslice) { |
3401 | 8.64k | cram_container *c = calloc(1, sizeof(*c)); |
3402 | 8.64k | enum cram_DS_ID id; |
3403 | | |
3404 | 8.64k | if (!c) |
3405 | 0 | return NULL; |
3406 | | |
3407 | 8.64k | c->curr_ref = -2; |
3408 | | |
3409 | 8.64k | c->max_c_rec = nrec * nslice; |
3410 | 8.64k | c->curr_c_rec = 0; |
3411 | | |
3412 | 8.64k | c->max_rec = nrec; |
3413 | 8.64k | c->record_counter = 0; |
3414 | 8.64k | c->num_bases = 0; |
3415 | 8.64k | c->s_num_bases = 0; |
3416 | | |
3417 | 8.64k | c->max_slice = nslice; |
3418 | 8.64k | c->curr_slice = 0; |
3419 | | |
3420 | 8.64k | c->pos_sorted = 1; |
3421 | 8.64k | c->max_apos = 0; |
3422 | 8.64k | c->multi_seq = 0; |
3423 | 8.64k | c->qs_seq_orient = 1; |
3424 | 8.64k | c->no_ref = 0; |
3425 | 8.64k | c->embed_ref = -1; // automatic selection |
3426 | | |
3427 | 8.64k | c->bams = NULL; |
3428 | | |
3429 | 8.64k | if (!(c->slices = calloc(nslice != 0 ? nslice : 1, sizeof(cram_slice *)))) |
3430 | 0 | goto err; |
3431 | 8.64k | c->slice = NULL; |
3432 | | |
3433 | 8.64k | if (!(c->comp_hdr = cram_new_compression_header())) |
3434 | 0 | goto err; |
3435 | 8.64k | c->comp_hdr_block = NULL; |
3436 | | |
3437 | 250k | for (id = DS_RN; id < DS_TN; id++) |
3438 | 242k | if (!(c->stats[id] = cram_stats_create())) goto err; |
3439 | | |
3440 | | //c->aux_B_stats = cram_stats_create(); |
3441 | | |
3442 | 8.64k | if (!(c->tags_used = kh_init(m_tagmap))) |
3443 | 0 | goto err; |
3444 | 8.64k | c->refs_used = 0; |
3445 | 8.64k | c->ref_free = 0; |
3446 | | |
3447 | 8.64k | return c; |
3448 | | |
3449 | 0 | err: |
3450 | 0 | if (c) { |
3451 | 0 | if (c->slices) |
3452 | 0 | free(c->slices); |
3453 | 0 | free(c); |
3454 | 0 | } |
3455 | 0 | return NULL; |
3456 | 8.64k | } |
3457 | | |
3458 | 2.62k | static void free_bam_list(bam_seq_t *bams, int max_rec) { |
3459 | 2.62k | int i; |
3460 | 26.2M | for (i = 0; i < max_rec; i++) |
3461 | 26.2M | bam_free(&bams[i]); |
3462 | | |
3463 | 2.62k | free(bams); |
3464 | 2.62k | } |
3465 | | |
3466 | 19.6k | void cram_free_container(cram_container *c) { |
3467 | 19.6k | enum cram_DS_ID id; |
3468 | 19.6k | int i; |
3469 | | |
3470 | 19.6k | if (!c) |
3471 | 0 | return; |
3472 | | |
3473 | 19.6k | if (c->refs_used) |
3474 | 9 | free(c->refs_used); |
3475 | | |
3476 | 19.6k | if (c->landmark) |
3477 | 9.48k | free(c->landmark); |
3478 | | |
3479 | 19.6k | if (c->comp_hdr) |
3480 | 8.98k | cram_free_compression_header(c->comp_hdr); |
3481 | | |
3482 | 19.6k | if (c->comp_hdr_block) |
3483 | 8.06k | cram_free_block(c->comp_hdr_block); |
3484 | | |
3485 | | // Free the slices; filled out by encoder only |
3486 | 19.6k | if (c->slices) { |
3487 | 11.8k | for (i = 0; i < c->max_slice; i++) { |
3488 | 3.20k | if (c->slices[i]) |
3489 | 2.62k | cram_free_slice(c->slices[i]); |
3490 | 3.20k | if (c->slices[i] == c->slice) |
3491 | 3.20k | c->slice = NULL; |
3492 | 3.20k | } |
3493 | 8.64k | free(c->slices); |
3494 | 8.64k | } |
3495 | | |
3496 | | // Free the current slice; set by both encoder & decoder |
3497 | 19.6k | if (c->slice) { |
3498 | 0 | cram_free_slice(c->slice); |
3499 | 0 | c->slice = NULL; |
3500 | 0 | } |
3501 | | |
3502 | 570k | for (id = DS_RN; id < DS_TN; id++) |
3503 | 550k | if (c->stats[id]) cram_stats_free(c->stats[id]); |
3504 | | |
3505 | | //if (c->aux_B_stats) cram_stats_free(c->aux_B_stats); |
3506 | | |
3507 | 19.6k | if (c->tags_used) { |
3508 | 8.64k | khint_t k; |
3509 | | |
3510 | 13.3k | for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) { |
3511 | 4.72k | if (!kh_exist(c->tags_used, k)) |
3512 | 2.73k | continue; |
3513 | | |
3514 | 1.98k | cram_tag_map *tm = (cram_tag_map *)kh_val(c->tags_used, k); |
3515 | 1.98k | if (tm) { |
3516 | 1.98k | cram_codec *c = tm->codec; |
3517 | | |
3518 | 1.98k | if (c) c->free(c); |
3519 | | |
3520 | | // If tm->blk or tm->blk2 is set, then we haven't yet got to |
3521 | | // cram_encode_container which copies the blocks to s->aux_block |
3522 | | // and NULLifies tm->blk*. In this case we failed to complete |
3523 | | // the container construction, so we have to free up our partially |
3524 | | // converted CRAM. |
3525 | 1.98k | cram_free_block(tm->blk); |
3526 | 1.98k | cram_free_block(tm->blk2); |
3527 | 1.98k | free(tm); |
3528 | 1.98k | } |
3529 | 1.98k | } |
3530 | | |
3531 | 8.64k | kh_destroy(m_tagmap, c->tags_used); |
3532 | 8.64k | } |
3533 | | |
3534 | 19.6k | if (c->ref_free) |
3535 | 152 | free(c->ref); |
3536 | | |
3537 | 19.6k | if (c->bams) |
3538 | 292 | free_bam_list(c->bams, c->max_c_rec); |
3539 | | |
3540 | 19.6k | free(c); |
3541 | 19.6k | } |
3542 | | |
3543 | | /* |
3544 | | * Reads a container header. |
3545 | | * |
3546 | | * Returns cram_container on success |
3547 | | * NULL on failure or no container left (fd->err == 0). |
3548 | | */ |
3549 | 11.3k | cram_container *cram_read_container(cram_fd *fd) { |
3550 | 11.3k | cram_container c2, *c; |
3551 | 11.3k | int i, s; |
3552 | 11.3k | size_t rd = 0; |
3553 | 11.3k | uint32_t crc = 0; |
3554 | | |
3555 | 11.3k | fd->err = 0; |
3556 | 11.3k | fd->eof = 0; |
3557 | | |
3558 | 11.3k | memset(&c2, 0, sizeof(c2)); |
3559 | 11.3k | if (CRAM_MAJOR_VERS(fd->version) == 1) { |
3560 | 8.67k | if ((s = fd->vv.varint_decode32_crc(fd, &c2.length, &crc)) == -1) { |
3561 | 27 | fd->eof = fd->empty_container ? 1 : 2; |
3562 | 27 | return NULL; |
3563 | 8.64k | } else { |
3564 | 8.64k | rd+=s; |
3565 | 8.64k | } |
3566 | 8.67k | } else if (CRAM_MAJOR_VERS(fd->version) < 4) { |
3567 | 2.33k | uint32_t len; |
3568 | 2.33k | if ((s = int32_decode(fd, &c2.length)) == -1) { |
3569 | 21 | if (CRAM_MAJOR_VERS(fd->version) == 2 && |
3570 | 8 | CRAM_MINOR_VERS(fd->version) == 0) |
3571 | 4 | fd->eof = 1; // EOF blocks arrived in v2.1 |
3572 | 17 | else |
3573 | 17 | fd->eof = fd->empty_container ? 1 : 2; |
3574 | 21 | return NULL; |
3575 | 2.31k | } else { |
3576 | 2.31k | rd+=s; |
3577 | 2.31k | } |
3578 | 2.31k | len = le_int4(c2.length); |
3579 | 2.31k | crc = crc32(0L, (unsigned char *)&len, 4); |
3580 | 2.31k | } |
3581 | 11.2k | if ((s = fd->vv.varint_decode32s_crc(fd, &c2.ref_seq_id, &crc)) == -1) return NULL; else rd+=s; |
3582 | 11.2k | int32_t i32; |
3583 | 11.2k | if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc))== -1) return NULL; else rd+=s; |
3584 | 11.1k | c2.ref_seq_start = i32; |
3585 | 11.1k | if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc)) == -1) return NULL; else rd+=s; |
3586 | 11.1k | c2.ref_seq_span = i32; |
3587 | 11.1k | if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_records, &crc)) == -1) return NULL; else rd+=s; |
3588 | | |
3589 | 11.1k | if (CRAM_MAJOR_VERS(fd->version) == 1) { |
3590 | 8.58k | c2.record_counter = 0; |
3591 | 8.58k | c2.num_bases = 0; |
3592 | 8.58k | } else { |
3593 | 2.56k | if (CRAM_MAJOR_VERS(fd->version) >= 3) { |
3594 | 1.25k | if ((s = fd->vv.varint_decode64_crc(fd, &c2.record_counter, &crc)) == -1) |
3595 | 21 | return NULL; |
3596 | 1.23k | else |
3597 | 1.23k | rd += s; |
3598 | 1.30k | } else { |
3599 | 1.30k | int32_t i32; |
3600 | 1.30k | if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc)) == -1) |
3601 | 4 | return NULL; |
3602 | 1.30k | else |
3603 | 1.30k | rd += s; |
3604 | 1.30k | c2.record_counter = i32; |
3605 | 1.30k | } |
3606 | | |
3607 | 2.53k | if ((s = fd->vv.varint_decode64_crc(fd, &c2.num_bases, &crc))== -1) |
3608 | 35 | return NULL; |
3609 | 2.50k | else |
3610 | 2.50k | rd += s; |
3611 | 2.53k | } |
3612 | 11.0k | if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_blocks, &crc)) == -1) |
3613 | 15 | return NULL; |
3614 | 11.0k | else |
3615 | 11.0k | rd+=s; |
3616 | 11.0k | if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_landmarks, &crc))== -1) |
3617 | 35 | return NULL; |
3618 | 11.0k | else |
3619 | 11.0k | rd+=s; |
3620 | | |
3621 | 11.0k | if (c2.num_landmarks < 0 || c2.num_landmarks >= SIZE_MAX / sizeof(int32_t)) |
3622 | 25 | return NULL; |
3623 | | |
3624 | 11.0k | if (!(c = calloc(1, sizeof(*c)))) |
3625 | 0 | return NULL; |
3626 | | |
3627 | 11.0k | *c = c2; |
3628 | 11.0k | #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
3629 | 11.0k | if (c->num_landmarks > FUZZ_ALLOC_LIMIT/sizeof(int32_t)) { |
3630 | 5 | fd->err = errno = ENOMEM; |
3631 | 5 | cram_free_container(c); |
3632 | 5 | return NULL; |
3633 | 5 | } |
3634 | 11.0k | #endif |
3635 | 11.0k | if (c->num_landmarks && !(c->landmark = hts_malloc_p(sizeof(*c->landmark), c->num_landmarks))) { |
3636 | 0 | fd->err = errno; |
3637 | 0 | cram_free_container(c); |
3638 | 0 | return NULL; |
3639 | 0 | } |
3640 | 45.7k | for (i = 0; i < c->num_landmarks; i++) { |
3641 | 34.8k | if ((s = fd->vv.varint_decode32_crc(fd, &c->landmark[i], &crc)) == -1) { |
3642 | 158 | cram_free_container(c); |
3643 | 158 | return NULL; |
3644 | 34.7k | } else { |
3645 | 34.7k | rd += s; |
3646 | 34.7k | } |
3647 | 34.8k | } |
3648 | | |
3649 | 10.8k | if (CRAM_MAJOR_VERS(fd->version) >= 3) { |
3650 | 1.11k | if (-1 == int32_decode(fd, (int32_t *)&c->crc32)) { |
3651 | 5 | cram_free_container(c); |
3652 | 5 | return NULL; |
3653 | 1.11k | } else { |
3654 | 1.11k | rd+=4; |
3655 | 1.11k | } |
3656 | | |
3657 | 1.11k | #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
3658 | | // Pretend the CRC was OK so the fuzzer doesn't have to get it right |
3659 | 1.11k | crc = c->crc32; |
3660 | 1.11k | #endif |
3661 | | |
3662 | 1.11k | if (crc != c->crc32) { |
3663 | 0 | hts_log_error("Container header CRC32 failure"); |
3664 | 0 | cram_free_container(c); |
3665 | 0 | return NULL; |
3666 | 0 | } |
3667 | 1.11k | } |
3668 | | |
3669 | 10.8k | c->offset = rd; |
3670 | 10.8k | c->slices = NULL; |
3671 | 10.8k | c->slice = NULL; |
3672 | 10.8k | c->curr_slice = 0; |
3673 | 10.8k | c->max_slice = c->num_landmarks; |
3674 | 10.8k | c->slice_rec = 0; |
3675 | 10.8k | c->curr_rec = 0; |
3676 | 10.8k | c->max_rec = 0; |
3677 | | |
3678 | 10.8k | if (c->ref_seq_id == -2) { |
3679 | 12 | c->multi_seq = 1; |
3680 | 12 | fd->multi_seq = 1; |
3681 | 12 | } |
3682 | | |
3683 | 10.8k | fd->empty_container = |
3684 | 10.8k | (c->num_records == 0 && |
3685 | 5.37k | c->ref_seq_id == -1 && |
3686 | 10.8k | c->ref_seq_start == 0x454f46 /* EOF */) ? 1 : 0; |
3687 | | |
3688 | 10.8k | return c; |
3689 | 10.8k | } |
3690 | | |
3691 | | |
3692 | | /* MAXIMUM storage size needed for the container. */ |
3693 | 0 | int cram_container_size(cram_container *c) { |
3694 | 0 | return 55 + 5*c->num_landmarks; |
3695 | 0 | } |
3696 | | |
3697 | | |
3698 | | /* |
3699 | | * Stores the container structure in dat and returns *size as the |
3700 | | * number of bytes written to dat[]. The input size of dat is also |
3701 | | * held in *size and should be initialised to cram_container_size(c). |
3702 | | * |
3703 | | * Returns 0 on success; |
3704 | | * -1 on failure |
3705 | | */ |
3706 | | int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size) |
3707 | 0 | { |
3708 | 0 | char *cp = (char *)dat; |
3709 | 0 | int i; |
3710 | | |
3711 | | // Check the input buffer is large enough according to our stated |
3712 | | // requirements. (NOTE: it may actually take less.) |
3713 | 0 | if (cram_container_size(c) > *size) |
3714 | 0 | return -1; |
3715 | | |
3716 | 0 | if (CRAM_MAJOR_VERS(fd->version) == 1) { |
3717 | 0 | cp += itf8_put(cp, c->length); |
3718 | 0 | } else { |
3719 | 0 | *(int32_t *)cp = le_int4(c->length); |
3720 | 0 | cp += 4; |
3721 | 0 | } |
3722 | 0 | if (c->multi_seq) { |
3723 | 0 | cp += fd->vv.varint_put32(cp, NULL, -2); |
3724 | 0 | cp += fd->vv.varint_put32(cp, NULL, 0); |
3725 | 0 | cp += fd->vv.varint_put32(cp, NULL, 0); |
3726 | 0 | } else { |
3727 | 0 | cp += fd->vv.varint_put32s(cp, NULL, c->ref_seq_id); |
3728 | 0 | cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_start); |
3729 | 0 | cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_span); |
3730 | 0 | } |
3731 | 0 | cp += fd->vv.varint_put32(cp, NULL, c->num_records); |
3732 | 0 | if (CRAM_MAJOR_VERS(fd->version) == 2) { |
3733 | 0 | cp += fd->vv.varint_put64(cp, NULL, c->record_counter); |
3734 | 0 | } else if (CRAM_MAJOR_VERS(fd->version) >= 3) { |
3735 | 0 | cp += fd->vv.varint_put32(cp, NULL, c->record_counter); |
3736 | 0 | } |
3737 | 0 | cp += fd->vv.varint_put64(cp, NULL, c->num_bases); |
3738 | 0 | cp += fd->vv.varint_put32(cp, NULL, c->num_blocks); |
3739 | 0 | cp += fd->vv.varint_put32(cp, NULL, c->num_landmarks); |
3740 | 0 | for (i = 0; i < c->num_landmarks; i++) |
3741 | 0 | cp += fd->vv.varint_put32(cp, NULL, c->landmark[i]); |
3742 | |
|
3743 | 0 | if (CRAM_MAJOR_VERS(fd->version) >= 3) { |
3744 | 0 | c->crc32 = crc32(0L, (uc *)dat, cp-dat); |
3745 | 0 | cp[0] = c->crc32 & 0xff; |
3746 | 0 | cp[1] = (c->crc32 >> 8) & 0xff; |
3747 | 0 | cp[2] = (c->crc32 >> 16) & 0xff; |
3748 | 0 | cp[3] = (c->crc32 >> 24) & 0xff; |
3749 | 0 | cp += 4; |
3750 | 0 | } |
3751 | |
|
3752 | 0 | *size = cp-dat; // actual used size |
3753 | |
|
3754 | 0 | return 0; |
3755 | 0 | } |
3756 | | |
3757 | | |
3758 | | /* |
3759 | | * Writes a container structure. |
3760 | | * |
3761 | | * Returns 0 on success |
3762 | | * -1 on failure |
3763 | | */ |
3764 | 16.8k | int cram_write_container(cram_fd *fd, cram_container *c) { |
3765 | 16.8k | char buf_a[1024], *buf = buf_a, *cp; |
3766 | 16.8k | int i; |
3767 | | |
3768 | 16.8k | if (61 + c->num_landmarks * 10 >= 1024) { |
3769 | 0 | buf = malloc(61 + c->num_landmarks * 10); |
3770 | 0 | if (!buf) |
3771 | 0 | return -1; |
3772 | 0 | } |
3773 | 16.8k | cp = buf; |
3774 | | |
3775 | 16.8k | if (CRAM_MAJOR_VERS(fd->version) == 1) { |
3776 | 0 | cp += itf8_put(cp, c->length); |
3777 | 16.8k | } else if (CRAM_MAJOR_VERS(fd->version) <= 3) { |
3778 | 16.8k | *(int32_t *)cp = le_int4(c->length); |
3779 | 16.8k | cp += 4; |
3780 | 16.8k | } else { |
3781 | 0 | cp += fd->vv.varint_put32(cp, NULL, c->length); |
3782 | 0 | } |
3783 | 16.8k | if (c->multi_seq) { |
3784 | 8 | cp += fd->vv.varint_put32(cp, NULL, (uint32_t)-2); |
3785 | 8 | cp += fd->vv.varint_put32(cp, NULL, 0); |
3786 | 8 | cp += fd->vv.varint_put32(cp, NULL, 0); |
3787 | 16.7k | } else { |
3788 | 16.7k | cp += fd->vv.varint_put32s(cp, NULL, c->ref_seq_id); |
3789 | 16.7k | cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_start); |
3790 | 16.7k | cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_span); |
3791 | 16.7k | } |
3792 | 16.8k | cp += fd->vv.varint_put32(cp, NULL, c->num_records); |
3793 | 16.8k | if (CRAM_MAJOR_VERS(fd->version) >= 3) |
3794 | 16.8k | cp += fd->vv.varint_put64(cp, NULL, c->record_counter); |
3795 | 0 | else |
3796 | 0 | cp += fd->vv.varint_put32(cp, NULL, c->record_counter); |
3797 | 16.8k | cp += fd->vv.varint_put64(cp, NULL, c->num_bases); |
3798 | 16.8k | cp += fd->vv.varint_put32(cp, NULL, c->num_blocks); |
3799 | 16.8k | cp += fd->vv.varint_put32(cp, NULL, c->num_landmarks); |
3800 | 30.5k | for (i = 0; i < c->num_landmarks; i++) |
3801 | 13.7k | cp += fd->vv.varint_put32(cp, NULL, c->landmark[i]); |
3802 | | |
3803 | 16.8k | if (CRAM_MAJOR_VERS(fd->version) >= 3) { |
3804 | 16.8k | c->crc32 = crc32(0L, (uc *)buf, cp-buf); |
3805 | 16.8k | cp[0] = c->crc32 & 0xff; |
3806 | 16.8k | cp[1] = (c->crc32 >> 8) & 0xff; |
3807 | 16.8k | cp[2] = (c->crc32 >> 16) & 0xff; |
3808 | 16.8k | cp[3] = (c->crc32 >> 24) & 0xff; |
3809 | 16.8k | cp += 4; |
3810 | 16.8k | } |
3811 | | |
3812 | 16.8k | if (cp-buf != hwrite(fd->fp, buf, cp-buf)) { |
3813 | 0 | if (buf != buf_a) |
3814 | 0 | free(buf); |
3815 | 0 | return -1; |
3816 | 0 | } |
3817 | | |
3818 | 16.8k | if (buf != buf_a) |
3819 | 0 | free(buf); |
3820 | | |
3821 | 16.8k | return 0; |
3822 | 16.8k | } |
3823 | | |
3824 | | // common component shared by cram_flush_container{,_mt} |
3825 | 2.89k | static int cram_flush_container2(cram_fd *fd, cram_container *c) { |
3826 | 2.89k | int i, j; |
3827 | | |
3828 | 2.89k | if (c->curr_slice > 0 && !c->slices) |
3829 | 0 | return -1; |
3830 | | |
3831 | | //fprintf(stderr, "Writing container %d, sum %u\n", c->record_counter, sum); |
3832 | | |
3833 | 2.89k | off_t c_offset = htell(fd->fp); // File offset of container |
3834 | | |
3835 | | /* Write the container struct itself */ |
3836 | 2.89k | if (0 != cram_write_container(fd, c)) |
3837 | 0 | return -1; |
3838 | | |
3839 | 2.89k | off_t hdr_size = htell(fd->fp) - c_offset; |
3840 | | |
3841 | | /* And the compression header */ |
3842 | 2.89k | if (0 != cram_write_block(fd, c->comp_hdr_block)) |
3843 | 0 | return -1; |
3844 | | |
3845 | | /* Followed by the slice blocks */ |
3846 | 2.89k | off_t file_offset = htell(fd->fp); |
3847 | 5.78k | for (i = 0; i < c->curr_slice; i++) { |
3848 | 2.89k | cram_slice *s = c->slices[i]; |
3849 | 2.89k | off_t spos = file_offset - c_offset - hdr_size; |
3850 | | |
3851 | 2.89k | if (0 != cram_write_block(fd, s->hdr_block)) |
3852 | 0 | return -1; |
3853 | | |
3854 | 16.2k | for (j = 0; j < s->hdr->num_blocks; j++) { |
3855 | 13.3k | if (0 != cram_write_block(fd, s->block[j])) |
3856 | 0 | return -1; |
3857 | 13.3k | } |
3858 | | |
3859 | 2.89k | file_offset = htell(fd->fp); |
3860 | 2.89k | off_t sz = file_offset - c_offset - hdr_size - spos; |
3861 | | |
3862 | 2.89k | if (fd->idxfp) { |
3863 | 0 | if (cram_index_slice(fd, c, s, fd->idxfp, c_offset, spos, sz) < 0) |
3864 | 0 | return -1; |
3865 | 0 | } |
3866 | 2.89k | } |
3867 | | |
3868 | 2.89k | return 0; |
3869 | 2.89k | } |
3870 | | |
3871 | | /* |
3872 | | * Flushes a completely or partially full container to disk, writing |
3873 | | * container structure, header and blocks. This also calls the encoder |
3874 | | * functions. |
3875 | | * |
3876 | | * Returns 0 on success |
3877 | | * -1 on failure |
3878 | | */ |
3879 | 3.20k | int cram_flush_container(cram_fd *fd, cram_container *c) { |
3880 | | /* Encode the container blocks and generate compression header */ |
3881 | 3.20k | if (0 != cram_encode_container(fd, c)) |
3882 | 309 | return -1; |
3883 | | |
3884 | 2.89k | return cram_flush_container2(fd, c); |
3885 | 3.20k | } |
3886 | | |
3887 | | typedef struct { |
3888 | | cram_fd *fd; |
3889 | | cram_container *c; |
3890 | | } cram_job; |
3891 | | |
3892 | 0 | void *cram_flush_thread(void *arg) { |
3893 | 0 | cram_job *j = (cram_job *)arg; |
3894 | | |
3895 | | /* Encode the container blocks and generate compression header */ |
3896 | 0 | if (0 != cram_encode_container(j->fd, j->c)) { |
3897 | 0 | hts_log_error("Call to cram_encode_container failed"); |
3898 | 0 | return NULL; |
3899 | 0 | } |
3900 | | |
3901 | 0 | return arg; |
3902 | 0 | } |
3903 | | |
3904 | 0 | static int cram_flush_result(cram_fd *fd) { |
3905 | 0 | int i, ret = 0; |
3906 | 0 | hts_tpool_result *r; |
3907 | 0 | cram_container *lc = NULL; |
3908 | | |
3909 | | // NB: we can have one result per slice, not per container, |
3910 | | // so we need to free the container only after all slices |
3911 | | // within it have been freed. (Automatic via reference counting.) |
3912 | 0 | while ((r = hts_tpool_next_result(fd->rqueue))) { |
3913 | 0 | cram_job *j = (cram_job *)hts_tpool_result_data(r); |
3914 | 0 | cram_container *c; |
3915 | |
|
3916 | 0 | if (!j) { |
3917 | 0 | hts_tpool_delete_result(r, 0); |
3918 | 0 | return -1; |
3919 | 0 | } |
3920 | | |
3921 | 0 | fd = j->fd; |
3922 | 0 | c = j->c; |
3923 | |
|
3924 | 0 | if (fd->mode == 'w') |
3925 | 0 | if (0 != cram_flush_container2(fd, c)) |
3926 | 0 | return -1; |
3927 | | |
3928 | | // Free the slices; filled out by encoder only |
3929 | 0 | if (c->slices) { |
3930 | 0 | for (i = 0; i < c->max_slice; i++) { |
3931 | 0 | if (c->slices[i]) |
3932 | 0 | cram_free_slice(c->slices[i]); |
3933 | 0 | if (c->slices[i] == c->slice) |
3934 | 0 | c->slice = NULL; |
3935 | 0 | c->slices[i] = NULL; |
3936 | 0 | } |
3937 | 0 | } |
3938 | | |
3939 | | // Free the current slice; set by both encoder & decoder |
3940 | 0 | if (c->slice) { |
3941 | 0 | cram_free_slice(c->slice); |
3942 | 0 | c->slice = NULL; |
3943 | 0 | } |
3944 | 0 | c->curr_slice = 0; |
3945 | | |
3946 | | // Our jobs will be in order, so we free the last |
3947 | | // container when our job has switched to a new one. |
3948 | 0 | if (c != lc) { |
3949 | 0 | if (lc) { |
3950 | 0 | if (fd->ctr == lc) |
3951 | 0 | fd->ctr = NULL; |
3952 | 0 | if (fd->ctr_mt == lc) |
3953 | 0 | fd->ctr_mt = NULL; |
3954 | 0 | cram_free_container(lc); |
3955 | 0 | } |
3956 | 0 | lc = c; |
3957 | 0 | } |
3958 | |
|
3959 | 0 | hts_tpool_delete_result(r, 1); |
3960 | 0 | } |
3961 | 0 | if (lc) { |
3962 | 0 | if (fd->ctr == lc) |
3963 | 0 | fd->ctr = NULL; |
3964 | 0 | if (fd->ctr_mt == lc) |
3965 | 0 | fd->ctr_mt = NULL; |
3966 | 0 | cram_free_container(lc); |
3967 | 0 | } |
3968 | |
|
3969 | 0 | return ret; |
3970 | 0 | } |
3971 | | |
3972 | | // Note: called while metrics_lock is held. |
3973 | | // Will be left in this state too, but may temporarily unlock. |
3974 | 18 | void reset_metrics(cram_fd *fd) { |
3975 | 18 | int i; |
3976 | | |
3977 | 18 | if (fd->pool) { |
3978 | | // If multi-threaded we have multiple blocks being |
3979 | | // compressed already and several on the to-do list |
3980 | | // (fd->rqueue->pending). It's tricky to reset the |
3981 | | // metrics exactly the correct point, so instead we |
3982 | | // just flush the pool, reset, and then continue again. |
3983 | | |
3984 | | // Don't bother starting a new trial before then though. |
3985 | 0 | for (i = 0; i < DS_END; i++) { |
3986 | 0 | cram_metrics *m = fd->m[i]; |
3987 | 0 | if (!m) |
3988 | 0 | continue; |
3989 | 0 | m->next_trial = 999; |
3990 | 0 | } |
3991 | |
|
3992 | 0 | pthread_mutex_unlock(&fd->metrics_lock); |
3993 | 0 | hts_tpool_process_flush(fd->rqueue); |
3994 | 0 | pthread_mutex_lock(&fd->metrics_lock); |
3995 | 0 | } |
3996 | | |
3997 | 864 | for (i = 0; i < DS_END; i++) { |
3998 | 846 | cram_metrics *m = fd->m[i]; |
3999 | 846 | if (!m) |
4000 | 0 | continue; |
4001 | | |
4002 | 846 | m->trial = NTRIALS; |
4003 | 846 | m->next_trial = TRIAL_SPAN; |
4004 | 846 | m->revised_method = 0; |
4005 | 846 | m->unpackable = 0; |
4006 | | |
4007 | 846 | memset(m->sz, 0, sizeof(m->sz)); |
4008 | 846 | } |
4009 | 18 | } |
4010 | | |
4011 | 3.20k | int cram_flush_container_mt(cram_fd *fd, cram_container *c) { |
4012 | 3.20k | cram_job *j; |
4013 | | |
4014 | | // At the junction of mapped to unmapped data the compression |
4015 | | // methods may need to change due to very different statistical |
4016 | | // properties; particularly BA if minhash sorted. |
4017 | | // |
4018 | | // However with threading we'll have several in-flight blocks |
4019 | | // arriving out of order. |
4020 | | // |
4021 | | // So we do one trial reset of NThreads to last for NThreads |
4022 | | // duration to get us over this transition period, followed |
4023 | | // by another retrial of the usual ntrials & trial span. |
4024 | 3.20k | pthread_mutex_lock(&fd->metrics_lock); |
4025 | 3.20k | if (c->n_mapped < 0.3*c->curr_rec && |
4026 | 2.87k | fd->last_mapped > 0.7*c->max_rec) { |
4027 | 18 | reset_metrics(fd); |
4028 | 18 | } |
4029 | 3.20k | fd->last_mapped = c->n_mapped * (c->max_rec+1)/(c->curr_rec+1) ; |
4030 | 3.20k | pthread_mutex_unlock(&fd->metrics_lock); |
4031 | | |
4032 | 3.20k | if (!fd->pool) |
4033 | 3.20k | return cram_flush_container(fd, c); |
4034 | | |
4035 | 0 | if (!(j = malloc(sizeof(*j)))) |
4036 | 0 | return -1; |
4037 | 0 | j->fd = fd; |
4038 | 0 | j->c = c; |
4039 | | |
4040 | | // Flush the job. Note our encoder queue may be full, so we |
4041 | | // either have to keep trying in non-blocking mode (what we do) or |
4042 | | // use a dedicated separate thread for draining the queue. |
4043 | 0 | for (;;) { |
4044 | 0 | errno = 0; |
4045 | 0 | hts_tpool_dispatch2(fd->pool, fd->rqueue, cram_flush_thread, j, 1); |
4046 | 0 | int pending = (errno == EAGAIN); |
4047 | 0 | if (cram_flush_result(fd) != 0) |
4048 | 0 | return -1; |
4049 | 0 | if (!pending) |
4050 | 0 | break; |
4051 | | |
4052 | 0 | hts_usleep(1000); |
4053 | 0 | } |
4054 | | |
4055 | 0 | return 0; |
4056 | 0 | } |
4057 | | |
4058 | | /* ---------------------------------------------------------------------- |
4059 | | * Compression headers; the first part of the container |
4060 | | */ |
4061 | | |
4062 | | /* |
4063 | | * Creates a new blank container compression header |
4064 | | * |
4065 | | * Returns header ptr on success |
4066 | | * NULL on failure |
4067 | | */ |
4068 | 8.64k | cram_block_compression_hdr *cram_new_compression_header(void) { |
4069 | 8.64k | cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr)); |
4070 | 8.64k | if (!hdr) |
4071 | 0 | return NULL; |
4072 | | |
4073 | 8.64k | if (!(hdr->TD_blk = cram_new_block(CORE, 0))) { |
4074 | 0 | free(hdr); |
4075 | 0 | return NULL; |
4076 | 0 | } |
4077 | | |
4078 | 8.64k | if (!(hdr->TD_hash = kh_init(m_s2i))) { |
4079 | 0 | cram_free_block(hdr->TD_blk); |
4080 | 0 | free(hdr); |
4081 | 0 | return NULL; |
4082 | 0 | } |
4083 | | |
4084 | 8.64k | if (!(hdr->TD_keys = string_pool_create(8192))) { |
4085 | 0 | kh_destroy(m_s2i, hdr->TD_hash); |
4086 | 0 | cram_free_block(hdr->TD_blk); |
4087 | 0 | free(hdr); |
4088 | 0 | return NULL; |
4089 | 0 | } |
4090 | | |
4091 | 8.64k | return hdr; |
4092 | 8.64k | } |
4093 | | |
4094 | 10.4k | void cram_free_compression_header(cram_block_compression_hdr *hdr) { |
4095 | 10.4k | int i; |
4096 | | |
4097 | 10.4k | if (hdr->landmark) |
4098 | 1.77k | free(hdr->landmark); |
4099 | | |
4100 | 10.4k | if (hdr->preservation_map) |
4101 | 4.66k | kh_destroy(map, hdr->preservation_map); |
4102 | | |
4103 | 343k | for (i = 0; i < CRAM_MAP_HASH; i++) { |
4104 | 333k | cram_map *m, *m2; |
4105 | 351k | for (m = hdr->rec_encoding_map[i]; m; m = m2) { |
4106 | 18.3k | m2 = m->next; |
4107 | 18.3k | if (m->codec) |
4108 | 0 | m->codec->free(m->codec); |
4109 | 18.3k | free(m); |
4110 | 18.3k | } |
4111 | 333k | } |
4112 | | |
4113 | 343k | for (i = 0; i < CRAM_MAP_HASH; i++) { |
4114 | 333k | cram_map *m, *m2; |
4115 | 334k | for (m = hdr->tag_encoding_map[i]; m; m = m2) { |
4116 | 711 | m2 = m->next; |
4117 | 711 | if (m->codec) |
4118 | 711 | m->codec->free(m->codec); |
4119 | 711 | free(m); |
4120 | 711 | } |
4121 | 333k | } |
4122 | | |
4123 | 500k | for (i = 0; i < DS_END; i++) { |
4124 | 489k | if (hdr->codecs[i]) |
4125 | 51.8k | hdr->codecs[i]->free(hdr->codecs[i]); |
4126 | 489k | } |
4127 | | |
4128 | 10.4k | if (hdr->TL) |
4129 | 54 | free(hdr->TL); |
4130 | 10.4k | if (hdr->TD_blk) |
4131 | 8.70k | cram_free_block(hdr->TD_blk); |
4132 | 10.4k | if (hdr->TD_hash) |
4133 | 8.64k | kh_destroy(m_s2i, hdr->TD_hash); |
4134 | 10.4k | if (hdr->TD_keys) |
4135 | 8.64k | string_pool_destroy(hdr->TD_keys); |
4136 | | |
4137 | 10.4k | free(hdr); |
4138 | 10.4k | } |
4139 | | |
4140 | | |
4141 | | /* ---------------------------------------------------------------------- |
4142 | | * Slices and slice headers |
4143 | | */ |
4144 | | |
4145 | 3.27k | void cram_free_slice_header(cram_block_slice_hdr *hdr) { |
4146 | 3.27k | if (!hdr) |
4147 | 0 | return; |
4148 | | |
4149 | 3.27k | if (hdr->block_content_ids) |
4150 | 2.98k | free(hdr->block_content_ids); |
4151 | | |
4152 | 3.27k | free(hdr); |
4153 | | |
4154 | 3.27k | return; |
4155 | 3.27k | } |
4156 | | |
4157 | 3.37k | void cram_free_slice(cram_slice *s) { |
4158 | 3.37k | if (!s) |
4159 | 0 | return; |
4160 | | |
4161 | 3.37k | if (s->bl) { |
4162 | 0 | free_bam_list(s->bl->bams, s->bl->nbams); |
4163 | 0 | free(s->bl); |
4164 | 0 | } |
4165 | | |
4166 | 3.37k | if (s->hdr_block) |
4167 | 2.91k | cram_free_block(s->hdr_block); |
4168 | | |
4169 | 3.37k | if (s->block) { |
4170 | 2.97k | int i; |
4171 | | |
4172 | 2.97k | if (s->hdr) { |
4173 | 6.14M | for (i = 0; i < s->hdr->num_blocks; i++) { |
4174 | 6.14M | if (i > 0 && s->block[i] == s->block[0]) |
4175 | 6.12M | continue; |
4176 | 14.2k | cram_free_block(s->block[i]); |
4177 | 14.2k | } |
4178 | 2.97k | } |
4179 | 2.97k | free(s->block); |
4180 | 2.97k | } |
4181 | | |
4182 | 3.37k | { |
4183 | | // Normally already copied into s->block[], but potentially still |
4184 | | // here if we error part way through cram_encode_slice. |
4185 | 3.37k | int i; |
4186 | 4.77k | for (i = 0; i < s->naux_block; i++) |
4187 | 1.39k | cram_free_block(s->aux_block[i]); |
4188 | 3.37k | } |
4189 | | |
4190 | 3.37k | if (s->block_by_id) |
4191 | 18 | free(s->block_by_id); |
4192 | | |
4193 | 3.37k | if (s->hdr) |
4194 | 3.27k | cram_free_slice_header(s->hdr); |
4195 | | |
4196 | 3.37k | if (s->seqs_blk) |
4197 | 3.22k | cram_free_block(s->seqs_blk); |
4198 | | |
4199 | 3.37k | if (s->qual_blk) |
4200 | 316 | cram_free_block(s->qual_blk); |
4201 | | |
4202 | 3.37k | if (s->name_blk) |
4203 | 316 | cram_free_block(s->name_blk); |
4204 | | |
4205 | 3.37k | if (s->aux_blk) |
4206 | 3.22k | cram_free_block(s->aux_blk); |
4207 | | |
4208 | 3.37k | if (s->base_blk) |
4209 | 316 | cram_free_block(s->base_blk); |
4210 | | |
4211 | 3.37k | if (s->soft_blk) |
4212 | 316 | cram_free_block(s->soft_blk); |
4213 | | |
4214 | 3.37k | if (s->cigar) |
4215 | 3.22k | free(s->cigar); |
4216 | | |
4217 | 3.37k | if (s->crecs) |
4218 | 3.21k | free(s->crecs); |
4219 | | |
4220 | 3.37k | if (s->features) |
4221 | 150 | free(s->features); |
4222 | | |
4223 | 3.37k | if (s->TN) |
4224 | 0 | free(s->TN); |
4225 | | |
4226 | 3.37k | if (s->pair_keys) |
4227 | 3.20k | string_pool_destroy(s->pair_keys); |
4228 | | |
4229 | 3.37k | if (s->pair[0]) |
4230 | 3.20k | kh_destroy(m_s2i, s->pair[0]); |
4231 | 3.37k | if (s->pair[1]) |
4232 | 3.20k | kh_destroy(m_s2i, s->pair[1]); |
4233 | | |
4234 | 3.37k | if (s->aux_block) |
4235 | 552 | free(s->aux_block); |
4236 | | |
4237 | 3.37k | free(s); |
4238 | 3.37k | } |
4239 | | |
4240 | | /* |
4241 | | * Creates a new empty slice in memory, for subsequent writing to |
4242 | | * disk. |
4243 | | * |
4244 | | * Returns cram_slice ptr on success |
4245 | | * NULL on failure |
4246 | | */ |
4247 | 3.20k | cram_slice *cram_new_slice(enum cram_content_type type, int nrecs) { |
4248 | 3.20k | cram_slice *s = calloc(1, sizeof(*s)); |
4249 | 3.20k | if (!s) |
4250 | 0 | return NULL; |
4251 | | |
4252 | 3.20k | if (!(s->hdr = (cram_block_slice_hdr *)calloc(1, sizeof(*s->hdr)))) |
4253 | 0 | goto err; |
4254 | 3.20k | s->hdr->content_type = type; |
4255 | | |
4256 | 3.20k | s->hdr_block = NULL; |
4257 | 3.20k | s->block = NULL; |
4258 | 3.20k | s->block_by_id = NULL; |
4259 | 3.20k | s->last_apos = 0; |
4260 | 3.20k | if (!(s->crecs = hts_malloc_p(sizeof(*s->crecs), nrecs))) goto err; |
4261 | 3.20k | s->cigar_alloc = 1024; |
4262 | 3.20k | if (!(s->cigar = hts_malloc_p(sizeof(*s->cigar), s->cigar_alloc))) goto err; |
4263 | 3.20k | s->ncigar = 0; |
4264 | | |
4265 | 3.20k | if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0))) goto err; |
4266 | 3.20k | if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS))) goto err; |
4267 | 3.20k | if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN))) goto err; |
4268 | 3.20k | if (!(s->aux_blk = cram_new_block(EXTERNAL, DS_aux))) goto err; |
4269 | 3.20k | if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN))) goto err; |
4270 | 3.20k | if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC))) goto err; |
4271 | | |
4272 | 3.20k | s->features = NULL; |
4273 | 3.20k | s->nfeatures = s->afeatures = 0; |
4274 | | |
4275 | 3.20k | #ifndef TN_external |
4276 | 3.20k | s->TN = NULL; |
4277 | 3.20k | s->nTN = s->aTN = 0; |
4278 | 3.20k | #endif |
4279 | | |
4280 | | // Volatile keys as we do realloc in dstring |
4281 | 3.20k | if (!(s->pair_keys = string_pool_create(8192))) goto err; |
4282 | 3.20k | if (!(s->pair[0] = kh_init(m_s2i))) goto err; |
4283 | 3.20k | if (!(s->pair[1] = kh_init(m_s2i))) goto err; |
4284 | | |
4285 | | #ifdef BA_external |
4286 | | s->BA_len = 0; |
4287 | | #endif |
4288 | | |
4289 | 3.20k | return s; |
4290 | | |
4291 | 0 | err: |
4292 | 0 | if (s) |
4293 | 0 | cram_free_slice(s); |
4294 | |
|
4295 | 0 | return NULL; |
4296 | 3.20k | } |
4297 | | |
4298 | | /* |
4299 | | * Loads an entire slice. |
4300 | | * FIXME: In 1.0 the native unit of slices within CRAM is broken |
4301 | | * as slices contain references to objects in other slices. |
4302 | | * To work around this while keeping the slice oriented outer loop |
4303 | | * we read all slices and stitch them together into a fake large |
4304 | | * slice instead. |
4305 | | * |
4306 | | * Returns cram_slice ptr on success |
4307 | | * NULL on failure |
4308 | | */ |
4309 | 171 | cram_slice *cram_read_slice(cram_fd *fd) { |
4310 | 171 | cram_block *b = cram_read_block(fd); |
4311 | 171 | cram_slice *s = calloc(1, sizeof(*s)); |
4312 | 171 | int i, n, max_id, min_id; |
4313 | | |
4314 | 171 | if (!b || !s) |
4315 | 15 | goto err; |
4316 | | |
4317 | 156 | s->hdr_block = b; |
4318 | 156 | switch (b->content_type) { |
4319 | 144 | case MAPPED_SLICE: |
4320 | 153 | case UNMAPPED_SLICE: |
4321 | 153 | if (!(s->hdr = cram_decode_slice_header(fd, b))) |
4322 | 78 | goto err; |
4323 | 75 | break; |
4324 | | |
4325 | 75 | default: |
4326 | 3 | hts_log_error("Unexpected block of type %s", |
4327 | 3 | cram_content_type2str(b->content_type)); |
4328 | 3 | goto err; |
4329 | 156 | } |
4330 | | |
4331 | 75 | if (s->hdr->num_blocks < 1) { |
4332 | 3 | hts_log_error("Slice does not include any data blocks"); |
4333 | 3 | goto err; |
4334 | 3 | } |
4335 | | |
4336 | 72 | s->block = calloc(n = s->hdr->num_blocks, sizeof(*s->block)); |
4337 | 72 | if (!s->block) |
4338 | 0 | goto err; |
4339 | | |
4340 | 306 | for (max_id = i = 0, min_id = INT_MAX; i < n; i++) { |
4341 | 288 | if (!(s->block[i] = cram_read_block(fd))) |
4342 | 54 | goto err; |
4343 | | |
4344 | 234 | if (s->block[i]->content_type == EXTERNAL) { |
4345 | 36 | if (max_id < s->block[i]->content_id) |
4346 | 24 | max_id = s->block[i]->content_id; |
4347 | 36 | if (min_id > s->block[i]->content_id) |
4348 | 18 | min_id = s->block[i]->content_id; |
4349 | 36 | } |
4350 | 234 | } |
4351 | | |
4352 | 18 | if (!(s->block_by_id = calloc(512, sizeof(s->block[0])))) |
4353 | 0 | goto err; |
4354 | | |
4355 | 54 | for (i = 0; i < n; i++) { |
4356 | 36 | if (s->block[i]->content_type != EXTERNAL) |
4357 | 15 | continue; |
4358 | 21 | uint32_t v = s->block[i]->content_id; |
4359 | 21 | if (v >= 256) |
4360 | 9 | v = 256 + v % 251; |
4361 | 21 | s->block_by_id[v] = s->block[i]; |
4362 | 21 | } |
4363 | | |
4364 | | /* Initialise encoding/decoding tables */ |
4365 | 18 | s->cigar_alloc = 1024; |
4366 | 18 | if (!(s->cigar = hts_malloc_p(sizeof(*s->cigar), s->cigar_alloc))) goto err; |
4367 | 18 | s->ncigar = 0; |
4368 | | |
4369 | 18 | if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0))) goto err; |
4370 | 18 | if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS))) goto err; |
4371 | 18 | if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN))) goto err; |
4372 | 18 | if (!(s->aux_blk = cram_new_block(EXTERNAL, DS_aux))) goto err; |
4373 | 18 | if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN))) goto err; |
4374 | 18 | if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC))) goto err; |
4375 | | |
4376 | 18 | s->crecs = NULL; |
4377 | | |
4378 | 18 | s->last_apos = s->hdr->ref_seq_start; |
4379 | 18 | s->decode_md = fd->decode_md; |
4380 | | |
4381 | 18 | return s; |
4382 | | |
4383 | 153 | err: |
4384 | 153 | if (b) |
4385 | 138 | cram_free_block(b); |
4386 | 153 | if (s) { |
4387 | 153 | s->hdr_block = NULL; |
4388 | 153 | cram_free_slice(s); |
4389 | 153 | } |
4390 | 153 | return NULL; |
4391 | 18 | } |
4392 | | |
4393 | | |
4394 | | /* ---------------------------------------------------------------------- |
4395 | | * CRAM file definition (header) |
4396 | | */ |
4397 | | |
4398 | | /* |
4399 | | * Reads a CRAM file definition structure. |
4400 | | * Returns file_def ptr on success |
4401 | | * NULL on failure |
4402 | | */ |
4403 | 8.98k | cram_file_def *cram_read_file_def(cram_fd *fd) { |
4404 | 8.98k | cram_file_def *def = malloc(sizeof(*def)); |
4405 | 8.98k | if (!def) |
4406 | 0 | return NULL; |
4407 | | |
4408 | 8.98k | if (26 != hread(fd->fp, &def->magic[0], 26)) { |
4409 | 9 | free(def); |
4410 | 9 | return NULL; |
4411 | 9 | } |
4412 | | |
4413 | 8.97k | if (memcmp(def->magic, "CRAM", 4) != 0) { |
4414 | 1 | free(def); |
4415 | 1 | return NULL; |
4416 | 1 | } |
4417 | | |
4418 | 8.97k | if (def->major_version > 4) { |
4419 | 1 | hts_log_error("CRAM version number mismatch. Expected 1.x, 2.x, 3.x or 4.x, got %d.%d", |
4420 | 1 | def->major_version, def->minor_version); |
4421 | 1 | free(def); |
4422 | 1 | return NULL; |
4423 | 1 | } |
4424 | | |
4425 | 8.97k | fd->first_container += 26; |
4426 | 8.97k | fd->curr_position = fd->first_container; |
4427 | 8.97k | fd->last_slice = 0; |
4428 | | |
4429 | 8.97k | return def; |
4430 | 8.97k | } |
4431 | | |
4432 | | /* |
4433 | | * Writes a cram_file_def structure to cram_fd. |
4434 | | * Returns 0 on success |
4435 | | * -1 on failure |
4436 | | */ |
4437 | 5.44k | int cram_write_file_def(cram_fd *fd, cram_file_def *def) { |
4438 | 5.44k | return (hwrite(fd->fp, &def->magic[0], 26) == 26) ? 0 : -1; |
4439 | 5.44k | } |
4440 | | |
4441 | 17.7k | void cram_free_file_def(cram_file_def *def) { |
4442 | 17.7k | if (def) free(def); |
4443 | 17.7k | } |
4444 | | |
4445 | | /* ---------------------------------------------------------------------- |
4446 | | * SAM header I/O |
4447 | | */ |
4448 | | |
4449 | | |
4450 | | /* |
4451 | | * Reads the SAM header from the first CRAM data block. |
4452 | | * Also performs minimal parsing to extract read-group |
4453 | | * and sample information. |
4454 | | |
4455 | | * Returns SAM hdr ptr on success |
4456 | | * NULL on failure |
4457 | | */ |
4458 | 8.97k | sam_hdr_t *cram_read_SAM_hdr(cram_fd *fd) { |
4459 | 8.97k | int32_t header_len; |
4460 | 8.97k | char *header; |
4461 | 8.97k | sam_hdr_t *hdr; |
4462 | | |
4463 | | /* 1.1 onwards stores the header in the first block of a container */ |
4464 | 8.97k | if (CRAM_MAJOR_VERS(fd->version) == 1) { |
4465 | | /* Length */ |
4466 | 7.38k | if (-1 == int32_decode(fd, &header_len)) |
4467 | 2 | return NULL; |
4468 | | |
4469 | 7.37k | #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
4470 | 7.37k | if (header_len > FUZZ_ALLOC_LIMIT) |
4471 | 2 | return NULL; |
4472 | 7.37k | #endif |
4473 | | |
4474 | | /* Alloc and read */ |
4475 | 7.37k | if (header_len < 0 || NULL == (header = malloc((size_t) header_len+1))) |
4476 | 0 | return NULL; |
4477 | | |
4478 | 7.37k | if (header_len != hread(fd->fp, header, header_len)) { |
4479 | 13 | free(header); |
4480 | 13 | return NULL; |
4481 | 13 | } |
4482 | 7.36k | header[header_len] = '\0'; |
4483 | | |
4484 | 7.36k | fd->first_container += 4 + header_len; |
4485 | 7.36k | } else { |
4486 | 1.59k | cram_container *c = cram_read_container(fd); |
4487 | 1.59k | cram_block *b; |
4488 | 1.59k | int i; |
4489 | 1.59k | int64_t len; |
4490 | | |
4491 | 1.59k | if (!c) |
4492 | 145 | return NULL; |
4493 | | |
4494 | 1.44k | fd->first_container += c->length + c->offset; |
4495 | 1.44k | fd->curr_position = fd->first_container; |
4496 | | |
4497 | 1.44k | if (c->num_blocks < 1) { |
4498 | 46 | cram_free_container(c); |
4499 | 46 | return NULL; |
4500 | 46 | } |
4501 | | |
4502 | 1.40k | if (!(b = cram_read_block(fd))) { |
4503 | 96 | cram_free_container(c); |
4504 | 96 | return NULL; |
4505 | 96 | } |
4506 | 1.30k | if (cram_uncompress_block(b) != 0) { |
4507 | 976 | cram_free_container(c); |
4508 | 976 | cram_free_block(b); |
4509 | 976 | return NULL; |
4510 | 976 | } |
4511 | | |
4512 | 331 | len = b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) + |
4513 | 331 | fd->vv.varint_size(b->content_id) + |
4514 | 331 | fd->vv.varint_size(b->uncomp_size) + |
4515 | 331 | fd->vv.varint_size(b->comp_size); |
4516 | | |
4517 | | /* Extract header from 1st block */ |
4518 | 331 | if (-1 == int32_get_blk(b, &header_len) || |
4519 | 304 | header_len < 0 || /* Spec. says signed... why? */ |
4520 | 301 | b->uncomp_size - 4 < header_len) { |
4521 | 47 | cram_free_container(c); |
4522 | 47 | cram_free_block(b); |
4523 | 47 | return NULL; |
4524 | 47 | } |
4525 | 284 | if (NULL == (header = malloc((size_t) header_len+1))) { |
4526 | 0 | cram_free_container(c); |
4527 | 0 | cram_free_block(b); |
4528 | 0 | return NULL; |
4529 | 0 | } |
4530 | 284 | memcpy(header, BLOCK_END(b), header_len); |
4531 | 284 | header[header_len] = '\0'; |
4532 | 284 | cram_free_block(b); |
4533 | | |
4534 | | /* Consume any remaining blocks */ |
4535 | 410 | for (i = 1; i < c->num_blocks; i++) { |
4536 | 220 | if (!(b = cram_read_block(fd))) { |
4537 | 94 | cram_free_container(c); |
4538 | 94 | free(header); |
4539 | 94 | return NULL; |
4540 | 94 | } |
4541 | 126 | len += b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) + |
4542 | 126 | fd->vv.varint_size(b->content_id) + |
4543 | 126 | fd->vv.varint_size(b->uncomp_size) + |
4544 | 126 | fd->vv.varint_size(b->comp_size); |
4545 | 126 | cram_free_block(b); |
4546 | 126 | } |
4547 | | |
4548 | 190 | if (c->length > 0 && len > 0 && c->length > len) { |
4549 | | // Consume padding |
4550 | 17 | char *pads = malloc(c->length - len); |
4551 | 17 | if (!pads) { |
4552 | 0 | cram_free_container(c); |
4553 | 0 | free(header); |
4554 | 0 | return NULL; |
4555 | 0 | } |
4556 | | |
4557 | 17 | if (c->length - len != hread(fd->fp, pads, c->length - len)) { |
4558 | 16 | cram_free_container(c); |
4559 | 16 | free(header); |
4560 | 16 | free(pads); |
4561 | 16 | return NULL; |
4562 | 16 | } |
4563 | 1 | free(pads); |
4564 | 1 | } |
4565 | | |
4566 | 174 | cram_free_container(c); |
4567 | 174 | } |
4568 | | |
4569 | | /* Parse */ |
4570 | 7.53k | hdr = sam_hdr_init(); |
4571 | 7.53k | if (!hdr) { |
4572 | 0 | free(header); |
4573 | 0 | return NULL; |
4574 | 0 | } |
4575 | | |
4576 | 7.53k | if (-1 == sam_hdr_add_lines(hdr, header, header_len)) { |
4577 | 274 | free(header); |
4578 | 274 | sam_hdr_destroy(hdr); |
4579 | 274 | return NULL; |
4580 | 274 | } |
4581 | | |
4582 | 7.26k | hdr->l_text = header_len; |
4583 | 7.26k | hdr->text = header; |
4584 | | |
4585 | 7.26k | return hdr; |
4586 | | |
4587 | 7.53k | } |
4588 | | |
4589 | | /* |
4590 | | * Converts 'in' to a full pathname to store in out. |
4591 | | * Out must be at least PATH_MAX bytes long. |
4592 | | */ |
4593 | 0 | static void full_path(char *out, char *in) { |
4594 | 0 | size_t in_l = strlen(in); |
4595 | 0 | if (hisremote(in)) { |
4596 | 0 | if (in_l > PATH_MAX) { |
4597 | 0 | hts_log_error("Reference path is longer than %d", PATH_MAX); |
4598 | 0 | return; |
4599 | 0 | } |
4600 | 0 | strncpy(out, in, PATH_MAX-1); |
4601 | 0 | out[PATH_MAX-1] = 0; |
4602 | 0 | return; |
4603 | 0 | } |
4604 | 0 | if (*in == '/' || |
4605 | | // Windows paths |
4606 | 0 | (in_l > 3 && toupper_c(*in) >= 'A' && toupper_c(*in) <= 'Z' && |
4607 | 0 | in[1] == ':' && (in[2] == '/' || in[2] == '\\'))) { |
4608 | 0 | strncpy(out, in, PATH_MAX-1); |
4609 | 0 | out[PATH_MAX-1] = 0; |
4610 | 0 | } else { |
4611 | 0 | size_t len; |
4612 | | |
4613 | | // unable to get dir or out+in is too long |
4614 | 0 | if (!getcwd(out, PATH_MAX) || |
4615 | 0 | (len = strlen(out))+1+strlen(in) >= PATH_MAX) { |
4616 | 0 | strncpy(out, in, PATH_MAX-1); |
4617 | 0 | out[PATH_MAX-1] = 0; |
4618 | 0 | return; |
4619 | 0 | } |
4620 | | |
4621 | 0 | snprintf(out+len, PATH_MAX - len, "/%s", in); |
4622 | | |
4623 | | // FIXME: cope with `pwd`/../../../foo.fa ? |
4624 | 0 | } |
4625 | 0 | } |
4626 | | |
4627 | | /* |
4628 | | * Writes a CRAM SAM header. |
4629 | | * Returns 0 on success |
4630 | | * -1 on failure |
4631 | | */ |
4632 | 5.44k | int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) { |
4633 | 5.44k | size_t header_len; |
4634 | 5.44k | int blank_block = (CRAM_MAJOR_VERS(fd->version) >= 3); |
4635 | | |
4636 | | /* Write CRAM MAGIC if not yet written. */ |
4637 | 5.44k | if (fd->file_def->major_version == 0) { |
4638 | 5.44k | fd->file_def->major_version = CRAM_MAJOR_VERS(fd->version); |
4639 | 5.44k | fd->file_def->minor_version = CRAM_MINOR_VERS(fd->version); |
4640 | 5.44k | if (0 != cram_write_file_def(fd, fd->file_def)) |
4641 | 0 | return -1; |
4642 | 5.44k | } |
4643 | | |
4644 | | /* 1.0 requires an UNKNOWN read-group */ |
4645 | 5.44k | if (CRAM_MAJOR_VERS(fd->version) == 1) { |
4646 | 0 | if (!sam_hrecs_find_rg(hdr->hrecs, "UNKNOWN")) |
4647 | 0 | if (sam_hdr_add_line(hdr, "RG", |
4648 | 0 | "ID", "UNKNOWN", "SM", "UNKNOWN", NULL)) |
4649 | 0 | return -1; |
4650 | 0 | } |
4651 | | |
4652 | 5.44k | if (-1 == refs_from_header(fd)) |
4653 | 0 | return -1; |
4654 | 5.44k | if (-1 == refs2id(fd->refs, fd->header)) |
4655 | 0 | return -1; |
4656 | | |
4657 | | /* Fix M5 strings */ |
4658 | 5.44k | if (fd->refs && !fd->no_ref && fd->embed_ref <= 1) { |
4659 | 5.44k | int i; |
4660 | 5.52k | for (i = 0; i < hdr->hrecs->nref; i++) { |
4661 | 362 | sam_hrec_type_t *ty; |
4662 | 362 | char *ref; |
4663 | | |
4664 | 362 | if (!(ty = sam_hrecs_find_type_id(hdr->hrecs, "SQ", "SN", hdr->hrecs->ref[i].name))) |
4665 | 0 | return -1; |
4666 | | |
4667 | 362 | if (!sam_hrecs_find_key(ty, "M5", NULL)) { |
4668 | 277 | char unsigned buf[16]; |
4669 | 277 | char buf2[33]; |
4670 | 277 | hts_pos_t rlen; |
4671 | 277 | hts_md5_context *md5; |
4672 | | |
4673 | 277 | if (!fd->refs || |
4674 | 277 | !fd->refs->ref_id || |
4675 | 277 | !fd->refs->ref_id[i]) { |
4676 | 0 | return -1; |
4677 | 0 | } |
4678 | 277 | rlen = fd->refs->ref_id[i]->length; |
4679 | 277 | ref = cram_get_ref(fd, i, 1, rlen); |
4680 | 277 | if (NULL == ref) { |
4681 | 277 | if (fd->embed_ref == -1) { |
4682 | | // auto embed-ref |
4683 | 277 | hts_log_warning("No M5 tags present and could not " |
4684 | 277 | "find reference"); |
4685 | 277 | hts_log_warning("Enabling embed_ref=2 option"); |
4686 | 277 | hts_log_warning("NOTE: the CRAM file will be bigger " |
4687 | 277 | "than using an external reference"); |
4688 | 277 | pthread_mutex_lock(&fd->ref_lock); |
4689 | | // Best guess. It may be unmapped data with broken |
4690 | | // headers, in which case this will get ignored. |
4691 | 277 | fd->embed_ref = 2; |
4692 | 277 | pthread_mutex_unlock(&fd->ref_lock); |
4693 | 277 | break; |
4694 | 277 | } |
4695 | 0 | return -1; |
4696 | 277 | } |
4697 | 0 | rlen = fd->refs->ref_id[i]->length; /* In case it just loaded */ |
4698 | 0 | if (!(md5 = hts_md5_init())) |
4699 | 0 | return -1; |
4700 | 0 | if (HTS_POS_MAX <= ULONG_MAX) { |
4701 | | // Platforms with 64-bit unsigned long update in one go |
4702 | 0 | hts_md5_update(md5, ref, rlen); |
4703 | 0 | } else { |
4704 | | // Those with 32-bit ulong (Windows) may have to loop |
4705 | | // over epic references |
4706 | 0 | hts_pos_t pos = 0; |
4707 | 0 | while (rlen - pos > ULONG_MAX) { |
4708 | 0 | hts_md5_update(md5, ref + pos, ULONG_MAX); |
4709 | 0 | pos += ULONG_MAX; |
4710 | 0 | } |
4711 | 0 | hts_md5_update(md5, ref + pos, (unsigned long)(rlen - pos)); |
4712 | 0 | } |
4713 | 0 | hts_md5_final(buf, md5); |
4714 | 0 | hts_md5_destroy(md5); |
4715 | 0 | cram_ref_decr(fd->refs, i); |
4716 | |
|
4717 | 0 | hts_md5_hex(buf2, buf); |
4718 | 0 | fd->refs->ref_id[i]->validated_md5 = 1; |
4719 | 0 | if (sam_hdr_update_line(hdr, "SQ", "SN", hdr->hrecs->ref[i].name, "M5", buf2, NULL)) |
4720 | 0 | return -1; |
4721 | 0 | } |
4722 | | |
4723 | 85 | if (fd->ref_fn) { |
4724 | 0 | char ref_fn[PATH_MAX]; |
4725 | 0 | full_path(ref_fn, fd->ref_fn); |
4726 | 0 | if (sam_hdr_update_line(hdr, "SQ", "SN", hdr->hrecs->ref[i].name, "UR", ref_fn, NULL)) |
4727 | 0 | return -1; |
4728 | 0 | } |
4729 | 85 | } |
4730 | 5.44k | } |
4731 | | |
4732 | | /* Length */ |
4733 | 5.44k | header_len = sam_hdr_length(hdr); |
4734 | 5.44k | if (header_len > INT32_MAX) { |
4735 | 0 | hts_log_error("Header is too long for CRAM format"); |
4736 | 0 | return -1; |
4737 | 0 | } |
4738 | 5.44k | if (CRAM_MAJOR_VERS(fd->version) == 1) { |
4739 | 0 | if (-1 == int32_encode(fd, header_len)) |
4740 | 0 | return -1; |
4741 | | |
4742 | | /* Text data */ |
4743 | 0 | if (header_len != hwrite(fd->fp, sam_hdr_str(hdr), header_len)) |
4744 | 0 | return -1; |
4745 | 5.44k | } else { |
4746 | | /* Create block(s) inside a container */ |
4747 | 5.44k | cram_block *b = cram_new_block(FILE_HEADER, 0); |
4748 | 5.44k | cram_container *c = cram_new_container(0, 0); |
4749 | 5.44k | int padded_length; |
4750 | 5.44k | char *pads; |
4751 | 5.44k | int is_cram_3 = (CRAM_MAJOR_VERS(fd->version) >= 3); |
4752 | | |
4753 | 5.44k | if (!b || !c) { |
4754 | 0 | if (b) cram_free_block(b); |
4755 | 0 | if (c) cram_free_container(c); |
4756 | 0 | return -1; |
4757 | 0 | } |
4758 | | |
4759 | 5.44k | if (int32_put_blk(b, header_len) < 0) |
4760 | 0 | return -1; |
4761 | 5.44k | if (header_len) |
4762 | 1.70k | BLOCK_APPEND(b, sam_hdr_str(hdr), header_len); |
4763 | 5.44k | BLOCK_UPLEN(b); |
4764 | | |
4765 | | // Compress header block if V3.0 and above |
4766 | 5.44k | if (CRAM_MAJOR_VERS(fd->version) >= 3) |
4767 | 5.44k | if (cram_compress_block(fd, b, NULL, -1, -1) < 0) |
4768 | 0 | return -1; |
4769 | | |
4770 | 5.44k | if (blank_block) { |
4771 | 5.44k | c->length = b->comp_size + 2 + 4*is_cram_3 + |
4772 | 5.44k | fd->vv.varint_size(b->content_id) + |
4773 | 5.44k | fd->vv.varint_size(b->uncomp_size) + |
4774 | 5.44k | fd->vv.varint_size(b->comp_size); |
4775 | | |
4776 | 5.44k | c->num_blocks = 2; |
4777 | 5.44k | c->num_landmarks = 2; |
4778 | 5.44k | if (!(c->landmark = malloc(2*sizeof(*c->landmark)))) { |
4779 | 0 | cram_free_block(b); |
4780 | 0 | cram_free_container(c); |
4781 | 0 | return -1; |
4782 | 0 | } |
4783 | 5.44k | c->landmark[0] = 0; |
4784 | 5.44k | c->landmark[1] = c->length; |
4785 | | |
4786 | | // Plus extra storage for uncompressed secondary blank block |
4787 | 5.44k | padded_length = MIN(c->length*.5, 10000); |
4788 | 5.44k | c->length += padded_length + 2 + 4*is_cram_3 + |
4789 | 5.44k | fd->vv.varint_size(b->content_id) + |
4790 | 5.44k | fd->vv.varint_size(padded_length)*2; |
4791 | 5.44k | } else { |
4792 | | // Pad the block instead. |
4793 | 0 | c->num_blocks = 1; |
4794 | 0 | c->num_landmarks = 1; |
4795 | 0 | if (!(c->landmark = malloc(sizeof(*c->landmark)))) |
4796 | 0 | return -1; |
4797 | 0 | c->landmark[0] = 0; |
4798 | |
|
4799 | 0 | padded_length = MAX(c->length*1.5, 10000) - c->length; |
4800 | |
|
4801 | 0 | c->length = b->comp_size + padded_length + |
4802 | 0 | 2 + 4*is_cram_3 + |
4803 | 0 | fd->vv.varint_size(b->content_id) + |
4804 | 0 | fd->vv.varint_size(b->uncomp_size) + |
4805 | 0 | fd->vv.varint_size(b->comp_size); |
4806 | |
|
4807 | 0 | if (NULL == (pads = calloc(1, padded_length))) { |
4808 | 0 | cram_free_block(b); |
4809 | 0 | cram_free_container(c); |
4810 | 0 | return -1; |
4811 | 0 | } |
4812 | 0 | BLOCK_APPEND(b, pads, padded_length); |
4813 | 0 | BLOCK_UPLEN(b); |
4814 | 0 | free(pads); |
4815 | 0 | } |
4816 | | |
4817 | 5.44k | if (-1 == cram_write_container(fd, c)) { |
4818 | 0 | cram_free_block(b); |
4819 | 0 | cram_free_container(c); |
4820 | 0 | return -1; |
4821 | 0 | } |
4822 | | |
4823 | 5.44k | if (-1 == cram_write_block(fd, b)) { |
4824 | 0 | cram_free_block(b); |
4825 | 0 | cram_free_container(c); |
4826 | 0 | return -1; |
4827 | 0 | } |
4828 | | |
4829 | 5.44k | if (blank_block) { |
4830 | 5.44k | BLOCK_RESIZE(b, padded_length); |
4831 | 5.44k | memset(BLOCK_DATA(b), 0, padded_length); |
4832 | 5.44k | BLOCK_SIZE(b) = padded_length; |
4833 | 5.44k | BLOCK_UPLEN(b); |
4834 | 5.44k | b->method = RAW; |
4835 | 5.44k | if (-1 == cram_write_block(fd, b)) { |
4836 | 0 | cram_free_block(b); |
4837 | 0 | cram_free_container(c); |
4838 | 0 | return -1; |
4839 | 0 | } |
4840 | 5.44k | } |
4841 | | |
4842 | 5.44k | cram_free_block(b); |
4843 | 5.44k | cram_free_container(c); |
4844 | 5.44k | } |
4845 | | |
4846 | 5.44k | if (0 != hflush(fd->fp)) |
4847 | 0 | return -1; |
4848 | | |
4849 | 5.44k | RP("=== Finishing saving header ===\n"); |
4850 | | |
4851 | 5.44k | return 0; |
4852 | | |
4853 | 0 | block_err: |
4854 | 0 | return -1; |
4855 | 5.44k | } |
4856 | | |
4857 | | /* ---------------------------------------------------------------------- |
4858 | | * The top-level cram opening, closing and option handling |
4859 | | */ |
4860 | | |
4861 | | /* |
4862 | | * Sets CRAM variable sized integer decode function tables. |
4863 | | * CRAM 1, 2, and 3.x all used ITF8 for uint32 and UTF8 for uint64. |
4864 | | * CRAM 4.x uses the same encoding mechanism for 32-bit and 64-bit |
4865 | | * (or anything inbetween), but also now supports signed values. |
4866 | | * |
4867 | | * Version is the CRAM major version number. |
4868 | | * vv is the vector table (probably &cram_fd->vv) |
4869 | | */ |
4870 | 17.7k | static void cram_init_varint(varint_vec *vv, int version) { |
4871 | 17.7k | vv->varint_get32 = safe_itf8_get; |
4872 | 17.7k | vv->varint_get32s = safe_itf8_get; |
4873 | 17.7k | vv->varint_get64 = safe_ltf8_get; |
4874 | 17.7k | vv->varint_get64s = safe_ltf8_get; |
4875 | 17.7k | vv->varint_put32 = safe_itf8_put; |
4876 | 17.7k | vv->varint_put32s = safe_itf8_put; |
4877 | 17.7k | vv->varint_put64 = safe_ltf8_put; |
4878 | 17.7k | vv->varint_put64s = safe_ltf8_put; |
4879 | 17.7k | vv->varint_put32_blk = itf8_put_blk; |
4880 | 17.7k | vv->varint_put32s_blk = itf8_put_blk; |
4881 | 17.7k | vv->varint_put64_blk = ltf8_put_blk; |
4882 | 17.7k | vv->varint_put64s_blk = ltf8_put_blk; |
4883 | 17.7k | vv->varint_size = itf8_size; |
4884 | 17.7k | vv->varint_decode32_crc = itf8_decode_crc; |
4885 | 17.7k | vv->varint_decode32s_crc = itf8_decode_crc; |
4886 | 17.7k | vv->varint_decode64_crc = ltf8_decode_crc; |
4887 | 17.7k | } |
4888 | | |
4889 | | /* |
4890 | | * Initialises the lookup tables. These could be global statics, but they're |
4891 | | * clumsy to setup in a multi-threaded environment unless we generate |
4892 | | * verbatim code and include that. |
4893 | | */ |
4894 | 17.7k | static void cram_init_tables(cram_fd *fd) { |
4895 | 17.7k | int i; |
4896 | | |
4897 | 17.7k | memset(fd->L1, 4, 256); |
4898 | 17.7k | fd->L1['A'] = 0; fd->L1['a'] = 0; |
4899 | 17.7k | fd->L1['C'] = 1; fd->L1['c'] = 1; |
4900 | 17.7k | fd->L1['G'] = 2; fd->L1['g'] = 2; |
4901 | 17.7k | fd->L1['T'] = 3; fd->L1['t'] = 3; |
4902 | | |
4903 | 17.7k | memset(fd->L2, 5, 256); |
4904 | 17.7k | fd->L2['A'] = 0; fd->L2['a'] = 0; |
4905 | 17.7k | fd->L2['C'] = 1; fd->L2['c'] = 1; |
4906 | 17.7k | fd->L2['G'] = 2; fd->L2['g'] = 2; |
4907 | 17.7k | fd->L2['T'] = 3; fd->L2['t'] = 3; |
4908 | 17.7k | fd->L2['N'] = 4; fd->L2['n'] = 4; |
4909 | | |
4910 | 17.7k | if (CRAM_MAJOR_VERS(fd->version) == 1) { |
4911 | 3.78M | for (i = 0; i < 0x200; i++) { |
4912 | 3.77M | int f = 0; |
4913 | | |
4914 | 3.77M | if (i & CRAM_FPAIRED) f |= BAM_FPAIRED; |
4915 | 3.77M | if (i & CRAM_FPROPER_PAIR) f |= BAM_FPROPER_PAIR; |
4916 | 3.77M | if (i & CRAM_FUNMAP) f |= BAM_FUNMAP; |
4917 | 3.77M | if (i & CRAM_FREVERSE) f |= BAM_FREVERSE; |
4918 | 3.77M | if (i & CRAM_FREAD1) f |= BAM_FREAD1; |
4919 | 3.77M | if (i & CRAM_FREAD2) f |= BAM_FREAD2; |
4920 | 3.77M | if (i & CRAM_FSECONDARY) f |= BAM_FSECONDARY; |
4921 | 3.77M | if (i & CRAM_FQCFAIL) f |= BAM_FQCFAIL; |
4922 | 3.77M | if (i & CRAM_FDUP) f |= BAM_FDUP; |
4923 | | |
4924 | 3.77M | fd->bam_flag_swap[i] = f; |
4925 | 3.77M | } |
4926 | | |
4927 | 30.2M | for (i = 0; i < 0x1000; i++) { |
4928 | 30.2M | int g = 0; |
4929 | | |
4930 | 30.2M | if (i & BAM_FPAIRED) g |= CRAM_FPAIRED; |
4931 | 30.2M | if (i & BAM_FPROPER_PAIR) g |= CRAM_FPROPER_PAIR; |
4932 | 30.2M | if (i & BAM_FUNMAP) g |= CRAM_FUNMAP; |
4933 | 30.2M | if (i & BAM_FREVERSE) g |= CRAM_FREVERSE; |
4934 | 30.2M | if (i & BAM_FREAD1) g |= CRAM_FREAD1; |
4935 | 30.2M | if (i & BAM_FREAD2) g |= CRAM_FREAD2; |
4936 | 30.2M | if (i & BAM_FSECONDARY) g |= CRAM_FSECONDARY; |
4937 | 30.2M | if (i & BAM_FQCFAIL) g |= CRAM_FQCFAIL; |
4938 | 30.2M | if (i & BAM_FDUP) g |= CRAM_FDUP; |
4939 | | |
4940 | 30.2M | fd->cram_flag_swap[i] = g; |
4941 | 30.2M | } |
4942 | 10.3k | } else { |
4943 | | /* NOP */ |
4944 | 42.4M | for (i = 0; i < 0x1000; i++) |
4945 | 42.4M | fd->bam_flag_swap[i] = i; |
4946 | 42.4M | for (i = 0; i < 0x1000; i++) |
4947 | 42.4M | fd->cram_flag_swap[i] = i; |
4948 | 10.3k | } |
4949 | | |
4950 | 17.7k | memset(fd->cram_sub_matrix, 4, 32*32); |
4951 | 585k | for (i = 0; i < 32; i++) { |
4952 | 567k | fd->cram_sub_matrix[i]['A'&0x1f]=0; |
4953 | 567k | fd->cram_sub_matrix[i]['C'&0x1f]=1; |
4954 | 567k | fd->cram_sub_matrix[i]['G'&0x1f]=2; |
4955 | 567k | fd->cram_sub_matrix[i]['T'&0x1f]=3; |
4956 | 567k | fd->cram_sub_matrix[i]['N'&0x1f]=4; |
4957 | 567k | } |
4958 | 106k | for (i = 0; i < 20; i+=4) { |
4959 | 88.7k | int j; |
4960 | 1.86M | for (j = 0; j < 20; j++) { |
4961 | 1.77M | fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3; |
4962 | 1.77M | fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3; |
4963 | 1.77M | fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3; |
4964 | 1.77M | fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3; |
4965 | 1.77M | } |
4966 | 88.7k | fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+0]&0x1f]=0; |
4967 | 88.7k | fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+1]&0x1f]=1; |
4968 | 88.7k | fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+2]&0x1f]=2; |
4969 | 88.7k | fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+3]&0x1f]=3; |
4970 | 88.7k | } |
4971 | | |
4972 | 17.7k | cram_init_varint(&fd->vv, CRAM_MAJOR_VERS(fd->version)); |
4973 | 17.7k | } |
4974 | | |
4975 | | // Default version numbers for CRAM |
4976 | | static int major_version = 3; |
4977 | | static int minor_version = 1; |
4978 | | |
4979 | | /* |
4980 | | * Opens a CRAM file for read (mode "rb") or write ("wb"). |
4981 | | * The filename may be "-" to indicate stdin or stdout. |
4982 | | * |
4983 | | * Returns file handle on success |
4984 | | * NULL on failure. |
4985 | | */ |
4986 | 0 | cram_fd *cram_open(const char *filename, const char *mode) { |
4987 | 0 | hFILE *fp; |
4988 | 0 | cram_fd *fd; |
4989 | 0 | char fmode[3]= { mode[0], '\0', '\0' }; |
4990 | |
|
4991 | 0 | if (strlen(mode) > 1 && (mode[1] == 'b' || mode[1] == 'c')) { |
4992 | 0 | fmode[1] = 'b'; |
4993 | 0 | } |
4994 | |
|
4995 | 0 | fp = hopen(filename, fmode); |
4996 | 0 | if (!fp) |
4997 | 0 | return NULL; |
4998 | | |
4999 | 0 | fd = cram_dopen(fp, filename, mode); |
5000 | 0 | if (!fd) |
5001 | 0 | hclose_abruptly(fp); |
5002 | |
|
5003 | 0 | return fd; |
5004 | 0 | } |
5005 | | |
5006 | | /* Opens an existing stream for reading or writing. |
5007 | | * |
5008 | | * Returns file handle on success; |
5009 | | * NULL on failure. |
5010 | | */ |
5011 | 17.7k | cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) { |
5012 | 17.7k | int i; |
5013 | 17.7k | char *cp; |
5014 | 17.7k | cram_fd *fd = calloc(1, sizeof(*fd)); |
5015 | 17.7k | if (!fd) |
5016 | 0 | return NULL; |
5017 | | |
5018 | 17.7k | fd->level = CRAM_DEFAULT_LEVEL; |
5019 | 53.2k | for (i = 0; mode[i]; i++) { |
5020 | 35.5k | if (mode[i] >= '0' && mode[i] <= '9') { |
5021 | 0 | fd->level = mode[i] - '0'; |
5022 | 0 | break; |
5023 | 0 | } |
5024 | 35.5k | } |
5025 | | |
5026 | 17.7k | fd->fp = fp; |
5027 | 17.7k | fd->mode = *mode; |
5028 | 17.7k | fd->first_container = 0; |
5029 | 17.7k | fd->curr_position = 0; |
5030 | | |
5031 | 17.7k | if (fd->mode == 'r') { |
5032 | | /* Reader */ |
5033 | | |
5034 | 8.98k | if (!(fd->file_def = cram_read_file_def(fd))) |
5035 | 11 | goto err; |
5036 | | |
5037 | 8.97k | fd->version = fd->file_def->major_version * 256 + |
5038 | 8.97k | fd->file_def->minor_version; |
5039 | | |
5040 | 8.97k | cram_init_tables(fd); |
5041 | | |
5042 | 8.97k | if (!(fd->header = cram_read_SAM_hdr(fd))) { |
5043 | 1.71k | cram_free_file_def(fd->file_def); |
5044 | 1.71k | goto err; |
5045 | 1.71k | } |
5046 | | |
5047 | 8.97k | } else { |
5048 | | /* Writer */ |
5049 | 8.77k | cram_file_def *def = calloc(1, sizeof(*def)); |
5050 | 8.77k | if (!def) |
5051 | 0 | return NULL; |
5052 | | |
5053 | 8.77k | fd->file_def = def; |
5054 | | |
5055 | 8.77k | def->magic[0] = 'C'; |
5056 | 8.77k | def->magic[1] = 'R'; |
5057 | 8.77k | def->magic[2] = 'A'; |
5058 | 8.77k | def->magic[3] = 'M'; |
5059 | 8.77k | def->major_version = 0; // Indicator to write file def later. |
5060 | 8.77k | def->minor_version = 0; |
5061 | 8.77k | memset(def->file_id, 0, 20); |
5062 | 8.77k | strncpy(def->file_id, filename, 20); |
5063 | | |
5064 | 8.77k | fd->version = major_version * 256 + minor_version; |
5065 | 8.77k | cram_init_tables(fd); |
5066 | | |
5067 | | /* SAM header written later along with this file_def */ |
5068 | 8.77k | } |
5069 | | |
5070 | 16.0k | fd->prefix = strdup((cp = strrchr(filename, '/')) ? cp+1 : filename); |
5071 | 16.0k | if (!fd->prefix) |
5072 | 0 | goto err; |
5073 | 16.0k | fd->first_base = fd->last_base = -1; |
5074 | 16.0k | fd->record_counter = 0; |
5075 | | |
5076 | 16.0k | fd->ctr = NULL; |
5077 | 16.0k | fd->ctr_mt = NULL; |
5078 | 16.0k | fd->refs = refs_create(); |
5079 | 16.0k | if (!fd->refs) |
5080 | 0 | goto err; |
5081 | 16.0k | fd->ref_id = -2; |
5082 | 16.0k | fd->ref = NULL; |
5083 | | |
5084 | 16.0k | fd->decode_md = 0; |
5085 | 16.0k | fd->seqs_per_slice = SEQS_PER_SLICE; |
5086 | 16.0k | fd->bases_per_slice = BASES_PER_SLICE; |
5087 | 16.0k | fd->slices_per_container = SLICE_PER_CNT; |
5088 | 16.0k | fd->embed_ref = -1; // automatic selection |
5089 | 16.0k | fd->no_ref = 0; |
5090 | 16.0k | fd->no_ref_counter = 0; |
5091 | 16.0k | fd->ap_delta = 0; |
5092 | 16.0k | fd->ignore_md5 = 0; |
5093 | 16.0k | fd->lossy_read_names = 0; |
5094 | 16.0k | fd->use_bz2 = 0; |
5095 | 16.0k | fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3); |
5096 | 16.0k | fd->use_tok = (CRAM_MAJOR_VERS(fd->version) >= 3) && (CRAM_MINOR_VERS(fd->version) >= 1); |
5097 | 16.0k | fd->use_lzma = 0; |
5098 | 16.0k | fd->multi_seq = -1; |
5099 | 16.0k | fd->multi_seq_user = -1; |
5100 | 16.0k | fd->unsorted = 0; |
5101 | 16.0k | fd->shared_ref = 0; |
5102 | 16.0k | fd->store_md = 0; |
5103 | 16.0k | fd->store_nm = 0; |
5104 | 16.0k | fd->last_RI_count = 0; |
5105 | | |
5106 | 16.0k | fd->index = NULL; |
5107 | 16.0k | fd->own_pool = 0; |
5108 | 16.0k | fd->pool = NULL; |
5109 | 16.0k | fd->rqueue = NULL; |
5110 | 16.0k | fd->job_pending = NULL; |
5111 | 16.0k | fd->ooc = 0; |
5112 | 16.0k | fd->required_fields = INT_MAX; |
5113 | | |
5114 | 16.0k | pthread_mutex_init(&fd->metrics_lock, NULL); |
5115 | 16.0k | pthread_mutex_init(&fd->ref_lock, NULL); |
5116 | 16.0k | pthread_mutex_init(&fd->range_lock, NULL); |
5117 | 16.0k | pthread_mutex_init(&fd->bam_list_lock, NULL); |
5118 | | |
5119 | 769k | for (i = 0; i < DS_END; i++) { |
5120 | 753k | fd->m[i] = cram_new_metrics(); |
5121 | 753k | if (!fd->m[i]) |
5122 | 0 | goto err; |
5123 | 753k | } |
5124 | | |
5125 | 16.0k | if (!(fd->tags_used = kh_init(m_metrics))) |
5126 | 0 | goto err; |
5127 | | |
5128 | 16.0k | fd->range.refid = -2; // no ref. |
5129 | 16.0k | fd->eof = 1; // See samtools issue #150 |
5130 | 16.0k | fd->ref_fn = NULL; |
5131 | | |
5132 | 16.0k | fd->bl = NULL; |
5133 | | |
5134 | | /* Initialise dummy refs from the @SQ headers */ |
5135 | 16.0k | if (-1 == refs_from_header(fd)) |
5136 | 0 | goto err; |
5137 | | |
5138 | 16.0k | return fd; |
5139 | | |
5140 | 1.72k | err: |
5141 | 1.72k | if (fd) |
5142 | 1.72k | free(fd); |
5143 | | |
5144 | 1.72k | return NULL; |
5145 | 16.0k | } |
5146 | | |
5147 | | /* |
5148 | | * Seek within a CRAM file. |
5149 | | * |
5150 | | * Returns 0 on success |
5151 | | * -1 on failure |
5152 | | */ |
5153 | 0 | int cram_seek(cram_fd *fd, off_t offset, int whence) { |
5154 | 0 | fd->ooc = 0; |
5155 | |
|
5156 | 0 | cram_drain_rqueue(fd); |
5157 | |
|
5158 | 0 | return hseek(fd->fp, offset, whence) >= 0 ? 0 : -1; |
5159 | 0 | } |
5160 | | |
5161 | | /* |
5162 | | * Flushes a CRAM file. |
5163 | | * Useful for when writing to stdout without wishing to close the stream. |
5164 | | * |
5165 | | * Returns 0 on success |
5166 | | * -1 on failure |
5167 | | */ |
5168 | 0 | int cram_flush(cram_fd *fd) { |
5169 | 0 | if (!fd) |
5170 | 0 | return -1; |
5171 | | |
5172 | 0 | int ret = 0; |
5173 | |
|
5174 | 0 | if (fd->mode == 'w' && fd->ctr) { |
5175 | 0 | if(fd->ctr->slice) |
5176 | 0 | cram_update_curr_slice(fd->ctr, fd->version); |
5177 | |
|
5178 | 0 | if (-1 == cram_flush_container_mt(fd, fd->ctr)) |
5179 | 0 | ret = -1; |
5180 | |
|
5181 | 0 | cram_free_container(fd->ctr); |
5182 | 0 | if (fd->ctr_mt == fd->ctr) |
5183 | 0 | fd->ctr_mt = NULL; |
5184 | 0 | fd->ctr = NULL; |
5185 | 0 | } |
5186 | |
|
5187 | 0 | return ret; |
5188 | 0 | } |
5189 | | |
5190 | | /* |
5191 | | * Writes an EOF block to a CRAM file. |
5192 | | * |
5193 | | * Returns 0 on success |
5194 | | * -1 on failure |
5195 | | */ |
5196 | 8.46k | int cram_write_eof_block(cram_fd *fd) { |
5197 | | // EOF block is a container with special values to aid detection |
5198 | 8.46k | if (CRAM_MAJOR_VERS(fd->version) >= 2) { |
5199 | | // Empty container with |
5200 | | // ref_seq_id -1 |
5201 | | // start pos 0x454f46 ("EOF") |
5202 | | // span 0 |
5203 | | // nrec 0 |
5204 | | // counter 0 |
5205 | | // nbases 0 |
5206 | | // 1 block (landmark 0) |
5207 | | // (CRC32) |
5208 | 8.46k | cram_container c; |
5209 | 8.46k | memset(&c, 0, sizeof(c)); |
5210 | 8.46k | c.ref_seq_id = -1; |
5211 | 8.46k | c.ref_seq_start = 0x454f46; // "EOF" |
5212 | 8.46k | c.ref_seq_span = 0; |
5213 | 8.46k | c.record_counter = 0; |
5214 | 8.46k | c.num_bases = 0; |
5215 | 8.46k | c.num_blocks = 1; |
5216 | 8.46k | int32_t land[1] = {0}; |
5217 | 8.46k | c.landmark = land; |
5218 | | |
5219 | | // An empty compression header block with |
5220 | | // method raw (0) |
5221 | | // type comp header (1) |
5222 | | // content id 0 |
5223 | | // block contents size 6 |
5224 | | // raw size 6 |
5225 | | // empty preservation map (01 00) |
5226 | | // empty data series map (01 00) |
5227 | | // empty tag map (01 00) |
5228 | | // block CRC |
5229 | 8.46k | cram_block_compression_hdr ch; |
5230 | 8.46k | memset(&ch, 0, sizeof(ch)); |
5231 | 8.46k | c.comp_hdr_block = cram_encode_compression_header(fd, &c, &ch, 0); |
5232 | | |
5233 | 8.46k | c.length = c.comp_hdr_block->byte // Landmark[0] |
5234 | 8.46k | + 5 // block struct |
5235 | 8.46k | + 4*(CRAM_MAJOR_VERS(fd->version) >= 3); // CRC |
5236 | 8.46k | if (cram_write_container(fd, &c) < 0 || |
5237 | 8.46k | cram_write_block(fd, c.comp_hdr_block) < 0) { |
5238 | 0 | cram_close(fd); |
5239 | 0 | cram_free_block(c.comp_hdr_block); |
5240 | 0 | return -1; |
5241 | 0 | } |
5242 | 8.46k | if (ch.preservation_map) |
5243 | 0 | kh_destroy(map, ch.preservation_map); |
5244 | 8.46k | cram_free_block(c.comp_hdr_block); |
5245 | | |
5246 | | // V2.1 bytes |
5247 | | // 0b 00 00 00 ff ff ff ff 0f // Cont HDR: size, ref seq id |
5248 | | // e0 45 4f 46 00 00 00 // Cont HDR: pos, span, nrec, counter |
5249 | | // 00 01 00 // Cont HDR: nbase, nblk, landmark |
5250 | | // 00 01 00 06 06 // Comp.HDR blk |
5251 | | // 01 00 01 00 01 00 // Comp.HDR blk |
5252 | | |
5253 | | // V3.0 bytes: |
5254 | | // 0f 00 00 00 ff ff ff ff 0f // Cont HDR: size, ref seq id |
5255 | | // e0 45 4f 46 00 00 00 // Cont HDR: pos, span, nrec, counter |
5256 | | // 00 01 00 // Cont HDR: nbase, nblk, landmark |
5257 | | // 05 bd d9 4f // CRC32 |
5258 | | // 00 01 00 06 06 // Comp.HDR blk |
5259 | | // 01 00 01 00 01 00 // Comp.HDR blk |
5260 | | // ee 63 01 4b // CRC32 |
5261 | 8.46k | } |
5262 | | |
5263 | 8.46k | return 0; |
5264 | 8.46k | } |
5265 | | |
5266 | | /* |
5267 | | * Closes a CRAM file. |
5268 | | * Returns 0 on success |
5269 | | * -1 on failure |
5270 | | */ |
5271 | 16.0k | int cram_close(cram_fd *fd) { |
5272 | 16.0k | bam_list *bl, *next; |
5273 | 16.0k | int i, ret = 0; |
5274 | | |
5275 | 16.0k | if (!fd) |
5276 | 0 | return -1; |
5277 | | |
5278 | 16.0k | if (fd->mode == 'w' && fd->ctr) { |
5279 | 2.62k | if(fd->ctr->slice) |
5280 | 2.62k | cram_update_curr_slice(fd->ctr, fd->version); |
5281 | | |
5282 | 2.62k | if (-1 == cram_flush_container_mt(fd, fd->ctr)) |
5283 | 304 | ret = -1; |
5284 | 2.62k | } |
5285 | | |
5286 | 16.0k | if (fd->mode != 'w') |
5287 | 7.26k | cram_drain_rqueue(fd); |
5288 | | |
5289 | 16.0k | if (fd->pool && fd->eof >= 0 && fd->rqueue) { |
5290 | 0 | hts_tpool_process_flush(fd->rqueue); |
5291 | |
|
5292 | 0 | if (0 != cram_flush_result(fd)) |
5293 | 0 | ret = -1; |
5294 | |
|
5295 | 0 | if (fd->mode == 'w') |
5296 | 0 | fd->ctr = NULL; // prevent double freeing |
5297 | | |
5298 | | //fprintf(stderr, "CRAM: destroy queue %p\n", fd->rqueue); |
5299 | |
|
5300 | 0 | hts_tpool_process_destroy(fd->rqueue); |
5301 | 0 | } |
5302 | | |
5303 | 16.0k | pthread_mutex_destroy(&fd->metrics_lock); |
5304 | 16.0k | pthread_mutex_destroy(&fd->ref_lock); |
5305 | 16.0k | pthread_mutex_destroy(&fd->range_lock); |
5306 | 16.0k | pthread_mutex_destroy(&fd->bam_list_lock); |
5307 | | |
5308 | 16.0k | if (ret == 0 && fd->mode == 'w') { |
5309 | | /* Write EOF block */ |
5310 | 8.46k | if (0 != cram_write_eof_block(fd)) |
5311 | 0 | ret = -1; |
5312 | 8.46k | } |
5313 | | |
5314 | 18.3k | for (bl = fd->bl; bl; bl = next) { |
5315 | 2.33k | next = bl->next; |
5316 | 2.33k | free_bam_list(bl->bams, bl->nbams); |
5317 | 2.33k | free(bl); |
5318 | 2.33k | } |
5319 | | |
5320 | 16.0k | if (hclose(fd->fp) != 0) |
5321 | 0 | ret = -1; |
5322 | | |
5323 | 16.0k | if (fd->file_def) |
5324 | 16.0k | cram_free_file_def(fd->file_def); |
5325 | | |
5326 | 16.0k | if (fd->header) |
5327 | 12.7k | sam_hdr_destroy(fd->header); |
5328 | | |
5329 | 16.0k | free(fd->prefix); |
5330 | | |
5331 | 16.0k | if (fd->ctr) |
5332 | 7.72k | cram_free_container(fd->ctr); |
5333 | | |
5334 | 16.0k | if (fd->ctr_mt && fd->ctr_mt != fd->ctr) |
5335 | 38 | cram_free_container(fd->ctr_mt); |
5336 | | |
5337 | 16.0k | if (fd->refs) |
5338 | 16.0k | refs_free(fd->refs); |
5339 | 16.0k | if (fd->ref_free) |
5340 | 0 | free(fd->ref_free); |
5341 | | |
5342 | 769k | for (i = 0; i < DS_END; i++) |
5343 | 753k | if (fd->m[i]) |
5344 | 753k | free(fd->m[i]); |
5345 | | |
5346 | 16.0k | if (fd->tags_used) { |
5347 | 16.0k | khint_t k; |
5348 | | |
5349 | 20.4k | for (k = kh_begin(fd->tags_used); k != kh_end(fd->tags_used); k++) { |
5350 | 4.36k | if (kh_exist(fd->tags_used, k)) |
5351 | 1.94k | free(kh_val(fd->tags_used, k)); |
5352 | 4.36k | } |
5353 | | |
5354 | 16.0k | kh_destroy(m_metrics, fd->tags_used); |
5355 | 16.0k | } |
5356 | | |
5357 | 16.0k | if (fd->index) |
5358 | 0 | cram_index_free(fd); |
5359 | | |
5360 | 16.0k | if (fd->own_pool && fd->pool) |
5361 | 0 | hts_tpool_destroy(fd->pool); |
5362 | | |
5363 | 16.0k | if (fd->idxfp) |
5364 | 0 | if (bgzf_close(fd->idxfp) < 0) |
5365 | 0 | ret = -1; |
5366 | | |
5367 | 16.0k | free(fd); |
5368 | | |
5369 | 16.0k | return ret; |
5370 | 16.0k | } |
5371 | | |
5372 | | /* |
5373 | | * Returns 1 if we hit an EOF while reading. |
5374 | | */ |
5375 | 12.7k | int cram_eof(cram_fd *fd) { |
5376 | 12.7k | return fd->eof; |
5377 | 12.7k | } |
5378 | | |
5379 | | |
5380 | | /* |
5381 | | * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h. |
5382 | | * Use this immediately after opening. |
5383 | | * |
5384 | | * Returns 0 on success |
5385 | | * -1 on failure |
5386 | | */ |
5387 | 7.26k | int cram_set_option(cram_fd *fd, enum hts_fmt_option opt, ...) { |
5388 | 7.26k | int r; |
5389 | 7.26k | va_list args; |
5390 | | |
5391 | 7.26k | va_start(args, opt); |
5392 | 7.26k | r = cram_set_voption(fd, opt, args); |
5393 | 7.26k | va_end(args); |
5394 | | |
5395 | 7.26k | return r; |
5396 | 7.26k | } |
5397 | | |
5398 | | /* |
5399 | | * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h. |
5400 | | * Use this immediately after opening. |
5401 | | * |
5402 | | * Returns 0 on success |
5403 | | * -1 on failure |
5404 | | */ |
5405 | 7.26k | int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) { |
5406 | 7.26k | refs_t *refs; |
5407 | | |
5408 | 7.26k | if (!fd) { |
5409 | 0 | errno = EBADF; |
5410 | 0 | return -1; |
5411 | 0 | } |
5412 | | |
5413 | 7.26k | switch (opt) { |
5414 | 7.26k | case CRAM_OPT_DECODE_MD: |
5415 | 7.26k | fd->decode_md = va_arg(args, int); |
5416 | 7.26k | break; |
5417 | | |
5418 | 0 | case CRAM_OPT_PREFIX: |
5419 | 0 | if (fd->prefix) |
5420 | 0 | free(fd->prefix); |
5421 | 0 | if (!(fd->prefix = strdup(va_arg(args, char *)))) |
5422 | 0 | return -1; |
5423 | 0 | break; |
5424 | | |
5425 | 0 | case CRAM_OPT_VERBOSITY: |
5426 | 0 | break; |
5427 | | |
5428 | 0 | case CRAM_OPT_SEQS_PER_SLICE: |
5429 | 0 | fd->seqs_per_slice = va_arg(args, int); |
5430 | 0 | if (fd->bases_per_slice == BASES_PER_SLICE) |
5431 | 0 | fd->bases_per_slice = fd->seqs_per_slice * 500; |
5432 | 0 | break; |
5433 | | |
5434 | 0 | case CRAM_OPT_BASES_PER_SLICE: |
5435 | 0 | fd->bases_per_slice = va_arg(args, int); |
5436 | 0 | break; |
5437 | | |
5438 | 0 | case CRAM_OPT_SLICES_PER_CONTAINER: |
5439 | 0 | fd->slices_per_container = va_arg(args, int); |
5440 | 0 | break; |
5441 | | |
5442 | 0 | case CRAM_OPT_EMBED_REF: |
5443 | 0 | fd->embed_ref = va_arg(args, int); |
5444 | 0 | break; |
5445 | | |
5446 | 0 | case CRAM_OPT_NO_REF: |
5447 | 0 | fd->no_ref = va_arg(args, int); |
5448 | 0 | break; |
5449 | | |
5450 | 0 | case CRAM_OPT_POS_DELTA: |
5451 | 0 | fd->ap_delta = va_arg(args, int); |
5452 | 0 | break; |
5453 | | |
5454 | 0 | case CRAM_OPT_IGNORE_MD5: |
5455 | 0 | fd->ignore_md5 = va_arg(args, int); |
5456 | 0 | break; |
5457 | | |
5458 | 0 | case CRAM_OPT_LOSSY_NAMES: |
5459 | 0 | fd->lossy_read_names = va_arg(args, int); |
5460 | | // Currently lossy read names required paired (attached) reads. |
5461 | | // TLEN 0 or being 1 out causes read pairs to be detached, breaking |
5462 | | // the lossy read name compression, so we have extra options to |
5463 | | // slacken the exact TLEN round-trip checks. |
5464 | 0 | fd->tlen_approx = fd->lossy_read_names; |
5465 | 0 | fd->tlen_zero = fd->lossy_read_names; |
5466 | 0 | break; |
5467 | | |
5468 | 0 | case CRAM_OPT_USE_BZIP2: |
5469 | 0 | fd->use_bz2 = va_arg(args, int); |
5470 | 0 | break; |
5471 | | |
5472 | 0 | case CRAM_OPT_USE_RANS: |
5473 | 0 | fd->use_rans = va_arg(args, int); |
5474 | 0 | break; |
5475 | | |
5476 | 0 | case CRAM_OPT_USE_TOK: |
5477 | 0 | fd->use_tok = va_arg(args, int); |
5478 | 0 | break; |
5479 | | |
5480 | 0 | case CRAM_OPT_USE_FQZ: |
5481 | 0 | fd->use_fqz = va_arg(args, int); |
5482 | 0 | break; |
5483 | | |
5484 | 0 | case CRAM_OPT_USE_ARITH: |
5485 | 0 | fd->use_arith = va_arg(args, int); |
5486 | 0 | break; |
5487 | | |
5488 | 0 | case CRAM_OPT_USE_LZMA: |
5489 | 0 | fd->use_lzma = va_arg(args, int); |
5490 | 0 | break; |
5491 | | |
5492 | 0 | case CRAM_OPT_SHARED_REF: |
5493 | 0 | fd->shared_ref = 1; |
5494 | 0 | refs = va_arg(args, refs_t *); |
5495 | 0 | if (refs != fd->refs) { |
5496 | 0 | if (fd->refs) |
5497 | 0 | refs_free(fd->refs); |
5498 | 0 | fd->refs = refs; |
5499 | 0 | fd->refs->count++; |
5500 | 0 | } |
5501 | 0 | break; |
5502 | | |
5503 | 0 | case CRAM_OPT_RANGE: { |
5504 | 0 | int r = cram_seek_to_refpos(fd, va_arg(args, cram_range *)); |
5505 | 0 | pthread_mutex_lock(&fd->range_lock); |
5506 | | // printf("opt range noseek to %p %d:%ld-%ld\n", |
5507 | | // fd, fd->range.refid, fd->range.start, fd->range.end); |
5508 | 0 | if (fd->range.refid != -2) |
5509 | 0 | fd->required_fields |= SAM_POS; |
5510 | 0 | pthread_mutex_unlock(&fd->range_lock); |
5511 | 0 | return r; |
5512 | 0 | } |
5513 | | |
5514 | 0 | case CRAM_OPT_RANGE_NOSEEK: { |
5515 | | // As per CRAM_OPT_RANGE, but no seeking |
5516 | 0 | pthread_mutex_lock(&fd->range_lock); |
5517 | 0 | cram_range *r = va_arg(args, cram_range *); |
5518 | 0 | fd->range = *r; |
5519 | 0 | if (r->refid == HTS_IDX_NOCOOR) { |
5520 | 0 | fd->range.refid = -1; |
5521 | 0 | fd->range.start = 0; |
5522 | 0 | } else if (r->refid == HTS_IDX_START || r->refid == HTS_IDX_REST) { |
5523 | 0 | fd->range.refid = -2; // special case in cram_next_slice |
5524 | 0 | } |
5525 | 0 | if (fd->range.refid != -2) |
5526 | 0 | fd->required_fields |= SAM_POS; |
5527 | 0 | fd->ooc = 0; |
5528 | 0 | fd->eof = 0; |
5529 | 0 | pthread_mutex_unlock(&fd->range_lock); |
5530 | 0 | return 0; |
5531 | 0 | } |
5532 | | |
5533 | 0 | case CRAM_OPT_REFERENCE: |
5534 | 0 | return cram_load_reference(fd, va_arg(args, char *)); |
5535 | | |
5536 | 0 | case CRAM_OPT_VERSION: { |
5537 | 0 | int major, minor; |
5538 | 0 | char *s = va_arg(args, char *); |
5539 | 0 | if (2 != sscanf(s, "%d.%d", &major, &minor)) { |
5540 | 0 | hts_log_error("Malformed version string %s", s); |
5541 | 0 | return -1; |
5542 | 0 | } |
5543 | 0 | if (!((major == 1 && minor == 0) || |
5544 | 0 | (major == 2 && (minor == 0 || minor == 1)) || |
5545 | 0 | (major == 3 && (minor == 0 || minor == 1)))) { |
5546 | 0 | hts_log_error("Unknown version string; use 1.0, 2.0, 2.1, " |
5547 | 0 | "3.0 or 3.1"); |
5548 | 0 | errno = EINVAL; |
5549 | 0 | return -1; |
5550 | 0 | } |
5551 | | |
5552 | 0 | fd->version = major*256 + minor; |
5553 | |
|
5554 | 0 | fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3) ? 1 : 0; |
5555 | |
|
5556 | 0 | fd->use_tok = (CRAM_MAJOR_VERS(fd->version) == 3 && |
5557 | 0 | CRAM_MINOR_VERS(fd->version) >= 1); |
5558 | 0 | cram_init_tables(fd); |
5559 | |
|
5560 | 0 | break; |
5561 | 0 | } |
5562 | | |
5563 | 0 | case CRAM_OPT_MULTI_SEQ_PER_SLICE: |
5564 | 0 | fd->multi_seq_user = fd->multi_seq = va_arg(args, int); |
5565 | 0 | break; |
5566 | | |
5567 | 0 | case CRAM_OPT_NTHREADS: { |
5568 | 0 | int nthreads = va_arg(args, int); |
5569 | 0 | if (fd->pool) |
5570 | 0 | return -2; //already exists! |
5571 | 0 | if (nthreads >= 1) { |
5572 | 0 | if (!(fd->pool = hts_tpool_init(nthreads))) |
5573 | 0 | return -1; |
5574 | | |
5575 | 0 | fd->rqueue = hts_tpool_process_init(fd->pool, nthreads*2, 0); |
5576 | 0 | fd->shared_ref = 1; |
5577 | 0 | fd->own_pool = 1; |
5578 | 0 | } |
5579 | 0 | break; |
5580 | 0 | } |
5581 | | |
5582 | 0 | case CRAM_OPT_THREAD_POOL: { |
5583 | 0 | htsThreadPool *p = va_arg(args, htsThreadPool *); |
5584 | 0 | if (fd->pool) |
5585 | 0 | return -2; //already exists! |
5586 | 0 | fd->pool = p ? p->pool : NULL; |
5587 | 0 | if (fd->pool) { |
5588 | 0 | fd->rqueue = hts_tpool_process_init(fd->pool, |
5589 | 0 | p->qsize ? p->qsize : hts_tpool_size(fd->pool)*2, |
5590 | 0 | 0); |
5591 | 0 | } |
5592 | 0 | fd->shared_ref = 1; // Needed to avoid clobbering ref between threads |
5593 | 0 | fd->own_pool = 0; |
5594 | | |
5595 | | //fd->qsize = 1; |
5596 | | //fd->decoded = calloc(fd->qsize, sizeof(cram_container *)); |
5597 | | //hts_tpool_dispatch(fd->pool, cram_decoder_thread, fd); |
5598 | 0 | break; |
5599 | 0 | } |
5600 | | |
5601 | 0 | case CRAM_OPT_REQUIRED_FIELDS: |
5602 | 0 | fd->required_fields = va_arg(args, int); |
5603 | 0 | if (fd->range.refid != -2) |
5604 | 0 | fd->required_fields |= SAM_POS; |
5605 | 0 | break; |
5606 | | |
5607 | 0 | case CRAM_OPT_STORE_MD: |
5608 | 0 | fd->store_md = va_arg(args, int); |
5609 | 0 | break; |
5610 | | |
5611 | 0 | case CRAM_OPT_STORE_NM: |
5612 | 0 | fd->store_nm = va_arg(args, int); |
5613 | 0 | break; |
5614 | | |
5615 | 0 | case HTS_OPT_COMPRESSION_LEVEL: |
5616 | 0 | fd->level = va_arg(args, int); |
5617 | 0 | break; |
5618 | | |
5619 | 0 | case HTS_OPT_PROFILE: { |
5620 | 0 | enum hts_profile_option prof = va_arg(args, int); |
5621 | 0 | switch (prof) { |
5622 | 0 | case HTS_PROFILE_FAST: |
5623 | 0 | if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 1; |
5624 | 0 | fd->use_tok = 0; |
5625 | 0 | fd->seqs_per_slice = 10000; |
5626 | 0 | break; |
5627 | | |
5628 | 0 | case HTS_PROFILE_NORMAL: |
5629 | 0 | break; |
5630 | | |
5631 | 0 | case HTS_PROFILE_SMALL: |
5632 | 0 | if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 6; |
5633 | 0 | fd->use_bz2 = 1; |
5634 | 0 | fd->use_fqz = 1; |
5635 | 0 | fd->seqs_per_slice = 25000; |
5636 | 0 | break; |
5637 | | |
5638 | 0 | case HTS_PROFILE_ARCHIVE: |
5639 | 0 | if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 7; |
5640 | 0 | fd->use_bz2 = 1; |
5641 | 0 | fd->use_fqz = 1; |
5642 | 0 | fd->use_arith = 1; |
5643 | 0 | if (fd->level > 7) |
5644 | 0 | fd->use_lzma = 1; |
5645 | 0 | fd->seqs_per_slice = 100000; |
5646 | 0 | break; |
5647 | 0 | } |
5648 | | |
5649 | 0 | if (fd->bases_per_slice == BASES_PER_SLICE) |
5650 | 0 | fd->bases_per_slice = fd->seqs_per_slice * 500; |
5651 | 0 | break; |
5652 | 0 | } |
5653 | | |
5654 | 0 | default: |
5655 | 0 | hts_log_error("Unknown CRAM option code %d", opt); |
5656 | 0 | errno = EINVAL; |
5657 | 0 | return -1; |
5658 | 7.26k | } |
5659 | | |
5660 | 7.26k | return 0; |
5661 | 7.26k | } |
5662 | | |
5663 | | int cram_check_EOF(cram_fd *fd) |
5664 | 0 | { |
5665 | | // Byte 9 in these templates is & with 0x0f to resolve differences |
5666 | | // between ITF-8 interpretations between early Java and C |
5667 | | // implementations of CRAM |
5668 | 0 | static const unsigned char TEMPLATE_2_1[30] = { |
5669 | 0 | 0x0b, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0f, 0xe0, |
5670 | 0 | 0x45, 0x4f, 0x46, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, |
5671 | 0 | 0x01, 0x00, 0x06, 0x06, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00 |
5672 | 0 | }; |
5673 | 0 | static const unsigned char TEMPLATE_3[38] = { |
5674 | 0 | 0x0f, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0f, 0xe0, |
5675 | 0 | 0x45, 0x4f, 0x46, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x05, |
5676 | 0 | 0xbd, 0xd9, 0x4f, 0x00, 0x01, 0x00, 0x06, 0x06, 0x01, 0x00, |
5677 | 0 | 0x01, 0x00, 0x01, 0x00, 0xee, 0x63, 0x01, 0x4b |
5678 | 0 | }; |
5679 | |
|
5680 | 0 | unsigned char buf[38]; // max(sizeof TEMPLATE_*) |
5681 | |
|
5682 | 0 | uint8_t major = CRAM_MAJOR_VERS(fd->version); |
5683 | 0 | uint8_t minor = CRAM_MINOR_VERS(fd->version); |
5684 | |
|
5685 | 0 | const unsigned char *template; |
5686 | 0 | ssize_t template_len; |
5687 | 0 | if ((major < 2) || |
5688 | 0 | (major == 2 && minor == 0)) { |
5689 | 0 | return 3; // No EOF support in cram versions less than 2.1 |
5690 | 0 | } else if (major == 2 && minor == 1) { |
5691 | 0 | template = TEMPLATE_2_1; |
5692 | 0 | template_len = sizeof TEMPLATE_2_1; |
5693 | 0 | } else { |
5694 | 0 | template = TEMPLATE_3; |
5695 | 0 | template_len = sizeof TEMPLATE_3; |
5696 | 0 | } |
5697 | | |
5698 | 0 | off_t offset = htell(fd->fp); |
5699 | 0 | if (hseek(fd->fp, -template_len, SEEK_END) < 0) { |
5700 | 0 | if (errno == ESPIPE) { |
5701 | 0 | hclearerr(fd->fp); |
5702 | 0 | return 2; |
5703 | 0 | } |
5704 | 0 | else { |
5705 | 0 | return -1; |
5706 | 0 | } |
5707 | 0 | } |
5708 | 0 | if (hread(fd->fp, buf, template_len) != template_len) return -1; |
5709 | 0 | if (hseek(fd->fp, offset, SEEK_SET) < 0) return -1; |
5710 | 0 | buf[8] &= 0x0f; |
5711 | 0 | return (memcmp(template, buf, template_len) == 0)? 1 : 0; |
5712 | 0 | } |