/src/htslib/cram/cram_io.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | Copyright (c) 2012-2025 Genome Research Ltd. |
3 | | Author: James Bonfield <jkb@sanger.ac.uk> |
4 | | |
5 | | Redistribution and use in source and binary forms, with or without |
6 | | modification, are permitted provided that the following conditions are met: |
7 | | |
8 | | 1. Redistributions of source code must retain the above copyright notice, |
9 | | this list of conditions and the following disclaimer. |
10 | | |
11 | | 2. Redistributions in binary form must reproduce the above copyright notice, |
12 | | this list of conditions and the following disclaimer in the documentation |
13 | | and/or other materials provided with the distribution. |
14 | | |
15 | | 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger |
16 | | Institute nor the names of its contributors may be used to endorse or promote |
17 | | products derived from this software without specific prior written permission. |
18 | | |
19 | | THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND |
20 | | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
21 | | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
22 | | DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE |
23 | | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
24 | | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
25 | | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
26 | | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
27 | | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
28 | | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
29 | | */ |
30 | | |
31 | | /* |
32 | | * CRAM I/O primitives. |
33 | | * |
34 | | * - ITF8 encoding and decoding. |
35 | | * - Block based I/O |
36 | | * - Zlib inflating and deflating (memory) |
37 | | * - CRAM basic data structure reading and writing |
38 | | * - File opening / closing |
39 | | * - Reference sequence handling |
40 | | */ |
41 | | |
42 | | #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h |
43 | | #include <config.h> |
44 | | |
45 | | #include <stdio.h> |
46 | | #include <errno.h> |
47 | | #include <assert.h> |
48 | | #include <stdlib.h> |
49 | | #include <string.h> |
50 | | #include <unistd.h> |
51 | | #include <zlib.h> |
52 | | #ifdef HAVE_LIBBZ2 |
53 | | #include <bzlib.h> |
54 | | #endif |
55 | | #ifdef HAVE_LIBLZMA |
56 | | #ifdef HAVE_LZMA_H |
57 | | #include <lzma.h> |
58 | | #else |
59 | | #include "../os/lzma_stub.h" |
60 | | #endif |
61 | | #endif |
62 | | #include <sys/types.h> |
63 | | #include <sys/stat.h> |
64 | | #include <math.h> |
65 | | #include <stdint.h> |
66 | | |
67 | | #ifdef HAVE_LIBDEFLATE |
68 | | #include <libdeflate.h> |
69 | | #define crc32(a,b,c) libdeflate_crc32((a),(b),(c)) |
70 | | #endif |
71 | | |
72 | | #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
73 | | #include "../fuzz_settings.h" |
74 | | #endif |
75 | | |
76 | | #include "cram.h" |
77 | | #include "os.h" |
78 | | #include "../htslib/hts.h" |
79 | | #include "../hts_internal.h" |
80 | | #include "open_trace_file.h" |
81 | | |
82 | | #if defined(HAVE_EXTERNAL_LIBHTSCODECS) |
83 | | #include <htscodecs/rANS_static.h> |
84 | | #include <htscodecs/rANS_static4x16.h> |
85 | | #include <htscodecs/arith_dynamic.h> |
86 | | #include <htscodecs/tokenise_name3.h> |
87 | | #include <htscodecs/fqzcomp_qual.h> |
88 | | #include <htscodecs/varint.h> // CRAM v4.0 variable-size integers |
89 | | #else |
90 | | #include "../htscodecs/htscodecs/rANS_static.h" |
91 | | #include "../htscodecs/htscodecs/rANS_static4x16.h" |
92 | | #include "../htscodecs/htscodecs/arith_dynamic.h" |
93 | | #include "../htscodecs/htscodecs/tokenise_name3.h" |
94 | | #include "../htscodecs/htscodecs/fqzcomp_qual.h" |
95 | | #include "../htscodecs/htscodecs/varint.h" |
96 | | #endif |
97 | | |
98 | | //#define REF_DEBUG |
99 | | |
100 | | #ifdef REF_DEBUG |
101 | | #include <sys/syscall.h> |
102 | | #define gettid() (int)syscall(SYS_gettid) |
103 | | |
104 | | #define RP(...) fprintf (stderr, __VA_ARGS__) |
105 | | #else |
106 | | #define RP(...) |
107 | | #endif |
108 | | |
109 | | #include "../htslib/hfile.h" |
110 | | #include "../htslib/bgzf.h" |
111 | | #include "../htslib/faidx.h" |
112 | | #include "../hts_internal.h" |
113 | | |
114 | | #ifndef PATH_MAX |
115 | | #define PATH_MAX FILENAME_MAX |
116 | | #endif |
117 | | |
118 | 908k | #define TRIAL_SPAN 70 |
119 | 908k | #define NTRIALS 3 |
120 | | |
121 | 18.6k | #define CRAM_DEFAULT_LEVEL 5 |
122 | | |
123 | | /* ---------------------------------------------------------------------- |
124 | | * ITF8 encoding and decoding. |
125 | | * |
126 | | * Also see the itf8_get and itf8_put macros in cram_io.h |
127 | | */ |
128 | | |
129 | | /* |
130 | | * LEGACY: consider using itf8_decode_crc. |
131 | | * |
132 | | * Reads an integer in ITF-8 encoding from 'cp' and stores it in |
133 | | * *val. |
134 | | * |
135 | | * Returns the number of bytes read on success |
136 | | * -1 on failure |
137 | | */ |
138 | 0 | int itf8_decode(cram_fd *fd, int32_t *val_p) { |
139 | 0 | static int nbytes[16] = { |
140 | 0 | 0,0,0,0, 0,0,0,0, // 0000xxxx - 0111xxxx |
141 | 0 | 1,1,1,1, // 1000xxxx - 1011xxxx |
142 | 0 | 2,2, // 1100xxxx - 1101xxxx |
143 | 0 | 3, // 1110xxxx |
144 | 0 | 4, // 1111xxxx |
145 | 0 | }; |
146 | |
|
147 | 0 | static int nbits[16] = { |
148 | 0 | 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx |
149 | 0 | 0x3f, 0x3f, 0x3f, 0x3f, // 1000xxxx - 1011xxxx |
150 | 0 | 0x1f, 0x1f, // 1100xxxx - 1101xxxx |
151 | 0 | 0x0f, // 1110xxxx |
152 | 0 | 0x0f, // 1111xxxx |
153 | 0 | }; |
154 | |
|
155 | 0 | int32_t val = hgetc(fd->fp); |
156 | 0 | if (val == -1) |
157 | 0 | return -1; |
158 | | |
159 | 0 | int i = nbytes[val>>4]; |
160 | 0 | val &= nbits[val>>4]; |
161 | |
|
162 | 0 | switch(i) { |
163 | 0 | case 0: |
164 | 0 | *val_p = val; |
165 | 0 | return 1; |
166 | | |
167 | 0 | case 1: |
168 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
169 | 0 | *val_p = val; |
170 | 0 | return 2; |
171 | | |
172 | 0 | case 2: |
173 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
174 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
175 | 0 | *val_p = val; |
176 | 0 | return 3; |
177 | | |
178 | 0 | case 3: |
179 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
180 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
181 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
182 | 0 | *val_p = val; |
183 | 0 | return 4; |
184 | | |
185 | 0 | case 4: // really 3.5 more, why make it different? |
186 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
187 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
188 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
189 | 0 | val = (val<<4) | (((unsigned char)hgetc(fd->fp)) & 0x0f); |
190 | 0 | *val_p = val; |
191 | 0 | } |
192 | | |
193 | 0 | return 5; |
194 | 0 | } |
195 | | |
196 | 242k | int itf8_decode_crc(cram_fd *fd, int32_t *val_p, uint32_t *crc) { |
197 | 242k | static int nbytes[16] = { |
198 | 242k | 0,0,0,0, 0,0,0,0, // 0000xxxx - 0111xxxx |
199 | 242k | 1,1,1,1, // 1000xxxx - 1011xxxx |
200 | 242k | 2,2, // 1100xxxx - 1101xxxx |
201 | 242k | 3, // 1110xxxx |
202 | 242k | 4, // 1111xxxx |
203 | 242k | }; |
204 | | |
205 | 242k | static int nbits[16] = { |
206 | 242k | 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx |
207 | 242k | 0x3f, 0x3f, 0x3f, 0x3f, // 1000xxxx - 1011xxxx |
208 | 242k | 0x1f, 0x1f, // 1100xxxx - 1101xxxx |
209 | 242k | 0x0f, // 1110xxxx |
210 | 242k | 0x0f, // 1111xxxx |
211 | 242k | }; |
212 | 242k | unsigned char c[5]; |
213 | | |
214 | 242k | int32_t val = hgetc(fd->fp); |
215 | 242k | if (val == -1) |
216 | 349 | return -1; |
217 | | |
218 | 242k | c[0]=val; |
219 | | |
220 | 242k | int i = nbytes[val>>4]; |
221 | 242k | val &= nbits[val>>4]; |
222 | | |
223 | 242k | if (i > 0) { |
224 | 16.6k | if (hread(fd->fp, &c[1], i) < i) |
225 | 111 | return -1; |
226 | 16.6k | } |
227 | | |
228 | 242k | switch(i) { |
229 | 225k | case 0: |
230 | 225k | *val_p = val; |
231 | 225k | *crc = crc32(*crc, c, 1); |
232 | 225k | return 1; |
233 | | |
234 | 7.35k | case 1: |
235 | 7.35k | val = (val<<8) | c[1]; |
236 | 7.35k | *val_p = val; |
237 | 7.35k | *crc = crc32(*crc, c, 2); |
238 | 7.35k | return 2; |
239 | | |
240 | 3.72k | case 2: |
241 | 3.72k | val = (val<<8) | c[1]; |
242 | 3.72k | val = (val<<8) | c[2]; |
243 | 3.72k | *val_p = val; |
244 | 3.72k | *crc = crc32(*crc, c, 3); |
245 | 3.72k | return 3; |
246 | | |
247 | 1.15k | case 3: |
248 | 1.15k | val = (val<<8) | c[1]; |
249 | 1.15k | val = (val<<8) | c[2]; |
250 | 1.15k | val = (val<<8) | c[3]; |
251 | 1.15k | *val_p = val; |
252 | 1.15k | *crc = crc32(*crc, c, 4); |
253 | 1.15k | return 4; |
254 | | |
255 | 4.27k | case 4: // really 3.5 more, why make it different? |
256 | 4.27k | { |
257 | 4.27k | uint32_t uv = val; |
258 | 4.27k | uv = (uv<<8) | c[1]; |
259 | 4.27k | uv = (uv<<8) | c[2]; |
260 | 4.27k | uv = (uv<<8) | c[3]; |
261 | 4.27k | uv = (uv<<4) | (c[4] & 0x0f); |
262 | | // Avoid implementation-defined behaviour on negative values |
263 | 4.27k | *val_p = uv < 0x80000000UL ? (int32_t) uv : -((int32_t) (0xffffffffUL - uv)) - 1; |
264 | 4.27k | *crc = crc32(*crc, c, 5); |
265 | 4.27k | } |
266 | 242k | } |
267 | | |
268 | 4.27k | return 5; |
269 | 242k | } |
270 | | |
271 | | /* |
272 | | * Stores a value to memory in ITF-8 format. |
273 | | * |
274 | | * Returns the number of bytes required to store the number. |
275 | | * This is a maximum of 5 bytes. |
276 | | */ |
277 | 11.0M | static inline int itf8_put(char *cp, int32_t val) { |
278 | 11.0M | unsigned char *up = (unsigned char *)cp; |
279 | 11.0M | if (!(val & ~0x00000007f)) { // 1 byte |
280 | 10.1M | *up = val; |
281 | 10.1M | return 1; |
282 | 10.1M | } else if (!(val & ~0x00003fff)) { // 2 byte |
283 | 314k | *up++ = (val >> 8 ) | 0x80; |
284 | 314k | *up = val & 0xff; |
285 | 314k | return 2; |
286 | 583k | } else if (!(val & ~0x01fffff)) { // 3 byte |
287 | 7.67k | *up++ = (val >> 16) | 0xc0; |
288 | 7.67k | *up++ = (val >> 8 ) & 0xff; |
289 | 7.67k | *up = val & 0xff; |
290 | 7.67k | return 3; |
291 | 575k | } else if (!(val & ~0x0fffffff)) { // 4 byte |
292 | 401k | *up++ = (val >> 24) | 0xe0; |
293 | 401k | *up++ = (val >> 16) & 0xff; |
294 | 401k | *up++ = (val >> 8 ) & 0xff; |
295 | 401k | *up = val & 0xff; |
296 | 401k | return 4; |
297 | 401k | } else { // 5 byte |
298 | 174k | *up++ = 0xf0 | ((val>>28) & 0xff); |
299 | 174k | *up++ = (val >> 20) & 0xff; |
300 | 174k | *up++ = (val >> 12) & 0xff; |
301 | 174k | *up++ = (val >> 4 ) & 0xff; |
302 | 174k | *up = val & 0x0f; |
303 | 174k | return 5; |
304 | 174k | } |
305 | 11.0M | } |
306 | | |
307 | | |
308 | | /* 64-bit itf8 variant */ |
309 | 141k | static inline int ltf8_put(char *cp, int64_t val) { |
310 | 141k | unsigned char *up = (unsigned char *)cp; |
311 | 141k | if (!(val & ~((1LL<<7)-1))) { |
312 | 95.2k | *up = val; |
313 | 95.2k | return 1; |
314 | 95.2k | } else if (!(val & ~((1LL<<(6+8))-1))) { |
315 | 45.8k | *up++ = (val >> 8 ) | 0x80; |
316 | 45.8k | *up = val & 0xff; |
317 | 45.8k | return 2; |
318 | 45.8k | } else if (!(val & ~((1LL<<(5+2*8))-1))) { |
319 | 361 | *up++ = (val >> 16) | 0xc0; |
320 | 361 | *up++ = (val >> 8 ) & 0xff; |
321 | 361 | *up = val & 0xff; |
322 | 361 | return 3; |
323 | 361 | } else if (!(val & ~((1LL<<(4+3*8))-1))) { |
324 | 16 | *up++ = (val >> 24) | 0xe0; |
325 | 16 | *up++ = (val >> 16) & 0xff; |
326 | 16 | *up++ = (val >> 8 ) & 0xff; |
327 | 16 | *up = val & 0xff; |
328 | 16 | return 4; |
329 | 16 | } else if (!(val & ~((1LL<<(3+4*8))-1))) { |
330 | 0 | *up++ = (val >> 32) | 0xf0; |
331 | 0 | *up++ = (val >> 24) & 0xff; |
332 | 0 | *up++ = (val >> 16) & 0xff; |
333 | 0 | *up++ = (val >> 8 ) & 0xff; |
334 | 0 | *up = val & 0xff; |
335 | 0 | return 5; |
336 | 0 | } else if (!(val & ~((1LL<<(2+5*8))-1))) { |
337 | 0 | *up++ = (val >> 40) | 0xf8; |
338 | 0 | *up++ = (val >> 32) & 0xff; |
339 | 0 | *up++ = (val >> 24) & 0xff; |
340 | 0 | *up++ = (val >> 16) & 0xff; |
341 | 0 | *up++ = (val >> 8 ) & 0xff; |
342 | 0 | *up = val & 0xff; |
343 | 0 | return 6; |
344 | 0 | } else if (!(val & ~((1LL<<(1+6*8))-1))) { |
345 | 0 | *up++ = (val >> 48) | 0xfc; |
346 | 0 | *up++ = (val >> 40) & 0xff; |
347 | 0 | *up++ = (val >> 32) & 0xff; |
348 | 0 | *up++ = (val >> 24) & 0xff; |
349 | 0 | *up++ = (val >> 16) & 0xff; |
350 | 0 | *up++ = (val >> 8 ) & 0xff; |
351 | 0 | *up = val & 0xff; |
352 | 0 | return 7; |
353 | 0 | } else if (!(val & ~((1LL<<(7*8))-1))) { |
354 | 0 | *up++ = (val >> 56) | 0xfe; |
355 | 0 | *up++ = (val >> 48) & 0xff; |
356 | 0 | *up++ = (val >> 40) & 0xff; |
357 | 0 | *up++ = (val >> 32) & 0xff; |
358 | 0 | *up++ = (val >> 24) & 0xff; |
359 | 0 | *up++ = (val >> 16) & 0xff; |
360 | 0 | *up++ = (val >> 8 ) & 0xff; |
361 | 0 | *up = val & 0xff; |
362 | 0 | return 8; |
363 | 0 | } else { |
364 | 0 | *up++ = 0xff; |
365 | 0 | *up++ = (val >> 56) & 0xff; |
366 | 0 | *up++ = (val >> 48) & 0xff; |
367 | 0 | *up++ = (val >> 40) & 0xff; |
368 | 0 | *up++ = (val >> 32) & 0xff; |
369 | 0 | *up++ = (val >> 24) & 0xff; |
370 | 0 | *up++ = (val >> 16) & 0xff; |
371 | 0 | *up++ = (val >> 8 ) & 0xff; |
372 | 0 | *up = val & 0xff; |
373 | 0 | return 9; |
374 | 0 | } |
375 | 141k | } |
376 | | |
377 | | /* |
378 | | * Encodes and writes a single integer in ITF-8 format. |
379 | | * Returns 0 on success |
380 | | * -1 on failure |
381 | | */ |
382 | 0 | int itf8_encode(cram_fd *fd, int32_t val) { |
383 | 0 | char buf[5]; |
384 | 0 | int len = itf8_put(buf, val); |
385 | 0 | return hwrite(fd->fp, buf, len) == len ? 0 : -1; |
386 | 0 | } |
387 | | |
388 | | const int itf8_bytes[16] = { |
389 | | 1, 1, 1, 1, 1, 1, 1, 1, |
390 | | 2, 2, 2, 2, 3, 3, 4, 5 |
391 | | }; |
392 | | |
393 | | const int ltf8_bytes[256] = { |
394 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
395 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
396 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
397 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
398 | | |
399 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
400 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
401 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
402 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
403 | | |
404 | | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
405 | | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
406 | | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
407 | | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
408 | | |
409 | | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, |
410 | | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, |
411 | | |
412 | | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, |
413 | | |
414 | | 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 8, 9 |
415 | | }; |
416 | | |
417 | | /* |
418 | | * LEGACY: consider using ltf8_decode_crc. |
419 | | */ |
420 | 0 | int ltf8_decode(cram_fd *fd, int64_t *val_p) { |
421 | 0 | int c = hgetc(fd->fp); |
422 | 0 | int64_t val = (unsigned char)c; |
423 | 0 | if (c == -1) |
424 | 0 | return -1; |
425 | | |
426 | 0 | if (val < 0x80) { |
427 | 0 | *val_p = val; |
428 | 0 | return 1; |
429 | |
|
430 | 0 | } else if (val < 0xc0) { |
431 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
432 | 0 | *val_p = val & (((1LL<<(6+8)))-1); |
433 | 0 | return 2; |
434 | |
|
435 | 0 | } else if (val < 0xe0) { |
436 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
437 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
438 | 0 | *val_p = val & ((1LL<<(5+2*8))-1); |
439 | 0 | return 3; |
440 | |
|
441 | 0 | } else if (val < 0xf0) { |
442 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
443 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
444 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
445 | 0 | *val_p = val & ((1LL<<(4+3*8))-1); |
446 | 0 | return 4; |
447 | |
|
448 | 0 | } else if (val < 0xf8) { |
449 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
450 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
451 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
452 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
453 | 0 | *val_p = val & ((1LL<<(3+4*8))-1); |
454 | 0 | return 5; |
455 | |
|
456 | 0 | } else if (val < 0xfc) { |
457 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
458 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
459 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
460 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
461 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
462 | 0 | *val_p = val & ((1LL<<(2+5*8))-1); |
463 | 0 | return 6; |
464 | |
|
465 | 0 | } else if (val < 0xfe) { |
466 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
467 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
468 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
469 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
470 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
471 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
472 | 0 | *val_p = val & ((1LL<<(1+6*8))-1); |
473 | 0 | return 7; |
474 | |
|
475 | 0 | } else if (val < 0xff) { |
476 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
477 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
478 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
479 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
480 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
481 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
482 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
483 | 0 | *val_p = val & ((1LL<<(7*8))-1); |
484 | 0 | return 8; |
485 | |
|
486 | 0 | } else { |
487 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
488 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
489 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
490 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
491 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
492 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
493 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
494 | 0 | val = (val<<8) | (unsigned char)hgetc(fd->fp); |
495 | 0 | *val_p = val; |
496 | 0 | } |
497 | | |
498 | 0 | return 9; |
499 | 0 | } |
500 | | |
501 | 3.55k | int ltf8_decode_crc(cram_fd *fd, int64_t *val_p, uint32_t *crc) { |
502 | 3.55k | unsigned char c[9]; |
503 | 3.55k | int64_t val = hgetc(fd->fp); |
504 | 3.55k | if (val < 0) |
505 | 19 | return -1; |
506 | | |
507 | 3.54k | c[0] = val; |
508 | | |
509 | 3.54k | if (val < 0x80) { |
510 | 2.60k | *val_p = val; |
511 | 2.60k | *crc = crc32(*crc, c, 1); |
512 | 2.60k | return 1; |
513 | | |
514 | 2.60k | } else if (val < 0xc0) { |
515 | 128 | int v = hgetc(fd->fp); |
516 | 128 | if (v < 0) |
517 | 5 | return -1; |
518 | 123 | val = (val<<8) | (c[1]=v); |
519 | 123 | *val_p = val & (((1LL<<(6+8)))-1); |
520 | 123 | *crc = crc32(*crc, c, 2); |
521 | 123 | return 2; |
522 | | |
523 | 805 | } else if (val < 0xe0) { |
524 | 38 | if (hread(fd->fp, &c[1], 2) < 2) |
525 | 6 | return -1; |
526 | 32 | val = (val<<8) | c[1]; |
527 | 32 | val = (val<<8) | c[2]; |
528 | 32 | *val_p = val & ((1LL<<(5+2*8))-1); |
529 | 32 | *crc = crc32(*crc, c, 3); |
530 | 32 | return 3; |
531 | | |
532 | 767 | } else if (val < 0xf0) { |
533 | 45 | if (hread(fd->fp, &c[1], 3) < 3) |
534 | 5 | return -1; |
535 | 40 | val = (val<<8) | c[1]; |
536 | 40 | val = (val<<8) | c[2]; |
537 | 40 | val = (val<<8) | c[3]; |
538 | 40 | *val_p = val & ((1LL<<(4+3*8))-1); |
539 | 40 | *crc = crc32(*crc, c, 4); |
540 | 40 | return 4; |
541 | | |
542 | 722 | } else if (val < 0xf8) { |
543 | 145 | if (hread(fd->fp, &c[1], 4) < 4) |
544 | 2 | return -1; |
545 | 143 | val = (val<<8) | c[1]; |
546 | 143 | val = (val<<8) | c[2]; |
547 | 143 | val = (val<<8) | c[3]; |
548 | 143 | val = (val<<8) | c[4]; |
549 | 143 | *val_p = val & ((1LL<<(3+4*8))-1); |
550 | 143 | *crc = crc32(*crc, c, 5); |
551 | 143 | return 5; |
552 | | |
553 | 577 | } else if (val < 0xfc) { |
554 | 356 | if (hread(fd->fp, &c[1], 5) < 5) |
555 | 8 | return -1; |
556 | 348 | val = (val<<8) | c[1]; |
557 | 348 | val = (val<<8) | c[2]; |
558 | 348 | val = (val<<8) | c[3]; |
559 | 348 | val = (val<<8) | c[4]; |
560 | 348 | val = (val<<8) | c[5]; |
561 | 348 | *val_p = val & ((1LL<<(2+5*8))-1); |
562 | 348 | *crc = crc32(*crc, c, 6); |
563 | 348 | return 6; |
564 | | |
565 | 356 | } else if (val < 0xfe) { |
566 | 75 | if (hread(fd->fp, &c[1], 6) < 6) |
567 | 5 | return -1; |
568 | 70 | val = (val<<8) | c[1]; |
569 | 70 | val = (val<<8) | c[2]; |
570 | 70 | val = (val<<8) | c[3]; |
571 | 70 | val = (val<<8) | c[4]; |
572 | 70 | val = (val<<8) | c[5]; |
573 | 70 | val = (val<<8) | c[6]; |
574 | 70 | *val_p = val & ((1LL<<(1+6*8))-1); |
575 | 70 | *crc = crc32(*crc, c, 7); |
576 | 70 | return 7; |
577 | | |
578 | 146 | } else if (val < 0xff) { |
579 | 90 | uint64_t uval = val; |
580 | 90 | if (hread(fd->fp, &c[1], 7) < 7) |
581 | 1 | return -1; |
582 | 89 | uval = (uval<<8) | c[1]; |
583 | 89 | uval = (uval<<8) | c[2]; |
584 | 89 | uval = (uval<<8) | c[3]; |
585 | 89 | uval = (uval<<8) | c[4]; |
586 | 89 | uval = (uval<<8) | c[5]; |
587 | 89 | uval = (uval<<8) | c[6]; |
588 | 89 | uval = (uval<<8) | c[7]; |
589 | 89 | *val_p = uval & ((1ULL<<(7*8))-1); |
590 | 89 | *crc = crc32(*crc, c, 8); |
591 | 89 | return 8; |
592 | | |
593 | 90 | } else { |
594 | 56 | uint64_t uval; |
595 | 56 | if (hread(fd->fp, &c[1], 8) < 8) |
596 | 4 | return -1; |
597 | 52 | uval = c[1]; |
598 | 52 | uval = (uval<<8) | c[2]; |
599 | 52 | uval = (uval<<8) | c[3]; |
600 | 52 | uval = (uval<<8) | c[4]; |
601 | 52 | uval = (uval<<8) | c[5]; |
602 | 52 | uval = (uval<<8) | c[6]; |
603 | 52 | uval = (uval<<8) | c[7]; |
604 | 52 | uval = (uval<<8) | c[8]; |
605 | 52 | *crc = crc32(*crc, c, 9); |
606 | | // Avoid implementation-defined behaviour on negative values |
607 | 52 | *val_p = c[1] < 0x80 ? (int64_t) uval : -((int64_t) (0xffffffffffffffffULL - uval)) - 1; |
608 | 52 | } |
609 | | |
610 | 52 | return 9; |
611 | 3.54k | } |
612 | | |
613 | | /* |
614 | | * Pushes a value in ITF8 format onto the end of a block. |
615 | | * This shouldn't be used for high-volume data as it is not the fastest |
616 | | * method. |
617 | | * |
618 | | * Returns the number of bytes written |
619 | | */ |
620 | 5.95M | int itf8_put_blk(cram_block *blk, int32_t val) { |
621 | 5.95M | char buf[5]; |
622 | 5.95M | int sz; |
623 | | |
624 | 5.95M | sz = itf8_put(buf, val); |
625 | 5.95M | BLOCK_APPEND(blk, buf, sz); |
626 | 5.95M | return sz; |
627 | | |
628 | 0 | block_err: |
629 | 0 | return -1; |
630 | 5.95M | } |
631 | | |
632 | 0 | int ltf8_put_blk(cram_block *blk, int64_t val) { |
633 | 0 | char buf[9]; |
634 | 0 | int sz; |
635 | |
|
636 | 0 | sz = ltf8_put(buf, val); |
637 | 0 | BLOCK_APPEND(blk, buf, sz); |
638 | 0 | return sz; |
639 | | |
640 | 0 | block_err: |
641 | 0 | return -1; |
642 | 0 | } |
643 | | |
644 | 193k | static int64_t safe_itf8_get(char **cp, const char *endp, int *err) { |
645 | 193k | const unsigned char *up = (unsigned char *)*cp; |
646 | | |
647 | 193k | if (endp && endp - *cp < 5 && |
648 | 193k | (*cp >= endp || endp - *cp < itf8_bytes[up[0]>>4])) { |
649 | 14.8k | if (err) *err = 1; |
650 | 14.8k | return 0; |
651 | 14.8k | } |
652 | | |
653 | 178k | if (up[0] < 0x80) { |
654 | 159k | (*cp)++; |
655 | 159k | return up[0]; |
656 | 159k | } else if (up[0] < 0xc0) { |
657 | 8.81k | (*cp)+=2; |
658 | 8.81k | return ((up[0] <<8) | up[1]) & 0x3fff; |
659 | 9.90k | } else if (up[0] < 0xe0) { |
660 | 2.00k | (*cp)+=3; |
661 | 2.00k | return ((up[0]<<16) | (up[1]<< 8) | up[2]) & 0x1fffff; |
662 | 7.90k | } else if (up[0] < 0xf0) { |
663 | 2.58k | (*cp)+=4; |
664 | 2.58k | uint32_t uv = (((uint32_t)up[0]<<24) | (up[1]<<16) | (up[2]<<8) | up[3]) & 0x0fffffff; |
665 | 2.58k | return (int32_t)uv; |
666 | 5.31k | } else { |
667 | 5.31k | (*cp)+=5; |
668 | 5.31k | uint32_t uv = (((uint32_t)up[0] & 0x0f)<<28) | (up[1]<<20) | (up[2]<<12) | (up[3]<<4) | (up[4] & 0x0f); |
669 | 5.31k | return (int32_t)uv; |
670 | 5.31k | } |
671 | 178k | } |
672 | | |
673 | 807 | static int64_t safe_ltf8_get(char **cp, const char *endp, int *err) { |
674 | 807 | unsigned char *up = (unsigned char *)*cp; |
675 | | |
676 | 807 | if (endp && endp - *cp < 9 && |
677 | 807 | (*cp >= endp || endp - *cp < ltf8_bytes[up[0]])) { |
678 | 327 | if (err) *err = 1; |
679 | 327 | return 0; |
680 | 327 | } |
681 | | |
682 | 480 | if (up[0] < 0x80) { |
683 | 237 | (*cp)++; |
684 | 237 | return up[0]; |
685 | 243 | } else if (up[0] < 0xc0) { |
686 | 15 | (*cp)+=2; |
687 | 15 | return (((uint64_t)up[0]<< 8) | |
688 | 15 | (uint64_t)up[1]) & (((1LL<<(6+8)))-1); |
689 | 228 | } else if (up[0] < 0xe0) { |
690 | 42 | (*cp)+=3; |
691 | 42 | return (((uint64_t)up[0]<<16) | |
692 | 42 | ((uint64_t)up[1]<< 8) | |
693 | 42 | (uint64_t)up[2]) & ((1LL<<(5+2*8))-1); |
694 | 186 | } else if (up[0] < 0xf0) { |
695 | 36 | (*cp)+=4; |
696 | 36 | return (((uint64_t)up[0]<<24) | |
697 | 36 | ((uint64_t)up[1]<<16) | |
698 | 36 | ((uint64_t)up[2]<< 8) | |
699 | 36 | (uint64_t)up[3]) & ((1LL<<(4+3*8))-1); |
700 | 150 | } else if (up[0] < 0xf8) { |
701 | 75 | (*cp)+=5; |
702 | 75 | return (((uint64_t)up[0]<<32) | |
703 | 75 | ((uint64_t)up[1]<<24) | |
704 | 75 | ((uint64_t)up[2]<<16) | |
705 | 75 | ((uint64_t)up[3]<< 8) | |
706 | 75 | (uint64_t)up[4]) & ((1LL<<(3+4*8))-1); |
707 | 75 | } else if (up[0] < 0xfc) { |
708 | 3 | (*cp)+=6; |
709 | 3 | return (((uint64_t)up[0]<<40) | |
710 | 3 | ((uint64_t)up[1]<<32) | |
711 | 3 | ((uint64_t)up[2]<<24) | |
712 | 3 | ((uint64_t)up[3]<<16) | |
713 | 3 | ((uint64_t)up[4]<< 8) | |
714 | 3 | (uint64_t)up[5]) & ((1LL<<(2+5*8))-1); |
715 | 72 | } else if (up[0] < 0xfe) { |
716 | 6 | (*cp)+=7; |
717 | 6 | return (((uint64_t)up[0]<<48) | |
718 | 6 | ((uint64_t)up[1]<<40) | |
719 | 6 | ((uint64_t)up[2]<<32) | |
720 | 6 | ((uint64_t)up[3]<<24) | |
721 | 6 | ((uint64_t)up[4]<<16) | |
722 | 6 | ((uint64_t)up[5]<< 8) | |
723 | 6 | (uint64_t)up[6]) & ((1LL<<(1+6*8))-1); |
724 | 66 | } else if (up[0] < 0xff) { |
725 | 3 | (*cp)+=8; |
726 | 3 | return (((uint64_t)up[1]<<48) | |
727 | 3 | ((uint64_t)up[2]<<40) | |
728 | 3 | ((uint64_t)up[3]<<32) | |
729 | 3 | ((uint64_t)up[4]<<24) | |
730 | 3 | ((uint64_t)up[5]<<16) | |
731 | 3 | ((uint64_t)up[6]<< 8) | |
732 | 3 | (uint64_t)up[7]) & ((1LL<<(7*8))-1); |
733 | 63 | } else { |
734 | 63 | (*cp)+=9; |
735 | 63 | return (((uint64_t)up[1]<<56) | |
736 | 63 | ((uint64_t)up[2]<<48) | |
737 | 63 | ((uint64_t)up[3]<<40) | |
738 | 63 | ((uint64_t)up[4]<<32) | |
739 | 63 | ((uint64_t)up[5]<<24) | |
740 | 63 | ((uint64_t)up[6]<<16) | |
741 | 63 | ((uint64_t)up[7]<< 8) | |
742 | 63 | (uint64_t)up[8]); |
743 | 63 | } |
744 | 480 | } |
745 | | |
746 | | // Wrapper for now |
747 | 5.13M | static int safe_itf8_put(char *cp, char *cp_end, int32_t val) { |
748 | 5.13M | return itf8_put(cp, val); |
749 | 5.13M | } |
750 | | |
751 | 141k | static int safe_ltf8_put(char *cp, char *cp_end, int64_t val) { |
752 | 141k | return ltf8_put(cp, val); |
753 | 141k | } |
754 | | |
755 | 1.09M | static int itf8_size(int64_t v) { |
756 | 1.09M | return ((!((v)&~0x7f))?1:(!((v)&~0x3fff))?2:(!((v)&~0x1fffff))?3:(!((v)&~0xfffffff))?4:5); |
757 | 1.09M | } |
758 | | |
759 | | //----------------------------------------------------------------------------- |
760 | | |
761 | | // CRAM v4.0 onwards uses a different variable sized integer encoding |
762 | | // that is size agnostic. |
763 | | |
764 | | // Local interface to varint.h inline version, so we can use in func ptr. |
765 | | // Note a lot of these use the unsigned interface but take signed int64_t. |
766 | | // This is because the old CRAM ITF8 inteface had signed -1 as unsigned |
767 | | // 0xffffffff. |
768 | 6.18k | static int uint7_size(int64_t v) { |
769 | 6.18k | return var_size_u64(v); |
770 | 6.18k | } |
771 | | |
772 | 2.16k | static int64_t uint7_get_32(char **cp, const char *endp, int *err) { |
773 | 2.16k | uint32_t val; |
774 | 2.16k | int nb = var_get_u32((uint8_t *)(*cp), (const uint8_t *)endp, &val); |
775 | 2.16k | (*cp) += nb; |
776 | 2.16k | if (!nb && err) *err = 1; |
777 | 2.16k | return val; |
778 | 2.16k | } |
779 | | |
780 | 0 | static int64_t sint7_get_32(char **cp, const char *endp, int *err) { |
781 | 0 | int32_t val; |
782 | 0 | int nb = var_get_s32((uint8_t *)(*cp), (const uint8_t *)endp, &val); |
783 | 0 | (*cp) += nb; |
784 | 0 | if (!nb && err) *err = 1; |
785 | 0 | return val; |
786 | 0 | } |
787 | | |
788 | 12.6M | static int64_t uint7_get_64(char **cp, const char *endp, int *err) { |
789 | 12.6M | uint64_t val; |
790 | 12.6M | int nb = var_get_u64((uint8_t *)(*cp), (const uint8_t *)endp, &val); |
791 | 12.6M | (*cp) += nb; |
792 | 12.6M | if (!nb && err) *err = 1; |
793 | 12.6M | return val; |
794 | 12.6M | } |
795 | | |
796 | 27 | static int64_t sint7_get_64(char **cp, const char *endp, int *err) { |
797 | 27 | int64_t val; |
798 | 27 | int nb = var_get_s64((uint8_t *)(*cp), (const uint8_t *)endp, &val); |
799 | 27 | (*cp) += nb; |
800 | 27 | if (!nb && err) *err = 1; |
801 | 27 | return val; |
802 | 27 | } |
803 | | |
804 | 0 | static int uint7_put_32(char *cp, char *endp, int32_t val) { |
805 | 0 | return var_put_u32((uint8_t *)cp, (uint8_t *)endp, val); |
806 | 0 | } |
807 | | |
808 | 0 | static int sint7_put_32(char *cp, char *endp, int32_t val) { |
809 | 0 | return var_put_s32((uint8_t *)cp, (uint8_t *)endp, val); |
810 | 0 | } |
811 | | |
812 | 0 | static int uint7_put_64(char *cp, char *endp, int64_t val) { |
813 | 0 | return var_put_u64((uint8_t *)cp, (uint8_t *)endp, val); |
814 | 0 | } |
815 | | |
816 | 0 | static int sint7_put_64(char *cp, char *endp, int64_t val) { |
817 | 0 | return var_put_s64((uint8_t *)cp, (uint8_t *)endp, val); |
818 | 0 | } |
819 | | |
820 | | // Put direct to to cram_block |
821 | 0 | static int uint7_put_blk_32(cram_block *blk, int32_t v) { |
822 | 0 | uint8_t buf[10]; |
823 | 0 | int sz = var_put_u32(buf, buf+10, v); |
824 | 0 | BLOCK_APPEND(blk, buf, sz); |
825 | 0 | return sz; |
826 | | |
827 | 0 | block_err: |
828 | 0 | return -1; |
829 | 0 | } |
830 | | |
831 | 0 | static int sint7_put_blk_32(cram_block *blk, int32_t v) { |
832 | 0 | uint8_t buf[10]; |
833 | 0 | int sz = var_put_s32(buf, buf+10, v); |
834 | 0 | BLOCK_APPEND(blk, buf, sz); |
835 | 0 | return sz; |
836 | | |
837 | 0 | block_err: |
838 | 0 | return -1; |
839 | 0 | } |
840 | | |
841 | 0 | static int uint7_put_blk_64(cram_block *blk, int64_t v) { |
842 | 0 | uint8_t buf[10]; |
843 | 0 | int sz = var_put_u64(buf, buf+10, v); |
844 | 0 | BLOCK_APPEND(blk, buf, sz); |
845 | 0 | return sz; |
846 | | |
847 | 0 | block_err: |
848 | 0 | return -1; |
849 | 0 | } |
850 | | |
851 | 0 | static int sint7_put_blk_64(cram_block *blk, int64_t v) { |
852 | 0 | uint8_t buf[10]; |
853 | 0 | int sz = var_put_s64(buf, buf+10, v); |
854 | 0 | BLOCK_APPEND(blk, buf, sz); |
855 | 0 | return sz; |
856 | | |
857 | 0 | block_err: |
858 | 0 | return -1; |
859 | 0 | } |
860 | | |
861 | | // Decode 32-bits with CRC update from cram_fd |
862 | 23.9k | static int uint7_decode_crc32(cram_fd *fd, int32_t *val_p, uint32_t *crc) { |
863 | 23.9k | uint8_t b[5], i = 0; |
864 | 23.9k | int c; |
865 | 23.9k | uint32_t v = 0; |
866 | | |
867 | | #ifdef VARINT2 |
868 | | b[0] = hgetc(fd->fp); |
869 | | if (b[0] < 177) { |
870 | | } else if (b[0] < 241) { |
871 | | b[1] = hgetc(fd->fp); |
872 | | } else if (b[0] < 249) { |
873 | | b[1] = hgetc(fd->fp); |
874 | | b[2] = hgetc(fd->fp); |
875 | | } else { |
876 | | int n = b[0]+2, z = 1; |
877 | | while (n-- >= 249) |
878 | | b[z++] = hgetc(fd->fp); |
879 | | } |
880 | | i = var_get_u32(b, NULL, &v); |
881 | | #else |
882 | | // // Little endian |
883 | | // int s = 0; |
884 | | // do { |
885 | | // b[i++] = c = hgetc(fd->fp); |
886 | | // if (c < 0) |
887 | | // return -1; |
888 | | // v |= (c & 0x7f) << s; |
889 | | // s += 7; |
890 | | // } while (i < 5 && (c & 0x80)); |
891 | | |
892 | | // Big endian, see also htscodecs/varint.h |
893 | 30.7k | do { |
894 | 30.7k | b[i++] = c = hgetc(fd->fp); |
895 | 30.7k | if (c < 0) |
896 | 92 | return -1; |
897 | 30.6k | v = (v<<7) | (c & 0x7f); |
898 | 30.6k | } while (i < 5 && (c & 0x80)); |
899 | 23.8k | #endif |
900 | 23.8k | *crc = crc32(*crc, b, i); |
901 | | |
902 | 23.8k | *val_p = v; |
903 | 23.8k | return i; |
904 | 23.9k | } |
905 | | |
906 | | // Decode 32-bits with CRC update from cram_fd |
907 | 2.70k | static int sint7_decode_crc32(cram_fd *fd, int32_t *val_p, uint32_t *crc) { |
908 | 2.70k | uint8_t b[5], i = 0; |
909 | 2.70k | int c; |
910 | 2.70k | uint32_t v = 0; |
911 | | |
912 | | #ifdef VARINT2 |
913 | | b[0] = hgetc(fd->fp); |
914 | | if (b[0] < 177) { |
915 | | } else if (b[0] < 241) { |
916 | | b[1] = hgetc(fd->fp); |
917 | | } else if (b[0] < 249) { |
918 | | b[1] = hgetc(fd->fp); |
919 | | b[2] = hgetc(fd->fp); |
920 | | } else { |
921 | | int n = b[0]+2, z = 1; |
922 | | while (n-- >= 249) |
923 | | b[z++] = hgetc(fd->fp); |
924 | | } |
925 | | i = var_get_u32(b, NULL, &v); |
926 | | #else |
927 | | // // Little endian |
928 | | // int s = 0; |
929 | | // do { |
930 | | // b[i++] = c = hgetc(fd->fp); |
931 | | // if (c < 0) |
932 | | // return -1; |
933 | | // v |= (c & 0x7f) << s; |
934 | | // s += 7; |
935 | | // } while (i < 5 && (c & 0x80)); |
936 | | |
937 | | // Big endian, see also htscodecs/varint.h |
938 | 4.09k | do { |
939 | 4.09k | b[i++] = c = hgetc(fd->fp); |
940 | 4.09k | if (c < 0) |
941 | 12 | return -1; |
942 | 4.07k | v = (v<<7) | (c & 0x7f); |
943 | 4.07k | } while (i < 5 && (c & 0x80)); |
944 | 2.68k | #endif |
945 | 2.68k | *crc = crc32(*crc, b, i); |
946 | | |
947 | 2.68k | *val_p = (v>>1) ^ -(v&1); |
948 | 2.68k | return i; |
949 | 2.70k | } |
950 | | |
951 | | |
952 | | // Decode 64-bits with CRC update from cram_fd |
953 | 10.6k | static int uint7_decode_crc64(cram_fd *fd, int64_t *val_p, uint32_t *crc) { |
954 | 10.6k | uint8_t b[10], i = 0; |
955 | 10.6k | int c; |
956 | 10.6k | uint64_t v = 0; |
957 | | |
958 | | #ifdef VARINT2 |
959 | | b[0] = hgetc(fd->fp); |
960 | | if (b[0] < 177) { |
961 | | } else if (b[0] < 241) { |
962 | | b[1] = hgetc(fd->fp); |
963 | | } else if (b[0] < 249) { |
964 | | b[1] = hgetc(fd->fp); |
965 | | b[2] = hgetc(fd->fp); |
966 | | } else { |
967 | | int n = b[0]+2, z = 1; |
968 | | while (n-- >= 249) |
969 | | b[z++] = hgetc(fd->fp); |
970 | | } |
971 | | i = var_get_u64(b, NULL, &v); |
972 | | #else |
973 | | // // Little endian |
974 | | // int s = 0; |
975 | | // do { |
976 | | // b[i++] = c = hgetc(fd->fp); |
977 | | // if (c < 0) |
978 | | // return -1; |
979 | | // v |= (c & 0x7f) << s; |
980 | | // s += 7; |
981 | | // } while (i < 10 && (c & 0x80)); |
982 | | |
983 | | // Big endian, see also htscodecs/varint.h |
984 | 14.4k | do { |
985 | 14.4k | b[i++] = c = hgetc(fd->fp); |
986 | 14.4k | if (c < 0) |
987 | 39 | return -1; |
988 | 14.4k | v = (v<<7) | (c & 0x7f); |
989 | 14.4k | } while (i < 5 && (c & 0x80)); |
990 | 10.6k | #endif |
991 | 10.6k | *crc = crc32(*crc, b, i); |
992 | | |
993 | 10.6k | *val_p = v; |
994 | 10.6k | return i; |
995 | 10.6k | } |
996 | | |
997 | | //----------------------------------------------------------------------------- |
998 | | |
999 | | /* |
1000 | | * Decodes a 32-bit little endian value from fd and stores in val. |
1001 | | * |
1002 | | * Returns the number of bytes read on success |
1003 | | * -1 on failure |
1004 | | */ |
1005 | 16.0k | static int int32_decode(cram_fd *fd, int32_t *val) { |
1006 | 16.0k | int32_t i; |
1007 | 16.0k | if (4 != hread(fd->fp, &i, 4)) |
1008 | 42 | return -1; |
1009 | | |
1010 | 16.0k | *val = le_int4(i); |
1011 | 16.0k | return 4; |
1012 | 16.0k | } |
1013 | | |
1014 | | /* |
1015 | | * Encodes a 32-bit little endian value 'val' and writes to fd. |
1016 | | * |
1017 | | * Returns the number of bytes written on success |
1018 | | * -1 on failure |
1019 | | */ |
1020 | 286k | static int int32_encode(cram_fd *fd, int32_t val) { |
1021 | 286k | uint32_t v = le_int4(val); |
1022 | 286k | if (4 != hwrite(fd->fp, &v, 4)) |
1023 | 0 | return -1; |
1024 | | |
1025 | 286k | return 4; |
1026 | 286k | } |
1027 | | |
1028 | | /* As int32_decoded/encode, but from/to blocks instead of cram_fd */ |
1029 | 655 | int int32_get_blk(cram_block *b, int32_t *val) { |
1030 | 655 | if (b->uncomp_size - BLOCK_SIZE(b) < 4) |
1031 | 75 | return -1; |
1032 | | |
1033 | 580 | uint32_t v = |
1034 | 580 | ((uint32_t) b->data[b->byte ]) | |
1035 | 580 | (((uint32_t) b->data[b->byte+1]) << 8) | |
1036 | 580 | (((uint32_t) b->data[b->byte+2]) << 16) | |
1037 | 580 | (((uint32_t) b->data[b->byte+3]) << 24); |
1038 | | // Avoid implementation-defined behaviour on negative values |
1039 | 580 | *val = v < 0x80000000U ? (int32_t) v : -((int32_t) (0xffffffffU - v)) - 1; |
1040 | 580 | BLOCK_SIZE(b) += 4; |
1041 | 580 | return 4; |
1042 | 655 | } |
1043 | | |
1044 | | /* As int32_decoded/encode, but from/to blocks instead of cram_fd */ |
1045 | 7.93k | int int32_put_blk(cram_block *b, int32_t val) { |
1046 | 7.93k | unsigned char cp[4]; |
1047 | 7.93k | uint32_t v = val; |
1048 | 7.93k | cp[0] = ( v & 0xff); |
1049 | 7.93k | cp[1] = ((v>>8) & 0xff); |
1050 | 7.93k | cp[2] = ((v>>16) & 0xff); |
1051 | 7.93k | cp[3] = ((v>>24) & 0xff); |
1052 | | |
1053 | 7.93k | BLOCK_APPEND(b, cp, 4); |
1054 | 7.93k | return 0; |
1055 | | |
1056 | 0 | block_err: |
1057 | 0 | return -1; |
1058 | 7.93k | } |
1059 | | |
1060 | | #ifdef HAVE_LIBDEFLATE |
1061 | | /* ---------------------------------------------------------------------- |
1062 | | * libdeflate compression code, with interface to match |
1063 | | * zlib_mem_{in,de}flate for simplicity elsewhere. |
1064 | | */ |
1065 | | |
1066 | | // Named the same as the version that uses zlib as we always use libdeflate for |
1067 | | // decompression when available. |
1068 | | char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) { |
1069 | | struct libdeflate_decompressor *z = libdeflate_alloc_decompressor(); |
1070 | | if (!z) { |
1071 | | hts_log_error("Call to libdeflate_alloc_decompressor failed"); |
1072 | | return NULL; |
1073 | | } |
1074 | | |
1075 | | uint8_t *data = NULL, *new_data; |
1076 | | if (!*size) |
1077 | | *size = csize*2; |
1078 | | for(;;) { |
1079 | | new_data = realloc(data, *size); |
1080 | | if (!new_data) { |
1081 | | hts_log_error("Memory allocation failure"); |
1082 | | goto fail; |
1083 | | } |
1084 | | data = new_data; |
1085 | | |
1086 | | int ret = libdeflate_gzip_decompress(z, cdata, csize, data, *size, size); |
1087 | | |
1088 | | // Auto grow output buffer size if needed and try again. |
1089 | | // Fortunately for all bar one call of this we know the size already. |
1090 | | if (ret == LIBDEFLATE_INSUFFICIENT_SPACE) { |
1091 | | (*size) *= 1.5; |
1092 | | continue; |
1093 | | } |
1094 | | |
1095 | | if (ret != LIBDEFLATE_SUCCESS) { |
1096 | | hts_log_error("Inflate operation failed: %d", ret); |
1097 | | goto fail; |
1098 | | } else { |
1099 | | break; |
1100 | | } |
1101 | | } |
1102 | | |
1103 | | libdeflate_free_decompressor(z); |
1104 | | return (char *)data; |
1105 | | |
1106 | | fail: |
1107 | | libdeflate_free_decompressor(z); |
1108 | | free(data); |
1109 | | return NULL; |
1110 | | } |
1111 | | |
1112 | | // Named differently as we use both zlib/libdeflate for compression. |
1113 | | static char *libdeflate_deflate(char *data, size_t size, size_t *cdata_size, |
1114 | | int level, int strat) { |
1115 | | level = level > 0 ? level : 6; // libdeflate doesn't honour -1 as default |
1116 | | level *= 1.23; // NB levels go up to 12 here; 5 onwards is +1 |
1117 | | level += level>=8; // 5,6,7->6,7,8 8->10 9->12 |
1118 | | if (level > 12) level = 12; |
1119 | | |
1120 | | if (strat == Z_RLE) // not supported by libdeflate |
1121 | | level = 1; |
1122 | | |
1123 | | struct libdeflate_compressor *z = libdeflate_alloc_compressor(level); |
1124 | | if (!z) { |
1125 | | hts_log_error("Call to libdeflate_alloc_compressor failed"); |
1126 | | return NULL; |
1127 | | } |
1128 | | |
1129 | | unsigned char *cdata = NULL; /* Compressed output */ |
1130 | | size_t cdata_alloc; |
1131 | | cdata = malloc(cdata_alloc = size*1.05+100); |
1132 | | if (!cdata) { |
1133 | | hts_log_error("Memory allocation failure"); |
1134 | | libdeflate_free_compressor(z); |
1135 | | return NULL; |
1136 | | } |
1137 | | |
1138 | | *cdata_size = libdeflate_gzip_compress(z, data, size, cdata, cdata_alloc); |
1139 | | libdeflate_free_compressor(z); |
1140 | | |
1141 | | if (*cdata_size == 0) { |
1142 | | hts_log_error("Call to libdeflate_gzip_compress failed"); |
1143 | | free(cdata); |
1144 | | return NULL; |
1145 | | } |
1146 | | |
1147 | | return (char *)cdata; |
1148 | | } |
1149 | | |
1150 | | #else |
1151 | | |
1152 | | /* ---------------------------------------------------------------------- |
1153 | | * zlib compression code - from Gap5's tg_iface_g.c |
1154 | | * They're static here as they're only used within the cram_compress_block |
1155 | | * and cram_uncompress_block functions, which are the external interface. |
1156 | | */ |
1157 | 14 | char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) { |
1158 | 14 | z_stream s; |
1159 | 14 | unsigned char *data = NULL; /* Uncompressed output */ |
1160 | 14 | int data_alloc = 0; |
1161 | 14 | int err; |
1162 | | |
1163 | | /* Starting point at uncompressed size, and scale after that */ |
1164 | 14 | data = malloc(data_alloc = csize*1.2+100); |
1165 | 14 | if (!data) |
1166 | 0 | return NULL; |
1167 | | |
1168 | | /* Initialise zlib stream */ |
1169 | 14 | s.zalloc = Z_NULL; /* use default allocation functions */ |
1170 | 14 | s.zfree = Z_NULL; |
1171 | 14 | s.opaque = Z_NULL; |
1172 | 14 | s.next_in = (unsigned char *)cdata; |
1173 | 14 | s.avail_in = csize; |
1174 | 14 | s.total_in = 0; |
1175 | 14 | s.next_out = data; |
1176 | 14 | s.avail_out = data_alloc; |
1177 | 14 | s.total_out = 0; |
1178 | | |
1179 | | //err = inflateInit(&s); |
1180 | 14 | err = inflateInit2(&s, 15 + 32); |
1181 | 14 | if (err != Z_OK) { |
1182 | 0 | hts_log_error("Call to zlib inflateInit failed: %s", s.msg); |
1183 | 0 | free(data); |
1184 | 0 | return NULL; |
1185 | 0 | } |
1186 | | |
1187 | | /* Decode to 'data' array */ |
1188 | 21 | for (;s.avail_in;) { |
1189 | 12 | unsigned char *data_tmp; |
1190 | 12 | int alloc_inc; |
1191 | | |
1192 | 12 | s.next_out = &data[s.total_out]; |
1193 | 12 | err = inflate(&s, Z_NO_FLUSH); |
1194 | 12 | if (err == Z_STREAM_END) |
1195 | 0 | break; |
1196 | | |
1197 | 12 | if (err != Z_OK) { |
1198 | 5 | hts_log_error("Call to zlib inflate failed: %s", s.msg); |
1199 | 5 | free(data); |
1200 | 5 | inflateEnd(&s); |
1201 | 5 | return NULL; |
1202 | 5 | } |
1203 | | |
1204 | | /* More to come, so realloc based on growth so far */ |
1205 | 7 | alloc_inc = (double)s.avail_in/s.total_in * s.total_out + 100; |
1206 | 7 | data = realloc((data_tmp = data), data_alloc += alloc_inc); |
1207 | 7 | if (!data) { |
1208 | 0 | free(data_tmp); |
1209 | 0 | inflateEnd(&s); |
1210 | 0 | return NULL; |
1211 | 0 | } |
1212 | 7 | s.avail_out += alloc_inc; |
1213 | 7 | } |
1214 | 9 | inflateEnd(&s); |
1215 | | |
1216 | 9 | *size = s.total_out; |
1217 | 9 | return (char *)data; |
1218 | 14 | } |
1219 | | #endif |
1220 | | |
1221 | | #if !defined(HAVE_LIBDEFLATE) || LIBDEFLATE_VERSION_MAJOR < 1 || (LIBDEFLATE_VERSION_MAJOR == 1 && LIBDEFLATE_VERSION_MINOR <= 8) |
1222 | | static char *zlib_mem_deflate(char *data, size_t size, size_t *cdata_size, |
1223 | 198k | int level, int strat) { |
1224 | 198k | z_stream s; |
1225 | 198k | unsigned char *cdata = NULL; /* Compressed output */ |
1226 | 198k | int cdata_alloc = 0; |
1227 | 198k | int cdata_pos = 0; |
1228 | 198k | int err; |
1229 | | |
1230 | 198k | cdata = malloc(cdata_alloc = size*1.05+100); |
1231 | 198k | if (!cdata) |
1232 | 0 | return NULL; |
1233 | 198k | cdata_pos = 0; |
1234 | | |
1235 | | /* Initialise zlib stream */ |
1236 | 198k | s.zalloc = Z_NULL; /* use default allocation functions */ |
1237 | 198k | s.zfree = Z_NULL; |
1238 | 198k | s.opaque = Z_NULL; |
1239 | 198k | s.next_in = (unsigned char *)data; |
1240 | 198k | s.avail_in = size; |
1241 | 198k | s.total_in = 0; |
1242 | 198k | s.next_out = cdata; |
1243 | 198k | s.avail_out = cdata_alloc; |
1244 | 198k | s.total_out = 0; |
1245 | 198k | s.data_type = Z_BINARY; |
1246 | | |
1247 | 198k | err = deflateInit2(&s, level, Z_DEFLATED, 15|16, 9, strat); |
1248 | 198k | if (err != Z_OK) { |
1249 | 0 | hts_log_error("Call to zlib deflateInit2 failed: %s", s.msg); |
1250 | 0 | return NULL; |
1251 | 0 | } |
1252 | | |
1253 | | /* Encode to 'cdata' array */ |
1254 | 396k | for (;s.avail_in;) { |
1255 | 198k | s.next_out = &cdata[cdata_pos]; |
1256 | 198k | s.avail_out = cdata_alloc - cdata_pos; |
1257 | 198k | if (cdata_alloc - cdata_pos <= 0) { |
1258 | 0 | hts_log_error("Deflate produced larger output than expected"); |
1259 | 0 | return NULL; |
1260 | 0 | } |
1261 | 198k | err = deflate(&s, Z_NO_FLUSH); |
1262 | 198k | cdata_pos = cdata_alloc - s.avail_out; |
1263 | 198k | if (err != Z_OK) { |
1264 | 0 | hts_log_error("Call to zlib deflate failed: %s", s.msg); |
1265 | 0 | break; |
1266 | 0 | } |
1267 | 198k | } |
1268 | 198k | if (deflate(&s, Z_FINISH) != Z_STREAM_END) { |
1269 | 0 | hts_log_error("Call to zlib deflate failed: %s", s.msg); |
1270 | 0 | } |
1271 | 198k | *cdata_size = s.total_out; |
1272 | | |
1273 | 198k | if (deflateEnd(&s) != Z_OK) { |
1274 | 0 | hts_log_error("Call to zlib deflate failed: %s", s.msg); |
1275 | 0 | } |
1276 | 198k | return (char *)cdata; |
1277 | 198k | } |
1278 | | #endif |
1279 | | |
1280 | | #ifdef HAVE_LIBLZMA |
1281 | | /* ------------------------------------------------------------------------ */ |
1282 | | /* |
1283 | | * Data compression routines using liblzma (xz) |
1284 | | * |
1285 | | * On a test set this shrunk the main db from 136157104 bytes to 114796168, but |
1286 | | * caused tg_index to grow from 2m43.707s to 15m3.961s. Exporting as bfastq |
1287 | | * went from 18.3s to 36.3s. So decompression suffers too, but not as bad |
1288 | | * as compression times. |
1289 | | * |
1290 | | * For now we disable this functionality. If it's to be reenabled make sure you |
1291 | | * improve the mem_inflate implementation as it's just a test hack at the |
1292 | | * moment. |
1293 | | */ |
1294 | | |
1295 | | static char *lzma_mem_deflate(char *data, size_t size, size_t *cdata_size, |
1296 | 0 | int level) { |
1297 | 0 | char *out; |
1298 | 0 | size_t out_size = lzma_stream_buffer_bound(size); |
1299 | 0 | *cdata_size = 0; |
1300 | |
|
1301 | 0 | out = malloc(out_size); |
1302 | | |
1303 | | /* Single call compression */ |
1304 | 0 | if (LZMA_OK != lzma_easy_buffer_encode(level, LZMA_CHECK_CRC32, NULL, |
1305 | 0 | (uint8_t *)data, size, |
1306 | 0 | (uint8_t *)out, cdata_size, |
1307 | 0 | out_size)) |
1308 | 0 | return NULL; |
1309 | | |
1310 | 0 | return out; |
1311 | 0 | } |
1312 | | |
1313 | 16 | static char *lzma_mem_inflate(char *cdata, size_t csize, size_t *size) { |
1314 | 16 | lzma_stream strm = LZMA_STREAM_INIT; |
1315 | 16 | size_t out_size = 0, out_pos = 0; |
1316 | 16 | char *out = NULL, *new_out; |
1317 | 16 | int r; |
1318 | | |
1319 | | /* Initiate the decoder */ |
1320 | 16 | if (LZMA_OK != lzma_stream_decoder(&strm, lzma_easy_decoder_memusage(9), 0)) |
1321 | 0 | return NULL; |
1322 | | |
1323 | | /* Decode loop */ |
1324 | 16 | strm.avail_in = csize; |
1325 | 16 | strm.next_in = (uint8_t *)cdata; |
1326 | | |
1327 | 22 | for (;strm.avail_in;) { |
1328 | 14 | if (strm.avail_in > out_size - out_pos) { |
1329 | 14 | out_size += strm.avail_in * 4 + 32768; |
1330 | 14 | new_out = realloc(out, out_size); |
1331 | 14 | if (!new_out) |
1332 | 0 | goto fail; |
1333 | 14 | out = new_out; |
1334 | 14 | } |
1335 | 14 | strm.avail_out = out_size - out_pos; |
1336 | 14 | strm.next_out = (uint8_t *)&out[out_pos]; |
1337 | | |
1338 | 14 | r = lzma_code(&strm, LZMA_RUN); |
1339 | 14 | if (LZMA_OK != r && LZMA_STREAM_END != r) { |
1340 | 8 | hts_log_error("LZMA decode failure (error %d)", r); |
1341 | 8 | goto fail; |
1342 | 8 | } |
1343 | | |
1344 | 6 | out_pos = strm.total_out; |
1345 | | |
1346 | 6 | if (r == LZMA_STREAM_END) |
1347 | 0 | break; |
1348 | 6 | } |
1349 | | |
1350 | | /* finish up any unflushed data; necessary? */ |
1351 | 8 | r = lzma_code(&strm, LZMA_FINISH); |
1352 | 8 | if (r != LZMA_OK && r != LZMA_STREAM_END) { |
1353 | 0 | hts_log_error("Call to lzma_code failed with error %d", r); |
1354 | 0 | goto fail; |
1355 | 0 | } |
1356 | | |
1357 | 8 | new_out = realloc(out, strm.total_out > 0 ? strm.total_out : 1); |
1358 | 8 | if (new_out) |
1359 | 8 | out = new_out; |
1360 | 8 | *size = strm.total_out; |
1361 | | |
1362 | 8 | lzma_end(&strm); |
1363 | | |
1364 | 8 | return out; |
1365 | | |
1366 | 8 | fail: |
1367 | 8 | lzma_end(&strm); |
1368 | 8 | free(out); |
1369 | 8 | return NULL; |
1370 | 8 | } |
1371 | | #endif |
1372 | | |
1373 | | /* ---------------------------------------------------------------------- |
1374 | | * CRAM blocks - the dynamically growable data block. We have code to |
1375 | | * create, update, (un)compress and read/write. |
1376 | | * |
1377 | | * These are derived from the deflate_interlaced.c blocks, but with the |
1378 | | * CRAM extension of content types and IDs. |
1379 | | */ |
1380 | | |
1381 | | /* |
1382 | | * Allocates a new cram_block structure with a specified content_type and |
1383 | | * id. |
1384 | | * |
1385 | | * Returns block pointer on success |
1386 | | * NULL on failure |
1387 | | */ |
1388 | | cram_block *cram_new_block(enum cram_content_type content_type, |
1389 | 904k | int content_id) { |
1390 | 904k | cram_block *b = malloc(sizeof(*b)); |
1391 | 904k | if (!b) |
1392 | 0 | return NULL; |
1393 | 904k | b->method = b->orig_method = RAW; |
1394 | 904k | b->content_type = content_type; |
1395 | 904k | b->content_id = content_id; |
1396 | 904k | b->comp_size = 0; |
1397 | 904k | b->uncomp_size = 0; |
1398 | 904k | b->data = NULL; |
1399 | 904k | b->alloc = 0; |
1400 | 904k | b->byte = 0; |
1401 | 904k | b->bit = 7; // MSB |
1402 | 904k | b->crc32 = 0; |
1403 | 904k | b->idx = 0; |
1404 | 904k | b->m = NULL; |
1405 | | |
1406 | 904k | return b; |
1407 | 904k | } |
1408 | | |
1409 | | /* |
1410 | | * Reads a block from a cram file. |
1411 | | * Returns cram_block pointer on success. |
1412 | | * NULL on failure |
1413 | | */ |
1414 | 13.4k | cram_block *cram_read_block(cram_fd *fd) { |
1415 | 13.4k | cram_block *b = malloc(sizeof(*b)); |
1416 | 13.4k | unsigned char c; |
1417 | 13.4k | uint32_t crc = 0; |
1418 | 13.4k | if (!b) |
1419 | 0 | return NULL; |
1420 | | |
1421 | | //fprintf(stderr, "Block at %d\n", (int)ftell(fd->fp)); |
1422 | | |
1423 | 13.4k | if (-1 == (b->method = hgetc(fd->fp))) { free(b); return NULL; } |
1424 | 13.3k | c = b->method; crc = crc32(crc, &c, 1); |
1425 | 13.3k | if (-1 == (b->content_type= hgetc(fd->fp))) { free(b); return NULL; } |
1426 | 13.3k | c = b->content_type; crc = crc32(crc, &c, 1); |
1427 | 13.3k | if (-1 == fd->vv.varint_decode32_crc(fd, &b->content_id, &crc)) { free(b); return NULL; } |
1428 | 13.3k | if (-1 == fd->vv.varint_decode32_crc(fd, &b->comp_size, &crc)) { free(b); return NULL; } |
1429 | 13.3k | if (-1 == fd->vv.varint_decode32_crc(fd, &b->uncomp_size, &crc)) { free(b); return NULL; } |
1430 | | |
1431 | | //fprintf(stderr, " method %d, ctype %d, cid %d, csize %d, ucsize %d\n", |
1432 | | // b->method, b->content_type, b->content_id, b->comp_size, b->uncomp_size); |
1433 | | |
1434 | 13.2k | if (b->method == RAW) { |
1435 | 5.60k | if (b->uncomp_size < 0 || b->comp_size != b->uncomp_size) { |
1436 | 75 | free(b); |
1437 | 75 | return NULL; |
1438 | 75 | } |
1439 | 5.53k | b->alloc = b->uncomp_size; |
1440 | 5.53k | if (!(b->data = malloc(b->uncomp_size))){ free(b); return NULL; } |
1441 | 5.53k | if (b->uncomp_size != hread(fd->fp, b->data, b->uncomp_size)) { |
1442 | 27 | free(b->data); |
1443 | 27 | free(b); |
1444 | 27 | return NULL; |
1445 | 27 | } |
1446 | 7.69k | } else { |
1447 | 7.69k | if (b->comp_size < 0 || b->uncomp_size < 0) { |
1448 | 64 | free(b); |
1449 | 64 | return NULL; |
1450 | 64 | } |
1451 | 7.62k | b->alloc = b->comp_size; |
1452 | 7.62k | if (!(b->data = malloc(b->comp_size))) { free(b); return NULL; } |
1453 | 7.62k | if (b->comp_size != hread(fd->fp, b->data, b->comp_size)) { |
1454 | 55 | free(b->data); |
1455 | 55 | free(b); |
1456 | 55 | return NULL; |
1457 | 55 | } |
1458 | 7.62k | } |
1459 | | |
1460 | 13.0k | if (CRAM_MAJOR_VERS(fd->version) >= 3) { |
1461 | 3.08k | if (-1 == int32_decode(fd, (int32_t *)&b->crc32)) { |
1462 | 8 | free(b->data); |
1463 | 8 | free(b); |
1464 | 8 | return NULL; |
1465 | 8 | } |
1466 | | |
1467 | 3.07k | b->crc32_checked = fd->ignore_md5; |
1468 | 3.07k | b->crc_part = crc; |
1469 | 9.99k | } else { |
1470 | 9.99k | b->crc32_checked = 1; // CRC not present |
1471 | 9.99k | } |
1472 | | |
1473 | 13.0k | b->orig_method = b->method; |
1474 | 13.0k | b->idx = 0; |
1475 | 13.0k | b->byte = 0; |
1476 | 13.0k | b->bit = 7; // MSB |
1477 | | |
1478 | 13.0k | return b; |
1479 | 13.0k | } |
1480 | | |
1481 | | |
1482 | | /* |
1483 | | * Computes the size of a cram block, including the block |
1484 | | * header itself. |
1485 | | */ |
1486 | 0 | uint32_t cram_block_size(cram_block *b) { |
1487 | 0 | unsigned char dat[100], *cp = dat;; |
1488 | 0 | uint32_t sz; |
1489 | |
|
1490 | 0 | *cp++ = b->method; |
1491 | 0 | *cp++ = b->content_type; |
1492 | 0 | cp += itf8_put((char*)cp, b->content_id); |
1493 | 0 | cp += itf8_put((char*)cp, b->comp_size); |
1494 | 0 | cp += itf8_put((char*)cp, b->uncomp_size); |
1495 | |
|
1496 | 0 | sz = cp-dat + 4; |
1497 | 0 | sz += b->method == RAW ? b->uncomp_size : b->comp_size; |
1498 | |
|
1499 | 0 | return sz; |
1500 | 0 | } |
1501 | | |
1502 | | /* |
1503 | | * Writes a CRAM block. |
1504 | | * Returns 0 on success |
1505 | | * -1 on failure |
1506 | | */ |
1507 | 286k | int cram_write_block(cram_fd *fd, cram_block *b) { |
1508 | 286k | char vardata[100]; |
1509 | 286k | int vardata_o = 0; |
1510 | | |
1511 | 286k | assert(b->method != RAW || (b->comp_size == b->uncomp_size)); |
1512 | | |
1513 | 286k | if (hputc(b->method, fd->fp) == EOF) return -1; |
1514 | 286k | if (hputc(b->content_type, fd->fp) == EOF) return -1; |
1515 | 286k | vardata_o += fd->vv.varint_put32(vardata , vardata+100, b->content_id); |
1516 | 286k | vardata_o += fd->vv.varint_put32(vardata+vardata_o, vardata+100, b->comp_size); |
1517 | 286k | vardata_o += fd->vv.varint_put32(vardata+vardata_o, vardata+100, b->uncomp_size); |
1518 | 286k | if (vardata_o != hwrite(fd->fp, vardata, vardata_o)) |
1519 | 0 | return -1; |
1520 | | |
1521 | 286k | if (b->data) { |
1522 | 251k | if (b->method == RAW) { |
1523 | 219k | if (b->uncomp_size != hwrite(fd->fp, b->data, b->uncomp_size)) |
1524 | 0 | return -1; |
1525 | 219k | } else { |
1526 | 31.2k | if (b->comp_size != hwrite(fd->fp, b->data, b->comp_size)) |
1527 | 0 | return -1; |
1528 | 31.2k | } |
1529 | 251k | } else { |
1530 | | // Absent blocks should be size 0 |
1531 | 35.2k | assert(b->method == RAW && b->uncomp_size == 0); |
1532 | 35.2k | } |
1533 | | |
1534 | 286k | if (CRAM_MAJOR_VERS(fd->version) >= 3) { |
1535 | 286k | char dat[100], *cp = (char *)dat; |
1536 | 286k | uint32_t crc; |
1537 | | |
1538 | 286k | *cp++ = b->method; |
1539 | 286k | *cp++ = b->content_type; |
1540 | 286k | cp += fd->vv.varint_put32(cp, dat+100, b->content_id); |
1541 | 286k | cp += fd->vv.varint_put32(cp, dat+100, b->comp_size); |
1542 | 286k | cp += fd->vv.varint_put32(cp, dat+100, b->uncomp_size); |
1543 | 286k | crc = crc32(0L, (uc *)dat, cp-dat); |
1544 | | |
1545 | 286k | if (b->method == RAW) { |
1546 | 255k | b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->uncomp_size); |
1547 | 255k | } else { |
1548 | 31.2k | b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->comp_size); |
1549 | 31.2k | } |
1550 | | |
1551 | 286k | if (-1 == int32_encode(fd, b->crc32)) |
1552 | 0 | return -1; |
1553 | 286k | } |
1554 | | |
1555 | 286k | return 0; |
1556 | 286k | } |
1557 | | |
1558 | | /* |
1559 | | * Frees a CRAM block, deallocating internal data too. |
1560 | | */ |
1561 | 1.19M | void cram_free_block(cram_block *b) { |
1562 | 1.19M | if (!b) |
1563 | 282k | return; |
1564 | 917k | if (b->data) |
1565 | 583k | free(b->data); |
1566 | 917k | free(b); |
1567 | 917k | } |
1568 | | |
1569 | | /* |
1570 | | * Uncompresses a CRAM block, if compressed. |
1571 | | */ |
1572 | 8.31k | int cram_uncompress_block(cram_block *b) { |
1573 | 8.31k | char *uncomp; |
1574 | 8.31k | size_t uncomp_size = 0; |
1575 | | |
1576 | 8.31k | #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
1577 | | // Pretend the CRC was OK so the fuzzer doesn't have to get it right |
1578 | 8.31k | b->crc32_checked = 1; |
1579 | 8.31k | #endif |
1580 | | |
1581 | 8.31k | if (b->crc32_checked == 0) { |
1582 | 0 | uint32_t crc = crc32(b->crc_part, b->data ? b->data : (uc *)"", b->alloc); |
1583 | 0 | b->crc32_checked = 1; |
1584 | 0 | if (crc != b->crc32) { |
1585 | 0 | hts_log_error("Block CRC32 failure"); |
1586 | 0 | return -1; |
1587 | 0 | } |
1588 | 0 | } |
1589 | | |
1590 | 8.31k | if (b->uncomp_size == 0) { |
1591 | | // blank block |
1592 | 3.08k | b->method = RAW; |
1593 | 3.08k | return 0; |
1594 | 3.08k | } |
1595 | 5.23k | assert(b->uncomp_size >= 0); // cram_read_block should ensure this |
1596 | | |
1597 | 5.23k | switch (b->method) { |
1598 | 457 | case RAW: |
1599 | 457 | return 0; |
1600 | | |
1601 | 14 | case GZIP: |
1602 | 14 | uncomp_size = b->uncomp_size; |
1603 | 14 | uncomp = zlib_mem_inflate((char *)b->data, b->comp_size, &uncomp_size); |
1604 | | |
1605 | 14 | if (!uncomp) |
1606 | 5 | return -1; |
1607 | 9 | if (uncomp_size != b->uncomp_size) { |
1608 | 9 | free(uncomp); |
1609 | 9 | return -1; |
1610 | 9 | } |
1611 | 0 | free(b->data); |
1612 | 0 | b->data = (unsigned char *)uncomp; |
1613 | 0 | b->alloc = uncomp_size; |
1614 | 0 | b->method = RAW; |
1615 | 0 | break; |
1616 | | |
1617 | 0 | #ifdef HAVE_LIBBZ2 |
1618 | 8 | case BZIP2: { |
1619 | 8 | unsigned int usize = b->uncomp_size; |
1620 | 8 | if (!(uncomp = malloc(usize))) |
1621 | 0 | return -1; |
1622 | 8 | if (BZ_OK != BZ2_bzBuffToBuffDecompress(uncomp, &usize, |
1623 | 8 | (char *)b->data, b->comp_size, |
1624 | 8 | 0, 0)) { |
1625 | 8 | free(uncomp); |
1626 | 8 | return -1; |
1627 | 8 | } |
1628 | 0 | free(b->data); |
1629 | 0 | b->data = (unsigned char *)uncomp; |
1630 | 0 | b->alloc = usize; |
1631 | 0 | b->method = RAW; |
1632 | 0 | b->uncomp_size = usize; // Just in case it differs |
1633 | 0 | break; |
1634 | 8 | } |
1635 | | #else |
1636 | | case BZIP2: |
1637 | | hts_log_error("Bzip2 compression is not compiled into this version. Please rebuild and try again"); |
1638 | | return -1; |
1639 | | #endif |
1640 | | |
1641 | 0 | #ifdef HAVE_LIBLZMA |
1642 | 16 | case LZMA: |
1643 | 16 | uncomp = lzma_mem_inflate((char *)b->data, b->comp_size, &uncomp_size); |
1644 | 16 | if (!uncomp) |
1645 | 8 | return -1; |
1646 | 8 | if (uncomp_size != b->uncomp_size) { |
1647 | 8 | free(uncomp); |
1648 | 8 | return -1; |
1649 | 8 | } |
1650 | 0 | free(b->data); |
1651 | 0 | b->data = (unsigned char *)uncomp; |
1652 | 0 | b->alloc = uncomp_size; |
1653 | 0 | b->method = RAW; |
1654 | 0 | break; |
1655 | | #else |
1656 | | case LZMA: |
1657 | | hts_log_error("Lzma compression is not compiled into this version. Please rebuild and try again"); |
1658 | | return -1; |
1659 | | break; |
1660 | | #endif |
1661 | | |
1662 | 427 | case RANS: { |
1663 | 427 | unsigned int usize = b->uncomp_size, usize2; |
1664 | 427 | uncomp = (char *)rans_uncompress(b->data, b->comp_size, &usize2); |
1665 | 427 | if (!uncomp) |
1666 | 182 | return -1; |
1667 | 245 | if (usize != usize2) { |
1668 | 244 | free(uncomp); |
1669 | 244 | return -1; |
1670 | 244 | } |
1671 | 1 | free(b->data); |
1672 | 1 | b->data = (unsigned char *)uncomp; |
1673 | 1 | b->alloc = usize2; |
1674 | 1 | b->method = RAW; |
1675 | 1 | b->uncomp_size = usize2; // Just in case it differs |
1676 | | //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size); |
1677 | 1 | break; |
1678 | 245 | } |
1679 | | |
1680 | 668 | case FQZ: { |
1681 | 668 | uncomp_size = b->uncomp_size; |
1682 | 668 | uncomp = fqz_decompress((char *)b->data, b->comp_size, &uncomp_size, NULL, 0); |
1683 | 668 | if (!uncomp) |
1684 | 488 | return -1; |
1685 | 180 | free(b->data); |
1686 | 180 | b->data = (unsigned char *)uncomp; |
1687 | 180 | b->alloc = uncomp_size; |
1688 | 180 | b->method = RAW; |
1689 | 180 | b->uncomp_size = uncomp_size; |
1690 | 180 | break; |
1691 | 668 | } |
1692 | | |
1693 | 1.40k | case RANS_PR0: { |
1694 | 1.40k | unsigned int usize = b->uncomp_size, usize2; |
1695 | 1.40k | uncomp = (char *)rans_uncompress_4x16(b->data, b->comp_size, &usize2); |
1696 | 1.40k | if (!uncomp) |
1697 | 1.08k | return -1; |
1698 | 314 | if (usize != usize2) { |
1699 | 253 | free(uncomp); |
1700 | 253 | return -1; |
1701 | 253 | } |
1702 | 61 | b->orig_method = RANSPR; |
1703 | 61 | free(b->data); |
1704 | 61 | b->data = (unsigned char *)uncomp; |
1705 | 61 | b->alloc = usize2; |
1706 | 61 | b->method = RAW; |
1707 | 61 | b->uncomp_size = usize2; // Just incase it differs |
1708 | | //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size); |
1709 | 61 | break; |
1710 | 314 | } |
1711 | | |
1712 | 720 | case ARITH_PR0: { |
1713 | 720 | unsigned int usize = b->uncomp_size, usize2; |
1714 | 720 | uncomp = (char *)arith_uncompress_to(b->data, b->comp_size, NULL, &usize2); |
1715 | 720 | if (!uncomp) |
1716 | 480 | return -1; |
1717 | 240 | if (usize != usize2) { |
1718 | 91 | free(uncomp); |
1719 | 91 | return -1; |
1720 | 91 | } |
1721 | 149 | b->orig_method = ARITH; |
1722 | 149 | free(b->data); |
1723 | 149 | b->data = (unsigned char *)uncomp; |
1724 | 149 | b->alloc = usize2; |
1725 | 149 | b->method = RAW; |
1726 | 149 | b->uncomp_size = usize2; // Just incase it differs |
1727 | | //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size); |
1728 | 149 | break; |
1729 | 240 | } |
1730 | | |
1731 | 1.49k | case TOK3: { |
1732 | 1.49k | uint32_t out_len; |
1733 | 1.49k | uint8_t *cp = tok3_decode_names(b->data, b->comp_size, &out_len); |
1734 | 1.49k | if (!cp) |
1735 | 1.41k | return -1; |
1736 | 81 | b->orig_method = TOK3; |
1737 | 81 | b->method = RAW; |
1738 | 81 | free(b->data); |
1739 | 81 | b->data = cp; |
1740 | 81 | b->alloc = out_len; |
1741 | 81 | b->uncomp_size = out_len; |
1742 | 81 | break; |
1743 | 1.49k | } |
1744 | | |
1745 | 18 | default: |
1746 | 18 | return -1; |
1747 | 5.23k | } |
1748 | | |
1749 | 472 | return 0; |
1750 | 5.23k | } |
1751 | | |
1752 | | static char *cram_compress_by_method(cram_slice *s, char *in, size_t in_size, |
1753 | | int content_id, size_t *out_size, |
1754 | | enum cram_block_method_int method, |
1755 | 479k | int level, int strat) { |
1756 | 479k | switch (method) { |
1757 | 62.5k | case GZIP: |
1758 | 93.9k | case GZIP_RLE: |
1759 | 198k | case GZIP_1: |
1760 | | // Read names bizarrely benefit from zlib over libdeflate for |
1761 | | // mid-range compression levels. Focusing purely of ratio or |
1762 | | // speed, libdeflate still wins. It also seems to win for |
1763 | | // other data series too. |
1764 | | // |
1765 | | // Eg RN at level 5; libdeflate=55.9MB zlib=51.6MB |
1766 | | #ifdef HAVE_LIBDEFLATE |
1767 | | # if (LIBDEFLATE_VERSION_MAJOR < 1 || (LIBDEFLATE_VERSION_MAJOR == 1 && LIBDEFLATE_VERSION_MINOR <= 8)) |
1768 | | if (content_id == DS_RN && level >= 4 && level <= 7) |
1769 | | return zlib_mem_deflate(in, in_size, out_size, level, strat); |
1770 | | else |
1771 | | # endif |
1772 | | return libdeflate_deflate(in, in_size, out_size, level, strat); |
1773 | | #else |
1774 | 198k | return zlib_mem_deflate(in, in_size, out_size, level, strat); |
1775 | 0 | #endif |
1776 | | |
1777 | 0 | case BZIP2: { |
1778 | 0 | #ifdef HAVE_LIBBZ2 |
1779 | 0 | unsigned int comp_size = in_size*1.01 + 600; |
1780 | 0 | char *comp = malloc(comp_size); |
1781 | 0 | if (!comp) |
1782 | 0 | return NULL; |
1783 | | |
1784 | 0 | if (BZ_OK != BZ2_bzBuffToBuffCompress(comp, &comp_size, |
1785 | 0 | in, in_size, |
1786 | 0 | level, 0, 30)) { |
1787 | 0 | free(comp); |
1788 | 0 | return NULL; |
1789 | 0 | } |
1790 | 0 | *out_size = comp_size; |
1791 | 0 | return comp; |
1792 | | #else |
1793 | | return NULL; |
1794 | | #endif |
1795 | 0 | } |
1796 | | |
1797 | 0 | case FQZ: |
1798 | 0 | case FQZ_b: |
1799 | 0 | case FQZ_c: |
1800 | 0 | case FQZ_d: { |
1801 | | // Extract the necessary portion of the slice into an fqz_slice struct. |
1802 | | // These previously were the same thing, but this permits us to detach |
1803 | | // the codec from the rest of this CRAM implementation. |
1804 | 0 | fqz_slice *f = malloc(2*s->hdr->num_records * sizeof(uint32_t) + sizeof(fqz_slice)); |
1805 | 0 | if (!f) |
1806 | 0 | return NULL; |
1807 | 0 | f->num_records = s->hdr->num_records; |
1808 | 0 | f->len = (uint32_t *)(((char *)f) + sizeof(fqz_slice)); |
1809 | 0 | f->flags = f->len + s->hdr->num_records; |
1810 | 0 | int i; |
1811 | 0 | for (i = 0; i < s->hdr->num_records; i++) { |
1812 | 0 | f->flags[i] = s->crecs[i].flags; |
1813 | 0 | f->len[i] = (i+1 < s->hdr->num_records |
1814 | 0 | ? s->crecs[i+1].qual - s->crecs[i].qual |
1815 | 0 | : s->block[DS_QS]->uncomp_size - s->crecs[i].qual); |
1816 | 0 | } |
1817 | 0 | char *comp = fqz_compress(strat & 0xff /* cram vers */, f, |
1818 | 0 | in, in_size, out_size, strat >> 8, NULL); |
1819 | 0 | free(f); |
1820 | 0 | return comp; |
1821 | 0 | } |
1822 | | |
1823 | 0 | case LZMA: |
1824 | 0 | #ifdef HAVE_LIBLZMA |
1825 | 0 | return lzma_mem_deflate(in, in_size, out_size, level); |
1826 | | #else |
1827 | | return NULL; |
1828 | | #endif |
1829 | | |
1830 | 0 | case RANS0: |
1831 | 0 | case RANS1: { |
1832 | 0 | unsigned int out_size_i; |
1833 | 0 | unsigned char *cp; |
1834 | 0 | cp = rans_compress((unsigned char *)in, in_size, &out_size_i, |
1835 | 0 | method == RANS0 ? 0 : 1); |
1836 | 0 | *out_size = out_size_i; |
1837 | 0 | return (char *)cp; |
1838 | 0 | } |
1839 | | |
1840 | 86.0k | case RANS_PR0: |
1841 | 117k | case RANS_PR1: |
1842 | 161k | case RANS_PR64: |
1843 | 192k | case RANS_PR9: |
1844 | 234k | case RANS_PR128: |
1845 | 234k | case RANS_PR129: |
1846 | 234k | case RANS_PR192: |
1847 | 265k | case RANS_PR193: { |
1848 | 265k | unsigned int out_size_i; |
1849 | 265k | unsigned char *cp; |
1850 | | |
1851 | | // see enum cram_block. We map RANS_* methods to order bit-fields |
1852 | 265k | static int methmap[] = { 1, 64,9, 128,129, 192,193 }; |
1853 | | |
1854 | 265k | int m = method == RANS_PR0 ? 0 : methmap[method - RANS_PR1]; |
1855 | 265k | cp = rans_compress_4x16((unsigned char *)in, in_size, &out_size_i, |
1856 | 265k | m | RANS_ORDER_SIMD_AUTO); |
1857 | 265k | *out_size = out_size_i; |
1858 | 265k | return (char *)cp; |
1859 | 234k | } |
1860 | | |
1861 | 0 | case ARITH_PR0: |
1862 | 0 | case ARITH_PR1: |
1863 | 0 | case ARITH_PR64: |
1864 | 0 | case ARITH_PR9: |
1865 | 0 | case ARITH_PR128: |
1866 | 0 | case ARITH_PR129: |
1867 | 0 | case ARITH_PR192: |
1868 | 0 | case ARITH_PR193: { |
1869 | 0 | unsigned int out_size_i; |
1870 | 0 | unsigned char *cp; |
1871 | | |
1872 | | // see enum cram_block. We map ARITH_* methods to order bit-fields |
1873 | 0 | static int methmap[] = { 1, 64,9, 128,129, 192,193 }; |
1874 | |
|
1875 | 0 | cp = arith_compress_to((unsigned char *)in, in_size, NULL, &out_size_i, |
1876 | 0 | method == ARITH_PR0 ? 0 : methmap[method - ARITH_PR1]); |
1877 | 0 | *out_size = out_size_i; |
1878 | 0 | return (char *)cp; |
1879 | 0 | } |
1880 | | |
1881 | 15.4k | case TOK3: |
1882 | 15.4k | case TOKA: { |
1883 | 15.4k | int out_len; |
1884 | 15.4k | int lev = level; |
1885 | 15.4k | if (method == TOK3 && lev > 3) |
1886 | 15.4k | lev = 3; |
1887 | 15.4k | uint8_t *cp = tok3_encode_names(in, in_size, lev, strat, &out_len, NULL); |
1888 | 15.4k | *out_size = out_len; |
1889 | 15.4k | return (char *)cp; |
1890 | 15.4k | } |
1891 | | |
1892 | 0 | case RAW: |
1893 | 0 | break; |
1894 | | |
1895 | 0 | default: |
1896 | 0 | return NULL; |
1897 | 479k | } |
1898 | | |
1899 | 0 | return NULL; |
1900 | 479k | } |
1901 | | |
1902 | | /* |
1903 | | * A copy of cram_compress_block2 with added recursion detection. |
1904 | | * This is only called for error handling where the auto-tuning has failed. |
1905 | | * The simplest way of doing this is recusion + an additional argument, but |
1906 | | * we didn't want to complicate the existing code hence this is static. |
1907 | | */ |
1908 | | static int cram_compress_block3(cram_fd *fd, cram_slice *s, |
1909 | | cram_block *b, cram_metrics *metrics, |
1910 | | int method, int level, |
1911 | 513k | int recurse) { |
1912 | | |
1913 | 513k | if (!b) |
1914 | 34.6k | return 0; |
1915 | | |
1916 | 478k | int orig_method = method; |
1917 | 478k | char *comp = NULL; |
1918 | 478k | size_t comp_size = 0; |
1919 | 478k | int strat; |
1920 | | |
1921 | | // Internally we have parameterised methods that externally map |
1922 | | // to the same CRAM method value. |
1923 | | // See enum_cram_block_method_int in cram_structs.h. |
1924 | 478k | int methmap[] = { |
1925 | | // Externally defined values |
1926 | 478k | RAW, GZIP, BZIP2, LZMA, RANS, RANSPR, ARITH, FQZ, TOK3, |
1927 | | |
1928 | | // Reserved for possible expansion |
1929 | 478k | 0, 0, |
1930 | | |
1931 | | // Internally parameterised versions matching back to above |
1932 | | // external values |
1933 | 478k | GZIP, GZIP, |
1934 | 478k | FQZ, FQZ, FQZ, |
1935 | 478k | RANS, |
1936 | 478k | RANSPR, RANSPR, RANSPR, RANSPR, RANSPR, RANSPR, RANSPR, |
1937 | 478k | TOK3, |
1938 | 478k | ARITH, ARITH, ARITH, ARITH, ARITH, ARITH, ARITH, |
1939 | 478k | }; |
1940 | | |
1941 | 478k | if (b->method != RAW) { |
1942 | | // Maybe already compressed if s->block[0] was compressed and |
1943 | | // we have e.g. s->block[DS_BA] set to s->block[0] due to only |
1944 | | // one base type present and hence using E_HUFFMAN on block 0. |
1945 | | // A second explicit attempt to compress the same block then |
1946 | | // occurs. |
1947 | 0 | return 0; |
1948 | 0 | } |
1949 | | |
1950 | 478k | if (method == -1) { |
1951 | 7.93k | method = 1<<GZIP; |
1952 | 7.93k | if (fd->use_bz2) |
1953 | 0 | method |= 1<<BZIP2; |
1954 | 7.93k | if (fd->use_lzma) |
1955 | 0 | method |= 1<<LZMA; |
1956 | 7.93k | } |
1957 | | |
1958 | 478k | if (level == -1) |
1959 | 7.93k | level = fd->level; |
1960 | | |
1961 | | //fprintf(stderr, "IN: block %d, sz %d\n", b->content_id, b->uncomp_size); |
1962 | | |
1963 | 478k | if (method == RAW || level == 0 || b->uncomp_size == 0) { |
1964 | 279k | b->method = RAW; |
1965 | 279k | b->comp_size = b->uncomp_size; |
1966 | | //fprintf(stderr, "Skip block id %d\n", b->content_id); |
1967 | 279k | return 0; |
1968 | 279k | } |
1969 | | |
1970 | 198k | #ifndef ABS |
1971 | 198k | # define ABS(a) ((a)>=0?(a):-(a)) |
1972 | 198k | #endif |
1973 | | |
1974 | 198k | if (metrics) { |
1975 | 183k | pthread_mutex_lock(&fd->metrics_lock); |
1976 | | // Sudden changes in size trigger a retrial. These are mainly |
1977 | | // triggered when switching to sorted / unsorted, where the number |
1978 | | // of elements in a slice radically changes. |
1979 | | // |
1980 | | // We also get large fluctuations based on genome coordinate for |
1981 | | // e.g. SA:Z and SC series, but we consider the typical scale of |
1982 | | // delta between blocks and use this to look for abnormality. |
1983 | | |
1984 | | // Equivalent to (but minus possible integer overflow) |
1985 | | // (b->uncomp_size + 1000)/4 > metrics->input_avg_sz+1000 || |
1986 | | // b->uncomp_size + 1000 < (metrics->input_avg_sz+1000)/4) |
1987 | 183k | if (metrics->input_avg_sz && |
1988 | 183k | (b->uncomp_size/4 - 750 > metrics->input_avg_sz || |
1989 | 38.1k | b->uncomp_size < metrics->input_avg_sz/4 - 750) && |
1990 | 183k | ABS(b->uncomp_size-metrics->input_avg_sz)/10 |
1991 | 898 | > metrics->input_avg_delta) { |
1992 | 28 | metrics->next_trial = 0; |
1993 | 28 | } |
1994 | | |
1995 | 183k | if (metrics->trial > 0 || --metrics->next_trial <= 0) { |
1996 | 47.0k | int m, unpackable = metrics->unpackable; |
1997 | 47.0k | size_t sz_best = b->uncomp_size; |
1998 | 47.0k | size_t sz[CRAM_MAX_METHOD] = {0}; |
1999 | 47.0k | int method_best = 0; // RAW |
2000 | 47.0k | char *c_best = NULL, *c = NULL; |
2001 | | |
2002 | 47.0k | metrics->input_avg_delta = |
2003 | 47.0k | 0.9 * (metrics->input_avg_delta + |
2004 | 47.0k | ABS(b->uncomp_size - metrics->input_avg_sz)); |
2005 | | |
2006 | 47.0k | metrics->input_avg_sz += b->uncomp_size*.2; |
2007 | 47.0k | metrics->input_avg_sz *= 0.8; |
2008 | | |
2009 | 47.0k | if (metrics->revised_method) |
2010 | 20.3k | method = metrics->revised_method; |
2011 | 26.6k | else |
2012 | 26.6k | metrics->revised_method = method; |
2013 | | |
2014 | 47.0k | if (metrics->next_trial <= 0) { |
2015 | 1.06k | metrics->next_trial = TRIAL_SPAN; |
2016 | 1.06k | metrics->trial = NTRIALS; |
2017 | 35.1k | for (m = 0; m < CRAM_MAX_METHOD; m++) |
2018 | 34.1k | metrics->sz[m] /= 2; |
2019 | 1.06k | metrics->unpackable = 0; |
2020 | 1.06k | } |
2021 | | |
2022 | | // Compress this block using the best method |
2023 | 47.0k | if (unpackable && CRAM_MAJOR_VERS(fd->version) > 3) { |
2024 | | // No point trying bit-pack if 17+ symbols. |
2025 | 0 | if (method & (1<<RANS_PR128)) |
2026 | 0 | method = (method|(1<<RANS_PR0))&~(1<<RANS_PR128); |
2027 | 0 | if (method & (1<<RANS_PR129)) |
2028 | 0 | method = (method|(1<<RANS_PR1))&~(1<<RANS_PR129); |
2029 | 0 | if (method & (1<<RANS_PR192)) |
2030 | 0 | method = (method|(1<<RANS_PR64))&~(1<<RANS_PR192); |
2031 | 0 | if (method & (1<<RANS_PR193)) |
2032 | 0 | method = (method|(1<<RANS_PR64)|(1<<RANS_PR1))&~(1<<RANS_PR193); |
2033 | |
|
2034 | 0 | if (method & (1<<ARITH_PR128)) |
2035 | 0 | method = (method|(1<<ARITH_PR0))&~(1<<ARITH_PR128); |
2036 | 0 | if (method & (1<<ARITH_PR129)) |
2037 | 0 | method = (method|(1<<ARITH_PR1))&~(1<<ARITH_PR129); |
2038 | 0 | if (method & (1<<ARITH_PR192)) |
2039 | 0 | method = (method|(1<<ARITH_PR64))&~(1<<ARITH_PR192); |
2040 | 0 | if (method & (1u<<ARITH_PR193)) |
2041 | 0 | method = (method|(1<<ARITH_PR64)|(1<<ARITH_PR1))&~(1u<<ARITH_PR193); |
2042 | 0 | } |
2043 | | |
2044 | | // Libdeflate doesn't have a Z_RLE strategy. |
2045 | | // We treat it as level 1, but iff we haven't also |
2046 | | // explicitly listed that in the method list. |
2047 | | #ifdef HAVE_LIBDEFLATE |
2048 | | if ((method & (1<<GZIP_RLE)) && (method & (1<<GZIP_1))) |
2049 | | method &= ~(1<<GZIP_RLE); |
2050 | | #endif |
2051 | | |
2052 | 47.0k | pthread_mutex_unlock(&fd->metrics_lock); |
2053 | | |
2054 | 1.55M | for (m = 0; m < CRAM_MAX_METHOD; m++) { |
2055 | 1.50M | if (method & (1u<<m)) { |
2056 | 327k | int lvl = level; |
2057 | 327k | switch (m) { |
2058 | 46.5k | case GZIP: strat = Z_FILTERED; break; |
2059 | 46.9k | case GZIP_1: strat = Z_DEFAULT_STRATEGY; lvl = 1; break; |
2060 | 31.2k | case GZIP_RLE: strat = Z_RLE; break; |
2061 | 0 | case FQZ: strat = CRAM_MAJOR_VERS(fd->version); break; |
2062 | 0 | case FQZ_b: strat = CRAM_MAJOR_VERS(fd->version)+256; break; |
2063 | 0 | case FQZ_c: strat = CRAM_MAJOR_VERS(fd->version)+2*256; break; |
2064 | 0 | case FQZ_d: strat = CRAM_MAJOR_VERS(fd->version)+3*256; break; |
2065 | 15.4k | case TOK3: strat = 0; break; |
2066 | 0 | case TOKA: strat = 1; break; |
2067 | 187k | default: strat = 0; |
2068 | 327k | } |
2069 | | |
2070 | 327k | c = cram_compress_by_method(s, (char *)b->data, b->uncomp_size, |
2071 | 327k | b->content_id, &sz[m], m, lvl, strat); |
2072 | | |
2073 | 327k | if (c && sz_best > sz[m]) { |
2074 | 21.3k | sz_best = sz[m]; |
2075 | 21.3k | method_best = m; |
2076 | 21.3k | if (c_best) |
2077 | 11.1k | free(c_best); |
2078 | 21.3k | c_best = c; |
2079 | 306k | } else if (c) { |
2080 | 305k | free(c); |
2081 | 305k | } else { |
2082 | 1.27k | sz[m] = UINT_MAX; // arbitrarily worse than raw |
2083 | 1.27k | } |
2084 | 1.17M | } else { |
2085 | 1.17M | sz[m] = UINT_MAX; // arbitrarily worse than raw |
2086 | 1.17M | } |
2087 | 1.50M | } |
2088 | | |
2089 | 47.0k | if (c_best) { |
2090 | 10.1k | free(b->data); |
2091 | 10.1k | b->data = (unsigned char *)c_best; |
2092 | 10.1k | b->method = method_best; // adjusted to methmap[method_best] later |
2093 | 10.1k | b->comp_size = sz_best; |
2094 | 10.1k | } |
2095 | | |
2096 | | // Accumulate stats for all methods tried |
2097 | 47.0k | pthread_mutex_lock(&fd->metrics_lock); |
2098 | 1.55M | for (m = 0; m < CRAM_MAX_METHOD; m++) |
2099 | | // don't be overly sure on small blocks. |
2100 | | // +2000 means eg bzip2 vs gzip (1.07 to 1.04) or gz vs rans1 |
2101 | | // needs to be at least 60 bytes smaller to overcome the |
2102 | | // fixed size addition. |
2103 | 1.50M | metrics->sz[m] += sz[m]+2000; |
2104 | | |
2105 | | // When enough trials performed, find the best on average |
2106 | 47.0k | if (--metrics->trial == 0) { |
2107 | 12.9k | int best_method = RAW; |
2108 | 12.9k | int best_sz = INT_MAX; |
2109 | | |
2110 | | // Relative costs of methods. See enum_cram_block_method_int |
2111 | | // and methmap |
2112 | 12.9k | double meth_cost[32] = { |
2113 | | // Externally defined methods |
2114 | 12.9k | 1, // 0 raw |
2115 | 12.9k | 1.04, // 1 gzip (Z_FILTERED) |
2116 | 12.9k | 1.07, // 2 bzip2 |
2117 | 12.9k | 1.08, // 3 lzma |
2118 | 12.9k | 1.00, // 4 rans (O0) |
2119 | 12.9k | 1.00, // 5 ranspr (O0) |
2120 | 12.9k | 1.04, // 6 arithpr (O0) |
2121 | 12.9k | 1.05, // 7 fqz |
2122 | 12.9k | 1.05, // 8 tok3 (rans) |
2123 | 12.9k | 1.00, 1.00, // 9,10 reserved |
2124 | | |
2125 | | // Paramterised versions of above |
2126 | 12.9k | 1.01, // gzip rle |
2127 | 12.9k | 1.01, // gzip -1 |
2128 | | |
2129 | 12.9k | 1.05, 1.05, 1.05, // FQZ_b,c,d |
2130 | | |
2131 | 12.9k | 1.01, // rans O1 |
2132 | | |
2133 | 12.9k | 1.01, // rans_pr1 |
2134 | 12.9k | 1.00, // rans_pr64; if smaller, usually fast |
2135 | 12.9k | 1.03, // rans_pr65/9 |
2136 | 12.9k | 1.00, // rans_pr128 |
2137 | 12.9k | 1.01, // rans_pr129 |
2138 | 12.9k | 1.00, // rans_pr192 |
2139 | 12.9k | 1.01, // rans_pr193 |
2140 | | |
2141 | 12.9k | 1.07, // tok3 arith |
2142 | | |
2143 | 12.9k | 1.04, // arith_pr1 |
2144 | 12.9k | 1.04, // arith_pr64 |
2145 | 12.9k | 1.04, // arith_pr9 |
2146 | 12.9k | 1.03, // arith_pr128 |
2147 | 12.9k | 1.04, // arith_pr129 |
2148 | 12.9k | 1.04, // arith_pr192 |
2149 | 12.9k | 1.04, // arith_pr193 |
2150 | 12.9k | }; |
2151 | | |
2152 | | // Scale methods by cost based on compression level |
2153 | 12.9k | if (fd->level <= 1) { |
2154 | 0 | for (m = 0; m < CRAM_MAX_METHOD; m++) |
2155 | 0 | metrics->sz[m] *= 1+(meth_cost[m]-1)*4; |
2156 | 12.9k | } else if (fd->level <= 3) { |
2157 | 0 | for (m = 0; m < CRAM_MAX_METHOD; m++) |
2158 | 0 | metrics->sz[m] *= 1+(meth_cost[m]-1); |
2159 | 12.9k | } else if (fd->level <= 6) { |
2160 | 425k | for (m = 0; m < CRAM_MAX_METHOD; m++) |
2161 | 412k | metrics->sz[m] *= 1+(meth_cost[m]-1)/2; |
2162 | 12.9k | } else if (fd->level <= 7) { |
2163 | 0 | for (m = 0; m < CRAM_MAX_METHOD; m++) |
2164 | 0 | metrics->sz[m] *= 1+(meth_cost[m]-1)/3; |
2165 | 0 | } // else cost is ignored |
2166 | | |
2167 | | // Ensure these are never used; BSC and ZSTD |
2168 | 12.9k | metrics->sz[9] = metrics->sz[10] = INT_MAX; |
2169 | | |
2170 | 425k | for (m = 0; m < CRAM_MAX_METHOD; m++) { |
2171 | 412k | if ((!metrics->sz[m]) || (!(method & (1u<<m)))) |
2172 | 333k | continue; |
2173 | | |
2174 | 79.5k | if (best_sz > metrics->sz[m]) |
2175 | 28.5k | best_sz = metrics->sz[m], best_method = m; |
2176 | 79.5k | } |
2177 | | |
2178 | 12.9k | if (best_method != metrics->method) { |
2179 | | //metrics->trial = (NTRIALS+1)/2; // be sure |
2180 | | //metrics->next_trial /= 1.5; |
2181 | 7.72k | metrics->consistency = 0; |
2182 | 7.72k | } else { |
2183 | 5.18k | metrics->next_trial *= MIN(2, 1+metrics->consistency/4.0); |
2184 | 5.18k | metrics->consistency++; |
2185 | 5.18k | } |
2186 | | |
2187 | 12.9k | metrics->method = best_method; |
2188 | 12.9k | switch (best_method) { |
2189 | 111 | case GZIP: strat = Z_FILTERED; break; |
2190 | 6.02k | case GZIP_1: strat = Z_DEFAULT_STRATEGY; break; |
2191 | 4 | case GZIP_RLE: strat = Z_RLE; break; |
2192 | 0 | case FQZ: strat = CRAM_MAJOR_VERS(fd->version); break; |
2193 | 0 | case FQZ_b: strat = CRAM_MAJOR_VERS(fd->version)+256; break; |
2194 | 0 | case FQZ_c: strat = CRAM_MAJOR_VERS(fd->version)+2*256; break; |
2195 | 0 | case FQZ_d: strat = CRAM_MAJOR_VERS(fd->version)+3*256; break; |
2196 | 98 | case TOK3: strat = 0; break; |
2197 | 0 | case TOKA: strat = 1; break; |
2198 | 6.66k | default: strat = 0; |
2199 | 12.9k | } |
2200 | 12.9k | metrics->strat = strat; |
2201 | | |
2202 | | // If we see at least MAXFAIL trials in a row for a specific |
2203 | | // compression method with more than MAXDELTA aggregate |
2204 | | // size then we drop this from the list of methods used |
2205 | | // for this block type. |
2206 | 86.9k | #define MAXDELTA 0.20 |
2207 | 279k | #define MAXFAILS 4 |
2208 | 425k | for (m = 0; m < CRAM_MAX_METHOD; m++) { |
2209 | 412k | if (best_method == m) { |
2210 | 12.9k | metrics->cnt[m] = 0; |
2211 | 12.9k | metrics->extra[m] = 0; |
2212 | 399k | } else if (best_sz < metrics->sz[m]) { |
2213 | 279k | double r = (double)metrics->sz[m] / best_sz - 1; |
2214 | 279k | int mul = 1+(fd->level>=7); |
2215 | 279k | if (++metrics->cnt[m] >= MAXFAILS*mul && |
2216 | 279k | (metrics->extra[m] += r) >= MAXDELTA*mul) |
2217 | 42.7k | method &= ~(1u<<m); |
2218 | | |
2219 | | // Special case for fqzcomp as it rarely changes |
2220 | 279k | if (m == FQZ || m == FQZ_b || m == FQZ_c || m == FQZ_d) { |
2221 | 48.6k | if (metrics->sz[m] > best_sz) |
2222 | 48.6k | method &= ~(1u<<m); |
2223 | 48.6k | } |
2224 | 279k | } |
2225 | 412k | } |
2226 | | |
2227 | | //if (fd->verbose > 1 && method != metrics->revised_method) |
2228 | | // fprintf(stderr, "%d: revising method from %x to %x\n", |
2229 | | // b->content_id, metrics->revised_method, method); |
2230 | 12.9k | metrics->revised_method = method; |
2231 | 12.9k | } |
2232 | 47.0k | pthread_mutex_unlock(&fd->metrics_lock); |
2233 | 135k | } else { |
2234 | 135k | metrics->input_avg_delta = |
2235 | 135k | 0.9 * (metrics->input_avg_delta + |
2236 | 135k | ABS(b->uncomp_size - metrics->input_avg_sz)); |
2237 | | |
2238 | 135k | metrics->input_avg_sz += b->uncomp_size*.2; |
2239 | 135k | metrics->input_avg_sz *= 0.8; |
2240 | | |
2241 | 135k | strat = metrics->strat; |
2242 | 135k | method = metrics->method; |
2243 | | |
2244 | 135k | pthread_mutex_unlock(&fd->metrics_lock); |
2245 | 135k | comp = cram_compress_by_method(s, (char *)b->data, b->uncomp_size, |
2246 | 135k | b->content_id, &comp_size, method, |
2247 | 135k | method == GZIP_1 ? 1 : level, |
2248 | 135k | strat); |
2249 | 135k | if (!comp) { |
2250 | | // Our cached best method failed, but maybe another works? |
2251 | | // Rerun with trial mode engaged again. |
2252 | 23 | if (!recurse) { |
2253 | 23 | hts_log_warning("Compressed block ID %d method %s failed, " |
2254 | 23 | "redoing trial", b->content_id, |
2255 | 23 | cram_block_method2str(method)); |
2256 | 23 | pthread_mutex_lock(&fd->metrics_lock); |
2257 | 23 | metrics->trial = NTRIALS; |
2258 | 23 | metrics->next_trial = TRIAL_SPAN; |
2259 | 23 | metrics->revised_method = orig_method; |
2260 | 23 | pthread_mutex_unlock(&fd->metrics_lock); |
2261 | 23 | return cram_compress_block3(fd, s, b, metrics, method, |
2262 | 23 | level, 1); |
2263 | 23 | } |
2264 | 0 | return -1; |
2265 | 23 | } |
2266 | | |
2267 | 135k | if (comp_size < b->uncomp_size) { |
2268 | 20.1k | free(b->data); |
2269 | 20.1k | b->data = (unsigned char *)comp; |
2270 | 20.1k | b->comp_size = comp_size; |
2271 | 20.1k | b->method = method; |
2272 | 115k | } else { |
2273 | 115k | free(comp); |
2274 | 115k | } |
2275 | 135k | } |
2276 | | |
2277 | 183k | } else { |
2278 | | // no cached metrics, so just do zlib? |
2279 | 15.9k | comp = cram_compress_by_method(s, (char *)b->data, b->uncomp_size, |
2280 | 15.9k | b->content_id, &comp_size, GZIP, level, Z_FILTERED); |
2281 | 15.9k | if (!comp) { |
2282 | 0 | hts_log_error("Compression failed!"); |
2283 | 0 | return -1; |
2284 | 0 | } |
2285 | | |
2286 | 15.9k | if (comp_size < b->uncomp_size) { |
2287 | 963 | free(b->data); |
2288 | 963 | b->data = (unsigned char *)comp; |
2289 | 963 | b->comp_size = comp_size; |
2290 | 963 | b->method = GZIP; |
2291 | 14.9k | } else { |
2292 | 14.9k | free(comp); |
2293 | 14.9k | } |
2294 | 15.9k | strat = Z_FILTERED; |
2295 | 15.9k | } |
2296 | | |
2297 | 198k | hts_log_info("Compressed block ID %d from %d to %d by method %s", |
2298 | 198k | b->content_id, b->uncomp_size, b->comp_size, |
2299 | 198k | cram_block_method2str(b->method)); |
2300 | | |
2301 | 198k | b->method = methmap[b->method]; |
2302 | | |
2303 | 198k | return 0; |
2304 | 198k | } |
2305 | | |
2306 | | /* |
2307 | | * Compresses a block using a selection of compression codecs and options. |
2308 | | * The best is learnt and used for subsequent slices, periodically resampling. |
2309 | | * |
2310 | | * Method and level -1 implies defaults, as specified in cram_fd. |
2311 | | */ |
2312 | | int cram_compress_block2(cram_fd *fd, cram_slice *s, |
2313 | | cram_block *b, cram_metrics *metrics, |
2314 | 513k | int method, int level) { |
2315 | 513k | return cram_compress_block3(fd, s, b, metrics, method, level, 0); |
2316 | 513k | } |
2317 | | |
2318 | | int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics, |
2319 | 7.93k | int method, int level) { |
2320 | 7.93k | return cram_compress_block2(fd, NULL, b, metrics, method, level); |
2321 | 7.93k | } |
2322 | | |
2323 | 786k | cram_metrics *cram_new_metrics(void) { |
2324 | 786k | cram_metrics *m = calloc(1, sizeof(*m)); |
2325 | 786k | if (!m) |
2326 | 0 | return NULL; |
2327 | 786k | m->trial = NTRIALS-1; |
2328 | 786k | m->next_trial = TRIAL_SPAN/2; // learn quicker at start |
2329 | 786k | m->method = RAW; |
2330 | 786k | m->strat = 0; |
2331 | 786k | m->revised_method = 0; |
2332 | 786k | m->unpackable = 0; |
2333 | | |
2334 | 786k | return m; |
2335 | 786k | } |
2336 | | |
2337 | 198k | char *cram_block_method2str(enum cram_block_method_int m) { |
2338 | 198k | switch(m) { |
2339 | 167k | case RAW: return "RAW"; |
2340 | 2.96k | case GZIP: return "GZIP"; |
2341 | 0 | case BZIP2: return "BZIP2"; |
2342 | 0 | case LZMA: return "LZMA"; |
2343 | 0 | case RANS0: return "RANS0"; |
2344 | 0 | case RANS1: return "RANS1"; |
2345 | 150 | case GZIP_RLE: return "GZIP_RLE"; |
2346 | 2.69k | case GZIP_1: return "GZIP_1"; |
2347 | 0 | case FQZ: return "FQZ"; |
2348 | 0 | case FQZ_b: return "FQZ_b"; |
2349 | 0 | case FQZ_c: return "FQZ_c"; |
2350 | 0 | case FQZ_d: return "FQZ_d"; |
2351 | 743 | case RANS_PR0: return "RANS_PR0"; |
2352 | 61 | case RANS_PR1: return "RANS_PR1"; |
2353 | 12.7k | case RANS_PR64: return "RANS_PR64"; |
2354 | 3 | case RANS_PR9: return "RANS_PR9"; |
2355 | 11.6k | case RANS_PR128: return "RANS_PR128"; |
2356 | 0 | case RANS_PR129: return "RANS_PR129"; |
2357 | 0 | case RANS_PR192: return "RANS_PR192"; |
2358 | 195 | case RANS_PR193: return "RANS_PR193"; |
2359 | 27 | case TOK3: return "TOK3_R"; |
2360 | 0 | case TOKA: return "TOK3_A"; |
2361 | 0 | case ARITH_PR0: return "ARITH_PR0"; |
2362 | 0 | case ARITH_PR1: return "ARITH_PR1"; |
2363 | 0 | case ARITH_PR64: return "ARITH_PR64"; |
2364 | 0 | case ARITH_PR9: return "ARITH_PR9"; |
2365 | 0 | case ARITH_PR128: return "ARITH_PR128"; |
2366 | 0 | case ARITH_PR129: return "ARITH_PR129"; |
2367 | 0 | case ARITH_PR192: return "ARITH_PR192"; |
2368 | 0 | case ARITH_PR193: return "ARITH_PR193"; |
2369 | 0 | case BM_ERROR: break; |
2370 | 198k | } |
2371 | 0 | return "?"; |
2372 | 198k | } |
2373 | | |
2374 | 7 | char *cram_content_type2str(enum cram_content_type t) { |
2375 | 7 | switch (t) { |
2376 | 3 | case FILE_HEADER: return "FILE_HEADER"; |
2377 | 2 | case COMPRESSION_HEADER: return "COMPRESSION_HEADER"; |
2378 | 0 | case MAPPED_SLICE: return "MAPPED_SLICE"; |
2379 | 0 | case UNMAPPED_SLICE: return "UNMAPPED_SLICE"; |
2380 | 0 | case EXTERNAL: return "EXTERNAL"; |
2381 | 0 | case CORE: return "CORE"; |
2382 | 0 | case CT_ERROR: break; |
2383 | 7 | } |
2384 | 2 | return "?"; |
2385 | 7 | } |
2386 | | |
2387 | | /* ---------------------------------------------------------------------- |
2388 | | * Reference sequence handling |
2389 | | * |
2390 | | * These revolve around the refs_t structure, which may potentially be |
2391 | | * shared between multiple cram_fd. |
2392 | | * |
2393 | | * We start with refs_create() to allocate an empty refs_t and then |
2394 | | * populate it with @SQ line data using refs_from_header(). This is done on |
2395 | | * cram_open(). Also at start up we can call cram_load_reference() which |
2396 | | * is used with "scramble -r foo.fa". This replaces the fd->refs with the |
2397 | | * new one specified. In either case refs2id() is then called which |
2398 | | * maps ref_entry names to @SQ ids (refs_t->ref_id[]). |
2399 | | * |
2400 | | * Later, possibly within a thread, we will want to know the actual ref |
2401 | | * seq itself, obtained by calling cram_get_ref(). This may use the |
2402 | | * UR: or M5: fields or the filename specified in the original |
2403 | | * cram_load_reference() call. |
2404 | | * |
2405 | | * Given the potential for multi-threaded reference usage, we have |
2406 | | * reference counting (sorry for the confusing double use of "ref") to |
2407 | | * track the number of callers interested in any specific reference. |
2408 | | */ |
2409 | | |
2410 | | /* |
2411 | | * Frees/unmaps a reference sequence and associated file handles. |
2412 | | */ |
2413 | 6.20k | static void ref_entry_free_seq(ref_entry *e) { |
2414 | 6.20k | if (e->mf) |
2415 | 0 | mfclose(e->mf); |
2416 | 6.20k | if (e->seq && !e->mf) |
2417 | 0 | free(e->seq); |
2418 | | |
2419 | 6.20k | e->seq = NULL; |
2420 | 6.20k | e->mf = NULL; |
2421 | 6.20k | } |
2422 | | |
2423 | 16.6k | void refs_free(refs_t *r) { |
2424 | 16.6k | RP("refs_free()\n"); |
2425 | | |
2426 | 16.6k | if (--r->count > 0) |
2427 | 0 | return; |
2428 | | |
2429 | 16.6k | if (!r) |
2430 | 0 | return; |
2431 | | |
2432 | 16.6k | if (r->pool) |
2433 | 16.6k | string_pool_destroy(r->pool); |
2434 | | |
2435 | 16.6k | if (r->h_meta) { |
2436 | 16.6k | khint_t k; |
2437 | | |
2438 | 32.4k | for (k = kh_begin(r->h_meta); k != kh_end(r->h_meta); k++) { |
2439 | 15.8k | ref_entry *e; |
2440 | | |
2441 | 15.8k | if (!kh_exist(r->h_meta, k)) |
2442 | 9.61k | continue; |
2443 | 6.20k | if (!(e = kh_val(r->h_meta, k))) |
2444 | 0 | continue; |
2445 | 6.20k | ref_entry_free_seq(e); |
2446 | 6.20k | free(e); |
2447 | 6.20k | } |
2448 | | |
2449 | 16.6k | kh_destroy(refs, r->h_meta); |
2450 | 16.6k | } |
2451 | | |
2452 | 16.6k | if (r->ref_id) |
2453 | 8.44k | free(r->ref_id); |
2454 | | |
2455 | 16.6k | if (r->fp) |
2456 | 0 | bgzf_close(r->fp); |
2457 | | |
2458 | 16.6k | pthread_mutex_destroy(&r->lock); |
2459 | | |
2460 | 16.6k | free(r); |
2461 | 16.6k | } |
2462 | | |
2463 | 16.6k | static refs_t *refs_create(void) { |
2464 | 16.6k | refs_t *r = calloc(1, sizeof(*r)); |
2465 | | |
2466 | 16.6k | RP("refs_create()\n"); |
2467 | | |
2468 | 16.6k | if (!r) |
2469 | 0 | return NULL; |
2470 | | |
2471 | 16.6k | if (!(r->pool = string_pool_create(8192))) |
2472 | 0 | goto err; |
2473 | | |
2474 | 16.6k | r->ref_id = NULL; // see refs2id() to populate. |
2475 | 16.6k | r->count = 1; |
2476 | 16.6k | r->last = NULL; |
2477 | 16.6k | r->last_id = -1; |
2478 | | |
2479 | 16.6k | if (!(r->h_meta = kh_init(refs))) |
2480 | 0 | goto err; |
2481 | | |
2482 | 16.6k | pthread_mutex_init(&r->lock, NULL); |
2483 | | |
2484 | 16.6k | return r; |
2485 | | |
2486 | 0 | err: |
2487 | 0 | refs_free(r); |
2488 | 0 | return NULL; |
2489 | 16.6k | } |
2490 | | |
2491 | | /* |
2492 | | * Opens a reference fasta file as a BGZF stream, allowing for |
2493 | | * compressed files. It automatically builds a .fai file if |
2494 | | * required and if compressed a .gzi bgzf index too. |
2495 | | * |
2496 | | * Returns a BGZF handle on success; |
2497 | | * NULL on failure. |
2498 | | */ |
2499 | 1.01k | static BGZF *bgzf_open_ref(char *fn, char *mode, int is_md5) { |
2500 | 1.01k | BGZF *fp; |
2501 | | |
2502 | 1.01k | if (strncmp(fn, "file://", 7) == 0) |
2503 | 2 | fn += 7; |
2504 | | |
2505 | 1.01k | if (!is_md5 && !hisremote(fn)) { |
2506 | 610 | char fai_file[PATH_MAX]; |
2507 | | |
2508 | 610 | snprintf(fai_file, PATH_MAX, "%s.fai", fn); |
2509 | 610 | if (access(fai_file, R_OK) != 0) |
2510 | 610 | if (fai_build(fn) != 0) |
2511 | 610 | return NULL; |
2512 | 610 | } |
2513 | | |
2514 | 401 | if (!(fp = bgzf_open(fn, mode))) { |
2515 | 401 | perror(fn); |
2516 | 401 | return NULL; |
2517 | 401 | } |
2518 | | |
2519 | 0 | if (fp->is_compressed == 1 && bgzf_index_load(fp, fn, ".gzi") < 0) { |
2520 | 0 | hts_log_error("Unable to load .gzi index '%s.gzi'", fn); |
2521 | 0 | bgzf_close(fp); |
2522 | 0 | return NULL; |
2523 | 0 | } |
2524 | | |
2525 | 0 | return fp; |
2526 | 0 | } |
2527 | | |
2528 | | /* |
2529 | | * Loads a FAI file for a reference.fasta. |
2530 | | * "is_err" indicates whether failure to load is worthy of emitting an |
2531 | | * error message. In some cases (eg with embedded references) we |
2532 | | * speculatively load, just in case, and silently ignore errors. |
2533 | | * |
2534 | | * Returns the refs_t struct on success (maybe newly allocated); |
2535 | | * NULL on failure |
2536 | | */ |
2537 | 1.01k | static refs_t *refs_load_fai(refs_t *r_orig, const char *fn, int is_err) { |
2538 | 1.01k | hFILE *fp = NULL; |
2539 | 1.01k | char fai_fn[PATH_MAX]; |
2540 | 1.01k | char line[8192]; |
2541 | 1.01k | refs_t *r = r_orig; |
2542 | 1.01k | size_t fn_l = strlen(fn); |
2543 | 1.01k | int id = 0, id_alloc = 0; |
2544 | | |
2545 | 1.01k | RP("refs_load_fai %s\n", fn); |
2546 | | |
2547 | 1.01k | if (!r) |
2548 | 0 | if (!(r = refs_create())) |
2549 | 0 | goto err; |
2550 | | |
2551 | 1.01k | if (r->fp) |
2552 | 0 | if (bgzf_close(r->fp) != 0) |
2553 | 0 | goto err; |
2554 | 1.01k | r->fp = NULL; |
2555 | | |
2556 | | /* Look for a FASTA##idx##FAI format */ |
2557 | 1.01k | char *fn_delim = strstr(fn, HTS_IDX_DELIM); |
2558 | 1.01k | if (fn_delim) { |
2559 | 7 | if (!(r->fn = string_ndup(r->pool, fn, fn_delim - fn))) |
2560 | 0 | goto err; |
2561 | 7 | fn_delim += strlen(HTS_IDX_DELIM); |
2562 | 7 | snprintf(fai_fn, PATH_MAX, "%s", fn_delim); |
2563 | 1.00k | } else { |
2564 | | /* An index file was provided, instead of the actual reference file */ |
2565 | 1.00k | if (fn_l > 4 && strcmp(&fn[fn_l-4], ".fai") == 0) { |
2566 | 6 | if (!r->fn) { |
2567 | 5 | if (!(r->fn = string_ndup(r->pool, fn, fn_l-4))) |
2568 | 0 | goto err; |
2569 | 5 | } |
2570 | 6 | snprintf(fai_fn, PATH_MAX, "%s", fn); |
2571 | 998 | } else { |
2572 | | /* Only the reference file provided. Get the index file name from it */ |
2573 | 998 | if (!(r->fn = string_dup(r->pool, fn))) |
2574 | 0 | goto err; |
2575 | 998 | snprintf(fai_fn, PATH_MAX, "%.*s.fai", PATH_MAX-5, fn); |
2576 | 998 | } |
2577 | 1.00k | } |
2578 | | |
2579 | 1.01k | if (!(r->fp = bgzf_open_ref(r->fn, "r", 0))) { |
2580 | 1.01k | hts_log_error("Failed to open reference file '%s'", r->fn); |
2581 | 1.01k | goto err; |
2582 | 1.01k | } |
2583 | | |
2584 | 0 | if (!(fp = hopen(fai_fn, "r"))) { |
2585 | 0 | hts_log_error("Failed to open index file '%s'", fai_fn); |
2586 | 0 | if (is_err) |
2587 | 0 | perror(fai_fn); |
2588 | 0 | goto err; |
2589 | 0 | } |
2590 | 0 | while (hgets(line, 8192, fp) != NULL) { |
2591 | 0 | ref_entry *e = malloc(sizeof(*e)); |
2592 | 0 | char *cp; |
2593 | 0 | int n; |
2594 | 0 | khint_t k; |
2595 | |
|
2596 | 0 | if (!e) |
2597 | 0 | return NULL; |
2598 | | |
2599 | | // id |
2600 | 0 | for (cp = line; *cp && !isspace_c(*cp); cp++) |
2601 | 0 | ; |
2602 | 0 | *cp++ = 0; |
2603 | 0 | e->name = string_dup(r->pool, line); |
2604 | | |
2605 | | // length |
2606 | 0 | while (*cp && isspace_c(*cp)) |
2607 | 0 | cp++; |
2608 | 0 | e->length = strtoll(cp, &cp, 10); |
2609 | | |
2610 | | // offset |
2611 | 0 | while (*cp && isspace_c(*cp)) |
2612 | 0 | cp++; |
2613 | 0 | e->offset = strtoll(cp, &cp, 10); |
2614 | | |
2615 | | // bases per line |
2616 | 0 | while (*cp && isspace_c(*cp)) |
2617 | 0 | cp++; |
2618 | 0 | e->bases_per_line = strtol(cp, &cp, 10); |
2619 | | |
2620 | | // line length |
2621 | 0 | while (*cp && isspace_c(*cp)) |
2622 | 0 | cp++; |
2623 | 0 | e->line_length = strtol(cp, &cp, 10); |
2624 | | |
2625 | | // filename |
2626 | 0 | e->fn = r->fn; |
2627 | |
|
2628 | 0 | e->count = 0; |
2629 | 0 | e->seq = NULL; |
2630 | 0 | e->mf = NULL; |
2631 | 0 | e->is_md5 = 0; |
2632 | 0 | e->validated_md5 = 0; |
2633 | |
|
2634 | 0 | k = kh_put(refs, r->h_meta, e->name, &n); |
2635 | 0 | if (-1 == n) { |
2636 | 0 | free(e); |
2637 | 0 | return NULL; |
2638 | 0 | } |
2639 | | |
2640 | 0 | if (n) { |
2641 | 0 | kh_val(r->h_meta, k) = e; |
2642 | 0 | } else { |
2643 | 0 | ref_entry *re = kh_val(r->h_meta, k); |
2644 | 0 | if (re && (re->count != 0 || re->length != 0)) { |
2645 | | /* Keep old */ |
2646 | 0 | free(e); |
2647 | 0 | } else { |
2648 | | /* Replace old */ |
2649 | 0 | if (re) |
2650 | 0 | free(re); |
2651 | 0 | kh_val(r->h_meta, k) = e; |
2652 | 0 | } |
2653 | 0 | } |
2654 | |
|
2655 | 0 | if (id >= id_alloc) { |
2656 | 0 | ref_entry **new_refs; |
2657 | 0 | int x; |
2658 | |
|
2659 | 0 | id_alloc = id_alloc ?id_alloc*2 : 16; |
2660 | 0 | new_refs = realloc(r->ref_id, id_alloc * sizeof(*r->ref_id)); |
2661 | 0 | if (!new_refs) |
2662 | 0 | goto err; |
2663 | 0 | r->ref_id = new_refs; |
2664 | |
|
2665 | 0 | for (x = id; x < id_alloc; x++) |
2666 | 0 | r->ref_id[x] = NULL; |
2667 | 0 | } |
2668 | 0 | r->ref_id[id] = e; |
2669 | 0 | r->nref = ++id; |
2670 | 0 | } |
2671 | | |
2672 | 0 | if(hclose(fp) < 0) |
2673 | 0 | goto err; |
2674 | 0 | return r; |
2675 | | |
2676 | 1.01k | err: |
2677 | 1.01k | if (fp) |
2678 | 0 | hclose_abruptly(fp); |
2679 | | |
2680 | 1.01k | if (!r_orig) |
2681 | 0 | refs_free(r); |
2682 | | |
2683 | 1.01k | return NULL; |
2684 | 0 | } |
2685 | | |
2686 | | /* |
2687 | | * Verifies that the CRAM @SQ lines and .fai files match. |
2688 | | */ |
2689 | 0 | static void sanitise_SQ_lines(cram_fd *fd) { |
2690 | 0 | int i; |
2691 | |
|
2692 | 0 | if (!fd->header || !fd->header->hrecs) |
2693 | 0 | return; |
2694 | | |
2695 | 0 | if (!fd->refs || !fd->refs->h_meta) |
2696 | 0 | return; |
2697 | | |
2698 | 0 | for (i = 0; i < fd->header->hrecs->nref; i++) { |
2699 | 0 | const char *name = fd->header->hrecs->ref[i].name; |
2700 | 0 | khint_t k = kh_get(refs, fd->refs->h_meta, name); |
2701 | 0 | ref_entry *r; |
2702 | | |
2703 | | // We may have @SQ lines which have no known .fai, but do not |
2704 | | // in themselves pose a problem because they are unused in the file. |
2705 | 0 | if (k == kh_end(fd->refs->h_meta)) |
2706 | 0 | continue; |
2707 | | |
2708 | 0 | if (!(r = (ref_entry *)kh_val(fd->refs->h_meta, k))) |
2709 | 0 | continue; |
2710 | | |
2711 | 0 | if (r->length && r->length != fd->header->hrecs->ref[i].len) { |
2712 | 0 | assert(strcmp(r->name, fd->header->hrecs->ref[i].name) == 0); |
2713 | | |
2714 | | // Should we also check MD5sums here to ensure the correct |
2715 | | // reference was given? |
2716 | 0 | hts_log_warning("Header @SQ length mismatch for ref %s, %"PRIhts_pos" vs %d", |
2717 | 0 | r->name, fd->header->hrecs->ref[i].len, (int)r->length); |
2718 | | |
2719 | | // Fixing the parsed @SQ header will make MD:Z: strings work |
2720 | | // and also stop it producing N for the sequence. |
2721 | 0 | fd->header->hrecs->ref[i].len = r->length; |
2722 | 0 | } |
2723 | 0 | } |
2724 | 0 | } |
2725 | | |
2726 | | /* |
2727 | | * Indexes references by the order they appear in a BAM file. This may not |
2728 | | * necessarily be the same order they appear in the fasta reference file. |
2729 | | * |
2730 | | * Returns 0 on success |
2731 | | * -1 on failure |
2732 | | */ |
2733 | 7.93k | int refs2id(refs_t *r, sam_hdr_t *hdr) { |
2734 | 7.93k | int i; |
2735 | 7.93k | sam_hrecs_t *h = hdr->hrecs; |
2736 | | |
2737 | 7.93k | if (r->ref_id) |
2738 | 2.21k | free(r->ref_id); |
2739 | 7.93k | if (r->last) |
2740 | 0 | r->last = NULL; |
2741 | | |
2742 | 7.93k | r->ref_id = calloc(h->nref, sizeof(*r->ref_id)); |
2743 | 7.93k | if (!r->ref_id) |
2744 | 0 | return -1; |
2745 | | |
2746 | 7.93k | r->nref = h->nref; |
2747 | 12.8k | for (i = 0; i < h->nref; i++) { |
2748 | 4.96k | khint_t k = kh_get(refs, r->h_meta, h->ref[i].name); |
2749 | 4.96k | if (k != kh_end(r->h_meta)) { |
2750 | 4.96k | r->ref_id[i] = kh_val(r->h_meta, k); |
2751 | 4.96k | } else { |
2752 | 0 | hts_log_warning("Unable to find ref name '%s'", h->ref[i].name); |
2753 | 0 | } |
2754 | 4.96k | } |
2755 | | |
2756 | 7.93k | return 0; |
2757 | 7.93k | } |
2758 | | |
2759 | | /* |
2760 | | * Generates refs_t entries based on @SQ lines in the header. |
2761 | | * Returns 0 on success |
2762 | | * -1 on failure |
2763 | | */ |
2764 | 33.5k | static int refs_from_header(cram_fd *fd) { |
2765 | 33.5k | if (!fd) |
2766 | 0 | return -1; |
2767 | | |
2768 | 33.5k | refs_t *r = fd->refs; |
2769 | 33.5k | if (!r) |
2770 | 0 | return -1; |
2771 | | |
2772 | 33.5k | sam_hdr_t *h = fd->header; |
2773 | 33.5k | if (!h) |
2774 | 9.37k | return 0; |
2775 | | |
2776 | 24.1k | if (!h->hrecs) { |
2777 | 15.0k | if (-1 == sam_hdr_fill_hrecs(h)) |
2778 | 1.05k | return -1; |
2779 | 15.0k | } |
2780 | | |
2781 | 23.0k | if (h->hrecs->nref == 0) |
2782 | 18.1k | return 0; |
2783 | | |
2784 | | //fprintf(stderr, "refs_from_header for %p mode %c\n", fd, fd->mode); |
2785 | | |
2786 | | /* Existing refs are fine, as long as they're compatible with the hdr. */ |
2787 | 4.94k | ref_entry **new_ref_id = realloc(r->ref_id, (r->nref + h->hrecs->nref) * sizeof(*r->ref_id)); |
2788 | 4.94k | if (!new_ref_id) |
2789 | 0 | return -1; |
2790 | 4.94k | r->ref_id = new_ref_id; |
2791 | | |
2792 | 4.94k | int i, j; |
2793 | | /* Copy info from h->ref[i] over to r */ |
2794 | 16.1k | for (i = 0, j = r->nref; i < h->hrecs->nref; i++) { |
2795 | 11.1k | sam_hrec_type_t *ty; |
2796 | 11.1k | sam_hrec_tag_t *tag; |
2797 | 11.1k | khint_t k; |
2798 | 11.1k | int n; |
2799 | | |
2800 | 11.1k | k = kh_get(refs, r->h_meta, h->hrecs->ref[i].name); |
2801 | 11.1k | if (k != kh_end(r->h_meta)) |
2802 | | // Ref already known about |
2803 | 4.96k | continue; |
2804 | | |
2805 | 6.20k | if (!(r->ref_id[j] = calloc(1, sizeof(ref_entry)))) |
2806 | 0 | return -1; |
2807 | | |
2808 | 6.20k | if (!h->hrecs->ref[i].name) |
2809 | 0 | return -1; |
2810 | | |
2811 | 6.20k | r->ref_id[j]->name = string_dup(r->pool, h->hrecs->ref[i].name); |
2812 | 6.20k | if (!r->ref_id[j]->name) return -1; |
2813 | 6.20k | r->ref_id[j]->length = 0; // marker for not yet loaded |
2814 | | |
2815 | | /* Initialise likely filename if known */ |
2816 | 6.20k | if ((ty = sam_hrecs_find_type_id(h->hrecs, "SQ", "SN", h->hrecs->ref[i].name))) { |
2817 | 6.20k | if ((tag = sam_hrecs_find_key(ty, "M5", NULL))) |
2818 | 41 | r->ref_id[j]->fn = string_dup(r->pool, tag->str+3); |
2819 | | |
2820 | 6.20k | if ((tag = sam_hrecs_find_key(ty, "LN", NULL))) { |
2821 | | // LN tag used when constructing consensus reference |
2822 | 6.20k | r->ref_id[j]->LN_length = strtoll(tag->str+3, NULL, 0); |
2823 | | // See fuzz 382922241 |
2824 | 6.20k | if (r->ref_id[j]->LN_length < 0) |
2825 | 446 | r->ref_id[j]->LN_length = 0; |
2826 | 6.20k | } |
2827 | 6.20k | } |
2828 | | |
2829 | 6.20k | k = kh_put(refs, r->h_meta, r->ref_id[j]->name, &n); |
2830 | 6.20k | if (n <= 0) // already exists or error |
2831 | 0 | return -1; |
2832 | 6.20k | kh_val(r->h_meta, k) = r->ref_id[j]; |
2833 | | |
2834 | 6.20k | j++; |
2835 | 6.20k | } |
2836 | 4.94k | r->nref = j; |
2837 | | |
2838 | 4.94k | return 0; |
2839 | 4.94k | } |
2840 | | |
2841 | | /* |
2842 | | * Attaches a header to a cram_fd. |
2843 | | * |
2844 | | * This should be used when creating a new cram_fd for writing where |
2845 | | * we have a header already constructed (eg from a file we've read |
2846 | | * in). |
2847 | | */ |
2848 | 8.98k | int cram_set_header2(cram_fd *fd, const sam_hdr_t *hdr) { |
2849 | 8.98k | if (!fd || !hdr ) |
2850 | 0 | return -1; |
2851 | | |
2852 | 8.98k | if (fd->header != hdr) { |
2853 | 8.98k | if (fd->header) |
2854 | 0 | sam_hdr_destroy(fd->header); |
2855 | 8.98k | fd->header = sam_hdr_dup(hdr); |
2856 | 8.98k | if (!fd->header) |
2857 | 0 | return -1; |
2858 | 8.98k | } |
2859 | 8.98k | return refs_from_header(fd); |
2860 | 8.98k | } |
2861 | | |
2862 | 0 | int cram_set_header(cram_fd *fd, sam_hdr_t *hdr) { |
2863 | 0 | return cram_set_header2(fd, hdr); |
2864 | 0 | } |
2865 | | |
2866 | | /* |
2867 | | * Returns whether the path refers to a directory. |
2868 | | */ |
2869 | 0 | static int is_directory(char *fn) { |
2870 | 0 | struct stat buf; |
2871 | 0 | if ( stat(fn,&buf) ) return 0; |
2872 | 0 | return S_ISDIR(buf.st_mode); |
2873 | 0 | } |
2874 | | |
2875 | | /* |
2876 | | * Converts a directory and a filename into an expanded path, replacing %s |
2877 | | * in directory with the filename and %[0-9]+s with portions of the filename |
2878 | | * Any remaining parts of filename are added to the end with /%s. |
2879 | | */ |
2880 | 0 | static int expand_cache_path(char *path, char *dir, const char *fn) { |
2881 | 0 | char *cp, *start = path; |
2882 | 0 | size_t len; |
2883 | 0 | size_t sz = PATH_MAX; |
2884 | |
|
2885 | 0 | while ((cp = strchr(dir, '%'))) { |
2886 | 0 | if (cp-dir >= sz) return -1; |
2887 | 0 | strncpy(path, dir, cp-dir); |
2888 | 0 | path += cp-dir; |
2889 | 0 | sz -= cp-dir; |
2890 | |
|
2891 | 0 | if (*++cp == 's') { |
2892 | 0 | len = strlen(fn); |
2893 | 0 | if (len >= sz) return -1; |
2894 | 0 | strcpy(path, fn); |
2895 | 0 | path += len; |
2896 | 0 | sz -= len; |
2897 | 0 | fn += len; |
2898 | 0 | cp++; |
2899 | 0 | } else if (*cp >= '0' && *cp <= '9') { |
2900 | 0 | char *endp; |
2901 | 0 | long l; |
2902 | |
|
2903 | 0 | l = strtol(cp, &endp, 10); |
2904 | 0 | l = MIN(l, strlen(fn)); |
2905 | 0 | if (*endp == 's') { |
2906 | 0 | if (l >= sz) return -1; |
2907 | 0 | strncpy(path, fn, l); |
2908 | 0 | path += l; |
2909 | 0 | fn += l; |
2910 | 0 | sz -= l; |
2911 | 0 | *path = 0; |
2912 | 0 | cp = endp+1; |
2913 | 0 | } else { |
2914 | 0 | if (sz < 3) return -1; |
2915 | 0 | *path++ = '%'; |
2916 | 0 | *path++ = *cp++; |
2917 | 0 | } |
2918 | 0 | } else { |
2919 | 0 | if (sz < 3) return -1; |
2920 | 0 | *path++ = '%'; |
2921 | 0 | *path++ = *cp++; |
2922 | 0 | } |
2923 | 0 | dir = cp; |
2924 | 0 | } |
2925 | | |
2926 | 0 | len = strlen(dir); |
2927 | 0 | if (len >= sz) return -1; |
2928 | 0 | strcpy(path, dir); |
2929 | 0 | path += len; |
2930 | 0 | sz -= len; |
2931 | |
|
2932 | 0 | len = strlen(fn) + ((*fn && path > start && path[-1] != '/') ? 1 : 0); |
2933 | 0 | if (len >= sz) return -1; |
2934 | 0 | if (*fn && path > start && path[-1] != '/') |
2935 | 0 | *path++ = '/'; |
2936 | 0 | strcpy(path, fn); |
2937 | 0 | return 0; |
2938 | 0 | } |
2939 | | |
2940 | | /* |
2941 | | * Make the directory containing path and any prefix directories. |
2942 | | */ |
2943 | 0 | static void mkdir_prefix(char *path, int mode) { |
2944 | 0 | char *cp = strrchr(path, '/'); |
2945 | 0 | if (!cp) |
2946 | 0 | return; |
2947 | | |
2948 | 0 | *cp = 0; |
2949 | 0 | if (is_directory(path)) { |
2950 | 0 | *cp = '/'; |
2951 | 0 | return; |
2952 | 0 | } |
2953 | | |
2954 | 0 | if (mkdir(path, mode) == 0) { |
2955 | 0 | chmod(path, mode); |
2956 | 0 | *cp = '/'; |
2957 | 0 | return; |
2958 | 0 | } |
2959 | | |
2960 | 0 | mkdir_prefix(path, mode); |
2961 | 0 | mkdir(path, mode); |
2962 | 0 | chmod(path, mode); |
2963 | 0 | *cp = '/'; |
2964 | 0 | } |
2965 | | |
2966 | | /* |
2967 | | * Queries the M5 string from the header and attempts to populate the |
2968 | | * reference from this using the REF_PATH environment. |
2969 | | * |
2970 | | * Returns 0 on success |
2971 | | * -1 on failure |
2972 | | */ |
2973 | 2.46k | static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) { |
2974 | 2.46k | char *ref_path = getenv("REF_PATH"); |
2975 | 2.46k | sam_hrec_type_t *ty; |
2976 | 2.46k | sam_hrec_tag_t *tag; |
2977 | 2.46k | char path[PATH_MAX]; |
2978 | 2.46k | kstring_t path_tmp = KS_INITIALIZE; |
2979 | 2.46k | char *local_cache = getenv("REF_CACHE"); |
2980 | 2.46k | mFILE *mf; |
2981 | 2.46k | int local_path = 0; |
2982 | | |
2983 | 2.46k | hts_log_info("Running cram_populate_ref on fd %p, id %d", (void *)fd, id); |
2984 | | |
2985 | 2.46k | if (!r->name) |
2986 | 0 | return -1; |
2987 | | |
2988 | 2.46k | if (!(ty = sam_hrecs_find_type_id(fd->header->hrecs, "SQ", "SN", r->name))) |
2989 | 0 | return -1; |
2990 | | |
2991 | 2.46k | if (!(tag = sam_hrecs_find_key(ty, "M5", NULL))) |
2992 | 2.45k | goto no_M5; |
2993 | | |
2994 | 12 | hts_log_info("Querying ref %s", tag->str+3); |
2995 | | |
2996 | | /* Use cache if available */ |
2997 | 12 | if (local_cache && *local_cache) { |
2998 | 0 | struct stat sb; |
2999 | 0 | if (expand_cache_path(path, local_cache, tag->str+3) == 0 && |
3000 | 0 | stat(path, &sb) == 0) |
3001 | | // Found it in the local cache |
3002 | 0 | local_path = 1; |
3003 | 0 | } |
3004 | | |
3005 | | #ifndef HAVE_MMAP |
3006 | | char *path2; |
3007 | | /* Search local files in REF_PATH; we can open them and return as above */ |
3008 | | if (!local_path && (path2 = find_path(tag->str+3, ref_path))) { |
3009 | | int len = snprintf(path, PATH_MAX, "%s", path2); |
3010 | | free(path2); |
3011 | | if (len > 0 && len < PATH_MAX) // in case it's too long |
3012 | | local_path = 1; |
3013 | | } |
3014 | | #endif |
3015 | | |
3016 | | /* Found via REF_CACHE or local REF_PATH file */ |
3017 | 12 | if (local_path) { |
3018 | 0 | struct stat sb; |
3019 | 0 | BGZF *fp; |
3020 | |
|
3021 | 0 | if (0 == stat(path, &sb) |
3022 | 0 | && S_ISREG(sb.st_mode) |
3023 | 0 | && (fp = bgzf_open(path, "r"))) { |
3024 | 0 | r->length = sb.st_size; |
3025 | 0 | r->offset = r->line_length = r->bases_per_line = 0; |
3026 | |
|
3027 | 0 | r->fn = string_dup(fd->refs->pool, path); |
3028 | |
|
3029 | 0 | if (fd->refs->fp) |
3030 | 0 | if (bgzf_close(fd->refs->fp) != 0) |
3031 | 0 | return -1; |
3032 | 0 | fd->refs->fp = fp; |
3033 | 0 | fd->refs->fn = r->fn; |
3034 | 0 | r->is_md5 = 1; |
3035 | 0 | r->validated_md5 = 1; |
3036 | | |
3037 | | // Fall back to cram_get_ref() where it'll do the actual |
3038 | | // reading of the file. |
3039 | 0 | return 0; |
3040 | 0 | } |
3041 | 0 | } |
3042 | | |
3043 | | |
3044 | | /* Otherwise search full REF_PATH; slower as loads entire file */ |
3045 | 12 | int is_local = 0; |
3046 | 12 | if ((mf = open_path_mfile(tag->str+3, ref_path, NULL, &is_local))) { |
3047 | 0 | size_t sz; |
3048 | 0 | r->seq = mfsteal(mf, &sz); |
3049 | 0 | if (r->seq) { |
3050 | 0 | r->mf = NULL; |
3051 | 0 | } else { |
3052 | | // keep mf around as we couldn't detach |
3053 | 0 | r->seq = mf->data; |
3054 | 0 | r->mf = mf; |
3055 | 0 | } |
3056 | 0 | r->length = sz; |
3057 | 0 | r->is_md5 = 1; |
3058 | 0 | r->validated_md5 = 1; |
3059 | 12 | } else { |
3060 | 12 | refs_t *refs; |
3061 | 12 | const char *fn; |
3062 | 12 | sam_hrec_tag_t *UR_tag; |
3063 | | |
3064 | 2.46k | no_M5: |
3065 | | /* Failed to find in search path or M5 cache, see if @SQ UR: tag? */ |
3066 | 2.46k | if (!(UR_tag = sam_hrecs_find_key(ty, "UR", NULL))) |
3067 | 1.44k | return -1; |
3068 | | |
3069 | 1.01k | if (strstr(UR_tag->str+3, "://") && |
3070 | 1.01k | strncmp(UR_tag->str+3, "file:", 5) != 0) { |
3071 | | // Documented as omitted, but accidentally supported until now |
3072 | 7 | hts_log_error("UR tags pointing to remote files are not supported"); |
3073 | 7 | return -1; |
3074 | 7 | } |
3075 | | |
3076 | 1.01k | fn = (strncmp(UR_tag->str+3, "file:", 5) == 0) |
3077 | 1.01k | ? UR_tag->str+8 |
3078 | 1.01k | : UR_tag->str+3; |
3079 | | |
3080 | 1.01k | if (fd->refs->fp) { |
3081 | 0 | if (bgzf_close(fd->refs->fp) != 0) |
3082 | 0 | return -1; |
3083 | 0 | fd->refs->fp = NULL; |
3084 | 0 | } |
3085 | 1.01k | if (!(refs = refs_load_fai(fd->refs, fn, 0))) |
3086 | 1.01k | return -1; |
3087 | 0 | sanitise_SQ_lines(fd); |
3088 | |
|
3089 | 0 | fd->refs = refs; |
3090 | 0 | if (fd->refs->fp) { |
3091 | 0 | if (bgzf_close(fd->refs->fp) != 0) |
3092 | 0 | return -1; |
3093 | 0 | fd->refs->fp = NULL; |
3094 | 0 | } |
3095 | | |
3096 | 0 | if (!fd->refs->fn) |
3097 | 0 | return -1; |
3098 | | |
3099 | 0 | if (-1 == refs2id(fd->refs, fd->header)) |
3100 | 0 | return -1; |
3101 | 0 | if (!fd->refs->ref_id || !fd->refs->ref_id[id]) |
3102 | 0 | return -1; |
3103 | | |
3104 | | // Local copy already, so fall back to cram_get_ref(). |
3105 | 0 | return 0; |
3106 | 0 | } |
3107 | | |
3108 | | /* Populate the local disk cache if required */ |
3109 | 0 | if (!is_local && local_cache && *local_cache) { |
3110 | 0 | hFILE *fp; |
3111 | |
|
3112 | 0 | if (expand_cache_path(path, local_cache, tag->str+3) < 0) { |
3113 | 0 | return 0; // Not fatal - we have the data already so keep going. |
3114 | 0 | } |
3115 | 0 | hts_log_info("Writing cache file '%s'", path); |
3116 | 0 | mkdir_prefix(path, 01777); |
3117 | |
|
3118 | 0 | fp = hts_open_tmpfile(path, "wx", &path_tmp); |
3119 | 0 | if (!fp) { |
3120 | 0 | perror(path_tmp.s); |
3121 | 0 | free(path_tmp.s); |
3122 | | |
3123 | | // Not fatal - we have the data already so keep going. |
3124 | 0 | return 0; |
3125 | 0 | } |
3126 | | |
3127 | | // Check md5sum |
3128 | 0 | hts_md5_context *md5; |
3129 | 0 | char unsigned md5_buf1[16]; |
3130 | 0 | char md5_buf2[33]; |
3131 | |
|
3132 | 0 | if (!(md5 = hts_md5_init())) { |
3133 | 0 | hclose_abruptly(fp); |
3134 | 0 | unlink(path_tmp.s); |
3135 | 0 | free(path_tmp.s); |
3136 | 0 | return -1; |
3137 | 0 | } |
3138 | 0 | hts_md5_update(md5, r->seq, r->length); |
3139 | 0 | hts_md5_final(md5_buf1, md5); |
3140 | 0 | hts_md5_destroy(md5); |
3141 | 0 | hts_md5_hex(md5_buf2, md5_buf1); |
3142 | |
|
3143 | 0 | if (strncmp(tag->str+3, md5_buf2, 32) != 0) { |
3144 | 0 | hts_log_error("Mismatching md5sum for downloaded reference"); |
3145 | 0 | hclose_abruptly(fp); |
3146 | 0 | unlink(path_tmp.s); |
3147 | 0 | free(path_tmp.s); |
3148 | 0 | return -1; |
3149 | 0 | } |
3150 | | |
3151 | 0 | ssize_t length_written = hwrite(fp, r->seq, r->length); |
3152 | 0 | if (hclose(fp) < 0 || length_written != r->length || |
3153 | 0 | chmod(path_tmp.s, 0444) < 0 || |
3154 | 0 | rename(path_tmp.s, path) < 0) { |
3155 | 0 | hts_log_error("Creating reference at %s failed: %s", |
3156 | 0 | path, strerror(errno)); |
3157 | 0 | unlink(path_tmp.s); |
3158 | 0 | } |
3159 | 0 | } |
3160 | | |
3161 | 0 | free(path_tmp.s); |
3162 | 0 | return 0; |
3163 | 0 | } |
3164 | | |
3165 | 1.04k | static void cram_ref_incr_locked(refs_t *r, int id) { |
3166 | 1.04k | RP("%d INC REF %d, %d %p\n", gettid(), id, |
3167 | 1.04k | (int)(id>=0 && r->ref_id[id]?r->ref_id[id]->count+1:-999), |
3168 | 1.04k | id>=0 && r->ref_id[id]?r->ref_id[id]->seq:(char *)1); |
3169 | | |
3170 | 1.04k | if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq) |
3171 | 1.04k | return; |
3172 | | |
3173 | 0 | if (r->last_id == id) |
3174 | 0 | r->last_id = -1; |
3175 | |
|
3176 | 0 | ++r->ref_id[id]->count; |
3177 | 0 | } |
3178 | | |
3179 | 1.04k | void cram_ref_incr(refs_t *r, int id) { |
3180 | 1.04k | pthread_mutex_lock(&r->lock); |
3181 | 1.04k | cram_ref_incr_locked(r, id); |
3182 | 1.04k | pthread_mutex_unlock(&r->lock); |
3183 | 1.04k | } |
3184 | | |
3185 | 66 | static void cram_ref_decr_locked(refs_t *r, int id) { |
3186 | 66 | RP("%d DEC REF %d, %d %p\n", gettid(), id, |
3187 | 66 | (int)(id>=0 && r->ref_id[id]?r->ref_id[id]->count-1:-999), |
3188 | 66 | id>=0 && r->ref_id[id]?r->ref_id[id]->seq:(char *)1); |
3189 | | |
3190 | 66 | if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq) { |
3191 | 66 | return; |
3192 | 66 | } |
3193 | | |
3194 | 0 | if (--r->ref_id[id]->count <= 0) { |
3195 | 0 | assert(r->ref_id[id]->count == 0); |
3196 | 0 | if (r->last_id >= 0) { |
3197 | 0 | if (r->ref_id[r->last_id]->count <= 0 && |
3198 | 0 | r->ref_id[r->last_id]->seq) { |
3199 | 0 | RP("%d FREE REF %d (%p)\n", gettid(), |
3200 | 0 | r->last_id, r->ref_id[r->last_id]->seq); |
3201 | 0 | ref_entry_free_seq(r->ref_id[r->last_id]); |
3202 | 0 | if (r->ref_id[r->last_id]->is_md5) r->ref_id[r->last_id]->length = 0; |
3203 | 0 | } |
3204 | 0 | } |
3205 | 0 | r->last_id = id; |
3206 | 0 | } |
3207 | 0 | } |
3208 | | |
3209 | 66 | void cram_ref_decr(refs_t *r, int id) { |
3210 | 66 | pthread_mutex_lock(&r->lock); |
3211 | 66 | cram_ref_decr_locked(r, id); |
3212 | 66 | pthread_mutex_unlock(&r->lock); |
3213 | 66 | } |
3214 | | |
3215 | | /* |
3216 | | * Used by cram_ref_load and cram_get_ref. The file handle will have |
3217 | | * already been opened, so we can catch it. The ref_entry *e informs us |
3218 | | * of whether this is a multi-line fasta file or a raw MD5 style file. |
3219 | | * Either way we create a single contiguous sequence. |
3220 | | * |
3221 | | * Returns all or part of a reference sequence on success (malloced); |
3222 | | * NULL on failure. |
3223 | | */ |
3224 | | static char *load_ref_portion(BGZF *fp, ref_entry *e, |
3225 | 0 | hts_pos_t start, hts_pos_t end) { |
3226 | 0 | off_t offset, len; |
3227 | 0 | char *seq; |
3228 | |
|
3229 | 0 | if (end < start) |
3230 | 0 | end = start; |
3231 | | |
3232 | | /* |
3233 | | * Compute locations in file. This is trivial for the MD5 files, but |
3234 | | * is still necessary for the fasta variants. |
3235 | | * |
3236 | | * Note the offset here, as with faidx, has the assumption that white- |
3237 | | * space (the diff between line_length and bases_per_line) only occurs |
3238 | | * at the end of a line of text. |
3239 | | */ |
3240 | 0 | offset = e->line_length |
3241 | 0 | ? e->offset + (start-1)/e->bases_per_line * e->line_length + |
3242 | 0 | (start-1) % e->bases_per_line |
3243 | 0 | : start-1; |
3244 | |
|
3245 | 0 | len = (e->line_length |
3246 | 0 | ? e->offset + (end-1)/e->bases_per_line * e->line_length + |
3247 | 0 | (end-1) % e->bases_per_line |
3248 | 0 | : end-1) - offset + 1; |
3249 | |
|
3250 | 0 | if (bgzf_useek(fp, offset, SEEK_SET) < 0) { |
3251 | 0 | perror("bgzf_useek() on reference file"); |
3252 | 0 | return NULL; |
3253 | 0 | } |
3254 | | |
3255 | 0 | if (len == 0 || !(seq = malloc(len))) { |
3256 | 0 | return NULL; |
3257 | 0 | } |
3258 | | |
3259 | 0 | if (len != bgzf_read(fp, seq, len)) { |
3260 | 0 | perror("bgzf_read() on reference file"); |
3261 | 0 | free(seq); |
3262 | 0 | return NULL; |
3263 | 0 | } |
3264 | | |
3265 | | /* Strip white-space if required. */ |
3266 | 0 | if (len != end-start+1) { |
3267 | 0 | hts_pos_t i, j; |
3268 | 0 | char *cp = seq; |
3269 | 0 | char *cp_to; |
3270 | | |
3271 | | // Copy up to the first white-space, and then repeatedly just copy |
3272 | | // bases_per_line verbatim, and use the slow method to end again. |
3273 | | // |
3274 | | // This may seem excessive, but this code can be a significant |
3275 | | // portion of total CRAM decode CPU time for shallow data sets. |
3276 | 0 | for (i = j = 0; i < len; i++) { |
3277 | 0 | if (!isspace_c(cp[i])) |
3278 | 0 | cp[j++] = cp[i] & ~0x20; |
3279 | 0 | else |
3280 | 0 | break; |
3281 | 0 | } |
3282 | 0 | while (i < len && isspace_c(cp[i])) |
3283 | 0 | i++; |
3284 | 0 | while (i < len - e->line_length) { |
3285 | 0 | hts_pos_t j_end = j + e->bases_per_line; |
3286 | 0 | while (j < j_end) |
3287 | 0 | cp[j++] = cp[i++] & ~0x20; // toupper equiv |
3288 | 0 | i += e->line_length - e->bases_per_line; |
3289 | 0 | } |
3290 | 0 | for (; i < len; i++) { |
3291 | 0 | if (!isspace_c(cp[i])) |
3292 | 0 | cp[j++] = cp[i] & ~0x20; |
3293 | 0 | } |
3294 | |
|
3295 | 0 | cp_to = cp+j; |
3296 | |
|
3297 | 0 | if (cp_to - seq != end-start+1) { |
3298 | 0 | hts_log_error("Malformed reference file"); |
3299 | 0 | free(seq); |
3300 | 0 | return NULL; |
3301 | 0 | } |
3302 | 0 | } else { |
3303 | 0 | int i; |
3304 | 0 | for (i = 0; i < len; i++) { |
3305 | 0 | seq[i] = toupper_c(seq[i]); |
3306 | 0 | } |
3307 | 0 | } |
3308 | | |
3309 | 0 | return seq; |
3310 | 0 | } |
3311 | | |
3312 | | /* |
3313 | | * Load the entire reference 'id'. |
3314 | | * This also increments the reference count by 1. |
3315 | | * |
3316 | | * Returns ref_entry on success; |
3317 | | * NULL on failure |
3318 | | */ |
3319 | 0 | ref_entry *cram_ref_load(refs_t *r, int id, int is_md5) { |
3320 | 0 | ref_entry *e = r->ref_id[id]; |
3321 | 0 | hts_pos_t start = 1, end = e->length; |
3322 | 0 | char *seq; |
3323 | |
|
3324 | 0 | if (e->seq) { |
3325 | 0 | return e; |
3326 | 0 | } |
3327 | | |
3328 | 0 | assert(e->count == 0); |
3329 | | |
3330 | 0 | if (r->last) { |
3331 | | #ifdef REF_DEBUG |
3332 | | int idx = 0; |
3333 | | for (idx = 0; idx < r->nref; idx++) |
3334 | | if (r->last == r->ref_id[idx]) |
3335 | | break; |
3336 | | RP("%d cram_ref_load DECR %d\n", gettid(), idx); |
3337 | | #endif |
3338 | 0 | assert(r->last->count > 0); |
3339 | 0 | if (--r->last->count <= 0) { |
3340 | 0 | RP("%d FREE REF %d (%p)\n", gettid(), id, r->ref_id[id]->seq); |
3341 | 0 | if (r->last->seq) |
3342 | 0 | ref_entry_free_seq(r->last); |
3343 | 0 | } |
3344 | 0 | } |
3345 | | |
3346 | 0 | if (!r->fn) |
3347 | 0 | return NULL; |
3348 | | |
3349 | | /* Open file if it's not already the current open reference */ |
3350 | 0 | if (strcmp(r->fn, e->fn) || r->fp == NULL) { |
3351 | 0 | if (r->fp) |
3352 | 0 | if (bgzf_close(r->fp) != 0) |
3353 | 0 | return NULL; |
3354 | 0 | r->fn = e->fn; |
3355 | 0 | if (!(r->fp = bgzf_open_ref(r->fn, "r", is_md5))) |
3356 | 0 | return NULL; |
3357 | 0 | } |
3358 | | |
3359 | 0 | RP("%d Loading ref %d (%d..%d)\n", gettid(), id, start, end); |
3360 | |
|
3361 | 0 | if (!(seq = load_ref_portion(r->fp, e, start, end))) { |
3362 | 0 | return NULL; |
3363 | 0 | } |
3364 | | |
3365 | 0 | RP("%d Loaded ref %d (%d..%d) = %p\n", gettid(), id, start, end, seq); |
3366 | |
|
3367 | 0 | RP("%d INC REF %d, %"PRId64"\n", gettid(), id, (e->count+1)); |
3368 | 0 | e->seq = seq; |
3369 | 0 | e->mf = NULL; |
3370 | 0 | e->count++; |
3371 | | |
3372 | | /* |
3373 | | * Also keep track of last used ref so incr/decr loops on the same |
3374 | | * sequence don't cause load/free loops. |
3375 | | */ |
3376 | 0 | RP("%d cram_ref_load INCR %d => %"PRId64"\n", gettid(), id, e->count+1); |
3377 | 0 | r->last = e; |
3378 | 0 | e->count++; |
3379 | |
|
3380 | 0 | return e; |
3381 | 0 | } |
3382 | | |
3383 | | /* |
3384 | | * Returns a portion of a reference sequence from start to end inclusive. |
3385 | | * The returned pointer is owned by either the cram_file fd or by the |
3386 | | * internal refs_t structure and should not be freed by the caller. |
3387 | | * |
3388 | | * The difference is whether or not this refs_t is in use by just the one |
3389 | | * cram_fd or by multiples, or whether we have multiple threads accessing |
3390 | | * references. In either case fd->shared will be true and we start using |
3391 | | * reference counting to track the number of users of a specific reference |
3392 | | * sequence. |
3393 | | * |
3394 | | * Otherwise the ref seq returned is allocated as part of cram_fd itself |
3395 | | * and will be freed up on the next call to cram_get_ref or cram_close. |
3396 | | * |
3397 | | * To return the entire reference sequence, specify start as 1 and end |
3398 | | * as 0. |
3399 | | * |
3400 | | * To cease using a reference, call cram_ref_decr(). |
3401 | | * |
3402 | | * Returns reference on success, |
3403 | | * NULL on failure |
3404 | | */ |
3405 | 5.73k | char *cram_get_ref(cram_fd *fd, int id, hts_pos_t start, hts_pos_t end) { |
3406 | 5.73k | ref_entry *r; |
3407 | 5.73k | char *seq; |
3408 | 5.73k | int ostart = start; |
3409 | | |
3410 | 5.73k | if (id == -1 || start < 1) |
3411 | 3.25k | return NULL; |
3412 | | |
3413 | | /* FIXME: axiomatic query of r->seq being true? |
3414 | | * Or shortcut for unsorted data where we load once and never free? |
3415 | | */ |
3416 | | |
3417 | | //fd->shared_ref = 1; // hard code for now to simplify things |
3418 | | |
3419 | 2.47k | pthread_mutex_lock(&fd->ref_lock); |
3420 | | |
3421 | 2.47k | RP("%d cram_get_ref on fd %p, id %d, range %d..%d\n", gettid(), fd, id, start, end); |
3422 | | |
3423 | | /* |
3424 | | * Unsorted data implies we want to fetch an entire reference at a time. |
3425 | | * We just deal with this at the moment by claiming we're sharing |
3426 | | * references instead, which has the same requirement. |
3427 | | */ |
3428 | 2.47k | if (fd->unsorted) |
3429 | 0 | fd->shared_ref = 1; |
3430 | | |
3431 | | |
3432 | | /* Sanity checking: does this ID exist? */ |
3433 | 2.47k | if (id >= fd->refs->nref) { |
3434 | 12 | hts_log_error("No reference found for id %d", id); |
3435 | 12 | pthread_mutex_unlock(&fd->ref_lock); |
3436 | 12 | return NULL; |
3437 | 12 | } |
3438 | | |
3439 | 2.46k | if (!fd->refs || !fd->refs->ref_id[id]) { |
3440 | 0 | hts_log_error("No reference found for id %d", id); |
3441 | 0 | pthread_mutex_unlock(&fd->ref_lock); |
3442 | 0 | return NULL; |
3443 | 0 | } |
3444 | | |
3445 | 2.46k | if (!(r = fd->refs->ref_id[id])) { |
3446 | 0 | hts_log_error("No reference found for id %d", id); |
3447 | 0 | pthread_mutex_unlock(&fd->ref_lock); |
3448 | 0 | return NULL; |
3449 | 0 | } |
3450 | | |
3451 | | |
3452 | | /* |
3453 | | * It has an entry, but may not have been populated yet. |
3454 | | * Any manually loaded .fai files have their lengths known. |
3455 | | * A ref entry computed from @SQ lines (M5 or UR field) will have |
3456 | | * r->length == 0 unless it's been loaded once and verified that we have |
3457 | | * an on-disk filename for it. |
3458 | | * |
3459 | | * 19 Sep 2013: Moved the lock here as the cram_populate_ref code calls |
3460 | | * open_path_mfile and libcurl, which isn't multi-thread safe unless I |
3461 | | * rewrite my code to have one curl handle per thread. |
3462 | | */ |
3463 | 2.46k | pthread_mutex_lock(&fd->refs->lock); |
3464 | 2.46k | if (r->length == 0) { |
3465 | 2.46k | if (fd->ref_fn) |
3466 | 0 | hts_log_warning("Reference file given, but ref '%s' not present", |
3467 | 2.46k | r->name); |
3468 | 2.46k | if (cram_populate_ref(fd, id, r) == -1) { |
3469 | 2.46k | hts_log_warning("Failed to populate reference \"%s\"", |
3470 | 2.46k | r->name); |
3471 | 2.46k | hts_log_warning("See https://www.htslib.org/doc/reference_seqs.html for further suggestions"); |
3472 | 2.46k | pthread_mutex_unlock(&fd->refs->lock); |
3473 | 2.46k | pthread_mutex_unlock(&fd->ref_lock); |
3474 | 2.46k | return NULL; |
3475 | 2.46k | } |
3476 | 0 | r = fd->refs->ref_id[id]; |
3477 | 0 | if (fd->unsorted) |
3478 | 0 | cram_ref_incr_locked(fd->refs, id); |
3479 | 0 | } |
3480 | | |
3481 | | |
3482 | | /* |
3483 | | * We now know that we the filename containing the reference, so check |
3484 | | * for limits. If it's over half the reference we'll load all of it in |
3485 | | * memory as this will speed up subsequent calls. |
3486 | | */ |
3487 | 0 | if (end < 1) |
3488 | 0 | end = r->length; |
3489 | 0 | if (end >= r->length) |
3490 | 0 | end = r->length; |
3491 | |
|
3492 | 0 | if (end - start >= 0.5*r->length || fd->shared_ref) { |
3493 | 0 | start = 1; |
3494 | 0 | end = r->length; |
3495 | 0 | } |
3496 | | |
3497 | | /* |
3498 | | * Maybe we have it cached already? If so use it. |
3499 | | * |
3500 | | * Alternatively if we don't have the sequence but we're sharing |
3501 | | * references and/or are asking for the entire length of it, then |
3502 | | * load the full reference into the refs structure and return |
3503 | | * a pointer to that one instead. |
3504 | | */ |
3505 | 0 | if (fd->shared_ref || r->seq || (start == 1 && end == r->length)) { |
3506 | 0 | char *cp; |
3507 | |
|
3508 | 0 | if (id >= 0) { |
3509 | 0 | if (r->seq) { |
3510 | 0 | cram_ref_incr_locked(fd->refs, id); |
3511 | 0 | } else { |
3512 | 0 | ref_entry *e; |
3513 | 0 | if (!(e = cram_ref_load(fd->refs, id, r->is_md5))) { |
3514 | 0 | pthread_mutex_unlock(&fd->refs->lock); |
3515 | 0 | pthread_mutex_unlock(&fd->ref_lock); |
3516 | 0 | return NULL; |
3517 | 0 | } |
3518 | | |
3519 | | /* unsorted data implies cache ref indefinitely, to avoid |
3520 | | * continually loading and unloading. |
3521 | | */ |
3522 | 0 | if (fd->unsorted) |
3523 | 0 | cram_ref_incr_locked(fd->refs, id); |
3524 | 0 | } |
3525 | | |
3526 | 0 | fd->ref = NULL; /* We never access it directly */ |
3527 | 0 | fd->ref_start = 1; |
3528 | 0 | fd->ref_end = r->length; |
3529 | 0 | fd->ref_id = id; |
3530 | |
|
3531 | 0 | cp = fd->refs->ref_id[id]->seq + ostart-1; |
3532 | 0 | } else { |
3533 | 0 | fd->ref = NULL; |
3534 | 0 | cp = NULL; |
3535 | 0 | } |
3536 | | |
3537 | 0 | RP("%d cram_get_ref returning for id %d, count %d\n", gettid(), id, (int)r->count); |
3538 | |
|
3539 | 0 | pthread_mutex_unlock(&fd->refs->lock); |
3540 | 0 | pthread_mutex_unlock(&fd->ref_lock); |
3541 | 0 | return cp; |
3542 | 0 | } |
3543 | | |
3544 | | /* |
3545 | | * Otherwise we're not sharing, we don't have a copy of it already and |
3546 | | * we're only asking for a small portion of it. |
3547 | | * |
3548 | | * In this case load up just that segment ourselves, freeing any old |
3549 | | * small segments in the process. |
3550 | | */ |
3551 | | |
3552 | | /* Unmapped ref ID */ |
3553 | 0 | if (id < 0 || !fd->refs->fn) { |
3554 | 0 | if (fd->ref_free) { |
3555 | 0 | free(fd->ref_free); |
3556 | 0 | fd->ref_free = NULL; |
3557 | 0 | } |
3558 | 0 | fd->ref = NULL; |
3559 | 0 | fd->ref_id = id; |
3560 | 0 | pthread_mutex_unlock(&fd->refs->lock); |
3561 | 0 | pthread_mutex_unlock(&fd->ref_lock); |
3562 | 0 | return NULL; |
3563 | 0 | } |
3564 | | |
3565 | | /* Open file if it's not already the current open reference */ |
3566 | 0 | if (strcmp(fd->refs->fn, r->fn) || fd->refs->fp == NULL) { |
3567 | 0 | if (fd->refs->fp) |
3568 | 0 | if (bgzf_close(fd->refs->fp) != 0) |
3569 | 0 | return NULL; |
3570 | 0 | fd->refs->fn = r->fn; |
3571 | 0 | if (!(fd->refs->fp = bgzf_open_ref(fd->refs->fn, "r", r->is_md5))) { |
3572 | 0 | pthread_mutex_unlock(&fd->refs->lock); |
3573 | 0 | pthread_mutex_unlock(&fd->ref_lock); |
3574 | 0 | return NULL; |
3575 | 0 | } |
3576 | 0 | } |
3577 | | |
3578 | 0 | if (!(fd->ref = load_ref_portion(fd->refs->fp, r, start, end))) { |
3579 | 0 | pthread_mutex_unlock(&fd->refs->lock); |
3580 | 0 | pthread_mutex_unlock(&fd->ref_lock); |
3581 | 0 | return NULL; |
3582 | 0 | } |
3583 | | |
3584 | 0 | if (fd->ref_free) |
3585 | 0 | free(fd->ref_free); |
3586 | |
|
3587 | 0 | fd->ref_id = id; |
3588 | 0 | fd->ref_start = start; |
3589 | 0 | fd->ref_end = end; |
3590 | 0 | fd->ref_free = fd->ref; |
3591 | 0 | seq = fd->ref; |
3592 | |
|
3593 | 0 | pthread_mutex_unlock(&fd->refs->lock); |
3594 | 0 | pthread_mutex_unlock(&fd->ref_lock); |
3595 | |
|
3596 | 0 | return seq ? seq + ostart - start : NULL; |
3597 | 0 | } |
3598 | | |
3599 | | /* |
3600 | | * If fd has been opened for reading, it may be permitted to specify 'fn' |
3601 | | * as NULL and let the code auto-detect the reference by parsing the |
3602 | | * SAM header @SQ lines. |
3603 | | */ |
3604 | 0 | int cram_load_reference(cram_fd *fd, char *fn) { |
3605 | 0 | int ret = 0; |
3606 | |
|
3607 | 0 | if (fn) { |
3608 | 0 | fd->refs = refs_load_fai(fd->refs, fn, |
3609 | 0 | !(fd->embed_ref>0 && fd->mode == 'r')); |
3610 | 0 | fn = fd->refs ? fd->refs->fn : NULL; |
3611 | 0 | if (!fn) |
3612 | 0 | ret = -1; |
3613 | 0 | sanitise_SQ_lines(fd); |
3614 | 0 | } |
3615 | 0 | fd->ref_fn = fn; |
3616 | |
|
3617 | 0 | if ((!fd->refs || (fd->refs->nref == 0 && !fn)) && fd->header) { |
3618 | 0 | if (fd->refs) |
3619 | 0 | refs_free(fd->refs); |
3620 | 0 | if (!(fd->refs = refs_create())) |
3621 | 0 | return -1; |
3622 | 0 | if (-1 == refs_from_header(fd)) |
3623 | 0 | return -1; |
3624 | 0 | } |
3625 | | |
3626 | 0 | if (fd->header) |
3627 | 0 | if (-1 == refs2id(fd->refs, fd->header)) |
3628 | 0 | return -1; |
3629 | | |
3630 | 0 | return ret; |
3631 | 0 | } |
3632 | | |
3633 | | /* ---------------------------------------------------------------------- |
3634 | | * Containers |
3635 | | */ |
3636 | | |
3637 | | /* |
3638 | | * Creates a new container, specifying the maximum number of slices |
3639 | | * and records permitted. |
3640 | | * |
3641 | | * Returns cram_container ptr on success |
3642 | | * NULL on failure |
3643 | | */ |
3644 | 44.3k | cram_container *cram_new_container(int nrec, int nslice) { |
3645 | 44.3k | cram_container *c = calloc(1, sizeof(*c)); |
3646 | 44.3k | enum cram_DS_ID id; |
3647 | | |
3648 | 44.3k | if (!c) |
3649 | 0 | return NULL; |
3650 | | |
3651 | 44.3k | c->curr_ref = -2; |
3652 | | |
3653 | 44.3k | c->max_c_rec = nrec * nslice; |
3654 | 44.3k | c->curr_c_rec = 0; |
3655 | | |
3656 | 44.3k | c->max_rec = nrec; |
3657 | 44.3k | c->record_counter = 0; |
3658 | 44.3k | c->num_bases = 0; |
3659 | 44.3k | c->s_num_bases = 0; |
3660 | | |
3661 | 44.3k | c->max_slice = nslice; |
3662 | 44.3k | c->curr_slice = 0; |
3663 | | |
3664 | 44.3k | c->pos_sorted = 1; |
3665 | 44.3k | c->max_apos = 0; |
3666 | 44.3k | c->multi_seq = 0; |
3667 | 44.3k | c->qs_seq_orient = 1; |
3668 | 44.3k | c->no_ref = 0; |
3669 | 44.3k | c->embed_ref = -1; // automatic selection |
3670 | | |
3671 | 44.3k | c->bams = NULL; |
3672 | | |
3673 | 44.3k | if (!(c->slices = calloc(nslice != 0 ? nslice : 1, sizeof(cram_slice *)))) |
3674 | 0 | goto err; |
3675 | 44.3k | c->slice = NULL; |
3676 | | |
3677 | 44.3k | if (!(c->comp_hdr = cram_new_compression_header())) |
3678 | 0 | goto err; |
3679 | 44.3k | c->comp_hdr_block = NULL; |
3680 | | |
3681 | 1.28M | for (id = DS_RN; id < DS_TN; id++) |
3682 | 1.24M | if (!(c->stats[id] = cram_stats_create())) goto err; |
3683 | | |
3684 | | //c->aux_B_stats = cram_stats_create(); |
3685 | | |
3686 | 44.3k | if (!(c->tags_used = kh_init(m_tagmap))) |
3687 | 0 | goto err; |
3688 | 44.3k | c->refs_used = 0; |
3689 | 44.3k | c->ref_free = 0; |
3690 | | |
3691 | 44.3k | return c; |
3692 | | |
3693 | 0 | err: |
3694 | 0 | if (c) { |
3695 | 0 | if (c->slices) |
3696 | 0 | free(c->slices); |
3697 | 0 | free(c); |
3698 | 0 | } |
3699 | 0 | return NULL; |
3700 | 44.3k | } |
3701 | | |
3702 | 4.39k | static void free_bam_list(bam_seq_t **bams, int max_rec) { |
3703 | 4.39k | int i; |
3704 | 43.9M | for (i = 0; i < max_rec; i++) |
3705 | 43.9M | bam_free(bams[i]); |
3706 | | |
3707 | 4.39k | free(bams); |
3708 | 4.39k | } |
3709 | | |
3710 | 72.3k | void cram_free_container(cram_container *c) { |
3711 | 72.3k | enum cram_DS_ID id; |
3712 | 72.3k | int i; |
3713 | | |
3714 | 72.3k | if (!c) |
3715 | 0 | return; |
3716 | | |
3717 | 72.3k | if (c->refs_used) |
3718 | 489 | free(c->refs_used); |
3719 | | |
3720 | 72.3k | if (c->landmark) |
3721 | 46.1k | free(c->landmark); |
3722 | | |
3723 | 72.3k | if (c->comp_hdr) |
3724 | 44.7k | cram_free_compression_header(c->comp_hdr); |
3725 | | |
3726 | 72.3k | if (c->comp_hdr_block) |
3727 | 40.7k | cram_free_block(c->comp_hdr_block); |
3728 | | |
3729 | | // Free the slices; filled out by encoder only |
3730 | 72.3k | if (c->slices) { |
3731 | 80.7k | for (i = 0; i < c->max_slice; i++) { |
3732 | 36.4k | if (c->slices[i]) |
3733 | 4.39k | cram_free_slice(c->slices[i]); |
3734 | 36.4k | if (c->slices[i] == c->slice) |
3735 | 36.4k | c->slice = NULL; |
3736 | 36.4k | } |
3737 | 44.3k | free(c->slices); |
3738 | 44.3k | } |
3739 | | |
3740 | | // Free the current slice; set by both encoder & decoder |
3741 | 72.3k | if (c->slice) { |
3742 | 0 | cram_free_slice(c->slice); |
3743 | 0 | c->slice = NULL; |
3744 | 0 | } |
3745 | | |
3746 | 2.09M | for (id = DS_RN; id < DS_TN; id++) |
3747 | 2.02M | if (c->stats[id]) cram_stats_free(c->stats[id]); |
3748 | | |
3749 | | //if (c->aux_B_stats) cram_stats_free(c->aux_B_stats); |
3750 | | |
3751 | 72.3k | if (c->tags_used) { |
3752 | 44.3k | khint_t k; |
3753 | | |
3754 | 225k | for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) { |
3755 | 180k | if (!kh_exist(c->tags_used, k)) |
3756 | 103k | continue; |
3757 | | |
3758 | 77.6k | cram_tag_map *tm = (cram_tag_map *)kh_val(c->tags_used, k); |
3759 | 77.6k | if (tm) { |
3760 | 77.6k | cram_codec *c = tm->codec; |
3761 | | |
3762 | 77.6k | if (c) c->free(c); |
3763 | | |
3764 | | // If tm->blk or tm->blk2 is set, then we haven't yet got to |
3765 | | // cram_encode_container which copies the blocks to s->aux_block |
3766 | | // and NULLifies tm->blk*. In this case we failed to complete |
3767 | | // the container construction, so we have to free up our partially |
3768 | | // converted CRAM. |
3769 | 77.6k | cram_free_block(tm->blk); |
3770 | 77.6k | cram_free_block(tm->blk2); |
3771 | 77.6k | free(tm); |
3772 | 77.6k | } |
3773 | 77.6k | } |
3774 | | |
3775 | 44.3k | kh_destroy(m_tagmap, c->tags_used); |
3776 | 44.3k | } |
3777 | | |
3778 | 72.3k | if (c->ref_free) |
3779 | 16.8k | free(c->ref); |
3780 | | |
3781 | 72.3k | if (c->bams) |
3782 | 366 | free_bam_list(c->bams, c->max_c_rec); |
3783 | | |
3784 | 72.3k | free(c); |
3785 | 72.3k | } |
3786 | | |
3787 | | /* |
3788 | | * Reads a container header. |
3789 | | * |
3790 | | * Returns cram_container on success |
3791 | | * NULL on failure or no container left (fd->err == 0). |
3792 | | */ |
3793 | 28.5k | cram_container *cram_read_container(cram_fd *fd) { |
3794 | 28.5k | cram_container c2, *c; |
3795 | 28.5k | int i, s; |
3796 | 28.5k | size_t rd = 0; |
3797 | 28.5k | uint32_t crc = 0; |
3798 | | |
3799 | 28.5k | fd->err = 0; |
3800 | 28.5k | fd->eof = 0; |
3801 | | |
3802 | 28.5k | memset(&c2, 0, sizeof(c2)); |
3803 | 28.5k | if (CRAM_MAJOR_VERS(fd->version) == 1) { |
3804 | 22.5k | if ((s = fd->vv.varint_decode32_crc(fd, &c2.length, &crc)) == -1) { |
3805 | 38 | fd->eof = fd->empty_container ? 1 : 2; |
3806 | 38 | return NULL; |
3807 | 22.5k | } else { |
3808 | 22.5k | rd+=s; |
3809 | 22.5k | } |
3810 | 22.5k | } else if (CRAM_MAJOR_VERS(fd->version) < 4) { |
3811 | 3.25k | uint32_t len; |
3812 | 3.25k | if ((s = int32_decode(fd, &c2.length)) == -1) { |
3813 | 19 | if (CRAM_MAJOR_VERS(fd->version) == 2 && |
3814 | 19 | CRAM_MINOR_VERS(fd->version) == 0) |
3815 | 8 | fd->eof = 1; // EOF blocks arrived in v2.1 |
3816 | 11 | else |
3817 | 11 | fd->eof = fd->empty_container ? 1 : 2; |
3818 | 19 | return NULL; |
3819 | 3.23k | } else { |
3820 | 3.23k | rd+=s; |
3821 | 3.23k | } |
3822 | 3.23k | len = le_int4(c2.length); |
3823 | 3.23k | crc = crc32(0L, (unsigned char *)&len, 4); |
3824 | 3.23k | } else { |
3825 | 2.72k | if ((s = fd->vv.varint_decode32_crc(fd, &c2.length, &crc)) == -1) { |
3826 | 24 | fd->eof = fd->empty_container ? 1 : 2; |
3827 | 24 | return NULL; |
3828 | 2.70k | } else { |
3829 | 2.70k | rd+=s; |
3830 | 2.70k | } |
3831 | 2.72k | } |
3832 | 28.4k | if ((s = fd->vv.varint_decode32s_crc(fd, &c2.ref_seq_id, &crc)) == -1) return NULL; else rd+=s; |
3833 | 28.3k | if (CRAM_MAJOR_VERS(fd->version) >= 4) { |
3834 | 2.68k | int64_t i64; |
3835 | 2.68k | if ((s = fd->vv.varint_decode64_crc(fd, &i64, &crc))== -1) return NULL; else rd+=s; |
3836 | 2.67k | c2.ref_seq_start = i64; |
3837 | 2.67k | if ((s = fd->vv.varint_decode64_crc(fd, &i64, &crc)) == -1) return NULL; else rd+=s; |
3838 | 2.66k | c2.ref_seq_span = i64; |
3839 | 25.6k | } else { |
3840 | 25.6k | int32_t i32; |
3841 | 25.6k | if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc))== -1) return NULL; else rd+=s; |
3842 | 25.6k | c2.ref_seq_start = i32; |
3843 | 25.6k | if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc)) == -1) return NULL; else rd+=s; |
3844 | 25.5k | c2.ref_seq_span = i32; |
3845 | 25.5k | } |
3846 | 28.2k | if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_records, &crc)) == -1) return NULL; else rd+=s; |
3847 | | |
3848 | 28.2k | if (CRAM_MAJOR_VERS(fd->version) == 1) { |
3849 | 22.3k | c2.record_counter = 0; |
3850 | 22.3k | c2.num_bases = 0; |
3851 | 22.3k | } else { |
3852 | 5.85k | if (CRAM_MAJOR_VERS(fd->version) >= 3) { |
3853 | 3.02k | if ((s = fd->vv.varint_decode64_crc(fd, &c2.record_counter, &crc)) == -1) |
3854 | 14 | return NULL; |
3855 | 3.01k | else |
3856 | 3.01k | rd += s; |
3857 | 3.02k | } else { |
3858 | 2.82k | int32_t i32; |
3859 | 2.82k | if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc)) == -1) |
3860 | 14 | return NULL; |
3861 | 2.80k | else |
3862 | 2.80k | rd += s; |
3863 | 2.80k | c2.record_counter = i32; |
3864 | 2.80k | } |
3865 | | |
3866 | 5.82k | if ((s = fd->vv.varint_decode64_crc(fd, &c2.num_bases, &crc))== -1) |
3867 | 53 | return NULL; |
3868 | 5.76k | else |
3869 | 5.76k | rd += s; |
3870 | 5.82k | } |
3871 | 28.1k | if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_blocks, &crc)) == -1) |
3872 | 38 | return NULL; |
3873 | 28.0k | else |
3874 | 28.0k | rd+=s; |
3875 | 28.0k | if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_landmarks, &crc))== -1) |
3876 | 22 | return NULL; |
3877 | 28.0k | else |
3878 | 28.0k | rd+=s; |
3879 | | |
3880 | 28.0k | if (c2.num_landmarks < 0 || c2.num_landmarks >= SIZE_MAX / sizeof(int32_t)) |
3881 | 52 | return NULL; |
3882 | | |
3883 | 28.0k | if (!(c = calloc(1, sizeof(*c)))) |
3884 | 0 | return NULL; |
3885 | | |
3886 | 28.0k | *c = c2; |
3887 | 28.0k | #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
3888 | 28.0k | if (c->num_landmarks > FUZZ_ALLOC_LIMIT/sizeof(int32_t)) { |
3889 | 9 | fd->err = errno = ENOMEM; |
3890 | 9 | cram_free_container(c); |
3891 | 9 | return NULL; |
3892 | 9 | } |
3893 | 28.0k | #endif |
3894 | 28.0k | if (c->num_landmarks && !(c->landmark = malloc(c->num_landmarks * sizeof(int32_t)))) { |
3895 | 0 | fd->err = errno; |
3896 | 0 | cram_free_container(c); |
3897 | 0 | return NULL; |
3898 | 0 | } |
3899 | 65.0k | for (i = 0; i < c->num_landmarks; i++) { |
3900 | 37.2k | if ((s = fd->vv.varint_decode32_crc(fd, &c->landmark[i], &crc)) == -1) { |
3901 | 161 | cram_free_container(c); |
3902 | 161 | return NULL; |
3903 | 37.0k | } else { |
3904 | 37.0k | rd += s; |
3905 | 37.0k | } |
3906 | 37.2k | } |
3907 | | |
3908 | 27.8k | if (CRAM_MAJOR_VERS(fd->version) >= 3) { |
3909 | 2.90k | if (-1 == int32_decode(fd, (int32_t *)&c->crc32)) { |
3910 | 12 | cram_free_container(c); |
3911 | 12 | return NULL; |
3912 | 2.89k | } else { |
3913 | 2.89k | rd+=4; |
3914 | 2.89k | } |
3915 | | |
3916 | 2.89k | #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
3917 | | // Pretend the CRC was OK so the fuzzer doesn't have to get it right |
3918 | 2.89k | crc = c->crc32; |
3919 | 2.89k | #endif |
3920 | | |
3921 | 2.89k | if (crc != c->crc32) { |
3922 | 0 | hts_log_error("Container header CRC32 failure"); |
3923 | 0 | cram_free_container(c); |
3924 | 0 | return NULL; |
3925 | 0 | } |
3926 | 2.89k | } |
3927 | | |
3928 | 27.8k | c->offset = rd; |
3929 | 27.8k | c->slices = NULL; |
3930 | 27.8k | c->slice = NULL; |
3931 | 27.8k | c->curr_slice = 0; |
3932 | 27.8k | c->max_slice = c->num_landmarks; |
3933 | 27.8k | c->slice_rec = 0; |
3934 | 27.8k | c->curr_rec = 0; |
3935 | 27.8k | c->max_rec = 0; |
3936 | | |
3937 | 27.8k | if (c->ref_seq_id == -2) { |
3938 | 27 | c->multi_seq = 1; |
3939 | 27 | fd->multi_seq = 1; |
3940 | 27 | } |
3941 | | |
3942 | 27.8k | fd->empty_container = |
3943 | 27.8k | (c->num_records == 0 && |
3944 | 27.8k | c->ref_seq_id == -1 && |
3945 | 27.8k | c->ref_seq_start == 0x454f46 /* EOF */) ? 1 : 0; |
3946 | | |
3947 | 27.8k | return c; |
3948 | 27.8k | } |
3949 | | |
3950 | | |
3951 | | /* MAXIMUM storage size needed for the container. */ |
3952 | 0 | int cram_container_size(cram_container *c) { |
3953 | 0 | return 55 + 5*c->num_landmarks; |
3954 | 0 | } |
3955 | | |
3956 | | |
3957 | | /* |
3958 | | * Stores the container structure in dat and returns *size as the |
3959 | | * number of bytes written to dat[]. The input size of dat is also |
3960 | | * held in *size and should be initialised to cram_container_size(c). |
3961 | | * |
3962 | | * Returns 0 on success; |
3963 | | * -1 on failure |
3964 | | */ |
3965 | | int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size) |
3966 | 0 | { |
3967 | 0 | char *cp = (char *)dat; |
3968 | 0 | int i; |
3969 | | |
3970 | | // Check the input buffer is large enough according to our stated |
3971 | | // requirements. (NOTE: it may actually take less.) |
3972 | 0 | if (cram_container_size(c) > *size) |
3973 | 0 | return -1; |
3974 | | |
3975 | 0 | if (CRAM_MAJOR_VERS(fd->version) == 1) { |
3976 | 0 | cp += itf8_put(cp, c->length); |
3977 | 0 | } else { |
3978 | 0 | *(int32_t *)cp = le_int4(c->length); |
3979 | 0 | cp += 4; |
3980 | 0 | } |
3981 | 0 | if (c->multi_seq) { |
3982 | 0 | cp += fd->vv.varint_put32(cp, NULL, -2); |
3983 | 0 | cp += fd->vv.varint_put32(cp, NULL, 0); |
3984 | 0 | cp += fd->vv.varint_put32(cp, NULL, 0); |
3985 | 0 | } else { |
3986 | 0 | cp += fd->vv.varint_put32s(cp, NULL, c->ref_seq_id); |
3987 | 0 | if (CRAM_MAJOR_VERS(fd->version) >= 4) { |
3988 | 0 | cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_start); |
3989 | 0 | cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_span); |
3990 | 0 | } else { |
3991 | 0 | cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_start); |
3992 | 0 | cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_span); |
3993 | 0 | } |
3994 | 0 | } |
3995 | 0 | cp += fd->vv.varint_put32(cp, NULL, c->num_records); |
3996 | 0 | if (CRAM_MAJOR_VERS(fd->version) == 2) { |
3997 | 0 | cp += fd->vv.varint_put64(cp, NULL, c->record_counter); |
3998 | 0 | } else if (CRAM_MAJOR_VERS(fd->version) >= 3) { |
3999 | 0 | cp += fd->vv.varint_put32(cp, NULL, c->record_counter); |
4000 | 0 | } |
4001 | 0 | cp += fd->vv.varint_put64(cp, NULL, c->num_bases); |
4002 | 0 | cp += fd->vv.varint_put32(cp, NULL, c->num_blocks); |
4003 | 0 | cp += fd->vv.varint_put32(cp, NULL, c->num_landmarks); |
4004 | 0 | for (i = 0; i < c->num_landmarks; i++) |
4005 | 0 | cp += fd->vv.varint_put32(cp, NULL, c->landmark[i]); |
4006 | |
|
4007 | 0 | if (CRAM_MAJOR_VERS(fd->version) >= 3) { |
4008 | 0 | c->crc32 = crc32(0L, (uc *)dat, cp-dat); |
4009 | 0 | cp[0] = c->crc32 & 0xff; |
4010 | 0 | cp[1] = (c->crc32 >> 8) & 0xff; |
4011 | 0 | cp[2] = (c->crc32 >> 16) & 0xff; |
4012 | 0 | cp[3] = (c->crc32 >> 24) & 0xff; |
4013 | 0 | cp += 4; |
4014 | 0 | } |
4015 | |
|
4016 | 0 | *size = cp-dat; // actual used size |
4017 | |
|
4018 | 0 | return 0; |
4019 | 0 | } |
4020 | | |
4021 | | |
4022 | | /* |
4023 | | * Writes a container structure. |
4024 | | * |
4025 | | * Returns 0 on success |
4026 | | * -1 on failure |
4027 | | */ |
4028 | 52.7k | int cram_write_container(cram_fd *fd, cram_container *c) { |
4029 | 52.7k | char buf_a[1024], *buf = buf_a, *cp; |
4030 | 52.7k | int i; |
4031 | | |
4032 | 52.7k | if (61 + c->num_landmarks * 10 >= 1024) { |
4033 | 0 | buf = malloc(61 + c->num_landmarks * 10); |
4034 | 0 | if (!buf) |
4035 | 0 | return -1; |
4036 | 0 | } |
4037 | 52.7k | cp = buf; |
4038 | | |
4039 | 52.7k | if (CRAM_MAJOR_VERS(fd->version) == 1) { |
4040 | 0 | cp += itf8_put(cp, c->length); |
4041 | 52.7k | } else if (CRAM_MAJOR_VERS(fd->version) <= 3) { |
4042 | 52.7k | *(int32_t *)cp = le_int4(c->length); |
4043 | 52.7k | cp += 4; |
4044 | 52.7k | } else { |
4045 | 0 | cp += fd->vv.varint_put32(cp, NULL, c->length); |
4046 | 0 | } |
4047 | 52.7k | if (c->multi_seq) { |
4048 | 485 | cp += fd->vv.varint_put32(cp, NULL, (uint32_t)-2); |
4049 | 485 | cp += fd->vv.varint_put32(cp, NULL, 0); |
4050 | 485 | cp += fd->vv.varint_put32(cp, NULL, 0); |
4051 | 52.2k | } else { |
4052 | 52.2k | cp += fd->vv.varint_put32s(cp, NULL, c->ref_seq_id); |
4053 | 52.2k | if (CRAM_MAJOR_VERS(fd->version) >= 4) { |
4054 | 0 | cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_start); |
4055 | 0 | cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_span); |
4056 | 52.2k | } else { |
4057 | 52.2k | cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_start); |
4058 | 52.2k | cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_span); |
4059 | 52.2k | } |
4060 | 52.2k | } |
4061 | 52.7k | cp += fd->vv.varint_put32(cp, NULL, c->num_records); |
4062 | 52.7k | if (CRAM_MAJOR_VERS(fd->version) >= 3) |
4063 | 52.7k | cp += fd->vv.varint_put64(cp, NULL, c->record_counter); |
4064 | 0 | else |
4065 | 0 | cp += fd->vv.varint_put32(cp, NULL, c->record_counter); |
4066 | 52.7k | cp += fd->vv.varint_put64(cp, NULL, c->num_bases); |
4067 | 52.7k | cp += fd->vv.varint_put32(cp, NULL, c->num_blocks); |
4068 | 52.7k | cp += fd->vv.varint_put32(cp, NULL, c->num_landmarks); |
4069 | 104k | for (i = 0; i < c->num_landmarks; i++) |
4070 | 51.7k | cp += fd->vv.varint_put32(cp, NULL, c->landmark[i]); |
4071 | | |
4072 | 52.7k | if (CRAM_MAJOR_VERS(fd->version) >= 3) { |
4073 | 52.7k | c->crc32 = crc32(0L, (uc *)buf, cp-buf); |
4074 | 52.7k | cp[0] = c->crc32 & 0xff; |
4075 | 52.7k | cp[1] = (c->crc32 >> 8) & 0xff; |
4076 | 52.7k | cp[2] = (c->crc32 >> 16) & 0xff; |
4077 | 52.7k | cp[3] = (c->crc32 >> 24) & 0xff; |
4078 | 52.7k | cp += 4; |
4079 | 52.7k | } |
4080 | | |
4081 | 52.7k | if (cp-buf != hwrite(fd->fp, buf, cp-buf)) { |
4082 | 0 | if (buf != buf_a) |
4083 | 0 | free(buf); |
4084 | 0 | return -1; |
4085 | 0 | } |
4086 | | |
4087 | 52.7k | if (buf != buf_a) |
4088 | 0 | free(buf); |
4089 | | |
4090 | 52.7k | return 0; |
4091 | 52.7k | } |
4092 | | |
4093 | | // common component shared by cram_flush_container{,_mt} |
4094 | 35.9k | static int cram_flush_container2(cram_fd *fd, cram_container *c) { |
4095 | 35.9k | int i, j; |
4096 | | |
4097 | 35.9k | if (c->curr_slice > 0 && !c->slices) |
4098 | 0 | return -1; |
4099 | | |
4100 | | //fprintf(stderr, "Writing container %d, sum %u\n", c->record_counter, sum); |
4101 | | |
4102 | 35.9k | off_t c_offset = htell(fd->fp); // File offset of container |
4103 | | |
4104 | | /* Write the container struct itself */ |
4105 | 35.9k | if (0 != cram_write_container(fd, c)) |
4106 | 0 | return -1; |
4107 | | |
4108 | 35.9k | off_t hdr_size = htell(fd->fp) - c_offset; |
4109 | | |
4110 | | /* And the compression header */ |
4111 | 35.9k | if (0 != cram_write_block(fd, c->comp_hdr_block)) |
4112 | 0 | return -1; |
4113 | | |
4114 | | /* Followed by the slice blocks */ |
4115 | 35.9k | off_t file_offset = htell(fd->fp); |
4116 | 71.8k | for (i = 0; i < c->curr_slice; i++) { |
4117 | 35.9k | cram_slice *s = c->slices[i]; |
4118 | 35.9k | off_t spos = file_offset - c_offset - hdr_size; |
4119 | | |
4120 | 35.9k | if (0 != cram_write_block(fd, s->hdr_block)) |
4121 | 0 | return -1; |
4122 | | |
4123 | 225k | for (j = 0; j < s->hdr->num_blocks; j++) { |
4124 | 189k | if (0 != cram_write_block(fd, s->block[j])) |
4125 | 0 | return -1; |
4126 | 189k | } |
4127 | | |
4128 | 35.9k | file_offset = htell(fd->fp); |
4129 | 35.9k | off_t sz = file_offset - c_offset - hdr_size - spos; |
4130 | | |
4131 | 35.9k | if (fd->idxfp) { |
4132 | 0 | if (cram_index_slice(fd, c, s, fd->idxfp, c_offset, spos, sz) < 0) |
4133 | 0 | return -1; |
4134 | 0 | } |
4135 | 35.9k | } |
4136 | | |
4137 | 35.9k | return 0; |
4138 | 35.9k | } |
4139 | | |
4140 | | /* |
4141 | | * Flushes a completely or partially full container to disk, writing |
4142 | | * container structure, header and blocks. This also calls the encoder |
4143 | | * functions. |
4144 | | * |
4145 | | * Returns 0 on success |
4146 | | * -1 on failure |
4147 | | */ |
4148 | 36.4k | int cram_flush_container(cram_fd *fd, cram_container *c) { |
4149 | | /* Encode the container blocks and generate compression header */ |
4150 | 36.4k | if (0 != cram_encode_container(fd, c)) |
4151 | 482 | return -1; |
4152 | | |
4153 | 35.9k | return cram_flush_container2(fd, c); |
4154 | 36.4k | } |
4155 | | |
4156 | | typedef struct { |
4157 | | cram_fd *fd; |
4158 | | cram_container *c; |
4159 | | } cram_job; |
4160 | | |
4161 | 0 | void *cram_flush_thread(void *arg) { |
4162 | 0 | cram_job *j = (cram_job *)arg; |
4163 | | |
4164 | | /* Encode the container blocks and generate compression header */ |
4165 | 0 | if (0 != cram_encode_container(j->fd, j->c)) { |
4166 | 0 | hts_log_error("Call to cram_encode_container failed"); |
4167 | 0 | return NULL; |
4168 | 0 | } |
4169 | | |
4170 | 0 | return arg; |
4171 | 0 | } |
4172 | | |
4173 | 0 | static int cram_flush_result(cram_fd *fd) { |
4174 | 0 | int i, ret = 0; |
4175 | 0 | hts_tpool_result *r; |
4176 | 0 | cram_container *lc = NULL; |
4177 | | |
4178 | | // NB: we can have one result per slice, not per container, |
4179 | | // so we need to free the container only after all slices |
4180 | | // within it have been freed. (Automatic via reference counting.) |
4181 | 0 | while ((r = hts_tpool_next_result(fd->rqueue))) { |
4182 | 0 | cram_job *j = (cram_job *)hts_tpool_result_data(r); |
4183 | 0 | cram_container *c; |
4184 | |
|
4185 | 0 | if (!j) { |
4186 | 0 | hts_tpool_delete_result(r, 0); |
4187 | 0 | return -1; |
4188 | 0 | } |
4189 | | |
4190 | 0 | fd = j->fd; |
4191 | 0 | c = j->c; |
4192 | |
|
4193 | 0 | if (fd->mode == 'w') |
4194 | 0 | if (0 != cram_flush_container2(fd, c)) |
4195 | 0 | return -1; |
4196 | | |
4197 | | // Free the slices; filled out by encoder only |
4198 | 0 | if (c->slices) { |
4199 | 0 | for (i = 0; i < c->max_slice; i++) { |
4200 | 0 | if (c->slices[i]) |
4201 | 0 | cram_free_slice(c->slices[i]); |
4202 | 0 | if (c->slices[i] == c->slice) |
4203 | 0 | c->slice = NULL; |
4204 | 0 | c->slices[i] = NULL; |
4205 | 0 | } |
4206 | 0 | } |
4207 | | |
4208 | | // Free the current slice; set by both encoder & decoder |
4209 | 0 | if (c->slice) { |
4210 | 0 | cram_free_slice(c->slice); |
4211 | 0 | c->slice = NULL; |
4212 | 0 | } |
4213 | 0 | c->curr_slice = 0; |
4214 | | |
4215 | | // Our jobs will be in order, so we free the last |
4216 | | // container when our job has switched to a new one. |
4217 | 0 | if (c != lc) { |
4218 | 0 | if (lc) { |
4219 | 0 | if (fd->ctr == lc) |
4220 | 0 | fd->ctr = NULL; |
4221 | 0 | if (fd->ctr_mt == lc) |
4222 | 0 | fd->ctr_mt = NULL; |
4223 | 0 | cram_free_container(lc); |
4224 | 0 | } |
4225 | 0 | lc = c; |
4226 | 0 | } |
4227 | |
|
4228 | 0 | hts_tpool_delete_result(r, 1); |
4229 | 0 | } |
4230 | 0 | if (lc) { |
4231 | 0 | if (fd->ctr == lc) |
4232 | 0 | fd->ctr = NULL; |
4233 | 0 | if (fd->ctr_mt == lc) |
4234 | 0 | fd->ctr_mt = NULL; |
4235 | 0 | cram_free_container(lc); |
4236 | 0 | } |
4237 | |
|
4238 | 0 | return ret; |
4239 | 0 | } |
4240 | | |
4241 | | // Note: called while metrics_lock is held. |
4242 | | // Will be left in this state too, but may temporarily unlock. |
4243 | 2.56k | void reset_metrics(cram_fd *fd) { |
4244 | 2.56k | int i; |
4245 | | |
4246 | 2.56k | if (fd->pool) { |
4247 | | // If multi-threaded we have multiple blocks being |
4248 | | // compressed already and several on the to-do list |
4249 | | // (fd->rqueue->pending). It's tricky to reset the |
4250 | | // metrics exactly the correct point, so instead we |
4251 | | // just flush the pool, reset, and then continue again. |
4252 | | |
4253 | | // Don't bother starting a new trial before then though. |
4254 | 0 | for (i = 0; i < DS_END; i++) { |
4255 | 0 | cram_metrics *m = fd->m[i]; |
4256 | 0 | if (!m) |
4257 | 0 | continue; |
4258 | 0 | m->next_trial = 999; |
4259 | 0 | } |
4260 | |
|
4261 | 0 | pthread_mutex_unlock(&fd->metrics_lock); |
4262 | 0 | hts_tpool_process_flush(fd->rqueue); |
4263 | 0 | pthread_mutex_lock(&fd->metrics_lock); |
4264 | 0 | } |
4265 | | |
4266 | 123k | for (i = 0; i < DS_END; i++) { |
4267 | 120k | cram_metrics *m = fd->m[i]; |
4268 | 120k | if (!m) |
4269 | 0 | continue; |
4270 | | |
4271 | 120k | m->trial = NTRIALS; |
4272 | 120k | m->next_trial = TRIAL_SPAN; |
4273 | 120k | m->revised_method = 0; |
4274 | 120k | m->unpackable = 0; |
4275 | | |
4276 | 120k | memset(m->sz, 0, sizeof(m->sz)); |
4277 | 120k | } |
4278 | 2.56k | } |
4279 | | |
4280 | 36.4k | int cram_flush_container_mt(cram_fd *fd, cram_container *c) { |
4281 | 36.4k | cram_job *j; |
4282 | | |
4283 | | // At the junction of mapped to unmapped data the compression |
4284 | | // methods may need to change due to very different statistical |
4285 | | // properties; particularly BA if minhash sorted. |
4286 | | // |
4287 | | // However with threading we'll have several in-flight blocks |
4288 | | // arriving out of order. |
4289 | | // |
4290 | | // So we do one trial reset of NThreads to last for NThreads |
4291 | | // duration to get us over this transition period, followed |
4292 | | // by another retrial of the usual ntrials & trial span. |
4293 | 36.4k | pthread_mutex_lock(&fd->metrics_lock); |
4294 | 36.4k | if (c->n_mapped < 0.3*c->curr_rec && |
4295 | 36.4k | fd->last_mapped > 0.7*c->max_rec) { |
4296 | 2.56k | reset_metrics(fd); |
4297 | 2.56k | } |
4298 | 36.4k | fd->last_mapped = c->n_mapped * (c->max_rec+1)/(c->curr_rec+1) ; |
4299 | 36.4k | pthread_mutex_unlock(&fd->metrics_lock); |
4300 | | |
4301 | 36.4k | if (!fd->pool) |
4302 | 36.4k | return cram_flush_container(fd, c); |
4303 | | |
4304 | 0 | if (!(j = malloc(sizeof(*j)))) |
4305 | 0 | return -1; |
4306 | 0 | j->fd = fd; |
4307 | 0 | j->c = c; |
4308 | | |
4309 | | // Flush the job. Note our encoder queue may be full, so we |
4310 | | // either have to keep trying in non-blocking mode (what we do) or |
4311 | | // use a dedicated separate thread for draining the queue. |
4312 | 0 | for (;;) { |
4313 | 0 | errno = 0; |
4314 | 0 | hts_tpool_dispatch2(fd->pool, fd->rqueue, cram_flush_thread, j, 1); |
4315 | 0 | int pending = (errno == EAGAIN); |
4316 | 0 | if (cram_flush_result(fd) != 0) |
4317 | 0 | return -1; |
4318 | 0 | if (!pending) |
4319 | 0 | break; |
4320 | | |
4321 | 0 | hts_usleep(1000); |
4322 | 0 | } |
4323 | | |
4324 | 0 | return 0; |
4325 | 0 | } |
4326 | | |
4327 | | /* ---------------------------------------------------------------------- |
4328 | | * Compression headers; the first part of the container |
4329 | | */ |
4330 | | |
4331 | | /* |
4332 | | * Creates a new blank container compression header |
4333 | | * |
4334 | | * Returns header ptr on success |
4335 | | * NULL on failure |
4336 | | */ |
4337 | 44.3k | cram_block_compression_hdr *cram_new_compression_header(void) { |
4338 | 44.3k | cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr)); |
4339 | 44.3k | if (!hdr) |
4340 | 0 | return NULL; |
4341 | | |
4342 | 44.3k | if (!(hdr->TD_blk = cram_new_block(CORE, 0))) { |
4343 | 0 | free(hdr); |
4344 | 0 | return NULL; |
4345 | 0 | } |
4346 | | |
4347 | 44.3k | if (!(hdr->TD_hash = kh_init(m_s2i))) { |
4348 | 0 | cram_free_block(hdr->TD_blk); |
4349 | 0 | free(hdr); |
4350 | 0 | return NULL; |
4351 | 0 | } |
4352 | | |
4353 | 44.3k | if (!(hdr->TD_keys = string_pool_create(8192))) { |
4354 | 0 | kh_destroy(m_s2i, hdr->TD_hash); |
4355 | 0 | cram_free_block(hdr->TD_blk); |
4356 | 0 | free(hdr); |
4357 | 0 | return NULL; |
4358 | 0 | } |
4359 | | |
4360 | 44.3k | return hdr; |
4361 | 44.3k | } |
4362 | | |
4363 | 46.1k | void cram_free_compression_header(cram_block_compression_hdr *hdr) { |
4364 | 46.1k | int i; |
4365 | | |
4366 | 46.1k | if (hdr->landmark) |
4367 | 1.71k | free(hdr->landmark); |
4368 | | |
4369 | 46.1k | if (hdr->preservation_map) |
4370 | 46.1k | kh_destroy(map, hdr->preservation_map); |
4371 | | |
4372 | 1.52M | for (i = 0; i < CRAM_MAP_HASH; i++) { |
4373 | 1.47M | cram_map *m, *m2; |
4374 | 1.50M | for (m = hdr->rec_encoding_map[i]; m; m = m2) { |
4375 | 26.2k | m2 = m->next; |
4376 | 26.2k | if (m->codec) |
4377 | 0 | m->codec->free(m->codec); |
4378 | 26.2k | free(m); |
4379 | 26.2k | } |
4380 | 1.47M | } |
4381 | | |
4382 | 1.52M | for (i = 0; i < CRAM_MAP_HASH; i++) { |
4383 | 1.47M | cram_map *m, *m2; |
4384 | 1.47M | for (m = hdr->tag_encoding_map[i]; m; m = m2) { |
4385 | 639 | m2 = m->next; |
4386 | 639 | if (m->codec) |
4387 | 639 | m->codec->free(m->codec); |
4388 | 639 | free(m); |
4389 | 639 | } |
4390 | 1.47M | } |
4391 | | |
4392 | 2.21M | for (i = 0; i < DS_END; i++) { |
4393 | 2.17M | if (hdr->codecs[i]) |
4394 | 669k | hdr->codecs[i]->free(hdr->codecs[i]); |
4395 | 2.17M | } |
4396 | | |
4397 | 46.1k | if (hdr->TL) |
4398 | 54 | free(hdr->TL); |
4399 | 46.1k | if (hdr->TD_blk) |
4400 | 44.3k | cram_free_block(hdr->TD_blk); |
4401 | 46.1k | if (hdr->TD_hash) |
4402 | 46.1k | kh_destroy(m_s2i, hdr->TD_hash); |
4403 | 46.1k | if (hdr->TD_keys) |
4404 | 44.3k | string_pool_destroy(hdr->TD_keys); |
4405 | | |
4406 | 46.1k | free(hdr); |
4407 | 46.1k | } |
4408 | | |
4409 | | |
4410 | | /* ---------------------------------------------------------------------- |
4411 | | * Slices and slice headers |
4412 | | */ |
4413 | | |
4414 | 37.0k | void cram_free_slice_header(cram_block_slice_hdr *hdr) { |
4415 | 37.0k | if (!hdr) |
4416 | 0 | return; |
4417 | | |
4418 | 37.0k | if (hdr->block_content_ids) |
4419 | 36.6k | free(hdr->block_content_ids); |
4420 | | |
4421 | 37.0k | free(hdr); |
4422 | | |
4423 | 37.0k | return; |
4424 | 37.0k | } |
4425 | | |
4426 | 37.1k | void cram_free_slice(cram_slice *s) { |
4427 | 37.1k | if (!s) |
4428 | 0 | return; |
4429 | | |
4430 | 37.1k | if (s->hdr_block) |
4431 | 36.5k | cram_free_block(s->hdr_block); |
4432 | | |
4433 | 37.1k | if (s->block) { |
4434 | 36.6k | int i; |
4435 | | |
4436 | 36.6k | if (s->hdr) { |
4437 | 6.41M | for (i = 0; i < s->hdr->num_blocks; i++) { |
4438 | 6.38M | if (i > 0 && s->block[i] == s->block[0]) |
4439 | 6.14M | continue; |
4440 | 240k | cram_free_block(s->block[i]); |
4441 | 240k | } |
4442 | 36.6k | } |
4443 | 36.6k | free(s->block); |
4444 | 36.6k | } |
4445 | | |
4446 | 37.1k | { |
4447 | | // Normally already copied into s->block[], but potentially still |
4448 | | // here if we error part way through cram_encode_slice. |
4449 | 37.1k | int i; |
4450 | 114k | for (i = 0; i < s->naux_block; i++) |
4451 | 77.2k | cram_free_block(s->aux_block[i]); |
4452 | 37.1k | } |
4453 | | |
4454 | 37.1k | if (s->block_by_id) |
4455 | 649 | free(s->block_by_id); |
4456 | | |
4457 | 37.1k | if (s->hdr) |
4458 | 37.0k | cram_free_slice_header(s->hdr); |
4459 | | |
4460 | 37.1k | if (s->seqs_blk) |
4461 | 37.0k | cram_free_block(s->seqs_blk); |
4462 | | |
4463 | 37.1k | if (s->qual_blk) |
4464 | 1.05k | cram_free_block(s->qual_blk); |
4465 | | |
4466 | 37.1k | if (s->name_blk) |
4467 | 1.05k | cram_free_block(s->name_blk); |
4468 | | |
4469 | 37.1k | if (s->aux_blk) |
4470 | 37.0k | cram_free_block(s->aux_blk); |
4471 | | |
4472 | 37.1k | if (s->base_blk) |
4473 | 1.05k | cram_free_block(s->base_blk); |
4474 | | |
4475 | 37.1k | if (s->soft_blk) |
4476 | 1.05k | cram_free_block(s->soft_blk); |
4477 | | |
4478 | 37.1k | if (s->cigar) |
4479 | 37.0k | free(s->cigar); |
4480 | | |
4481 | 37.1k | if (s->crecs) |
4482 | 37.0k | free(s->crecs); |
4483 | | |
4484 | 37.1k | if (s->features) |
4485 | 15.8k | free(s->features); |
4486 | | |
4487 | 37.1k | if (s->TN) |
4488 | 0 | free(s->TN); |
4489 | | |
4490 | 37.1k | if (s->pair_keys) |
4491 | 36.4k | string_pool_destroy(s->pair_keys); |
4492 | | |
4493 | 37.1k | if (s->pair[0]) |
4494 | 37.1k | kh_destroy(m_s2i, s->pair[0]); |
4495 | 37.1k | if (s->pair[1]) |
4496 | 37.1k | kh_destroy(m_s2i, s->pair[1]); |
4497 | | |
4498 | 37.1k | if (s->aux_block) |
4499 | 31.0k | free(s->aux_block); |
4500 | | |
4501 | 37.1k | free(s); |
4502 | 37.1k | } |
4503 | | |
4504 | | /* |
4505 | | * Creates a new empty slice in memory, for subsequent writing to |
4506 | | * disk. |
4507 | | * |
4508 | | * Returns cram_slice ptr on success |
4509 | | * NULL on failure |
4510 | | */ |
4511 | 36.4k | cram_slice *cram_new_slice(enum cram_content_type type, int nrecs) { |
4512 | 36.4k | cram_slice *s = calloc(1, sizeof(*s)); |
4513 | 36.4k | if (!s) |
4514 | 0 | return NULL; |
4515 | | |
4516 | 36.4k | if (!(s->hdr = (cram_block_slice_hdr *)calloc(1, sizeof(*s->hdr)))) |
4517 | 0 | goto err; |
4518 | 36.4k | s->hdr->content_type = type; |
4519 | | |
4520 | 36.4k | s->hdr_block = NULL; |
4521 | 36.4k | s->block = NULL; |
4522 | 36.4k | s->block_by_id = NULL; |
4523 | 36.4k | s->last_apos = 0; |
4524 | 36.4k | if (!(s->crecs = malloc(nrecs * sizeof(cram_record)))) goto err; |
4525 | 36.4k | s->cigar_alloc = 1024; |
4526 | 36.4k | if (!(s->cigar = malloc(s->cigar_alloc * sizeof(*s->cigar)))) goto err; |
4527 | 36.4k | s->ncigar = 0; |
4528 | | |
4529 | 36.4k | if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0))) goto err; |
4530 | 36.4k | if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS))) goto err; |
4531 | 36.4k | if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN))) goto err; |
4532 | 36.4k | if (!(s->aux_blk = cram_new_block(EXTERNAL, DS_aux))) goto err; |
4533 | 36.4k | if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN))) goto err; |
4534 | 36.4k | if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC))) goto err; |
4535 | | |
4536 | 36.4k | s->features = NULL; |
4537 | 36.4k | s->nfeatures = s->afeatures = 0; |
4538 | | |
4539 | 36.4k | #ifndef TN_external |
4540 | 36.4k | s->TN = NULL; |
4541 | 36.4k | s->nTN = s->aTN = 0; |
4542 | 36.4k | #endif |
4543 | | |
4544 | | // Volatile keys as we do realloc in dstring |
4545 | 36.4k | if (!(s->pair_keys = string_pool_create(8192))) goto err; |
4546 | 36.4k | if (!(s->pair[0] = kh_init(m_s2i))) goto err; |
4547 | 36.4k | if (!(s->pair[1] = kh_init(m_s2i))) goto err; |
4548 | | |
4549 | | #ifdef BA_external |
4550 | | s->BA_len = 0; |
4551 | | #endif |
4552 | | |
4553 | 36.4k | return s; |
4554 | | |
4555 | 0 | err: |
4556 | 0 | if (s) |
4557 | 0 | cram_free_slice(s); |
4558 | |
|
4559 | 0 | return NULL; |
4560 | 36.4k | } |
4561 | | |
4562 | | /* |
4563 | | * Loads an entire slice. |
4564 | | * FIXME: In 1.0 the native unit of slices within CRAM is broken |
4565 | | * as slices contain references to objects in other slices. |
4566 | | * To work around this while keeping the slice oriented outer loop |
4567 | | * we read all slices and stitch them together into a fake large |
4568 | | * slice instead. |
4569 | | * |
4570 | | * Returns cram_slice ptr on success |
4571 | | * NULL on failure |
4572 | | */ |
4573 | 771 | cram_slice *cram_read_slice(cram_fd *fd) { |
4574 | 771 | cram_block *b = cram_read_block(fd); |
4575 | 771 | cram_slice *s = calloc(1, sizeof(*s)); |
4576 | 771 | int i, n, max_id, min_id; |
4577 | | |
4578 | 771 | if (!b || !s) |
4579 | 20 | goto err; |
4580 | | |
4581 | 751 | s->hdr_block = b; |
4582 | 751 | switch (b->content_type) { |
4583 | 714 | case MAPPED_SLICE: |
4584 | 744 | case UNMAPPED_SLICE: |
4585 | 744 | if (!(s->hdr = cram_decode_slice_header(fd, b))) |
4586 | 55 | goto err; |
4587 | 689 | break; |
4588 | | |
4589 | 689 | default: |
4590 | 7 | hts_log_error("Unexpected block of type %s", |
4591 | 7 | cram_content_type2str(b->content_type)); |
4592 | 7 | goto err; |
4593 | 751 | } |
4594 | | |
4595 | 689 | if (s->hdr->num_blocks < 1) { |
4596 | 8 | hts_log_error("Slice does not include any data blocks"); |
4597 | 8 | goto err; |
4598 | 8 | } |
4599 | | |
4600 | 681 | s->block = calloc(n = s->hdr->num_blocks, sizeof(*s->block)); |
4601 | 681 | if (!s->block) |
4602 | 0 | goto err; |
4603 | | |
4604 | 3.22k | for (max_id = i = 0, min_id = INT_MAX; i < n; i++) { |
4605 | 2.57k | if (!(s->block[i] = cram_read_block(fd))) |
4606 | 32 | goto err; |
4607 | | |
4608 | 2.54k | if (s->block[i]->content_type == EXTERNAL) { |
4609 | 603 | if (max_id < s->block[i]->content_id) |
4610 | 14 | max_id = s->block[i]->content_id; |
4611 | 603 | if (min_id > s->block[i]->content_id) |
4612 | 591 | min_id = s->block[i]->content_id; |
4613 | 603 | } |
4614 | 2.54k | } |
4615 | | |
4616 | 649 | if (!(s->block_by_id = calloc(512, sizeof(s->block[0])))) |
4617 | 0 | goto err; |
4618 | | |
4619 | 3.12k | for (i = 0; i < n; i++) { |
4620 | 2.47k | if (s->block[i]->content_type != EXTERNAL) |
4621 | 1.87k | continue; |
4622 | 597 | uint32_t v = s->block[i]->content_id; |
4623 | 597 | if (v >= 256) |
4624 | 15 | v = 256 + v % 251; |
4625 | 597 | s->block_by_id[v] = s->block[i]; |
4626 | 597 | } |
4627 | | |
4628 | | /* Initialise encoding/decoding tables */ |
4629 | 649 | s->cigar_alloc = 1024; |
4630 | 649 | if (!(s->cigar = malloc(s->cigar_alloc * sizeof(*s->cigar)))) goto err; |
4631 | 649 | s->ncigar = 0; |
4632 | | |
4633 | 649 | if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0))) goto err; |
4634 | 649 | if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS))) goto err; |
4635 | 649 | if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN))) goto err; |
4636 | 649 | if (!(s->aux_blk = cram_new_block(EXTERNAL, DS_aux))) goto err; |
4637 | 649 | if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN))) goto err; |
4638 | 649 | if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC))) goto err; |
4639 | | |
4640 | 649 | s->crecs = NULL; |
4641 | | |
4642 | 649 | s->last_apos = s->hdr->ref_seq_start; |
4643 | 649 | s->decode_md = fd->decode_md; |
4644 | | |
4645 | 649 | return s; |
4646 | | |
4647 | 122 | err: |
4648 | 122 | if (b) |
4649 | 102 | cram_free_block(b); |
4650 | 122 | if (s) { |
4651 | 122 | s->hdr_block = NULL; |
4652 | 122 | cram_free_slice(s); |
4653 | 122 | } |
4654 | 122 | return NULL; |
4655 | 649 | } |
4656 | | |
4657 | | |
4658 | | /* ---------------------------------------------------------------------- |
4659 | | * CRAM file definition (header) |
4660 | | */ |
4661 | | |
4662 | | /* |
4663 | | * Reads a CRAM file definition structure. |
4664 | | * Returns file_def ptr on success |
4665 | | * NULL on failure |
4666 | | */ |
4667 | 9.29k | cram_file_def *cram_read_file_def(cram_fd *fd) { |
4668 | 9.29k | cram_file_def *def = malloc(sizeof(*def)); |
4669 | 9.29k | if (!def) |
4670 | 0 | return NULL; |
4671 | | |
4672 | 9.29k | if (26 != hread(fd->fp, &def->magic[0], 26)) { |
4673 | 2 | free(def); |
4674 | 2 | return NULL; |
4675 | 2 | } |
4676 | | |
4677 | 9.29k | if (memcmp(def->magic, "CRAM", 4) != 0) { |
4678 | 1 | free(def); |
4679 | 1 | return NULL; |
4680 | 1 | } |
4681 | | |
4682 | 9.29k | if (def->major_version > 4) { |
4683 | 0 | hts_log_error("CRAM version number mismatch. Expected 1.x, 2.x, 3.x or 4.x, got %d.%d", |
4684 | 0 | def->major_version, def->minor_version); |
4685 | 0 | free(def); |
4686 | 0 | return NULL; |
4687 | 0 | } |
4688 | | |
4689 | 9.29k | fd->first_container += 26; |
4690 | 9.29k | fd->curr_position = fd->first_container; |
4691 | 9.29k | fd->last_slice = 0; |
4692 | | |
4693 | 9.29k | return def; |
4694 | 9.29k | } |
4695 | | |
4696 | | /* |
4697 | | * Writes a cram_file_def structure to cram_fd. |
4698 | | * Returns 0 on success |
4699 | | * -1 on failure |
4700 | | */ |
4701 | 7.93k | int cram_write_file_def(cram_fd *fd, cram_file_def *def) { |
4702 | 7.93k | return (hwrite(fd->fp, &def->magic[0], 26) == 26) ? 0 : -1; |
4703 | 7.93k | } |
4704 | | |
4705 | 18.6k | void cram_free_file_def(cram_file_def *def) { |
4706 | 18.6k | if (def) free(def); |
4707 | 18.6k | } |
4708 | | |
4709 | | /* ---------------------------------------------------------------------- |
4710 | | * SAM header I/O |
4711 | | */ |
4712 | | |
4713 | | |
4714 | | /* |
4715 | | * Reads the SAM header from the first CRAM data block. |
4716 | | * Also performs minimal parsing to extract read-group |
4717 | | * and sample information. |
4718 | | |
4719 | | * Returns SAM hdr ptr on success |
4720 | | * NULL on failure |
4721 | | */ |
4722 | 9.29k | sam_hdr_t *cram_read_SAM_hdr(cram_fd *fd) { |
4723 | 9.29k | int32_t header_len; |
4724 | 9.29k | char *header; |
4725 | 9.29k | sam_hdr_t *hdr; |
4726 | | |
4727 | | /* 1.1 onwards stores the header in the first block of a container */ |
4728 | 9.29k | if (CRAM_MAJOR_VERS(fd->version) == 1) { |
4729 | | /* Length */ |
4730 | 6.84k | if (-1 == int32_decode(fd, &header_len)) |
4731 | 3 | return NULL; |
4732 | | |
4733 | 6.84k | #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
4734 | 6.84k | if (header_len > FUZZ_ALLOC_LIMIT) |
4735 | 2 | return NULL; |
4736 | 6.84k | #endif |
4737 | | |
4738 | | /* Alloc and read */ |
4739 | 6.84k | if (header_len < 0 || NULL == (header = malloc((size_t) header_len+1))) |
4740 | 0 | return NULL; |
4741 | | |
4742 | 6.84k | if (header_len != hread(fd->fp, header, header_len)) { |
4743 | 27 | free(header); |
4744 | 27 | return NULL; |
4745 | 27 | } |
4746 | 6.81k | header[header_len] = '\0'; |
4747 | | |
4748 | 6.81k | fd->first_container += 4 + header_len; |
4749 | 6.81k | } else { |
4750 | 2.44k | cram_container *c = cram_read_container(fd); |
4751 | 2.44k | cram_block *b; |
4752 | 2.44k | int i; |
4753 | 2.44k | int64_t len; |
4754 | | |
4755 | 2.44k | if (!c) |
4756 | 138 | return NULL; |
4757 | | |
4758 | 2.30k | fd->first_container += c->length + c->offset; |
4759 | 2.30k | fd->curr_position = fd->first_container; |
4760 | | |
4761 | 2.30k | if (c->num_blocks < 1) { |
4762 | 74 | cram_free_container(c); |
4763 | 74 | return NULL; |
4764 | 74 | } |
4765 | | |
4766 | 2.23k | if (!(b = cram_read_block(fd))) { |
4767 | 111 | cram_free_container(c); |
4768 | 111 | return NULL; |
4769 | 111 | } |
4770 | 2.12k | if (cram_uncompress_block(b) != 0) { |
4771 | 1.46k | cram_free_container(c); |
4772 | 1.46k | cram_free_block(b); |
4773 | 1.46k | return NULL; |
4774 | 1.46k | } |
4775 | | |
4776 | 655 | len = b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) + |
4777 | 655 | fd->vv.varint_size(b->content_id) + |
4778 | 655 | fd->vv.varint_size(b->uncomp_size) + |
4779 | 655 | fd->vv.varint_size(b->comp_size); |
4780 | | |
4781 | | /* Extract header from 1st block */ |
4782 | 655 | if (-1 == int32_get_blk(b, &header_len) || |
4783 | 655 | header_len < 0 || /* Spec. says signed... why? */ |
4784 | 655 | b->uncomp_size - 4 < header_len) { |
4785 | 127 | cram_free_container(c); |
4786 | 127 | cram_free_block(b); |
4787 | 127 | return NULL; |
4788 | 127 | } |
4789 | 528 | if (NULL == (header = malloc((size_t) header_len+1))) { |
4790 | 0 | cram_free_container(c); |
4791 | 0 | cram_free_block(b); |
4792 | 0 | return NULL; |
4793 | 0 | } |
4794 | 528 | memcpy(header, BLOCK_END(b), header_len); |
4795 | 528 | header[header_len] = '\0'; |
4796 | 528 | cram_free_block(b); |
4797 | | |
4798 | | /* Consume any remaining blocks */ |
4799 | 3.37k | for (i = 1; i < c->num_blocks; i++) { |
4800 | 2.85k | if (!(b = cram_read_block(fd))) { |
4801 | 12 | cram_free_container(c); |
4802 | 12 | free(header); |
4803 | 12 | return NULL; |
4804 | 12 | } |
4805 | 2.84k | len += b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) + |
4806 | 2.84k | fd->vv.varint_size(b->content_id) + |
4807 | 2.84k | fd->vv.varint_size(b->uncomp_size) + |
4808 | 2.84k | fd->vv.varint_size(b->comp_size); |
4809 | 2.84k | cram_free_block(b); |
4810 | 2.84k | } |
4811 | | |
4812 | 516 | if (c->length > 0 && len > 0 && c->length > len) { |
4813 | | // Consume padding |
4814 | 34 | char *pads = malloc(c->length - len); |
4815 | 34 | if (!pads) { |
4816 | 0 | cram_free_container(c); |
4817 | 0 | free(header); |
4818 | 0 | return NULL; |
4819 | 0 | } |
4820 | | |
4821 | 34 | if (c->length - len != hread(fd->fp, pads, c->length - len)) { |
4822 | 21 | cram_free_container(c); |
4823 | 21 | free(header); |
4824 | 21 | free(pads); |
4825 | 21 | return NULL; |
4826 | 21 | } |
4827 | 13 | free(pads); |
4828 | 13 | } |
4829 | | |
4830 | 495 | cram_free_container(c); |
4831 | 495 | } |
4832 | | |
4833 | | /* Parse */ |
4834 | 7.30k | hdr = sam_hdr_init(); |
4835 | 7.30k | if (!hdr) { |
4836 | 0 | free(header); |
4837 | 0 | return NULL; |
4838 | 0 | } |
4839 | | |
4840 | 7.30k | if (-1 == sam_hdr_add_lines(hdr, header, header_len)) { |
4841 | 73 | free(header); |
4842 | 73 | sam_hdr_destroy(hdr); |
4843 | 73 | return NULL; |
4844 | 73 | } |
4845 | | |
4846 | 7.23k | hdr->l_text = header_len; |
4847 | 7.23k | hdr->text = header; |
4848 | | |
4849 | 7.23k | return hdr; |
4850 | | |
4851 | 7.30k | } |
4852 | | |
4853 | | /* |
4854 | | * Converts 'in' to a full pathname to store in out. |
4855 | | * Out must be at least PATH_MAX bytes long. |
4856 | | */ |
4857 | 0 | static void full_path(char *out, char *in) { |
4858 | 0 | size_t in_l = strlen(in); |
4859 | 0 | if (hisremote(in)) { |
4860 | 0 | if (in_l > PATH_MAX) { |
4861 | 0 | hts_log_error("Reference path is longer than %d", PATH_MAX); |
4862 | 0 | return; |
4863 | 0 | } |
4864 | 0 | strncpy(out, in, PATH_MAX-1); |
4865 | 0 | out[PATH_MAX-1] = 0; |
4866 | 0 | return; |
4867 | 0 | } |
4868 | 0 | if (*in == '/' || |
4869 | | // Windows paths |
4870 | 0 | (in_l > 3 && toupper_c(*in) >= 'A' && toupper_c(*in) <= 'Z' && |
4871 | 0 | in[1] == ':' && (in[2] == '/' || in[2] == '\\'))) { |
4872 | 0 | strncpy(out, in, PATH_MAX-1); |
4873 | 0 | out[PATH_MAX-1] = 0; |
4874 | 0 | } else { |
4875 | 0 | size_t len; |
4876 | | |
4877 | | // unable to get dir or out+in is too long |
4878 | 0 | if (!getcwd(out, PATH_MAX) || |
4879 | 0 | (len = strlen(out))+1+strlen(in) >= PATH_MAX) { |
4880 | 0 | strncpy(out, in, PATH_MAX-1); |
4881 | 0 | out[PATH_MAX-1] = 0; |
4882 | 0 | return; |
4883 | 0 | } |
4884 | | |
4885 | 0 | snprintf(out+len, PATH_MAX - len, "/%s", in); |
4886 | | |
4887 | | // FIXME: cope with `pwd`/../../../foo.fa ? |
4888 | 0 | } |
4889 | 0 | } |
4890 | | |
4891 | | /* |
4892 | | * Writes a CRAM SAM header. |
4893 | | * Returns 0 on success |
4894 | | * -1 on failure |
4895 | | */ |
4896 | 7.93k | int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) { |
4897 | 7.93k | size_t header_len; |
4898 | 7.93k | int blank_block = (CRAM_MAJOR_VERS(fd->version) >= 3); |
4899 | | |
4900 | | /* Write CRAM MAGIC if not yet written. */ |
4901 | 7.93k | if (fd->file_def->major_version == 0) { |
4902 | 7.93k | fd->file_def->major_version = CRAM_MAJOR_VERS(fd->version); |
4903 | 7.93k | fd->file_def->minor_version = CRAM_MINOR_VERS(fd->version); |
4904 | 7.93k | if (0 != cram_write_file_def(fd, fd->file_def)) |
4905 | 0 | return -1; |
4906 | 7.93k | } |
4907 | | |
4908 | | /* 1.0 requires an UNKNOWN read-group */ |
4909 | 7.93k | if (CRAM_MAJOR_VERS(fd->version) == 1) { |
4910 | 0 | if (!sam_hrecs_find_rg(hdr->hrecs, "UNKNOWN")) |
4911 | 0 | if (sam_hdr_add_line(hdr, "RG", |
4912 | 0 | "ID", "UNKNOWN", "SM", "UNKNOWN", NULL)) |
4913 | 0 | return -1; |
4914 | 0 | } |
4915 | | |
4916 | 7.93k | if (-1 == refs_from_header(fd)) |
4917 | 0 | return -1; |
4918 | 7.93k | if (-1 == refs2id(fd->refs, fd->header)) |
4919 | 0 | return -1; |
4920 | | |
4921 | | /* Fix M5 strings */ |
4922 | 7.93k | if (fd->refs && !fd->no_ref && fd->embed_ref <= 1) { |
4923 | 7.93k | int i; |
4924 | 7.96k | for (i = 0; i < hdr->hrecs->nref; i++) { |
4925 | 2.23k | sam_hrec_type_t *ty; |
4926 | 2.23k | char *ref; |
4927 | | |
4928 | 2.23k | if (!(ty = sam_hrecs_find_type_id(hdr->hrecs, "SQ", "SN", hdr->hrecs->ref[i].name))) |
4929 | 0 | return -1; |
4930 | | |
4931 | 2.23k | if (!sam_hrecs_find_key(ty, "M5", NULL)) { |
4932 | 2.20k | char unsigned buf[16]; |
4933 | 2.20k | char buf2[33]; |
4934 | 2.20k | hts_pos_t rlen; |
4935 | 2.20k | hts_md5_context *md5; |
4936 | | |
4937 | 2.20k | if (!fd->refs || |
4938 | 2.20k | !fd->refs->ref_id || |
4939 | 2.20k | !fd->refs->ref_id[i]) { |
4940 | 0 | return -1; |
4941 | 0 | } |
4942 | 2.20k | rlen = fd->refs->ref_id[i]->length; |
4943 | 2.20k | ref = cram_get_ref(fd, i, 1, rlen); |
4944 | 2.20k | if (NULL == ref) { |
4945 | 2.20k | if (fd->embed_ref == -1) { |
4946 | | // auto embed-ref |
4947 | 2.20k | hts_log_warning("No M5 tags present and could not " |
4948 | 2.20k | "find reference"); |
4949 | 2.20k | hts_log_warning("Enabling embed_ref=2 option"); |
4950 | 2.20k | hts_log_warning("NOTE: the CRAM file will be bigger " |
4951 | 2.20k | "than using an external reference"); |
4952 | 2.20k | pthread_mutex_lock(&fd->ref_lock); |
4953 | | // Best guess. It may be unmapped data with broken |
4954 | | // headers, in which case this will get ignored. |
4955 | 2.20k | fd->embed_ref = 2; |
4956 | 2.20k | pthread_mutex_unlock(&fd->ref_lock); |
4957 | 2.20k | break; |
4958 | 2.20k | } |
4959 | 0 | return -1; |
4960 | 2.20k | } |
4961 | 0 | rlen = fd->refs->ref_id[i]->length; /* In case it just loaded */ |
4962 | 0 | if (!(md5 = hts_md5_init())) |
4963 | 0 | return -1; |
4964 | 0 | if (HTS_POS_MAX <= ULONG_MAX) { |
4965 | | // Platforms with 64-bit unsigned long update in one go |
4966 | 0 | hts_md5_update(md5, ref, rlen); |
4967 | 0 | } else { |
4968 | | // Those with 32-bit ulong (Windows) may have to loop |
4969 | | // over epic references |
4970 | 0 | hts_pos_t pos = 0; |
4971 | 0 | while (rlen - pos > ULONG_MAX) { |
4972 | 0 | hts_md5_update(md5, ref + pos, ULONG_MAX); |
4973 | 0 | pos += ULONG_MAX; |
4974 | 0 | } |
4975 | 0 | hts_md5_update(md5, ref + pos, (unsigned long)(rlen - pos)); |
4976 | 0 | } |
4977 | 0 | hts_md5_final(buf, md5); |
4978 | 0 | hts_md5_destroy(md5); |
4979 | 0 | cram_ref_decr(fd->refs, i); |
4980 | |
|
4981 | 0 | hts_md5_hex(buf2, buf); |
4982 | 0 | fd->refs->ref_id[i]->validated_md5 = 1; |
4983 | 0 | if (sam_hdr_update_line(hdr, "SQ", "SN", hdr->hrecs->ref[i].name, "M5", buf2, NULL)) |
4984 | 0 | return -1; |
4985 | 0 | } |
4986 | | |
4987 | 30 | if (fd->ref_fn) { |
4988 | 0 | char ref_fn[PATH_MAX]; |
4989 | 0 | full_path(ref_fn, fd->ref_fn); |
4990 | 0 | if (sam_hdr_update_line(hdr, "SQ", "SN", hdr->hrecs->ref[i].name, "UR", ref_fn, NULL)) |
4991 | 0 | return -1; |
4992 | 0 | } |
4993 | 30 | } |
4994 | 7.93k | } |
4995 | | |
4996 | | /* Length */ |
4997 | 7.93k | header_len = sam_hdr_length(hdr); |
4998 | 7.93k | if (header_len > INT32_MAX) { |
4999 | 0 | hts_log_error("Header is too long for CRAM format"); |
5000 | 0 | return -1; |
5001 | 0 | } |
5002 | 7.93k | if (CRAM_MAJOR_VERS(fd->version) == 1) { |
5003 | 0 | if (-1 == int32_encode(fd, header_len)) |
5004 | 0 | return -1; |
5005 | | |
5006 | | /* Text data */ |
5007 | 0 | if (header_len != hwrite(fd->fp, sam_hdr_str(hdr), header_len)) |
5008 | 0 | return -1; |
5009 | 7.93k | } else { |
5010 | | /* Create block(s) inside a container */ |
5011 | 7.93k | cram_block *b = cram_new_block(FILE_HEADER, 0); |
5012 | 7.93k | cram_container *c = cram_new_container(0, 0); |
5013 | 7.93k | int padded_length; |
5014 | 7.93k | char *pads; |
5015 | 7.93k | int is_cram_3 = (CRAM_MAJOR_VERS(fd->version) >= 3); |
5016 | | |
5017 | 7.93k | if (!b || !c) { |
5018 | 0 | if (b) cram_free_block(b); |
5019 | 0 | if (c) cram_free_container(c); |
5020 | 0 | return -1; |
5021 | 0 | } |
5022 | | |
5023 | 7.93k | if (int32_put_blk(b, header_len) < 0) |
5024 | 0 | return -1; |
5025 | 7.93k | if (header_len) |
5026 | 3.29k | BLOCK_APPEND(b, sam_hdr_str(hdr), header_len); |
5027 | 7.93k | BLOCK_UPLEN(b); |
5028 | | |
5029 | | // Compress header block if V3.0 and above |
5030 | 7.93k | if (CRAM_MAJOR_VERS(fd->version) >= 3) |
5031 | 7.93k | if (cram_compress_block(fd, b, NULL, -1, -1) < 0) |
5032 | 0 | return -1; |
5033 | | |
5034 | 7.93k | if (blank_block) { |
5035 | 7.93k | c->length = b->comp_size + 2 + 4*is_cram_3 + |
5036 | 7.93k | fd->vv.varint_size(b->content_id) + |
5037 | 7.93k | fd->vv.varint_size(b->uncomp_size) + |
5038 | 7.93k | fd->vv.varint_size(b->comp_size); |
5039 | | |
5040 | 7.93k | c->num_blocks = 2; |
5041 | 7.93k | c->num_landmarks = 2; |
5042 | 7.93k | if (!(c->landmark = malloc(2*sizeof(*c->landmark)))) { |
5043 | 0 | cram_free_block(b); |
5044 | 0 | cram_free_container(c); |
5045 | 0 | return -1; |
5046 | 0 | } |
5047 | 7.93k | c->landmark[0] = 0; |
5048 | 7.93k | c->landmark[1] = c->length; |
5049 | | |
5050 | | // Plus extra storage for uncompressed secondary blank block |
5051 | 7.93k | padded_length = MIN(c->length*.5, 10000); |
5052 | 7.93k | c->length += padded_length + 2 + 4*is_cram_3 + |
5053 | 7.93k | fd->vv.varint_size(b->content_id) + |
5054 | 7.93k | fd->vv.varint_size(padded_length)*2; |
5055 | 7.93k | } else { |
5056 | | // Pad the block instead. |
5057 | 0 | c->num_blocks = 1; |
5058 | 0 | c->num_landmarks = 1; |
5059 | 0 | if (!(c->landmark = malloc(sizeof(*c->landmark)))) |
5060 | 0 | return -1; |
5061 | 0 | c->landmark[0] = 0; |
5062 | |
|
5063 | 0 | padded_length = MAX(c->length*1.5, 10000) - c->length; |
5064 | |
|
5065 | 0 | c->length = b->comp_size + padded_length + |
5066 | 0 | 2 + 4*is_cram_3 + |
5067 | 0 | fd->vv.varint_size(b->content_id) + |
5068 | 0 | fd->vv.varint_size(b->uncomp_size) + |
5069 | 0 | fd->vv.varint_size(b->comp_size); |
5070 | |
|
5071 | 0 | if (NULL == (pads = calloc(1, padded_length))) { |
5072 | 0 | cram_free_block(b); |
5073 | 0 | cram_free_container(c); |
5074 | 0 | return -1; |
5075 | 0 | } |
5076 | 0 | BLOCK_APPEND(b, pads, padded_length); |
5077 | 0 | BLOCK_UPLEN(b); |
5078 | 0 | free(pads); |
5079 | 0 | } |
5080 | | |
5081 | 7.93k | if (-1 == cram_write_container(fd, c)) { |
5082 | 0 | cram_free_block(b); |
5083 | 0 | cram_free_container(c); |
5084 | 0 | return -1; |
5085 | 0 | } |
5086 | | |
5087 | 7.93k | if (-1 == cram_write_block(fd, b)) { |
5088 | 0 | cram_free_block(b); |
5089 | 0 | cram_free_container(c); |
5090 | 0 | return -1; |
5091 | 0 | } |
5092 | | |
5093 | 7.93k | if (blank_block) { |
5094 | 7.93k | BLOCK_RESIZE(b, padded_length); |
5095 | 7.93k | memset(BLOCK_DATA(b), 0, padded_length); |
5096 | 7.93k | BLOCK_SIZE(b) = padded_length; |
5097 | 7.93k | BLOCK_UPLEN(b); |
5098 | 7.93k | b->method = RAW; |
5099 | 7.93k | if (-1 == cram_write_block(fd, b)) { |
5100 | 0 | cram_free_block(b); |
5101 | 0 | cram_free_container(c); |
5102 | 0 | return -1; |
5103 | 0 | } |
5104 | 7.93k | } |
5105 | | |
5106 | 7.93k | cram_free_block(b); |
5107 | 7.93k | cram_free_container(c); |
5108 | 7.93k | } |
5109 | | |
5110 | 7.93k | if (0 != hflush(fd->fp)) |
5111 | 0 | return -1; |
5112 | | |
5113 | 7.93k | RP("=== Finishing saving header ===\n"); |
5114 | | |
5115 | 7.93k | return 0; |
5116 | | |
5117 | 0 | block_err: |
5118 | 0 | return -1; |
5119 | 7.93k | } |
5120 | | |
5121 | | /* ---------------------------------------------------------------------- |
5122 | | * The top-level cram opening, closing and option handling |
5123 | | */ |
5124 | | |
5125 | | /* |
5126 | | * Sets CRAM variable sized integer decode function tables. |
5127 | | * CRAM 1, 2, and 3.x all used ITF8 for uint32 and UTF8 for uint64. |
5128 | | * CRAM 4.x uses the same encoding mechanism for 32-bit and 64-bit |
5129 | | * (or anything inbetween), but also now supports signed values. |
5130 | | * |
5131 | | * Version is the CRAM major version number. |
5132 | | * vv is the vector table (probably &cram_fd->vv) |
5133 | | */ |
5134 | 18.6k | static void cram_init_varint(varint_vec *vv, int version) { |
5135 | 18.6k | if (version >= 4) { |
5136 | 838 | vv->varint_get32 = uint7_get_32; // FIXME: varint.h API should be size agnostic |
5137 | 838 | vv->varint_get32s = sint7_get_32; |
5138 | 838 | vv->varint_get64 = uint7_get_64; |
5139 | 838 | vv->varint_get64s = sint7_get_64; |
5140 | 838 | vv->varint_put32 = uint7_put_32; |
5141 | 838 | vv->varint_put32s = sint7_put_32; |
5142 | 838 | vv->varint_put64 = uint7_put_64; |
5143 | 838 | vv->varint_put64s = sint7_put_64; |
5144 | 838 | vv->varint_put32_blk = uint7_put_blk_32; |
5145 | 838 | vv->varint_put32s_blk = sint7_put_blk_32; |
5146 | 838 | vv->varint_put64_blk = uint7_put_blk_64; |
5147 | 838 | vv->varint_put64s_blk = sint7_put_blk_64; |
5148 | 838 | vv->varint_size = uint7_size; |
5149 | 838 | vv->varint_decode32_crc = uint7_decode_crc32; |
5150 | 838 | vv->varint_decode32s_crc = sint7_decode_crc32; |
5151 | 838 | vv->varint_decode64_crc = uint7_decode_crc64; |
5152 | 17.8k | } else { |
5153 | 17.8k | vv->varint_get32 = safe_itf8_get; |
5154 | 17.8k | vv->varint_get32s = safe_itf8_get; |
5155 | 17.8k | vv->varint_get64 = safe_ltf8_get; |
5156 | 17.8k | vv->varint_get64s = safe_ltf8_get; |
5157 | 17.8k | vv->varint_put32 = safe_itf8_put; |
5158 | 17.8k | vv->varint_put32s = safe_itf8_put; |
5159 | 17.8k | vv->varint_put64 = safe_ltf8_put; |
5160 | 17.8k | vv->varint_put64s = safe_ltf8_put; |
5161 | 17.8k | vv->varint_put32_blk = itf8_put_blk; |
5162 | 17.8k | vv->varint_put32s_blk = itf8_put_blk; |
5163 | 17.8k | vv->varint_put64_blk = ltf8_put_blk; |
5164 | 17.8k | vv->varint_put64s_blk = ltf8_put_blk; |
5165 | 17.8k | vv->varint_size = itf8_size; |
5166 | 17.8k | vv->varint_decode32_crc = itf8_decode_crc; |
5167 | 17.8k | vv->varint_decode32s_crc = itf8_decode_crc; |
5168 | 17.8k | vv->varint_decode64_crc = ltf8_decode_crc; |
5169 | 17.8k | } |
5170 | 18.6k | } |
5171 | | |
5172 | | /* |
5173 | | * Initialises the lookup tables. These could be global statics, but they're |
5174 | | * clumsy to setup in a multi-threaded environment unless we generate |
5175 | | * verbatim code and include that. |
5176 | | */ |
5177 | 18.6k | static void cram_init_tables(cram_fd *fd) { |
5178 | 18.6k | int i; |
5179 | | |
5180 | 18.6k | memset(fd->L1, 4, 256); |
5181 | 18.6k | fd->L1['A'] = 0; fd->L1['a'] = 0; |
5182 | 18.6k | fd->L1['C'] = 1; fd->L1['c'] = 1; |
5183 | 18.6k | fd->L1['G'] = 2; fd->L1['g'] = 2; |
5184 | 18.6k | fd->L1['T'] = 3; fd->L1['t'] = 3; |
5185 | | |
5186 | 18.6k | memset(fd->L2, 5, 256); |
5187 | 18.6k | fd->L2['A'] = 0; fd->L2['a'] = 0; |
5188 | 18.6k | fd->L2['C'] = 1; fd->L2['c'] = 1; |
5189 | 18.6k | fd->L2['G'] = 2; fd->L2['g'] = 2; |
5190 | 18.6k | fd->L2['T'] = 3; fd->L2['t'] = 3; |
5191 | 18.6k | fd->L2['N'] = 4; fd->L2['n'] = 4; |
5192 | | |
5193 | 18.6k | if (CRAM_MAJOR_VERS(fd->version) == 1) { |
5194 | 3.51M | for (i = 0; i < 0x200; i++) { |
5195 | 3.50M | int f = 0; |
5196 | | |
5197 | 3.50M | if (i & CRAM_FPAIRED) f |= BAM_FPAIRED; |
5198 | 3.50M | if (i & CRAM_FPROPER_PAIR) f |= BAM_FPROPER_PAIR; |
5199 | 3.50M | if (i & CRAM_FUNMAP) f |= BAM_FUNMAP; |
5200 | 3.50M | if (i & CRAM_FREVERSE) f |= BAM_FREVERSE; |
5201 | 3.50M | if (i & CRAM_FREAD1) f |= BAM_FREAD1; |
5202 | 3.50M | if (i & CRAM_FREAD2) f |= BAM_FREAD2; |
5203 | 3.50M | if (i & CRAM_FSECONDARY) f |= BAM_FSECONDARY; |
5204 | 3.50M | if (i & CRAM_FQCFAIL) f |= BAM_FQCFAIL; |
5205 | 3.50M | if (i & CRAM_FDUP) f |= BAM_FDUP; |
5206 | | |
5207 | 3.50M | fd->bam_flag_swap[i] = f; |
5208 | 3.50M | } |
5209 | | |
5210 | 28.0M | for (i = 0; i < 0x1000; i++) { |
5211 | 28.0M | int g = 0; |
5212 | | |
5213 | 28.0M | if (i & BAM_FPAIRED) g |= CRAM_FPAIRED; |
5214 | 28.0M | if (i & BAM_FPROPER_PAIR) g |= CRAM_FPROPER_PAIR; |
5215 | 28.0M | if (i & BAM_FUNMAP) g |= CRAM_FUNMAP; |
5216 | 28.0M | if (i & BAM_FREVERSE) g |= CRAM_FREVERSE; |
5217 | 28.0M | if (i & BAM_FREAD1) g |= CRAM_FREAD1; |
5218 | 28.0M | if (i & BAM_FREAD2) g |= CRAM_FREAD2; |
5219 | 28.0M | if (i & BAM_FSECONDARY) g |= CRAM_FSECONDARY; |
5220 | 28.0M | if (i & BAM_FQCFAIL) g |= CRAM_FQCFAIL; |
5221 | 28.0M | if (i & BAM_FDUP) g |= CRAM_FDUP; |
5222 | | |
5223 | 28.0M | fd->cram_flag_swap[i] = g; |
5224 | 28.0M | } |
5225 | 11.8k | } else { |
5226 | | /* NOP */ |
5227 | 48.4M | for (i = 0; i < 0x1000; i++) |
5228 | 48.4M | fd->bam_flag_swap[i] = i; |
5229 | 48.4M | for (i = 0; i < 0x1000; i++) |
5230 | 48.4M | fd->cram_flag_swap[i] = i; |
5231 | 11.8k | } |
5232 | | |
5233 | 18.6k | memset(fd->cram_sub_matrix, 4, 32*32); |
5234 | 616k | for (i = 0; i < 32; i++) { |
5235 | 597k | fd->cram_sub_matrix[i]['A'&0x1f]=0; |
5236 | 597k | fd->cram_sub_matrix[i]['C'&0x1f]=1; |
5237 | 597k | fd->cram_sub_matrix[i]['G'&0x1f]=2; |
5238 | 597k | fd->cram_sub_matrix[i]['T'&0x1f]=3; |
5239 | 597k | fd->cram_sub_matrix[i]['N'&0x1f]=4; |
5240 | 597k | } |
5241 | 112k | for (i = 0; i < 20; i+=4) { |
5242 | 93.3k | int j; |
5243 | 1.96M | for (j = 0; j < 20; j++) { |
5244 | 1.86M | fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3; |
5245 | 1.86M | fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3; |
5246 | 1.86M | fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3; |
5247 | 1.86M | fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3; |
5248 | 1.86M | } |
5249 | 93.3k | fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+0]&0x1f]=0; |
5250 | 93.3k | fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+1]&0x1f]=1; |
5251 | 93.3k | fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+2]&0x1f]=2; |
5252 | 93.3k | fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+3]&0x1f]=3; |
5253 | 93.3k | } |
5254 | | |
5255 | 18.6k | cram_init_varint(&fd->vv, CRAM_MAJOR_VERS(fd->version)); |
5256 | 18.6k | } |
5257 | | |
5258 | | // Default version numbers for CRAM |
5259 | | static int major_version = 3; |
5260 | | static int minor_version = 1; |
5261 | | |
5262 | | /* |
5263 | | * Opens a CRAM file for read (mode "rb") or write ("wb"). |
5264 | | * The filename may be "-" to indicate stdin or stdout. |
5265 | | * |
5266 | | * Returns file handle on success |
5267 | | * NULL on failure. |
5268 | | */ |
5269 | 0 | cram_fd *cram_open(const char *filename, const char *mode) { |
5270 | 0 | hFILE *fp; |
5271 | 0 | cram_fd *fd; |
5272 | 0 | char fmode[3]= { mode[0], '\0', '\0' }; |
5273 | |
|
5274 | 0 | if (strlen(mode) > 1 && (mode[1] == 'b' || mode[1] == 'c')) { |
5275 | 0 | fmode[1] = 'b'; |
5276 | 0 | } |
5277 | |
|
5278 | 0 | fp = hopen(filename, fmode); |
5279 | 0 | if (!fp) |
5280 | 0 | return NULL; |
5281 | | |
5282 | 0 | fd = cram_dopen(fp, filename, mode); |
5283 | 0 | if (!fd) |
5284 | 0 | hclose_abruptly(fp); |
5285 | |
|
5286 | 0 | return fd; |
5287 | 0 | } |
5288 | | |
5289 | | /* Opens an existing stream for reading or writing. |
5290 | | * |
5291 | | * Returns file handle on success; |
5292 | | * NULL on failure. |
5293 | | */ |
5294 | 18.6k | cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) { |
5295 | 18.6k | int i; |
5296 | 18.6k | char *cp; |
5297 | 18.6k | cram_fd *fd = calloc(1, sizeof(*fd)); |
5298 | 18.6k | if (!fd) |
5299 | 0 | return NULL; |
5300 | | |
5301 | 18.6k | fd->level = CRAM_DEFAULT_LEVEL; |
5302 | 56.0k | for (i = 0; mode[i]; i++) { |
5303 | 37.3k | if (mode[i] >= '0' && mode[i] <= '9') { |
5304 | 0 | fd->level = mode[i] - '0'; |
5305 | 0 | break; |
5306 | 0 | } |
5307 | 37.3k | } |
5308 | | |
5309 | 18.6k | fd->fp = fp; |
5310 | 18.6k | fd->mode = *mode; |
5311 | 18.6k | fd->first_container = 0; |
5312 | 18.6k | fd->curr_position = 0; |
5313 | | |
5314 | 18.6k | if (fd->mode == 'r') { |
5315 | | /* Reader */ |
5316 | | |
5317 | 9.29k | if (!(fd->file_def = cram_read_file_def(fd))) |
5318 | 3 | goto err; |
5319 | | |
5320 | 9.29k | fd->version = fd->file_def->major_version * 256 + |
5321 | 9.29k | fd->file_def->minor_version; |
5322 | | |
5323 | 9.29k | cram_init_tables(fd); |
5324 | | |
5325 | 9.29k | if (!(fd->header = cram_read_SAM_hdr(fd))) { |
5326 | 2.05k | cram_free_file_def(fd->file_def); |
5327 | 2.05k | goto err; |
5328 | 2.05k | } |
5329 | | |
5330 | 9.37k | } else { |
5331 | | /* Writer */ |
5332 | 9.37k | cram_file_def *def = calloc(1, sizeof(*def)); |
5333 | 9.37k | if (!def) |
5334 | 0 | return NULL; |
5335 | | |
5336 | 9.37k | fd->file_def = def; |
5337 | | |
5338 | 9.37k | def->magic[0] = 'C'; |
5339 | 9.37k | def->magic[1] = 'R'; |
5340 | 9.37k | def->magic[2] = 'A'; |
5341 | 9.37k | def->magic[3] = 'M'; |
5342 | 9.37k | def->major_version = 0; // Indicator to write file def later. |
5343 | 9.37k | def->minor_version = 0; |
5344 | 9.37k | memset(def->file_id, 0, 20); |
5345 | 9.37k | strncpy(def->file_id, filename, 20); |
5346 | | |
5347 | 9.37k | fd->version = major_version * 256 + minor_version; |
5348 | 9.37k | cram_init_tables(fd); |
5349 | | |
5350 | | /* SAM header written later along with this file_def */ |
5351 | 9.37k | } |
5352 | | |
5353 | 16.6k | fd->prefix = strdup((cp = strrchr(filename, '/')) ? cp+1 : filename); |
5354 | 16.6k | if (!fd->prefix) |
5355 | 0 | goto err; |
5356 | 16.6k | fd->first_base = fd->last_base = -1; |
5357 | 16.6k | fd->record_counter = 0; |
5358 | | |
5359 | 16.6k | fd->ctr = NULL; |
5360 | 16.6k | fd->ctr_mt = NULL; |
5361 | 16.6k | fd->refs = refs_create(); |
5362 | 16.6k | if (!fd->refs) |
5363 | 0 | goto err; |
5364 | 16.6k | fd->ref_id = -2; |
5365 | 16.6k | fd->ref = NULL; |
5366 | | |
5367 | 16.6k | fd->decode_md = 0; |
5368 | 16.6k | fd->seqs_per_slice = SEQS_PER_SLICE; |
5369 | 16.6k | fd->bases_per_slice = BASES_PER_SLICE; |
5370 | 16.6k | fd->slices_per_container = SLICE_PER_CNT; |
5371 | 16.6k | fd->embed_ref = -1; // automatic selection |
5372 | 16.6k | fd->no_ref = 0; |
5373 | 16.6k | fd->no_ref_counter = 0; |
5374 | 16.6k | fd->ap_delta = 0; |
5375 | 16.6k | fd->ignore_md5 = 0; |
5376 | 16.6k | fd->lossy_read_names = 0; |
5377 | 16.6k | fd->use_bz2 = 0; |
5378 | 16.6k | fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3); |
5379 | 16.6k | fd->use_tok = (CRAM_MAJOR_VERS(fd->version) >= 3) && (CRAM_MINOR_VERS(fd->version) >= 1); |
5380 | 16.6k | fd->use_lzma = 0; |
5381 | 16.6k | fd->multi_seq = -1; |
5382 | 16.6k | fd->multi_seq_user = -1; |
5383 | 16.6k | fd->unsorted = 0; |
5384 | 16.6k | fd->shared_ref = 0; |
5385 | 16.6k | fd->store_md = 0; |
5386 | 16.6k | fd->store_nm = 0; |
5387 | 16.6k | fd->last_RI_count = 0; |
5388 | | |
5389 | 16.6k | fd->index = NULL; |
5390 | 16.6k | fd->own_pool = 0; |
5391 | 16.6k | fd->pool = NULL; |
5392 | 16.6k | fd->rqueue = NULL; |
5393 | 16.6k | fd->job_pending = NULL; |
5394 | 16.6k | fd->ooc = 0; |
5395 | 16.6k | fd->required_fields = INT_MAX; |
5396 | | |
5397 | 16.6k | pthread_mutex_init(&fd->metrics_lock, NULL); |
5398 | 16.6k | pthread_mutex_init(&fd->ref_lock, NULL); |
5399 | 16.6k | pthread_mutex_init(&fd->range_lock, NULL); |
5400 | 16.6k | pthread_mutex_init(&fd->bam_list_lock, NULL); |
5401 | | |
5402 | 797k | for (i = 0; i < DS_END; i++) { |
5403 | 780k | fd->m[i] = cram_new_metrics(); |
5404 | 780k | if (!fd->m[i]) |
5405 | 0 | goto err; |
5406 | 780k | } |
5407 | | |
5408 | 16.6k | if (!(fd->tags_used = kh_init(m_metrics))) |
5409 | 0 | goto err; |
5410 | | |
5411 | 16.6k | fd->range.refid = -2; // no ref. |
5412 | 16.6k | fd->eof = 1; // See samtools issue #150 |
5413 | 16.6k | fd->ref_fn = NULL; |
5414 | | |
5415 | 16.6k | fd->bl = NULL; |
5416 | | |
5417 | | /* Initialise dummy refs from the @SQ headers */ |
5418 | 16.6k | if (-1 == refs_from_header(fd)) |
5419 | 0 | goto err; |
5420 | | |
5421 | 16.6k | return fd; |
5422 | | |
5423 | 2.06k | err: |
5424 | 2.06k | if (fd) |
5425 | 2.06k | free(fd); |
5426 | | |
5427 | 2.06k | return NULL; |
5428 | 16.6k | } |
5429 | | |
5430 | | /* |
5431 | | * Seek within a CRAM file. |
5432 | | * |
5433 | | * Returns 0 on success |
5434 | | * -1 on failure |
5435 | | */ |
5436 | 0 | int cram_seek(cram_fd *fd, off_t offset, int whence) { |
5437 | 0 | fd->ooc = 0; |
5438 | |
|
5439 | 0 | cram_drain_rqueue(fd); |
5440 | |
|
5441 | 0 | return hseek(fd->fp, offset, whence) >= 0 ? 0 : -1; |
5442 | 0 | } |
5443 | | |
5444 | | /* |
5445 | | * Flushes a CRAM file. |
5446 | | * Useful for when writing to stdout without wishing to close the stream. |
5447 | | * |
5448 | | * Returns 0 on success |
5449 | | * -1 on failure |
5450 | | */ |
5451 | 0 | int cram_flush(cram_fd *fd) { |
5452 | 0 | if (!fd) |
5453 | 0 | return -1; |
5454 | | |
5455 | 0 | int ret = 0; |
5456 | |
|
5457 | 0 | if (fd->mode == 'w' && fd->ctr) { |
5458 | 0 | if(fd->ctr->slice) |
5459 | 0 | cram_update_curr_slice(fd->ctr, fd->version); |
5460 | |
|
5461 | 0 | if (-1 == cram_flush_container_mt(fd, fd->ctr)) |
5462 | 0 | ret = -1; |
5463 | |
|
5464 | 0 | cram_free_container(fd->ctr); |
5465 | 0 | if (fd->ctr_mt == fd->ctr) |
5466 | 0 | fd->ctr_mt = NULL; |
5467 | 0 | fd->ctr = NULL; |
5468 | 0 | } |
5469 | |
|
5470 | 0 | return ret; |
5471 | 0 | } |
5472 | | |
5473 | | /* |
5474 | | * Writes an EOF block to a CRAM file. |
5475 | | * |
5476 | | * Returns 0 on success |
5477 | | * -1 on failure |
5478 | | */ |
5479 | 8.90k | int cram_write_eof_block(cram_fd *fd) { |
5480 | | // EOF block is a container with special values to aid detection |
5481 | 8.90k | if (CRAM_MAJOR_VERS(fd->version) >= 2) { |
5482 | | // Empty container with |
5483 | | // ref_seq_id -1 |
5484 | | // start pos 0x454f46 ("EOF") |
5485 | | // span 0 |
5486 | | // nrec 0 |
5487 | | // counter 0 |
5488 | | // nbases 0 |
5489 | | // 1 block (landmark 0) |
5490 | | // (CRC32) |
5491 | 8.90k | cram_container c; |
5492 | 8.90k | memset(&c, 0, sizeof(c)); |
5493 | 8.90k | c.ref_seq_id = -1; |
5494 | 8.90k | c.ref_seq_start = 0x454f46; // "EOF" |
5495 | 8.90k | c.ref_seq_span = 0; |
5496 | 8.90k | c.record_counter = 0; |
5497 | 8.90k | c.num_bases = 0; |
5498 | 8.90k | c.num_blocks = 1; |
5499 | 8.90k | int32_t land[1] = {0}; |
5500 | 8.90k | c.landmark = land; |
5501 | | |
5502 | | // An empty compression header block with |
5503 | | // method raw (0) |
5504 | | // type comp header (1) |
5505 | | // content id 0 |
5506 | | // block contents size 6 |
5507 | | // raw size 6 |
5508 | | // empty preservation map (01 00) |
5509 | | // empty data series map (01 00) |
5510 | | // empty tag map (01 00) |
5511 | | // block CRC |
5512 | 8.90k | cram_block_compression_hdr ch; |
5513 | 8.90k | memset(&ch, 0, sizeof(ch)); |
5514 | 8.90k | c.comp_hdr_block = cram_encode_compression_header(fd, &c, &ch, 0); |
5515 | | |
5516 | 8.90k | c.length = c.comp_hdr_block->byte // Landmark[0] |
5517 | 8.90k | + 5 // block struct |
5518 | 8.90k | + 4*(CRAM_MAJOR_VERS(fd->version) >= 3); // CRC |
5519 | 8.90k | if (cram_write_container(fd, &c) < 0 || |
5520 | 8.90k | cram_write_block(fd, c.comp_hdr_block) < 0) { |
5521 | 0 | cram_close(fd); |
5522 | 0 | cram_free_block(c.comp_hdr_block); |
5523 | 0 | return -1; |
5524 | 0 | } |
5525 | 8.90k | if (ch.preservation_map) |
5526 | 8.90k | kh_destroy(map, ch.preservation_map); |
5527 | 8.90k | cram_free_block(c.comp_hdr_block); |
5528 | | |
5529 | | // V2.1 bytes |
5530 | | // 0b 00 00 00 ff ff ff ff 0f // Cont HDR: size, ref seq id |
5531 | | // e0 45 4f 46 00 00 00 // Cont HDR: pos, span, nrec, counter |
5532 | | // 00 01 00 // Cont HDR: nbase, nblk, landmark |
5533 | | // 00 01 00 06 06 // Comp.HDR blk |
5534 | | // 01 00 01 00 01 00 // Comp.HDR blk |
5535 | | |
5536 | | // V3.0 bytes: |
5537 | | // 0f 00 00 00 ff ff ff ff 0f // Cont HDR: size, ref seq id |
5538 | | // e0 45 4f 46 00 00 00 // Cont HDR: pos, span, nrec, counter |
5539 | | // 00 01 00 // Cont HDR: nbase, nblk, landmark |
5540 | | // 05 bd d9 4f // CRC32 |
5541 | | // 00 01 00 06 06 // Comp.HDR blk |
5542 | | // 01 00 01 00 01 00 // Comp.HDR blk |
5543 | | // ee 63 01 4b // CRC32 |
5544 | | |
5545 | | // V4.0 bytes: |
5546 | | // 0f 00 00 00 8f ff ff ff // Cont HDR: size, ref seq id |
5547 | | // 82 95 9e 46 00 00 00 // Cont HDR: pos, span, nrec, counter |
5548 | | // 00 01 00 // Cont HDR: nbase, nblk, landmark |
5549 | | // ac d6 05 bc // CRC32 |
5550 | | // 00 01 00 06 06 // Comp.HDR blk |
5551 | | // 01 00 01 00 01 00 // Comp.HDR blk |
5552 | | // ee 63 01 4b // CRC32 |
5553 | 8.90k | } |
5554 | | |
5555 | 8.90k | return 0; |
5556 | 8.90k | } |
5557 | | |
5558 | | /* |
5559 | | * Closes a CRAM file. |
5560 | | * Returns 0 on success |
5561 | | * -1 on failure |
5562 | | */ |
5563 | 16.6k | int cram_close(cram_fd *fd) { |
5564 | 16.6k | spare_bams *bl, *next; |
5565 | 16.6k | int i, ret = 0; |
5566 | | |
5567 | 16.6k | if (!fd) |
5568 | 0 | return -1; |
5569 | | |
5570 | 16.6k | if (fd->mode == 'w' && fd->ctr) { |
5571 | 4.38k | if(fd->ctr->slice) |
5572 | 4.38k | cram_update_curr_slice(fd->ctr, fd->version); |
5573 | | |
5574 | 4.38k | if (-1 == cram_flush_container_mt(fd, fd->ctr)) |
5575 | 472 | ret = -1; |
5576 | 4.38k | } |
5577 | | |
5578 | 16.6k | if (fd->mode != 'w') |
5579 | 7.23k | cram_drain_rqueue(fd); |
5580 | | |
5581 | 16.6k | if (fd->pool && fd->eof >= 0 && fd->rqueue) { |
5582 | 0 | hts_tpool_process_flush(fd->rqueue); |
5583 | |
|
5584 | 0 | if (0 != cram_flush_result(fd)) |
5585 | 0 | ret = -1; |
5586 | |
|
5587 | 0 | if (fd->mode == 'w') |
5588 | 0 | fd->ctr = NULL; // prevent double freeing |
5589 | | |
5590 | | //fprintf(stderr, "CRAM: destroy queue %p\n", fd->rqueue); |
5591 | |
|
5592 | 0 | hts_tpool_process_destroy(fd->rqueue); |
5593 | 0 | } |
5594 | | |
5595 | 16.6k | pthread_mutex_destroy(&fd->metrics_lock); |
5596 | 16.6k | pthread_mutex_destroy(&fd->ref_lock); |
5597 | 16.6k | pthread_mutex_destroy(&fd->range_lock); |
5598 | 16.6k | pthread_mutex_destroy(&fd->bam_list_lock); |
5599 | | |
5600 | 16.6k | if (ret == 0 && fd->mode == 'w') { |
5601 | | /* Write EOF block */ |
5602 | 8.90k | if (0 != cram_write_eof_block(fd)) |
5603 | 0 | ret = -1; |
5604 | 8.90k | } |
5605 | | |
5606 | 20.6k | for (bl = fd->bl; bl; bl = next) { |
5607 | 4.03k | int max_rec = fd->seqs_per_slice * fd->slices_per_container; |
5608 | | |
5609 | 4.03k | next = bl->next; |
5610 | 4.03k | free_bam_list(bl->bams, max_rec); |
5611 | 4.03k | free(bl); |
5612 | 4.03k | } |
5613 | | |
5614 | 16.6k | if (hclose(fd->fp) != 0) |
5615 | 0 | ret = -1; |
5616 | | |
5617 | 16.6k | if (fd->file_def) |
5618 | 16.6k | cram_free_file_def(fd->file_def); |
5619 | | |
5620 | 16.6k | if (fd->header) |
5621 | 16.2k | sam_hdr_destroy(fd->header); |
5622 | | |
5623 | 16.6k | free(fd->prefix); |
5624 | | |
5625 | 16.6k | if (fd->ctr) |
5626 | 9.12k | cram_free_container(fd->ctr); |
5627 | | |
5628 | 16.6k | if (fd->ctr_mt && fd->ctr_mt != fd->ctr) |
5629 | 69 | cram_free_container(fd->ctr_mt); |
5630 | | |
5631 | 16.6k | if (fd->refs) |
5632 | 16.6k | refs_free(fd->refs); |
5633 | 16.6k | if (fd->ref_free) |
5634 | 0 | free(fd->ref_free); |
5635 | | |
5636 | 797k | for (i = 0; i < DS_END; i++) |
5637 | 780k | if (fd->m[i]) |
5638 | 780k | free(fd->m[i]); |
5639 | | |
5640 | 16.6k | if (fd->tags_used) { |
5641 | 16.6k | khint_t k; |
5642 | | |
5643 | 28.5k | for (k = kh_begin(fd->tags_used); k != kh_end(fd->tags_used); k++) { |
5644 | 11.9k | if (kh_exist(fd->tags_used, k)) |
5645 | 5.50k | free(kh_val(fd->tags_used, k)); |
5646 | 11.9k | } |
5647 | | |
5648 | 16.6k | kh_destroy(m_metrics, fd->tags_used); |
5649 | 16.6k | } |
5650 | | |
5651 | 16.6k | if (fd->index) |
5652 | 0 | cram_index_free(fd); |
5653 | | |
5654 | 16.6k | if (fd->own_pool && fd->pool) |
5655 | 0 | hts_tpool_destroy(fd->pool); |
5656 | | |
5657 | 16.6k | if (fd->idxfp) |
5658 | 0 | if (bgzf_close(fd->idxfp) < 0) |
5659 | 0 | ret = -1; |
5660 | | |
5661 | 16.6k | free(fd); |
5662 | | |
5663 | 16.6k | return ret; |
5664 | 16.6k | } |
5665 | | |
5666 | | /* |
5667 | | * Returns 1 if we hit an EOF while reading. |
5668 | | */ |
5669 | 12.5k | int cram_eof(cram_fd *fd) { |
5670 | 12.5k | return fd->eof; |
5671 | 12.5k | } |
5672 | | |
5673 | | |
5674 | | /* |
5675 | | * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h. |
5676 | | * Use this immediately after opening. |
5677 | | * |
5678 | | * Returns 0 on success |
5679 | | * -1 on failure |
5680 | | */ |
5681 | 7.23k | int cram_set_option(cram_fd *fd, enum hts_fmt_option opt, ...) { |
5682 | 7.23k | int r; |
5683 | 7.23k | va_list args; |
5684 | | |
5685 | 7.23k | va_start(args, opt); |
5686 | 7.23k | r = cram_set_voption(fd, opt, args); |
5687 | 7.23k | va_end(args); |
5688 | | |
5689 | 7.23k | return r; |
5690 | 7.23k | } |
5691 | | |
5692 | | /* |
5693 | | * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h. |
5694 | | * Use this immediately after opening. |
5695 | | * |
5696 | | * Returns 0 on success |
5697 | | * -1 on failure |
5698 | | */ |
5699 | 7.23k | int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) { |
5700 | 7.23k | refs_t *refs; |
5701 | | |
5702 | 7.23k | if (!fd) { |
5703 | 0 | errno = EBADF; |
5704 | 0 | return -1; |
5705 | 0 | } |
5706 | | |
5707 | 7.23k | switch (opt) { |
5708 | 7.23k | case CRAM_OPT_DECODE_MD: |
5709 | 7.23k | fd->decode_md = va_arg(args, int); |
5710 | 7.23k | break; |
5711 | | |
5712 | 0 | case CRAM_OPT_PREFIX: |
5713 | 0 | if (fd->prefix) |
5714 | 0 | free(fd->prefix); |
5715 | 0 | if (!(fd->prefix = strdup(va_arg(args, char *)))) |
5716 | 0 | return -1; |
5717 | 0 | break; |
5718 | | |
5719 | 0 | case CRAM_OPT_VERBOSITY: |
5720 | 0 | break; |
5721 | | |
5722 | 0 | case CRAM_OPT_SEQS_PER_SLICE: |
5723 | 0 | fd->seqs_per_slice = va_arg(args, int); |
5724 | 0 | if (fd->bases_per_slice == BASES_PER_SLICE) |
5725 | 0 | fd->bases_per_slice = fd->seqs_per_slice * 500; |
5726 | 0 | break; |
5727 | | |
5728 | 0 | case CRAM_OPT_BASES_PER_SLICE: |
5729 | 0 | fd->bases_per_slice = va_arg(args, int); |
5730 | 0 | break; |
5731 | | |
5732 | 0 | case CRAM_OPT_SLICES_PER_CONTAINER: |
5733 | 0 | fd->slices_per_container = va_arg(args, int); |
5734 | 0 | break; |
5735 | | |
5736 | 0 | case CRAM_OPT_EMBED_REF: |
5737 | 0 | fd->embed_ref = va_arg(args, int); |
5738 | 0 | break; |
5739 | | |
5740 | 0 | case CRAM_OPT_NO_REF: |
5741 | 0 | fd->no_ref = va_arg(args, int); |
5742 | 0 | break; |
5743 | | |
5744 | 0 | case CRAM_OPT_POS_DELTA: |
5745 | 0 | fd->ap_delta = va_arg(args, int); |
5746 | 0 | break; |
5747 | | |
5748 | 0 | case CRAM_OPT_IGNORE_MD5: |
5749 | 0 | fd->ignore_md5 = va_arg(args, int); |
5750 | 0 | break; |
5751 | | |
5752 | 0 | case CRAM_OPT_LOSSY_NAMES: |
5753 | 0 | fd->lossy_read_names = va_arg(args, int); |
5754 | | // Currently lossy read names required paired (attached) reads. |
5755 | | // TLEN 0 or being 1 out causes read pairs to be detached, breaking |
5756 | | // the lossy read name compression, so we have extra options to |
5757 | | // slacken the exact TLEN round-trip checks. |
5758 | 0 | fd->tlen_approx = fd->lossy_read_names; |
5759 | 0 | fd->tlen_zero = fd->lossy_read_names; |
5760 | 0 | break; |
5761 | | |
5762 | 0 | case CRAM_OPT_USE_BZIP2: |
5763 | 0 | fd->use_bz2 = va_arg(args, int); |
5764 | 0 | break; |
5765 | | |
5766 | 0 | case CRAM_OPT_USE_RANS: |
5767 | 0 | fd->use_rans = va_arg(args, int); |
5768 | 0 | break; |
5769 | | |
5770 | 0 | case CRAM_OPT_USE_TOK: |
5771 | 0 | fd->use_tok = va_arg(args, int); |
5772 | 0 | break; |
5773 | | |
5774 | 0 | case CRAM_OPT_USE_FQZ: |
5775 | 0 | fd->use_fqz = va_arg(args, int); |
5776 | 0 | break; |
5777 | | |
5778 | 0 | case CRAM_OPT_USE_ARITH: |
5779 | 0 | fd->use_arith = va_arg(args, int); |
5780 | 0 | break; |
5781 | | |
5782 | 0 | case CRAM_OPT_USE_LZMA: |
5783 | 0 | fd->use_lzma = va_arg(args, int); |
5784 | 0 | break; |
5785 | | |
5786 | 0 | case CRAM_OPT_SHARED_REF: |
5787 | 0 | fd->shared_ref = 1; |
5788 | 0 | refs = va_arg(args, refs_t *); |
5789 | 0 | if (refs != fd->refs) { |
5790 | 0 | if (fd->refs) |
5791 | 0 | refs_free(fd->refs); |
5792 | 0 | fd->refs = refs; |
5793 | 0 | fd->refs->count++; |
5794 | 0 | } |
5795 | 0 | break; |
5796 | | |
5797 | 0 | case CRAM_OPT_RANGE: { |
5798 | 0 | int r = cram_seek_to_refpos(fd, va_arg(args, cram_range *)); |
5799 | 0 | pthread_mutex_lock(&fd->range_lock); |
5800 | | // printf("opt range noseek to %p %d:%ld-%ld\n", |
5801 | | // fd, fd->range.refid, fd->range.start, fd->range.end); |
5802 | 0 | if (fd->range.refid != -2) |
5803 | 0 | fd->required_fields |= SAM_POS; |
5804 | 0 | pthread_mutex_unlock(&fd->range_lock); |
5805 | 0 | return r; |
5806 | 0 | } |
5807 | | |
5808 | 0 | case CRAM_OPT_RANGE_NOSEEK: { |
5809 | | // As per CRAM_OPT_RANGE, but no seeking |
5810 | 0 | pthread_mutex_lock(&fd->range_lock); |
5811 | 0 | cram_range *r = va_arg(args, cram_range *); |
5812 | 0 | fd->range = *r; |
5813 | 0 | if (r->refid == HTS_IDX_NOCOOR) { |
5814 | 0 | fd->range.refid = -1; |
5815 | 0 | fd->range.start = 0; |
5816 | 0 | } else if (r->refid == HTS_IDX_START || r->refid == HTS_IDX_REST) { |
5817 | 0 | fd->range.refid = -2; // special case in cram_next_slice |
5818 | 0 | } |
5819 | 0 | if (fd->range.refid != -2) |
5820 | 0 | fd->required_fields |= SAM_POS; |
5821 | 0 | fd->ooc = 0; |
5822 | 0 | fd->eof = 0; |
5823 | 0 | pthread_mutex_unlock(&fd->range_lock); |
5824 | 0 | return 0; |
5825 | 0 | } |
5826 | | |
5827 | 0 | case CRAM_OPT_REFERENCE: |
5828 | 0 | return cram_load_reference(fd, va_arg(args, char *)); |
5829 | | |
5830 | 0 | case CRAM_OPT_VERSION: { |
5831 | 0 | int major, minor; |
5832 | 0 | char *s = va_arg(args, char *); |
5833 | 0 | if (2 != sscanf(s, "%d.%d", &major, &minor)) { |
5834 | 0 | hts_log_error("Malformed version string %s", s); |
5835 | 0 | return -1; |
5836 | 0 | } |
5837 | 0 | if (!((major == 1 && minor == 0) || |
5838 | 0 | (major == 2 && (minor == 0 || minor == 1)) || |
5839 | 0 | (major == 3 && (minor == 0 || minor == 1)) || |
5840 | 0 | (major == 4 && minor == 0))) { |
5841 | 0 | hts_log_error("Unknown version string; use 1.0, 2.0, 2.1, 3.0, 3.1 or 4.0"); |
5842 | 0 | errno = EINVAL; |
5843 | 0 | return -1; |
5844 | 0 | } |
5845 | | |
5846 | 0 | if (major > 3 || (major == 3 && minor > 1)) { |
5847 | 0 | hts_log_warning( |
5848 | 0 | "CRAM version %s is still a draft and subject to change.\n" |
5849 | 0 | "This is a technology demonstration that should not be " |
5850 | 0 | "used for archival data.", s); |
5851 | 0 | } |
5852 | |
|
5853 | 0 | fd->version = major*256 + minor; |
5854 | |
|
5855 | 0 | fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3) ? 1 : 0; |
5856 | |
|
5857 | 0 | fd->use_tok = ((CRAM_MAJOR_VERS(fd->version) == 3 && |
5858 | 0 | CRAM_MINOR_VERS(fd->version) >= 1) || |
5859 | 0 | CRAM_MAJOR_VERS(fd->version) >= 4) ? 1 : 0; |
5860 | 0 | cram_init_tables(fd); |
5861 | |
|
5862 | 0 | break; |
5863 | 0 | } |
5864 | | |
5865 | 0 | case CRAM_OPT_MULTI_SEQ_PER_SLICE: |
5866 | 0 | fd->multi_seq_user = fd->multi_seq = va_arg(args, int); |
5867 | 0 | break; |
5868 | | |
5869 | 0 | case CRAM_OPT_NTHREADS: { |
5870 | 0 | int nthreads = va_arg(args, int); |
5871 | 0 | if (nthreads >= 1) { |
5872 | 0 | if (!(fd->pool = hts_tpool_init(nthreads))) |
5873 | 0 | return -1; |
5874 | | |
5875 | 0 | fd->rqueue = hts_tpool_process_init(fd->pool, nthreads*2, 0); |
5876 | 0 | fd->shared_ref = 1; |
5877 | 0 | fd->own_pool = 1; |
5878 | 0 | } |
5879 | 0 | break; |
5880 | 0 | } |
5881 | | |
5882 | 0 | case CRAM_OPT_THREAD_POOL: { |
5883 | 0 | htsThreadPool *p = va_arg(args, htsThreadPool *); |
5884 | 0 | fd->pool = p ? p->pool : NULL; |
5885 | 0 | if (fd->pool) { |
5886 | 0 | fd->rqueue = hts_tpool_process_init(fd->pool, |
5887 | 0 | p->qsize ? p->qsize : hts_tpool_size(fd->pool)*2, |
5888 | 0 | 0); |
5889 | 0 | } |
5890 | 0 | fd->shared_ref = 1; // Needed to avoid clobbering ref between threads |
5891 | 0 | fd->own_pool = 0; |
5892 | | |
5893 | | //fd->qsize = 1; |
5894 | | //fd->decoded = calloc(fd->qsize, sizeof(cram_container *)); |
5895 | | //hts_tpool_dispatch(fd->pool, cram_decoder_thread, fd); |
5896 | 0 | break; |
5897 | 0 | } |
5898 | | |
5899 | 0 | case CRAM_OPT_REQUIRED_FIELDS: |
5900 | 0 | fd->required_fields = va_arg(args, int); |
5901 | 0 | if (fd->range.refid != -2) |
5902 | 0 | fd->required_fields |= SAM_POS; |
5903 | 0 | break; |
5904 | | |
5905 | 0 | case CRAM_OPT_STORE_MD: |
5906 | 0 | fd->store_md = va_arg(args, int); |
5907 | 0 | break; |
5908 | | |
5909 | 0 | case CRAM_OPT_STORE_NM: |
5910 | 0 | fd->store_nm = va_arg(args, int); |
5911 | 0 | break; |
5912 | | |
5913 | 0 | case HTS_OPT_COMPRESSION_LEVEL: |
5914 | 0 | fd->level = va_arg(args, int); |
5915 | 0 | break; |
5916 | | |
5917 | 0 | case HTS_OPT_PROFILE: { |
5918 | 0 | enum hts_profile_option prof = va_arg(args, int); |
5919 | 0 | switch (prof) { |
5920 | 0 | case HTS_PROFILE_FAST: |
5921 | 0 | if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 1; |
5922 | 0 | fd->use_tok = 0; |
5923 | 0 | fd->seqs_per_slice = 10000; |
5924 | 0 | break; |
5925 | | |
5926 | 0 | case HTS_PROFILE_NORMAL: |
5927 | 0 | break; |
5928 | | |
5929 | 0 | case HTS_PROFILE_SMALL: |
5930 | 0 | if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 6; |
5931 | 0 | fd->use_bz2 = 1; |
5932 | 0 | fd->use_fqz = 1; |
5933 | 0 | fd->seqs_per_slice = 25000; |
5934 | 0 | break; |
5935 | | |
5936 | 0 | case HTS_PROFILE_ARCHIVE: |
5937 | 0 | if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 7; |
5938 | 0 | fd->use_bz2 = 1; |
5939 | 0 | fd->use_fqz = 1; |
5940 | 0 | fd->use_arith = 1; |
5941 | 0 | if (fd->level > 7) |
5942 | 0 | fd->use_lzma = 1; |
5943 | 0 | fd->seqs_per_slice = 100000; |
5944 | 0 | break; |
5945 | 0 | } |
5946 | | |
5947 | 0 | if (fd->bases_per_slice == BASES_PER_SLICE) |
5948 | 0 | fd->bases_per_slice = fd->seqs_per_slice * 500; |
5949 | 0 | break; |
5950 | 0 | } |
5951 | | |
5952 | 0 | default: |
5953 | 0 | hts_log_error("Unknown CRAM option code %d", opt); |
5954 | 0 | errno = EINVAL; |
5955 | 0 | return -1; |
5956 | 7.23k | } |
5957 | | |
5958 | 7.23k | return 0; |
5959 | 7.23k | } |
5960 | | |
5961 | | int cram_check_EOF(cram_fd *fd) |
5962 | 0 | { |
5963 | | // Byte 9 in these templates is & with 0x0f to resolve differences |
5964 | | // between ITF-8 interpretations between early Java and C |
5965 | | // implementations of CRAM |
5966 | 0 | static const unsigned char TEMPLATE_2_1[30] = { |
5967 | 0 | 0x0b, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0f, 0xe0, |
5968 | 0 | 0x45, 0x4f, 0x46, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, |
5969 | 0 | 0x01, 0x00, 0x06, 0x06, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00 |
5970 | 0 | }; |
5971 | 0 | static const unsigned char TEMPLATE_3[38] = { |
5972 | 0 | 0x0f, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0f, 0xe0, |
5973 | 0 | 0x45, 0x4f, 0x46, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x05, |
5974 | 0 | 0xbd, 0xd9, 0x4f, 0x00, 0x01, 0x00, 0x06, 0x06, 0x01, 0x00, |
5975 | 0 | 0x01, 0x00, 0x01, 0x00, 0xee, 0x63, 0x01, 0x4b |
5976 | 0 | }; |
5977 | |
|
5978 | 0 | unsigned char buf[38]; // max(sizeof TEMPLATE_*) |
5979 | |
|
5980 | 0 | uint8_t major = CRAM_MAJOR_VERS(fd->version); |
5981 | 0 | uint8_t minor = CRAM_MINOR_VERS(fd->version); |
5982 | |
|
5983 | 0 | const unsigned char *template; |
5984 | 0 | ssize_t template_len; |
5985 | 0 | if ((major < 2) || |
5986 | 0 | (major == 2 && minor == 0)) { |
5987 | 0 | return 3; // No EOF support in cram versions less than 2.1 |
5988 | 0 | } else if (major == 2 && minor == 1) { |
5989 | 0 | template = TEMPLATE_2_1; |
5990 | 0 | template_len = sizeof TEMPLATE_2_1; |
5991 | 0 | } else { |
5992 | 0 | template = TEMPLATE_3; |
5993 | 0 | template_len = sizeof TEMPLATE_3; |
5994 | 0 | } |
5995 | | |
5996 | 0 | off_t offset = htell(fd->fp); |
5997 | 0 | if (hseek(fd->fp, -template_len, SEEK_END) < 0) { |
5998 | 0 | if (errno == ESPIPE) { |
5999 | 0 | hclearerr(fd->fp); |
6000 | 0 | return 2; |
6001 | 0 | } |
6002 | 0 | else { |
6003 | 0 | return -1; |
6004 | 0 | } |
6005 | 0 | } |
6006 | 0 | if (hread(fd->fp, buf, template_len) != template_len) return -1; |
6007 | 0 | if (hseek(fd->fp, offset, SEEK_SET) < 0) return -1; |
6008 | 0 | buf[8] &= 0x0f; |
6009 | 0 | return (memcmp(template, buf, template_len) == 0)? 1 : 0; |
6010 | 0 | } |