Coverage Report

Created: 2025-11-03 07:25

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/cram/cram_io.c
Line
Count
Source
1
/*
2
Copyright (c) 2012-2025 Genome Research Ltd.
3
Author: James Bonfield <jkb@sanger.ac.uk>
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
   1. Redistributions of source code must retain the above copyright notice,
9
this list of conditions and the following disclaimer.
10
11
   2. Redistributions in binary form must reproduce the above copyright notice,
12
this list of conditions and the following disclaimer in the documentation
13
and/or other materials provided with the distribution.
14
15
   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16
Institute nor the names of its contributors may be used to endorse or promote
17
products derived from this software without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
/*
32
 * CRAM I/O primitives.
33
 *
34
 * - ITF8 encoding and decoding.
35
 * - Block based I/O
36
 * - Zlib inflating and deflating (memory)
37
 * - CRAM basic data structure reading and writing
38
 * - File opening / closing
39
 * - Reference sequence handling
40
 */
41
42
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
43
#include <config.h>
44
45
#include <stdio.h>
46
#include <errno.h>
47
#include <assert.h>
48
#include <stdlib.h>
49
#include <string.h>
50
#include <unistd.h>
51
#include <zlib.h>
52
#ifdef HAVE_LIBBZ2
53
#include <bzlib.h>
54
#endif
55
#ifdef HAVE_LIBLZMA
56
#ifdef HAVE_LZMA_H
57
#include <lzma.h>
58
#else
59
#include "../os/lzma_stub.h"
60
#endif
61
#endif
62
#include <sys/types.h>
63
#include <sys/stat.h>
64
#include <math.h>
65
#include <stdint.h>
66
67
#ifdef HAVE_LIBDEFLATE
68
#include <libdeflate.h>
69
#define crc32(a,b,c) libdeflate_crc32((a),(b),(c))
70
#endif
71
72
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
73
#include "../fuzz_settings.h"
74
#endif
75
76
#include "cram.h"
77
#include "os.h"
78
#include "../htslib/hts.h"
79
#include "../hts_internal.h"
80
#include "open_trace_file.h"
81
82
#if defined(HAVE_EXTERNAL_LIBHTSCODECS)
83
#include <htscodecs/rANS_static.h>
84
#include <htscodecs/rANS_static4x16.h>
85
#include <htscodecs/arith_dynamic.h>
86
#include <htscodecs/tokenise_name3.h>
87
#include <htscodecs/fqzcomp_qual.h>
88
#include <htscodecs/varint.h> // CRAM v4.0 variable-size integers
89
#else
90
#include "../htscodecs/htscodecs/rANS_static.h"
91
#include "../htscodecs/htscodecs/rANS_static4x16.h"
92
#include "../htscodecs/htscodecs/arith_dynamic.h"
93
#include "../htscodecs/htscodecs/tokenise_name3.h"
94
#include "../htscodecs/htscodecs/fqzcomp_qual.h"
95
#include "../htscodecs/htscodecs/varint.h"
96
#endif
97
98
//#define REF_DEBUG
99
100
#ifdef REF_DEBUG
101
#include <sys/syscall.h>
102
#define gettid() (int)syscall(SYS_gettid)
103
104
#define RP(...) fprintf (stderr, __VA_ARGS__)
105
#else
106
#define RP(...)
107
#endif
108
109
#include "../htslib/hfile.h"
110
#include "../htslib/bgzf.h"
111
#include "../htslib/faidx.h"
112
#include "../hts_internal.h"
113
114
#ifndef PATH_MAX
115
#define PATH_MAX FILENAME_MAX
116
#endif
117
118
1.02M
#define TRIAL_SPAN 70
119
1.02M
#define NTRIALS 3
120
121
15.7k
#define CRAM_DEFAULT_LEVEL 5
122
123
/* ----------------------------------------------------------------------
124
 * ITF8 encoding and decoding.
125
 *
126
 * Also see the itf8_get and itf8_put macros in cram_io.h
127
 */
128
129
/*
130
 * LEGACY: consider using itf8_decode_crc.
131
 *
132
 * Reads an integer in ITF-8 encoding from 'cp' and stores it in
133
 * *val.
134
 *
135
 * Returns the number of bytes read on success
136
 *        -1 on failure
137
 */
138
0
int itf8_decode(cram_fd *fd, int32_t *val_p) {
139
0
    static int nbytes[16] = {
140
0
        0,0,0,0, 0,0,0,0,                               // 0000xxxx - 0111xxxx
141
0
        1,1,1,1,                                        // 1000xxxx - 1011xxxx
142
0
        2,2,                                            // 1100xxxx - 1101xxxx
143
0
        3,                                              // 1110xxxx
144
0
        4,                                              // 1111xxxx
145
0
    };
146
147
0
    static int nbits[16] = {
148
0
        0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
149
0
        0x3f, 0x3f, 0x3f, 0x3f,                         // 1000xxxx - 1011xxxx
150
0
        0x1f, 0x1f,                                     // 1100xxxx - 1101xxxx
151
0
        0x0f,                                           // 1110xxxx
152
0
        0x0f,                                           // 1111xxxx
153
0
    };
154
155
0
    int32_t val = hgetc(fd->fp);
156
0
    if (val == -1)
157
0
        return -1;
158
159
0
    int i = nbytes[val>>4];
160
0
    val &= nbits[val>>4];
161
162
0
    switch(i) {
163
0
    case 0:
164
0
        *val_p = val;
165
0
        return 1;
166
167
0
    case 1:
168
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
169
0
        *val_p = val;
170
0
        return 2;
171
172
0
    case 2:
173
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
174
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
175
0
        *val_p = val;
176
0
        return 3;
177
178
0
    case 3:
179
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
180
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
181
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
182
0
        *val_p = val;
183
0
        return 4;
184
185
0
    case 4: // really 3.5 more, why make it different?
186
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
187
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
188
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
189
0
        val = (val<<4) | (((unsigned char)hgetc(fd->fp)) & 0x0f);
190
0
        *val_p = val;
191
0
    }
192
193
0
    return 5;
194
0
}
195
196
269k
int itf8_decode_crc(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
197
269k
    static int nbytes[16] = {
198
269k
        0,0,0,0, 0,0,0,0,                               // 0000xxxx - 0111xxxx
199
269k
        1,1,1,1,                                        // 1000xxxx - 1011xxxx
200
269k
        2,2,                                            // 1100xxxx - 1101xxxx
201
269k
        3,                                              // 1110xxxx
202
269k
        4,                                              // 1111xxxx
203
269k
    };
204
205
269k
    static int nbits[16] = {
206
269k
        0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
207
269k
        0x3f, 0x3f, 0x3f, 0x3f,                         // 1000xxxx - 1011xxxx
208
269k
        0x1f, 0x1f,                                     // 1100xxxx - 1101xxxx
209
269k
        0x0f,                                           // 1110xxxx
210
269k
        0x0f,                                           // 1111xxxx
211
269k
    };
212
269k
    unsigned char c[5];
213
214
269k
    int32_t val = hgetc(fd->fp);
215
269k
    if (val == -1)
216
287
        return -1;
217
218
269k
    c[0]=val;
219
220
269k
    int i = nbytes[val>>4];
221
269k
    val &= nbits[val>>4];
222
223
269k
    if (i > 0) {
224
19.8k
        if (hread(fd->fp, &c[1], i) < i)
225
97
            return -1;
226
19.8k
    }
227
228
269k
    switch(i) {
229
249k
    case 0:
230
249k
        *val_p = val;
231
249k
        *crc = crc32(*crc, c, 1);
232
249k
        return 1;
233
234
9.04k
    case 1:
235
9.04k
        val = (val<<8) | c[1];
236
9.04k
        *val_p = val;
237
9.04k
        *crc = crc32(*crc, c, 2);
238
9.04k
        return 2;
239
240
4.60k
    case 2:
241
4.60k
        val = (val<<8) | c[1];
242
4.60k
        val = (val<<8) | c[2];
243
4.60k
        *val_p = val;
244
4.60k
        *crc = crc32(*crc, c, 3);
245
4.60k
        return 3;
246
247
1.50k
    case 3:
248
1.50k
        val = (val<<8) | c[1];
249
1.50k
        val = (val<<8) | c[2];
250
1.50k
        val = (val<<8) | c[3];
251
1.50k
        *val_p = val;
252
1.50k
        *crc = crc32(*crc, c, 4);
253
1.50k
        return 4;
254
255
4.60k
    case 4: // really 3.5 more, why make it different?
256
4.60k
        {
257
4.60k
            uint32_t uv = val;
258
4.60k
            uv = (uv<<8) |  c[1];
259
4.60k
            uv = (uv<<8) |  c[2];
260
4.60k
            uv = (uv<<8) |  c[3];
261
4.60k
            uv = (uv<<4) | (c[4] & 0x0f);
262
            // Avoid implementation-defined behaviour on negative values
263
4.60k
            *val_p = uv < 0x80000000UL ? (int32_t) uv : -((int32_t) (0xffffffffUL - uv)) - 1;
264
4.60k
            *crc = crc32(*crc, c, 5);
265
4.60k
        }
266
269k
    }
267
268
4.60k
    return 5;
269
269k
}
270
271
/*
272
 * Stores a value to memory in ITF-8 format.
273
 *
274
 * Returns the number of bytes required to store the number.
275
 * This is a maximum of 5 bytes.
276
 */
277
23.8M
static inline int itf8_put(char *cp, int32_t val) {
278
23.8M
    unsigned char *up = (unsigned char *)cp;
279
23.8M
    if        (!(val & ~0x00000007f)) { // 1 byte
280
22.2M
        *up = val;
281
22.2M
        return 1;
282
22.2M
    } else if (!(val & ~0x00003fff)) { // 2 byte
283
355k
        *up++ = (val >> 8 ) | 0x80;
284
355k
        *up   = val & 0xff;
285
355k
        return 2;
286
1.27M
    } else if (!(val & ~0x01fffff)) { // 3 byte
287
29.4k
        *up++ = (val >> 16) | 0xc0;
288
29.4k
        *up++ = (val >> 8 ) & 0xff;
289
29.4k
        *up   = val & 0xff;
290
29.4k
        return 3;
291
1.24M
    } else if (!(val & ~0x0fffffff)) { // 4 byte
292
1.06M
        *up++ = (val >> 24) | 0xe0;
293
1.06M
        *up++ = (val >> 16) & 0xff;
294
1.06M
        *up++ = (val >> 8 ) & 0xff;
295
1.06M
        *up   = val & 0xff;
296
1.06M
        return 4;
297
1.06M
    } else {                           // 5 byte
298
188k
        *up++ = 0xf0 | ((val>>28) & 0xff);
299
188k
        *up++ = (val >> 20) & 0xff;
300
188k
        *up++ = (val >> 12) & 0xff;
301
188k
        *up++ = (val >> 4 ) & 0xff;
302
188k
        *up = val & 0x0f;
303
188k
        return 5;
304
188k
    }
305
23.8M
}
306
307
308
/* 64-bit itf8 variant */
309
140k
static inline int ltf8_put(char *cp, int64_t val) {
310
140k
    unsigned char *up = (unsigned char *)cp;
311
140k
    if        (!(val & ~((1LL<<7)-1))) {
312
101k
        *up = val;
313
101k
        return 1;
314
101k
    } else if (!(val & ~((1LL<<(6+8))-1))) {
315
38.3k
        *up++ = (val >> 8 ) | 0x80;
316
38.3k
        *up   = val & 0xff;
317
38.3k
        return 2;
318
38.3k
    } else if (!(val & ~((1LL<<(5+2*8))-1))) {
319
1.49k
        *up++ = (val >> 16) | 0xc0;
320
1.49k
        *up++ = (val >> 8 ) & 0xff;
321
1.49k
        *up   = val & 0xff;
322
1.49k
        return 3;
323
1.49k
    } else if (!(val & ~((1LL<<(4+3*8))-1))) {
324
73
        *up++ = (val >> 24) | 0xe0;
325
73
        *up++ = (val >> 16) & 0xff;
326
73
        *up++ = (val >> 8 ) & 0xff;
327
73
        *up   = val & 0xff;
328
73
        return 4;
329
73
    } else if (!(val & ~((1LL<<(3+4*8))-1))) {
330
0
        *up++ = (val >> 32) | 0xf0;
331
0
        *up++ = (val >> 24) & 0xff;
332
0
        *up++ = (val >> 16) & 0xff;
333
0
        *up++ = (val >> 8 ) & 0xff;
334
0
        *up   = val & 0xff;
335
0
        return 5;
336
0
    } else if (!(val & ~((1LL<<(2+5*8))-1))) {
337
0
        *up++ = (val >> 40) | 0xf8;
338
0
        *up++ = (val >> 32) & 0xff;
339
0
        *up++ = (val >> 24) & 0xff;
340
0
        *up++ = (val >> 16) & 0xff;
341
0
        *up++ = (val >> 8 ) & 0xff;
342
0
        *up   = val & 0xff;
343
0
        return 6;
344
0
    } else if (!(val & ~((1LL<<(1+6*8))-1))) {
345
0
        *up++ = (val >> 48) | 0xfc;
346
0
        *up++ = (val >> 40) & 0xff;
347
0
        *up++ = (val >> 32) & 0xff;
348
0
        *up++ = (val >> 24) & 0xff;
349
0
        *up++ = (val >> 16) & 0xff;
350
0
        *up++ = (val >> 8 ) & 0xff;
351
0
        *up   = val & 0xff;
352
0
        return 7;
353
0
    } else if (!(val & ~((1LL<<(7*8))-1))) {
354
0
        *up++ = (val >> 56) | 0xfe;
355
0
        *up++ = (val >> 48) & 0xff;
356
0
        *up++ = (val >> 40) & 0xff;
357
0
        *up++ = (val >> 32) & 0xff;
358
0
        *up++ = (val >> 24) & 0xff;
359
0
        *up++ = (val >> 16) & 0xff;
360
0
        *up++ = (val >> 8 ) & 0xff;
361
0
        *up   = val & 0xff;
362
0
        return 8;
363
0
    } else {
364
0
        *up++ = 0xff;
365
0
        *up++ = (val >> 56) & 0xff;
366
0
        *up++ = (val >> 48) & 0xff;
367
0
        *up++ = (val >> 40) & 0xff;
368
0
        *up++ = (val >> 32) & 0xff;
369
0
        *up++ = (val >> 24) & 0xff;
370
0
        *up++ = (val >> 16) & 0xff;
371
0
        *up++ = (val >> 8 ) & 0xff;
372
0
        *up   = val & 0xff;
373
0
        return 9;
374
0
    }
375
140k
}
376
377
/*
378
 * Encodes and writes a single integer in ITF-8 format.
379
 * Returns 0 on success
380
 *        -1 on failure
381
 */
382
0
int itf8_encode(cram_fd *fd, int32_t val) {
383
0
    char buf[5];
384
0
    int len = itf8_put(buf, val);
385
0
    return hwrite(fd->fp, buf, len) == len ? 0 : -1;
386
0
}
387
388
const int itf8_bytes[16] = {
389
    1, 1, 1, 1,  1, 1, 1, 1,
390
    2, 2, 2, 2,  3, 3, 4, 5
391
};
392
393
const int ltf8_bytes[256] = {
394
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
395
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
396
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
397
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
398
399
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
400
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
401
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
402
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
403
404
    2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
405
    2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
406
    2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
407
    2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
408
409
    3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,
410
    3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,
411
412
    4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
413
414
    5, 5, 5, 5,  5, 5, 5, 5,  6, 6, 6, 6,  7, 7, 8, 9
415
};
416
417
/*
418
 * LEGACY: consider using ltf8_decode_crc.
419
 */
420
0
int ltf8_decode(cram_fd *fd, int64_t *val_p) {
421
0
    int c = hgetc(fd->fp);
422
0
    int64_t val = (unsigned char)c;
423
0
    if (c == -1)
424
0
        return -1;
425
426
0
    if (val < 0x80) {
427
0
        *val_p =   val;
428
0
        return 1;
429
430
0
    } else if (val < 0xc0) {
431
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
432
0
        *val_p = val & (((1LL<<(6+8)))-1);
433
0
        return 2;
434
435
0
    } else if (val < 0xe0) {
436
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
437
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
438
0
        *val_p = val & ((1LL<<(5+2*8))-1);
439
0
        return 3;
440
441
0
    } else if (val < 0xf0) {
442
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
443
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
444
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
445
0
        *val_p = val & ((1LL<<(4+3*8))-1);
446
0
        return 4;
447
448
0
    } else if (val < 0xf8) {
449
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
450
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
451
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
452
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
453
0
        *val_p = val & ((1LL<<(3+4*8))-1);
454
0
        return 5;
455
456
0
    } else if (val < 0xfc) {
457
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
458
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
459
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
460
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
461
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
462
0
        *val_p = val & ((1LL<<(2+5*8))-1);
463
0
        return 6;
464
465
0
    } else if (val < 0xfe) {
466
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
467
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
468
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
469
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
470
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
471
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
472
0
        *val_p = val & ((1LL<<(1+6*8))-1);
473
0
        return 7;
474
475
0
    } else if (val < 0xff) {
476
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
477
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
478
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
479
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
480
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
481
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
482
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
483
0
        *val_p = val & ((1LL<<(7*8))-1);
484
0
        return 8;
485
486
0
    } else {
487
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
488
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
489
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
490
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
491
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
492
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
493
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
494
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
495
0
        *val_p = val;
496
0
    }
497
498
0
    return 9;
499
0
}
500
501
4.35k
int ltf8_decode_crc(cram_fd *fd, int64_t *val_p, uint32_t *crc) {
502
4.35k
    unsigned char c[9];
503
4.35k
    int64_t val = hgetc(fd->fp);
504
4.35k
    if (val < 0)
505
3
        return -1;
506
507
4.35k
    c[0] = val;
508
509
4.35k
    if (val < 0x80) {
510
3.84k
        *val_p =   val;
511
3.84k
        *crc = crc32(*crc, c, 1);
512
3.84k
        return 1;
513
514
3.84k
    } else if (val < 0xc0) {
515
345
        int v = hgetc(fd->fp);
516
345
        if (v < 0)
517
3
            return -1;
518
342
        val = (val<<8) | (c[1]=v);
519
342
        *val_p = val & (((1LL<<(6+8)))-1);
520
342
        *crc = crc32(*crc, c, 2);
521
342
        return 2;
522
523
345
    } else if (val < 0xe0) {
524
42
        if (hread(fd->fp, &c[1], 2) < 2)
525
3
            return -1;
526
39
        val = (val<<8) | c[1];
527
39
        val = (val<<8) | c[2];
528
39
        *val_p = val & ((1LL<<(5+2*8))-1);
529
39
        *crc = crc32(*crc, c, 3);
530
39
        return 3;
531
532
124
    } else if (val < 0xf0) {
533
21
        if (hread(fd->fp, &c[1], 3) < 3)
534
3
            return -1;
535
18
        val = (val<<8) | c[1];
536
18
        val = (val<<8) | c[2];
537
18
        val = (val<<8) | c[3];
538
18
        *val_p = val & ((1LL<<(4+3*8))-1);
539
18
        *crc = crc32(*crc, c, 4);
540
18
        return 4;
541
542
103
    } else if (val < 0xf8) {
543
9
        if (hread(fd->fp, &c[1], 4) < 4)
544
0
            return -1;
545
9
        val = (val<<8) | c[1];
546
9
        val = (val<<8) | c[2];
547
9
        val = (val<<8) | c[3];
548
9
        val = (val<<8) | c[4];
549
9
        *val_p = val & ((1LL<<(3+4*8))-1);
550
9
        *crc = crc32(*crc, c, 5);
551
9
        return 5;
552
553
94
    } else if (val < 0xfc) {
554
14
        if (hread(fd->fp, &c[1], 5) < 5)
555
3
            return -1;
556
11
        val = (val<<8) | c[1];
557
11
        val = (val<<8) | c[2];
558
11
        val = (val<<8) | c[3];
559
11
        val = (val<<8) | c[4];
560
11
        val = (val<<8) | c[5];
561
11
        *val_p = val & ((1LL<<(2+5*8))-1);
562
11
        *crc = crc32(*crc, c, 6);
563
11
        return 6;
564
565
80
    } else if (val < 0xfe) {
566
12
        if (hread(fd->fp, &c[1], 6) < 6)
567
3
            return -1;
568
9
        val = (val<<8) | c[1];
569
9
        val = (val<<8) | c[2];
570
9
        val = (val<<8) | c[3];
571
9
        val = (val<<8) | c[4];
572
9
        val = (val<<8) | c[5];
573
9
        val = (val<<8) | c[6];
574
9
        *val_p = val & ((1LL<<(1+6*8))-1);
575
9
        *crc = crc32(*crc, c, 7);
576
9
        return 7;
577
578
68
    } else if (val < 0xff) {
579
31
        uint64_t uval = val;
580
31
        if (hread(fd->fp, &c[1], 7) < 7)
581
0
            return -1;
582
31
        uval = (uval<<8) | c[1];
583
31
        uval = (uval<<8) | c[2];
584
31
        uval = (uval<<8) | c[3];
585
31
        uval = (uval<<8) | c[4];
586
31
        uval = (uval<<8) | c[5];
587
31
        uval = (uval<<8) | c[6];
588
31
        uval = (uval<<8) | c[7];
589
31
        *val_p = uval & ((1ULL<<(7*8))-1);
590
31
        *crc = crc32(*crc, c, 8);
591
31
        return 8;
592
593
37
    } else {
594
37
        uint64_t uval;
595
37
        if (hread(fd->fp, &c[1], 8) < 8)
596
3
            return -1;
597
34
        uval =             c[1];
598
34
        uval = (uval<<8) | c[2];
599
34
        uval = (uval<<8) | c[3];
600
34
        uval = (uval<<8) | c[4];
601
34
        uval = (uval<<8) | c[5];
602
34
        uval = (uval<<8) | c[6];
603
34
        uval = (uval<<8) | c[7];
604
34
        uval = (uval<<8) | c[8];
605
34
        *crc = crc32(*crc, c, 9);
606
        // Avoid implementation-defined behaviour on negative values
607
34
        *val_p = c[1] < 0x80 ? (int64_t) uval : -((int64_t) (0xffffffffffffffffULL - uval)) - 1;
608
34
    }
609
610
34
    return 9;
611
4.35k
}
612
613
/*
614
 * Pushes a value in ITF8 format onto the end of a block.
615
 * This shouldn't be used for high-volume data as it is not the fastest
616
 * method.
617
 *
618
 * Returns the number of bytes written
619
 */
620
16.9M
int itf8_put_blk(cram_block *blk, int32_t val) {
621
16.9M
    char buf[5];
622
16.9M
    int sz;
623
624
16.9M
    sz = itf8_put(buf, val);
625
16.9M
    BLOCK_APPEND(blk, buf, sz);
626
16.9M
    return sz;
627
628
0
 block_err:
629
0
    return -1;
630
16.9M
}
631
632
0
int ltf8_put_blk(cram_block *blk, int64_t val) {
633
0
    char buf[9];
634
0
    int sz;
635
636
0
    sz = ltf8_put(buf, val);
637
0
    BLOCK_APPEND(blk, buf, sz);
638
0
    return sz;
639
640
0
 block_err:
641
0
    return -1;
642
0
}
643
644
704k
static int64_t safe_itf8_get(char **cp, const char *endp, int *err) {
645
704k
    const unsigned char *up = (unsigned char *)*cp;
646
647
704k
    if (endp && endp - *cp < 5 &&
648
432k
        (*cp >= endp || endp - *cp < itf8_bytes[up[0]>>4])) {
649
420k
        if (err) *err = 1;
650
420k
        return 0;
651
420k
    }
652
653
284k
    if (up[0] < 0x80) {
654
257k
        (*cp)++;
655
257k
        return up[0];
656
257k
    } else if (up[0] < 0xc0) {
657
12.4k
        (*cp)+=2;
658
12.4k
        return ((up[0] <<8) |  up[1])                           & 0x3fff;
659
14.9k
    } else if (up[0] < 0xe0) {
660
3.57k
        (*cp)+=3;
661
3.57k
        return ((up[0]<<16) | (up[1]<< 8) |  up[2])             & 0x1fffff;
662
11.3k
    } else if (up[0] < 0xf0) {
663
3.64k
        (*cp)+=4;
664
3.64k
        uint32_t uv = (((uint32_t)up[0]<<24) | (up[1]<<16) | (up[2]<<8) | up[3]) & 0x0fffffff;
665
3.64k
        return (int32_t)uv;
666
7.72k
    } else {
667
7.72k
        (*cp)+=5;
668
7.72k
        uint32_t uv = (((uint32_t)up[0] & 0x0f)<<28) | (up[1]<<20) | (up[2]<<12) | (up[3]<<4) | (up[4] & 0x0f);
669
7.72k
        return (int32_t)uv;
670
7.72k
    }
671
284k
}
672
673
965
static int64_t safe_ltf8_get(char **cp, const char *endp, int *err) {
674
965
    unsigned char *up = (unsigned char *)*cp;
675
676
965
    if (endp && endp - *cp < 9 &&
677
931
        (*cp >= endp || endp - *cp < ltf8_bytes[up[0]])) {
678
339
        if (err) *err = 1;
679
339
        return 0;
680
339
    }
681
682
626
    if (up[0] < 0x80) {
683
381
        (*cp)++;
684
381
        return up[0];
685
381
    } else if (up[0] < 0xc0) {
686
21
        (*cp)+=2;
687
21
        return (((uint64_t)up[0]<< 8) |
688
21
                 (uint64_t)up[1]) & (((1LL<<(6+8)))-1);
689
224
    } else if (up[0] < 0xe0) {
690
32
        (*cp)+=3;
691
32
        return (((uint64_t)up[0]<<16) |
692
32
                ((uint64_t)up[1]<< 8) |
693
32
                 (uint64_t)up[2]) & ((1LL<<(5+2*8))-1);
694
192
    } else if (up[0] < 0xf0) {
695
39
        (*cp)+=4;
696
39
        return (((uint64_t)up[0]<<24) |
697
39
                ((uint64_t)up[1]<<16) |
698
39
                ((uint64_t)up[2]<< 8) |
699
39
                 (uint64_t)up[3]) & ((1LL<<(4+3*8))-1);
700
153
    } else if (up[0] < 0xf8) {
701
114
        (*cp)+=5;
702
114
        return (((uint64_t)up[0]<<32) |
703
114
                ((uint64_t)up[1]<<24) |
704
114
                ((uint64_t)up[2]<<16) |
705
114
                ((uint64_t)up[3]<< 8) |
706
114
                 (uint64_t)up[4]) & ((1LL<<(3+4*8))-1);
707
114
    } else if (up[0] < 0xfc) {
708
9
        (*cp)+=6;
709
9
        return (((uint64_t)up[0]<<40) |
710
9
                ((uint64_t)up[1]<<32) |
711
9
                ((uint64_t)up[2]<<24) |
712
9
                ((uint64_t)up[3]<<16) |
713
9
                ((uint64_t)up[4]<< 8) |
714
9
                 (uint64_t)up[5]) & ((1LL<<(2+5*8))-1);
715
30
    } else if (up[0] < 0xfe) {
716
0
        (*cp)+=7;
717
0
        return (((uint64_t)up[0]<<48) |
718
0
                ((uint64_t)up[1]<<40) |
719
0
                ((uint64_t)up[2]<<32) |
720
0
                ((uint64_t)up[3]<<24) |
721
0
                ((uint64_t)up[4]<<16) |
722
0
                ((uint64_t)up[5]<< 8) |
723
0
                 (uint64_t)up[6]) & ((1LL<<(1+6*8))-1);
724
30
    } else if (up[0] < 0xff) {
725
0
        (*cp)+=8;
726
0
        return (((uint64_t)up[1]<<48) |
727
0
                ((uint64_t)up[2]<<40) |
728
0
                ((uint64_t)up[3]<<32) |
729
0
                ((uint64_t)up[4]<<24) |
730
0
                ((uint64_t)up[5]<<16) |
731
0
                ((uint64_t)up[6]<< 8) |
732
0
                 (uint64_t)up[7]) & ((1LL<<(7*8))-1);
733
30
    } else {
734
30
        (*cp)+=9;
735
30
        return (((uint64_t)up[1]<<56) |
736
30
                ((uint64_t)up[2]<<48) |
737
30
                ((uint64_t)up[3]<<40) |
738
30
                ((uint64_t)up[4]<<32) |
739
30
                ((uint64_t)up[5]<<24) |
740
30
                ((uint64_t)up[6]<<16) |
741
30
                ((uint64_t)up[7]<< 8) |
742
30
                 (uint64_t)up[8]);
743
30
    }
744
626
}
745
746
// Wrapper for now
747
6.89M
static int safe_itf8_put(char *cp, char *cp_end, int32_t val) {
748
6.89M
    return itf8_put(cp, val);
749
6.89M
}
750
751
140k
static int safe_ltf8_put(char *cp, char *cp_end, int64_t val) {
752
140k
    return ltf8_put(cp, val);
753
140k
}
754
755
1.67M
static int itf8_size(int64_t v) {
756
1.67M
    return ((!((v)&~0x7f))?1:(!((v)&~0x3fff))?2:(!((v)&~0x1fffff))?3:(!((v)&~0xfffffff))?4:5);
757
1.67M
}
758
759
//-----------------------------------------------------------------------------
760
761
// CRAM v4.0 onwards uses a different variable sized integer encoding
762
// that is size agnostic.
763
764
// Local interface to varint.h inline version, so we can use in func ptr.
765
// Note a lot of these use the unsigned interface but take signed int64_t.
766
// This is because the old CRAM ITF8 inteface had signed -1 as unsigned
767
// 0xffffffff.
768
6.95k
static int uint7_size(int64_t v) {
769
6.95k
    return var_size_u64(v);
770
6.95k
}
771
772
3.12k
static int64_t uint7_get_32(char **cp, const char *endp, int *err) {
773
3.12k
    uint32_t val;
774
3.12k
    int nb = var_get_u32((uint8_t *)(*cp), (const uint8_t *)endp, &val);
775
3.12k
    (*cp) += nb;
776
3.12k
    if (!nb && err) *err = 1;
777
3.12k
    return val;
778
3.12k
}
779
780
0
static int64_t sint7_get_32(char **cp, const char *endp, int *err) {
781
0
    int32_t val;
782
0
    int nb = var_get_s32((uint8_t *)(*cp), (const uint8_t *)endp, &val);
783
0
    (*cp) += nb;
784
0
    if (!nb && err) *err = 1;
785
0
    return val;
786
0
}
787
788
6.42M
static int64_t uint7_get_64(char **cp, const char *endp, int *err) {
789
6.42M
    uint64_t val;
790
6.42M
    int nb = var_get_u64((uint8_t *)(*cp), (const uint8_t *)endp, &val);
791
6.42M
    (*cp) += nb;
792
6.42M
    if (!nb && err) *err = 1;
793
6.42M
    return val;
794
6.42M
}
795
796
51
static int64_t sint7_get_64(char **cp, const char *endp, int *err) {
797
51
    int64_t val;
798
51
    int nb = var_get_s64((uint8_t *)(*cp), (const uint8_t *)endp, &val);
799
51
    (*cp) += nb;
800
51
    if (!nb && err) *err = 1;
801
51
    return val;
802
51
}
803
804
0
static int uint7_put_32(char *cp, char *endp, int32_t val) {
805
0
    return var_put_u32((uint8_t *)cp, (uint8_t *)endp, val);
806
0
}
807
808
0
static int sint7_put_32(char *cp, char *endp, int32_t val) {
809
0
    return var_put_s32((uint8_t *)cp, (uint8_t *)endp, val);
810
0
}
811
812
0
static int uint7_put_64(char *cp, char *endp, int64_t val) {
813
0
    return var_put_u64((uint8_t *)cp, (uint8_t *)endp, val);
814
0
}
815
816
0
static int sint7_put_64(char *cp, char *endp, int64_t val) {
817
0
    return var_put_s64((uint8_t *)cp, (uint8_t *)endp, val);
818
0
}
819
820
// Put direct to to cram_block
821
0
static int uint7_put_blk_32(cram_block *blk, int32_t v) {
822
0
    uint8_t buf[10];
823
0
    int sz = var_put_u32(buf, buf+10, v);
824
0
    BLOCK_APPEND(blk, buf, sz);
825
0
    return sz;
826
827
0
 block_err:
828
0
    return -1;
829
0
}
830
831
0
static int sint7_put_blk_32(cram_block *blk, int32_t v) {
832
0
    uint8_t buf[10];
833
0
    int sz = var_put_s32(buf, buf+10, v);
834
0
    BLOCK_APPEND(blk, buf, sz);
835
0
    return sz;
836
837
0
 block_err:
838
0
    return -1;
839
0
}
840
841
0
static int uint7_put_blk_64(cram_block *blk, int64_t v) {
842
0
    uint8_t buf[10];
843
0
    int sz = var_put_u64(buf, buf+10, v);
844
0
    BLOCK_APPEND(blk, buf, sz);
845
0
    return sz;
846
847
0
 block_err:
848
0
    return -1;
849
0
}
850
851
0
static int sint7_put_blk_64(cram_block *blk, int64_t v) {
852
0
    uint8_t buf[10];
853
0
    int sz = var_put_s64(buf, buf+10, v);
854
0
    BLOCK_APPEND(blk, buf, sz);
855
0
    return sz;
856
857
0
 block_err:
858
0
    return -1;
859
0
}
860
861
// Decode 32-bits with CRC update from cram_fd
862
42.8k
static int uint7_decode_crc32(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
863
42.8k
    uint8_t b[5], i = 0;
864
42.8k
    int c;
865
42.8k
    uint32_t v = 0;
866
867
#ifdef VARINT2
868
    b[0] = hgetc(fd->fp);
869
    if (b[0] < 177) {
870
    } else if (b[0] < 241) {
871
        b[1] = hgetc(fd->fp);
872
    } else if (b[0] < 249) {
873
        b[1] = hgetc(fd->fp);
874
        b[2] = hgetc(fd->fp);
875
    } else {
876
        int n = b[0]+2, z = 1;
877
        while (n-- >= 249)
878
            b[z++] = hgetc(fd->fp);
879
    }
880
    i = var_get_u32(b, NULL, &v);
881
#else
882
//    // Little endian
883
//    int s = 0;
884
//    do {
885
//        b[i++] = c = hgetc(fd->fp);
886
//        if (c < 0)
887
//            return -1;
888
//        v |= (c & 0x7f) << s;
889
//      s += 7;
890
//    } while (i < 5 && (c & 0x80));
891
892
    // Big endian, see also htscodecs/varint.h
893
49.1k
    do {
894
49.1k
        b[i++] = c = hgetc(fd->fp);
895
49.1k
        if (c < 0)
896
53
            return -1;
897
49.1k
        v = (v<<7) | (c & 0x7f);
898
49.1k
    } while (i < 5 && (c & 0x80));
899
42.7k
#endif
900
42.7k
    *crc = crc32(*crc, b, i);
901
902
42.7k
    *val_p = v;
903
42.7k
    return i;
904
42.8k
}
905
906
// Decode 32-bits with CRC update from cram_fd
907
3.64k
static int sint7_decode_crc32(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
908
3.64k
    uint8_t b[5], i = 0;
909
3.64k
    int c;
910
3.64k
    uint32_t v = 0;
911
912
#ifdef VARINT2
913
    b[0] = hgetc(fd->fp);
914
    if (b[0] < 177) {
915
    } else if (b[0] < 241) {
916
        b[1] = hgetc(fd->fp);
917
    } else if (b[0] < 249) {
918
        b[1] = hgetc(fd->fp);
919
        b[2] = hgetc(fd->fp);
920
    } else {
921
        int n = b[0]+2, z = 1;
922
        while (n-- >= 249)
923
            b[z++] = hgetc(fd->fp);
924
    }
925
    i = var_get_u32(b, NULL, &v);
926
#else
927
//    // Little endian
928
//    int s = 0;
929
//    do {
930
//        b[i++] = c = hgetc(fd->fp);
931
//        if (c < 0)
932
//            return -1;
933
//        v |= (c & 0x7f) << s;
934
//      s += 7;
935
//    } while (i < 5 && (c & 0x80));
936
937
    // Big endian, see also htscodecs/varint.h
938
5.19k
    do {
939
5.19k
        b[i++] = c = hgetc(fd->fp);
940
5.19k
        if (c < 0)
941
3
            return -1;
942
5.19k
        v = (v<<7) | (c & 0x7f);
943
5.19k
    } while (i < 5 && (c & 0x80));
944
3.63k
#endif
945
3.63k
    *crc = crc32(*crc, b, i);
946
947
3.63k
    *val_p = (v>>1) ^ -(v&1);
948
3.63k
    return i;
949
3.64k
}
950
951
952
// Decode 64-bits with CRC update from cram_fd
953
14.5k
static int uint7_decode_crc64(cram_fd *fd, int64_t *val_p, uint32_t *crc) {
954
14.5k
    uint8_t b[10], i = 0;
955
14.5k
    int c;
956
14.5k
    uint64_t v = 0;
957
958
#ifdef VARINT2
959
    b[0] = hgetc(fd->fp);
960
    if (b[0] < 177) {
961
    } else if (b[0] < 241) {
962
        b[1] = hgetc(fd->fp);
963
    } else if (b[0] < 249) {
964
        b[1] = hgetc(fd->fp);
965
        b[2] = hgetc(fd->fp);
966
    } else {
967
        int n = b[0]+2, z = 1;
968
        while (n-- >= 249)
969
            b[z++] = hgetc(fd->fp);
970
    }
971
    i = var_get_u64(b, NULL, &v);
972
#else
973
//    // Little endian
974
//    int s = 0;
975
//    do {
976
//        b[i++] = c = hgetc(fd->fp);
977
//        if (c < 0)
978
//            return -1;
979
//        v |= (c & 0x7f) << s;
980
//      s += 7;
981
//    } while (i < 10 && (c & 0x80));
982
983
    // Big endian, see also htscodecs/varint.h
984
15.4k
    do {
985
15.4k
        b[i++] = c = hgetc(fd->fp);
986
15.4k
        if (c < 0)
987
9
            return -1;
988
15.4k
        v = (v<<7) | (c & 0x7f);
989
15.4k
    } while (i < 5 && (c & 0x80));
990
14.5k
#endif
991
14.5k
    *crc = crc32(*crc, b, i);
992
993
14.5k
    *val_p = v;
994
14.5k
    return i;
995
14.5k
}
996
997
//-----------------------------------------------------------------------------
998
999
/*
1000
 * Decodes a 32-bit little endian value from fd and stores in val.
1001
 *
1002
 * Returns the number of bytes read on success
1003
 *         -1 on failure
1004
 */
1005
18.0k
static int int32_decode(cram_fd *fd, int32_t *val) {
1006
18.0k
    int32_t i;
1007
18.0k
    if (4 != hread(fd->fp, &i, 4))
1008
21
        return -1;
1009
1010
18.0k
    *val = le_int4(i);
1011
18.0k
    return 4;
1012
18.0k
}
1013
1014
/*
1015
 * Encodes a 32-bit little endian value 'val' and writes to fd.
1016
 *
1017
 * Returns the number of bytes written on success
1018
 *         -1 on failure
1019
 */
1020
446k
static int int32_encode(cram_fd *fd, int32_t val) {
1021
446k
    uint32_t v = le_int4(val);
1022
446k
    if (4 != hwrite(fd->fp, &v, 4))
1023
0
        return -1;
1024
1025
446k
    return 4;
1026
446k
}
1027
1028
/* As int32_decoded/encode, but from/to blocks instead of cram_fd */
1029
622
int int32_get_blk(cram_block *b, int32_t *val) {
1030
622
    if (b->uncomp_size - BLOCK_SIZE(b) < 4)
1031
22
        return -1;
1032
1033
600
    uint32_t v =
1034
600
         ((uint32_t) b->data[b->byte  ])        |
1035
600
        (((uint32_t) b->data[b->byte+1]) <<  8) |
1036
600
        (((uint32_t) b->data[b->byte+2]) << 16) |
1037
600
        (((uint32_t) b->data[b->byte+3]) << 24);
1038
    // Avoid implementation-defined behaviour on negative values
1039
600
    *val = v < 0x80000000U ? (int32_t) v : -((int32_t) (0xffffffffU - v)) - 1;
1040
600
    BLOCK_SIZE(b) += 4;
1041
600
    return 4;
1042
622
}
1043
1044
/* As int32_decoded/encode, but from/to blocks instead of cram_fd */
1045
6.83k
int int32_put_blk(cram_block *b, int32_t val) {
1046
6.83k
    unsigned char cp[4];
1047
6.83k
    uint32_t v = val;
1048
6.83k
    cp[0] = ( v      & 0xff);
1049
6.83k
    cp[1] = ((v>>8)  & 0xff);
1050
6.83k
    cp[2] = ((v>>16) & 0xff);
1051
6.83k
    cp[3] = ((v>>24) & 0xff);
1052
1053
6.83k
    BLOCK_APPEND(b, cp, 4);
1054
6.83k
    return 0;
1055
1056
0
 block_err:
1057
0
    return -1;
1058
6.83k
}
1059
1060
#ifdef HAVE_LIBDEFLATE
1061
/* ----------------------------------------------------------------------
1062
 * libdeflate compression code, with interface to match
1063
 * zlib_mem_{in,de}flate for simplicity elsewhere.
1064
 */
1065
1066
// Named the same as the version that uses zlib as we always use libdeflate for
1067
// decompression when available.
1068
char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) {
1069
    struct libdeflate_decompressor *z = libdeflate_alloc_decompressor();
1070
    if (!z) {
1071
        hts_log_error("Call to libdeflate_alloc_decompressor failed");
1072
        return NULL;
1073
    }
1074
1075
    uint8_t *data = NULL, *new_data;
1076
    if (!*size)
1077
        *size = csize*2;
1078
    for(;;) {
1079
        new_data = realloc(data, *size);
1080
        if (!new_data) {
1081
            hts_log_error("Memory allocation failure");
1082
            goto fail;
1083
        }
1084
        data = new_data;
1085
1086
        int ret = libdeflate_gzip_decompress(z, cdata, csize, data, *size, size);
1087
1088
        // Auto grow output buffer size if needed and try again.
1089
        // Fortunately for all bar one call of this we know the size already.
1090
        if (ret == LIBDEFLATE_INSUFFICIENT_SPACE) {
1091
            (*size) *= 1.5;
1092
            continue;
1093
        }
1094
1095
        if (ret != LIBDEFLATE_SUCCESS) {
1096
            hts_log_error("Inflate operation failed: %d", ret);
1097
            goto fail;
1098
        } else {
1099
            break;
1100
        }
1101
    }
1102
1103
    libdeflate_free_decompressor(z);
1104
    return (char *)data;
1105
1106
 fail:
1107
    libdeflate_free_decompressor(z);
1108
    free(data);
1109
    return NULL;
1110
}
1111
1112
// Named differently as we use both zlib/libdeflate for compression.
1113
static char *libdeflate_deflate(char *data, size_t size, size_t *cdata_size,
1114
                                int level, int strat) {
1115
    level = level > 0 ? level : 6; // libdeflate doesn't honour -1 as default
1116
    level *= 1.23;     // NB levels go up to 12 here; 5 onwards is +1
1117
    level += level>=8; // 5,6,7->6,7,8  8->10  9->12
1118
    if (level > 12) level = 12;
1119
1120
    if (strat == Z_RLE) // not supported by libdeflate
1121
        level = 1;
1122
1123
    struct libdeflate_compressor *z = libdeflate_alloc_compressor(level);
1124
    if (!z) {
1125
        hts_log_error("Call to libdeflate_alloc_compressor failed");
1126
        return NULL;
1127
    }
1128
1129
    unsigned char *cdata = NULL; /* Compressed output */
1130
    size_t cdata_alloc;
1131
    cdata = malloc(cdata_alloc = size*1.05+100);
1132
    if (!cdata) {
1133
        hts_log_error("Memory allocation failure");
1134
        libdeflate_free_compressor(z);
1135
        return NULL;
1136
    }
1137
1138
    *cdata_size = libdeflate_gzip_compress(z, data, size, cdata, cdata_alloc);
1139
    libdeflate_free_compressor(z);
1140
1141
    if (*cdata_size == 0) {
1142
        hts_log_error("Call to libdeflate_gzip_compress failed");
1143
        free(cdata);
1144
        return NULL;
1145
    }
1146
1147
    return (char *)cdata;
1148
}
1149
1150
#else
1151
1152
/* ----------------------------------------------------------------------
1153
 * zlib compression code - from Gap5's tg_iface_g.c
1154
 * They're static here as they're only used within the cram_compress_block
1155
 * and cram_uncompress_block functions, which are the external interface.
1156
 */
1157
13
char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) {
1158
13
    z_stream s;
1159
13
    unsigned char *data = NULL; /* Uncompressed output */
1160
13
    int data_alloc = 0;
1161
13
    int err;
1162
1163
    /* Starting point at uncompressed size, and scale after that */
1164
13
    data = malloc(data_alloc = csize*1.2+100);
1165
13
    if (!data)
1166
0
        return NULL;
1167
1168
    /* Initialise zlib stream */
1169
13
    s.zalloc = Z_NULL; /* use default allocation functions */
1170
13
    s.zfree  = Z_NULL;
1171
13
    s.opaque = Z_NULL;
1172
13
    s.next_in  = (unsigned char *)cdata;
1173
13
    s.avail_in = csize;
1174
13
    s.total_in = 0;
1175
13
    s.next_out  = data;
1176
13
    s.avail_out = data_alloc;
1177
13
    s.total_out = 0;
1178
1179
    //err = inflateInit(&s);
1180
13
    err = inflateInit2(&s, 15 + 32);
1181
13
    if (err != Z_OK) {
1182
0
        hts_log_error("Call to zlib inflateInit failed: %s", s.msg);
1183
0
        free(data);
1184
0
        return NULL;
1185
0
    }
1186
1187
    /* Decode to 'data' array */
1188
15
    for (;s.avail_in;) {
1189
11
        unsigned char *data_tmp;
1190
11
        int alloc_inc;
1191
1192
11
        s.next_out = &data[s.total_out];
1193
11
        err = inflate(&s, Z_NO_FLUSH);
1194
11
        if (err == Z_STREAM_END)
1195
0
            break;
1196
1197
11
        if (err != Z_OK) {
1198
9
            hts_log_error("Call to zlib inflate failed: %s", s.msg);
1199
9
            free(data);
1200
9
            inflateEnd(&s);
1201
9
            return NULL;
1202
9
        }
1203
1204
        /* More to come, so realloc based on growth so far */
1205
2
        alloc_inc = (double)s.avail_in/s.total_in * s.total_out + 100;
1206
2
        data = realloc((data_tmp = data), data_alloc += alloc_inc);
1207
2
        if (!data) {
1208
0
            free(data_tmp);
1209
0
            inflateEnd(&s);
1210
0
            return NULL;
1211
0
        }
1212
2
        s.avail_out += alloc_inc;
1213
2
    }
1214
4
    inflateEnd(&s);
1215
1216
4
    *size = s.total_out;
1217
4
    return (char *)data;
1218
13
}
1219
#endif
1220
1221
#if !defined(HAVE_LIBDEFLATE) || LIBDEFLATE_VERSION_MAJOR < 1 || (LIBDEFLATE_VERSION_MAJOR ==  1 && LIBDEFLATE_VERSION_MINOR <= 8)
1222
static char *zlib_mem_deflate(char *data, size_t size, size_t *cdata_size,
1223
347k
                              int level, int strat) {
1224
347k
    z_stream s;
1225
347k
    unsigned char *cdata = NULL; /* Compressed output */
1226
347k
    int cdata_alloc = 0;
1227
347k
    int cdata_pos = 0;
1228
347k
    int err;
1229
1230
347k
    cdata = malloc(cdata_alloc = size*1.05+100);
1231
347k
    if (!cdata)
1232
0
        return NULL;
1233
347k
    cdata_pos = 0;
1234
1235
    /* Initialise zlib stream */
1236
347k
    s.zalloc = Z_NULL; /* use default allocation functions */
1237
347k
    s.zfree  = Z_NULL;
1238
347k
    s.opaque = Z_NULL;
1239
347k
    s.next_in  = (unsigned char *)data;
1240
347k
    s.avail_in = size;
1241
347k
    s.total_in = 0;
1242
347k
    s.next_out  = cdata;
1243
347k
    s.avail_out = cdata_alloc;
1244
347k
    s.total_out = 0;
1245
347k
    s.data_type = Z_BINARY;
1246
1247
347k
    err = deflateInit2(&s, level, Z_DEFLATED, 15|16, 9, strat);
1248
347k
    if (err != Z_OK) {
1249
0
        hts_log_error("Call to zlib deflateInit2 failed: %s", s.msg);
1250
0
        return NULL;
1251
0
    }
1252
1253
    /* Encode to 'cdata' array */
1254
694k
    for (;s.avail_in;) {
1255
347k
        s.next_out = &cdata[cdata_pos];
1256
347k
        s.avail_out = cdata_alloc - cdata_pos;
1257
347k
        if (cdata_alloc - cdata_pos <= 0) {
1258
0
            hts_log_error("Deflate produced larger output than expected");
1259
0
            return NULL;
1260
0
        }
1261
347k
        err = deflate(&s, Z_NO_FLUSH);
1262
347k
        cdata_pos = cdata_alloc - s.avail_out;
1263
347k
        if (err != Z_OK) {
1264
0
            hts_log_error("Call to zlib deflate failed: %s", s.msg);
1265
0
            break;
1266
0
        }
1267
347k
    }
1268
347k
    if (deflate(&s, Z_FINISH) != Z_STREAM_END) {
1269
0
        hts_log_error("Call to zlib deflate failed: %s", s.msg);
1270
0
    }
1271
347k
    *cdata_size = s.total_out;
1272
1273
347k
    if (deflateEnd(&s) != Z_OK) {
1274
0
        hts_log_error("Call to zlib deflate failed: %s", s.msg);
1275
0
    }
1276
347k
    return (char *)cdata;
1277
347k
}
1278
#endif
1279
1280
#ifdef HAVE_LIBLZMA
1281
/* ------------------------------------------------------------------------ */
1282
/*
1283
 * Data compression routines using liblzma (xz)
1284
 *
1285
 * On a test set this shrunk the main db from 136157104 bytes to 114796168, but
1286
 * caused tg_index to grow from 2m43.707s to 15m3.961s. Exporting as bfastq
1287
 * went from 18.3s to 36.3s. So decompression suffers too, but not as bad
1288
 * as compression times.
1289
 *
1290
 * For now we disable this functionality. If it's to be reenabled make sure you
1291
 * improve the mem_inflate implementation as it's just a test hack at the
1292
 * moment.
1293
 */
1294
1295
static char *lzma_mem_deflate(char *data, size_t size, size_t *cdata_size,
1296
0
                              int level) {
1297
0
    char *out;
1298
0
    size_t out_size = lzma_stream_buffer_bound(size);
1299
0
    *cdata_size = 0;
1300
1301
0
    out = malloc(out_size);
1302
1303
    /* Single call compression */
1304
0
    if (LZMA_OK != lzma_easy_buffer_encode(level, LZMA_CHECK_CRC32, NULL,
1305
0
                                           (uint8_t *)data, size,
1306
0
                                           (uint8_t *)out, cdata_size,
1307
0
                                           out_size))
1308
0
        return NULL;
1309
1310
0
    return out;
1311
0
}
1312
1313
4
static char *lzma_mem_inflate(char *cdata, size_t csize, size_t *size) {
1314
4
    lzma_stream strm = LZMA_STREAM_INIT;
1315
4
    size_t out_size = 0, out_pos = 0;
1316
4
    char *out = NULL, *new_out;
1317
4
    int r;
1318
1319
    /* Initiate the decoder */
1320
4
    if (LZMA_OK != lzma_stream_decoder(&strm, lzma_easy_decoder_memusage(9), 0))
1321
0
        return NULL;
1322
1323
    /* Decode loop */
1324
4
    strm.avail_in = csize;
1325
4
    strm.next_in = (uint8_t *)cdata;
1326
1327
6
    for (;strm.avail_in;) {
1328
4
        if (strm.avail_in > out_size - out_pos) {
1329
4
            out_size += strm.avail_in * 4 + 32768;
1330
4
            new_out = realloc(out, out_size);
1331
4
            if (!new_out)
1332
0
                goto fail;
1333
4
            out = new_out;
1334
4
        }
1335
4
        strm.avail_out = out_size - out_pos;
1336
4
        strm.next_out = (uint8_t *)&out[out_pos];
1337
1338
4
        r = lzma_code(&strm, LZMA_RUN);
1339
4
        if (LZMA_OK != r && LZMA_STREAM_END != r) {
1340
2
            hts_log_error("LZMA decode failure (error %d)", r);
1341
2
            goto fail;
1342
2
        }
1343
1344
2
        out_pos = strm.total_out;
1345
1346
2
        if (r == LZMA_STREAM_END)
1347
0
            break;
1348
2
    }
1349
1350
    /* finish up any unflushed data; necessary? */
1351
2
    r = lzma_code(&strm, LZMA_FINISH);
1352
2
    if (r != LZMA_OK && r != LZMA_STREAM_END) {
1353
0
        hts_log_error("Call to lzma_code failed with error %d", r);
1354
0
        goto fail;
1355
0
    }
1356
1357
2
    new_out = realloc(out, strm.total_out > 0 ? strm.total_out : 1);
1358
2
    if (new_out)
1359
2
        out = new_out;
1360
2
    *size = strm.total_out;
1361
1362
2
    lzma_end(&strm);
1363
1364
2
    return out;
1365
1366
2
 fail:
1367
2
    lzma_end(&strm);
1368
2
    free(out);
1369
2
    return NULL;
1370
2
}
1371
#endif
1372
1373
/* ----------------------------------------------------------------------
1374
 * CRAM blocks - the dynamically growable data block. We have code to
1375
 * create, update, (un)compress and read/write.
1376
 *
1377
 * These are derived from the deflate_interlaced.c blocks, but with the
1378
 * CRAM extension of content types and IDs.
1379
 */
1380
1381
/*
1382
 * Allocates a new cram_block structure with a specified content_type and
1383
 * id.
1384
 *
1385
 * Returns block pointer on success
1386
 *         NULL on failure
1387
 */
1388
cram_block *cram_new_block(enum cram_content_type content_type,
1389
1.19M
                           int content_id) {
1390
1.19M
    cram_block *b = malloc(sizeof(*b));
1391
1.19M
    if (!b)
1392
0
        return NULL;
1393
1.19M
    b->method = b->orig_method = RAW;
1394
1.19M
    b->content_type = content_type;
1395
1.19M
    b->content_id = content_id;
1396
1.19M
    b->comp_size = 0;
1397
1.19M
    b->uncomp_size = 0;
1398
1.19M
    b->data = NULL;
1399
1.19M
    b->alloc = 0;
1400
1.19M
    b->byte = 0;
1401
1.19M
    b->bit = 7; // MSB
1402
1.19M
    b->crc32 = 0;
1403
1.19M
    b->idx = 0;
1404
1.19M
    b->m = NULL;
1405
1406
1.19M
    return b;
1407
1.19M
}
1408
1409
/*
1410
 * Reads a block from a cram file.
1411
 * Returns cram_block pointer on success.
1412
 *         NULL on failure
1413
 */
1414
14.4k
cram_block *cram_read_block(cram_fd *fd) {
1415
14.4k
    cram_block *b = malloc(sizeof(*b));
1416
14.4k
    unsigned char c;
1417
14.4k
    uint32_t crc = 0;
1418
14.4k
    if (!b)
1419
0
        return NULL;
1420
1421
    //fprintf(stderr, "Block at %d\n", (int)ftell(fd->fp));
1422
1423
14.4k
    if (-1 == (b->method      = hgetc(fd->fp))) { free(b); return NULL; }
1424
14.3k
    c = b->method; crc = crc32(crc, &c, 1);
1425
14.3k
    if (-1 == (b->content_type= hgetc(fd->fp))) { free(b); return NULL; }
1426
14.3k
    c = b->content_type; crc = crc32(crc, &c, 1);
1427
14.3k
    if (-1 == fd->vv.varint_decode32_crc(fd, &b->content_id, &crc))  { free(b); return NULL; }
1428
14.3k
    if (-1 == fd->vv.varint_decode32_crc(fd, &b->comp_size, &crc))   { free(b); return NULL; }
1429
14.3k
    if (-1 == fd->vv.varint_decode32_crc(fd, &b->uncomp_size, &crc)) { free(b); return NULL; }
1430
1431
    //fprintf(stderr, "  method %d, ctype %d, cid %d, csize %d, ucsize %d\n",
1432
    //      b->method, b->content_type, b->content_id, b->comp_size, b->uncomp_size);
1433
1434
14.3k
    if (b->method == RAW) {
1435
7.05k
        if (b->uncomp_size < 0 || b->comp_size != b->uncomp_size) {
1436
97
            free(b);
1437
97
            return NULL;
1438
97
        }
1439
6.95k
        b->alloc = b->uncomp_size;
1440
6.95k
        if (!(b->data = malloc(b->uncomp_size))){ free(b); return NULL; }
1441
6.95k
        if (b->uncomp_size != hread(fd->fp, b->data, b->uncomp_size)) {
1442
18
            free(b->data);
1443
18
            free(b);
1444
18
            return NULL;
1445
18
        }
1446
7.28k
    } else {
1447
7.28k
        if (b->comp_size < 0 || b->uncomp_size < 0) {
1448
41
            free(b);
1449
41
            return NULL;
1450
41
        }
1451
7.24k
        b->alloc = b->comp_size;
1452
7.24k
        if (!(b->data = malloc(b->comp_size)))  { free(b); return NULL; }
1453
7.24k
        if (b->comp_size != hread(fd->fp, b->data, b->comp_size)) {
1454
76
            free(b->data);
1455
76
            free(b);
1456
76
            return NULL;
1457
76
        }
1458
7.24k
    }
1459
1460
14.1k
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
1461
2.80k
        if (-1 == int32_decode(fd, (int32_t *)&b->crc32)) {
1462
0
            free(b->data);
1463
0
            free(b);
1464
0
            return NULL;
1465
0
        }
1466
1467
2.80k
        b->crc32_checked = fd->ignore_md5;
1468
2.80k
        b->crc_part = crc;
1469
11.3k
    } else {
1470
11.3k
        b->crc32_checked = 1; // CRC not present
1471
11.3k
    }
1472
1473
14.1k
    b->orig_method = b->method;
1474
14.1k
    b->idx = 0;
1475
14.1k
    b->byte = 0;
1476
14.1k
    b->bit = 7; // MSB
1477
1478
14.1k
    return b;
1479
14.1k
}
1480
1481
1482
/*
1483
 * Computes the size of a cram block, including the block
1484
 * header itself.
1485
 */
1486
0
uint32_t cram_block_size(cram_block *b) {
1487
0
    unsigned char dat[100], *cp = dat;;
1488
0
    uint32_t sz;
1489
1490
0
    *cp++ = b->method;
1491
0
    *cp++ = b->content_type;
1492
0
    cp += itf8_put((char*)cp, b->content_id);
1493
0
    cp += itf8_put((char*)cp, b->comp_size);
1494
0
    cp += itf8_put((char*)cp, b->uncomp_size);
1495
1496
0
    sz = cp-dat + 4;
1497
0
    sz += b->method == RAW ? b->uncomp_size : b->comp_size;
1498
1499
0
    return sz;
1500
0
}
1501
1502
/*
1503
 * Writes a CRAM block.
1504
 * Returns 0 on success
1505
 *        -1 on failure
1506
 */
1507
446k
int cram_write_block(cram_fd *fd, cram_block *b) {
1508
446k
    char vardata[100];
1509
446k
    int vardata_o = 0;
1510
1511
446k
    assert(b->method != RAW || (b->comp_size == b->uncomp_size));
1512
1513
446k
    if (hputc(b->method,       fd->fp)  == EOF) return -1;
1514
446k
    if (hputc(b->content_type, fd->fp)  == EOF) return -1;
1515
446k
    vardata_o += fd->vv.varint_put32(vardata          , vardata+100, b->content_id);
1516
446k
    vardata_o += fd->vv.varint_put32(vardata+vardata_o, vardata+100, b->comp_size);
1517
446k
    vardata_o += fd->vv.varint_put32(vardata+vardata_o, vardata+100, b->uncomp_size);
1518
446k
    if (vardata_o != hwrite(fd->fp, vardata, vardata_o))
1519
0
        return -1;
1520
1521
446k
    if (b->data) {
1522
409k
        if (b->method == RAW) {
1523
324k
            if (b->uncomp_size != hwrite(fd->fp, b->data, b->uncomp_size))
1524
0
                return -1;
1525
324k
        } else {
1526
85.1k
            if (b->comp_size != hwrite(fd->fp, b->data, b->comp_size))
1527
0
                return -1;
1528
85.1k
        }
1529
409k
    } else {
1530
        // Absent blocks should be size 0
1531
37.0k
        assert(b->method == RAW && b->uncomp_size == 0);
1532
37.0k
    }
1533
1534
446k
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
1535
446k
        char dat[100], *cp = (char *)dat;
1536
446k
        uint32_t crc;
1537
1538
446k
        *cp++ = b->method;
1539
446k
        *cp++ = b->content_type;
1540
446k
        cp += fd->vv.varint_put32(cp, dat+100, b->content_id);
1541
446k
        cp += fd->vv.varint_put32(cp, dat+100, b->comp_size);
1542
446k
        cp += fd->vv.varint_put32(cp, dat+100, b->uncomp_size);
1543
446k
        crc = crc32(0L, (uc *)dat, cp-dat);
1544
1545
446k
        if (b->method == RAW) {
1546
361k
            b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->uncomp_size);
1547
361k
        } else {
1548
85.1k
            b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->comp_size);
1549
85.1k
        }
1550
1551
446k
        if (-1 == int32_encode(fd, b->crc32))
1552
0
            return -1;
1553
446k
    }
1554
1555
446k
    return 0;
1556
446k
}
1557
1558
/*
1559
 * Frees a CRAM block, deallocating internal data too.
1560
 */
1561
1.83M
void cram_free_block(cram_block *b) {
1562
1.83M
    if (!b)
1563
631k
        return;
1564
1.20M
    if (b->data)
1565
855k
        free(b->data);
1566
1.20M
    free(b);
1567
1.20M
}
1568
1569
/*
1570
 * Uncompresses a CRAM block, if compressed.
1571
 */
1572
9.82k
int cram_uncompress_block(cram_block *b) {
1573
9.82k
    char *uncomp;
1574
9.82k
    size_t uncomp_size = 0;
1575
1576
9.82k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
1577
    // Pretend the CRC was OK so the fuzzer doesn't have to get it right
1578
9.82k
    b->crc32_checked = 1;
1579
9.82k
#endif
1580
1581
9.82k
    if (b->crc32_checked == 0) {
1582
0
        uint32_t crc = crc32(b->crc_part, b->data ? b->data : (uc *)"", b->alloc);
1583
0
        b->crc32_checked = 1;
1584
0
        if (crc != b->crc32) {
1585
0
            hts_log_error("Block CRC32 failure");
1586
0
            return -1;
1587
0
        }
1588
0
    }
1589
1590
9.82k
    if (b->uncomp_size == 0) {
1591
        // blank block
1592
5.01k
        b->method = RAW;
1593
5.01k
        return 0;
1594
5.01k
    }
1595
9.82k
    assert(b->uncomp_size >= 0); // cram_read_block should ensure this
1596
1597
4.81k
    switch (b->method) {
1598
550
    case RAW:
1599
550
        return 0;
1600
1601
13
    case GZIP:
1602
13
        uncomp_size = b->uncomp_size;
1603
13
        uncomp = zlib_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
1604
1605
13
        if (!uncomp)
1606
9
            return -1;
1607
4
        if (uncomp_size != b->uncomp_size) {
1608
4
            free(uncomp);
1609
4
            return -1;
1610
4
        }
1611
0
        free(b->data);
1612
0
        b->data = (unsigned char *)uncomp;
1613
0
        b->alloc = uncomp_size;
1614
0
        b->method = RAW;
1615
0
        break;
1616
1617
0
#ifdef HAVE_LIBBZ2
1618
5
    case BZIP2: {
1619
5
        unsigned int usize = b->uncomp_size;
1620
5
        if (!(uncomp = malloc(usize)))
1621
0
            return -1;
1622
5
        if (BZ_OK != BZ2_bzBuffToBuffDecompress(uncomp, &usize,
1623
5
                                                (char *)b->data, b->comp_size,
1624
5
                                                0, 0)) {
1625
5
            free(uncomp);
1626
5
            return -1;
1627
5
        }
1628
0
        free(b->data);
1629
0
        b->data = (unsigned char *)uncomp;
1630
0
        b->alloc = usize;
1631
0
        b->method = RAW;
1632
0
        b->uncomp_size = usize; // Just in case it differs
1633
0
        break;
1634
5
    }
1635
#else
1636
    case BZIP2:
1637
        hts_log_error("Bzip2 compression is not compiled into this version. Please rebuild and try again");
1638
        return -1;
1639
#endif
1640
1641
0
#ifdef HAVE_LIBLZMA
1642
4
    case LZMA:
1643
4
        uncomp = lzma_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
1644
4
        if (!uncomp)
1645
2
            return -1;
1646
2
        if (uncomp_size != b->uncomp_size) {
1647
2
            free(uncomp);
1648
2
            return -1;
1649
2
        }
1650
0
        free(b->data);
1651
0
        b->data = (unsigned char *)uncomp;
1652
0
        b->alloc = uncomp_size;
1653
0
        b->method = RAW;
1654
0
        break;
1655
#else
1656
    case LZMA:
1657
        hts_log_error("Lzma compression is not compiled into this version. Please rebuild and try again");
1658
        return -1;
1659
        break;
1660
#endif
1661
1662
277
    case RANS: {
1663
277
        unsigned int usize = b->uncomp_size, usize2;
1664
277
        uncomp = (char *)rans_uncompress(b->data, b->comp_size, &usize2);
1665
277
        if (!uncomp)
1666
144
            return -1;
1667
133
        if (usize != usize2) {
1668
133
            free(uncomp);
1669
133
            return -1;
1670
133
        }
1671
0
        free(b->data);
1672
0
        b->data = (unsigned char *)uncomp;
1673
0
        b->alloc = usize2;
1674
0
        b->method = RAW;
1675
0
        b->uncomp_size = usize2; // Just in case it differs
1676
        //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1677
0
        break;
1678
133
    }
1679
1680
571
    case FQZ: {
1681
571
        uncomp_size = b->uncomp_size;
1682
571
        uncomp = fqz_decompress((char *)b->data, b->comp_size, &uncomp_size, NULL, 0);
1683
571
        if (!uncomp)
1684
310
            return -1;
1685
261
        free(b->data);
1686
261
        b->data = (unsigned char *)uncomp;
1687
261
        b->alloc = uncomp_size;
1688
261
        b->method = RAW;
1689
261
        b->uncomp_size = uncomp_size;
1690
261
        break;
1691
571
    }
1692
1693
526
    case RANS_PR0: {
1694
526
        unsigned int usize = b->uncomp_size, usize2;
1695
526
        uncomp = (char *)rans_uncompress_4x16(b->data, b->comp_size, &usize2);
1696
526
        if (!uncomp)
1697
311
            return -1;
1698
215
        if (usize != usize2) {
1699
109
            free(uncomp);
1700
109
            return -1;
1701
109
        }
1702
106
        b->orig_method = RANSPR;
1703
106
        free(b->data);
1704
106
        b->data = (unsigned char *)uncomp;
1705
106
        b->alloc = usize2;
1706
106
        b->method = RAW;
1707
106
        b->uncomp_size = usize2; // Just incase it differs
1708
        //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1709
106
        break;
1710
215
    }
1711
1712
234
    case ARITH_PR0: {
1713
234
        unsigned int usize = b->uncomp_size, usize2;
1714
234
        uncomp = (char *)arith_uncompress_to(b->data, b->comp_size, NULL, &usize2);
1715
234
        if (!uncomp)
1716
24
            return -1;
1717
210
        if (usize != usize2) {
1718
25
            free(uncomp);
1719
25
            return -1;
1720
25
        }
1721
185
        b->orig_method = ARITH;
1722
185
        free(b->data);
1723
185
        b->data = (unsigned char *)uncomp;
1724
185
        b->alloc = usize2;
1725
185
        b->method = RAW;
1726
185
        b->uncomp_size = usize2; // Just incase it differs
1727
        //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1728
185
        break;
1729
210
    }
1730
1731
2.62k
    case TOK3: {
1732
2.62k
        uint32_t out_len;
1733
2.62k
        uint8_t *cp = tok3_decode_names(b->data, b->comp_size, &out_len);
1734
2.62k
        if (!cp)
1735
2.51k
            return -1;
1736
111
        b->orig_method = TOK3;
1737
111
        b->method = RAW;
1738
111
        free(b->data);
1739
111
        b->data = cp;
1740
111
        b->alloc = out_len;
1741
111
        b->uncomp_size = out_len;
1742
111
        break;
1743
2.62k
    }
1744
1745
16
    default:
1746
16
        return -1;
1747
4.81k
    }
1748
1749
663
    return 0;
1750
4.81k
}
1751
1752
static char *cram_compress_by_method(cram_slice *s, char *in, size_t in_size,
1753
                                     int content_id, size_t *out_size,
1754
                                     enum cram_block_method_int method,
1755
1.00M
                                     int level, int strat) {
1756
1.00M
    switch (method) {
1757
113k
    case GZIP:
1758
187k
    case GZIP_RLE:
1759
347k
    case GZIP_1:
1760
        // Read names bizarrely benefit from zlib over libdeflate for
1761
        // mid-range compression levels.  Focusing purely of ratio or
1762
        // speed, libdeflate still wins.  It also seems to win for
1763
        // other data series too.
1764
        //
1765
        // Eg RN at level 5;  libdeflate=55.9MB  zlib=51.6MB
1766
#ifdef HAVE_LIBDEFLATE
1767
#  if (LIBDEFLATE_VERSION_MAJOR < 1 || (LIBDEFLATE_VERSION_MAJOR == 1 && LIBDEFLATE_VERSION_MINOR <= 8))
1768
        if (content_id == DS_RN && level >= 4 && level <= 7)
1769
            return zlib_mem_deflate(in, in_size, out_size, level, strat);
1770
        else
1771
#  endif
1772
            return libdeflate_deflate(in, in_size, out_size, level, strat);
1773
#else
1774
347k
        return zlib_mem_deflate(in, in_size, out_size, level, strat);
1775
0
#endif
1776
1777
0
    case BZIP2: {
1778
0
#ifdef HAVE_LIBBZ2
1779
0
        unsigned int comp_size = in_size*1.01 + 600;
1780
0
        char *comp = malloc(comp_size);
1781
0
        if (!comp)
1782
0
            return NULL;
1783
1784
0
        if (BZ_OK != BZ2_bzBuffToBuffCompress(comp, &comp_size,
1785
0
                                              in, in_size,
1786
0
                                              level, 0, 30)) {
1787
0
            free(comp);
1788
0
            return NULL;
1789
0
        }
1790
0
        *out_size = comp_size;
1791
0
        return comp;
1792
#else
1793
        return NULL;
1794
#endif
1795
0
    }
1796
1797
0
    case FQZ:
1798
0
    case FQZ_b:
1799
0
    case FQZ_c:
1800
0
    case FQZ_d: {
1801
        // Extract the necessary portion of the slice into an fqz_slice struct.
1802
        // These previously were the same thing, but this permits us to detach
1803
        // the codec from the rest of this CRAM implementation.
1804
0
        fqz_slice *f = malloc(2*s->hdr->num_records * sizeof(uint32_t) + sizeof(fqz_slice));
1805
0
        if (!f)
1806
0
            return NULL;
1807
0
        f->num_records = s->hdr->num_records;
1808
0
        f->len = (uint32_t *)(((char *)f) + sizeof(fqz_slice));
1809
0
        f->flags = f->len + s->hdr->num_records;
1810
0
        int i;
1811
0
        for (i = 0; i < s->hdr->num_records; i++) {
1812
0
            f->flags[i] = s->crecs[i].flags;
1813
0
            f->len[i] = (i+1 < s->hdr->num_records
1814
0
                         ? s->crecs[i+1].qual - s->crecs[i].qual
1815
0
                         : s->block[DS_QS]->uncomp_size - s->crecs[i].qual);
1816
0
        }
1817
0
        char *comp = fqz_compress(strat & 0xff /* cram vers */, f,
1818
0
                                  in, in_size, out_size, strat >> 8, NULL);
1819
0
        free(f);
1820
0
        return comp;
1821
0
    }
1822
1823
0
    case LZMA:
1824
0
#ifdef HAVE_LIBLZMA
1825
0
        return lzma_mem_deflate(in, in_size, out_size, level);
1826
#else
1827
        return NULL;
1828
#endif
1829
1830
0
    case RANS0:
1831
0
    case RANS1: {
1832
0
        unsigned int out_size_i;
1833
0
        unsigned char *cp;
1834
0
        cp = rans_compress((unsigned char *)in, in_size, &out_size_i,
1835
0
                           method == RANS0 ? 0 : 1);
1836
0
        *out_size = out_size_i;
1837
0
        return (char *)cp;
1838
0
    }
1839
1840
194k
    case RANS_PR0:
1841
268k
    case RANS_PR1:
1842
369k
    case RANS_PR64:
1843
443k
    case RANS_PR9:
1844
556k
    case RANS_PR128:
1845
556k
    case RANS_PR129:
1846
556k
    case RANS_PR192:
1847
630k
    case RANS_PR193: {
1848
630k
        unsigned int out_size_i;
1849
630k
        unsigned char *cp;
1850
1851
        // see enum cram_block. We map RANS_* methods to order bit-fields
1852
630k
        static int methmap[] = { 1, 64,9, 128,129, 192,193 };
1853
1854
630k
        int m = method == RANS_PR0 ? 0 : methmap[method - RANS_PR1];
1855
630k
        cp = rans_compress_4x16((unsigned char *)in, in_size, &out_size_i,
1856
630k
                                m | RANS_ORDER_SIMD_AUTO);
1857
630k
        *out_size = out_size_i;
1858
630k
        return (char *)cp;
1859
556k
    }
1860
1861
0
    case ARITH_PR0:
1862
0
    case ARITH_PR1:
1863
0
    case ARITH_PR64:
1864
0
    case ARITH_PR9:
1865
0
    case ARITH_PR128:
1866
0
    case ARITH_PR129:
1867
0
    case ARITH_PR192:
1868
0
    case ARITH_PR193: {
1869
0
        unsigned int out_size_i;
1870
0
        unsigned char *cp;
1871
1872
        // see enum cram_block. We map ARITH_* methods to order bit-fields
1873
0
        static int methmap[] = { 1, 64,9, 128,129, 192,193 };
1874
1875
0
        cp = arith_compress_to((unsigned char *)in, in_size, NULL, &out_size_i,
1876
0
                               method == ARITH_PR0 ? 0 : methmap[method - ARITH_PR1]);
1877
0
        *out_size = out_size_i;
1878
0
        return (char *)cp;
1879
0
    }
1880
1881
26.1k
    case TOK3:
1882
26.1k
    case TOKA: {
1883
26.1k
        int out_len;
1884
26.1k
        int lev = level;
1885
26.1k
        if (method == TOK3 && lev > 3)
1886
26.1k
            lev = 3;
1887
26.1k
        uint8_t *cp = tok3_encode_names(in, in_size, lev, strat, &out_len, NULL);
1888
26.1k
        *out_size = out_len;
1889
26.1k
        return (char *)cp;
1890
26.1k
    }
1891
1892
0
    case RAW:
1893
0
        break;
1894
1895
0
    default:
1896
0
        return NULL;
1897
1.00M
    }
1898
1899
0
    return NULL;
1900
1.00M
}
1901
1902
/*
1903
 * A copy of cram_compress_block2 with added recursion detection.
1904
 * This is only called for error handling where the auto-tuning has failed.
1905
 * The simplest way of doing this is recusion + an additional argument, but
1906
 * we didn't want to complicate the existing code hence this is static.
1907
 */
1908
static int cram_compress_block3(cram_fd *fd, cram_slice *s,
1909
                                cram_block *b, cram_metrics *metrics,
1910
                                int method, int level,
1911
678k
                                int recurse) {
1912
1913
678k
    if (!b)
1914
36.5k
        return 0;
1915
1916
641k
    int orig_method = method;
1917
641k
    char *comp = NULL;
1918
641k
    size_t comp_size = 0;
1919
641k
    int strat;
1920
1921
    // Internally we have parameterised methods that externally map
1922
    // to the same CRAM method value.
1923
    // See enum_cram_block_method_int in cram_structs.h.
1924
641k
    int methmap[] = {
1925
        // Externally defined values
1926
641k
        RAW, GZIP, BZIP2, LZMA, RANS, RANSPR, ARITH, FQZ, TOK3,
1927
1928
        // Reserved for possible expansion
1929
641k
        0, 0,
1930
1931
        // Internally parameterised versions matching back to above
1932
        // external values
1933
641k
        GZIP, GZIP,
1934
641k
        FQZ, FQZ, FQZ,
1935
641k
        RANS,
1936
641k
        RANSPR, RANSPR, RANSPR, RANSPR, RANSPR, RANSPR, RANSPR,
1937
641k
        TOK3,
1938
641k
        ARITH,  ARITH,  ARITH,  ARITH,  ARITH,  ARITH,  ARITH,
1939
641k
    };
1940
1941
641k
    if (b->method != RAW) {
1942
        // Maybe already compressed if s->block[0] was compressed and
1943
        // we have e.g. s->block[DS_BA] set to s->block[0] due to only
1944
        // one base type present and hence using E_HUFFMAN on block 0.
1945
        // A second explicit attempt to compress the same block then
1946
        // occurs.
1947
0
        return 0;
1948
0
    }
1949
1950
641k
    if (method == -1) {
1951
6.83k
        method = 1<<GZIP;
1952
6.83k
        if (fd->use_bz2)
1953
0
            method |= 1<<BZIP2;
1954
6.83k
        if (fd->use_lzma)
1955
0
            method |= 1<<LZMA;
1956
6.83k
    }
1957
1958
641k
    if (level == -1)
1959
6.83k
        level = fd->level;
1960
1961
    //fprintf(stderr, "IN: block %d, sz %d\n", b->content_id, b->uncomp_size);
1962
1963
641k
    if (method == RAW || level == 0 || b->uncomp_size == 0) {
1964
278k
        b->method = RAW;
1965
278k
        b->comp_size = b->uncomp_size;
1966
        //fprintf(stderr, "Skip block id %d\n", b->content_id);
1967
278k
        return 0;
1968
278k
    }
1969
1970
363k
#ifndef ABS
1971
363k
#    define ABS(a) ((a)>=0?(a):-(a))
1972
363k
#endif
1973
1974
363k
    if (metrics) {
1975
349k
        pthread_mutex_lock(&fd->metrics_lock);
1976
        // Sudden changes in size trigger a retrial.  These are mainly
1977
        // triggered when switching to sorted / unsorted, where the number
1978
        // of elements in a slice radically changes.
1979
        //
1980
        // We also get large fluctuations based on genome coordinate for
1981
        // e.g. SA:Z and SC series, but we consider the typical scale of
1982
        // delta between blocks and use this to look for abnormality.
1983
1984
        // Equivalent to (but minus possible integer overflow)
1985
        //   (b->uncomp_size + 1000)/4 > metrics->input_avg_sz+1000 ||
1986
        //    b->uncomp_size + 1000    < (metrics->input_avg_sz+1000)/4)
1987
349k
        if (metrics->input_avg_sz &&
1988
147k
            (b->uncomp_size/4 - 750 > metrics->input_avg_sz ||
1989
146k
             b->uncomp_size         < metrics->input_avg_sz/4 - 750) &&
1990
2.05k
            ABS(b->uncomp_size-metrics->input_avg_sz)/10
1991
2.05k
                > metrics->input_avg_delta) {
1992
93
            metrics->next_trial = 0;
1993
93
        }
1994
1995
349k
        if (metrics->trial > 0 || --metrics->next_trial <= 0) {
1996
99.4k
            int m, unpackable = metrics->unpackable;
1997
99.4k
            size_t sz_best = b->uncomp_size;
1998
99.4k
            size_t sz[CRAM_MAX_METHOD] = {0};
1999
99.4k
            int method_best = 0; // RAW
2000
99.4k
            char *c_best = NULL, *c = NULL;
2001
2002
99.4k
            metrics->input_avg_delta =
2003
99.4k
                0.9 * (metrics->input_avg_delta +
2004
99.4k
                       ABS(b->uncomp_size - metrics->input_avg_sz));
2005
2006
99.4k
            metrics->input_avg_sz += b->uncomp_size*.2;
2007
99.4k
            metrics->input_avg_sz *= 0.8;
2008
2009
99.4k
            if (metrics->revised_method)
2010
46.4k
                method = metrics->revised_method;
2011
53.0k
            else
2012
53.0k
                metrics->revised_method = method;
2013
2014
99.4k
            if (metrics->next_trial <= 0) {
2015
2.56k
                metrics->next_trial = TRIAL_SPAN;
2016
2.56k
                metrics->trial = NTRIALS;
2017
84.6k
                for (m = 0; m < CRAM_MAX_METHOD; m++)
2018
82.0k
                    metrics->sz[m] /= 2;
2019
2.56k
                metrics->unpackable = 0;
2020
2.56k
            }
2021
2022
            // Compress this block using the best method
2023
99.4k
            if (unpackable && CRAM_MAJOR_VERS(fd->version) > 3) {
2024
                // No point trying bit-pack if 17+ symbols.
2025
0
                if (method & (1<<RANS_PR128))
2026
0
                    method = (method|(1<<RANS_PR0))&~(1<<RANS_PR128);
2027
0
                if (method & (1<<RANS_PR129))
2028
0
                    method = (method|(1<<RANS_PR1))&~(1<<RANS_PR129);
2029
0
                if (method & (1<<RANS_PR192))
2030
0
                    method = (method|(1<<RANS_PR64))&~(1<<RANS_PR192);
2031
0
                if (method & (1<<RANS_PR193))
2032
0
                    method = (method|(1<<RANS_PR64)|(1<<RANS_PR1))&~(1<<RANS_PR193);
2033
2034
0
                if (method & (1<<ARITH_PR128))
2035
0
                    method = (method|(1<<ARITH_PR0))&~(1<<ARITH_PR128);
2036
0
                if (method & (1<<ARITH_PR129))
2037
0
                    method = (method|(1<<ARITH_PR1))&~(1<<ARITH_PR129);
2038
0
                if (method & (1<<ARITH_PR192))
2039
0
                    method = (method|(1<<ARITH_PR64))&~(1<<ARITH_PR192);
2040
0
                if (method & (1u<<ARITH_PR193))
2041
0
                    method = (method|(1<<ARITH_PR64)|(1<<ARITH_PR1))&~(1u<<ARITH_PR193);
2042
0
            }
2043
2044
            // Libdeflate doesn't have a Z_RLE strategy.
2045
            // We treat it as level 1, but iff we haven't also
2046
            // explicitly listed that in the method list.
2047
#ifdef HAVE_LIBDEFLATE
2048
            if ((method & (1<<GZIP_RLE)) && (method & (1<<GZIP_1)))
2049
                method &= ~(1<<GZIP_RLE);
2050
#endif
2051
2052
99.4k
            pthread_mutex_unlock(&fd->metrics_lock);
2053
2054
3.28M
            for (m = 0; m < CRAM_MAX_METHOD; m++) {
2055
3.18M
                if (method & (1u<<m)) {
2056
739k
                    int lvl = level;
2057
739k
                    switch (m) {
2058
99.4k
                    case GZIP:     strat = Z_FILTERED; break;
2059
99.4k
                    case GZIP_1:   strat = Z_DEFAULT_STRATEGY; lvl = 1; break;
2060
73.5k
                    case GZIP_RLE: strat = Z_RLE; break;
2061
0
                    case FQZ:      strat = CRAM_MAJOR_VERS(fd->version); break;
2062
0
                    case FQZ_b:    strat = CRAM_MAJOR_VERS(fd->version)+256; break;
2063
0
                    case FQZ_c:    strat = CRAM_MAJOR_VERS(fd->version)+2*256; break;
2064
0
                    case FQZ_d:    strat = CRAM_MAJOR_VERS(fd->version)+3*256; break;
2065
25.9k
                    case TOK3:     strat = 0; break;
2066
0
                    case TOKA:     strat = 1; break;
2067
441k
                    default:       strat = 0;
2068
739k
                    }
2069
2070
739k
                    c = cram_compress_by_method(s, (char *)b->data, b->uncomp_size,
2071
739k
                                                b->content_id, &sz[m], m, lvl, strat);
2072
2073
739k
                    if (c && sz_best > sz[m]) {
2074
66.5k
                        sz_best = sz[m];
2075
66.5k
                        method_best = m;
2076
66.5k
                        if (c_best)
2077
34.9k
                            free(c_best);
2078
66.5k
                        c_best = c;
2079
673k
                    } else if (c) {
2080
671k
                        free(c);
2081
671k
                    } else {
2082
1.94k
                        sz[m] = UINT_MAX; // arbitrarily worse than raw
2083
1.94k
                    }
2084
2.44M
                } else {
2085
2.44M
                    sz[m] = UINT_MAX; // arbitrarily worse than raw
2086
2.44M
                }
2087
3.18M
            }
2088
2089
99.4k
            if (c_best) {
2090
31.5k
                free(b->data);
2091
31.5k
                b->data = (unsigned char *)c_best;
2092
31.5k
                b->method = method_best; // adjusted to methmap[method_best] later
2093
31.5k
                b->comp_size = sz_best;
2094
31.5k
            }
2095
2096
            // Accumulate stats for all methods tried
2097
99.4k
            pthread_mutex_lock(&fd->metrics_lock);
2098
3.28M
            for (m = 0; m < CRAM_MAX_METHOD; m++)
2099
                // don't be overly sure on small blocks.
2100
                // +2000 means eg bzip2 vs gzip (1.07 to 1.04) or gz vs rans1
2101
                // needs to be at least 60 bytes smaller to overcome the
2102
                // fixed size addition.
2103
3.18M
                metrics->sz[m] += sz[m]+2000;
2104
2105
            // When enough trials performed, find the best on average
2106
99.4k
            if (--metrics->trial == 0) {
2107
24.5k
                int best_method = RAW;
2108
24.5k
                int best_sz = INT_MAX;
2109
2110
                // Relative costs of methods. See enum_cram_block_method_int
2111
                // and methmap
2112
24.5k
                double meth_cost[32] = {
2113
                    // Externally defined methods
2114
24.5k
                    1,    // 0  raw
2115
24.5k
                    1.04, // 1  gzip (Z_FILTERED)
2116
24.5k
                    1.07, // 2  bzip2
2117
24.5k
                    1.08, // 3  lzma
2118
24.5k
                    1.00, // 4  rans    (O0)
2119
24.5k
                    1.00, // 5  ranspr  (O0)
2120
24.5k
                    1.04, // 6  arithpr (O0)
2121
24.5k
                    1.05, // 7  fqz
2122
24.5k
                    1.05, // 8  tok3 (rans)
2123
24.5k
                    1.00, 1.00, // 9,10 reserved
2124
2125
                    // Paramterised versions of above
2126
24.5k
                    1.01, // gzip rle
2127
24.5k
                    1.01, // gzip -1
2128
2129
24.5k
                    1.05, 1.05, 1.05, // FQZ_b,c,d
2130
2131
24.5k
                    1.01, // rans O1
2132
2133
24.5k
                    1.01, // rans_pr1
2134
24.5k
                    1.00, // rans_pr64; if smaller, usually fast
2135
24.5k
                    1.03, // rans_pr65/9
2136
24.5k
                    1.00, // rans_pr128
2137
24.5k
                    1.01, // rans_pr129
2138
24.5k
                    1.00, // rans_pr192
2139
24.5k
                    1.01, // rans_pr193
2140
2141
24.5k
                    1.07, // tok3 arith
2142
2143
24.5k
                    1.04, // arith_pr1
2144
24.5k
                    1.04, // arith_pr64
2145
24.5k
                    1.04, // arith_pr9
2146
24.5k
                    1.03, // arith_pr128
2147
24.5k
                    1.04, // arith_pr129
2148
24.5k
                    1.04, // arith_pr192
2149
24.5k
                    1.04, // arith_pr193
2150
24.5k
                };
2151
2152
                // Scale methods by cost based on compression level
2153
24.5k
                if (fd->level <= 1) {
2154
0
                    for (m = 0; m < CRAM_MAX_METHOD; m++)
2155
0
                        metrics->sz[m] *= 1+(meth_cost[m]-1)*4;
2156
24.5k
                } else if (fd->level <= 3) {
2157
0
                    for (m = 0; m < CRAM_MAX_METHOD; m++)
2158
0
                        metrics->sz[m] *= 1+(meth_cost[m]-1);
2159
24.5k
                } else if (fd->level <= 6) {
2160
808k
                    for (m = 0; m < CRAM_MAX_METHOD; m++)
2161
784k
                        metrics->sz[m] *= 1+(meth_cost[m]-1)/2;
2162
24.5k
                } else if (fd->level <= 7) {
2163
0
                    for (m = 0; m < CRAM_MAX_METHOD; m++)
2164
0
                        metrics->sz[m] *= 1+(meth_cost[m]-1)/3;
2165
0
                } // else cost is ignored
2166
2167
                // Ensure these are never used; BSC and ZSTD
2168
24.5k
                metrics->sz[9] = metrics->sz[10] = INT_MAX;
2169
2170
808k
                for (m = 0; m < CRAM_MAX_METHOD; m++) {
2171
784k
                    if ((!metrics->sz[m]) || (!(method & (1u<<m))))
2172
617k
                        continue;
2173
2174
166k
                    if (best_sz > metrics->sz[m])
2175
58.9k
                        best_sz = metrics->sz[m], best_method = m;
2176
166k
                }
2177
2178
24.5k
                if (best_method != metrics->method) {
2179
                    //metrics->trial = (NTRIALS+1)/2; // be sure
2180
                    //metrics->next_trial /= 1.5;
2181
11.7k
                    metrics->consistency = 0;
2182
12.7k
                } else {
2183
12.7k
                    metrics->next_trial *= MIN(2, 1+metrics->consistency/4.0);
2184
12.7k
                    metrics->consistency++;
2185
12.7k
                }
2186
2187
24.5k
                metrics->method = best_method;
2188
24.5k
                switch (best_method) {
2189
139
                case GZIP:     strat = Z_FILTERED; break;
2190
9.13k
                case GZIP_1:   strat = Z_DEFAULT_STRATEGY; break;
2191
5
                case GZIP_RLE: strat = Z_RLE; break;
2192
0
                case FQZ:      strat = CRAM_MAJOR_VERS(fd->version); break;
2193
0
                case FQZ_b:    strat = CRAM_MAJOR_VERS(fd->version)+256; break;
2194
0
                case FQZ_c:    strat = CRAM_MAJOR_VERS(fd->version)+2*256; break;
2195
0
                case FQZ_d:    strat = CRAM_MAJOR_VERS(fd->version)+3*256; break;
2196
315
                case TOK3:     strat = 0; break;
2197
0
                case TOKA:     strat = 1; break;
2198
14.9k
                default:       strat = 0;
2199
24.5k
                }
2200
24.5k
                metrics->strat  = strat;
2201
2202
                // If we see at least MAXFAIL trials in a row for a specific
2203
                // compression method with more than MAXDELTA aggregate
2204
                // size then we drop this from the list of methods used
2205
                // for this block type.
2206
234k
#define MAXDELTA 0.20
2207
559k
#define MAXFAILS 4
2208
808k
                for (m = 0; m < CRAM_MAX_METHOD; m++) {
2209
784k
                    if (best_method == m) {
2210
24.5k
                        metrics->cnt[m] = 0;
2211
24.5k
                        metrics->extra[m] = 0;
2212
759k
                    } else if (best_sz < metrics->sz[m]) {
2213
559k
                        double r = (double)metrics->sz[m] / best_sz - 1;
2214
559k
                        int mul = 1+(fd->level>=7);
2215
559k
                        if (++metrics->cnt[m] >= MAXFAILS*mul &&
2216
234k
                            (metrics->extra[m] += r) >= MAXDELTA*mul)
2217
100k
                            method &= ~(1u<<m);
2218
2219
                        // Special case for fqzcomp as it rarely changes
2220
559k
                        if (m == FQZ || m == FQZ_b || m == FQZ_c || m == FQZ_d) {
2221
92.9k
                            if (metrics->sz[m] > best_sz)
2222
92.9k
                                method &= ~(1u<<m);
2223
92.9k
                        }
2224
559k
                    }
2225
784k
                }
2226
2227
                //if (fd->verbose > 1 && method != metrics->revised_method)
2228
                //    fprintf(stderr, "%d: revising method from %x to %x\n",
2229
                //          b->content_id, metrics->revised_method, method);
2230
24.5k
                metrics->revised_method = method;
2231
24.5k
            }
2232
99.4k
            pthread_mutex_unlock(&fd->metrics_lock);
2233
249k
        } else {
2234
249k
            metrics->input_avg_delta =
2235
249k
                0.9 * (metrics->input_avg_delta +
2236
249k
                       ABS(b->uncomp_size - metrics->input_avg_sz));
2237
2238
249k
            metrics->input_avg_sz += b->uncomp_size*.2;
2239
249k
            metrics->input_avg_sz *= 0.8;
2240
2241
249k
            strat = metrics->strat;
2242
249k
            method = metrics->method;
2243
2244
249k
            pthread_mutex_unlock(&fd->metrics_lock);
2245
249k
            comp = cram_compress_by_method(s, (char *)b->data, b->uncomp_size,
2246
249k
                                           b->content_id, &comp_size, method,
2247
249k
                                           method == GZIP_1 ? 1 : level,
2248
249k
                                           strat);
2249
249k
            if (!comp) {
2250
                // Our cached best method failed, but maybe another works?
2251
                // Rerun with trial mode engaged again.
2252
227
                if (!recurse) {
2253
227
                    hts_log_warning("Compressed block ID %d method %s failed, "
2254
227
                                    "redoing trial", b->content_id,
2255
227
                                    cram_block_method2str(method));
2256
227
                    pthread_mutex_lock(&fd->metrics_lock);
2257
227
                    metrics->trial = NTRIALS;
2258
227
                    metrics->next_trial = TRIAL_SPAN;
2259
227
                    metrics->revised_method = orig_method;
2260
227
                    pthread_mutex_unlock(&fd->metrics_lock);
2261
227
                    return cram_compress_block3(fd, s, b, metrics, method,
2262
227
                                                level, 1);
2263
227
                }
2264
0
                return -1;
2265
227
            }
2266
2267
249k
            if (comp_size < b->uncomp_size) {
2268
51.5k
                free(b->data);
2269
51.5k
                b->data = (unsigned char *)comp;
2270
51.5k
                b->comp_size = comp_size;
2271
51.5k
                b->method = method;
2272
197k
            } else {
2273
197k
                free(comp);
2274
197k
            }
2275
249k
        }
2276
2277
349k
    } else {
2278
        // no cached metrics, so just do zlib?
2279
14.2k
        comp = cram_compress_by_method(s, (char *)b->data, b->uncomp_size,
2280
14.2k
                                       b->content_id, &comp_size, GZIP, level, Z_FILTERED);
2281
14.2k
        if (!comp) {
2282
0
            hts_log_error("Compression failed!");
2283
0
            return -1;
2284
0
        }
2285
2286
14.2k
        if (comp_size < b->uncomp_size) {
2287
2.12k
            free(b->data);
2288
2.12k
            b->data = (unsigned char *)comp;
2289
2.12k
            b->comp_size = comp_size;
2290
2.12k
            b->method = GZIP;
2291
12.0k
        } else {
2292
12.0k
            free(comp);
2293
12.0k
        }
2294
14.2k
        strat = Z_FILTERED;
2295
14.2k
    }
2296
2297
363k
    hts_log_info("Compressed block ID %d from %d to %d by method %s",
2298
363k
                 b->content_id, b->uncomp_size, b->comp_size,
2299
363k
                 cram_block_method2str(b->method));
2300
2301
363k
    b->method = methmap[b->method];
2302
2303
363k
    return 0;
2304
363k
}
2305
2306
/*
2307
 * Compresses a block using a selection of compression codecs and options.
2308
 * The best is learnt and used for subsequent slices, periodically resampling.
2309
 *
2310
 * Method and level -1 implies defaults, as specified in cram_fd.
2311
 */
2312
int cram_compress_block2(cram_fd *fd, cram_slice *s,
2313
                         cram_block *b, cram_metrics *metrics,
2314
677k
                         int method, int level) {
2315
677k
    return cram_compress_block3(fd, s, b, metrics, method, level, 0);
2316
677k
}
2317
2318
int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics,
2319
6.83k
                        int method, int level) {
2320
6.83k
    return cram_compress_block2(fd, NULL, b, metrics, method, level);
2321
6.83k
}
2322
2323
716k
cram_metrics *cram_new_metrics(void) {
2324
716k
    cram_metrics *m = calloc(1, sizeof(*m));
2325
716k
    if (!m)
2326
0
        return NULL;
2327
716k
    m->trial = NTRIALS-1;
2328
716k
    m->next_trial = TRIAL_SPAN/2; // learn quicker at start
2329
716k
    m->method = RAW;
2330
716k
    m->strat = 0;
2331
716k
    m->revised_method = 0;
2332
716k
    m->unpackable = 0;
2333
2334
716k
    return m;
2335
716k
}
2336
2337
363k
char *cram_block_method2str(enum cram_block_method_int m) {
2338
363k
    switch(m) {
2339
277k
    case RAW:         return "RAW";
2340
5.92k
    case GZIP:        return "GZIP";
2341
0
    case BZIP2:       return "BZIP2";
2342
0
    case LZMA:        return "LZMA";
2343
0
    case RANS0:       return "RANS0";
2344
0
    case RANS1:       return "RANS1";
2345
57
    case GZIP_RLE:    return "GZIP_RLE";
2346
11.3k
    case GZIP_1:      return "GZIP_1";
2347
0
    case FQZ:         return "FQZ";
2348
0
    case FQZ_b:       return "FQZ_b";
2349
0
    case FQZ_c:       return "FQZ_c";
2350
0
    case FQZ_d:       return "FQZ_d";
2351
2.98k
    case RANS_PR0:    return "RANS_PR0";
2352
534
    case RANS_PR1:    return "RANS_PR1";
2353
23.2k
    case RANS_PR64:   return "RANS_PR64";
2354
13
    case RANS_PR9:    return "RANS_PR9";
2355
40.7k
    case RANS_PR128:  return "RANS_PR128";
2356
0
    case RANS_PR129:  return "RANS_PR129";
2357
0
    case RANS_PR192:  return "RANS_PR192";
2358
428
    case RANS_PR193:  return "RANS_PR193";
2359
232
    case TOK3:        return "TOK3_R";
2360
0
    case TOKA:        return "TOK3_A";
2361
0
    case ARITH_PR0:   return "ARITH_PR0";
2362
0
    case ARITH_PR1:   return "ARITH_PR1";
2363
0
    case ARITH_PR64:  return "ARITH_PR64";
2364
0
    case ARITH_PR9:   return "ARITH_PR9";
2365
0
    case ARITH_PR128: return "ARITH_PR128";
2366
0
    case ARITH_PR129: return "ARITH_PR129";
2367
0
    case ARITH_PR192: return "ARITH_PR192";
2368
0
    case ARITH_PR193: return "ARITH_PR193";
2369
0
    case BM_ERROR: break;
2370
363k
    }
2371
0
    return "?";
2372
363k
}
2373
2374
11
char *cram_content_type2str(enum cram_content_type t) {
2375
11
    switch (t) {
2376
9
    case FILE_HEADER:         return "FILE_HEADER";
2377
0
    case COMPRESSION_HEADER:  return "COMPRESSION_HEADER";
2378
0
    case MAPPED_SLICE:        return "MAPPED_SLICE";
2379
0
    case UNMAPPED_SLICE:      return "UNMAPPED_SLICE";
2380
0
    case EXTERNAL:            return "EXTERNAL";
2381
0
    case CORE:                return "CORE";
2382
0
    case CT_ERROR:            break;
2383
11
    }
2384
2
    return "?";
2385
11
}
2386
2387
/* ----------------------------------------------------------------------
2388
 * Reference sequence handling
2389
 *
2390
 * These revolve around the refs_t structure, which may potentially be
2391
 * shared between multiple cram_fd.
2392
 *
2393
 * We start with refs_create() to allocate an empty refs_t and then
2394
 * populate it with @SQ line data using refs_from_header(). This is done on
2395
 * cram_open().  Also at start up we can call cram_load_reference() which
2396
 * is used with "scramble -r foo.fa". This replaces the fd->refs with the
2397
 * new one specified. In either case refs2id() is then called which
2398
 * maps ref_entry names to @SQ ids (refs_t->ref_id[]).
2399
 *
2400
 * Later, possibly within a thread, we will want to know the actual ref
2401
 * seq itself, obtained by calling cram_get_ref().  This may use the
2402
 * UR: or M5: fields or the filename specified in the original
2403
 * cram_load_reference() call.
2404
 *
2405
 * Given the potential for multi-threaded reference usage, we have
2406
 * reference counting (sorry for the confusing double use of "ref") to
2407
 * track the number of callers interested in any specific reference.
2408
 */
2409
2410
/*
2411
 * Frees/unmaps a reference sequence and associated file handles.
2412
 */
2413
7.40k
static void ref_entry_free_seq(ref_entry *e) {
2414
7.40k
    if (e->mf)
2415
0
        mfclose(e->mf);
2416
7.40k
    if (e->seq && !e->mf)
2417
0
        free(e->seq);
2418
2419
7.40k
    e->seq = NULL;
2420
7.40k
    e->mf = NULL;
2421
7.40k
}
2422
2423
15.0k
void refs_free(refs_t *r) {
2424
15.0k
    RP("refs_free()\n");
2425
2426
15.0k
    if (--r->count > 0)
2427
0
        return;
2428
2429
15.0k
    if (!r)
2430
0
        return;
2431
2432
15.0k
    if (r->pool)
2433
15.0k
        string_pool_destroy(r->pool);
2434
2435
15.0k
    if (r->h_meta) {
2436
15.0k
        khint_t k;
2437
2438
36.0k
        for (k = kh_begin(r->h_meta); k != kh_end(r->h_meta); k++) {
2439
21.0k
            ref_entry *e;
2440
2441
21.0k
            if (!kh_exist(r->h_meta, k))
2442
13.6k
                continue;
2443
7.40k
            if (!(e = kh_val(r->h_meta, k)))
2444
0
                continue;
2445
7.40k
            ref_entry_free_seq(e);
2446
7.40k
            free(e);
2447
7.40k
        }
2448
2449
15.0k
        kh_destroy(refs, r->h_meta);
2450
15.0k
    }
2451
2452
15.0k
    if (r->ref_id)
2453
7.90k
        free(r->ref_id);
2454
2455
15.0k
    if (r->fp)
2456
0
        bgzf_close(r->fp);
2457
2458
15.0k
    pthread_mutex_destroy(&r->lock);
2459
2460
15.0k
    free(r);
2461
15.0k
}
2462
2463
15.0k
static refs_t *refs_create(void) {
2464
15.0k
    refs_t *r = calloc(1, sizeof(*r));
2465
2466
15.0k
    RP("refs_create()\n");
2467
2468
15.0k
    if (!r)
2469
0
        return NULL;
2470
2471
15.0k
    if (!(r->pool = string_pool_create(8192)))
2472
0
        goto err;
2473
2474
15.0k
    r->ref_id = NULL; // see refs2id() to populate.
2475
15.0k
    r->count = 1;
2476
15.0k
    r->last = NULL;
2477
15.0k
    r->last_id = -1;
2478
2479
15.0k
    if (!(r->h_meta = kh_init(refs)))
2480
0
        goto err;
2481
2482
15.0k
    pthread_mutex_init(&r->lock, NULL);
2483
2484
15.0k
    return r;
2485
2486
0
 err:
2487
0
    refs_free(r);
2488
0
    return NULL;
2489
15.0k
}
2490
2491
/*
2492
 * Opens a reference fasta file as a BGZF stream, allowing for
2493
 * compressed files.  It automatically builds a .fai file if
2494
 * required and if compressed a .gzi bgzf index too.
2495
 *
2496
 * Returns a BGZF handle on success;
2497
 *         NULL on failure.
2498
 */
2499
1.93k
static BGZF *bgzf_open_ref(char *fn, char *mode, int is_md5) {
2500
1.93k
    BGZF *fp;
2501
2502
1.93k
    if (strncmp(fn, "file://", 7) == 0)
2503
0
        fn += 7;
2504
2505
1.93k
    if (!is_md5 && !hisremote(fn)) {
2506
1.15k
        char fai_file[PATH_MAX];
2507
2508
1.15k
        snprintf(fai_file, PATH_MAX, "%s.fai", fn);
2509
1.15k
        if (access(fai_file, R_OK) != 0)
2510
1.15k
            if (fai_build(fn) != 0)
2511
1.15k
                return NULL;
2512
1.15k
    }
2513
2514
780
    if (!(fp = bgzf_open(fn, mode))) {
2515
780
        perror(fn);
2516
780
        return NULL;
2517
780
    }
2518
2519
0
    if (fp->is_compressed == 1 && bgzf_index_load(fp, fn, ".gzi") < 0) {
2520
0
        hts_log_error("Unable to load .gzi index '%s.gzi'", fn);
2521
0
        bgzf_close(fp);
2522
0
        return NULL;
2523
0
    }
2524
2525
0
    return fp;
2526
0
}
2527
2528
/*
2529
 * Loads a FAI file for a reference.fasta.
2530
 * "is_err" indicates whether failure to load is worthy of emitting an
2531
 * error message. In some cases (eg with embedded references) we
2532
 * speculatively load, just in case, and silently ignore errors.
2533
 *
2534
 * Returns the refs_t struct on success (maybe newly allocated);
2535
 *         NULL on failure
2536
 */
2537
1.93k
static refs_t *refs_load_fai(refs_t *r_orig, const char *fn, int is_err) {
2538
1.93k
    hFILE *fp = NULL;
2539
1.93k
    char fai_fn[PATH_MAX];
2540
1.93k
    char line[8192];
2541
1.93k
    refs_t *r = r_orig;
2542
1.93k
    size_t fn_l = strlen(fn);
2543
1.93k
    int id = 0, id_alloc = 0;
2544
2545
1.93k
    RP("refs_load_fai %s\n", fn);
2546
2547
1.93k
    if (!r)
2548
0
        if (!(r = refs_create()))
2549
0
            goto err;
2550
2551
1.93k
    if (r->fp)
2552
0
        if (bgzf_close(r->fp) != 0)
2553
0
            goto err;
2554
1.93k
    r->fp = NULL;
2555
2556
    /* Look for a FASTA##idx##FAI format */
2557
1.93k
    char *fn_delim = strstr(fn, HTS_IDX_DELIM);
2558
1.93k
    if (fn_delim) {
2559
38
        if (!(r->fn = string_ndup(r->pool, fn, fn_delim - fn)))
2560
0
            goto err;
2561
38
        fn_delim += strlen(HTS_IDX_DELIM);
2562
38
        snprintf(fai_fn, PATH_MAX, "%s", fn_delim);
2563
1.89k
    } else {
2564
        /* An index file was provided, instead of the actual reference file */
2565
1.89k
        if (fn_l > 4 && strcmp(&fn[fn_l-4], ".fai") == 0) {
2566
2
            if (!r->fn) {
2567
1
                if (!(r->fn = string_ndup(r->pool, fn, fn_l-4)))
2568
0
                    goto err;
2569
1
            }
2570
2
            snprintf(fai_fn, PATH_MAX, "%s", fn);
2571
1.89k
        } else {
2572
        /* Only the reference file provided. Get the index file name from it */
2573
1.89k
            if (!(r->fn = string_dup(r->pool, fn)))
2574
0
                goto err;
2575
1.89k
            snprintf(fai_fn, PATH_MAX, "%.*s.fai", PATH_MAX-5, fn);
2576
1.89k
        }
2577
1.89k
    }
2578
2579
1.93k
    if (!(r->fp = bgzf_open_ref(r->fn, "r", 0))) {
2580
1.93k
        hts_log_error("Failed to open reference file '%s'", r->fn);
2581
1.93k
        goto err;
2582
1.93k
    }
2583
2584
0
    if (!(fp = hopen(fai_fn, "r"))) {
2585
0
        hts_log_error("Failed to open index file '%s'", fai_fn);
2586
0
        if (is_err)
2587
0
            perror(fai_fn);
2588
0
        goto err;
2589
0
    }
2590
0
    while (hgets(line, 8192, fp) != NULL) {
2591
0
        ref_entry *e = malloc(sizeof(*e));
2592
0
        char *cp;
2593
0
        int n;
2594
0
        khint_t k;
2595
2596
0
        if (!e)
2597
0
            return NULL;
2598
2599
        // id
2600
0
        for (cp = line; *cp && !isspace_c(*cp); cp++)
2601
0
            ;
2602
0
        *cp++ = 0;
2603
0
        e->name = string_dup(r->pool, line);
2604
2605
        // length
2606
0
        while (*cp && isspace_c(*cp))
2607
0
            cp++;
2608
0
        e->length = strtoll(cp, &cp, 10);
2609
2610
        // offset
2611
0
        while (*cp && isspace_c(*cp))
2612
0
            cp++;
2613
0
        e->offset = strtoll(cp, &cp, 10);
2614
2615
        // bases per line
2616
0
        while (*cp && isspace_c(*cp))
2617
0
            cp++;
2618
0
        e->bases_per_line = strtol(cp, &cp, 10);
2619
2620
        // line length
2621
0
        while (*cp && isspace_c(*cp))
2622
0
            cp++;
2623
0
        e->line_length = strtol(cp, &cp, 10);
2624
2625
        // filename
2626
0
        e->fn = r->fn;
2627
2628
0
        e->count = 0;
2629
0
        e->seq = NULL;
2630
0
        e->mf = NULL;
2631
0
        e->is_md5 = 0;
2632
0
        e->validated_md5 = 0;
2633
2634
0
        k = kh_put(refs, r->h_meta, e->name, &n);
2635
0
        if (-1 == n)  {
2636
0
            free(e);
2637
0
            return NULL;
2638
0
        }
2639
2640
0
        if (n) {
2641
0
            kh_val(r->h_meta, k) = e;
2642
0
        } else {
2643
0
            ref_entry *re = kh_val(r->h_meta, k);
2644
0
            if (re && (re->count != 0 || re->length != 0)) {
2645
                /* Keep old */
2646
0
                free(e);
2647
0
            } else {
2648
                /* Replace old */
2649
0
                if (re)
2650
0
                    free(re);
2651
0
                kh_val(r->h_meta, k) = e;
2652
0
            }
2653
0
        }
2654
2655
0
        if (id >= id_alloc) {
2656
0
            ref_entry **new_refs;
2657
0
            int x;
2658
2659
0
            id_alloc = id_alloc ?id_alloc*2 : 16;
2660
0
            new_refs = realloc(r->ref_id, id_alloc * sizeof(*r->ref_id));
2661
0
            if (!new_refs)
2662
0
                goto err;
2663
0
            r->ref_id = new_refs;
2664
2665
0
            for (x = id; x < id_alloc; x++)
2666
0
                r->ref_id[x] = NULL;
2667
0
        }
2668
0
        r->ref_id[id] = e;
2669
0
        r->nref = ++id;
2670
0
    }
2671
2672
0
    if(hclose(fp) < 0)
2673
0
        goto err;
2674
0
    return r;
2675
2676
1.93k
 err:
2677
1.93k
    if (fp)
2678
0
        hclose_abruptly(fp);
2679
2680
1.93k
    if (!r_orig)
2681
0
        refs_free(r);
2682
2683
1.93k
    return NULL;
2684
0
}
2685
2686
/*
2687
 * Verifies that the CRAM @SQ lines and .fai files match.
2688
 */
2689
0
static void sanitise_SQ_lines(cram_fd *fd) {
2690
0
    int i;
2691
2692
0
    if (!fd->header || !fd->header->hrecs)
2693
0
        return;
2694
2695
0
    if (!fd->refs || !fd->refs->h_meta)
2696
0
        return;
2697
2698
0
    for (i = 0; i < fd->header->hrecs->nref; i++) {
2699
0
        const char *name = fd->header->hrecs->ref[i].name;
2700
0
        khint_t k = kh_get(refs, fd->refs->h_meta, name);
2701
0
        ref_entry *r;
2702
2703
        // We may have @SQ lines which have no known .fai, but do not
2704
        // in themselves pose a problem because they are unused in the file.
2705
0
        if (k == kh_end(fd->refs->h_meta))
2706
0
            continue;
2707
2708
0
        if (!(r = (ref_entry *)kh_val(fd->refs->h_meta, k)))
2709
0
            continue;
2710
2711
0
        if (r->length && r->length != fd->header->hrecs->ref[i].len) {
2712
0
            assert(strcmp(r->name, fd->header->hrecs->ref[i].name) == 0);
2713
2714
            // Should we also check MD5sums here to ensure the correct
2715
            // reference was given?
2716
0
            hts_log_warning("Header @SQ length mismatch for ref %s, %"PRIhts_pos" vs %d",
2717
0
                            r->name, fd->header->hrecs->ref[i].len, (int)r->length);
2718
2719
            // Fixing the parsed @SQ header will make MD:Z: strings work
2720
            // and also stop it producing N for the sequence.
2721
0
            fd->header->hrecs->ref[i].len = r->length;
2722
0
        }
2723
0
    }
2724
0
}
2725
2726
/*
2727
 * Indexes references by the order they appear in a BAM file. This may not
2728
 * necessarily be the same order they appear in the fasta reference file.
2729
 *
2730
 * Returns 0 on success
2731
 *        -1 on failure
2732
 */
2733
6.83k
int refs2id(refs_t *r, sam_hdr_t *hdr) {
2734
6.83k
    int i;
2735
6.83k
    sam_hrecs_t *h = hdr->hrecs;
2736
2737
6.83k
    if (r->ref_id)
2738
3.49k
        free(r->ref_id);
2739
6.83k
    if (r->last)
2740
0
        r->last = NULL;
2741
2742
6.83k
    r->ref_id = calloc(h->nref, sizeof(*r->ref_id));
2743
6.83k
    if (!r->ref_id)
2744
0
        return -1;
2745
2746
6.83k
    r->nref = h->nref;
2747
11.4k
    for (i = 0; i < h->nref; i++) {
2748
4.63k
        khint_t k = kh_get(refs, r->h_meta, h->ref[i].name);
2749
4.63k
        if (k != kh_end(r->h_meta)) {
2750
4.63k
            r->ref_id[i] = kh_val(r->h_meta, k);
2751
4.63k
        } else {
2752
0
            hts_log_warning("Unable to find ref name '%s'", h->ref[i].name);
2753
0
        }
2754
4.63k
    }
2755
2756
6.83k
    return 0;
2757
6.83k
}
2758
2759
/*
2760
 * Generates refs_t entries based on @SQ lines in the header.
2761
 * Returns 0 on success
2762
 *         -1 on failure
2763
 */
2764
28.8k
static int refs_from_header(cram_fd *fd) {
2765
28.8k
    if (!fd)
2766
0
        return -1;
2767
2768
28.8k
    refs_t *r = fd->refs;
2769
28.8k
    if (!r)
2770
0
        return -1;
2771
2772
28.8k
    sam_hdr_t *h = fd->header;
2773
28.8k
    if (!h)
2774
7.20k
        return 0;
2775
2776
21.6k
    if (!h->hrecs) {
2777
13.5k
        if (-1 == sam_hdr_fill_hrecs(h))
2778
173
            return -1;
2779
13.5k
    }
2780
2781
21.4k
    if (h->hrecs->nref == 0)
2782
13.4k
        return 0;
2783
2784
    //fprintf(stderr, "refs_from_header for %p mode %c\n", fd, fd->mode);
2785
2786
    /* Existing refs are fine, as long as they're compatible with the hdr. */
2787
8.06k
    ref_entry **new_ref_id = realloc(r->ref_id, (r->nref + h->hrecs->nref) * sizeof(*r->ref_id));
2788
8.06k
    if (!new_ref_id)
2789
0
        return -1;
2790
8.06k
    r->ref_id = new_ref_id;
2791
2792
8.06k
    int i, j;
2793
    /* Copy info from h->ref[i] over to r */
2794
20.0k
    for (i = 0, j = r->nref; i < h->hrecs->nref; i++) {
2795
12.0k
        sam_hrec_type_t *ty;
2796
12.0k
        sam_hrec_tag_t *tag;
2797
12.0k
        khint_t k;
2798
12.0k
        int n;
2799
2800
12.0k
        k = kh_get(refs, r->h_meta, h->hrecs->ref[i].name);
2801
12.0k
        if (k != kh_end(r->h_meta))
2802
            // Ref already known about
2803
4.63k
            continue;
2804
2805
7.40k
        if (!(r->ref_id[j] = calloc(1, sizeof(ref_entry))))
2806
0
            return -1;
2807
2808
7.40k
        if (!h->hrecs->ref[i].name)
2809
0
            return -1;
2810
2811
7.40k
        r->ref_id[j]->name = string_dup(r->pool, h->hrecs->ref[i].name);
2812
7.40k
        if (!r->ref_id[j]->name) return -1;
2813
7.40k
        r->ref_id[j]->length = 0; // marker for not yet loaded
2814
2815
        /* Initialise likely filename if known */
2816
7.40k
        if ((ty = sam_hrecs_find_type_id(h->hrecs, "SQ", "SN", h->hrecs->ref[i].name))) {
2817
7.40k
            if ((tag = sam_hrecs_find_key(ty, "M5", NULL)))
2818
82
                r->ref_id[j]->fn = string_dup(r->pool, tag->str+3);
2819
2820
7.40k
            if ((tag = sam_hrecs_find_key(ty, "LN", NULL))) {
2821
                // LN tag used when constructing consensus reference
2822
7.40k
                r->ref_id[j]->LN_length = strtoll(tag->str+3, NULL, 0);
2823
                // See fuzz 382922241
2824
7.40k
                if (r->ref_id[j]->LN_length < 0)
2825
820
                    r->ref_id[j]->LN_length = 0;
2826
7.40k
            }
2827
7.40k
        }
2828
2829
7.40k
        k = kh_put(refs, r->h_meta, r->ref_id[j]->name, &n);
2830
7.40k
        if (n <= 0) // already exists or error
2831
0
            return -1;
2832
7.40k
        kh_val(r->h_meta, k) = r->ref_id[j];
2833
2834
7.40k
        j++;
2835
7.40k
    }
2836
8.06k
    r->nref = j;
2837
2838
8.06k
    return 0;
2839
8.06k
}
2840
2841
/*
2842
 * Attaches a header to a cram_fd.
2843
 *
2844
 * This should be used when creating a new cram_fd for writing where
2845
 * we have a header already constructed (eg from a file we've read
2846
 * in).
2847
 */
2848
7.01k
int cram_set_header2(cram_fd *fd, const sam_hdr_t *hdr) {
2849
7.01k
    if (!fd || !hdr )
2850
0
        return -1;
2851
2852
7.01k
    if (fd->header != hdr) {
2853
7.01k
        if (fd->header)
2854
0
            sam_hdr_destroy(fd->header);
2855
7.01k
        fd->header = sam_hdr_dup(hdr);
2856
7.01k
        if (!fd->header)
2857
0
            return -1;
2858
7.01k
    }
2859
7.01k
    return refs_from_header(fd);
2860
7.01k
}
2861
2862
0
int cram_set_header(cram_fd *fd, sam_hdr_t *hdr) {
2863
0
    return cram_set_header2(fd, hdr);
2864
0
}
2865
2866
/*
2867
 * Returns whether the path refers to a directory.
2868
 */
2869
0
static int is_directory(char *fn) {
2870
0
    struct stat buf;
2871
0
    if ( stat(fn,&buf) ) return 0;
2872
0
    return S_ISDIR(buf.st_mode);
2873
0
}
2874
2875
/*
2876
 * Converts a directory and a filename into an expanded path, replacing %s
2877
 * in directory with the filename and %[0-9]+s with portions of the filename
2878
 * Any remaining parts of filename are added to the end with /%s.
2879
 */
2880
0
static int expand_cache_path(char *path, char *dir, const char *fn) {
2881
0
    char *cp, *start = path;
2882
0
    size_t len;
2883
0
    size_t sz = PATH_MAX;
2884
2885
0
    while ((cp = strchr(dir, '%'))) {
2886
0
        if (cp-dir >= sz) return -1;
2887
0
        strncpy(path, dir, cp-dir);
2888
0
        path += cp-dir;
2889
0
        sz -= cp-dir;
2890
2891
0
        if (*++cp == 's') {
2892
0
            len = strlen(fn);
2893
0
            if (len >= sz) return -1;
2894
0
            strcpy(path, fn);
2895
0
            path += len;
2896
0
            sz -= len;
2897
0
            fn += len;
2898
0
            cp++;
2899
0
        } else if (*cp >= '0' && *cp <= '9') {
2900
0
            char *endp;
2901
0
            long l;
2902
2903
0
            l = strtol(cp, &endp, 10);
2904
0
            l = MIN(l, strlen(fn));
2905
0
            if (*endp == 's') {
2906
0
                if (l >= sz) return -1;
2907
0
                strncpy(path, fn, l);
2908
0
                path += l;
2909
0
                fn += l;
2910
0
                sz -= l;
2911
0
                *path = 0;
2912
0
                cp = endp+1;
2913
0
            } else {
2914
0
                if (sz < 3) return -1;
2915
0
                *path++ = '%';
2916
0
                *path++ = *cp++;
2917
0
            }
2918
0
        } else {
2919
0
            if (sz < 3) return -1;
2920
0
            *path++ = '%';
2921
0
            *path++ = *cp++;
2922
0
        }
2923
0
        dir = cp;
2924
0
    }
2925
2926
0
    len = strlen(dir);
2927
0
    if (len >= sz) return -1;
2928
0
    strcpy(path, dir);
2929
0
    path += len;
2930
0
    sz -= len;
2931
2932
0
    len = strlen(fn) + ((*fn && path > start && path[-1] != '/') ? 1 : 0);
2933
0
    if (len >= sz) return -1;
2934
0
    if (*fn && path > start && path[-1] != '/')
2935
0
        *path++ = '/';
2936
0
    strcpy(path, fn);
2937
0
    return 0;
2938
0
}
2939
2940
/*
2941
 * Make the directory containing path and any prefix directories.
2942
 */
2943
0
static void mkdir_prefix(char *path, int mode) {
2944
0
    char *cp = strrchr(path, '/');
2945
0
    if (!cp)
2946
0
        return;
2947
2948
0
    *cp = 0;
2949
0
    if (is_directory(path)) {
2950
0
        *cp = '/';
2951
0
        return;
2952
0
    }
2953
2954
0
    if (mkdir(path, mode) == 0) {
2955
0
        chmod(path, mode);
2956
0
        *cp = '/';
2957
0
        return;
2958
0
    }
2959
2960
0
    mkdir_prefix(path, mode);
2961
0
    mkdir(path, mode);
2962
0
    chmod(path, mode);
2963
0
    *cp = '/';
2964
0
}
2965
2966
/*
2967
 * Queries the M5 string from the header and attempts to populate the
2968
 * reference from this using the REF_PATH environment.
2969
 *
2970
 * Returns 0 on success
2971
 *        -1 on failure
2972
 */
2973
3.85k
static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
2974
3.85k
    char *ref_path = getenv("REF_PATH");
2975
3.85k
    sam_hrec_type_t *ty;
2976
3.85k
    sam_hrec_tag_t *tag;
2977
3.85k
    char path[PATH_MAX];
2978
3.85k
    kstring_t path_tmp = KS_INITIALIZE;
2979
3.85k
    char *local_cache = getenv("REF_CACHE");
2980
3.85k
    mFILE *mf;
2981
3.85k
    int local_path = 0;
2982
2983
3.85k
    hts_log_info("Running cram_populate_ref on fd %p, id %d", (void *)fd, id);
2984
2985
3.85k
    if (!r->name)
2986
0
        return -1;
2987
2988
3.85k
    if (!(ty = sam_hrecs_find_type_id(fd->header->hrecs, "SQ", "SN", r->name)))
2989
0
        return -1;
2990
2991
3.85k
    if (!(tag = sam_hrecs_find_key(ty, "M5", NULL)))
2992
3.81k
        goto no_M5;
2993
2994
38
    hts_log_info("Querying ref %s", tag->str+3);
2995
2996
    /* Use cache if available */
2997
38
    if (local_cache && *local_cache) {
2998
0
        struct stat sb;
2999
0
        if (expand_cache_path(path, local_cache, tag->str+3) == 0 &&
3000
0
            stat(path, &sb) == 0)
3001
            // Found it in the local cache
3002
0
            local_path = 1;
3003
0
    }
3004
3005
#ifndef HAVE_MMAP
3006
    char *path2;
3007
    /* Search local files in REF_PATH; we can open them and return as above */
3008
    if (!local_path && (path2 = find_path(tag->str+3, ref_path))) {
3009
        int len = snprintf(path, PATH_MAX, "%s", path2);
3010
        free(path2);
3011
        if (len > 0 && len < PATH_MAX) // in case it's too long
3012
            local_path = 1;
3013
    }
3014
#endif
3015
3016
    /* Found via REF_CACHE or local REF_PATH file */
3017
38
    if (local_path) {
3018
0
        struct stat sb;
3019
0
        BGZF *fp;
3020
3021
0
        if (0 == stat(path, &sb)
3022
0
            && S_ISREG(sb.st_mode)
3023
0
            && (fp = bgzf_open(path, "r"))) {
3024
0
            r->length = sb.st_size;
3025
0
            r->offset = r->line_length = r->bases_per_line = 0;
3026
3027
0
            r->fn = string_dup(fd->refs->pool, path);
3028
3029
0
            if (fd->refs->fp)
3030
0
                if (bgzf_close(fd->refs->fp) != 0)
3031
0
                    return -1;
3032
0
            fd->refs->fp = fp;
3033
0
            fd->refs->fn = r->fn;
3034
0
            r->is_md5 = 1;
3035
0
            r->validated_md5 = 1;
3036
3037
            // Fall back to cram_get_ref() where it'll do the actual
3038
            // reading of the file.
3039
0
            return 0;
3040
0
        }
3041
0
    }
3042
3043
3044
    /* Otherwise search full REF_PATH; slower as loads entire file */
3045
38
    int is_local = 0;
3046
38
    if ((mf = open_path_mfile(tag->str+3, ref_path, NULL, &is_local))) {
3047
0
        size_t sz;
3048
0
        r->seq = mfsteal(mf, &sz);
3049
0
        if (r->seq) {
3050
0
            r->mf = NULL;
3051
0
        } else {
3052
            // keep mf around as we couldn't detach
3053
0
            r->seq = mf->data;
3054
0
            r->mf = mf;
3055
0
        }
3056
0
        r->length = sz;
3057
0
        r->is_md5 = 1;
3058
0
        r->validated_md5 = 1;
3059
38
    } else {
3060
38
        refs_t *refs;
3061
38
        const char *fn;
3062
38
        sam_hrec_tag_t *UR_tag;
3063
3064
3.85k
    no_M5:
3065
        /* Failed to find in search path or M5 cache, see if @SQ UR: tag? */
3066
3.85k
        if (!(UR_tag = sam_hrecs_find_key(ty, "UR", NULL)))
3067
1.89k
            return -1;
3068
3069
1.95k
        if (strstr(UR_tag->str+3, "://") &&
3070
81
            strncmp(UR_tag->str+3, "file:", 5) != 0) {
3071
            // Documented as omitted, but accidentally supported until now
3072
24
            hts_log_error("UR tags pointing to remote files are not supported");
3073
24
            return -1;
3074
24
        }
3075
3076
1.93k
        fn = (strncmp(UR_tag->str+3, "file:", 5) == 0)
3077
1.93k
            ? UR_tag->str+8
3078
1.93k
            : UR_tag->str+3;
3079
3080
1.93k
        if (fd->refs->fp) {
3081
0
            if (bgzf_close(fd->refs->fp) != 0)
3082
0
                return -1;
3083
0
            fd->refs->fp = NULL;
3084
0
        }
3085
1.93k
        if (!(refs = refs_load_fai(fd->refs, fn, 0)))
3086
1.93k
            return -1;
3087
0
        sanitise_SQ_lines(fd);
3088
3089
0
        fd->refs = refs;
3090
0
        if (fd->refs->fp) {
3091
0
            if (bgzf_close(fd->refs->fp) != 0)
3092
0
                return -1;
3093
0
            fd->refs->fp = NULL;
3094
0
        }
3095
3096
0
        if (!fd->refs->fn)
3097
0
            return -1;
3098
3099
0
        if (-1 == refs2id(fd->refs, fd->header))
3100
0
            return -1;
3101
0
        if (!fd->refs->ref_id || !fd->refs->ref_id[id])
3102
0
            return -1;
3103
3104
        // Local copy already, so fall back to cram_get_ref().
3105
0
        return 0;
3106
0
    }
3107
3108
    /* Populate the local disk cache if required */
3109
0
    if (!is_local && local_cache && *local_cache) {
3110
0
        hFILE *fp;
3111
3112
0
        if (expand_cache_path(path, local_cache, tag->str+3) < 0) {
3113
0
            return 0; // Not fatal - we have the data already so keep going.
3114
0
        }
3115
0
        hts_log_info("Writing cache file '%s'", path);
3116
0
        mkdir_prefix(path, 01777);
3117
3118
0
        fp = hts_open_tmpfile(path, "wx", &path_tmp);
3119
0
        if (!fp) {
3120
0
            perror(path_tmp.s);
3121
0
            free(path_tmp.s);
3122
3123
            // Not fatal - we have the data already so keep going.
3124
0
            return 0;
3125
0
        }
3126
3127
        // Check md5sum
3128
0
        hts_md5_context *md5;
3129
0
        char unsigned md5_buf1[16];
3130
0
        char md5_buf2[33];
3131
3132
0
        if (!(md5 = hts_md5_init())) {
3133
0
            hclose_abruptly(fp);
3134
0
            unlink(path_tmp.s);
3135
0
            free(path_tmp.s);
3136
0
            return -1;
3137
0
        }
3138
0
        hts_md5_update(md5, r->seq, r->length);
3139
0
        hts_md5_final(md5_buf1, md5);
3140
0
        hts_md5_destroy(md5);
3141
0
        hts_md5_hex(md5_buf2, md5_buf1);
3142
3143
0
        if (strncmp(tag->str+3, md5_buf2, 32) != 0) {
3144
0
            hts_log_error("Mismatching md5sum for downloaded reference");
3145
0
            hclose_abruptly(fp);
3146
0
            unlink(path_tmp.s);
3147
0
            free(path_tmp.s);
3148
0
            return -1;
3149
0
        }
3150
3151
0
        ssize_t length_written = hwrite(fp, r->seq, r->length);
3152
0
        if (hclose(fp) < 0 || length_written != r->length ||
3153
0
            chmod(path_tmp.s, 0444) < 0 ||
3154
0
            rename(path_tmp.s, path) < 0) {
3155
0
            hts_log_error("Creating reference at %s failed: %s",
3156
0
                          path, strerror(errno));
3157
0
            unlink(path_tmp.s);
3158
0
        }
3159
0
    }
3160
3161
0
    free(path_tmp.s);
3162
0
    return 0;
3163
0
}
3164
3165
1.68k
static void cram_ref_incr_locked(refs_t *r, int id) {
3166
1.68k
    RP("%d INC REF %d, %d %p\n", gettid(), id,
3167
1.68k
       (int)(id>=0 && r->ref_id[id]?r->ref_id[id]->count+1:-999),
3168
1.68k
       id>=0 && r->ref_id[id]?r->ref_id[id]->seq:(char *)1);
3169
3170
1.68k
    if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq)
3171
1.68k
        return;
3172
3173
0
    if (r->last_id == id)
3174
0
        r->last_id = -1;
3175
3176
0
    ++r->ref_id[id]->count;
3177
0
}
3178
3179
1.68k
void cram_ref_incr(refs_t *r, int id) {
3180
1.68k
    pthread_mutex_lock(&r->lock);
3181
1.68k
    cram_ref_incr_locked(r, id);
3182
1.68k
    pthread_mutex_unlock(&r->lock);
3183
1.68k
}
3184
3185
110
static void cram_ref_decr_locked(refs_t *r, int id) {
3186
110
    RP("%d DEC REF %d, %d %p\n", gettid(), id,
3187
110
       (int)(id>=0 && r->ref_id[id]?r->ref_id[id]->count-1:-999),
3188
110
       id>=0 && r->ref_id[id]?r->ref_id[id]->seq:(char *)1);
3189
3190
110
    if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq) {
3191
110
        return;
3192
110
    }
3193
3194
0
    if (--r->ref_id[id]->count <= 0) {
3195
0
        assert(r->ref_id[id]->count == 0);
3196
0
        if (r->last_id >= 0) {
3197
0
            if (r->ref_id[r->last_id]->count <= 0 &&
3198
0
                r->ref_id[r->last_id]->seq) {
3199
0
                RP("%d FREE REF %d (%p)\n", gettid(),
3200
0
                   r->last_id, r->ref_id[r->last_id]->seq);
3201
0
                ref_entry_free_seq(r->ref_id[r->last_id]);
3202
0
                if (r->ref_id[r->last_id]->is_md5) r->ref_id[r->last_id]->length = 0;
3203
0
            }
3204
0
        }
3205
0
        r->last_id = id;
3206
0
    }
3207
0
}
3208
3209
110
void cram_ref_decr(refs_t *r, int id) {
3210
110
    pthread_mutex_lock(&r->lock);
3211
110
    cram_ref_decr_locked(r, id);
3212
110
    pthread_mutex_unlock(&r->lock);
3213
110
}
3214
3215
/*
3216
 * Used by cram_ref_load and cram_get_ref. The file handle will have
3217
 * already been opened, so we can catch it. The ref_entry *e informs us
3218
 * of whether this is a multi-line fasta file or a raw MD5 style file.
3219
 * Either way we create a single contiguous sequence.
3220
 *
3221
 * Returns all or part of a reference sequence on success (malloced);
3222
 *         NULL on failure.
3223
 */
3224
static char *load_ref_portion(BGZF *fp, ref_entry *e,
3225
0
                              hts_pos_t start, hts_pos_t end) {
3226
0
    off_t offset, len;
3227
0
    char *seq;
3228
3229
0
    if (end < start)
3230
0
        end = start;
3231
3232
    /*
3233
     * Compute locations in file. This is trivial for the MD5 files, but
3234
     * is still necessary for the fasta variants.
3235
     *
3236
     * Note the offset here, as with faidx, has the assumption that white-
3237
     * space (the diff between line_length and bases_per_line) only occurs
3238
     * at the end of a line of text.
3239
     */
3240
0
    offset = e->line_length
3241
0
        ? e->offset + (start-1)/e->bases_per_line * e->line_length +
3242
0
          (start-1) % e->bases_per_line
3243
0
        : start-1;
3244
3245
0
    len = (e->line_length
3246
0
           ? e->offset + (end-1)/e->bases_per_line * e->line_length +
3247
0
             (end-1) % e->bases_per_line
3248
0
           : end-1) - offset + 1;
3249
3250
0
    if (bgzf_useek(fp, offset, SEEK_SET) < 0) {
3251
0
        perror("bgzf_useek() on reference file");
3252
0
        return NULL;
3253
0
    }
3254
3255
0
    if (len == 0 || !(seq = malloc(len))) {
3256
0
        return NULL;
3257
0
    }
3258
3259
0
    if (len != bgzf_read(fp, seq, len)) {
3260
0
        perror("bgzf_read() on reference file");
3261
0
        free(seq);
3262
0
        return NULL;
3263
0
    }
3264
3265
    /* Strip white-space if required. */
3266
0
    if (len != end-start+1) {
3267
0
        hts_pos_t i, j;
3268
0
        char *cp = seq;
3269
0
        char *cp_to;
3270
3271
        // Copy up to the first white-space, and then repeatedly just copy
3272
        // bases_per_line verbatim, and use the slow method to end again.
3273
        //
3274
        // This may seem excessive, but this code can be a significant
3275
        // portion of total CRAM decode CPU time for shallow data sets.
3276
0
        for (i = j = 0; i < len; i++) {
3277
0
            if (!isspace_c(cp[i]))
3278
0
                cp[j++] = cp[i] & ~0x20;
3279
0
            else
3280
0
                break;
3281
0
        }
3282
0
        while (i < len && isspace_c(cp[i]))
3283
0
            i++;
3284
0
        while (i < len - e->line_length) {
3285
0
            hts_pos_t j_end = j + e->bases_per_line;
3286
0
            while (j < j_end)
3287
0
                cp[j++] = cp[i++] & ~0x20; // toupper equiv
3288
0
            i += e->line_length - e->bases_per_line;
3289
0
        }
3290
0
        for (; i < len; i++) {
3291
0
            if (!isspace_c(cp[i]))
3292
0
                cp[j++] = cp[i] & ~0x20;
3293
0
        }
3294
3295
0
        cp_to = cp+j;
3296
3297
0
        if (cp_to - seq != end-start+1) {
3298
0
            hts_log_error("Malformed reference file");
3299
0
            free(seq);
3300
0
            return NULL;
3301
0
        }
3302
0
    } else {
3303
0
        int i;
3304
0
        for (i = 0; i < len; i++) {
3305
0
            seq[i] = toupper_c(seq[i]);
3306
0
        }
3307
0
    }
3308
3309
0
    return seq;
3310
0
}
3311
3312
/*
3313
 * Load the entire reference 'id'.
3314
 * This also increments the reference count by 1.
3315
 *
3316
 * Returns ref_entry on success;
3317
 *         NULL on failure
3318
 */
3319
0
ref_entry *cram_ref_load(refs_t *r, int id, int is_md5) {
3320
0
    ref_entry *e = r->ref_id[id];
3321
0
    hts_pos_t start = 1, end = e->length;
3322
0
    char *seq;
3323
3324
0
    if (e->seq) {
3325
0
        return e;
3326
0
    }
3327
3328
0
    assert(e->count == 0);
3329
3330
0
    if (r->last) {
3331
#ifdef REF_DEBUG
3332
        int idx = 0;
3333
        for (idx = 0; idx < r->nref; idx++)
3334
            if (r->last == r->ref_id[idx])
3335
                break;
3336
        RP("%d cram_ref_load DECR %d\n", gettid(), idx);
3337
#endif
3338
0
        assert(r->last->count > 0);
3339
0
        if (--r->last->count <= 0) {
3340
0
            RP("%d FREE REF %d (%p)\n", gettid(), id, r->ref_id[id]->seq);
3341
0
            if (r->last->seq)
3342
0
                ref_entry_free_seq(r->last);
3343
0
        }
3344
0
    }
3345
3346
0
    if (!r->fn)
3347
0
        return NULL;
3348
3349
    /* Open file if it's not already the current open reference */
3350
0
    if (strcmp(r->fn, e->fn) || r->fp == NULL) {
3351
0
        if (r->fp)
3352
0
            if (bgzf_close(r->fp) != 0)
3353
0
                return NULL;
3354
0
        r->fn = e->fn;
3355
0
        if (!(r->fp = bgzf_open_ref(r->fn, "r", is_md5)))
3356
0
            return NULL;
3357
0
    }
3358
3359
0
    RP("%d Loading ref %d (%d..%d)\n", gettid(), id, start, end);
3360
3361
0
    if (!(seq = load_ref_portion(r->fp, e, start, end))) {
3362
0
        return NULL;
3363
0
    }
3364
3365
0
    RP("%d Loaded ref %d (%d..%d) = %p\n", gettid(), id, start, end, seq);
3366
3367
0
    RP("%d INC REF %d, %"PRId64"\n", gettid(), id, (e->count+1));
3368
0
    e->seq = seq;
3369
0
    e->mf = NULL;
3370
0
    e->count++;
3371
3372
    /*
3373
     * Also keep track of last used ref so incr/decr loops on the same
3374
     * sequence don't cause load/free loops.
3375
     */
3376
0
    RP("%d cram_ref_load INCR %d => %"PRId64"\n", gettid(), id, e->count+1);
3377
0
    r->last = e;
3378
0
    e->count++;
3379
3380
0
    return e;
3381
0
}
3382
3383
/*
3384
 * Returns a portion of a reference sequence from start to end inclusive.
3385
 * The returned pointer is owned by either the cram_file fd or by the
3386
 * internal refs_t structure and should not be freed  by the caller.
3387
 *
3388
 * The difference is whether or not this refs_t is in use by just the one
3389
 * cram_fd or by multiples, or whether we have multiple threads accessing
3390
 * references. In either case fd->shared will be true and we start using
3391
 * reference counting to track the number of users of a specific reference
3392
 * sequence.
3393
 *
3394
 * Otherwise the ref seq returned is allocated as part of cram_fd itself
3395
 * and will be freed up on the next call to cram_get_ref or cram_close.
3396
 *
3397
 * To return the entire reference sequence, specify start as 1 and end
3398
 * as 0.
3399
 *
3400
 * To cease using a reference, call cram_ref_decr().
3401
 *
3402
 * Returns reference on success,
3403
 *         NULL on failure
3404
 */
3405
5.96k
char *cram_get_ref(cram_fd *fd, int id, hts_pos_t start, hts_pos_t end) {
3406
5.96k
    ref_entry *r;
3407
5.96k
    char *seq;
3408
5.96k
    int ostart = start;
3409
3410
5.96k
    if (id == -1 || start < 1)
3411
2.09k
        return NULL;
3412
3413
    /* FIXME: axiomatic query of r->seq being true?
3414
     * Or shortcut for unsorted data where we load once and never free?
3415
     */
3416
3417
    //fd->shared_ref = 1; // hard code for now to simplify things
3418
3419
3.86k
    pthread_mutex_lock(&fd->ref_lock);
3420
3421
3.86k
    RP("%d cram_get_ref on fd %p, id %d, range %d..%d\n", gettid(), fd, id, start, end);
3422
3423
    /*
3424
     * Unsorted data implies we want to fetch an entire reference at a time.
3425
     * We just deal with this at the moment by claiming we're sharing
3426
     * references instead, which has the same requirement.
3427
     */
3428
3.86k
    if (fd->unsorted)
3429
0
        fd->shared_ref = 1;
3430
3431
3432
    /* Sanity checking: does this ID exist? */
3433
3.86k
    if (id >= fd->refs->nref) {
3434
18
        hts_log_error("No reference found for id %d", id);
3435
18
        pthread_mutex_unlock(&fd->ref_lock);
3436
18
        return NULL;
3437
18
    }
3438
3439
3.85k
    if (!fd->refs || !fd->refs->ref_id[id]) {
3440
0
        hts_log_error("No reference found for id %d", id);
3441
0
        pthread_mutex_unlock(&fd->ref_lock);
3442
0
        return NULL;
3443
0
    }
3444
3445
3.85k
    if (!(r = fd->refs->ref_id[id])) {
3446
0
        hts_log_error("No reference found for id %d", id);
3447
0
        pthread_mutex_unlock(&fd->ref_lock);
3448
0
        return NULL;
3449
0
    }
3450
3451
3452
    /*
3453
     * It has an entry, but may not have been populated yet.
3454
     * Any manually loaded .fai files have their lengths known.
3455
     * A ref entry computed from @SQ lines (M5 or UR field) will have
3456
     * r->length == 0 unless it's been loaded once and verified that we have
3457
     * an on-disk filename for it.
3458
     *
3459
     * 19 Sep 2013: Moved the lock here as the cram_populate_ref code calls
3460
     * open_path_mfile and libcurl, which isn't multi-thread safe unless I
3461
     * rewrite my code to have one curl handle per thread.
3462
     */
3463
3.85k
    pthread_mutex_lock(&fd->refs->lock);
3464
3.85k
    if (r->length == 0) {
3465
3.85k
        if (fd->ref_fn)
3466
0
            hts_log_warning("Reference file given, but ref '%s' not present",
3467
3.85k
                            r->name);
3468
3.85k
        if (cram_populate_ref(fd, id, r) == -1) {
3469
3.85k
            hts_log_warning("Failed to populate reference \"%s\"",
3470
3.85k
                            r->name);
3471
3.85k
            hts_log_warning("See https://www.htslib.org/doc/reference_seqs.html for further suggestions");
3472
3.85k
            pthread_mutex_unlock(&fd->refs->lock);
3473
3.85k
            pthread_mutex_unlock(&fd->ref_lock);
3474
3.85k
            return NULL;
3475
3.85k
        }
3476
0
        r = fd->refs->ref_id[id];
3477
0
        if (fd->unsorted)
3478
0
            cram_ref_incr_locked(fd->refs, id);
3479
0
    }
3480
3481
3482
    /*
3483
     * We now know that we the filename containing the reference, so check
3484
     * for limits. If it's over half the reference we'll load all of it in
3485
     * memory as this will speed up subsequent calls.
3486
     */
3487
0
    if (end < 1)
3488
0
        end = r->length;
3489
0
    if (end >= r->length)
3490
0
        end  = r->length;
3491
3492
0
    if (end - start >= 0.5*r->length || fd->shared_ref) {
3493
0
        start = 1;
3494
0
        end = r->length;
3495
0
    }
3496
3497
    /*
3498
     * Maybe we have it cached already? If so use it.
3499
     *
3500
     * Alternatively if we don't have the sequence but we're sharing
3501
     * references and/or are asking for the entire length of it, then
3502
     * load the full reference into the refs structure and return
3503
     * a pointer to that one instead.
3504
     */
3505
0
    if (fd->shared_ref || r->seq || (start == 1 && end == r->length)) {
3506
0
        char *cp;
3507
3508
0
        if (id >= 0) {
3509
0
            if (r->seq) {
3510
0
                cram_ref_incr_locked(fd->refs, id);
3511
0
            } else {
3512
0
                ref_entry *e;
3513
0
                if (!(e = cram_ref_load(fd->refs, id, r->is_md5))) {
3514
0
                    pthread_mutex_unlock(&fd->refs->lock);
3515
0
                    pthread_mutex_unlock(&fd->ref_lock);
3516
0
                    return NULL;
3517
0
                }
3518
3519
                /* unsorted data implies cache ref indefinitely, to avoid
3520
                 * continually loading and unloading.
3521
                 */
3522
0
                if (fd->unsorted)
3523
0
                    cram_ref_incr_locked(fd->refs, id);
3524
0
            }
3525
3526
0
            fd->ref = NULL; /* We never access it directly */
3527
0
            fd->ref_start = 1;
3528
0
            fd->ref_end   = r->length;
3529
0
            fd->ref_id    = id;
3530
3531
0
            cp = fd->refs->ref_id[id]->seq + ostart-1;
3532
0
        } else {
3533
0
            fd->ref = NULL;
3534
0
            cp = NULL;
3535
0
        }
3536
3537
0
        RP("%d cram_get_ref returning for id %d, count %d\n", gettid(), id, (int)r->count);
3538
3539
0
        pthread_mutex_unlock(&fd->refs->lock);
3540
0
        pthread_mutex_unlock(&fd->ref_lock);
3541
0
        return cp;
3542
0
    }
3543
3544
    /*
3545
     * Otherwise we're not sharing, we don't have a copy of it already and
3546
     * we're only asking for a small portion of it.
3547
     *
3548
     * In this case load up just that segment ourselves, freeing any old
3549
     * small segments in the process.
3550
     */
3551
3552
    /* Unmapped ref ID */
3553
0
    if (id < 0 || !fd->refs->fn) {
3554
0
        if (fd->ref_free) {
3555
0
            free(fd->ref_free);
3556
0
            fd->ref_free = NULL;
3557
0
        }
3558
0
        fd->ref = NULL;
3559
0
        fd->ref_id = id;
3560
0
        pthread_mutex_unlock(&fd->refs->lock);
3561
0
        pthread_mutex_unlock(&fd->ref_lock);
3562
0
        return NULL;
3563
0
    }
3564
3565
    /* Open file if it's not already the current open reference */
3566
0
    if (strcmp(fd->refs->fn, r->fn) || fd->refs->fp == NULL) {
3567
0
        if (fd->refs->fp)
3568
0
            if (bgzf_close(fd->refs->fp) != 0)
3569
0
                return NULL;
3570
0
        fd->refs->fn = r->fn;
3571
0
        if (!(fd->refs->fp = bgzf_open_ref(fd->refs->fn, "r", r->is_md5))) {
3572
0
            pthread_mutex_unlock(&fd->refs->lock);
3573
0
            pthread_mutex_unlock(&fd->ref_lock);
3574
0
            return NULL;
3575
0
        }
3576
0
    }
3577
3578
0
    if (!(fd->ref = load_ref_portion(fd->refs->fp, r, start, end))) {
3579
0
        pthread_mutex_unlock(&fd->refs->lock);
3580
0
        pthread_mutex_unlock(&fd->ref_lock);
3581
0
        return NULL;
3582
0
    }
3583
3584
0
    if (fd->ref_free)
3585
0
        free(fd->ref_free);
3586
3587
0
    fd->ref_id    = id;
3588
0
    fd->ref_start = start;
3589
0
    fd->ref_end   = end;
3590
0
    fd->ref_free = fd->ref;
3591
0
    seq = fd->ref;
3592
3593
0
    pthread_mutex_unlock(&fd->refs->lock);
3594
0
    pthread_mutex_unlock(&fd->ref_lock);
3595
3596
0
    return seq ? seq + ostart - start : NULL;
3597
0
}
3598
3599
/*
3600
 * If fd has been opened for reading, it may be permitted to specify 'fn'
3601
 * as NULL and let the code auto-detect the reference by parsing the
3602
 * SAM header @SQ lines.
3603
 */
3604
0
int cram_load_reference(cram_fd *fd, char *fn) {
3605
0
    int ret = 0;
3606
3607
0
    if (fn) {
3608
0
        fd->refs = refs_load_fai(fd->refs, fn,
3609
0
                                 !(fd->embed_ref>0 && fd->mode == 'r'));
3610
0
        fn = fd->refs ? fd->refs->fn : NULL;
3611
0
        if (!fn)
3612
0
            ret = -1;
3613
0
        sanitise_SQ_lines(fd);
3614
0
    }
3615
0
    fd->ref_fn = fn;
3616
3617
0
    if ((!fd->refs || (fd->refs->nref == 0 && !fn)) && fd->header) {
3618
0
        if (fd->refs)
3619
0
            refs_free(fd->refs);
3620
0
        if (!(fd->refs = refs_create()))
3621
0
            return -1;
3622
0
        if (-1 == refs_from_header(fd))
3623
0
            return -1;
3624
0
    }
3625
3626
0
    if (fd->header)
3627
0
        if (-1 == refs2id(fd->refs, fd->header))
3628
0
            return -1;
3629
3630
0
    return ret;
3631
0
}
3632
3633
/* ----------------------------------------------------------------------
3634
 * Containers
3635
 */
3636
3637
/*
3638
 * Creates a new container, specifying the maximum number of slices
3639
 * and records permitted.
3640
 *
3641
 * Returns cram_container ptr on success
3642
 *         NULL on failure
3643
 */
3644
45.5k
cram_container *cram_new_container(int nrec, int nslice) {
3645
45.5k
    cram_container *c = calloc(1, sizeof(*c));
3646
45.5k
    enum cram_DS_ID id;
3647
3648
45.5k
    if (!c)
3649
0
        return NULL;
3650
3651
45.5k
    c->curr_ref = -2;
3652
3653
45.5k
    c->max_c_rec = nrec * nslice;
3654
45.5k
    c->curr_c_rec = 0;
3655
3656
45.5k
    c->max_rec = nrec;
3657
45.5k
    c->record_counter = 0;
3658
45.5k
    c->num_bases = 0;
3659
45.5k
    c->s_num_bases = 0;
3660
3661
45.5k
    c->max_slice = nslice;
3662
45.5k
    c->curr_slice = 0;
3663
3664
45.5k
    c->pos_sorted = 1;
3665
45.5k
    c->max_apos   = 0;
3666
45.5k
    c->multi_seq  = 0;
3667
45.5k
    c->qs_seq_orient = 1;
3668
45.5k
    c->no_ref = 0;
3669
45.5k
    c->embed_ref = -1; // automatic selection
3670
3671
45.5k
    c->bams = NULL;
3672
3673
45.5k
    if (!(c->slices = calloc(nslice != 0 ? nslice : 1, sizeof(cram_slice *))))
3674
0
        goto err;
3675
45.5k
    c->slice = NULL;
3676
3677
45.5k
    if (!(c->comp_hdr = cram_new_compression_header()))
3678
0
        goto err;
3679
45.5k
    c->comp_hdr_block = NULL;
3680
3681
1.32M
    for (id = DS_RN; id < DS_TN; id++)
3682
1.27M
        if (!(c->stats[id] = cram_stats_create())) goto err;
3683
3684
    //c->aux_B_stats = cram_stats_create();
3685
3686
45.5k
    if (!(c->tags_used = kh_init(m_tagmap)))
3687
0
        goto err;
3688
45.5k
    c->refs_used = 0;
3689
45.5k
    c->ref_free = 0;
3690
3691
45.5k
    return c;
3692
3693
0
 err:
3694
0
    if (c) {
3695
0
        if (c->slices)
3696
0
            free(c->slices);
3697
0
        free(c);
3698
0
    }
3699
0
    return NULL;
3700
45.5k
}
3701
3702
3.66k
static void free_bam_list(bam_seq_t **bams, int max_rec) {
3703
3.66k
    int i;
3704
36.6M
    for (i = 0; i < max_rec; i++)
3705
36.6M
        bam_free(bams[i]);
3706
3707
3.66k
    free(bams);
3708
3.66k
}
3709
3710
75.3k
void cram_free_container(cram_container *c) {
3711
75.3k
    enum cram_DS_ID id;
3712
75.3k
    int i;
3713
3714
75.3k
    if (!c)
3715
0
        return;
3716
3717
75.3k
    if (c->refs_used)
3718
773
        free(c->refs_used);
3719
3720
75.3k
    if (c->landmark)
3721
47.0k
        free(c->landmark);
3722
3723
75.3k
    if (c->comp_hdr)
3724
46.1k
        cram_free_compression_header(c->comp_hdr);
3725
3726
75.3k
    if (c->comp_hdr_block)
3727
43.2k
        cram_free_block(c->comp_hdr_block);
3728
3729
    // Free the slices; filled out by encoder only
3730
75.3k
    if (c->slices) {
3731
84.2k
        for (i = 0; i < c->max_slice; i++) {
3732
38.7k
            if (c->slices[i])
3733
3.66k
                cram_free_slice(c->slices[i]);
3734
38.7k
            if (c->slices[i] == c->slice)
3735
38.7k
                c->slice = NULL;
3736
38.7k
        }
3737
45.5k
        free(c->slices);
3738
45.5k
    }
3739
3740
    // Free the current slice; set by both encoder & decoder
3741
75.3k
    if (c->slice) {
3742
0
        cram_free_slice(c->slice);
3743
0
        c->slice = NULL;
3744
0
    }
3745
3746
2.18M
    for (id = DS_RN; id < DS_TN; id++)
3747
2.11M
        if (c->stats[id]) cram_stats_free(c->stats[id]);
3748
3749
    //if (c->aux_B_stats) cram_stats_free(c->aux_B_stats);
3750
3751
75.3k
    if (c->tags_used) {
3752
45.5k
        khint_t k;
3753
3754
466k
        for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
3755
420k
            if (!kh_exist(c->tags_used, k))
3756
210k
                continue;
3757
3758
210k
            cram_tag_map *tm = (cram_tag_map *)kh_val(c->tags_used, k);
3759
210k
            if (tm) {
3760
210k
                cram_codec *c = tm->codec;
3761
3762
210k
                if (c) c->free(c);
3763
3764
                // If tm->blk or tm->blk2 is set, then we haven't yet got to
3765
                // cram_encode_container which copies the blocks to s->aux_block
3766
                // and NULLifies tm->blk*.  In this case we failed to complete
3767
                // the container construction, so we have to free up our partially
3768
                // converted CRAM.
3769
210k
                cram_free_block(tm->blk);
3770
210k
                cram_free_block(tm->blk2);
3771
210k
                free(tm);
3772
210k
            }
3773
210k
        }
3774
3775
45.5k
        kh_destroy(m_tagmap, c->tags_used);
3776
45.5k
    }
3777
3778
75.3k
    if (c->ref_free)
3779
18.8k
        free(c->ref);
3780
3781
75.3k
    if (c->bams)
3782
358
        free_bam_list(c->bams, c->max_c_rec);
3783
3784
75.3k
    free(c);
3785
75.3k
}
3786
3787
/*
3788
 * Reads a container header.
3789
 *
3790
 * Returns cram_container on success
3791
 *         NULL on failure or no container left (fd->err == 0).
3792
 */
3793
30.0k
cram_container *cram_read_container(cram_fd *fd) {
3794
30.0k
    cram_container c2, *c;
3795
30.0k
    int i, s;
3796
30.0k
    size_t rd = 0;
3797
30.0k
    uint32_t crc = 0;
3798
3799
30.0k
    fd->err = 0;
3800
30.0k
    fd->eof = 0;
3801
3802
30.0k
    memset(&c2, 0, sizeof(c2));
3803
30.0k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
3804
23.8k
        if ((s = fd->vv.varint_decode32_crc(fd, &c2.length, &crc)) == -1) {
3805
42
            fd->eof = fd->empty_container ? 1 : 2;
3806
42
            return NULL;
3807
23.8k
        } else {
3808
23.8k
            rd+=s;
3809
23.8k
        }
3810
23.8k
    } else if (CRAM_MAJOR_VERS(fd->version) < 4) {
3811
2.55k
        uint32_t len;
3812
2.55k
        if ((s = int32_decode(fd, &c2.length)) == -1) {
3813
12
            if (CRAM_MAJOR_VERS(fd->version) == 2 &&
3814
6
                CRAM_MINOR_VERS(fd->version) == 0)
3815
3
                fd->eof = 1; // EOF blocks arrived in v2.1
3816
9
            else
3817
9
                fd->eof = fd->empty_container ? 1 : 2;
3818
12
            return NULL;
3819
2.54k
        } else {
3820
2.54k
            rd+=s;
3821
2.54k
        }
3822
2.54k
        len = le_int4(c2.length);
3823
2.54k
        crc = crc32(0L, (unsigned char *)&len, 4);
3824
3.64k
    } else {
3825
3.64k
        if ((s = fd->vv.varint_decode32_crc(fd, &c2.length, &crc))   == -1) {
3826
6
            fd->eof = fd->empty_container ? 1 : 2;
3827
6
            return NULL;
3828
3.64k
        } else {
3829
3.64k
            rd+=s;
3830
3.64k
        }
3831
3.64k
    }
3832
30.0k
    if ((s = fd->vv.varint_decode32s_crc(fd, &c2.ref_seq_id, &crc))   == -1) return NULL; else rd+=s;
3833
29.9k
    if (CRAM_MAJOR_VERS(fd->version) >= 4) {
3834
3.63k
        int64_t i64;
3835
3.63k
        if ((s = fd->vv.varint_decode64_crc(fd, &i64, &crc))== -1) return NULL; else rd+=s;
3836
3.62k
        c2.ref_seq_start = i64;
3837
3.62k
        if ((s = fd->vv.varint_decode64_crc(fd, &i64, &crc)) == -1) return NULL; else rd+=s;
3838
3.62k
        c2.ref_seq_span = i64;
3839
26.3k
    } else {
3840
26.3k
        int32_t i32;
3841
26.3k
        if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc))== -1) return NULL; else rd+=s;
3842
26.3k
        c2.ref_seq_start = i32;
3843
26.3k
        if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc)) == -1) return NULL; else rd+=s;
3844
26.2k
        c2.ref_seq_span = i32;
3845
26.2k
    }
3846
29.9k
    if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_records, &crc))  == -1) return NULL; else rd+=s;
3847
3848
29.8k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
3849
23.7k
        c2.record_counter = 0;
3850
23.7k
        c2.num_bases = 0;
3851
23.7k
    } else {
3852
6.16k
        if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3853
5.44k
            if ((s = fd->vv.varint_decode64_crc(fd, &c2.record_counter, &crc)) == -1)
3854
9
                return NULL;
3855
5.44k
            else
3856
5.44k
                rd += s;
3857
5.44k
        } else {
3858
715
            int32_t i32;
3859
715
            if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc)) == -1)
3860
0
                return NULL;
3861
715
            else
3862
715
                rd += s;
3863
715
            c2.record_counter = i32;
3864
715
        }
3865
3866
6.15k
        if ((s = fd->vv.varint_decode64_crc(fd, &c2.num_bases, &crc))== -1)
3867
12
            return NULL;
3868
6.14k
        else
3869
6.14k
            rd += s;
3870
6.15k
    }
3871
29.8k
    if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_blocks, &crc))   == -1)
3872
11
        return NULL;
3873
29.8k
    else
3874
29.8k
        rd+=s;
3875
29.8k
    if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_landmarks, &crc))== -1)
3876
14
        return NULL;
3877
29.8k
    else
3878
29.8k
        rd+=s;
3879
3880
29.8k
    if (c2.num_landmarks < 0 || c2.num_landmarks >= SIZE_MAX / sizeof(int32_t))
3881
33
        return NULL;
3882
3883
29.8k
    if (!(c = calloc(1, sizeof(*c))))
3884
0
        return NULL;
3885
3886
29.8k
    *c = c2;
3887
29.8k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
3888
29.8k
    if (c->num_landmarks > FUZZ_ALLOC_LIMIT/sizeof(int32_t)) {
3889
5
        fd->err = errno = ENOMEM;
3890
5
        cram_free_container(c);
3891
5
        return NULL;
3892
5
    }
3893
29.8k
#endif
3894
29.8k
    if (c->num_landmarks && !(c->landmark = malloc(c->num_landmarks * sizeof(int32_t)))) {
3895
0
        fd->err = errno;
3896
0
        cram_free_container(c);
3897
0
        return NULL;
3898
0
    }
3899
102k
    for (i = 0; i < c->num_landmarks; i++) {
3900
72.4k
        if ((s = fd->vv.varint_decode32_crc(fd, &c->landmark[i], &crc)) == -1) {
3901
202
            cram_free_container(c);
3902
202
            return NULL;
3903
72.2k
        } else {
3904
72.2k
            rd += s;
3905
72.2k
        }
3906
72.4k
    }
3907
3908
29.6k
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3909
5.35k
        if (-1 == int32_decode(fd, (int32_t *)&c->crc32)) {
3910
9
            cram_free_container(c);
3911
9
            return NULL;
3912
5.34k
        } else {
3913
5.34k
            rd+=4;
3914
5.34k
        }
3915
3916
5.34k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
3917
        // Pretend the CRC was OK so the fuzzer doesn't have to get it right
3918
5.34k
        crc = c->crc32;
3919
5.34k
#endif
3920
3921
5.34k
        if (crc != c->crc32) {
3922
0
            hts_log_error("Container header CRC32 failure");
3923
0
            cram_free_container(c);
3924
0
            return NULL;
3925
0
        }
3926
5.34k
    }
3927
3928
29.5k
    c->offset = rd;
3929
29.5k
    c->slices = NULL;
3930
29.5k
    c->slice = NULL;
3931
29.5k
    c->curr_slice = 0;
3932
29.5k
    c->max_slice = c->num_landmarks;
3933
29.5k
    c->slice_rec = 0;
3934
29.5k
    c->curr_rec = 0;
3935
29.5k
    c->max_rec = 0;
3936
3937
29.5k
    if (c->ref_seq_id == -2) {
3938
31
        c->multi_seq = 1;
3939
31
        fd->multi_seq = 1;
3940
31
    }
3941
3942
29.5k
    fd->empty_container =
3943
29.5k
        (c->num_records == 0 &&
3944
23.0k
         c->ref_seq_id == -1 &&
3945
29.5k
         c->ref_seq_start == 0x454f46 /* EOF */) ? 1 : 0;
3946
3947
29.5k
    return c;
3948
29.6k
}
3949
3950
3951
/* MAXIMUM storage size needed for the container. */
3952
0
int cram_container_size(cram_container *c) {
3953
0
    return 55 + 5*c->num_landmarks;
3954
0
}
3955
3956
3957
/*
3958
 * Stores the container structure in dat and returns *size as the
3959
 * number of bytes written to dat[].  The input size of dat is also
3960
 * held in *size and should be initialised to cram_container_size(c).
3961
 *
3962
 * Returns 0 on success;
3963
 *        -1 on failure
3964
 */
3965
int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size)
3966
0
{
3967
0
    char *cp = (char *)dat;
3968
0
    int i;
3969
3970
    // Check the input buffer is large enough according to our stated
3971
    // requirements. (NOTE: it may actually take less.)
3972
0
    if (cram_container_size(c) > *size)
3973
0
        return -1;
3974
3975
0
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
3976
0
        cp += itf8_put(cp, c->length);
3977
0
    } else {
3978
0
        *(int32_t *)cp = le_int4(c->length);
3979
0
        cp += 4;
3980
0
    }
3981
0
    if (c->multi_seq) {
3982
0
        cp += fd->vv.varint_put32(cp, NULL, -2);
3983
0
        cp += fd->vv.varint_put32(cp, NULL, 0);
3984
0
        cp += fd->vv.varint_put32(cp, NULL, 0);
3985
0
    } else {
3986
0
        cp += fd->vv.varint_put32s(cp, NULL, c->ref_seq_id);
3987
0
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
3988
0
            cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_start);
3989
0
            cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_span);
3990
0
        } else {
3991
0
            cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_start);
3992
0
            cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_span);
3993
0
        }
3994
0
    }
3995
0
    cp += fd->vv.varint_put32(cp, NULL, c->num_records);
3996
0
    if (CRAM_MAJOR_VERS(fd->version) == 2) {
3997
0
        cp += fd->vv.varint_put64(cp, NULL, c->record_counter);
3998
0
    } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3999
0
        cp += fd->vv.varint_put32(cp, NULL, c->record_counter);
4000
0
    }
4001
0
    cp += fd->vv.varint_put64(cp, NULL, c->num_bases);
4002
0
    cp += fd->vv.varint_put32(cp, NULL, c->num_blocks);
4003
0
    cp += fd->vv.varint_put32(cp, NULL, c->num_landmarks);
4004
0
    for (i = 0; i < c->num_landmarks; i++)
4005
0
        cp += fd->vv.varint_put32(cp, NULL, c->landmark[i]);
4006
4007
0
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
4008
0
        c->crc32 = crc32(0L, (uc *)dat, cp-dat);
4009
0
        cp[0] =  c->crc32        & 0xff;
4010
0
        cp[1] = (c->crc32 >>  8) & 0xff;
4011
0
        cp[2] = (c->crc32 >> 16) & 0xff;
4012
0
        cp[3] = (c->crc32 >> 24) & 0xff;
4013
0
        cp += 4;
4014
0
    }
4015
4016
0
    *size = cp-dat; // actual used size
4017
4018
0
    return 0;
4019
0
}
4020
4021
4022
/*
4023
 * Writes a container structure.
4024
 *
4025
 * Returns 0 on success
4026
 *        -1 on failure
4027
 */
4028
51.4k
int cram_write_container(cram_fd *fd, cram_container *c) {
4029
51.4k
    char buf_a[1024], *buf = buf_a, *cp;
4030
51.4k
    int i;
4031
4032
51.4k
    if (61 + c->num_landmarks * 10 >= 1024) {
4033
0
        buf = malloc(61 + c->num_landmarks * 10);
4034
0
        if (!buf)
4035
0
            return -1;
4036
0
    }
4037
51.4k
    cp = buf;
4038
4039
51.4k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
4040
0
        cp += itf8_put(cp, c->length);
4041
51.4k
    } else if (CRAM_MAJOR_VERS(fd->version) <= 3) {
4042
51.4k
        *(int32_t *)cp = le_int4(c->length);
4043
51.4k
        cp += 4;
4044
51.4k
    } else {
4045
0
        cp += fd->vv.varint_put32(cp, NULL, c->length);
4046
0
    }
4047
51.4k
    if (c->multi_seq) {
4048
767
        cp += fd->vv.varint_put32(cp, NULL, (uint32_t)-2);
4049
767
        cp += fd->vv.varint_put32(cp, NULL, 0);
4050
767
        cp += fd->vv.varint_put32(cp, NULL, 0);
4051
50.7k
    } else {
4052
50.7k
        cp += fd->vv.varint_put32s(cp, NULL, c->ref_seq_id);
4053
50.7k
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
4054
0
            cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_start);
4055
0
            cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_span);
4056
50.7k
        } else {
4057
50.7k
            cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_start);
4058
50.7k
            cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_span);
4059
50.7k
        }
4060
50.7k
    }
4061
51.4k
    cp += fd->vv.varint_put32(cp, NULL, c->num_records);
4062
51.4k
    if (CRAM_MAJOR_VERS(fd->version) >= 3)
4063
51.4k
        cp += fd->vv.varint_put64(cp, NULL, c->record_counter);
4064
0
    else
4065
0
        cp += fd->vv.varint_put32(cp, NULL, c->record_counter);
4066
51.4k
    cp += fd->vv.varint_put64(cp, NULL, c->num_bases);
4067
51.4k
    cp += fd->vv.varint_put32(cp, NULL, c->num_blocks);
4068
51.4k
    cp += fd->vv.varint_put32(cp, NULL, c->num_landmarks);
4069
103k
    for (i = 0; i < c->num_landmarks; i++)
4070
51.6k
        cp += fd->vv.varint_put32(cp, NULL, c->landmark[i]);
4071
4072
51.4k
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
4073
51.4k
        c->crc32 = crc32(0L, (uc *)buf, cp-buf);
4074
51.4k
        cp[0] =  c->crc32        & 0xff;
4075
51.4k
        cp[1] = (c->crc32 >>  8) & 0xff;
4076
51.4k
        cp[2] = (c->crc32 >> 16) & 0xff;
4077
51.4k
        cp[3] = (c->crc32 >> 24) & 0xff;
4078
51.4k
        cp += 4;
4079
51.4k
    }
4080
4081
51.4k
    if (cp-buf != hwrite(fd->fp, buf, cp-buf)) {
4082
0
        if (buf != buf_a)
4083
0
            free(buf);
4084
0
        return -1;
4085
0
    }
4086
4087
51.4k
    if (buf != buf_a)
4088
0
        free(buf);
4089
4090
51.4k
    return 0;
4091
51.4k
}
4092
4093
// common component shared by cram_flush_container{,_mt}
4094
37.9k
static int cram_flush_container2(cram_fd *fd, cram_container *c) {
4095
37.9k
    int i, j;
4096
4097
37.9k
    if (c->curr_slice > 0 && !c->slices)
4098
0
        return -1;
4099
4100
    //fprintf(stderr, "Writing container %d, sum %u\n", c->record_counter, sum);
4101
4102
37.9k
    off_t c_offset = htell(fd->fp); // File offset of container
4103
4104
    /* Write the container struct itself */
4105
37.9k
    if (0 != cram_write_container(fd, c))
4106
0
        return -1;
4107
4108
37.9k
    off_t hdr_size = htell(fd->fp) - c_offset;
4109
4110
    /* And the compression header */
4111
37.9k
    if (0 != cram_write_block(fd, c->comp_hdr_block))
4112
0
        return -1;
4113
4114
    /* Followed by the slice blocks */
4115
37.9k
    off_t file_offset = htell(fd->fp);
4116
75.9k
    for (i = 0; i < c->curr_slice; i++) {
4117
37.9k
        cram_slice *s = c->slices[i];
4118
37.9k
        off_t spos = file_offset - c_offset - hdr_size;
4119
4120
37.9k
        if (0 != cram_write_block(fd, s->hdr_block))
4121
0
            return -1;
4122
4123
388k
        for (j = 0; j < s->hdr->num_blocks; j++) {
4124
350k
            if (0 != cram_write_block(fd, s->block[j]))
4125
0
                return -1;
4126
350k
        }
4127
4128
37.9k
        file_offset = htell(fd->fp);
4129
37.9k
        off_t sz = file_offset - c_offset - hdr_size - spos;
4130
4131
37.9k
        if (fd->idxfp) {
4132
0
            if (cram_index_slice(fd, c, s, fd->idxfp, c_offset, spos, sz) < 0)
4133
0
                return -1;
4134
0
        }
4135
37.9k
    }
4136
4137
37.9k
    return 0;
4138
37.9k
}
4139
4140
/*
4141
 * Flushes a completely or partially full container to disk, writing
4142
 * container structure, header and blocks. This also calls the encoder
4143
 * functions.
4144
 *
4145
 * Returns 0 on success
4146
 *        -1 on failure
4147
 */
4148
38.7k
int cram_flush_container(cram_fd *fd, cram_container *c) {
4149
    /* Encode the container blocks and generate compression header */
4150
38.7k
    if (0 != cram_encode_container(fd, c))
4151
725
        return -1;
4152
4153
37.9k
    return cram_flush_container2(fd, c);
4154
38.7k
}
4155
4156
typedef struct {
4157
    cram_fd *fd;
4158
    cram_container *c;
4159
} cram_job;
4160
4161
0
void *cram_flush_thread(void *arg) {
4162
0
    cram_job *j = (cram_job *)arg;
4163
4164
    /* Encode the container blocks and generate compression header */
4165
0
    if (0 != cram_encode_container(j->fd, j->c)) {
4166
0
        hts_log_error("Call to cram_encode_container failed");
4167
0
        return NULL;
4168
0
    }
4169
4170
0
    return arg;
4171
0
}
4172
4173
0
static int cram_flush_result(cram_fd *fd) {
4174
0
    int i, ret = 0;
4175
0
    hts_tpool_result *r;
4176
0
    cram_container *lc = NULL;
4177
4178
    // NB: we can have one result per slice, not per container,
4179
    // so we need to free the container only after all slices
4180
    // within it have been freed.  (Automatic via reference counting.)
4181
0
    while ((r = hts_tpool_next_result(fd->rqueue))) {
4182
0
        cram_job *j = (cram_job *)hts_tpool_result_data(r);
4183
0
        cram_container *c;
4184
4185
0
        if (!j) {
4186
0
            hts_tpool_delete_result(r, 0);
4187
0
            return -1;
4188
0
        }
4189
4190
0
        fd = j->fd;
4191
0
        c = j->c;
4192
4193
0
        if (fd->mode == 'w')
4194
0
            if (0 != cram_flush_container2(fd, c))
4195
0
                return -1;
4196
4197
        // Free the slices; filled out by encoder only
4198
0
        if (c->slices) {
4199
0
            for (i = 0; i < c->max_slice; i++) {
4200
0
                if (c->slices[i])
4201
0
                    cram_free_slice(c->slices[i]);
4202
0
                if (c->slices[i] == c->slice)
4203
0
                    c->slice = NULL;
4204
0
                c->slices[i] = NULL;
4205
0
            }
4206
0
        }
4207
4208
        // Free the current slice; set by both encoder & decoder
4209
0
        if (c->slice) {
4210
0
            cram_free_slice(c->slice);
4211
0
            c->slice = NULL;
4212
0
        }
4213
0
        c->curr_slice = 0;
4214
4215
        // Our jobs will be in order, so we free the last
4216
        // container when our job has switched to a new one.
4217
0
        if (c != lc) {
4218
0
            if (lc) {
4219
0
                if (fd->ctr == lc)
4220
0
                    fd->ctr = NULL;
4221
0
                if (fd->ctr_mt == lc)
4222
0
                    fd->ctr_mt = NULL;
4223
0
                cram_free_container(lc);
4224
0
            }
4225
0
            lc = c;
4226
0
        }
4227
4228
0
        hts_tpool_delete_result(r, 1);
4229
0
    }
4230
0
    if (lc) {
4231
0
        if (fd->ctr == lc)
4232
0
            fd->ctr = NULL;
4233
0
        if (fd->ctr_mt == lc)
4234
0
            fd->ctr_mt = NULL;
4235
0
        cram_free_container(lc);
4236
0
    }
4237
4238
0
    return ret;
4239
0
}
4240
4241
// Note: called while metrics_lock is held.
4242
// Will be left in this state too, but may temporarily unlock.
4243
6.45k
void reset_metrics(cram_fd *fd) {
4244
6.45k
    int i;
4245
4246
6.45k
    if (fd->pool) {
4247
        // If multi-threaded we have multiple blocks being
4248
        // compressed already and several on the to-do list
4249
        // (fd->rqueue->pending).  It's tricky to reset the
4250
        // metrics exactly the correct point, so instead we
4251
        // just flush the pool, reset, and then continue again.
4252
4253
        // Don't bother starting a new trial before then though.
4254
0
        for (i = 0; i < DS_END; i++) {
4255
0
            cram_metrics *m = fd->m[i];
4256
0
            if (!m)
4257
0
                continue;
4258
0
            m->next_trial = 999;
4259
0
        }
4260
4261
0
        pthread_mutex_unlock(&fd->metrics_lock);
4262
0
        hts_tpool_process_flush(fd->rqueue);
4263
0
        pthread_mutex_lock(&fd->metrics_lock);
4264
0
    }
4265
4266
309k
    for (i = 0; i < DS_END; i++) {
4267
303k
        cram_metrics *m = fd->m[i];
4268
303k
        if (!m)
4269
0
            continue;
4270
4271
303k
        m->trial = NTRIALS;
4272
303k
        m->next_trial = TRIAL_SPAN;
4273
303k
        m->revised_method = 0;
4274
303k
        m->unpackable = 0;
4275
4276
303k
        memset(m->sz, 0, sizeof(m->sz));
4277
303k
    }
4278
6.45k
}
4279
4280
38.7k
int cram_flush_container_mt(cram_fd *fd, cram_container *c) {
4281
38.7k
    cram_job *j;
4282
4283
    // At the junction of mapped to unmapped data the compression
4284
    // methods may need to change due to very different statistical
4285
    // properties; particularly BA if minhash sorted.
4286
    //
4287
    // However with threading we'll have several in-flight blocks
4288
    // arriving out of order.
4289
    //
4290
    // So we do one trial reset of NThreads to last for NThreads
4291
    // duration to get us over this transition period, followed
4292
    // by another retrial of the usual ntrials & trial span.
4293
38.7k
    pthread_mutex_lock(&fd->metrics_lock);
4294
38.7k
    if (c->n_mapped < 0.3*c->curr_rec &&
4295
20.5k
        fd->last_mapped > 0.7*c->max_rec) {
4296
6.45k
        reset_metrics(fd);
4297
6.45k
    }
4298
38.7k
    fd->last_mapped = c->n_mapped * (c->max_rec+1)/(c->curr_rec+1) ;
4299
38.7k
    pthread_mutex_unlock(&fd->metrics_lock);
4300
4301
38.7k
    if (!fd->pool)
4302
38.7k
        return cram_flush_container(fd, c);
4303
4304
0
    if (!(j = malloc(sizeof(*j))))
4305
0
        return -1;
4306
0
    j->fd = fd;
4307
0
    j->c = c;
4308
4309
    // Flush the job.  Note our encoder queue may be full, so we
4310
    // either have to keep trying in non-blocking mode (what we do) or
4311
    // use a dedicated separate thread for draining the queue.
4312
0
    for (;;) {
4313
0
        errno = 0;
4314
0
        hts_tpool_dispatch2(fd->pool, fd->rqueue, cram_flush_thread, j, 1);
4315
0
        int pending = (errno == EAGAIN);
4316
0
        if (cram_flush_result(fd) != 0)
4317
0
            return -1;
4318
0
        if (!pending)
4319
0
            break;
4320
4321
0
        hts_usleep(1000);
4322
0
    }
4323
4324
0
    return 0;
4325
0
}
4326
4327
/* ----------------------------------------------------------------------
4328
 * Compression headers; the first part of the container
4329
 */
4330
4331
/*
4332
 * Creates a new blank container compression header
4333
 *
4334
 * Returns header ptr on success
4335
 *         NULL on failure
4336
 */
4337
45.5k
cram_block_compression_hdr *cram_new_compression_header(void) {
4338
45.5k
    cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr));
4339
45.5k
    if (!hdr)
4340
0
        return NULL;
4341
4342
45.5k
    if (!(hdr->TD_blk = cram_new_block(CORE, 0))) {
4343
0
        free(hdr);
4344
0
        return NULL;
4345
0
    }
4346
4347
45.5k
    if (!(hdr->TD_hash = kh_init(m_s2i))) {
4348
0
        cram_free_block(hdr->TD_blk);
4349
0
        free(hdr);
4350
0
        return NULL;
4351
0
    }
4352
4353
45.5k
    if (!(hdr->TD_keys = string_pool_create(8192))) {
4354
0
        kh_destroy(m_s2i, hdr->TD_hash);
4355
0
        cram_free_block(hdr->TD_blk);
4356
0
        free(hdr);
4357
0
        return NULL;
4358
0
    }
4359
4360
45.5k
    return hdr;
4361
45.5k
}
4362
4363
47.6k
void cram_free_compression_header(cram_block_compression_hdr *hdr) {
4364
47.6k
    int i;
4365
4366
47.6k
    if (hdr->landmark)
4367
1.87k
        free(hdr->landmark);
4368
4369
47.6k
    if (hdr->preservation_map)
4370
40.0k
        kh_destroy(map, hdr->preservation_map);
4371
4372
1.57M
    for (i = 0; i < CRAM_MAP_HASH; i++) {
4373
1.52M
        cram_map *m, *m2;
4374
1.56M
        for (m = hdr->rec_encoding_map[i]; m; m = m2) {
4375
37.3k
            m2 = m->next;
4376
37.3k
            if (m->codec)
4377
0
                m->codec->free(m->codec);
4378
37.3k
            free(m);
4379
37.3k
        }
4380
1.52M
    }
4381
4382
1.57M
    for (i = 0; i < CRAM_MAP_HASH; i++) {
4383
1.52M
        cram_map *m, *m2;
4384
1.52M
        for (m = hdr->tag_encoding_map[i]; m; m = m2) {
4385
879
            m2 = m->next;
4386
879
            if (m->codec)
4387
879
                m->codec->free(m->codec);
4388
879
            free(m);
4389
879
        }
4390
1.52M
    }
4391
4392
2.28M
    for (i = 0; i < DS_END; i++) {
4393
2.23M
        if (hdr->codecs[i])
4394
722k
            hdr->codecs[i]->free(hdr->codecs[i]);
4395
2.23M
    }
4396
4397
47.6k
    if (hdr->TL)
4398
92
        free(hdr->TL);
4399
47.6k
    if (hdr->TD_blk)
4400
45.6k
        cram_free_block(hdr->TD_blk);
4401
47.6k
    if (hdr->TD_hash)
4402
45.5k
        kh_destroy(m_s2i, hdr->TD_hash);
4403
47.6k
    if (hdr->TD_keys)
4404
45.5k
        string_pool_destroy(hdr->TD_keys);
4405
4406
47.6k
    free(hdr);
4407
47.6k
}
4408
4409
4410
/* ----------------------------------------------------------------------
4411
 * Slices and slice headers
4412
 */
4413
4414
39.7k
void cram_free_slice_header(cram_block_slice_hdr *hdr) {
4415
39.7k
    if (!hdr)
4416
0
        return;
4417
4418
39.7k
    if (hdr->block_content_ids)
4419
39.3k
        free(hdr->block_content_ids);
4420
4421
39.7k
    free(hdr);
4422
4423
39.7k
    return;
4424
39.7k
}
4425
4426
39.9k
void cram_free_slice(cram_slice *s) {
4427
39.9k
    if (!s)
4428
0
        return;
4429
4430
39.9k
    if (s->hdr_block)
4431
38.9k
        cram_free_block(s->hdr_block);
4432
4433
39.9k
    if (s->block) {
4434
39.3k
        int i;
4435
4436
39.3k
        if (s->hdr) {
4437
6.52M
            for (i = 0; i < s->hdr->num_blocks; i++) {
4438
6.48M
                if (i > 0 && s->block[i] == s->block[0])
4439
6.12M
                    continue;
4440
354k
                cram_free_block(s->block[i]);
4441
354k
            }
4442
39.3k
        }
4443
39.3k
        free(s->block);
4444
39.3k
    }
4445
4446
39.9k
    {
4447
        // Normally already copied into s->block[], but potentially still
4448
        // here if we error part way through cram_encode_slice.
4449
39.9k
        int i;
4450
249k
        for (i = 0; i < s->naux_block; i++)
4451
209k
            cram_free_block(s->aux_block[i]);
4452
39.9k
    }
4453
4454
39.9k
    if (s->block_by_id)
4455
997
        free(s->block_by_id);
4456
4457
39.9k
    if (s->hdr)
4458
39.7k
        cram_free_slice_header(s->hdr);
4459
4460
39.9k
    if (s->seqs_blk)
4461
39.7k
        cram_free_block(s->seqs_blk);
4462
4463
39.9k
    if (s->qual_blk)
4464
1.45k
        cram_free_block(s->qual_blk);
4465
4466
39.9k
    if (s->name_blk)
4467
1.45k
        cram_free_block(s->name_blk);
4468
4469
39.9k
    if (s->aux_blk)
4470
39.7k
        cram_free_block(s->aux_blk);
4471
4472
39.9k
    if (s->base_blk)
4473
1.45k
        cram_free_block(s->base_blk);
4474
4475
39.9k
    if (s->soft_blk)
4476
1.45k
        cram_free_block(s->soft_blk);
4477
4478
39.9k
    if (s->cigar)
4479
39.7k
        free(s->cigar);
4480
4481
39.9k
    if (s->crecs)
4482
39.6k
        free(s->crecs);
4483
4484
39.9k
    if (s->features)
4485
17.8k
        free(s->features);
4486
4487
39.9k
    if (s->TN)
4488
0
        free(s->TN);
4489
4490
39.9k
    if (s->pair_keys)
4491
38.7k
        string_pool_destroy(s->pair_keys);
4492
4493
39.9k
    if (s->pair[0])
4494
38.7k
        kh_destroy(m_s2i, s->pair[0]);
4495
39.9k
    if (s->pair[1])
4496
38.7k
        kh_destroy(m_s2i, s->pair[1]);
4497
4498
39.9k
    if (s->aux_block)
4499
31.6k
        free(s->aux_block);
4500
4501
39.9k
    free(s);
4502
39.9k
}
4503
4504
/*
4505
 * Creates a new empty slice in memory, for subsequent writing to
4506
 * disk.
4507
 *
4508
 * Returns cram_slice ptr on success
4509
 *         NULL on failure
4510
 */
4511
38.7k
cram_slice *cram_new_slice(enum cram_content_type type, int nrecs) {
4512
38.7k
    cram_slice *s = calloc(1, sizeof(*s));
4513
38.7k
    if (!s)
4514
0
        return NULL;
4515
4516
38.7k
    if (!(s->hdr = (cram_block_slice_hdr *)calloc(1, sizeof(*s->hdr))))
4517
0
        goto err;
4518
38.7k
    s->hdr->content_type = type;
4519
4520
38.7k
    s->hdr_block = NULL;
4521
38.7k
    s->block = NULL;
4522
38.7k
    s->block_by_id = NULL;
4523
38.7k
    s->last_apos = 0;
4524
38.7k
    if (!(s->crecs = malloc(nrecs * sizeof(cram_record))))  goto err;
4525
38.7k
    s->cigar_alloc = 1024;
4526
38.7k
    if (!(s->cigar = malloc(s->cigar_alloc * sizeof(*s->cigar)))) goto err;
4527
38.7k
    s->ncigar = 0;
4528
4529
38.7k
    if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0)))       goto err;
4530
38.7k
    if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS)))   goto err;
4531
38.7k
    if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN)))   goto err;
4532
38.7k
    if (!(s->aux_blk  = cram_new_block(EXTERNAL, DS_aux)))  goto err;
4533
38.7k
    if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN)))   goto err;
4534
38.7k
    if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC)))   goto err;
4535
4536
38.7k
    s->features = NULL;
4537
38.7k
    s->nfeatures = s->afeatures = 0;
4538
4539
38.7k
#ifndef TN_external
4540
38.7k
    s->TN = NULL;
4541
38.7k
    s->nTN = s->aTN = 0;
4542
38.7k
#endif
4543
4544
    // Volatile keys as we do realloc in dstring
4545
38.7k
    if (!(s->pair_keys = string_pool_create(8192))) goto err;
4546
38.7k
    if (!(s->pair[0] = kh_init(m_s2i)))             goto err;
4547
38.7k
    if (!(s->pair[1] = kh_init(m_s2i)))             goto err;
4548
4549
#ifdef BA_external
4550
    s->BA_len = 0;
4551
#endif
4552
4553
38.7k
    return s;
4554
4555
0
 err:
4556
0
    if (s)
4557
0
        cram_free_slice(s);
4558
4559
0
    return NULL;
4560
38.7k
}
4561
4562
/*
4563
 * Loads an entire slice.
4564
 * FIXME: In 1.0 the native unit of slices within CRAM is broken
4565
 * as slices contain references to objects in other slices.
4566
 * To work around this while keeping the slice oriented outer loop
4567
 * we read all slices and stitch them together into a fake large
4568
 * slice instead.
4569
 *
4570
 * Returns cram_slice ptr on success
4571
 *         NULL on failure
4572
 */
4573
1.20k
cram_slice *cram_read_slice(cram_fd *fd) {
4574
1.20k
    cram_block *b = cram_read_block(fd);
4575
1.20k
    cram_slice *s = calloc(1, sizeof(*s));
4576
1.20k
    int i, n, max_id, min_id;
4577
4578
1.20k
    if (!b || !s)
4579
30
        goto err;
4580
4581
1.17k
    s->hdr_block = b;
4582
1.17k
    switch (b->content_type) {
4583
1.15k
    case MAPPED_SLICE:
4584
1.15k
    case UNMAPPED_SLICE:
4585
1.15k
        if (!(s->hdr = cram_decode_slice_header(fd, b)))
4586
84
            goto err;
4587
1.07k
        break;
4588
4589
1.07k
    default:
4590
11
        hts_log_error("Unexpected block of type %s",
4591
11
                      cram_content_type2str(b->content_type));
4592
11
        goto err;
4593
1.17k
    }
4594
4595
1.07k
    if (s->hdr->num_blocks < 1) {
4596
9
        hts_log_error("Slice does not include any data blocks");
4597
9
        goto err;
4598
9
    }
4599
4600
1.06k
    s->block = calloc(n = s->hdr->num_blocks, sizeof(*s->block));
4601
1.06k
    if (!s->block)
4602
0
        goto err;
4603
4604
5.51k
    for (max_id = i = 0, min_id = INT_MAX; i < n; i++) {
4605
4.51k
        if (!(s->block[i] = cram_read_block(fd)))
4606
69
            goto err;
4607
4608
4.44k
        if (s->block[i]->content_type == EXTERNAL) {
4609
942
            if (max_id < s->block[i]->content_id)
4610
12
                max_id = s->block[i]->content_id;
4611
942
            if (min_id > s->block[i]->content_id)
4612
906
                min_id = s->block[i]->content_id;
4613
942
        }
4614
4.44k
    }
4615
4616
997
    if (!(s->block_by_id = calloc(512, sizeof(s->block[0]))))
4617
0
        goto err;
4618
4619
5.18k
    for (i = 0; i < n; i++) {
4620
4.18k
        if (s->block[i]->content_type != EXTERNAL)
4621
3.24k
            continue;
4622
942
        uint32_t v = s->block[i]->content_id;
4623
942
        if (v >= 256)
4624
15
            v = 256 + v % 251;
4625
942
        s->block_by_id[v] = s->block[i];
4626
942
    }
4627
4628
    /* Initialise encoding/decoding tables */
4629
997
    s->cigar_alloc = 1024;
4630
997
    if (!(s->cigar = malloc(s->cigar_alloc * sizeof(*s->cigar)))) goto err;
4631
997
    s->ncigar = 0;
4632
4633
997
    if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0)))      goto err;
4634
997
    if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS)))  goto err;
4635
997
    if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN)))  goto err;
4636
997
    if (!(s->aux_blk  = cram_new_block(EXTERNAL, DS_aux))) goto err;
4637
997
    if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN)))  goto err;
4638
997
    if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC)))  goto err;
4639
4640
997
    s->crecs = NULL;
4641
4642
997
    s->last_apos = s->hdr->ref_seq_start;
4643
997
    s->decode_md = fd->decode_md;
4644
4645
997
    return s;
4646
4647
203
 err:
4648
203
    if (b)
4649
173
        cram_free_block(b);
4650
203
    if (s) {
4651
203
        s->hdr_block = NULL;
4652
203
        cram_free_slice(s);
4653
203
    }
4654
203
    return NULL;
4655
997
}
4656
4657
4658
/* ----------------------------------------------------------------------
4659
 * CRAM file definition (header)
4660
 */
4661
4662
/*
4663
 * Reads a CRAM file definition structure.
4664
 * Returns file_def ptr on success
4665
 *         NULL on failure
4666
 */
4667
8.51k
cram_file_def *cram_read_file_def(cram_fd *fd) {
4668
8.51k
    cram_file_def *def = malloc(sizeof(*def));
4669
8.51k
    if (!def)
4670
0
        return NULL;
4671
4672
8.51k
    if (26 != hread(fd->fp, &def->magic[0], 26)) {
4673
0
        free(def);
4674
0
        return NULL;
4675
0
    }
4676
4677
8.51k
    if (memcmp(def->magic, "CRAM", 4) != 0) {
4678
0
        free(def);
4679
0
        return NULL;
4680
0
    }
4681
4682
8.51k
    if (def->major_version > 4) {
4683
0
        hts_log_error("CRAM version number mismatch. Expected 1.x, 2.x, 3.x or 4.x, got %d.%d",
4684
0
                      def->major_version, def->minor_version);
4685
0
        free(def);
4686
0
        return NULL;
4687
0
    }
4688
4689
8.51k
    fd->first_container += 26;
4690
8.51k
    fd->curr_position = fd->first_container;
4691
8.51k
    fd->last_slice = 0;
4692
4693
8.51k
    return def;
4694
8.51k
}
4695
4696
/*
4697
 * Writes a cram_file_def structure to cram_fd.
4698
 * Returns 0 on success
4699
 *        -1 on failure
4700
 */
4701
6.83k
int cram_write_file_def(cram_fd *fd, cram_file_def *def) {
4702
6.83k
    return (hwrite(fd->fp, &def->magic[0], 26) == 26) ? 0 : -1;
4703
6.83k
}
4704
4705
15.7k
void cram_free_file_def(cram_file_def *def) {
4706
15.7k
    if (def) free(def);
4707
15.7k
}
4708
4709
/* ----------------------------------------------------------------------
4710
 * SAM header I/O
4711
 */
4712
4713
4714
/*
4715
 * Reads the SAM header from the first CRAM data block.
4716
 * Also performs minimal parsing to extract read-group
4717
 * and sample information.
4718
4719
 * Returns SAM hdr ptr on success
4720
 *         NULL on failure
4721
 */
4722
8.51k
sam_hdr_t *cram_read_SAM_hdr(cram_fd *fd) {
4723
8.51k
    int32_t header_len;
4724
8.51k
    char *header;
4725
8.51k
    sam_hdr_t *hdr;
4726
4727
    /* 1.1 onwards stores the header in the first block of a container */
4728
8.51k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
4729
        /* Length */
4730
7.34k
        if (-1 == int32_decode(fd, &header_len))
4731
0
            return NULL;
4732
4733
7.34k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
4734
7.34k
        if (header_len > FUZZ_ALLOC_LIMIT)
4735
0
            return NULL;
4736
7.34k
#endif
4737
4738
        /* Alloc and read */
4739
7.34k
        if (header_len < 0 || NULL == (header = malloc((size_t) header_len+1)))
4740
0
            return NULL;
4741
4742
7.34k
        if (header_len != hread(fd->fp, header, header_len)) {
4743
3
            free(header);
4744
3
            return NULL;
4745
3
        }
4746
7.34k
        header[header_len] = '\0';
4747
4748
7.34k
        fd->first_container += 4 + header_len;
4749
7.34k
    } else {
4750
1.16k
        cram_container *c = cram_read_container(fd);
4751
1.16k
        cram_block *b;
4752
1.16k
        int i;
4753
1.16k
        int64_t len;
4754
4755
1.16k
        if (!c)
4756
2
            return NULL;
4757
4758
1.16k
        fd->first_container += c->length + c->offset;
4759
1.16k
        fd->curr_position = fd->first_container;
4760
4761
1.16k
        if (c->num_blocks < 1) {
4762
5
            cram_free_container(c);
4763
5
            return NULL;
4764
5
        }
4765
4766
1.15k
        if (!(b = cram_read_block(fd))) {
4767
27
            cram_free_container(c);
4768
27
            return NULL;
4769
27
        }
4770
1.13k
        if (cram_uncompress_block(b) != 0) {
4771
510
            cram_free_container(c);
4772
510
            cram_free_block(b);
4773
510
            return NULL;
4774
510
        }
4775
4776
622
        len = b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
4777
622
            fd->vv.varint_size(b->content_id) +
4778
622
            fd->vv.varint_size(b->uncomp_size) +
4779
622
            fd->vv.varint_size(b->comp_size);
4780
4781
        /* Extract header from 1st block */
4782
622
        if (-1 == int32_get_blk(b, &header_len) ||
4783
600
            header_len < 0 || /* Spec. says signed...  why? */
4784
595
            b->uncomp_size - 4 < header_len) {
4785
40
            cram_free_container(c);
4786
40
            cram_free_block(b);
4787
40
            return NULL;
4788
40
        }
4789
582
        if (NULL == (header = malloc((size_t) header_len+1))) {
4790
0
            cram_free_container(c);
4791
0
            cram_free_block(b);
4792
0
            return NULL;
4793
0
        }
4794
582
        memcpy(header, BLOCK_END(b), header_len);
4795
582
        header[header_len] = '\0';
4796
582
        cram_free_block(b);
4797
4798
        /* Consume any remaining blocks */
4799
2.63k
        for (i = 1; i < c->num_blocks; i++) {
4800
2.06k
            if (!(b = cram_read_block(fd))) {
4801
9
                cram_free_container(c);
4802
9
                free(header);
4803
9
                return NULL;
4804
9
            }
4805
2.05k
            len += b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
4806
2.05k
                fd->vv.varint_size(b->content_id) +
4807
2.05k
                fd->vv.varint_size(b->uncomp_size) +
4808
2.05k
                fd->vv.varint_size(b->comp_size);
4809
2.05k
            cram_free_block(b);
4810
2.05k
        }
4811
4812
573
        if (c->length > 0 && len > 0 && c->length > len) {
4813
            // Consume padding
4814
16
            char *pads = malloc(c->length - len);
4815
16
            if (!pads) {
4816
0
                cram_free_container(c);
4817
0
                free(header);
4818
0
                return NULL;
4819
0
            }
4820
4821
16
            if (c->length - len != hread(fd->fp, pads, c->length - len)) {
4822
4
                cram_free_container(c);
4823
4
                free(header);
4824
4
                free(pads);
4825
4
                return NULL;
4826
4
            }
4827
12
            free(pads);
4828
12
        }
4829
4830
569
        cram_free_container(c);
4831
569
    }
4832
4833
    /* Parse */
4834
7.91k
    hdr = sam_hdr_init();
4835
7.91k
    if (!hdr) {
4836
0
        free(header);
4837
0
        return NULL;
4838
0
    }
4839
4840
7.91k
    if (-1 == sam_hdr_add_lines(hdr, header, header_len)) {
4841
98
        free(header);
4842
98
        sam_hdr_destroy(hdr);
4843
98
        return NULL;
4844
98
    }
4845
4846
7.81k
    hdr->l_text = header_len;
4847
7.81k
    hdr->text = header;
4848
4849
7.81k
    return hdr;
4850
4851
7.91k
}
4852
4853
/*
4854
 * Converts 'in' to a full pathname to store in out.
4855
 * Out must be at least PATH_MAX bytes long.
4856
 */
4857
0
static void full_path(char *out, char *in) {
4858
0
    size_t in_l = strlen(in);
4859
0
    if (hisremote(in)) {
4860
0
        if (in_l > PATH_MAX) {
4861
0
            hts_log_error("Reference path is longer than %d", PATH_MAX);
4862
0
            return;
4863
0
        }
4864
0
        strncpy(out, in, PATH_MAX-1);
4865
0
        out[PATH_MAX-1] = 0;
4866
0
        return;
4867
0
    }
4868
0
    if (*in == '/' ||
4869
        // Windows paths
4870
0
        (in_l > 3 && toupper_c(*in) >= 'A'  && toupper_c(*in) <= 'Z' &&
4871
0
         in[1] == ':' && (in[2] == '/' || in[2] == '\\'))) {
4872
0
        strncpy(out, in, PATH_MAX-1);
4873
0
        out[PATH_MAX-1] = 0;
4874
0
    } else {
4875
0
        size_t len;
4876
4877
        // unable to get dir or out+in is too long
4878
0
        if (!getcwd(out, PATH_MAX) ||
4879
0
            (len = strlen(out))+1+strlen(in) >= PATH_MAX) {
4880
0
            strncpy(out, in, PATH_MAX-1);
4881
0
            out[PATH_MAX-1] = 0;
4882
0
            return;
4883
0
        }
4884
4885
0
        snprintf(out+len, PATH_MAX - len, "/%s", in);
4886
4887
        // FIXME: cope with `pwd`/../../../foo.fa ?
4888
0
    }
4889
0
}
4890
4891
/*
4892
 * Writes a CRAM SAM header.
4893
 * Returns 0 on success
4894
 *        -1 on failure
4895
 */
4896
6.83k
int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) {
4897
6.83k
    size_t header_len;
4898
6.83k
    int blank_block = (CRAM_MAJOR_VERS(fd->version) >= 3);
4899
4900
    /* Write CRAM MAGIC if not yet written. */
4901
6.83k
    if (fd->file_def->major_version == 0) {
4902
6.83k
        fd->file_def->major_version = CRAM_MAJOR_VERS(fd->version);
4903
6.83k
        fd->file_def->minor_version = CRAM_MINOR_VERS(fd->version);
4904
6.83k
        if (0 != cram_write_file_def(fd, fd->file_def))
4905
0
            return -1;
4906
6.83k
    }
4907
4908
    /* 1.0 requires an UNKNOWN read-group */
4909
6.83k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
4910
0
        if (!sam_hrecs_find_rg(hdr->hrecs, "UNKNOWN"))
4911
0
            if (sam_hdr_add_line(hdr, "RG",
4912
0
                            "ID", "UNKNOWN", "SM", "UNKNOWN", NULL))
4913
0
                return -1;
4914
0
    }
4915
4916
6.83k
    if (-1 == refs_from_header(fd))
4917
0
        return -1;
4918
6.83k
    if (-1 == refs2id(fd->refs, fd->header))
4919
0
        return -1;
4920
4921
    /* Fix M5 strings */
4922
6.83k
    if (fd->refs && !fd->no_ref && fd->embed_ref <= 1) {
4923
6.83k
        int i;
4924
6.89k
        for (i = 0; i < hdr->hrecs->nref; i++) {
4925
3.50k
            sam_hrec_type_t *ty;
4926
3.50k
            char *ref;
4927
4928
3.50k
            if (!(ty = sam_hrecs_find_type_id(hdr->hrecs, "SQ", "SN", hdr->hrecs->ref[i].name)))
4929
0
                return -1;
4930
4931
3.50k
            if (!sam_hrecs_find_key(ty, "M5", NULL)) {
4932
3.44k
                char unsigned buf[16];
4933
3.44k
                char buf2[33];
4934
3.44k
                hts_pos_t rlen;
4935
3.44k
                hts_md5_context *md5;
4936
4937
3.44k
                if (!fd->refs ||
4938
3.44k
                    !fd->refs->ref_id ||
4939
3.44k
                    !fd->refs->ref_id[i]) {
4940
0
                    return -1;
4941
0
                }
4942
3.44k
                rlen = fd->refs->ref_id[i]->length;
4943
3.44k
                ref = cram_get_ref(fd, i, 1, rlen);
4944
3.44k
                if (NULL == ref) {
4945
3.44k
                    if (fd->embed_ref == -1) {
4946
                        // auto embed-ref
4947
3.44k
                        hts_log_warning("No M5 tags present and could not "
4948
3.44k
                                        "find reference");
4949
3.44k
                        hts_log_warning("Enabling embed_ref=2 option");
4950
3.44k
                        hts_log_warning("NOTE: the CRAM file will be bigger "
4951
3.44k
                                        "than using an external reference");
4952
3.44k
                        pthread_mutex_lock(&fd->ref_lock);
4953
                        // Best guess.  It may be unmapped data with broken
4954
                        // headers, in which case this will get ignored.
4955
3.44k
                        fd->embed_ref = 2;
4956
3.44k
                        pthread_mutex_unlock(&fd->ref_lock);
4957
3.44k
                        break;
4958
3.44k
                    }
4959
0
                    return -1;
4960
3.44k
                }
4961
0
                rlen = fd->refs->ref_id[i]->length; /* In case it just loaded */
4962
0
                if (!(md5 = hts_md5_init()))
4963
0
                    return -1;
4964
0
                if (HTS_POS_MAX <= ULONG_MAX) {
4965
                    // Platforms with 64-bit unsigned long update in one go
4966
0
                    hts_md5_update(md5, ref, rlen);
4967
0
                } else {
4968
                    // Those with 32-bit ulong (Windows) may have to loop
4969
                    // over epic references
4970
0
                    hts_pos_t pos = 0;
4971
0
                    while (rlen - pos > ULONG_MAX) {
4972
0
                        hts_md5_update(md5, ref + pos, ULONG_MAX);
4973
0
                        pos += ULONG_MAX;
4974
0
                    }
4975
0
                    hts_md5_update(md5, ref + pos, (unsigned long)(rlen - pos));
4976
0
                }
4977
0
                hts_md5_final(buf, md5);
4978
0
                hts_md5_destroy(md5);
4979
0
                cram_ref_decr(fd->refs, i);
4980
4981
0
                hts_md5_hex(buf2, buf);
4982
0
                fd->refs->ref_id[i]->validated_md5 = 1;
4983
0
                if (sam_hdr_update_line(hdr, "SQ", "SN", hdr->hrecs->ref[i].name, "M5", buf2, NULL))
4984
0
                    return -1;
4985
0
            }
4986
4987
58
            if (fd->ref_fn) {
4988
0
                char ref_fn[PATH_MAX];
4989
0
                full_path(ref_fn, fd->ref_fn);
4990
0
                if (sam_hdr_update_line(hdr, "SQ", "SN", hdr->hrecs->ref[i].name, "UR", ref_fn, NULL))
4991
0
                    return -1;
4992
0
            }
4993
58
        }
4994
6.83k
    }
4995
4996
    /* Length */
4997
6.83k
    header_len = sam_hdr_length(hdr);
4998
6.83k
    if (header_len > INT32_MAX) {
4999
0
        hts_log_error("Header is too long for CRAM format");
5000
0
        return -1;
5001
0
    }
5002
6.83k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
5003
0
        if (-1 == int32_encode(fd, header_len))
5004
0
            return -1;
5005
5006
        /* Text data */
5007
0
        if (header_len != hwrite(fd->fp, sam_hdr_str(hdr), header_len))
5008
0
            return -1;
5009
6.83k
    } else {
5010
        /* Create block(s) inside a container */
5011
6.83k
        cram_block *b = cram_new_block(FILE_HEADER, 0);
5012
6.83k
        cram_container *c = cram_new_container(0, 0);
5013
6.83k
        int padded_length;
5014
6.83k
        char *pads;
5015
6.83k
        int is_cram_3 = (CRAM_MAJOR_VERS(fd->version) >= 3);
5016
5017
6.83k
        if (!b || !c) {
5018
0
            if (b) cram_free_block(b);
5019
0
            if (c) cram_free_container(c);
5020
0
            return -1;
5021
0
        }
5022
5023
6.83k
        if (int32_put_blk(b, header_len) < 0)
5024
0
            return -1;
5025
6.83k
        if (header_len)
5026
3.93k
            BLOCK_APPEND(b, sam_hdr_str(hdr), header_len);
5027
6.83k
        BLOCK_UPLEN(b);
5028
5029
        // Compress header block if V3.0 and above
5030
6.83k
        if (CRAM_MAJOR_VERS(fd->version) >= 3)
5031
6.83k
            if (cram_compress_block(fd, b, NULL, -1, -1) < 0)
5032
0
                return -1;
5033
5034
6.83k
        if (blank_block) {
5035
6.83k
            c->length = b->comp_size + 2 + 4*is_cram_3 +
5036
6.83k
                fd->vv.varint_size(b->content_id)   +
5037
6.83k
                fd->vv.varint_size(b->uncomp_size)  +
5038
6.83k
                fd->vv.varint_size(b->comp_size);
5039
5040
6.83k
            c->num_blocks = 2;
5041
6.83k
            c->num_landmarks = 2;
5042
6.83k
            if (!(c->landmark = malloc(2*sizeof(*c->landmark)))) {
5043
0
                cram_free_block(b);
5044
0
                cram_free_container(c);
5045
0
                return -1;
5046
0
            }
5047
6.83k
            c->landmark[0] = 0;
5048
6.83k
            c->landmark[1] = c->length;
5049
5050
            // Plus extra storage for uncompressed secondary blank block
5051
6.83k
            padded_length = MIN(c->length*.5, 10000);
5052
6.83k
            c->length += padded_length + 2 + 4*is_cram_3 +
5053
6.83k
                fd->vv.varint_size(b->content_id) +
5054
6.83k
                fd->vv.varint_size(padded_length)*2;
5055
6.83k
        } else {
5056
            // Pad the block instead.
5057
0
            c->num_blocks = 1;
5058
0
            c->num_landmarks = 1;
5059
0
            if (!(c->landmark = malloc(sizeof(*c->landmark))))
5060
0
                return -1;
5061
0
            c->landmark[0] = 0;
5062
5063
0
            padded_length = MAX(c->length*1.5, 10000) - c->length;
5064
5065
0
            c->length = b->comp_size + padded_length +
5066
0
                2 + 4*is_cram_3 +
5067
0
                fd->vv.varint_size(b->content_id)   +
5068
0
                fd->vv.varint_size(b->uncomp_size)  +
5069
0
                fd->vv.varint_size(b->comp_size);
5070
5071
0
            if (NULL == (pads = calloc(1, padded_length))) {
5072
0
                cram_free_block(b);
5073
0
                cram_free_container(c);
5074
0
                return -1;
5075
0
            }
5076
0
            BLOCK_APPEND(b, pads, padded_length);
5077
0
            BLOCK_UPLEN(b);
5078
0
            free(pads);
5079
0
        }
5080
5081
6.83k
        if (-1 == cram_write_container(fd, c)) {
5082
0
            cram_free_block(b);
5083
0
            cram_free_container(c);
5084
0
            return -1;
5085
0
        }
5086
5087
6.83k
        if (-1 == cram_write_block(fd, b)) {
5088
0
            cram_free_block(b);
5089
0
            cram_free_container(c);
5090
0
            return -1;
5091
0
        }
5092
5093
6.83k
        if (blank_block) {
5094
6.83k
            BLOCK_RESIZE(b, padded_length);
5095
6.83k
            memset(BLOCK_DATA(b), 0, padded_length);
5096
6.83k
            BLOCK_SIZE(b) = padded_length;
5097
6.83k
            BLOCK_UPLEN(b);
5098
6.83k
            b->method = RAW;
5099
6.83k
            if (-1 == cram_write_block(fd, b)) {
5100
0
                cram_free_block(b);
5101
0
                cram_free_container(c);
5102
0
                return -1;
5103
0
            }
5104
6.83k
        }
5105
5106
6.83k
        cram_free_block(b);
5107
6.83k
        cram_free_container(c);
5108
6.83k
    }
5109
5110
6.83k
    if (0 != hflush(fd->fp))
5111
0
        return -1;
5112
5113
6.83k
    RP("=== Finishing saving header ===\n");
5114
5115
6.83k
    return 0;
5116
5117
0
 block_err:
5118
0
    return -1;
5119
6.83k
}
5120
5121
/* ----------------------------------------------------------------------
5122
 * The top-level cram opening, closing and option handling
5123
 */
5124
5125
/*
5126
 * Sets CRAM variable sized integer decode function tables.
5127
 * CRAM 1, 2, and 3.x all used ITF8 for uint32 and UTF8 for uint64.
5128
 * CRAM 4.x uses the same encoding mechanism for 32-bit and 64-bit
5129
 * (or anything inbetween), but also now supports signed values.
5130
 *
5131
 * Version is the CRAM major version number.
5132
 * vv is the vector table (probably &cram_fd->vv)
5133
 */
5134
15.7k
static void cram_init_varint(varint_vec *vv, int version) {
5135
15.7k
    if (version >= 4) {
5136
496
        vv->varint_get32 = uint7_get_32; // FIXME: varint.h API should be size agnostic
5137
496
        vv->varint_get32s = sint7_get_32;
5138
496
        vv->varint_get64 = uint7_get_64;
5139
496
        vv->varint_get64s = sint7_get_64;
5140
496
        vv->varint_put32 = uint7_put_32;
5141
496
        vv->varint_put32s = sint7_put_32;
5142
496
        vv->varint_put64 = uint7_put_64;
5143
496
        vv->varint_put64s = sint7_put_64;
5144
496
        vv->varint_put32_blk = uint7_put_blk_32;
5145
496
        vv->varint_put32s_blk = sint7_put_blk_32;
5146
496
        vv->varint_put64_blk = uint7_put_blk_64;
5147
496
        vv->varint_put64s_blk = sint7_put_blk_64;
5148
496
        vv->varint_size = uint7_size;
5149
496
        vv->varint_decode32_crc = uint7_decode_crc32;
5150
496
        vv->varint_decode32s_crc = sint7_decode_crc32;
5151
496
        vv->varint_decode64_crc = uint7_decode_crc64;
5152
15.2k
    } else {
5153
15.2k
        vv->varint_get32 = safe_itf8_get;
5154
15.2k
        vv->varint_get32s = safe_itf8_get;
5155
15.2k
        vv->varint_get64 = safe_ltf8_get;
5156
15.2k
        vv->varint_get64s = safe_ltf8_get;
5157
15.2k
        vv->varint_put32 = safe_itf8_put;
5158
15.2k
        vv->varint_put32s = safe_itf8_put;
5159
15.2k
        vv->varint_put64 = safe_ltf8_put;
5160
15.2k
        vv->varint_put64s = safe_ltf8_put;
5161
15.2k
        vv->varint_put32_blk = itf8_put_blk;
5162
15.2k
        vv->varint_put32s_blk = itf8_put_blk;
5163
15.2k
        vv->varint_put64_blk = ltf8_put_blk;
5164
15.2k
        vv->varint_put64s_blk = ltf8_put_blk;
5165
15.2k
        vv->varint_size = itf8_size;
5166
15.2k
        vv->varint_decode32_crc = itf8_decode_crc;
5167
15.2k
        vv->varint_decode32s_crc = itf8_decode_crc;
5168
15.2k
        vv->varint_decode64_crc = ltf8_decode_crc;
5169
15.2k
    }
5170
15.7k
}
5171
5172
/*
5173
 * Initialises the lookup tables. These could be global statics, but they're
5174
 * clumsy to setup in a multi-threaded environment unless we generate
5175
 * verbatim code and include that.
5176
 */
5177
15.7k
static void cram_init_tables(cram_fd *fd) {
5178
15.7k
    int i;
5179
5180
15.7k
    memset(fd->L1, 4, 256);
5181
15.7k
    fd->L1['A'] = 0; fd->L1['a'] = 0;
5182
15.7k
    fd->L1['C'] = 1; fd->L1['c'] = 1;
5183
15.7k
    fd->L1['G'] = 2; fd->L1['g'] = 2;
5184
15.7k
    fd->L1['T'] = 3; fd->L1['t'] = 3;
5185
5186
15.7k
    memset(fd->L2, 5, 256);
5187
15.7k
    fd->L2['A'] = 0; fd->L2['a'] = 0;
5188
15.7k
    fd->L2['C'] = 1; fd->L2['c'] = 1;
5189
15.7k
    fd->L2['G'] = 2; fd->L2['g'] = 2;
5190
15.7k
    fd->L2['T'] = 3; fd->L2['t'] = 3;
5191
15.7k
    fd->L2['N'] = 4; fd->L2['n'] = 4;
5192
5193
15.7k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
5194
3.76M
        for (i = 0; i < 0x200; i++) {
5195
3.76M
            int f = 0;
5196
5197
3.76M
            if (i & CRAM_FPAIRED)      f |= BAM_FPAIRED;
5198
3.76M
            if (i & CRAM_FPROPER_PAIR) f |= BAM_FPROPER_PAIR;
5199
3.76M
            if (i & CRAM_FUNMAP)       f |= BAM_FUNMAP;
5200
3.76M
            if (i & CRAM_FREVERSE)     f |= BAM_FREVERSE;
5201
3.76M
            if (i & CRAM_FREAD1)       f |= BAM_FREAD1;
5202
3.76M
            if (i & CRAM_FREAD2)       f |= BAM_FREAD2;
5203
3.76M
            if (i & CRAM_FSECONDARY)   f |= BAM_FSECONDARY;
5204
3.76M
            if (i & CRAM_FQCFAIL)      f |= BAM_FQCFAIL;
5205
3.76M
            if (i & CRAM_FDUP)         f |= BAM_FDUP;
5206
5207
3.76M
            fd->bam_flag_swap[i]  = f;
5208
3.76M
        }
5209
5210
30.0M
        for (i = 0; i < 0x1000; i++) {
5211
30.0M
            int g = 0;
5212
5213
30.0M
            if (i & BAM_FPAIRED)           g |= CRAM_FPAIRED;
5214
30.0M
            if (i & BAM_FPROPER_PAIR)  g |= CRAM_FPROPER_PAIR;
5215
30.0M
            if (i & BAM_FUNMAP)        g |= CRAM_FUNMAP;
5216
30.0M
            if (i & BAM_FREVERSE)      g |= CRAM_FREVERSE;
5217
30.0M
            if (i & BAM_FREAD1)        g |= CRAM_FREAD1;
5218
30.0M
            if (i & BAM_FREAD2)        g |= CRAM_FREAD2;
5219
30.0M
            if (i & BAM_FSECONDARY)    g |= CRAM_FSECONDARY;
5220
30.0M
            if (i & BAM_FQCFAIL)       g |= CRAM_FQCFAIL;
5221
30.0M
            if (i & BAM_FDUP)          g |= CRAM_FDUP;
5222
5223
30.0M
            fd->cram_flag_swap[i] = g;
5224
30.0M
        }
5225
8.36k
    } else {
5226
        /* NOP */
5227
34.2M
        for (i = 0; i < 0x1000; i++)
5228
34.2M
            fd->bam_flag_swap[i] = i;
5229
34.2M
        for (i = 0; i < 0x1000; i++)
5230
34.2M
            fd->cram_flag_swap[i] = i;
5231
8.36k
    }
5232
5233
15.7k
    memset(fd->cram_sub_matrix, 4, 32*32);
5234
518k
    for (i = 0; i < 32; i++) {
5235
502k
        fd->cram_sub_matrix[i]['A'&0x1f]=0;
5236
502k
        fd->cram_sub_matrix[i]['C'&0x1f]=1;
5237
502k
        fd->cram_sub_matrix[i]['G'&0x1f]=2;
5238
502k
        fd->cram_sub_matrix[i]['T'&0x1f]=3;
5239
502k
        fd->cram_sub_matrix[i]['N'&0x1f]=4;
5240
502k
    }
5241
94.2k
    for (i = 0; i < 20; i+=4) {
5242
78.5k
        int j;
5243
1.64M
        for (j = 0; j < 20; j++) {
5244
1.57M
            fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
5245
1.57M
            fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
5246
1.57M
            fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
5247
1.57M
            fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
5248
1.57M
        }
5249
78.5k
        fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+0]&0x1f]=0;
5250
78.5k
        fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+1]&0x1f]=1;
5251
78.5k
        fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+2]&0x1f]=2;
5252
78.5k
        fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+3]&0x1f]=3;
5253
78.5k
    }
5254
5255
15.7k
    cram_init_varint(&fd->vv, CRAM_MAJOR_VERS(fd->version));
5256
15.7k
}
5257
5258
// Default version numbers for CRAM
5259
static int major_version = 3;
5260
static int minor_version = 1;
5261
5262
/*
5263
 * Opens a CRAM file for read (mode "rb") or write ("wb").
5264
 * The filename may be "-" to indicate stdin or stdout.
5265
 *
5266
 * Returns file handle on success
5267
 *         NULL on failure.
5268
 */
5269
0
cram_fd *cram_open(const char *filename, const char *mode) {
5270
0
    hFILE *fp;
5271
0
    cram_fd *fd;
5272
0
    char fmode[3]= { mode[0], '\0', '\0' };
5273
5274
0
    if (strlen(mode) > 1 && (mode[1] == 'b' || mode[1] == 'c')) {
5275
0
        fmode[1] = 'b';
5276
0
    }
5277
5278
0
    fp = hopen(filename, fmode);
5279
0
    if (!fp)
5280
0
        return NULL;
5281
5282
0
    fd = cram_dopen(fp, filename, mode);
5283
0
    if (!fd)
5284
0
        hclose_abruptly(fp);
5285
5286
0
    return fd;
5287
0
}
5288
5289
/* Opens an existing stream for reading or writing.
5290
 *
5291
 * Returns file handle on success;
5292
 *         NULL on failure.
5293
 */
5294
15.7k
cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) {
5295
15.7k
    int i;
5296
15.7k
    char *cp;
5297
15.7k
    cram_fd *fd = calloc(1, sizeof(*fd));
5298
15.7k
    if (!fd)
5299
0
        return NULL;
5300
5301
15.7k
    fd->level = CRAM_DEFAULT_LEVEL;
5302
47.1k
    for (i = 0; mode[i]; i++) {
5303
31.4k
        if (mode[i] >= '0' && mode[i] <= '9') {
5304
0
            fd->level = mode[i] - '0';
5305
0
            break;
5306
0
        }
5307
31.4k
    }
5308
5309
15.7k
    fd->fp = fp;
5310
15.7k
    fd->mode = *mode;
5311
15.7k
    fd->first_container = 0;
5312
15.7k
    fd->curr_position = 0;
5313
5314
15.7k
    if (fd->mode == 'r') {
5315
        /* Reader */
5316
5317
8.51k
        if (!(fd->file_def = cram_read_file_def(fd)))
5318
0
            goto err;
5319
5320
8.51k
        fd->version = fd->file_def->major_version * 256 +
5321
8.51k
            fd->file_def->minor_version;
5322
5323
8.51k
        cram_init_tables(fd);
5324
5325
8.51k
        if (!(fd->header = cram_read_SAM_hdr(fd))) {
5326
698
            cram_free_file_def(fd->file_def);
5327
698
            goto err;
5328
698
        }
5329
5330
8.51k
    } else {
5331
        /* Writer */
5332
7.20k
        cram_file_def *def = calloc(1, sizeof(*def));
5333
7.20k
        if (!def)
5334
0
            return NULL;
5335
5336
7.20k
        fd->file_def = def;
5337
5338
7.20k
        def->magic[0] = 'C';
5339
7.20k
        def->magic[1] = 'R';
5340
7.20k
        def->magic[2] = 'A';
5341
7.20k
        def->magic[3] = 'M';
5342
7.20k
        def->major_version = 0; // Indicator to write file def later.
5343
7.20k
        def->minor_version = 0;
5344
7.20k
        memset(def->file_id, 0, 20);
5345
7.20k
        strncpy(def->file_id, filename, 20);
5346
5347
7.20k
        fd->version = major_version * 256 + minor_version;
5348
7.20k
        cram_init_tables(fd);
5349
5350
        /* SAM header written later along with this file_def */
5351
7.20k
    }
5352
5353
15.0k
    fd->prefix = strdup((cp = strrchr(filename, '/')) ? cp+1 : filename);
5354
15.0k
    if (!fd->prefix)
5355
0
        goto err;
5356
15.0k
    fd->first_base = fd->last_base = -1;
5357
15.0k
    fd->record_counter = 0;
5358
5359
15.0k
    fd->ctr = NULL;
5360
15.0k
    fd->ctr_mt = NULL;
5361
15.0k
    fd->refs  = refs_create();
5362
15.0k
    if (!fd->refs)
5363
0
        goto err;
5364
15.0k
    fd->ref_id = -2;
5365
15.0k
    fd->ref = NULL;
5366
5367
15.0k
    fd->decode_md = 0;
5368
15.0k
    fd->seqs_per_slice = SEQS_PER_SLICE;
5369
15.0k
    fd->bases_per_slice = BASES_PER_SLICE;
5370
15.0k
    fd->slices_per_container = SLICE_PER_CNT;
5371
15.0k
    fd->embed_ref = -1; // automatic selection
5372
15.0k
    fd->no_ref = 0;
5373
15.0k
    fd->no_ref_counter = 0;
5374
15.0k
    fd->ap_delta = 0;
5375
15.0k
    fd->ignore_md5 = 0;
5376
15.0k
    fd->lossy_read_names = 0;
5377
15.0k
    fd->use_bz2 = 0;
5378
15.0k
    fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3);
5379
15.0k
    fd->use_tok = (CRAM_MAJOR_VERS(fd->version) >= 3) && (CRAM_MINOR_VERS(fd->version) >= 1);
5380
15.0k
    fd->use_lzma = 0;
5381
15.0k
    fd->multi_seq = -1;
5382
15.0k
    fd->multi_seq_user = -1;
5383
15.0k
    fd->unsorted   = 0;
5384
15.0k
    fd->shared_ref = 0;
5385
15.0k
    fd->store_md = 0;
5386
15.0k
    fd->store_nm = 0;
5387
15.0k
    fd->last_RI_count = 0;
5388
5389
15.0k
    fd->index       = NULL;
5390
15.0k
    fd->own_pool    = 0;
5391
15.0k
    fd->pool        = NULL;
5392
15.0k
    fd->rqueue      = NULL;
5393
15.0k
    fd->job_pending = NULL;
5394
15.0k
    fd->ooc         = 0;
5395
15.0k
    fd->required_fields = INT_MAX;
5396
5397
15.0k
    pthread_mutex_init(&fd->metrics_lock, NULL);
5398
15.0k
    pthread_mutex_init(&fd->ref_lock, NULL);
5399
15.0k
    pthread_mutex_init(&fd->range_lock, NULL);
5400
15.0k
    pthread_mutex_init(&fd->bam_list_lock, NULL);
5401
5402
720k
    for (i = 0; i < DS_END; i++) {
5403
705k
        fd->m[i] = cram_new_metrics();
5404
705k
        if (!fd->m[i])
5405
0
            goto err;
5406
705k
    }
5407
5408
15.0k
    if (!(fd->tags_used = kh_init(m_metrics)))
5409
0
        goto err;
5410
5411
15.0k
    fd->range.refid = -2; // no ref.
5412
15.0k
    fd->eof = 1;          // See samtools issue #150
5413
15.0k
    fd->ref_fn = NULL;
5414
5415
15.0k
    fd->bl = NULL;
5416
5417
    /* Initialise dummy refs from the @SQ headers */
5418
15.0k
    if (-1 == refs_from_header(fd))
5419
0
        goto err;
5420
5421
15.0k
    return fd;
5422
5423
698
 err:
5424
698
    if (fd)
5425
698
        free(fd);
5426
5427
698
    return NULL;
5428
15.0k
}
5429
5430
/*
5431
 * Seek within a CRAM file.
5432
 *
5433
 * Returns 0 on success
5434
 *        -1 on failure
5435
 */
5436
0
int cram_seek(cram_fd *fd, off_t offset, int whence) {
5437
0
    fd->ooc = 0;
5438
5439
0
    cram_drain_rqueue(fd);
5440
5441
0
    return hseek(fd->fp, offset, whence) >= 0 ? 0 : -1;
5442
0
}
5443
5444
/*
5445
 * Flushes a CRAM file.
5446
 * Useful for when writing to stdout without wishing to close the stream.
5447
 *
5448
 * Returns 0 on success
5449
 *        -1 on failure
5450
 */
5451
0
int cram_flush(cram_fd *fd) {
5452
0
    if (!fd)
5453
0
        return -1;
5454
5455
0
    int ret = 0;
5456
5457
0
    if (fd->mode == 'w' && fd->ctr) {
5458
0
        if(fd->ctr->slice)
5459
0
            cram_update_curr_slice(fd->ctr, fd->version);
5460
5461
0
        if (-1 == cram_flush_container_mt(fd, fd->ctr))
5462
0
            ret = -1;
5463
5464
0
        cram_free_container(fd->ctr);
5465
0
        if (fd->ctr_mt == fd->ctr)
5466
0
            fd->ctr_mt = NULL;
5467
0
        fd->ctr = NULL;
5468
0
    }
5469
5470
0
    return ret;
5471
0
}
5472
5473
/*
5474
 * Writes an EOF block to a CRAM file.
5475
 *
5476
 * Returns 0 on success
5477
 *        -1 on failure
5478
 */
5479
6.64k
int cram_write_eof_block(cram_fd *fd) {
5480
    // EOF block is a container with special values to aid detection
5481
6.64k
    if (CRAM_MAJOR_VERS(fd->version) >= 2) {
5482
        // Empty container with
5483
        //   ref_seq_id -1
5484
        //   start pos 0x454f46 ("EOF")
5485
        //   span 0
5486
        //   nrec 0
5487
        //   counter 0
5488
        //   nbases 0
5489
        //   1 block (landmark 0)
5490
        //   (CRC32)
5491
6.64k
        cram_container c;
5492
6.64k
        memset(&c, 0, sizeof(c));
5493
6.64k
        c.ref_seq_id = -1;
5494
6.64k
        c.ref_seq_start = 0x454f46; // "EOF"
5495
6.64k
        c.ref_seq_span = 0;
5496
6.64k
        c.record_counter = 0;
5497
6.64k
        c.num_bases = 0;
5498
6.64k
        c.num_blocks = 1;
5499
6.64k
        int32_t land[1] = {0};
5500
6.64k
        c.landmark = land;
5501
5502
        // An empty compression header block with
5503
        //   method raw (0)
5504
        //   type comp header (1)
5505
        //   content id 0
5506
        //   block contents size 6
5507
        //   raw size 6
5508
        //     empty preservation map (01 00)
5509
        //     empty data series map (01 00)
5510
        //     empty tag map (01 00)
5511
        //   block CRC
5512
6.64k
        cram_block_compression_hdr ch;
5513
6.64k
        memset(&ch, 0, sizeof(ch));
5514
6.64k
        c.comp_hdr_block = cram_encode_compression_header(fd, &c, &ch, 0);
5515
5516
6.64k
        c.length = c.comp_hdr_block->byte            // Landmark[0]
5517
6.64k
            + 5                                      // block struct
5518
6.64k
            + 4*(CRAM_MAJOR_VERS(fd->version) >= 3); // CRC
5519
6.64k
        if (cram_write_container(fd, &c) < 0 ||
5520
6.64k
            cram_write_block(fd, c.comp_hdr_block) < 0) {
5521
0
            cram_close(fd);
5522
0
            cram_free_block(c.comp_hdr_block);
5523
0
            return -1;
5524
0
        }
5525
6.64k
        if (ch.preservation_map)
5526
0
            kh_destroy(map, ch.preservation_map);
5527
6.64k
        cram_free_block(c.comp_hdr_block);
5528
5529
        // V2.1 bytes
5530
        // 0b 00 00 00 ff ff ff ff 0f // Cont HDR: size, ref seq id
5531
        // e0 45 4f 46 00 00 00       // Cont HDR: pos, span, nrec, counter
5532
        // 00 01 00                   // Cont HDR: nbase, nblk, landmark
5533
        // 00 01 00 06 06             // Comp.HDR blk
5534
        // 01 00 01 00 01 00          // Comp.HDR blk
5535
5536
        // V3.0 bytes:
5537
        // 0f 00 00 00 ff ff ff ff 0f // Cont HDR: size, ref seq id
5538
        // e0 45 4f 46 00 00 00       // Cont HDR: pos, span, nrec, counter
5539
        // 00 01 00                   // Cont HDR: nbase, nblk, landmark
5540
        // 05 bd d9 4f                // CRC32
5541
        // 00 01 00 06 06             // Comp.HDR blk
5542
        // 01 00 01 00 01 00          // Comp.HDR blk
5543
        // ee 63 01 4b                // CRC32
5544
5545
        // V4.0 bytes:
5546
        // 0f 00 00 00 8f ff ff ff    // Cont HDR: size, ref seq id
5547
        // 82 95 9e 46 00 00 00       // Cont HDR: pos, span, nrec, counter
5548
        // 00 01 00                   // Cont HDR: nbase, nblk, landmark
5549
        // ac d6 05 bc                // CRC32
5550
        // 00 01 00 06 06             // Comp.HDR blk
5551
        // 01 00 01 00 01 00          // Comp.HDR blk
5552
        // ee 63 01 4b                // CRC32
5553
6.64k
    }
5554
5555
6.64k
    return 0;
5556
6.64k
}
5557
5558
/*
5559
 * Closes a CRAM file.
5560
 * Returns 0 on success
5561
 *        -1 on failure
5562
 */
5563
15.0k
int cram_close(cram_fd *fd) {
5564
15.0k
    spare_bams *bl, *next;
5565
15.0k
    int i, ret = 0;
5566
5567
15.0k
    if (!fd)
5568
0
        return -1;
5569
5570
15.0k
    if (fd->mode == 'w' && fd->ctr) {
5571
3.49k
        if(fd->ctr->slice)
5572
3.49k
            cram_update_curr_slice(fd->ctr, fd->version);
5573
5574
3.49k
        if (-1 == cram_flush_container_mt(fd, fd->ctr))
5575
559
            ret = -1;
5576
3.49k
    }
5577
5578
15.0k
    if (fd->mode != 'w')
5579
7.81k
        cram_drain_rqueue(fd);
5580
5581
15.0k
    if (fd->pool && fd->eof >= 0 && fd->rqueue) {
5582
0
        hts_tpool_process_flush(fd->rqueue);
5583
5584
0
        if (0 != cram_flush_result(fd))
5585
0
            ret = -1;
5586
5587
0
        if (fd->mode == 'w')
5588
0
            fd->ctr = NULL; // prevent double freeing
5589
5590
        //fprintf(stderr, "CRAM: destroy queue %p\n", fd->rqueue);
5591
5592
0
        hts_tpool_process_destroy(fd->rqueue);
5593
0
    }
5594
5595
15.0k
    pthread_mutex_destroy(&fd->metrics_lock);
5596
15.0k
    pthread_mutex_destroy(&fd->ref_lock);
5597
15.0k
    pthread_mutex_destroy(&fd->range_lock);
5598
15.0k
    pthread_mutex_destroy(&fd->bam_list_lock);
5599
5600
15.0k
    if (ret == 0 && fd->mode == 'w') {
5601
        /* Write EOF block */
5602
6.64k
        if (0 != cram_write_eof_block(fd))
5603
0
            ret = -1;
5604
6.64k
    }
5605
5606
18.3k
    for (bl = fd->bl; bl; bl = next) {
5607
3.30k
        int max_rec = fd->seqs_per_slice * fd->slices_per_container;
5608
5609
3.30k
        next = bl->next;
5610
3.30k
        free_bam_list(bl->bams, max_rec);
5611
3.30k
        free(bl);
5612
3.30k
    }
5613
5614
15.0k
    if (hclose(fd->fp) != 0)
5615
0
        ret = -1;
5616
5617
15.0k
    if (fd->file_def)
5618
15.0k
        cram_free_file_def(fd->file_def);
5619
5620
15.0k
    if (fd->header)
5621
14.8k
        sam_hdr_destroy(fd->header);
5622
5623
15.0k
    free(fd->prefix);
5624
5625
15.0k
    if (fd->ctr)
5626
8.69k
        cram_free_container(fd->ctr);
5627
5628
15.0k
    if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
5629
205
        cram_free_container(fd->ctr_mt);
5630
5631
15.0k
    if (fd->refs)
5632
15.0k
        refs_free(fd->refs);
5633
15.0k
    if (fd->ref_free)
5634
0
        free(fd->ref_free);
5635
5636
720k
    for (i = 0; i < DS_END; i++)
5637
705k
        if (fd->m[i])
5638
705k
            free(fd->m[i]);
5639
5640
15.0k
    if (fd->tags_used) {
5641
15.0k
        khint_t k;
5642
5643
37.2k
        for (k = kh_begin(fd->tags_used); k != kh_end(fd->tags_used); k++) {
5644
22.2k
            if (kh_exist(fd->tags_used, k))
5645
11.4k
                free(kh_val(fd->tags_used, k));
5646
22.2k
        }
5647
5648
15.0k
        kh_destroy(m_metrics, fd->tags_used);
5649
15.0k
    }
5650
5651
15.0k
    if (fd->index)
5652
0
        cram_index_free(fd);
5653
5654
15.0k
    if (fd->own_pool && fd->pool)
5655
0
        hts_tpool_destroy(fd->pool);
5656
5657
15.0k
    if (fd->idxfp)
5658
0
        if (bgzf_close(fd->idxfp) < 0)
5659
0
            ret = -1;
5660
5661
15.0k
    free(fd);
5662
5663
15.0k
    return ret;
5664
15.0k
}
5665
5666
/*
5667
 * Returns 1 if we hit an EOF while reading.
5668
 */
5669
13.5k
int cram_eof(cram_fd *fd) {
5670
13.5k
    return fd->eof;
5671
13.5k
}
5672
5673
5674
/*
5675
 * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
5676
 * Use this immediately after opening.
5677
 *
5678
 * Returns 0 on success
5679
 *        -1 on failure
5680
 */
5681
7.81k
int cram_set_option(cram_fd *fd, enum hts_fmt_option opt, ...) {
5682
7.81k
    int r;
5683
7.81k
    va_list args;
5684
5685
7.81k
    va_start(args, opt);
5686
7.81k
    r = cram_set_voption(fd, opt, args);
5687
7.81k
    va_end(args);
5688
5689
7.81k
    return r;
5690
7.81k
}
5691
5692
/*
5693
 * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
5694
 * Use this immediately after opening.
5695
 *
5696
 * Returns 0 on success
5697
 *        -1 on failure
5698
 */
5699
7.81k
int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) {
5700
7.81k
    refs_t *refs;
5701
5702
7.81k
    if (!fd) {
5703
0
        errno = EBADF;
5704
0
        return -1;
5705
0
    }
5706
5707
7.81k
    switch (opt) {
5708
7.81k
    case CRAM_OPT_DECODE_MD:
5709
7.81k
        fd->decode_md = va_arg(args, int);
5710
7.81k
        break;
5711
5712
0
    case CRAM_OPT_PREFIX:
5713
0
        if (fd->prefix)
5714
0
            free(fd->prefix);
5715
0
        if (!(fd->prefix = strdup(va_arg(args, char *))))
5716
0
            return -1;
5717
0
        break;
5718
5719
0
    case CRAM_OPT_VERBOSITY:
5720
0
        break;
5721
5722
0
    case CRAM_OPT_SEQS_PER_SLICE:
5723
0
        fd->seqs_per_slice = va_arg(args, int);
5724
0
        if (fd->bases_per_slice == BASES_PER_SLICE)
5725
0
            fd->bases_per_slice = fd->seqs_per_slice * 500;
5726
0
        break;
5727
5728
0
    case CRAM_OPT_BASES_PER_SLICE:
5729
0
        fd->bases_per_slice = va_arg(args, int);
5730
0
        break;
5731
5732
0
    case CRAM_OPT_SLICES_PER_CONTAINER:
5733
0
        fd->slices_per_container = va_arg(args, int);
5734
0
        break;
5735
5736
0
    case CRAM_OPT_EMBED_REF:
5737
0
        fd->embed_ref = va_arg(args, int);
5738
0
        break;
5739
5740
0
    case CRAM_OPT_NO_REF:
5741
0
        fd->no_ref = va_arg(args, int);
5742
0
        break;
5743
5744
0
    case CRAM_OPT_POS_DELTA:
5745
0
        fd->ap_delta = va_arg(args, int);
5746
0
        break;
5747
5748
0
    case CRAM_OPT_IGNORE_MD5:
5749
0
        fd->ignore_md5 = va_arg(args, int);
5750
0
        break;
5751
5752
0
    case CRAM_OPT_LOSSY_NAMES:
5753
0
        fd->lossy_read_names = va_arg(args, int);
5754
        // Currently lossy read names required paired (attached) reads.
5755
        // TLEN 0 or being 1 out causes read pairs to be detached, breaking
5756
        // the lossy read name compression, so we have extra options to
5757
        // slacken the exact TLEN round-trip checks.
5758
0
        fd->tlen_approx = fd->lossy_read_names;
5759
0
        fd->tlen_zero = fd->lossy_read_names;
5760
0
        break;
5761
5762
0
    case CRAM_OPT_USE_BZIP2:
5763
0
        fd->use_bz2 = va_arg(args, int);
5764
0
        break;
5765
5766
0
    case CRAM_OPT_USE_RANS:
5767
0
        fd->use_rans = va_arg(args, int);
5768
0
        break;
5769
5770
0
    case CRAM_OPT_USE_TOK:
5771
0
        fd->use_tok = va_arg(args, int);
5772
0
        break;
5773
5774
0
    case CRAM_OPT_USE_FQZ:
5775
0
        fd->use_fqz = va_arg(args, int);
5776
0
        break;
5777
5778
0
    case CRAM_OPT_USE_ARITH:
5779
0
        fd->use_arith = va_arg(args, int);
5780
0
        break;
5781
5782
0
    case CRAM_OPT_USE_LZMA:
5783
0
        fd->use_lzma = va_arg(args, int);
5784
0
        break;
5785
5786
0
    case CRAM_OPT_SHARED_REF:
5787
0
        fd->shared_ref = 1;
5788
0
        refs = va_arg(args, refs_t *);
5789
0
        if (refs != fd->refs) {
5790
0
            if (fd->refs)
5791
0
                refs_free(fd->refs);
5792
0
            fd->refs = refs;
5793
0
            fd->refs->count++;
5794
0
        }
5795
0
        break;
5796
5797
0
    case CRAM_OPT_RANGE: {
5798
0
        int r = cram_seek_to_refpos(fd, va_arg(args, cram_range *));
5799
0
        pthread_mutex_lock(&fd->range_lock);
5800
//        printf("opt range noseek to %p %d:%ld-%ld\n",
5801
//               fd, fd->range.refid, fd->range.start, fd->range.end);
5802
0
        if (fd->range.refid != -2)
5803
0
            fd->required_fields |= SAM_POS;
5804
0
        pthread_mutex_unlock(&fd->range_lock);
5805
0
        return r;
5806
0
    }
5807
5808
0
    case CRAM_OPT_RANGE_NOSEEK: {
5809
        // As per CRAM_OPT_RANGE, but no seeking
5810
0
        pthread_mutex_lock(&fd->range_lock);
5811
0
        cram_range *r = va_arg(args, cram_range *);
5812
0
        fd->range = *r;
5813
0
        if (r->refid == HTS_IDX_NOCOOR) {
5814
0
            fd->range.refid = -1;
5815
0
            fd->range.start = 0;
5816
0
        } else if (r->refid == HTS_IDX_START || r->refid == HTS_IDX_REST) {
5817
0
            fd->range.refid = -2; // special case in cram_next_slice
5818
0
        }
5819
0
        if (fd->range.refid != -2)
5820
0
            fd->required_fields |= SAM_POS;
5821
0
        fd->ooc = 0;
5822
0
        fd->eof = 0;
5823
0
        pthread_mutex_unlock(&fd->range_lock);
5824
0
        return 0;
5825
0
    }
5826
5827
0
    case CRAM_OPT_REFERENCE:
5828
0
        return cram_load_reference(fd, va_arg(args, char *));
5829
5830
0
    case CRAM_OPT_VERSION: {
5831
0
        int major, minor;
5832
0
        char *s = va_arg(args, char *);
5833
0
        if (2 != sscanf(s, "%d.%d", &major, &minor)) {
5834
0
            hts_log_error("Malformed version string %s", s);
5835
0
            return -1;
5836
0
        }
5837
0
        if (!((major == 1 &&  minor == 0) ||
5838
0
              (major == 2 && (minor == 0 || minor == 1)) ||
5839
0
              (major == 3 && (minor == 0 || minor == 1)) ||
5840
0
              (major == 4 &&  minor == 0))) {
5841
0
            hts_log_error("Unknown version string; use 1.0, 2.0, 2.1, 3.0, 3.1 or 4.0");
5842
0
            errno = EINVAL;
5843
0
            return -1;
5844
0
        }
5845
5846
0
        if (major > 3 || (major == 3 && minor > 1)) {
5847
0
            hts_log_warning(
5848
0
                "CRAM version %s is still a draft and subject to change.\n"
5849
0
                "This is a technology demonstration that should not be "
5850
0
                "used for archival data.", s);
5851
0
        }
5852
5853
0
        fd->version = major*256 + minor;
5854
5855
0
        fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3) ? 1 : 0;
5856
5857
0
        fd->use_tok = ((CRAM_MAJOR_VERS(fd->version) == 3 &&
5858
0
                        CRAM_MINOR_VERS(fd->version) >= 1) ||
5859
0
                        CRAM_MAJOR_VERS(fd->version) >= 4) ? 1 : 0;
5860
0
        cram_init_tables(fd);
5861
5862
0
        break;
5863
0
    }
5864
5865
0
    case CRAM_OPT_MULTI_SEQ_PER_SLICE:
5866
0
        fd->multi_seq_user = fd->multi_seq = va_arg(args, int);
5867
0
        break;
5868
5869
0
    case CRAM_OPT_NTHREADS: {
5870
0
        int nthreads =  va_arg(args, int);
5871
0
        if (nthreads >= 1) {
5872
0
            if (!(fd->pool = hts_tpool_init(nthreads)))
5873
0
                return -1;
5874
5875
0
            fd->rqueue = hts_tpool_process_init(fd->pool, nthreads*2, 0);
5876
0
            fd->shared_ref = 1;
5877
0
            fd->own_pool = 1;
5878
0
        }
5879
0
        break;
5880
0
    }
5881
5882
0
    case CRAM_OPT_THREAD_POOL: {
5883
0
        htsThreadPool *p = va_arg(args, htsThreadPool *);
5884
0
        fd->pool = p ? p->pool : NULL;
5885
0
        if (fd->pool) {
5886
0
            fd->rqueue = hts_tpool_process_init(fd->pool,
5887
0
                                                p->qsize ? p->qsize : hts_tpool_size(fd->pool)*2,
5888
0
                                                0);
5889
0
        }
5890
0
        fd->shared_ref = 1; // Needed to avoid clobbering ref between threads
5891
0
        fd->own_pool = 0;
5892
5893
        //fd->qsize = 1;
5894
        //fd->decoded = calloc(fd->qsize, sizeof(cram_container *));
5895
        //hts_tpool_dispatch(fd->pool, cram_decoder_thread, fd);
5896
0
        break;
5897
0
    }
5898
5899
0
    case CRAM_OPT_REQUIRED_FIELDS:
5900
0
        fd->required_fields = va_arg(args, int);
5901
0
        if (fd->range.refid != -2)
5902
0
            fd->required_fields |= SAM_POS;
5903
0
        break;
5904
5905
0
    case CRAM_OPT_STORE_MD:
5906
0
        fd->store_md = va_arg(args, int);
5907
0
        break;
5908
5909
0
    case CRAM_OPT_STORE_NM:
5910
0
        fd->store_nm = va_arg(args, int);
5911
0
        break;
5912
5913
0
    case HTS_OPT_COMPRESSION_LEVEL:
5914
0
        fd->level = va_arg(args, int);
5915
0
        break;
5916
5917
0
    case HTS_OPT_PROFILE: {
5918
0
        enum hts_profile_option prof = va_arg(args, int);
5919
0
        switch (prof) {
5920
0
        case HTS_PROFILE_FAST:
5921
0
            if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 1;
5922
0
            fd->use_tok = 0;
5923
0
            fd->seqs_per_slice = 10000;
5924
0
            break;
5925
5926
0
        case HTS_PROFILE_NORMAL:
5927
0
            break;
5928
5929
0
        case HTS_PROFILE_SMALL:
5930
0
            if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 6;
5931
0
            fd->use_bz2 = 1;
5932
0
            fd->use_fqz = 1;
5933
0
            fd->seqs_per_slice = 25000;
5934
0
            break;
5935
5936
0
        case HTS_PROFILE_ARCHIVE:
5937
0
            if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 7;
5938
0
            fd->use_bz2 = 1;
5939
0
            fd->use_fqz = 1;
5940
0
            fd->use_arith = 1;
5941
0
            if (fd->level > 7)
5942
0
                fd->use_lzma = 1;
5943
0
            fd->seqs_per_slice = 100000;
5944
0
            break;
5945
0
        }
5946
5947
0
        if (fd->bases_per_slice == BASES_PER_SLICE)
5948
0
            fd->bases_per_slice = fd->seqs_per_slice * 500;
5949
0
        break;
5950
0
    }
5951
5952
0
    default:
5953
0
        hts_log_error("Unknown CRAM option code %d", opt);
5954
0
        errno = EINVAL;
5955
0
        return -1;
5956
7.81k
    }
5957
5958
7.81k
    return 0;
5959
7.81k
}
5960
5961
int cram_check_EOF(cram_fd *fd)
5962
0
{
5963
    // Byte 9 in these templates is & with 0x0f to resolve differences
5964
    // between ITF-8 interpretations between early Java and C
5965
    // implementations of CRAM
5966
0
    static const unsigned char TEMPLATE_2_1[30] = {
5967
0
        0x0b, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0f, 0xe0,
5968
0
        0x45, 0x4f, 0x46, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
5969
0
        0x01, 0x00, 0x06, 0x06, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00
5970
0
    };
5971
0
    static const unsigned char TEMPLATE_3[38] = {
5972
0
        0x0f, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0f, 0xe0,
5973
0
        0x45, 0x4f, 0x46, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x05,
5974
0
        0xbd, 0xd9, 0x4f, 0x00, 0x01, 0x00, 0x06, 0x06, 0x01, 0x00,
5975
0
        0x01, 0x00, 0x01, 0x00, 0xee, 0x63, 0x01, 0x4b
5976
0
    };
5977
5978
0
    unsigned char buf[38]; // max(sizeof TEMPLATE_*)
5979
5980
0
    uint8_t major = CRAM_MAJOR_VERS(fd->version);
5981
0
    uint8_t minor = CRAM_MINOR_VERS(fd->version);
5982
5983
0
    const unsigned char *template;
5984
0
    ssize_t template_len;
5985
0
    if ((major < 2) ||
5986
0
        (major == 2 && minor == 0)) {
5987
0
        return 3; // No EOF support in cram versions less than 2.1
5988
0
    } else if (major == 2 && minor == 1) {
5989
0
        template = TEMPLATE_2_1;
5990
0
        template_len = sizeof TEMPLATE_2_1;
5991
0
    } else {
5992
0
        template = TEMPLATE_3;
5993
0
        template_len = sizeof TEMPLATE_3;
5994
0
    }
5995
5996
0
    off_t offset = htell(fd->fp);
5997
0
    if (hseek(fd->fp, -template_len, SEEK_END) < 0) {
5998
0
        if (errno == ESPIPE) {
5999
0
            hclearerr(fd->fp);
6000
0
            return 2;
6001
0
        }
6002
0
        else {
6003
0
            return -1;
6004
0
        }
6005
0
    }
6006
0
    if (hread(fd->fp, buf, template_len) != template_len) return -1;
6007
0
    if (hseek(fd->fp, offset, SEEK_SET) < 0) return -1;
6008
0
    buf[8] &= 0x0f;
6009
0
    return (memcmp(template, buf, template_len) == 0)? 1 : 0;
6010
0
}