Coverage Report

Created: 2026-01-09 06:27

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/cram/cram_io.c
Line
Count
Source
1
/*
2
Copyright (c) 2012-2025 Genome Research Ltd.
3
Author: James Bonfield <jkb@sanger.ac.uk>
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
   1. Redistributions of source code must retain the above copyright notice,
9
this list of conditions and the following disclaimer.
10
11
   2. Redistributions in binary form must reproduce the above copyright notice,
12
this list of conditions and the following disclaimer in the documentation
13
and/or other materials provided with the distribution.
14
15
   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16
Institute nor the names of its contributors may be used to endorse or promote
17
products derived from this software without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
/*
32
 * CRAM I/O primitives.
33
 *
34
 * - ITF8 encoding and decoding.
35
 * - Block based I/O
36
 * - Zlib inflating and deflating (memory)
37
 * - CRAM basic data structure reading and writing
38
 * - File opening / closing
39
 * - Reference sequence handling
40
 */
41
42
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
43
#include <config.h>
44
45
#include <stdio.h>
46
#include <errno.h>
47
#include <assert.h>
48
#include <stdlib.h>
49
#include <string.h>
50
#include <unistd.h>
51
#include <zlib.h>
52
#ifdef HAVE_LIBBZ2
53
#include <bzlib.h>
54
#endif
55
#ifdef HAVE_LIBLZMA
56
#ifdef HAVE_LZMA_H
57
#include <lzma.h>
58
#else
59
#include "../os/lzma_stub.h"
60
#endif
61
#endif
62
#include <sys/types.h>
63
#include <sys/stat.h>
64
#include <math.h>
65
#include <stdint.h>
66
67
#ifdef HAVE_LIBDEFLATE
68
#include <libdeflate.h>
69
#define crc32(a,b,c) libdeflate_crc32((a),(b),(c))
70
#endif
71
72
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
73
#include "../fuzz_settings.h"
74
#endif
75
76
#include "cram.h"
77
#include "os.h"
78
#include "../htslib/hts.h"
79
#include "../hts_internal.h"
80
#include "open_trace_file.h"
81
82
#if defined(HAVE_EXTERNAL_LIBHTSCODECS)
83
#include <htscodecs/rANS_static.h>
84
#include <htscodecs/rANS_static4x16.h>
85
#include <htscodecs/arith_dynamic.h>
86
#include <htscodecs/tokenise_name3.h>
87
#include <htscodecs/fqzcomp_qual.h>
88
#include <htscodecs/varint.h> // CRAM v4.0 variable-size integers
89
#else
90
#include "../htscodecs/htscodecs/rANS_static.h"
91
#include "../htscodecs/htscodecs/rANS_static4x16.h"
92
#include "../htscodecs/htscodecs/arith_dynamic.h"
93
#include "../htscodecs/htscodecs/tokenise_name3.h"
94
#include "../htscodecs/htscodecs/fqzcomp_qual.h"
95
#include "../htscodecs/htscodecs/varint.h"
96
#endif
97
98
//#define REF_DEBUG
99
100
#ifdef REF_DEBUG
101
#include <sys/syscall.h>
102
#define gettid() (int)syscall(SYS_gettid)
103
104
#define RP(...) fprintf (stderr, __VA_ARGS__)
105
#else
106
#define RP(...)
107
#endif
108
109
#include "../htslib/hfile.h"
110
#include "../htslib/bgzf.h"
111
#include "../htslib/faidx.h"
112
#include "../hts_internal.h"
113
114
#ifndef PATH_MAX
115
#define PATH_MAX FILENAME_MAX
116
#endif
117
118
521k
#define TRIAL_SPAN 70
119
521k
#define NTRIALS 3
120
121
4.57k
#define CRAM_DEFAULT_LEVEL 5
122
123
/* ----------------------------------------------------------------------
124
 * ITF8 encoding and decoding.
125
 *
126
 * Also see the itf8_get and itf8_put macros in cram_io.h
127
 */
128
129
/*
130
 * LEGACY: consider using itf8_decode_crc.
131
 *
132
 * Reads an integer in ITF-8 encoding from 'cp' and stores it in
133
 * *val.
134
 *
135
 * Returns the number of bytes read on success
136
 *        -1 on failure
137
 */
138
0
int itf8_decode(cram_fd *fd, int32_t *val_p) {
139
0
    static int nbytes[16] = {
140
0
        0,0,0,0, 0,0,0,0,                               // 0000xxxx - 0111xxxx
141
0
        1,1,1,1,                                        // 1000xxxx - 1011xxxx
142
0
        2,2,                                            // 1100xxxx - 1101xxxx
143
0
        3,                                              // 1110xxxx
144
0
        4,                                              // 1111xxxx
145
0
    };
146
147
0
    static int nbits[16] = {
148
0
        0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
149
0
        0x3f, 0x3f, 0x3f, 0x3f,                         // 1000xxxx - 1011xxxx
150
0
        0x1f, 0x1f,                                     // 1100xxxx - 1101xxxx
151
0
        0x0f,                                           // 1110xxxx
152
0
        0x0f,                                           // 1111xxxx
153
0
    };
154
155
0
    int32_t val = hgetc(fd->fp);
156
0
    if (val == -1)
157
0
        return -1;
158
159
0
    int i = nbytes[val>>4];
160
0
    val &= nbits[val>>4];
161
162
0
    switch(i) {
163
0
    case 0:
164
0
        *val_p = val;
165
0
        return 1;
166
167
0
    case 1:
168
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
169
0
        *val_p = val;
170
0
        return 2;
171
172
0
    case 2:
173
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
174
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
175
0
        *val_p = val;
176
0
        return 3;
177
178
0
    case 3:
179
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
180
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
181
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
182
0
        *val_p = val;
183
0
        return 4;
184
185
0
    case 4: // really 3.5 more, why make it different?
186
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
187
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
188
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
189
0
        val = (val<<4) | (((unsigned char)hgetc(fd->fp)) & 0x0f);
190
0
        *val_p = val;
191
0
    }
192
193
0
    return 5;
194
0
}
195
196
163k
int itf8_decode_crc(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
197
163k
    static int nbytes[16] = {
198
163k
        0,0,0,0, 0,0,0,0,                               // 0000xxxx - 0111xxxx
199
163k
        1,1,1,1,                                        // 1000xxxx - 1011xxxx
200
163k
        2,2,                                            // 1100xxxx - 1101xxxx
201
163k
        3,                                              // 1110xxxx
202
163k
        4,                                              // 1111xxxx
203
163k
    };
204
205
163k
    static int nbits[16] = {
206
163k
        0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
207
163k
        0x3f, 0x3f, 0x3f, 0x3f,                         // 1000xxxx - 1011xxxx
208
163k
        0x1f, 0x1f,                                     // 1100xxxx - 1101xxxx
209
163k
        0x0f,                                           // 1110xxxx
210
163k
        0x0f,                                           // 1111xxxx
211
163k
    };
212
163k
    unsigned char c[5];
213
214
163k
    int32_t val = hgetc(fd->fp);
215
163k
    if (val == -1)
216
24
        return -1;
217
218
163k
    c[0]=val;
219
220
163k
    int i = nbytes[val>>4];
221
163k
    val &= nbits[val>>4];
222
223
163k
    if (i > 0) {
224
25.1k
        if (hread(fd->fp, &c[1], i) < i)
225
0
            return -1;
226
25.1k
    }
227
228
163k
    switch(i) {
229
138k
    case 0:
230
138k
        *val_p = val;
231
138k
        *crc = crc32(*crc, c, 1);
232
138k
        return 1;
233
234
21.4k
    case 1:
235
21.4k
        val = (val<<8) | c[1];
236
21.4k
        *val_p = val;
237
21.4k
        *crc = crc32(*crc, c, 2);
238
21.4k
        return 2;
239
240
772
    case 2:
241
772
        val = (val<<8) | c[1];
242
772
        val = (val<<8) | c[2];
243
772
        *val_p = val;
244
772
        *crc = crc32(*crc, c, 3);
245
772
        return 3;
246
247
886
    case 3:
248
886
        val = (val<<8) | c[1];
249
886
        val = (val<<8) | c[2];
250
886
        val = (val<<8) | c[3];
251
886
        *val_p = val;
252
886
        *crc = crc32(*crc, c, 4);
253
886
        return 4;
254
255
2.11k
    case 4: // really 3.5 more, why make it different?
256
2.11k
        {
257
2.11k
            uint32_t uv = val;
258
2.11k
            uv = (uv<<8) |  c[1];
259
2.11k
            uv = (uv<<8) |  c[2];
260
2.11k
            uv = (uv<<8) |  c[3];
261
2.11k
            uv = (uv<<4) | (c[4] & 0x0f);
262
            // Avoid implementation-defined behaviour on negative values
263
2.11k
            *val_p = uv < 0x80000000UL ? (int32_t) uv : -((int32_t) (0xffffffffUL - uv)) - 1;
264
2.11k
            *crc = crc32(*crc, c, 5);
265
2.11k
        }
266
163k
    }
267
268
2.11k
    return 5;
269
163k
}
270
271
/*
272
 * Stores a value to memory in ITF-8 format.
273
 *
274
 * Returns the number of bytes required to store the number.
275
 * This is a maximum of 5 bytes.
276
 */
277
22.9M
static inline int itf8_put(char *cp, int32_t val) {
278
22.9M
    unsigned char *up = (unsigned char *)cp;
279
22.9M
    if        (!(val & ~0x00000007f)) { // 1 byte
280
21.0M
        *up = val;
281
21.0M
        return 1;
282
21.0M
    } else if (!(val & ~0x00003fff)) { // 2 byte
283
406k
        *up++ = (val >> 8 ) | 0x80;
284
406k
        *up   = val & 0xff;
285
406k
        return 2;
286
1.49M
    } else if (!(val & ~0x01fffff)) { // 3 byte
287
29.6k
        *up++ = (val >> 16) | 0xc0;
288
29.6k
        *up++ = (val >> 8 ) & 0xff;
289
29.6k
        *up   = val & 0xff;
290
29.6k
        return 3;
291
1.46M
    } else if (!(val & ~0x0fffffff)) { // 4 byte
292
1.25M
        *up++ = (val >> 24) | 0xe0;
293
1.25M
        *up++ = (val >> 16) & 0xff;
294
1.25M
        *up++ = (val >> 8 ) & 0xff;
295
1.25M
        *up   = val & 0xff;
296
1.25M
        return 4;
297
1.25M
    } else {                           // 5 byte
298
216k
        *up++ = 0xf0 | ((val>>28) & 0xff);
299
216k
        *up++ = (val >> 20) & 0xff;
300
216k
        *up++ = (val >> 12) & 0xff;
301
216k
        *up++ = (val >> 4 ) & 0xff;
302
216k
        *up = val & 0x0f;
303
216k
        return 5;
304
216k
    }
305
22.9M
}
306
307
308
/* 64-bit itf8 variant */
309
143k
static inline int ltf8_put(char *cp, int64_t val) {
310
143k
    unsigned char *up = (unsigned char *)cp;
311
143k
    if        (!(val & ~((1LL<<7)-1))) {
312
88.1k
        *up = val;
313
88.1k
        return 1;
314
88.1k
    } else if (!(val & ~((1LL<<(6+8))-1))) {
315
54.4k
        *up++ = (val >> 8 ) | 0x80;
316
54.4k
        *up   = val & 0xff;
317
54.4k
        return 2;
318
54.4k
    } else if (!(val & ~((1LL<<(5+2*8))-1))) {
319
1.16k
        *up++ = (val >> 16) | 0xc0;
320
1.16k
        *up++ = (val >> 8 ) & 0xff;
321
1.16k
        *up   = val & 0xff;
322
1.16k
        return 3;
323
1.16k
    } else if (!(val & ~((1LL<<(4+3*8))-1))) {
324
78
        *up++ = (val >> 24) | 0xe0;
325
78
        *up++ = (val >> 16) & 0xff;
326
78
        *up++ = (val >> 8 ) & 0xff;
327
78
        *up   = val & 0xff;
328
78
        return 4;
329
78
    } else if (!(val & ~((1LL<<(3+4*8))-1))) {
330
0
        *up++ = (val >> 32) | 0xf0;
331
0
        *up++ = (val >> 24) & 0xff;
332
0
        *up++ = (val >> 16) & 0xff;
333
0
        *up++ = (val >> 8 ) & 0xff;
334
0
        *up   = val & 0xff;
335
0
        return 5;
336
0
    } else if (!(val & ~((1LL<<(2+5*8))-1))) {
337
0
        *up++ = (val >> 40) | 0xf8;
338
0
        *up++ = (val >> 32) & 0xff;
339
0
        *up++ = (val >> 24) & 0xff;
340
0
        *up++ = (val >> 16) & 0xff;
341
0
        *up++ = (val >> 8 ) & 0xff;
342
0
        *up   = val & 0xff;
343
0
        return 6;
344
0
    } else if (!(val & ~((1LL<<(1+6*8))-1))) {
345
0
        *up++ = (val >> 48) | 0xfc;
346
0
        *up++ = (val >> 40) & 0xff;
347
0
        *up++ = (val >> 32) & 0xff;
348
0
        *up++ = (val >> 24) & 0xff;
349
0
        *up++ = (val >> 16) & 0xff;
350
0
        *up++ = (val >> 8 ) & 0xff;
351
0
        *up   = val & 0xff;
352
0
        return 7;
353
0
    } else if (!(val & ~((1LL<<(7*8))-1))) {
354
0
        *up++ = (val >> 56) | 0xfe;
355
0
        *up++ = (val >> 48) & 0xff;
356
0
        *up++ = (val >> 40) & 0xff;
357
0
        *up++ = (val >> 32) & 0xff;
358
0
        *up++ = (val >> 24) & 0xff;
359
0
        *up++ = (val >> 16) & 0xff;
360
0
        *up++ = (val >> 8 ) & 0xff;
361
0
        *up   = val & 0xff;
362
0
        return 8;
363
0
    } else {
364
0
        *up++ = 0xff;
365
0
        *up++ = (val >> 56) & 0xff;
366
0
        *up++ = (val >> 48) & 0xff;
367
0
        *up++ = (val >> 40) & 0xff;
368
0
        *up++ = (val >> 32) & 0xff;
369
0
        *up++ = (val >> 24) & 0xff;
370
0
        *up++ = (val >> 16) & 0xff;
371
0
        *up++ = (val >> 8 ) & 0xff;
372
0
        *up   = val & 0xff;
373
0
        return 9;
374
0
    }
375
143k
}
376
377
/*
378
 * Encodes and writes a single integer in ITF-8 format.
379
 * Returns 0 on success
380
 *        -1 on failure
381
 */
382
0
int itf8_encode(cram_fd *fd, int32_t val) {
383
0
    char buf[5];
384
0
    int len = itf8_put(buf, val);
385
0
    return hwrite(fd->fp, buf, len) == len ? 0 : -1;
386
0
}
387
388
const int itf8_bytes[16] = {
389
    1, 1, 1, 1,  1, 1, 1, 1,
390
    2, 2, 2, 2,  3, 3, 4, 5
391
};
392
393
const int ltf8_bytes[256] = {
394
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
395
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
396
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
397
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
398
399
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
400
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
401
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
402
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
403
404
    2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
405
    2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
406
    2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
407
    2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
408
409
    3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,
410
    3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,
411
412
    4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
413
414
    5, 5, 5, 5,  5, 5, 5, 5,  6, 6, 6, 6,  7, 7, 8, 9
415
};
416
417
/*
418
 * LEGACY: consider using ltf8_decode_crc.
419
 */
420
0
int ltf8_decode(cram_fd *fd, int64_t *val_p) {
421
0
    int c = hgetc(fd->fp);
422
0
    int64_t val = (unsigned char)c;
423
0
    if (c == -1)
424
0
        return -1;
425
426
0
    if (val < 0x80) {
427
0
        *val_p =   val;
428
0
        return 1;
429
430
0
    } else if (val < 0xc0) {
431
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
432
0
        *val_p = val & (((1LL<<(6+8)))-1);
433
0
        return 2;
434
435
0
    } else if (val < 0xe0) {
436
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
437
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
438
0
        *val_p = val & ((1LL<<(5+2*8))-1);
439
0
        return 3;
440
441
0
    } else if (val < 0xf0) {
442
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
443
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
444
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
445
0
        *val_p = val & ((1LL<<(4+3*8))-1);
446
0
        return 4;
447
448
0
    } else if (val < 0xf8) {
449
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
450
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
451
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
452
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
453
0
        *val_p = val & ((1LL<<(3+4*8))-1);
454
0
        return 5;
455
456
0
    } else if (val < 0xfc) {
457
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
458
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
459
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
460
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
461
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
462
0
        *val_p = val & ((1LL<<(2+5*8))-1);
463
0
        return 6;
464
465
0
    } else if (val < 0xfe) {
466
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
467
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
468
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
469
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
470
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
471
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
472
0
        *val_p = val & ((1LL<<(1+6*8))-1);
473
0
        return 7;
474
475
0
    } else if (val < 0xff) {
476
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
477
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
478
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
479
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
480
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
481
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
482
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
483
0
        *val_p = val & ((1LL<<(7*8))-1);
484
0
        return 8;
485
486
0
    } else {
487
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
488
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
489
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
490
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
491
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
492
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
493
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
494
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
495
0
        *val_p = val;
496
0
    }
497
498
0
    return 9;
499
0
}
500
501
1.84k
int ltf8_decode_crc(cram_fd *fd, int64_t *val_p, uint32_t *crc) {
502
1.84k
    unsigned char c[9];
503
1.84k
    int64_t val = hgetc(fd->fp);
504
1.84k
    if (val < 0)
505
0
        return -1;
506
507
1.84k
    c[0] = val;
508
509
1.84k
    if (val < 0x80) {
510
1.73k
        *val_p =   val;
511
1.73k
        *crc = crc32(*crc, c, 1);
512
1.73k
        return 1;
513
514
1.73k
    } else if (val < 0xc0) {
515
45
        int v = hgetc(fd->fp);
516
45
        if (v < 0)
517
0
            return -1;
518
45
        val = (val<<8) | (c[1]=v);
519
45
        *val_p = val & (((1LL<<(6+8)))-1);
520
45
        *crc = crc32(*crc, c, 2);
521
45
        return 2;
522
523
72
    } else if (val < 0xe0) {
524
14
        if (hread(fd->fp, &c[1], 2) < 2)
525
0
            return -1;
526
14
        val = (val<<8) | c[1];
527
14
        val = (val<<8) | c[2];
528
14
        *val_p = val & ((1LL<<(5+2*8))-1);
529
14
        *crc = crc32(*crc, c, 3);
530
14
        return 3;
531
532
58
    } else if (val < 0xf0) {
533
9
        if (hread(fd->fp, &c[1], 3) < 3)
534
0
            return -1;
535
9
        val = (val<<8) | c[1];
536
9
        val = (val<<8) | c[2];
537
9
        val = (val<<8) | c[3];
538
9
        *val_p = val & ((1LL<<(4+3*8))-1);
539
9
        *crc = crc32(*crc, c, 4);
540
9
        return 4;
541
542
49
    } else if (val < 0xf8) {
543
24
        if (hread(fd->fp, &c[1], 4) < 4)
544
0
            return -1;
545
24
        val = (val<<8) | c[1];
546
24
        val = (val<<8) | c[2];
547
24
        val = (val<<8) | c[3];
548
24
        val = (val<<8) | c[4];
549
24
        *val_p = val & ((1LL<<(3+4*8))-1);
550
24
        *crc = crc32(*crc, c, 5);
551
24
        return 5;
552
553
25
    } else if (val < 0xfc) {
554
0
        if (hread(fd->fp, &c[1], 5) < 5)
555
0
            return -1;
556
0
        val = (val<<8) | c[1];
557
0
        val = (val<<8) | c[2];
558
0
        val = (val<<8) | c[3];
559
0
        val = (val<<8) | c[4];
560
0
        val = (val<<8) | c[5];
561
0
        *val_p = val & ((1LL<<(2+5*8))-1);
562
0
        *crc = crc32(*crc, c, 6);
563
0
        return 6;
564
565
25
    } else if (val < 0xfe) {
566
24
        if (hread(fd->fp, &c[1], 6) < 6)
567
0
            return -1;
568
24
        val = (val<<8) | c[1];
569
24
        val = (val<<8) | c[2];
570
24
        val = (val<<8) | c[3];
571
24
        val = (val<<8) | c[4];
572
24
        val = (val<<8) | c[5];
573
24
        val = (val<<8) | c[6];
574
24
        *val_p = val & ((1LL<<(1+6*8))-1);
575
24
        *crc = crc32(*crc, c, 7);
576
24
        return 7;
577
578
24
    } else if (val < 0xff) {
579
0
        uint64_t uval = val;
580
0
        if (hread(fd->fp, &c[1], 7) < 7)
581
0
            return -1;
582
0
        uval = (uval<<8) | c[1];
583
0
        uval = (uval<<8) | c[2];
584
0
        uval = (uval<<8) | c[3];
585
0
        uval = (uval<<8) | c[4];
586
0
        uval = (uval<<8) | c[5];
587
0
        uval = (uval<<8) | c[6];
588
0
        uval = (uval<<8) | c[7];
589
0
        *val_p = uval & ((1ULL<<(7*8))-1);
590
0
        *crc = crc32(*crc, c, 8);
591
0
        return 8;
592
593
1
    } else {
594
1
        uint64_t uval;
595
1
        if (hread(fd->fp, &c[1], 8) < 8)
596
0
            return -1;
597
1
        uval =             c[1];
598
1
        uval = (uval<<8) | c[2];
599
1
        uval = (uval<<8) | c[3];
600
1
        uval = (uval<<8) | c[4];
601
1
        uval = (uval<<8) | c[5];
602
1
        uval = (uval<<8) | c[6];
603
1
        uval = (uval<<8) | c[7];
604
1
        uval = (uval<<8) | c[8];
605
1
        *crc = crc32(*crc, c, 9);
606
        // Avoid implementation-defined behaviour on negative values
607
1
        *val_p = c[1] < 0x80 ? (int64_t) uval : -((int64_t) (0xffffffffffffffffULL - uval)) - 1;
608
1
    }
609
610
1
    return 9;
611
1.84k
}
612
613
/*
614
 * Pushes a value in ITF8 format onto the end of a block.
615
 * This shouldn't be used for high-volume data as it is not the fastest
616
 * method.
617
 *
618
 * Returns the number of bytes written
619
 */
620
15.0M
int itf8_put_blk(cram_block *blk, int32_t val) {
621
15.0M
    char buf[5];
622
15.0M
    int sz;
623
624
15.0M
    sz = itf8_put(buf, val);
625
15.0M
    BLOCK_APPEND(blk, buf, sz);
626
15.0M
    return sz;
627
628
0
 block_err:
629
0
    return -1;
630
15.0M
}
631
632
0
int ltf8_put_blk(cram_block *blk, int64_t val) {
633
0
    char buf[9];
634
0
    int sz;
635
636
0
    sz = ltf8_put(buf, val);
637
0
    BLOCK_APPEND(blk, buf, sz);
638
0
    return sz;
639
640
0
 block_err:
641
0
    return -1;
642
0
}
643
644
203k
static int64_t safe_itf8_get(char **cp, const char *endp, int *err) {
645
203k
    const unsigned char *up = (unsigned char *)*cp;
646
647
203k
    if (endp && endp - *cp < 5 &&
648
21.7k
        (*cp >= endp || endp - *cp < itf8_bytes[up[0]>>4])) {
649
12.4k
        if (err) *err = 1;
650
12.4k
        return 0;
651
12.4k
    }
652
653
191k
    if (up[0] < 0x80) {
654
171k
        (*cp)++;
655
171k
        return up[0];
656
171k
    } else if (up[0] < 0xc0) {
657
8.92k
        (*cp)+=2;
658
8.92k
        return ((up[0] <<8) |  up[1])                           & 0x3fff;
659
10.6k
    } else if (up[0] < 0xe0) {
660
2.49k
        (*cp)+=3;
661
2.49k
        return ((up[0]<<16) | (up[1]<< 8) |  up[2])             & 0x1fffff;
662
8.13k
    } else if (up[0] < 0xf0) {
663
2.99k
        (*cp)+=4;
664
2.99k
        uint32_t uv = (((uint32_t)up[0]<<24) | (up[1]<<16) | (up[2]<<8) | up[3]) & 0x0fffffff;
665
2.99k
        return (int32_t)uv;
666
5.13k
    } else {
667
5.13k
        (*cp)+=5;
668
5.13k
        uint32_t uv = (((uint32_t)up[0] & 0x0f)<<28) | (up[1]<<20) | (up[2]<<12) | (up[3]<<4) | (up[4] & 0x0f);
669
5.13k
        return (int32_t)uv;
670
5.13k
    }
671
191k
}
672
673
896
static int64_t safe_ltf8_get(char **cp, const char *endp, int *err) {
674
896
    unsigned char *up = (unsigned char *)*cp;
675
676
896
    if (endp && endp - *cp < 9 &&
677
884
        (*cp >= endp || endp - *cp < ltf8_bytes[up[0]])) {
678
282
        if (err) *err = 1;
679
282
        return 0;
680
282
    }
681
682
614
    if (up[0] < 0x80) {
683
276
        (*cp)++;
684
276
        return up[0];
685
338
    } else if (up[0] < 0xc0) {
686
46
        (*cp)+=2;
687
46
        return (((uint64_t)up[0]<< 8) |
688
46
                 (uint64_t)up[1]) & (((1LL<<(6+8)))-1);
689
292
    } else if (up[0] < 0xe0) {
690
23
        (*cp)+=3;
691
23
        return (((uint64_t)up[0]<<16) |
692
23
                ((uint64_t)up[1]<< 8) |
693
23
                 (uint64_t)up[2]) & ((1LL<<(5+2*8))-1);
694
269
    } else if (up[0] < 0xf0) {
695
79
        (*cp)+=4;
696
79
        return (((uint64_t)up[0]<<24) |
697
79
                ((uint64_t)up[1]<<16) |
698
79
                ((uint64_t)up[2]<< 8) |
699
79
                 (uint64_t)up[3]) & ((1LL<<(4+3*8))-1);
700
190
    } else if (up[0] < 0xf8) {
701
164
        (*cp)+=5;
702
164
        return (((uint64_t)up[0]<<32) |
703
164
                ((uint64_t)up[1]<<24) |
704
164
                ((uint64_t)up[2]<<16) |
705
164
                ((uint64_t)up[3]<< 8) |
706
164
                 (uint64_t)up[4]) & ((1LL<<(3+4*8))-1);
707
164
    } else if (up[0] < 0xfc) {
708
24
        (*cp)+=6;
709
24
        return (((uint64_t)up[0]<<40) |
710
24
                ((uint64_t)up[1]<<32) |
711
24
                ((uint64_t)up[2]<<24) |
712
24
                ((uint64_t)up[3]<<16) |
713
24
                ((uint64_t)up[4]<< 8) |
714
24
                 (uint64_t)up[5]) & ((1LL<<(2+5*8))-1);
715
24
    } else if (up[0] < 0xfe) {
716
0
        (*cp)+=7;
717
0
        return (((uint64_t)up[0]<<48) |
718
0
                ((uint64_t)up[1]<<40) |
719
0
                ((uint64_t)up[2]<<32) |
720
0
                ((uint64_t)up[3]<<24) |
721
0
                ((uint64_t)up[4]<<16) |
722
0
                ((uint64_t)up[5]<< 8) |
723
0
                 (uint64_t)up[6]) & ((1LL<<(1+6*8))-1);
724
2
    } else if (up[0] < 0xff) {
725
0
        (*cp)+=8;
726
0
        return (((uint64_t)up[1]<<48) |
727
0
                ((uint64_t)up[2]<<40) |
728
0
                ((uint64_t)up[3]<<32) |
729
0
                ((uint64_t)up[4]<<24) |
730
0
                ((uint64_t)up[5]<<16) |
731
0
                ((uint64_t)up[6]<< 8) |
732
0
                 (uint64_t)up[7]) & ((1LL<<(7*8))-1);
733
2
    } else {
734
2
        (*cp)+=9;
735
2
        return (((uint64_t)up[1]<<56) |
736
2
                ((uint64_t)up[2]<<48) |
737
2
                ((uint64_t)up[3]<<40) |
738
2
                ((uint64_t)up[4]<<32) |
739
2
                ((uint64_t)up[5]<<24) |
740
2
                ((uint64_t)up[6]<<16) |
741
2
                ((uint64_t)up[7]<< 8) |
742
2
                 (uint64_t)up[8]);
743
2
    }
744
614
}
745
746
// Wrapper for now
747
7.95M
static int safe_itf8_put(char *cp, char *cp_end, int32_t val) {
748
7.95M
    return itf8_put(cp, val);
749
7.95M
}
750
751
143k
static int safe_ltf8_put(char *cp, char *cp_end, int64_t val) {
752
143k
    return ltf8_put(cp, val);
753
143k
}
754
755
1.92M
static int itf8_size(int64_t v) {
756
1.92M
    return ((!((v)&~0x7f))?1:(!((v)&~0x3fff))?2:(!((v)&~0x1fffff))?3:(!((v)&~0xfffffff))?4:5);
757
1.92M
}
758
759
//-----------------------------------------------------------------------------
760
761
// CRAM v4.0 onwards uses a different variable sized integer encoding
762
// that is size agnostic.
763
764
// Local interface to varint.h inline version, so we can use in func ptr.
765
// Note a lot of these use the unsigned interface but take signed int64_t.
766
// This is because the old CRAM ITF8 inteface had signed -1 as unsigned
767
// 0xffffffff.
768
1.97k
static int uint7_size(int64_t v) {
769
1.97k
    return var_size_u64(v);
770
1.97k
}
771
772
0
static int64_t uint7_get_32(char **cp, const char *endp, int *err) {
773
0
    uint32_t val;
774
0
    int nb = var_get_u32((uint8_t *)(*cp), (const uint8_t *)endp, &val);
775
0
    (*cp) += nb;
776
0
    if (!nb && err) *err = 1;
777
0
    return val;
778
0
}
779
780
0
static int64_t sint7_get_32(char **cp, const char *endp, int *err) {
781
0
    int32_t val;
782
0
    int nb = var_get_s32((uint8_t *)(*cp), (const uint8_t *)endp, &val);
783
0
    (*cp) += nb;
784
0
    if (!nb && err) *err = 1;
785
0
    return val;
786
0
}
787
788
0
static int64_t uint7_get_64(char **cp, const char *endp, int *err) {
789
0
    uint64_t val;
790
0
    int nb = var_get_u64((uint8_t *)(*cp), (const uint8_t *)endp, &val);
791
0
    (*cp) += nb;
792
0
    if (!nb && err) *err = 1;
793
0
    return val;
794
0
}
795
796
0
static int64_t sint7_get_64(char **cp, const char *endp, int *err) {
797
0
    int64_t val;
798
0
    int nb = var_get_s64((uint8_t *)(*cp), (const uint8_t *)endp, &val);
799
0
    (*cp) += nb;
800
0
    if (!nb && err) *err = 1;
801
0
    return val;
802
0
}
803
804
0
static int uint7_put_32(char *cp, char *endp, int32_t val) {
805
0
    return var_put_u32((uint8_t *)cp, (uint8_t *)endp, val);
806
0
}
807
808
0
static int sint7_put_32(char *cp, char *endp, int32_t val) {
809
0
    return var_put_s32((uint8_t *)cp, (uint8_t *)endp, val);
810
0
}
811
812
0
static int uint7_put_64(char *cp, char *endp, int64_t val) {
813
0
    return var_put_u64((uint8_t *)cp, (uint8_t *)endp, val);
814
0
}
815
816
0
static int sint7_put_64(char *cp, char *endp, int64_t val) {
817
0
    return var_put_s64((uint8_t *)cp, (uint8_t *)endp, val);
818
0
}
819
820
// Put direct to to cram_block
821
0
static int uint7_put_blk_32(cram_block *blk, int32_t v) {
822
0
    uint8_t buf[10];
823
0
    int sz = var_put_u32(buf, buf+10, v);
824
0
    BLOCK_APPEND(blk, buf, sz);
825
0
    return sz;
826
827
0
 block_err:
828
0
    return -1;
829
0
}
830
831
0
static int sint7_put_blk_32(cram_block *blk, int32_t v) {
832
0
    uint8_t buf[10];
833
0
    int sz = var_put_s32(buf, buf+10, v);
834
0
    BLOCK_APPEND(blk, buf, sz);
835
0
    return sz;
836
837
0
 block_err:
838
0
    return -1;
839
0
}
840
841
0
static int uint7_put_blk_64(cram_block *blk, int64_t v) {
842
0
    uint8_t buf[10];
843
0
    int sz = var_put_u64(buf, buf+10, v);
844
0
    BLOCK_APPEND(blk, buf, sz);
845
0
    return sz;
846
847
0
 block_err:
848
0
    return -1;
849
0
}
850
851
0
static int sint7_put_blk_64(cram_block *blk, int64_t v) {
852
0
    uint8_t buf[10];
853
0
    int sz = var_put_s64(buf, buf+10, v);
854
0
    BLOCK_APPEND(blk, buf, sz);
855
0
    return sz;
856
857
0
 block_err:
858
0
    return -1;
859
0
}
860
861
// Decode 32-bits with CRC update from cram_fd
862
16.1k
static int uint7_decode_crc32(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
863
16.1k
    uint8_t b[5], i = 0;
864
16.1k
    int c;
865
16.1k
    uint32_t v = 0;
866
867
#ifdef VARINT2
868
    b[0] = hgetc(fd->fp);
869
    if (b[0] < 177) {
870
    } else if (b[0] < 241) {
871
        b[1] = hgetc(fd->fp);
872
    } else if (b[0] < 249) {
873
        b[1] = hgetc(fd->fp);
874
        b[2] = hgetc(fd->fp);
875
    } else {
876
        int n = b[0]+2, z = 1;
877
        while (n-- >= 249)
878
            b[z++] = hgetc(fd->fp);
879
    }
880
    i = var_get_u32(b, NULL, &v);
881
#else
882
//    // Little endian
883
//    int s = 0;
884
//    do {
885
//        b[i++] = c = hgetc(fd->fp);
886
//        if (c < 0)
887
//            return -1;
888
//        v |= (c & 0x7f) << s;
889
//      s += 7;
890
//    } while (i < 5 && (c & 0x80));
891
892
    // Big endian, see also htscodecs/varint.h
893
22.4k
    do {
894
22.4k
        b[i++] = c = hgetc(fd->fp);
895
22.4k
        if (c < 0)
896
3
            return -1;
897
22.4k
        v = (v<<7) | (c & 0x7f);
898
22.4k
    } while (i < 5 && (c & 0x80));
899
16.1k
#endif
900
16.1k
    *crc = crc32(*crc, b, i);
901
902
16.1k
    *val_p = v;
903
16.1k
    return i;
904
16.1k
}
905
906
// Decode 32-bits with CRC update from cram_fd
907
2.39k
static int sint7_decode_crc32(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
908
2.39k
    uint8_t b[5], i = 0;
909
2.39k
    int c;
910
2.39k
    uint32_t v = 0;
911
912
#ifdef VARINT2
913
    b[0] = hgetc(fd->fp);
914
    if (b[0] < 177) {
915
    } else if (b[0] < 241) {
916
        b[1] = hgetc(fd->fp);
917
    } else if (b[0] < 249) {
918
        b[1] = hgetc(fd->fp);
919
        b[2] = hgetc(fd->fp);
920
    } else {
921
        int n = b[0]+2, z = 1;
922
        while (n-- >= 249)
923
            b[z++] = hgetc(fd->fp);
924
    }
925
    i = var_get_u32(b, NULL, &v);
926
#else
927
//    // Little endian
928
//    int s = 0;
929
//    do {
930
//        b[i++] = c = hgetc(fd->fp);
931
//        if (c < 0)
932
//            return -1;
933
//        v |= (c & 0x7f) << s;
934
//      s += 7;
935
//    } while (i < 5 && (c & 0x80));
936
937
    // Big endian, see also htscodecs/varint.h
938
2.68k
    do {
939
2.68k
        b[i++] = c = hgetc(fd->fp);
940
2.68k
        if (c < 0)
941
0
            return -1;
942
2.68k
        v = (v<<7) | (c & 0x7f);
943
2.68k
    } while (i < 5 && (c & 0x80));
944
2.39k
#endif
945
2.39k
    *crc = crc32(*crc, b, i);
946
947
2.39k
    *val_p = (v>>1) ^ -(v&1);
948
2.39k
    return i;
949
2.39k
}
950
951
952
// Decode 64-bits with CRC update from cram_fd
953
9.59k
static int uint7_decode_crc64(cram_fd *fd, int64_t *val_p, uint32_t *crc) {
954
9.59k
    uint8_t b[10], i = 0;
955
9.59k
    int c;
956
9.59k
    uint64_t v = 0;
957
958
#ifdef VARINT2
959
    b[0] = hgetc(fd->fp);
960
    if (b[0] < 177) {
961
    } else if (b[0] < 241) {
962
        b[1] = hgetc(fd->fp);
963
    } else if (b[0] < 249) {
964
        b[1] = hgetc(fd->fp);
965
        b[2] = hgetc(fd->fp);
966
    } else {
967
        int n = b[0]+2, z = 1;
968
        while (n-- >= 249)
969
            b[z++] = hgetc(fd->fp);
970
    }
971
    i = var_get_u64(b, NULL, &v);
972
#else
973
//    // Little endian
974
//    int s = 0;
975
//    do {
976
//        b[i++] = c = hgetc(fd->fp);
977
//        if (c < 0)
978
//            return -1;
979
//        v |= (c & 0x7f) << s;
980
//      s += 7;
981
//    } while (i < 10 && (c & 0x80));
982
983
    // Big endian, see also htscodecs/varint.h
984
10.6k
    do {
985
10.6k
        b[i++] = c = hgetc(fd->fp);
986
10.6k
        if (c < 0)
987
0
            return -1;
988
10.6k
        v = (v<<7) | (c & 0x7f);
989
10.6k
    } while (i < 5 && (c & 0x80));
990
9.59k
#endif
991
9.59k
    *crc = crc32(*crc, b, i);
992
993
9.59k
    *val_p = v;
994
9.59k
    return i;
995
9.59k
}
996
997
//-----------------------------------------------------------------------------
998
999
/*
1000
 * Decodes a 32-bit little endian value from fd and stores in val.
1001
 *
1002
 * Returns the number of bytes read on success
1003
 *         -1 on failure
1004
 */
1005
6.62k
static int int32_decode(cram_fd *fd, int32_t *val) {
1006
6.62k
    int32_t i;
1007
6.62k
    if (4 != hread(fd->fp, &i, 4))
1008
3
        return -1;
1009
1010
6.62k
    *val = le_int4(i);
1011
6.62k
    return 4;
1012
6.62k
}
1013
1014
/*
1015
 * Encodes a 32-bit little endian value 'val' and writes to fd.
1016
 *
1017
 * Returns the number of bytes written on success
1018
 *         -1 on failure
1019
 */
1020
516k
static int int32_encode(cram_fd *fd, int32_t val) {
1021
516k
    uint32_t v = le_int4(val);
1022
516k
    if (4 != hwrite(fd->fp, &v, 4))
1023
0
        return -1;
1024
1025
516k
    return 4;
1026
516k
}
1027
1028
/* As int32_decoded/encode, but from/to blocks instead of cram_fd */
1029
58
int int32_get_blk(cram_block *b, int32_t *val) {
1030
58
    if (b->uncomp_size - BLOCK_SIZE(b) < 4)
1031
1
        return -1;
1032
1033
57
    uint32_t v =
1034
57
         ((uint32_t) b->data[b->byte  ])        |
1035
57
        (((uint32_t) b->data[b->byte+1]) <<  8) |
1036
57
        (((uint32_t) b->data[b->byte+2]) << 16) |
1037
57
        (((uint32_t) b->data[b->byte+3]) << 24);
1038
    // Avoid implementation-defined behaviour on negative values
1039
57
    *val = v < 0x80000000U ? (int32_t) v : -((int32_t) (0xffffffffU - v)) - 1;
1040
57
    BLOCK_SIZE(b) += 4;
1041
57
    return 4;
1042
58
}
1043
1044
/* As int32_decoded/encode, but from/to blocks instead of cram_fd */
1045
2.50k
int int32_put_blk(cram_block *b, int32_t val) {
1046
2.50k
    unsigned char cp[4];
1047
2.50k
    uint32_t v = val;
1048
2.50k
    cp[0] = ( v      & 0xff);
1049
2.50k
    cp[1] = ((v>>8)  & 0xff);
1050
2.50k
    cp[2] = ((v>>16) & 0xff);
1051
2.50k
    cp[3] = ((v>>24) & 0xff);
1052
1053
2.50k
    BLOCK_APPEND(b, cp, 4);
1054
2.50k
    return 0;
1055
1056
0
 block_err:
1057
0
    return -1;
1058
2.50k
}
1059
1060
#ifdef HAVE_LIBDEFLATE
1061
/* ----------------------------------------------------------------------
1062
 * libdeflate compression code, with interface to match
1063
 * zlib_mem_{in,de}flate for simplicity elsewhere.
1064
 */
1065
1066
// Named the same as the version that uses zlib as we always use libdeflate for
1067
// decompression when available.
1068
char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) {
1069
    struct libdeflate_decompressor *z = libdeflate_alloc_decompressor();
1070
    if (!z) {
1071
        hts_log_error("Call to libdeflate_alloc_decompressor failed");
1072
        return NULL;
1073
    }
1074
1075
    uint8_t *data = NULL, *new_data;
1076
    if (!*size)
1077
        *size = csize*2;
1078
    for(;;) {
1079
        new_data = realloc(data, *size);
1080
        if (!new_data) {
1081
            hts_log_error("Memory allocation failure");
1082
            goto fail;
1083
        }
1084
        data = new_data;
1085
1086
        int ret = libdeflate_gzip_decompress(z, cdata, csize, data, *size, size);
1087
1088
        // Auto grow output buffer size if needed and try again.
1089
        // Fortunately for all bar one call of this we know the size already.
1090
        if (ret == LIBDEFLATE_INSUFFICIENT_SPACE) {
1091
            (*size) *= 1.5;
1092
            continue;
1093
        }
1094
1095
        if (ret != LIBDEFLATE_SUCCESS) {
1096
            hts_log_error("Inflate operation failed: %d", ret);
1097
            goto fail;
1098
        } else {
1099
            break;
1100
        }
1101
    }
1102
1103
    libdeflate_free_decompressor(z);
1104
    return (char *)data;
1105
1106
 fail:
1107
    libdeflate_free_decompressor(z);
1108
    free(data);
1109
    return NULL;
1110
}
1111
1112
// Named differently as we use both zlib/libdeflate for compression.
1113
static char *libdeflate_deflate(char *data, size_t size, size_t *cdata_size,
1114
                                int level, int strat) {
1115
    level = level > 0 ? level : 6; // libdeflate doesn't honour -1 as default
1116
    level *= 1.23;     // NB levels go up to 12 here; 5 onwards is +1
1117
    level += level>=8; // 5,6,7->6,7,8  8->10  9->12
1118
    if (level > 12) level = 12;
1119
1120
    if (strat == Z_RLE) // not supported by libdeflate
1121
        level = 1;
1122
1123
    struct libdeflate_compressor *z = libdeflate_alloc_compressor(level);
1124
    if (!z) {
1125
        hts_log_error("Call to libdeflate_alloc_compressor failed");
1126
        return NULL;
1127
    }
1128
1129
    unsigned char *cdata = NULL; /* Compressed output */
1130
    size_t cdata_alloc;
1131
    cdata = malloc(cdata_alloc = size*1.05+100);
1132
    if (!cdata) {
1133
        hts_log_error("Memory allocation failure");
1134
        libdeflate_free_compressor(z);
1135
        return NULL;
1136
    }
1137
1138
    *cdata_size = libdeflate_gzip_compress(z, data, size, cdata, cdata_alloc);
1139
    libdeflate_free_compressor(z);
1140
1141
    if (*cdata_size == 0) {
1142
        hts_log_error("Call to libdeflate_gzip_compress failed");
1143
        free(cdata);
1144
        return NULL;
1145
    }
1146
1147
    return (char *)cdata;
1148
}
1149
1150
#else
1151
1152
/* ----------------------------------------------------------------------
1153
 * zlib compression code - from Gap5's tg_iface_g.c
1154
 * They're static here as they're only used within the cram_compress_block
1155
 * and cram_uncompress_block functions, which are the external interface.
1156
 */
1157
0
char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) {
1158
0
    z_stream s;
1159
0
    unsigned char *data = NULL; /* Uncompressed output */
1160
0
    int data_alloc = 0;
1161
0
    int err;
1162
1163
    /* Starting point at uncompressed size, and scale after that */
1164
0
    data = malloc(data_alloc = csize*1.2+100);
1165
0
    if (!data)
1166
0
        return NULL;
1167
1168
    /* Initialise zlib stream */
1169
0
    s.zalloc = Z_NULL; /* use default allocation functions */
1170
0
    s.zfree  = Z_NULL;
1171
0
    s.opaque = Z_NULL;
1172
0
    s.next_in  = (unsigned char *)cdata;
1173
0
    s.avail_in = csize;
1174
0
    s.total_in = 0;
1175
0
    s.next_out  = data;
1176
0
    s.avail_out = data_alloc;
1177
0
    s.total_out = 0;
1178
1179
    //err = inflateInit(&s);
1180
0
    err = inflateInit2(&s, 15 + 32);
1181
0
    if (err != Z_OK) {
1182
0
        hts_log_error("Call to zlib inflateInit failed: %s", s.msg);
1183
0
        free(data);
1184
0
        return NULL;
1185
0
    }
1186
1187
    /* Decode to 'data' array */
1188
0
    for (;s.avail_in;) {
1189
0
        unsigned char *data_tmp;
1190
0
        int alloc_inc;
1191
1192
0
        s.next_out = &data[s.total_out];
1193
0
        err = inflate(&s, Z_NO_FLUSH);
1194
0
        if (err == Z_STREAM_END)
1195
0
            break;
1196
1197
0
        if (err != Z_OK) {
1198
0
            hts_log_error("Call to zlib inflate failed: %s", s.msg);
1199
0
            free(data);
1200
0
            inflateEnd(&s);
1201
0
            return NULL;
1202
0
        }
1203
1204
        /* More to come, so realloc based on growth so far */
1205
0
        alloc_inc = (double)s.avail_in/s.total_in * s.total_out + 100;
1206
0
        data = realloc((data_tmp = data), data_alloc += alloc_inc);
1207
0
        if (!data) {
1208
0
            free(data_tmp);
1209
0
            inflateEnd(&s);
1210
0
            return NULL;
1211
0
        }
1212
0
        s.avail_out += alloc_inc;
1213
0
    }
1214
0
    inflateEnd(&s);
1215
1216
0
    *size = s.total_out;
1217
0
    return (char *)data;
1218
0
}
1219
#endif
1220
1221
#if !defined(HAVE_LIBDEFLATE) || LIBDEFLATE_VERSION_MAJOR < 1 || (LIBDEFLATE_VERSION_MAJOR ==  1 && LIBDEFLATE_VERSION_MINOR <= 8)
1222
static char *zlib_mem_deflate(char *data, size_t size, size_t *cdata_size,
1223
342k
                              int level, int strat) {
1224
342k
    z_stream s;
1225
342k
    unsigned char *cdata = NULL; /* Compressed output */
1226
342k
    int cdata_alloc = 0;
1227
342k
    int cdata_pos = 0;
1228
342k
    int err;
1229
1230
342k
    cdata = malloc(cdata_alloc = size*1.05+100);
1231
342k
    if (!cdata)
1232
0
        return NULL;
1233
342k
    cdata_pos = 0;
1234
1235
    /* Initialise zlib stream */
1236
342k
    s.zalloc = Z_NULL; /* use default allocation functions */
1237
342k
    s.zfree  = Z_NULL;
1238
342k
    s.opaque = Z_NULL;
1239
342k
    s.next_in  = (unsigned char *)data;
1240
342k
    s.avail_in = size;
1241
342k
    s.total_in = 0;
1242
342k
    s.next_out  = cdata;
1243
342k
    s.avail_out = cdata_alloc;
1244
342k
    s.total_out = 0;
1245
342k
    s.data_type = Z_BINARY;
1246
1247
342k
    err = deflateInit2(&s, level, Z_DEFLATED, 15|16, 9, strat);
1248
342k
    if (err != Z_OK) {
1249
0
        hts_log_error("Call to zlib deflateInit2 failed: %s", s.msg);
1250
0
        return NULL;
1251
0
    }
1252
1253
    /* Encode to 'cdata' array */
1254
684k
    for (;s.avail_in;) {
1255
342k
        s.next_out = &cdata[cdata_pos];
1256
342k
        s.avail_out = cdata_alloc - cdata_pos;
1257
342k
        if (cdata_alloc - cdata_pos <= 0) {
1258
0
            hts_log_error("Deflate produced larger output than expected");
1259
0
            return NULL;
1260
0
        }
1261
342k
        err = deflate(&s, Z_NO_FLUSH);
1262
342k
        cdata_pos = cdata_alloc - s.avail_out;
1263
342k
        if (err != Z_OK) {
1264
0
            hts_log_error("Call to zlib deflate failed: %s", s.msg);
1265
0
            break;
1266
0
        }
1267
342k
    }
1268
342k
    if (deflate(&s, Z_FINISH) != Z_STREAM_END) {
1269
0
        hts_log_error("Call to zlib deflate failed: %s", s.msg);
1270
0
    }
1271
342k
    *cdata_size = s.total_out;
1272
1273
342k
    if (deflateEnd(&s) != Z_OK) {
1274
0
        hts_log_error("Call to zlib deflate failed: %s", s.msg);
1275
0
    }
1276
342k
    return (char *)cdata;
1277
342k
}
1278
#endif
1279
1280
#ifdef HAVE_LIBLZMA
1281
/* ------------------------------------------------------------------------ */
1282
/*
1283
 * Data compression routines using liblzma (xz)
1284
 *
1285
 * On a test set this shrunk the main db from 136157104 bytes to 114796168, but
1286
 * caused tg_index to grow from 2m43.707s to 15m3.961s. Exporting as bfastq
1287
 * went from 18.3s to 36.3s. So decompression suffers too, but not as bad
1288
 * as compression times.
1289
 *
1290
 * For now we disable this functionality. If it's to be reenabled make sure you
1291
 * improve the mem_inflate implementation as it's just a test hack at the
1292
 * moment.
1293
 */
1294
1295
static char *lzma_mem_deflate(char *data, size_t size, size_t *cdata_size,
1296
0
                              int level) {
1297
0
    char *out;
1298
0
    size_t out_size = lzma_stream_buffer_bound(size);
1299
0
    *cdata_size = 0;
1300
1301
0
    out = malloc(out_size);
1302
1303
    /* Single call compression */
1304
0
    if (LZMA_OK != lzma_easy_buffer_encode(level, LZMA_CHECK_CRC32, NULL,
1305
0
                                           (uint8_t *)data, size,
1306
0
                                           (uint8_t *)out, cdata_size,
1307
0
                                           out_size))
1308
0
        return NULL;
1309
1310
0
    return out;
1311
0
}
1312
1313
2
static char *lzma_mem_inflate(char *cdata, size_t csize, size_t *size) {
1314
2
    lzma_stream strm = LZMA_STREAM_INIT;
1315
2
    size_t out_size = 0, out_pos = 0;
1316
2
    char *out = NULL, *new_out;
1317
2
    int r;
1318
1319
    /* Initiate the decoder */
1320
2
    if (LZMA_OK != lzma_stream_decoder(&strm, lzma_easy_decoder_memusage(9), 0))
1321
0
        return NULL;
1322
1323
    /* Decode loop */
1324
2
    strm.avail_in = csize;
1325
2
    strm.next_in = (uint8_t *)cdata;
1326
1327
4
    for (;strm.avail_in;) {
1328
2
        if (strm.avail_in > out_size - out_pos) {
1329
2
            out_size += strm.avail_in * 4 + 32768;
1330
2
            new_out = realloc(out, out_size);
1331
2
            if (!new_out)
1332
0
                goto fail;
1333
2
            out = new_out;
1334
2
        }
1335
2
        strm.avail_out = out_size - out_pos;
1336
2
        strm.next_out = (uint8_t *)&out[out_pos];
1337
1338
2
        r = lzma_code(&strm, LZMA_RUN);
1339
2
        if (LZMA_OK != r && LZMA_STREAM_END != r) {
1340
0
            hts_log_error("LZMA decode failure (error %d)", r);
1341
0
            goto fail;
1342
0
        }
1343
1344
2
        out_pos = strm.total_out;
1345
1346
2
        if (r == LZMA_STREAM_END)
1347
0
            break;
1348
2
    }
1349
1350
    /* finish up any unflushed data; necessary? */
1351
2
    r = lzma_code(&strm, LZMA_FINISH);
1352
2
    if (r != LZMA_OK && r != LZMA_STREAM_END) {
1353
0
        hts_log_error("Call to lzma_code failed with error %d", r);
1354
0
        goto fail;
1355
0
    }
1356
1357
2
    new_out = realloc(out, strm.total_out > 0 ? strm.total_out : 1);
1358
2
    if (new_out)
1359
2
        out = new_out;
1360
2
    *size = strm.total_out;
1361
1362
2
    lzma_end(&strm);
1363
1364
2
    return out;
1365
1366
0
 fail:
1367
0
    lzma_end(&strm);
1368
0
    free(out);
1369
0
    return NULL;
1370
2
}
1371
#endif
1372
1373
/* ----------------------------------------------------------------------
1374
 * CRAM blocks - the dynamically growable data block. We have code to
1375
 * create, update, (un)compress and read/write.
1376
 *
1377
 * These are derived from the deflate_interlaced.c blocks, but with the
1378
 * CRAM extension of content types and IDs.
1379
 */
1380
1381
/*
1382
 * Allocates a new cram_block structure with a specified content_type and
1383
 * id.
1384
 *
1385
 * Returns block pointer on success
1386
 *         NULL on failure
1387
 */
1388
cram_block *cram_new_block(enum cram_content_type content_type,
1389
1.40M
                           int content_id) {
1390
1.40M
    cram_block *b = malloc(sizeof(*b));
1391
1.40M
    if (!b)
1392
0
        return NULL;
1393
1.40M
    b->method = b->orig_method = RAW;
1394
1.40M
    b->content_type = content_type;
1395
1.40M
    b->content_id = content_id;
1396
1.40M
    b->comp_size = 0;
1397
1.40M
    b->uncomp_size = 0;
1398
1.40M
    b->data = NULL;
1399
1.40M
    b->alloc = 0;
1400
1.40M
    b->byte = 0;
1401
1.40M
    b->bit = 7; // MSB
1402
1.40M
    b->crc32 = 0;
1403
1.40M
    b->idx = 0;
1404
1.40M
    b->m = NULL;
1405
1406
1.40M
    return b;
1407
1.40M
}
1408
1409
/*
1410
 * Reads a block from a cram file.
1411
 * Returns cram_block pointer on success.
1412
 *         NULL on failure
1413
 */
1414
4.74k
cram_block *cram_read_block(cram_fd *fd) {
1415
4.74k
    cram_block *b = malloc(sizeof(*b));
1416
4.74k
    unsigned char c;
1417
4.74k
    uint32_t crc = 0;
1418
4.74k
    if (!b)
1419
0
        return NULL;
1420
1421
    //fprintf(stderr, "Block at %d\n", (int)ftell(fd->fp));
1422
1423
4.74k
    if (-1 == (b->method      = hgetc(fd->fp))) { free(b); return NULL; }
1424
4.74k
    c = b->method; crc = crc32(crc, &c, 1);
1425
4.74k
    if (-1 == (b->content_type= hgetc(fd->fp))) { free(b); return NULL; }
1426
4.74k
    c = b->content_type; crc = crc32(crc, &c, 1);
1427
4.74k
    if (-1 == fd->vv.varint_decode32_crc(fd, &b->content_id, &crc))  { free(b); return NULL; }
1428
4.74k
    if (-1 == fd->vv.varint_decode32_crc(fd, &b->comp_size, &crc))   { free(b); return NULL; }
1429
4.74k
    if (-1 == fd->vv.varint_decode32_crc(fd, &b->uncomp_size, &crc)) { free(b); return NULL; }
1430
1431
    //fprintf(stderr, "  method %d, ctype %d, cid %d, csize %d, ucsize %d\n",
1432
    //      b->method, b->content_type, b->content_id, b->comp_size, b->uncomp_size);
1433
1434
4.74k
    if (b->method == RAW) {
1435
3.21k
        if (b->uncomp_size < 0 || b->comp_size != b->uncomp_size) {
1436
23
            free(b);
1437
23
            return NULL;
1438
23
        }
1439
3.19k
        b->alloc = b->uncomp_size;
1440
3.19k
        if (!(b->data = malloc(b->uncomp_size))){ free(b); return NULL; }
1441
3.19k
        if (b->uncomp_size != hread(fd->fp, b->data, b->uncomp_size)) {
1442
8
            free(b->data);
1443
8
            free(b);
1444
8
            return NULL;
1445
8
        }
1446
3.19k
    } else {
1447
1.52k
        if (b->comp_size < 0 || b->uncomp_size < 0) {
1448
10
            free(b);
1449
10
            return NULL;
1450
10
        }
1451
1.51k
        b->alloc = b->comp_size;
1452
1.51k
        if (!(b->data = malloc(b->comp_size)))  { free(b); return NULL; }
1453
1.51k
        if (b->comp_size != hread(fd->fp, b->data, b->comp_size)) {
1454
0
            free(b->data);
1455
0
            free(b);
1456
0
            return NULL;
1457
0
        }
1458
1.51k
    }
1459
1460
4.70k
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
1461
709
        if (-1 == int32_decode(fd, (int32_t *)&b->crc32)) {
1462
0
            free(b->data);
1463
0
            free(b);
1464
0
            return NULL;
1465
0
        }
1466
1467
709
        b->crc32_checked = fd->ignore_md5;
1468
709
        b->crc_part = crc;
1469
3.99k
    } else {
1470
3.99k
        b->crc32_checked = 1; // CRC not present
1471
3.99k
    }
1472
1473
4.70k
    b->orig_method = b->method;
1474
4.70k
    b->idx = 0;
1475
4.70k
    b->byte = 0;
1476
4.70k
    b->bit = 7; // MSB
1477
1478
4.70k
    return b;
1479
4.70k
}
1480
1481
1482
/*
1483
 * Computes the size of a cram block, including the block
1484
 * header itself.
1485
 */
1486
0
uint32_t cram_block_size(cram_block *b) {
1487
0
    unsigned char dat[100], *cp = dat;;
1488
0
    uint32_t sz;
1489
1490
0
    *cp++ = b->method;
1491
0
    *cp++ = b->content_type;
1492
0
    cp += itf8_put((char*)cp, b->content_id);
1493
0
    cp += itf8_put((char*)cp, b->comp_size);
1494
0
    cp += itf8_put((char*)cp, b->uncomp_size);
1495
1496
0
    sz = cp-dat + 4;
1497
0
    sz += b->method == RAW ? b->uncomp_size : b->comp_size;
1498
1499
0
    return sz;
1500
0
}
1501
1502
/*
1503
 * Writes a CRAM block.
1504
 * Returns 0 on success
1505
 *        -1 on failure
1506
 */
1507
516k
int cram_write_block(cram_fd *fd, cram_block *b) {
1508
516k
    char vardata[100];
1509
516k
    int vardata_o = 0;
1510
1511
516k
    assert(b->method != RAW || (b->comp_size == b->uncomp_size));
1512
1513
516k
    if (hputc(b->method,       fd->fp)  == EOF) return -1;
1514
516k
    if (hputc(b->content_type, fd->fp)  == EOF) return -1;
1515
516k
    vardata_o += fd->vv.varint_put32(vardata          , vardata+100, b->content_id);
1516
516k
    vardata_o += fd->vv.varint_put32(vardata+vardata_o, vardata+100, b->comp_size);
1517
516k
    vardata_o += fd->vv.varint_put32(vardata+vardata_o, vardata+100, b->uncomp_size);
1518
516k
    if (vardata_o != hwrite(fd->fp, vardata, vardata_o))
1519
0
        return -1;
1520
1521
516k
    if (b->data) {
1522
473k
        if (b->method == RAW) {
1523
403k
            if (b->uncomp_size != hwrite(fd->fp, b->data, b->uncomp_size))
1524
0
                return -1;
1525
403k
        } else {
1526
69.6k
            if (b->comp_size != hwrite(fd->fp, b->data, b->comp_size))
1527
0
                return -1;
1528
69.6k
        }
1529
473k
    } else {
1530
        // Absent blocks should be size 0
1531
42.8k
        assert(b->method == RAW && b->uncomp_size == 0);
1532
42.8k
    }
1533
1534
516k
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
1535
516k
        char dat[100], *cp = (char *)dat;
1536
516k
        uint32_t crc;
1537
1538
516k
        *cp++ = b->method;
1539
516k
        *cp++ = b->content_type;
1540
516k
        cp += fd->vv.varint_put32(cp, dat+100, b->content_id);
1541
516k
        cp += fd->vv.varint_put32(cp, dat+100, b->comp_size);
1542
516k
        cp += fd->vv.varint_put32(cp, dat+100, b->uncomp_size);
1543
516k
        crc = crc32(0L, (uc *)dat, cp-dat);
1544
1545
516k
        if (b->method == RAW) {
1546
446k
            b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->uncomp_size);
1547
446k
        } else {
1548
69.6k
            b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->comp_size);
1549
69.6k
        }
1550
1551
516k
        if (-1 == int32_encode(fd, b->crc32))
1552
0
            return -1;
1553
516k
    }
1554
1555
516k
    return 0;
1556
516k
}
1557
1558
/*
1559
 * Frees a CRAM block, deallocating internal data too.
1560
 */
1561
2.15M
void cram_free_block(cram_block *b) {
1562
2.15M
    if (!b)
1563
744k
        return;
1564
1.41M
    if (b->data)
1565
1.02M
        free(b->data);
1566
1.41M
    free(b);
1567
1.41M
}
1568
1569
/*
1570
 * Uncompresses a CRAM block, if compressed.
1571
 */
1572
3.26k
int cram_uncompress_block(cram_block *b) {
1573
3.26k
    char *uncomp;
1574
3.26k
    size_t uncomp_size = 0;
1575
1576
3.26k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
1577
    // Pretend the CRC was OK so the fuzzer doesn't have to get it right
1578
3.26k
    b->crc32_checked = 1;
1579
3.26k
#endif
1580
1581
3.26k
    if (b->crc32_checked == 0) {
1582
0
        uint32_t crc = crc32(b->crc_part, b->data ? b->data : (uc *)"", b->alloc);
1583
0
        b->crc32_checked = 1;
1584
0
        if (crc != b->crc32) {
1585
0
            hts_log_error("Block CRC32 failure");
1586
0
            return -1;
1587
0
        }
1588
0
    }
1589
1590
3.26k
    if (b->uncomp_size == 0) {
1591
        // blank block
1592
2.67k
        b->method = RAW;
1593
2.67k
        return 0;
1594
2.67k
    }
1595
3.26k
    assert(b->uncomp_size >= 0); // cram_read_block should ensure this
1596
1597
587
    switch (b->method) {
1598
98
    case RAW:
1599
98
        return 0;
1600
1601
0
    case GZIP:
1602
0
        uncomp_size = b->uncomp_size;
1603
0
        uncomp = zlib_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
1604
1605
0
        if (!uncomp)
1606
0
            return -1;
1607
0
        if (uncomp_size != b->uncomp_size) {
1608
0
            free(uncomp);
1609
0
            return -1;
1610
0
        }
1611
0
        free(b->data);
1612
0
        b->data = (unsigned char *)uncomp;
1613
0
        b->alloc = uncomp_size;
1614
0
        b->method = RAW;
1615
0
        break;
1616
1617
0
#ifdef HAVE_LIBBZ2
1618
0
    case BZIP2: {
1619
0
        unsigned int usize = b->uncomp_size;
1620
0
        if (!(uncomp = malloc(usize)))
1621
0
            return -1;
1622
0
        if (BZ_OK != BZ2_bzBuffToBuffDecompress(uncomp, &usize,
1623
0
                                                (char *)b->data, b->comp_size,
1624
0
                                                0, 0)) {
1625
0
            free(uncomp);
1626
0
            return -1;
1627
0
        }
1628
0
        free(b->data);
1629
0
        b->data = (unsigned char *)uncomp;
1630
0
        b->alloc = usize;
1631
0
        b->method = RAW;
1632
0
        b->uncomp_size = usize; // Just in case it differs
1633
0
        break;
1634
0
    }
1635
#else
1636
    case BZIP2:
1637
        hts_log_error("Bzip2 compression is not compiled into this version. Please rebuild and try again");
1638
        return -1;
1639
#endif
1640
1641
0
#ifdef HAVE_LIBLZMA
1642
2
    case LZMA:
1643
2
        uncomp = lzma_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
1644
2
        if (!uncomp)
1645
0
            return -1;
1646
2
        if (uncomp_size != b->uncomp_size) {
1647
2
            free(uncomp);
1648
2
            return -1;
1649
2
        }
1650
0
        free(b->data);
1651
0
        b->data = (unsigned char *)uncomp;
1652
0
        b->alloc = uncomp_size;
1653
0
        b->method = RAW;
1654
0
        break;
1655
#else
1656
    case LZMA:
1657
        hts_log_error("Lzma compression is not compiled into this version. Please rebuild and try again");
1658
        return -1;
1659
        break;
1660
#endif
1661
1662
3
    case RANS: {
1663
3
        unsigned int usize = b->uncomp_size, usize2;
1664
3
        uncomp = (char *)rans_uncompress(b->data, b->comp_size, &usize2);
1665
3
        if (!uncomp)
1666
3
            return -1;
1667
0
        if (usize != usize2) {
1668
0
            free(uncomp);
1669
0
            return -1;
1670
0
        }
1671
0
        free(b->data);
1672
0
        b->data = (unsigned char *)uncomp;
1673
0
        b->alloc = usize2;
1674
0
        b->method = RAW;
1675
0
        b->uncomp_size = usize2; // Just in case it differs
1676
        //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1677
0
        break;
1678
0
    }
1679
1680
109
    case FQZ: {
1681
109
        uncomp_size = b->uncomp_size;
1682
109
        uncomp = fqz_decompress((char *)b->data, b->comp_size, &uncomp_size, NULL, 0);
1683
109
        if (!uncomp)
1684
76
            return -1;
1685
33
        free(b->data);
1686
33
        b->data = (unsigned char *)uncomp;
1687
33
        b->alloc = uncomp_size;
1688
33
        b->method = RAW;
1689
33
        b->uncomp_size = uncomp_size;
1690
33
        break;
1691
109
    }
1692
1693
21
    case RANS_PR0: {
1694
21
        unsigned int usize = b->uncomp_size, usize2;
1695
21
        uncomp = (char *)rans_uncompress_4x16(b->data, b->comp_size, &usize2);
1696
21
        if (!uncomp)
1697
15
            return -1;
1698
6
        if (usize != usize2) {
1699
6
            free(uncomp);
1700
6
            return -1;
1701
6
        }
1702
0
        b->orig_method = RANSPR;
1703
0
        free(b->data);
1704
0
        b->data = (unsigned char *)uncomp;
1705
0
        b->alloc = usize2;
1706
0
        b->method = RAW;
1707
0
        b->uncomp_size = usize2; // Just incase it differs
1708
        //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1709
0
        break;
1710
6
    }
1711
1712
4
    case ARITH_PR0: {
1713
4
        unsigned int usize = b->uncomp_size, usize2;
1714
4
        uncomp = (char *)arith_uncompress_to(b->data, b->comp_size, NULL, &usize2);
1715
4
        if (!uncomp)
1716
2
            return -1;
1717
2
        if (usize != usize2) {
1718
2
            free(uncomp);
1719
2
            return -1;
1720
2
        }
1721
0
        b->orig_method = ARITH;
1722
0
        free(b->data);
1723
0
        b->data = (unsigned char *)uncomp;
1724
0
        b->alloc = usize2;
1725
0
        b->method = RAW;
1726
0
        b->uncomp_size = usize2; // Just incase it differs
1727
        //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1728
0
        break;
1729
2
    }
1730
1731
345
    case TOK3: {
1732
345
        uint32_t out_len;
1733
345
        uint8_t *cp = tok3_decode_names(b->data, b->comp_size, &out_len);
1734
345
        if (!cp)
1735
338
            return -1;
1736
7
        b->orig_method = TOK3;
1737
7
        b->method = RAW;
1738
7
        free(b->data);
1739
7
        b->data = cp;
1740
7
        b->alloc = out_len;
1741
7
        b->uncomp_size = out_len;
1742
7
        break;
1743
345
    }
1744
1745
5
    default:
1746
5
        return -1;
1747
587
    }
1748
1749
40
    return 0;
1750
587
}
1751
1752
static char *cram_compress_by_method(cram_slice *s, char *in, size_t in_size,
1753
                                     int content_id, size_t *out_size,
1754
                                     enum cram_block_method_int method,
1755
1.05M
                                     int level, int strat) {
1756
1.05M
    switch (method) {
1757
102k
    case GZIP:
1758
175k
    case GZIP_RLE:
1759
342k
    case GZIP_1:
1760
        // Read names bizarrely benefit from zlib over libdeflate for
1761
        // mid-range compression levels.  Focusing purely of ratio or
1762
        // speed, libdeflate still wins.  It also seems to win for
1763
        // other data series too.
1764
        //
1765
        // Eg RN at level 5;  libdeflate=55.9MB  zlib=51.6MB
1766
#ifdef HAVE_LIBDEFLATE
1767
#  if (LIBDEFLATE_VERSION_MAJOR < 1 || (LIBDEFLATE_VERSION_MAJOR == 1 && LIBDEFLATE_VERSION_MINOR <= 8))
1768
        if (content_id == DS_RN && level >= 4 && level <= 7)
1769
            return zlib_mem_deflate(in, in_size, out_size, level, strat);
1770
        else
1771
#  endif
1772
            return libdeflate_deflate(in, in_size, out_size, level, strat);
1773
#else
1774
342k
        return zlib_mem_deflate(in, in_size, out_size, level, strat);
1775
0
#endif
1776
1777
0
    case BZIP2: {
1778
0
#ifdef HAVE_LIBBZ2
1779
0
        unsigned int comp_size = in_size*1.01 + 600;
1780
0
        char *comp = malloc(comp_size);
1781
0
        if (!comp)
1782
0
            return NULL;
1783
1784
0
        if (BZ_OK != BZ2_bzBuffToBuffCompress(comp, &comp_size,
1785
0
                                              in, in_size,
1786
0
                                              level, 0, 30)) {
1787
0
            free(comp);
1788
0
            return NULL;
1789
0
        }
1790
0
        *out_size = comp_size;
1791
0
        return comp;
1792
#else
1793
        return NULL;
1794
#endif
1795
0
    }
1796
1797
0
    case FQZ:
1798
0
    case FQZ_b:
1799
0
    case FQZ_c:
1800
0
    case FQZ_d: {
1801
        // Extract the necessary portion of the slice into an fqz_slice struct.
1802
        // These previously were the same thing, but this permits us to detach
1803
        // the codec from the rest of this CRAM implementation.
1804
0
        fqz_slice *f = malloc(2*s->hdr->num_records * sizeof(uint32_t) + sizeof(fqz_slice));
1805
0
        if (!f)
1806
0
            return NULL;
1807
0
        f->num_records = s->hdr->num_records;
1808
0
        f->len = (uint32_t *)(((char *)f) + sizeof(fqz_slice));
1809
0
        f->flags = f->len + s->hdr->num_records;
1810
0
        int i;
1811
0
        for (i = 0; i < s->hdr->num_records; i++) {
1812
0
            f->flags[i] = s->crecs[i].flags;
1813
0
            f->len[i] = (i+1 < s->hdr->num_records
1814
0
                         ? s->crecs[i+1].qual - s->crecs[i].qual
1815
0
                         : s->block[DS_QS]->uncomp_size - s->crecs[i].qual);
1816
0
        }
1817
0
        char *comp = fqz_compress(strat & 0xff /* cram vers */, f,
1818
0
                                  in, in_size, out_size, strat >> 8, NULL);
1819
0
        free(f);
1820
0
        return comp;
1821
0
    }
1822
1823
0
    case LZMA:
1824
0
#ifdef HAVE_LIBLZMA
1825
0
        return lzma_mem_deflate(in, in_size, out_size, level);
1826
#else
1827
        return NULL;
1828
#endif
1829
1830
0
    case RANS0:
1831
0
    case RANS1: {
1832
0
        unsigned int out_size_i;
1833
0
        unsigned char *cp;
1834
0
        cp = rans_compress((unsigned char *)in, in_size, &out_size_i,
1835
0
                           method == RANS0 ? 0 : 1);
1836
0
        *out_size = out_size_i;
1837
0
        return (char *)cp;
1838
0
    }
1839
1840
264k
    case RANS_PR0:
1841
337k
    case RANS_PR1:
1842
439k
    case RANS_PR64:
1843
511k
    case RANS_PR9:
1844
617k
    case RANS_PR128:
1845
617k
    case RANS_PR129:
1846
617k
    case RANS_PR192:
1847
689k
    case RANS_PR193: {
1848
689k
        unsigned int out_size_i;
1849
689k
        unsigned char *cp;
1850
1851
        // see enum cram_block. We map RANS_* methods to order bit-fields
1852
689k
        static int methmap[] = { 1, 64,9, 128,129, 192,193 };
1853
1854
689k
        int m = method == RANS_PR0 ? 0 : methmap[method - RANS_PR1];
1855
689k
        cp = rans_compress_4x16((unsigned char *)in, in_size, &out_size_i,
1856
689k
                                m | RANS_ORDER_SIMD_AUTO);
1857
689k
        *out_size = out_size_i;
1858
689k
        return (char *)cp;
1859
617k
    }
1860
1861
0
    case ARITH_PR0:
1862
0
    case ARITH_PR1:
1863
0
    case ARITH_PR64:
1864
0
    case ARITH_PR9:
1865
0
    case ARITH_PR128:
1866
0
    case ARITH_PR129:
1867
0
    case ARITH_PR192:
1868
0
    case ARITH_PR193: {
1869
0
        unsigned int out_size_i;
1870
0
        unsigned char *cp;
1871
1872
        // see enum cram_block. We map ARITH_* methods to order bit-fields
1873
0
        static int methmap[] = { 1, 64,9, 128,129, 192,193 };
1874
1875
0
        cp = arith_compress_to((unsigned char *)in, in_size, NULL, &out_size_i,
1876
0
                               method == ARITH_PR0 ? 0 : methmap[method - ARITH_PR1]);
1877
0
        *out_size = out_size_i;
1878
0
        return (char *)cp;
1879
0
    }
1880
1881
22.6k
    case TOK3:
1882
22.6k
    case TOKA: {
1883
22.6k
        int out_len;
1884
22.6k
        int lev = level;
1885
22.6k
        if (method == TOK3 && lev > 3)
1886
22.6k
            lev = 3;
1887
22.6k
        uint8_t *cp = tok3_encode_names(in, in_size, lev, strat, &out_len, NULL);
1888
22.6k
        *out_size = out_len;
1889
22.6k
        return (char *)cp;
1890
22.6k
    }
1891
1892
0
    case RAW:
1893
0
        break;
1894
1895
0
    default:
1896
0
        return NULL;
1897
1.05M
    }
1898
1899
0
    return NULL;
1900
1.05M
}
1901
1902
/*
1903
 * A copy of cram_compress_block2 with added recursion detection.
1904
 * This is only called for error handling where the auto-tuning has failed.
1905
 * The simplest way of doing this is recusion + an additional argument, but
1906
 * we didn't want to complicate the existing code hence this is static.
1907
 */
1908
static int cram_compress_block3(cram_fd *fd, cram_slice *s,
1909
                                cram_block *b, cram_metrics *metrics,
1910
                                int method, int level,
1911
805k
                                int recurse) {
1912
1913
805k
    if (!b)
1914
43.3k
        return 0;
1915
1916
762k
    int orig_method = method;
1917
762k
    char *comp = NULL;
1918
762k
    size_t comp_size = 0;
1919
762k
    int strat;
1920
1921
    // Internally we have parameterised methods that externally map
1922
    // to the same CRAM method value.
1923
    // See enum_cram_block_method_int in cram_structs.h.
1924
762k
    int methmap[] = {
1925
        // Externally defined values
1926
762k
        RAW, GZIP, BZIP2, LZMA, RANS, RANSPR, ARITH, FQZ, TOK3,
1927
1928
        // Reserved for possible expansion
1929
762k
        0, 0,
1930
1931
        // Internally parameterised versions matching back to above
1932
        // external values
1933
762k
        GZIP, GZIP,
1934
762k
        FQZ, FQZ, FQZ,
1935
762k
        RANS,
1936
762k
        RANSPR, RANSPR, RANSPR, RANSPR, RANSPR, RANSPR, RANSPR,
1937
762k
        TOK3,
1938
762k
        ARITH,  ARITH,  ARITH,  ARITH,  ARITH,  ARITH,  ARITH,
1939
762k
    };
1940
1941
762k
    if (b->method != RAW) {
1942
        // Maybe already compressed if s->block[0] was compressed and
1943
        // we have e.g. s->block[DS_BA] set to s->block[0] due to only
1944
        // one base type present and hence using E_HUFFMAN on block 0.
1945
        // A second explicit attempt to compress the same block then
1946
        // occurs.
1947
0
        return 0;
1948
0
    }
1949
1950
762k
    if (method == -1) {
1951
2.50k
        method = 1<<GZIP;
1952
2.50k
        if (fd->use_bz2)
1953
0
            method |= 1<<BZIP2;
1954
2.50k
        if (fd->use_lzma)
1955
0
            method |= 1<<LZMA;
1956
2.50k
    }
1957
1958
762k
    if (level == -1)
1959
2.50k
        level = fd->level;
1960
1961
    //fprintf(stderr, "IN: block %d, sz %d\n", b->content_id, b->uncomp_size);
1962
1963
762k
    if (method == RAW || level == 0 || b->uncomp_size == 0) {
1964
331k
        b->method = RAW;
1965
331k
        b->comp_size = b->uncomp_size;
1966
        //fprintf(stderr, "Skip block id %d\n", b->content_id);
1967
331k
        return 0;
1968
331k
    }
1969
1970
430k
#ifndef ABS
1971
430k
#    define ABS(a) ((a)>=0?(a):-(a))
1972
430k
#endif
1973
1974
430k
    if (metrics) {
1975
422k
        pthread_mutex_lock(&fd->metrics_lock);
1976
        // Sudden changes in size trigger a retrial.  These are mainly
1977
        // triggered when switching to sorted / unsorted, where the number
1978
        // of elements in a slice radically changes.
1979
        //
1980
        // We also get large fluctuations based on genome coordinate for
1981
        // e.g. SA:Z and SC series, but we consider the typical scale of
1982
        // delta between blocks and use this to look for abnormality.
1983
1984
        // Equivalent to (but minus possible integer overflow)
1985
        //   (b->uncomp_size + 1000)/4 > metrics->input_avg_sz+1000 ||
1986
        //    b->uncomp_size + 1000    < (metrics->input_avg_sz+1000)/4)
1987
422k
        if (metrics->input_avg_sz &&
1988
144k
            (b->uncomp_size/4 - 750 > metrics->input_avg_sz ||
1989
143k
             b->uncomp_size         < metrics->input_avg_sz/4 - 750) &&
1990
1.47k
            ABS(b->uncomp_size-metrics->input_avg_sz)/10
1991
1.47k
                > metrics->input_avg_delta) {
1992
133
            metrics->next_trial = 0;
1993
133
        }
1994
1995
422k
        if (metrics->trial > 0 || --metrics->next_trial <= 0) {
1996
95.2k
            int m, unpackable = metrics->unpackable;
1997
95.2k
            size_t sz_best = b->uncomp_size;
1998
95.2k
            size_t sz[CRAM_MAX_METHOD] = {0};
1999
95.2k
            int method_best = 0; // RAW
2000
95.2k
            char *c_best = NULL, *c = NULL;
2001
2002
95.2k
            metrics->input_avg_delta =
2003
95.2k
                0.9 * (metrics->input_avg_delta +
2004
95.2k
                       ABS(b->uncomp_size - metrics->input_avg_sz));
2005
2006
95.2k
            metrics->input_avg_sz += b->uncomp_size*.2;
2007
95.2k
            metrics->input_avg_sz *= 0.8;
2008
2009
95.2k
            if (metrics->revised_method)
2010
46.6k
                method = metrics->revised_method;
2011
48.5k
            else
2012
48.5k
                metrics->revised_method = method;
2013
2014
95.2k
            if (metrics->next_trial <= 0) {
2015
3.36k
                metrics->next_trial = TRIAL_SPAN;
2016
3.36k
                metrics->trial = NTRIALS;
2017
111k
                for (m = 0; m < CRAM_MAX_METHOD; m++)
2018
107k
                    metrics->sz[m] /= 2;
2019
3.36k
                metrics->unpackable = 0;
2020
3.36k
            }
2021
2022
            // Compress this block using the best method
2023
95.2k
            if (unpackable && CRAM_MAJOR_VERS(fd->version) > 3) {
2024
                // No point trying bit-pack if 17+ symbols.
2025
0
                if (method & (1<<RANS_PR128))
2026
0
                    method = (method|(1<<RANS_PR0))&~(1<<RANS_PR128);
2027
0
                if (method & (1<<RANS_PR129))
2028
0
                    method = (method|(1<<RANS_PR1))&~(1<<RANS_PR129);
2029
0
                if (method & (1<<RANS_PR192))
2030
0
                    method = (method|(1<<RANS_PR64))&~(1<<RANS_PR192);
2031
0
                if (method & (1<<RANS_PR193))
2032
0
                    method = (method|(1<<RANS_PR64)|(1<<RANS_PR1))&~(1<<RANS_PR193);
2033
2034
0
                if (method & (1<<ARITH_PR128))
2035
0
                    method = (method|(1<<ARITH_PR0))&~(1<<ARITH_PR128);
2036
0
                if (method & (1<<ARITH_PR129))
2037
0
                    method = (method|(1<<ARITH_PR1))&~(1<<ARITH_PR129);
2038
0
                if (method & (1<<ARITH_PR192))
2039
0
                    method = (method|(1<<ARITH_PR64))&~(1<<ARITH_PR192);
2040
0
                if (method & (1u<<ARITH_PR193))
2041
0
                    method = (method|(1<<ARITH_PR64)|(1<<ARITH_PR1))&~(1u<<ARITH_PR193);
2042
0
            }
2043
2044
            // Libdeflate doesn't have a Z_RLE strategy.
2045
            // We treat it as level 1, but iff we haven't also
2046
            // explicitly listed that in the method list.
2047
#ifdef HAVE_LIBDEFLATE
2048
            if ((method & (1<<GZIP_RLE)) && (method & (1<<GZIP_1)))
2049
                method &= ~(1<<GZIP_RLE);
2050
#endif
2051
2052
95.2k
            pthread_mutex_unlock(&fd->metrics_lock);
2053
2054
3.14M
            for (m = 0; m < CRAM_MAX_METHOD; m++) {
2055
3.04M
                if (method & (1u<<m)) {
2056
719k
                    int lvl = level;
2057
719k
                    switch (m) {
2058
94.6k
                    case GZIP:     strat = Z_FILTERED; break;
2059
95.1k
                    case GZIP_1:   strat = Z_DEFAULT_STRATEGY; lvl = 1; break;
2060
72.4k
                    case GZIP_RLE: strat = Z_RLE; break;
2061
0
                    case FQZ:      strat = CRAM_MAJOR_VERS(fd->version); break;
2062
0
                    case FQZ_b:    strat = CRAM_MAJOR_VERS(fd->version)+256; break;
2063
0
                    case FQZ_c:    strat = CRAM_MAJOR_VERS(fd->version)+2*256; break;
2064
0
                    case FQZ_d:    strat = CRAM_MAJOR_VERS(fd->version)+3*256; break;
2065
22.4k
                    case TOK3:     strat = 0; break;
2066
0
                    case TOKA:     strat = 1; break;
2067
434k
                    default:       strat = 0;
2068
719k
                    }
2069
2070
719k
                    c = cram_compress_by_method(s, (char *)b->data, b->uncomp_size,
2071
719k
                                                b->content_id, &sz[m], m, lvl, strat);
2072
2073
719k
                    if (c && sz_best > sz[m]) {
2074
53.9k
                        sz_best = sz[m];
2075
53.9k
                        method_best = m;
2076
53.9k
                        if (c_best)
2077
26.9k
                            free(c_best);
2078
53.9k
                        c_best = c;
2079
665k
                    } else if (c) {
2080
664k
                        free(c);
2081
664k
                    } else {
2082
1.16k
                        sz[m] = UINT_MAX; // arbitrarily worse than raw
2083
1.16k
                    }
2084
2.32M
                } else {
2085
2.32M
                    sz[m] = UINT_MAX; // arbitrarily worse than raw
2086
2.32M
                }
2087
3.04M
            }
2088
2089
95.2k
            if (c_best) {
2090
27.0k
                free(b->data);
2091
27.0k
                b->data = (unsigned char *)c_best;
2092
27.0k
                b->method = method_best; // adjusted to methmap[method_best] later
2093
27.0k
                b->comp_size = sz_best;
2094
27.0k
            }
2095
2096
            // Accumulate stats for all methods tried
2097
95.2k
            pthread_mutex_lock(&fd->metrics_lock);
2098
3.14M
            for (m = 0; m < CRAM_MAX_METHOD; m++)
2099
                // don't be overly sure on small blocks.
2100
                // +2000 means eg bzip2 vs gzip (1.07 to 1.04) or gz vs rans1
2101
                // needs to be at least 60 bytes smaller to overcome the
2102
                // fixed size addition.
2103
3.04M
                metrics->sz[m] += sz[m]+2000;
2104
2105
            // When enough trials performed, find the best on average
2106
95.2k
            if (--metrics->trial == 0) {
2107
23.2k
                int best_method = RAW;
2108
23.2k
                int best_sz = INT_MAX;
2109
2110
                // Relative costs of methods. See enum_cram_block_method_int
2111
                // and methmap
2112
23.2k
                double meth_cost[32] = {
2113
                    // Externally defined methods
2114
23.2k
                    1,    // 0  raw
2115
23.2k
                    1.04, // 1  gzip (Z_FILTERED)
2116
23.2k
                    1.07, // 2  bzip2
2117
23.2k
                    1.08, // 3  lzma
2118
23.2k
                    1.00, // 4  rans    (O0)
2119
23.2k
                    1.00, // 5  ranspr  (O0)
2120
23.2k
                    1.04, // 6  arithpr (O0)
2121
23.2k
                    1.05, // 7  fqz
2122
23.2k
                    1.05, // 8  tok3 (rans)
2123
23.2k
                    1.00, 1.00, // 9,10 reserved
2124
2125
                    // Paramterised versions of above
2126
23.2k
                    1.01, // gzip rle
2127
23.2k
                    1.01, // gzip -1
2128
2129
23.2k
                    1.05, 1.05, 1.05, // FQZ_b,c,d
2130
2131
23.2k
                    1.01, // rans O1
2132
2133
23.2k
                    1.01, // rans_pr1
2134
23.2k
                    1.00, // rans_pr64; if smaller, usually fast
2135
23.2k
                    1.03, // rans_pr65/9
2136
23.2k
                    1.00, // rans_pr128
2137
23.2k
                    1.01, // rans_pr129
2138
23.2k
                    1.00, // rans_pr192
2139
23.2k
                    1.01, // rans_pr193
2140
2141
23.2k
                    1.07, // tok3 arith
2142
2143
23.2k
                    1.04, // arith_pr1
2144
23.2k
                    1.04, // arith_pr64
2145
23.2k
                    1.04, // arith_pr9
2146
23.2k
                    1.03, // arith_pr128
2147
23.2k
                    1.04, // arith_pr129
2148
23.2k
                    1.04, // arith_pr192
2149
23.2k
                    1.04, // arith_pr193
2150
23.2k
                };
2151
2152
                // Scale methods by cost based on compression level
2153
23.2k
                if (fd->level <= 1) {
2154
0
                    for (m = 0; m < CRAM_MAX_METHOD; m++)
2155
0
                        metrics->sz[m] *= 1+(meth_cost[m]-1)*4;
2156
23.2k
                } else if (fd->level <= 3) {
2157
0
                    for (m = 0; m < CRAM_MAX_METHOD; m++)
2158
0
                        metrics->sz[m] *= 1+(meth_cost[m]-1);
2159
23.2k
                } else if (fd->level <= 6) {
2160
765k
                    for (m = 0; m < CRAM_MAX_METHOD; m++)
2161
742k
                        metrics->sz[m] *= 1+(meth_cost[m]-1)/2;
2162
23.2k
                } else if (fd->level <= 7) {
2163
0
                    for (m = 0; m < CRAM_MAX_METHOD; m++)
2164
0
                        metrics->sz[m] *= 1+(meth_cost[m]-1)/3;
2165
0
                } // else cost is ignored
2166
2167
                // Ensure these are never used; BSC and ZSTD
2168
23.2k
                metrics->sz[9] = metrics->sz[10] = INT_MAX;
2169
2170
765k
                for (m = 0; m < CRAM_MAX_METHOD; m++) {
2171
742k
                    if ((!metrics->sz[m]) || (!(method & (1u<<m))))
2172
580k
                        continue;
2173
2174
162k
                    if (best_sz > metrics->sz[m])
2175
54.7k
                        best_sz = metrics->sz[m], best_method = m;
2176
162k
                }
2177
2178
23.2k
                if (best_method != metrics->method) {
2179
                    //metrics->trial = (NTRIALS+1)/2; // be sure
2180
                    //metrics->next_trial /= 1.5;
2181
10.0k
                    metrics->consistency = 0;
2182
13.1k
                } else {
2183
13.1k
                    metrics->next_trial *= MIN(2, 1+metrics->consistency/4.0);
2184
13.1k
                    metrics->consistency++;
2185
13.1k
                }
2186
2187
23.2k
                metrics->method = best_method;
2188
23.2k
                switch (best_method) {
2189
125
                case GZIP:     strat = Z_FILTERED; break;
2190
7.70k
                case GZIP_1:   strat = Z_DEFAULT_STRATEGY; break;
2191
1
                case GZIP_RLE: strat = Z_RLE; break;
2192
0
                case FQZ:      strat = CRAM_MAJOR_VERS(fd->version); break;
2193
0
                case FQZ_b:    strat = CRAM_MAJOR_VERS(fd->version)+256; break;
2194
0
                case FQZ_c:    strat = CRAM_MAJOR_VERS(fd->version)+2*256; break;
2195
0
                case FQZ_d:    strat = CRAM_MAJOR_VERS(fd->version)+3*256; break;
2196
245
                case TOK3:     strat = 0; break;
2197
0
                case TOKA:     strat = 1; break;
2198
15.1k
                default:       strat = 0;
2199
23.2k
                }
2200
23.2k
                metrics->strat  = strat;
2201
2202
                // If we see at least MAXFAIL trials in a row for a specific
2203
                // compression method with more than MAXDELTA aggregate
2204
                // size then we drop this from the list of methods used
2205
                // for this block type.
2206
245k
#define MAXDELTA 0.20
2207
541k
#define MAXFAILS 4
2208
765k
                for (m = 0; m < CRAM_MAX_METHOD; m++) {
2209
742k
                    if (best_method == m) {
2210
23.2k
                        metrics->cnt[m] = 0;
2211
23.2k
                        metrics->extra[m] = 0;
2212
719k
                    } else if (best_sz < metrics->sz[m]) {
2213
541k
                        double r = (double)metrics->sz[m] / best_sz - 1;
2214
541k
                        int mul = 1+(fd->level>=7);
2215
541k
                        if (++metrics->cnt[m] >= MAXFAILS*mul &&
2216
245k
                            (metrics->extra[m] += r) >= MAXDELTA*mul)
2217
108k
                            method &= ~(1u<<m);
2218
2219
                        // Special case for fqzcomp as it rarely changes
2220
541k
                        if (m == FQZ || m == FQZ_b || m == FQZ_c || m == FQZ_d) {
2221
88.4k
                            if (metrics->sz[m] > best_sz)
2222
88.4k
                                method &= ~(1u<<m);
2223
88.4k
                        }
2224
541k
                    }
2225
742k
                }
2226
2227
                //if (fd->verbose > 1 && method != metrics->revised_method)
2228
                //    fprintf(stderr, "%d: revising method from %x to %x\n",
2229
                //          b->content_id, metrics->revised_method, method);
2230
23.2k
                metrics->revised_method = method;
2231
23.2k
            }
2232
95.2k
            pthread_mutex_unlock(&fd->metrics_lock);
2233
327k
        } else {
2234
327k
            metrics->input_avg_delta =
2235
327k
                0.9 * (metrics->input_avg_delta +
2236
327k
                       ABS(b->uncomp_size - metrics->input_avg_sz));
2237
2238
327k
            metrics->input_avg_sz += b->uncomp_size*.2;
2239
327k
            metrics->input_avg_sz *= 0.8;
2240
2241
327k
            strat = metrics->strat;
2242
327k
            method = metrics->method;
2243
2244
327k
            pthread_mutex_unlock(&fd->metrics_lock);
2245
327k
            comp = cram_compress_by_method(s, (char *)b->data, b->uncomp_size,
2246
327k
                                           b->content_id, &comp_size, method,
2247
327k
                                           method == GZIP_1 ? 1 : level,
2248
327k
                                           strat);
2249
327k
            if (!comp) {
2250
                // Our cached best method failed, but maybe another works?
2251
                // Rerun with trial mode engaged again.
2252
175
                if (!recurse) {
2253
175
                    hts_log_warning("Compressed block ID %d method %s failed, "
2254
175
                                    "redoing trial", b->content_id,
2255
175
                                    cram_block_method2str(method));
2256
175
                    pthread_mutex_lock(&fd->metrics_lock);
2257
175
                    metrics->trial = NTRIALS;
2258
175
                    metrics->next_trial = TRIAL_SPAN;
2259
175
                    metrics->revised_method = orig_method;
2260
175
                    pthread_mutex_unlock(&fd->metrics_lock);
2261
175
                    return cram_compress_block3(fd, s, b, metrics, method,
2262
175
                                                level, 1);
2263
175
                }
2264
0
                return -1;
2265
175
            }
2266
2267
327k
            if (comp_size < b->uncomp_size) {
2268
41.0k
                free(b->data);
2269
41.0k
                b->data = (unsigned char *)comp;
2270
41.0k
                b->comp_size = comp_size;
2271
41.0k
                b->method = method;
2272
286k
            } else {
2273
286k
                free(comp);
2274
286k
            }
2275
327k
        }
2276
2277
422k
    } else {
2278
        // no cached metrics, so just do zlib?
2279
7.74k
        comp = cram_compress_by_method(s, (char *)b->data, b->uncomp_size,
2280
7.74k
                                       b->content_id, &comp_size, GZIP, level, Z_FILTERED);
2281
7.74k
        if (!comp) {
2282
0
            hts_log_error("Compression failed!");
2283
0
            return -1;
2284
0
        }
2285
2286
7.74k
        if (comp_size < b->uncomp_size) {
2287
1.48k
            free(b->data);
2288
1.48k
            b->data = (unsigned char *)comp;
2289
1.48k
            b->comp_size = comp_size;
2290
1.48k
            b->method = GZIP;
2291
6.25k
        } else {
2292
6.25k
            free(comp);
2293
6.25k
        }
2294
7.74k
        strat = Z_FILTERED;
2295
7.74k
    }
2296
2297
430k
    hts_log_info("Compressed block ID %d from %d to %d by method %s",
2298
430k
                 b->content_id, b->uncomp_size, b->comp_size,
2299
430k
                 cram_block_method2str(b->method));
2300
2301
430k
    b->method = methmap[b->method];
2302
2303
430k
    return 0;
2304
430k
}
2305
2306
/*
2307
 * Compresses a block using a selection of compression codecs and options.
2308
 * The best is learnt and used for subsequent slices, periodically resampling.
2309
 *
2310
 * Method and level -1 implies defaults, as specified in cram_fd.
2311
 */
2312
int cram_compress_block2(cram_fd *fd, cram_slice *s,
2313
                         cram_block *b, cram_metrics *metrics,
2314
805k
                         int method, int level) {
2315
805k
    return cram_compress_block3(fd, s, b, metrics, method, level, 0);
2316
805k
}
2317
2318
int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics,
2319
2.50k
                        int method, int level) {
2320
2.50k
    return cram_compress_block2(fd, NULL, b, metrics, method, level);
2321
2.50k
}
2322
2323
222k
cram_metrics *cram_new_metrics(void) {
2324
222k
    cram_metrics *m = calloc(1, sizeof(*m));
2325
222k
    if (!m)
2326
0
        return NULL;
2327
222k
    m->trial = NTRIALS-1;
2328
222k
    m->next_trial = TRIAL_SPAN/2; // learn quicker at start
2329
222k
    m->method = RAW;
2330
222k
    m->strat = 0;
2331
222k
    m->revised_method = 0;
2332
222k
    m->unpackable = 0;
2333
2334
222k
    return m;
2335
222k
}
2336
2337
430k
char *cram_block_method2str(enum cram_block_method_int m) {
2338
430k
    switch(m) {
2339
360k
    case RAW:         return "RAW";
2340
4.81k
    case GZIP:        return "GZIP";
2341
0
    case BZIP2:       return "BZIP2";
2342
0
    case LZMA:        return "LZMA";
2343
0
    case RANS0:       return "RANS0";
2344
0
    case RANS1:       return "RANS1";
2345
13
    case GZIP_RLE:    return "GZIP_RLE";
2346
7.79k
    case GZIP_1:      return "GZIP_1";
2347
0
    case FQZ:         return "FQZ";
2348
0
    case FQZ_b:       return "FQZ_b";
2349
0
    case FQZ_c:       return "FQZ_c";
2350
0
    case FQZ_d:       return "FQZ_d";
2351
2.37k
    case RANS_PR0:    return "RANS_PR0";
2352
465
    case RANS_PR1:    return "RANS_PR1";
2353
20.7k
    case RANS_PR64:   return "RANS_PR64";
2354
3
    case RANS_PR9:    return "RANS_PR9";
2355
33.0k
    case RANS_PR128:  return "RANS_PR128";
2356
0
    case RANS_PR129:  return "RANS_PR129";
2357
0
    case RANS_PR192:  return "RANS_PR192";
2358
361
    case RANS_PR193:  return "RANS_PR193";
2359
176
    case TOK3:        return "TOK3_R";
2360
0
    case TOKA:        return "TOK3_A";
2361
0
    case ARITH_PR0:   return "ARITH_PR0";
2362
0
    case ARITH_PR1:   return "ARITH_PR1";
2363
0
    case ARITH_PR64:  return "ARITH_PR64";
2364
0
    case ARITH_PR9:   return "ARITH_PR9";
2365
0
    case ARITH_PR128: return "ARITH_PR128";
2366
0
    case ARITH_PR129: return "ARITH_PR129";
2367
0
    case ARITH_PR192: return "ARITH_PR192";
2368
0
    case ARITH_PR193: return "ARITH_PR193";
2369
0
    case BM_ERROR: break;
2370
430k
    }
2371
0
    return "?";
2372
430k
}
2373
2374
13
char *cram_content_type2str(enum cram_content_type t) {
2375
13
    switch (t) {
2376
5
    case FILE_HEADER:         return "FILE_HEADER";
2377
0
    case COMPRESSION_HEADER:  return "COMPRESSION_HEADER";
2378
0
    case MAPPED_SLICE:        return "MAPPED_SLICE";
2379
0
    case UNMAPPED_SLICE:      return "UNMAPPED_SLICE";
2380
3
    case EXTERNAL:            return "EXTERNAL";
2381
0
    case CORE:                return "CORE";
2382
0
    case CT_ERROR:            break;
2383
13
    }
2384
5
    return "?";
2385
13
}
2386
2387
/* ----------------------------------------------------------------------
2388
 * Reference sequence handling
2389
 *
2390
 * These revolve around the refs_t structure, which may potentially be
2391
 * shared between multiple cram_fd.
2392
 *
2393
 * We start with refs_create() to allocate an empty refs_t and then
2394
 * populate it with @SQ line data using refs_from_header(). This is done on
2395
 * cram_open().  Also at start up we can call cram_load_reference() which
2396
 * is used with "scramble -r foo.fa". This replaces the fd->refs with the
2397
 * new one specified. In either case refs2id() is then called which
2398
 * maps ref_entry names to @SQ ids (refs_t->ref_id[]).
2399
 *
2400
 * Later, possibly within a thread, we will want to know the actual ref
2401
 * seq itself, obtained by calling cram_get_ref().  This may use the
2402
 * UR: or M5: fields or the filename specified in the original
2403
 * cram_load_reference() call.
2404
 *
2405
 * Given the potential for multi-threaded reference usage, we have
2406
 * reference counting (sorry for the confusing double use of "ref") to
2407
 * track the number of callers interested in any specific reference.
2408
 */
2409
2410
/*
2411
 * Frees/unmaps a reference sequence and associated file handles.
2412
 */
2413
3.03k
static void ref_entry_free_seq(ref_entry *e) {
2414
3.03k
    if (e->mf)
2415
0
        mfclose(e->mf);
2416
3.03k
    if (e->seq && !e->mf)
2417
0
        free(e->seq);
2418
2419
3.03k
    e->seq = NULL;
2420
3.03k
    e->mf = NULL;
2421
3.03k
}
2422
2423
4.53k
void refs_free(refs_t *r) {
2424
4.53k
    RP("refs_free()\n");
2425
2426
4.53k
    if (--r->count > 0)
2427
0
        return;
2428
2429
4.53k
    if (!r)
2430
0
        return;
2431
2432
4.53k
    if (r->pool)
2433
4.53k
        string_pool_destroy(r->pool);
2434
2435
4.53k
    if (r->h_meta) {
2436
4.53k
        khint_t k;
2437
2438
13.2k
        for (k = kh_begin(r->h_meta); k != kh_end(r->h_meta); k++) {
2439
8.75k
            ref_entry *e;
2440
2441
8.75k
            if (!kh_exist(r->h_meta, k))
2442
5.71k
                continue;
2443
3.03k
            if (!(e = kh_val(r->h_meta, k)))
2444
0
                continue;
2445
3.03k
            ref_entry_free_seq(e);
2446
3.03k
            free(e);
2447
3.03k
        }
2448
2449
4.53k
        kh_destroy(refs, r->h_meta);
2450
4.53k
    }
2451
2452
4.53k
    if (r->ref_id)
2453
3.12k
        free(r->ref_id);
2454
2455
4.53k
    if (r->fp)
2456
0
        bgzf_close(r->fp);
2457
2458
4.53k
    pthread_mutex_destroy(&r->lock);
2459
2460
4.53k
    free(r);
2461
4.53k
}
2462
2463
4.53k
static refs_t *refs_create(void) {
2464
4.53k
    refs_t *r = calloc(1, sizeof(*r));
2465
2466
4.53k
    RP("refs_create()\n");
2467
2468
4.53k
    if (!r)
2469
0
        return NULL;
2470
2471
4.53k
    if (!(r->pool = string_pool_create(8192)))
2472
0
        goto err;
2473
2474
4.53k
    r->ref_id = NULL; // see refs2id() to populate.
2475
4.53k
    r->count = 1;
2476
4.53k
    r->last = NULL;
2477
4.53k
    r->last_id = -1;
2478
2479
4.53k
    if (!(r->h_meta = kh_init(refs)))
2480
0
        goto err;
2481
2482
4.53k
    pthread_mutex_init(&r->lock, NULL);
2483
2484
4.53k
    return r;
2485
2486
0
 err:
2487
0
    refs_free(r);
2488
0
    return NULL;
2489
4.53k
}
2490
2491
/*
2492
 * Opens a reference fasta file as a BGZF stream, allowing for
2493
 * compressed files.  It automatically builds a .fai file if
2494
 * required and if compressed a .gzi bgzf index too.
2495
 *
2496
 * Returns a BGZF handle on success;
2497
 *         NULL on failure.
2498
 */
2499
905
static BGZF *bgzf_open_ref(char *fn, char *mode, int is_md5) {
2500
905
    BGZF *fp;
2501
2502
905
    if (strncmp(fn, "file://", 7) == 0)
2503
0
        fn += 7;
2504
2505
905
    if (!is_md5 && !hisremote(fn)) {
2506
560
        char fai_file[PATH_MAX];
2507
2508
560
        snprintf(fai_file, PATH_MAX, "%s.fai", fn);
2509
560
        if (access(fai_file, R_OK) != 0)
2510
560
            if (fai_build(fn) != 0)
2511
560
                return NULL;
2512
560
    }
2513
2514
345
    if (!(fp = bgzf_open(fn, mode))) {
2515
345
        perror(fn);
2516
345
        return NULL;
2517
345
    }
2518
2519
0
    if (fp->is_compressed == 1 && bgzf_index_load(fp, fn, ".gzi") < 0) {
2520
0
        hts_log_error("Unable to load .gzi index '%s.gzi'", fn);
2521
0
        bgzf_close(fp);
2522
0
        return NULL;
2523
0
    }
2524
2525
0
    return fp;
2526
0
}
2527
2528
/*
2529
 * Loads a FAI file for a reference.fasta.
2530
 * "is_err" indicates whether failure to load is worthy of emitting an
2531
 * error message. In some cases (eg with embedded references) we
2532
 * speculatively load, just in case, and silently ignore errors.
2533
 *
2534
 * Returns the refs_t struct on success (maybe newly allocated);
2535
 *         NULL on failure
2536
 */
2537
905
static refs_t *refs_load_fai(refs_t *r_orig, const char *fn, int is_err) {
2538
905
    hFILE *fp = NULL;
2539
905
    char fai_fn[PATH_MAX];
2540
905
    char line[8192];
2541
905
    refs_t *r = r_orig;
2542
905
    size_t fn_l = strlen(fn);
2543
905
    int id = 0, id_alloc = 0;
2544
2545
905
    RP("refs_load_fai %s\n", fn);
2546
2547
905
    if (!r)
2548
0
        if (!(r = refs_create()))
2549
0
            goto err;
2550
2551
905
    if (r->fp)
2552
0
        if (bgzf_close(r->fp) != 0)
2553
0
            goto err;
2554
905
    r->fp = NULL;
2555
2556
    /* Look for a FASTA##idx##FAI format */
2557
905
    char *fn_delim = strstr(fn, HTS_IDX_DELIM);
2558
905
    if (fn_delim) {
2559
25
        if (!(r->fn = string_ndup(r->pool, fn, fn_delim - fn)))
2560
0
            goto err;
2561
25
        fn_delim += strlen(HTS_IDX_DELIM);
2562
25
        snprintf(fai_fn, PATH_MAX, "%s", fn_delim);
2563
880
    } else {
2564
        /* An index file was provided, instead of the actual reference file */
2565
880
        if (fn_l > 4 && strcmp(&fn[fn_l-4], ".fai") == 0) {
2566
7
            if (!r->fn) {
2567
3
                if (!(r->fn = string_ndup(r->pool, fn, fn_l-4)))
2568
0
                    goto err;
2569
3
            }
2570
7
            snprintf(fai_fn, PATH_MAX, "%s", fn);
2571
873
        } else {
2572
        /* Only the reference file provided. Get the index file name from it */
2573
873
            if (!(r->fn = string_dup(r->pool, fn)))
2574
0
                goto err;
2575
873
            snprintf(fai_fn, PATH_MAX, "%.*s.fai", PATH_MAX-5, fn);
2576
873
        }
2577
880
    }
2578
2579
905
    if (!(r->fp = bgzf_open_ref(r->fn, "r", 0))) {
2580
905
        hts_log_error("Failed to open reference file '%s'", r->fn);
2581
905
        goto err;
2582
905
    }
2583
2584
0
    if (!(fp = hopen(fai_fn, "r"))) {
2585
0
        hts_log_error("Failed to open index file '%s'", fai_fn);
2586
0
        if (is_err)
2587
0
            perror(fai_fn);
2588
0
        goto err;
2589
0
    }
2590
0
    while (hgets(line, 8192, fp) != NULL) {
2591
0
        ref_entry *e = malloc(sizeof(*e));
2592
0
        char *cp;
2593
0
        int n;
2594
0
        khint_t k;
2595
2596
0
        if (!e)
2597
0
            return NULL;
2598
2599
        // id
2600
0
        for (cp = line; *cp && !isspace_c(*cp); cp++)
2601
0
            ;
2602
0
        *cp++ = 0;
2603
0
        e->name = string_dup(r->pool, line);
2604
2605
        // length
2606
0
        while (*cp && isspace_c(*cp))
2607
0
            cp++;
2608
0
        e->length = strtoll(cp, &cp, 10);
2609
2610
        // offset
2611
0
        while (*cp && isspace_c(*cp))
2612
0
            cp++;
2613
0
        e->offset = strtoll(cp, &cp, 10);
2614
2615
        // bases per line
2616
0
        while (*cp && isspace_c(*cp))
2617
0
            cp++;
2618
0
        e->bases_per_line = strtol(cp, &cp, 10);
2619
2620
        // line length
2621
0
        while (*cp && isspace_c(*cp))
2622
0
            cp++;
2623
0
        e->line_length = strtol(cp, &cp, 10);
2624
2625
        // filename
2626
0
        e->fn = r->fn;
2627
2628
0
        e->count = 0;
2629
0
        e->seq = NULL;
2630
0
        e->mf = NULL;
2631
0
        e->is_md5 = 0;
2632
0
        e->validated_md5 = 0;
2633
2634
0
        k = kh_put(refs, r->h_meta, e->name, &n);
2635
0
        if (-1 == n)  {
2636
0
            free(e);
2637
0
            return NULL;
2638
0
        }
2639
2640
0
        if (n) {
2641
0
            kh_val(r->h_meta, k) = e;
2642
0
        } else {
2643
0
            ref_entry *re = kh_val(r->h_meta, k);
2644
0
            if (re && (re->count != 0 || re->length != 0)) {
2645
                /* Keep old */
2646
0
                free(e);
2647
0
            } else {
2648
                /* Replace old */
2649
0
                if (re)
2650
0
                    free(re);
2651
0
                kh_val(r->h_meta, k) = e;
2652
0
            }
2653
0
        }
2654
2655
0
        if (id >= id_alloc) {
2656
0
            ref_entry **new_refs;
2657
0
            int x;
2658
2659
0
            id_alloc = id_alloc ?id_alloc*2 : 16;
2660
0
            new_refs = realloc(r->ref_id, id_alloc * sizeof(*r->ref_id));
2661
0
            if (!new_refs)
2662
0
                goto err;
2663
0
            r->ref_id = new_refs;
2664
2665
0
            for (x = id; x < id_alloc; x++)
2666
0
                r->ref_id[x] = NULL;
2667
0
        }
2668
0
        r->ref_id[id] = e;
2669
0
        r->nref = ++id;
2670
0
    }
2671
2672
0
    if(hclose(fp) < 0)
2673
0
        goto err;
2674
0
    return r;
2675
2676
905
 err:
2677
905
    if (fp)
2678
0
        hclose_abruptly(fp);
2679
2680
905
    if (!r_orig)
2681
0
        refs_free(r);
2682
2683
905
    return NULL;
2684
0
}
2685
2686
/*
2687
 * Verifies that the CRAM @SQ lines and .fai files match.
2688
 */
2689
0
static void sanitise_SQ_lines(cram_fd *fd) {
2690
0
    int i;
2691
2692
0
    if (!fd->header || !fd->header->hrecs)
2693
0
        return;
2694
2695
0
    if (!fd->refs || !fd->refs->h_meta)
2696
0
        return;
2697
2698
0
    for (i = 0; i < fd->header->hrecs->nref; i++) {
2699
0
        const char *name = fd->header->hrecs->ref[i].name;
2700
0
        khint_t k = kh_get(refs, fd->refs->h_meta, name);
2701
0
        ref_entry *r;
2702
2703
        // We may have @SQ lines which have no known .fai, but do not
2704
        // in themselves pose a problem because they are unused in the file.
2705
0
        if (k == kh_end(fd->refs->h_meta))
2706
0
            continue;
2707
2708
0
        if (!(r = (ref_entry *)kh_val(fd->refs->h_meta, k)))
2709
0
            continue;
2710
2711
0
        if (r->length && r->length != fd->header->hrecs->ref[i].len) {
2712
0
            assert(strcmp(r->name, fd->header->hrecs->ref[i].name) == 0);
2713
2714
            // Should we also check MD5sums here to ensure the correct
2715
            // reference was given?
2716
0
            hts_log_warning("Header @SQ length mismatch for ref %s, %"PRIhts_pos" vs %d",
2717
0
                            r->name, fd->header->hrecs->ref[i].len, (int)r->length);
2718
2719
            // Fixing the parsed @SQ header will make MD:Z: strings work
2720
            // and also stop it producing N for the sequence.
2721
0
            fd->header->hrecs->ref[i].len = r->length;
2722
0
        }
2723
0
    }
2724
0
}
2725
2726
/*
2727
 * Indexes references by the order they appear in a BAM file. This may not
2728
 * necessarily be the same order they appear in the fasta reference file.
2729
 *
2730
 * Returns 0 on success
2731
 *        -1 on failure
2732
 */
2733
2.50k
int refs2id(refs_t *r, sam_hdr_t *hdr) {
2734
2.50k
    int i;
2735
2.50k
    sam_hrecs_t *h = hdr->hrecs;
2736
2737
2.50k
    if (r->ref_id)
2738
1.43k
        free(r->ref_id);
2739
2.50k
    if (r->last)
2740
0
        r->last = NULL;
2741
2742
2.50k
    r->ref_id = calloc(h->nref, sizeof(*r->ref_id));
2743
2.50k
    if (!r->ref_id)
2744
0
        return -1;
2745
2746
2.50k
    r->nref = h->nref;
2747
4.34k
    for (i = 0; i < h->nref; i++) {
2748
1.83k
        khint_t k = kh_get(refs, r->h_meta, h->ref[i].name);
2749
1.83k
        if (k != kh_end(r->h_meta)) {
2750
1.83k
            r->ref_id[i] = kh_val(r->h_meta, k);
2751
1.83k
        } else {
2752
0
            hts_log_warning("Unable to find ref name '%s'", h->ref[i].name);
2753
0
        }
2754
1.83k
    }
2755
2756
2.50k
    return 0;
2757
2.50k
}
2758
2759
/*
2760
 * Generates refs_t entries based on @SQ lines in the header.
2761
 * Returns 0 on success
2762
 *         -1 on failure
2763
 */
2764
9.68k
static int refs_from_header(cram_fd *fd) {
2765
9.68k
    if (!fd)
2766
0
        return -1;
2767
2768
9.68k
    refs_t *r = fd->refs;
2769
9.68k
    if (!r)
2770
0
        return -1;
2771
2772
9.68k
    sam_hdr_t *h = fd->header;
2773
9.68k
    if (!h)
2774
2.79k
        return 0;
2775
2776
6.89k
    if (!h->hrecs) {
2777
3.71k
        if (-1 == sam_hdr_fill_hrecs(h))
2778
129
            return -1;
2779
3.71k
    }
2780
2781
6.76k
    if (h->hrecs->nref == 0)
2782
3.27k
        return 0;
2783
2784
    //fprintf(stderr, "refs_from_header for %p mode %c\n", fd, fd->mode);
2785
2786
    /* Existing refs are fine, as long as they're compatible with the hdr. */
2787
3.48k
    ref_entry **new_ref_id = realloc(r->ref_id, (r->nref + h->hrecs->nref) * sizeof(*r->ref_id));
2788
3.48k
    if (!new_ref_id)
2789
0
        return -1;
2790
3.48k
    r->ref_id = new_ref_id;
2791
2792
3.48k
    int i, j;
2793
    /* Copy info from h->ref[i] over to r */
2794
8.35k
    for (i = 0, j = r->nref; i < h->hrecs->nref; i++) {
2795
4.86k
        sam_hrec_type_t *ty;
2796
4.86k
        sam_hrec_tag_t *tag;
2797
4.86k
        khint_t k;
2798
4.86k
        int n;
2799
2800
4.86k
        k = kh_get(refs, r->h_meta, h->hrecs->ref[i].name);
2801
4.86k
        if (k != kh_end(r->h_meta))
2802
            // Ref already known about
2803
1.83k
            continue;
2804
2805
3.03k
        if (!(r->ref_id[j] = calloc(1, sizeof(ref_entry))))
2806
0
            return -1;
2807
2808
3.03k
        if (!h->hrecs->ref[i].name)
2809
0
            return -1;
2810
2811
3.03k
        r->ref_id[j]->name = string_dup(r->pool, h->hrecs->ref[i].name);
2812
3.03k
        if (!r->ref_id[j]->name) return -1;
2813
3.03k
        r->ref_id[j]->length = 0; // marker for not yet loaded
2814
2815
        /* Initialise likely filename if known */
2816
3.03k
        if ((ty = sam_hrecs_find_type_id(h->hrecs, "SQ", "SN", h->hrecs->ref[i].name))) {
2817
3.03k
            if ((tag = sam_hrecs_find_key(ty, "M5", NULL)))
2818
142
                r->ref_id[j]->fn = string_dup(r->pool, tag->str+3);
2819
2820
3.03k
            if ((tag = sam_hrecs_find_key(ty, "LN", NULL))) {
2821
                // LN tag used when constructing consensus reference
2822
3.03k
                r->ref_id[j]->LN_length = strtoll(tag->str+3, NULL, 0);
2823
                // See fuzz 382922241
2824
3.03k
                if (r->ref_id[j]->LN_length < 0)
2825
544
                    r->ref_id[j]->LN_length = 0;
2826
3.03k
            }
2827
3.03k
        }
2828
2829
3.03k
        k = kh_put(refs, r->h_meta, r->ref_id[j]->name, &n);
2830
3.03k
        if (n <= 0) // already exists or error
2831
0
            return -1;
2832
3.03k
        kh_val(r->h_meta, k) = r->ref_id[j];
2833
2834
3.03k
        j++;
2835
3.03k
    }
2836
3.48k
    r->nref = j;
2837
2838
3.48k
    return 0;
2839
3.48k
}
2840
2841
/*
2842
 * Attaches a header to a cram_fd.
2843
 *
2844
 * This should be used when creating a new cram_fd for writing where
2845
 * we have a header already constructed (eg from a file we've read
2846
 * in).
2847
 */
2848
2.63k
int cram_set_header2(cram_fd *fd, const sam_hdr_t *hdr) {
2849
2.63k
    if (!fd || !hdr )
2850
0
        return -1;
2851
2852
2.63k
    if (fd->header != hdr) {
2853
2.63k
        if (fd->header)
2854
0
            sam_hdr_destroy(fd->header);
2855
2.63k
        fd->header = sam_hdr_dup(hdr);
2856
2.63k
        if (!fd->header)
2857
0
            return -1;
2858
2.63k
    }
2859
2.63k
    return refs_from_header(fd);
2860
2.63k
}
2861
2862
0
int cram_set_header(cram_fd *fd, sam_hdr_t *hdr) {
2863
0
    return cram_set_header2(fd, hdr);
2864
0
}
2865
2866
/*
2867
 * Returns whether the path refers to a directory.
2868
 */
2869
0
static int is_directory(char *fn) {
2870
0
    struct stat buf;
2871
0
    if ( stat(fn,&buf) ) return 0;
2872
0
    return S_ISDIR(buf.st_mode);
2873
0
}
2874
2875
/*
2876
 * Converts a directory and a filename into an expanded path, replacing %s
2877
 * in directory with the filename and %[0-9]+s with portions of the filename
2878
 * Any remaining parts of filename are added to the end with /%s.
2879
 */
2880
0
static int expand_cache_path(char *path, char *dir, const char *fn) {
2881
0
    char *cp, *start = path;
2882
0
    size_t len;
2883
0
    size_t sz = PATH_MAX;
2884
2885
0
    while ((cp = strchr(dir, '%'))) {
2886
0
        if (cp-dir >= sz) return -1;
2887
0
        strncpy(path, dir, cp-dir);
2888
0
        path += cp-dir;
2889
0
        sz -= cp-dir;
2890
2891
0
        if (*++cp == 's') {
2892
0
            len = strlen(fn);
2893
0
            if (len >= sz) return -1;
2894
0
            strcpy(path, fn);
2895
0
            path += len;
2896
0
            sz -= len;
2897
0
            fn += len;
2898
0
            cp++;
2899
0
        } else if (*cp >= '0' && *cp <= '9') {
2900
0
            char *endp;
2901
0
            long l;
2902
2903
0
            l = strtol(cp, &endp, 10);
2904
0
            l = MIN(l, strlen(fn));
2905
0
            if (*endp == 's') {
2906
0
                if (l >= sz) return -1;
2907
0
                strncpy(path, fn, l);
2908
0
                path += l;
2909
0
                fn += l;
2910
0
                sz -= l;
2911
0
                *path = 0;
2912
0
                cp = endp+1;
2913
0
            } else {
2914
0
                if (sz < 3) return -1;
2915
0
                *path++ = '%';
2916
0
                *path++ = *cp++;
2917
0
            }
2918
0
        } else {
2919
0
            if (sz < 3) return -1;
2920
0
            *path++ = '%';
2921
0
            *path++ = *cp++;
2922
0
        }
2923
0
        dir = cp;
2924
0
    }
2925
2926
0
    len = strlen(dir);
2927
0
    if (len >= sz) return -1;
2928
0
    strcpy(path, dir);
2929
0
    path += len;
2930
0
    sz -= len;
2931
2932
0
    len = strlen(fn) + ((*fn && path > start && path[-1] != '/') ? 1 : 0);
2933
0
    if (len >= sz) return -1;
2934
0
    if (*fn && path > start && path[-1] != '/')
2935
0
        *path++ = '/';
2936
0
    strcpy(path, fn);
2937
0
    return 0;
2938
0
}
2939
2940
/*
2941
 * Make the directory containing path and any prefix directories.
2942
 */
2943
0
static void mkdir_prefix(char *path, int mode) {
2944
0
    char *cp = strrchr(path, '/');
2945
0
    if (!cp)
2946
0
        return;
2947
2948
0
    *cp = 0;
2949
0
    if (is_directory(path)) {
2950
0
        *cp = '/';
2951
0
        return;
2952
0
    }
2953
2954
0
    if (mkdir(path, mode) == 0) {
2955
0
        chmod(path, mode);
2956
0
        *cp = '/';
2957
0
        return;
2958
0
    }
2959
2960
0
    mkdir_prefix(path, mode);
2961
0
    mkdir(path, mode);
2962
0
    chmod(path, mode);
2963
0
    *cp = '/';
2964
0
}
2965
2966
/*
2967
 * Queries the M5 string from the header and attempts to populate the
2968
 * reference from this using the REF_PATH environment.
2969
 *
2970
 * Returns 0 on success
2971
 *        -1 on failure
2972
 */
2973
1.59k
static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
2974
1.59k
    char *ref_path = getenv("REF_PATH");
2975
1.59k
    sam_hrec_type_t *ty;
2976
1.59k
    sam_hrec_tag_t *tag;
2977
1.59k
    char path[PATH_MAX];
2978
1.59k
    kstring_t path_tmp = KS_INITIALIZE;
2979
1.59k
    char *local_cache = getenv("REF_CACHE");
2980
1.59k
    mFILE *mf;
2981
1.59k
    int local_path = 0;
2982
2983
1.59k
    hts_log_info("Running cram_populate_ref on fd %p, id %d", (void *)fd, id);
2984
2985
1.59k
    if (!r->name)
2986
0
        return -1;
2987
2988
1.59k
    if (!(ty = sam_hrecs_find_type_id(fd->header->hrecs, "SQ", "SN", r->name)))
2989
0
        return -1;
2990
2991
1.59k
    if (!(tag = sam_hrecs_find_key(ty, "M5", NULL)))
2992
1.58k
        goto no_M5;
2993
2994
16
    hts_log_info("Querying ref %s", tag->str+3);
2995
2996
    /* Use cache if available */
2997
16
    if (local_cache && *local_cache) {
2998
0
        struct stat sb;
2999
0
        if (expand_cache_path(path, local_cache, tag->str+3) == 0 &&
3000
0
            stat(path, &sb) == 0)
3001
            // Found it in the local cache
3002
0
            local_path = 1;
3003
0
    }
3004
3005
#ifndef HAVE_MMAP
3006
    char *path2;
3007
    /* Search local files in REF_PATH; we can open them and return as above */
3008
    if (!local_path && (path2 = find_path(tag->str+3, ref_path))) {
3009
        int len = snprintf(path, PATH_MAX, "%s", path2);
3010
        free(path2);
3011
        if (len > 0 && len < PATH_MAX) // in case it's too long
3012
            local_path = 1;
3013
    }
3014
#endif
3015
3016
    /* Found via REF_CACHE or local REF_PATH file */
3017
16
    if (local_path) {
3018
0
        struct stat sb;
3019
0
        BGZF *fp;
3020
3021
0
        if (0 == stat(path, &sb)
3022
0
            && S_ISREG(sb.st_mode)
3023
0
            && (fp = bgzf_open(path, "r"))) {
3024
0
            r->length = sb.st_size;
3025
0
            r->offset = r->line_length = r->bases_per_line = 0;
3026
3027
0
            r->fn = string_dup(fd->refs->pool, path);
3028
3029
0
            if (fd->refs->fp)
3030
0
                if (bgzf_close(fd->refs->fp) != 0)
3031
0
                    return -1;
3032
0
            fd->refs->fp = fp;
3033
0
            fd->refs->fn = r->fn;
3034
0
            r->is_md5 = 1;
3035
0
            r->validated_md5 = 1;
3036
3037
            // Fall back to cram_get_ref() where it'll do the actual
3038
            // reading of the file.
3039
0
            return 0;
3040
0
        }
3041
0
    }
3042
3043
3044
    /* Otherwise search full REF_PATH; slower as loads entire file */
3045
16
    int is_local = 0;
3046
16
    if ((mf = open_path_mfile(tag->str+3, ref_path, NULL, &is_local))) {
3047
0
        size_t sz;
3048
0
        r->seq = mfsteal(mf, &sz);
3049
0
        if (r->seq) {
3050
0
            r->mf = NULL;
3051
0
        } else {
3052
            // keep mf around as we couldn't detach
3053
0
            r->seq = mf->data;
3054
0
            r->mf = mf;
3055
0
        }
3056
0
        r->length = sz;
3057
0
        r->is_md5 = 1;
3058
0
        r->validated_md5 = 1;
3059
16
    } else {
3060
16
        refs_t *refs;
3061
16
        const char *fn;
3062
16
        sam_hrec_tag_t *UR_tag;
3063
3064
1.59k
    no_M5:
3065
        /* Failed to find in search path or M5 cache, see if @SQ UR: tag? */
3066
1.59k
        if (!(UR_tag = sam_hrecs_find_key(ty, "UR", NULL)))
3067
678
            return -1;
3068
3069
920
        if (strstr(UR_tag->str+3, "://") &&
3070
36
            strncmp(UR_tag->str+3, "file:", 5) != 0) {
3071
            // Documented as omitted, but accidentally supported until now
3072
15
            hts_log_error("UR tags pointing to remote files are not supported");
3073
15
            return -1;
3074
15
        }
3075
3076
905
        fn = (strncmp(UR_tag->str+3, "file:", 5) == 0)
3077
905
            ? UR_tag->str+8
3078
905
            : UR_tag->str+3;
3079
3080
905
        if (fd->refs->fp) {
3081
0
            if (bgzf_close(fd->refs->fp) != 0)
3082
0
                return -1;
3083
0
            fd->refs->fp = NULL;
3084
0
        }
3085
905
        if (!(refs = refs_load_fai(fd->refs, fn, 0)))
3086
905
            return -1;
3087
0
        sanitise_SQ_lines(fd);
3088
3089
0
        fd->refs = refs;
3090
0
        if (fd->refs->fp) {
3091
0
            if (bgzf_close(fd->refs->fp) != 0)
3092
0
                return -1;
3093
0
            fd->refs->fp = NULL;
3094
0
        }
3095
3096
0
        if (!fd->refs->fn)
3097
0
            return -1;
3098
3099
0
        if (-1 == refs2id(fd->refs, fd->header))
3100
0
            return -1;
3101
0
        if (!fd->refs->ref_id || !fd->refs->ref_id[id])
3102
0
            return -1;
3103
3104
        // Local copy already, so fall back to cram_get_ref().
3105
0
        return 0;
3106
0
    }
3107
3108
    /* Populate the local disk cache if required */
3109
0
    if (!is_local && local_cache && *local_cache) {
3110
0
        hFILE *fp;
3111
3112
0
        if (expand_cache_path(path, local_cache, tag->str+3) < 0) {
3113
0
            return 0; // Not fatal - we have the data already so keep going.
3114
0
        }
3115
0
        hts_log_info("Writing cache file '%s'", path);
3116
0
        mkdir_prefix(path, 01777);
3117
3118
0
        fp = hts_open_tmpfile(path, "wx", &path_tmp);
3119
0
        if (!fp) {
3120
0
            perror(path_tmp.s);
3121
0
            free(path_tmp.s);
3122
3123
            // Not fatal - we have the data already so keep going.
3124
0
            return 0;
3125
0
        }
3126
3127
        // Check md5sum
3128
0
        hts_md5_context *md5;
3129
0
        char unsigned md5_buf1[16];
3130
0
        char md5_buf2[33];
3131
3132
0
        if (!(md5 = hts_md5_init())) {
3133
0
            hclose_abruptly(fp);
3134
0
            unlink(path_tmp.s);
3135
0
            free(path_tmp.s);
3136
0
            return -1;
3137
0
        }
3138
0
        hts_md5_update(md5, r->seq, r->length);
3139
0
        hts_md5_final(md5_buf1, md5);
3140
0
        hts_md5_destroy(md5);
3141
0
        hts_md5_hex(md5_buf2, md5_buf1);
3142
3143
0
        if (strncmp(tag->str+3, md5_buf2, 32) != 0) {
3144
0
            hts_log_error("Mismatching md5sum for downloaded reference");
3145
0
            hclose_abruptly(fp);
3146
0
            unlink(path_tmp.s);
3147
0
            free(path_tmp.s);
3148
0
            return -1;
3149
0
        }
3150
3151
0
        ssize_t length_written = hwrite(fp, r->seq, r->length);
3152
0
        if (hclose(fp) < 0 || length_written != r->length ||
3153
0
            chmod(path_tmp.s, 0444) < 0 ||
3154
0
            rename(path_tmp.s, path) < 0) {
3155
0
            hts_log_error("Creating reference at %s failed: %s",
3156
0
                          path, strerror(errno));
3157
0
            unlink(path_tmp.s);
3158
0
        }
3159
0
    }
3160
3161
0
    free(path_tmp.s);
3162
0
    return 0;
3163
0
}
3164
3165
1.07k
static void cram_ref_incr_locked(refs_t *r, int id) {
3166
1.07k
    RP("%d INC REF %d, %d %p\n", gettid(), id,
3167
1.07k
       (int)(id>=0 && r->ref_id[id]?r->ref_id[id]->count+1:-999),
3168
1.07k
       id>=0 && r->ref_id[id]?r->ref_id[id]->seq:(char *)1);
3169
3170
1.07k
    if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq)
3171
1.07k
        return;
3172
3173
0
    if (r->last_id == id)
3174
0
        r->last_id = -1;
3175
3176
0
    ++r->ref_id[id]->count;
3177
0
}
3178
3179
1.07k
void cram_ref_incr(refs_t *r, int id) {
3180
1.07k
    pthread_mutex_lock(&r->lock);
3181
1.07k
    cram_ref_incr_locked(r, id);
3182
1.07k
    pthread_mutex_unlock(&r->lock);
3183
1.07k
}
3184
3185
127
static void cram_ref_decr_locked(refs_t *r, int id) {
3186
127
    RP("%d DEC REF %d, %d %p\n", gettid(), id,
3187
127
       (int)(id>=0 && r->ref_id[id]?r->ref_id[id]->count-1:-999),
3188
127
       id>=0 && r->ref_id[id]?r->ref_id[id]->seq:(char *)1);
3189
3190
127
    if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq) {
3191
127
        return;
3192
127
    }
3193
3194
0
    if (--r->ref_id[id]->count <= 0) {
3195
0
        assert(r->ref_id[id]->count == 0);
3196
0
        if (r->last_id >= 0) {
3197
0
            if (r->ref_id[r->last_id]->count <= 0 &&
3198
0
                r->ref_id[r->last_id]->seq) {
3199
0
                RP("%d FREE REF %d (%p)\n", gettid(),
3200
0
                   r->last_id, r->ref_id[r->last_id]->seq);
3201
0
                ref_entry_free_seq(r->ref_id[r->last_id]);
3202
0
                if (r->ref_id[r->last_id]->is_md5) r->ref_id[r->last_id]->length = 0;
3203
0
            }
3204
0
        }
3205
0
        r->last_id = id;
3206
0
    }
3207
0
}
3208
3209
127
void cram_ref_decr(refs_t *r, int id) {
3210
127
    pthread_mutex_lock(&r->lock);
3211
127
    cram_ref_decr_locked(r, id);
3212
127
    pthread_mutex_unlock(&r->lock);
3213
127
}
3214
3215
/*
3216
 * Used by cram_ref_load and cram_get_ref. The file handle will have
3217
 * already been opened, so we can catch it. The ref_entry *e informs us
3218
 * of whether this is a multi-line fasta file or a raw MD5 style file.
3219
 * Either way we create a single contiguous sequence.
3220
 *
3221
 * Returns all or part of a reference sequence on success (malloced);
3222
 *         NULL on failure.
3223
 */
3224
static char *load_ref_portion(BGZF *fp, ref_entry *e,
3225
0
                              hts_pos_t start, hts_pos_t end) {
3226
0
    off_t offset, len;
3227
0
    char *seq;
3228
3229
0
    if (end < start)
3230
0
        end = start;
3231
3232
    /*
3233
     * Compute locations in file. This is trivial for the MD5 files, but
3234
     * is still necessary for the fasta variants.
3235
     *
3236
     * Note the offset here, as with faidx, has the assumption that white-
3237
     * space (the diff between line_length and bases_per_line) only occurs
3238
     * at the end of a line of text.
3239
     */
3240
0
    offset = e->line_length
3241
0
        ? e->offset + (start-1)/e->bases_per_line * e->line_length +
3242
0
          (start-1) % e->bases_per_line
3243
0
        : start-1;
3244
3245
0
    len = (e->line_length
3246
0
           ? e->offset + (end-1)/e->bases_per_line * e->line_length +
3247
0
             (end-1) % e->bases_per_line
3248
0
           : end-1) - offset + 1;
3249
3250
0
    if (bgzf_useek(fp, offset, SEEK_SET) < 0) {
3251
0
        perror("bgzf_useek() on reference file");
3252
0
        return NULL;
3253
0
    }
3254
3255
0
    if (len == 0 || !(seq = malloc(len))) {
3256
0
        return NULL;
3257
0
    }
3258
3259
0
    if (len != bgzf_read(fp, seq, len)) {
3260
0
        perror("bgzf_read() on reference file");
3261
0
        free(seq);
3262
0
        return NULL;
3263
0
    }
3264
3265
    /* Strip white-space if required. */
3266
0
    if (len != end-start+1) {
3267
0
        hts_pos_t i, j;
3268
0
        char *cp = seq;
3269
0
        char *cp_to;
3270
3271
        // Copy up to the first white-space, and then repeatedly just copy
3272
        // bases_per_line verbatim, and use the slow method to end again.
3273
        //
3274
        // This may seem excessive, but this code can be a significant
3275
        // portion of total CRAM decode CPU time for shallow data sets.
3276
0
        for (i = j = 0; i < len; i++) {
3277
0
            if (!isspace_c(cp[i]))
3278
0
                cp[j++] = cp[i] & ~0x20;
3279
0
            else
3280
0
                break;
3281
0
        }
3282
0
        while (i < len && isspace_c(cp[i]))
3283
0
            i++;
3284
0
        while (i < len - e->line_length) {
3285
0
            hts_pos_t j_end = j + e->bases_per_line;
3286
0
            while (j < j_end)
3287
0
                cp[j++] = cp[i++] & ~0x20; // toupper equiv
3288
0
            i += e->line_length - e->bases_per_line;
3289
0
        }
3290
0
        for (; i < len; i++) {
3291
0
            if (!isspace_c(cp[i]))
3292
0
                cp[j++] = cp[i] & ~0x20;
3293
0
        }
3294
3295
0
        cp_to = cp+j;
3296
3297
0
        if (cp_to - seq != end-start+1) {
3298
0
            hts_log_error("Malformed reference file");
3299
0
            free(seq);
3300
0
            return NULL;
3301
0
        }
3302
0
    } else {
3303
0
        int i;
3304
0
        for (i = 0; i < len; i++) {
3305
0
            seq[i] = toupper_c(seq[i]);
3306
0
        }
3307
0
    }
3308
3309
0
    return seq;
3310
0
}
3311
3312
/*
3313
 * Load the entire reference 'id'.
3314
 * This also increments the reference count by 1.
3315
 *
3316
 * Returns ref_entry on success;
3317
 *         NULL on failure
3318
 */
3319
0
ref_entry *cram_ref_load(refs_t *r, int id, int is_md5) {
3320
0
    ref_entry *e = r->ref_id[id];
3321
0
    hts_pos_t start = 1, end = e->length;
3322
0
    char *seq;
3323
3324
0
    if (e->seq) {
3325
0
        return e;
3326
0
    }
3327
3328
0
    assert(e->count == 0);
3329
3330
0
    if (r->last) {
3331
#ifdef REF_DEBUG
3332
        int idx = 0;
3333
        for (idx = 0; idx < r->nref; idx++)
3334
            if (r->last == r->ref_id[idx])
3335
                break;
3336
        RP("%d cram_ref_load DECR %d\n", gettid(), idx);
3337
#endif
3338
0
        assert(r->last->count > 0);
3339
0
        if (--r->last->count <= 0) {
3340
0
            RP("%d FREE REF %d (%p)\n", gettid(), id, r->ref_id[id]->seq);
3341
0
            if (r->last->seq)
3342
0
                ref_entry_free_seq(r->last);
3343
0
        }
3344
0
    }
3345
3346
0
    if (!r->fn)
3347
0
        return NULL;
3348
3349
    /* Open file if it's not already the current open reference */
3350
0
    if (strcmp(r->fn, e->fn) || r->fp == NULL) {
3351
0
        if (r->fp)
3352
0
            if (bgzf_close(r->fp) != 0)
3353
0
                return NULL;
3354
0
        r->fn = e->fn;
3355
0
        if (!(r->fp = bgzf_open_ref(r->fn, "r", is_md5)))
3356
0
            return NULL;
3357
0
    }
3358
3359
0
    RP("%d Loading ref %d (%d..%d)\n", gettid(), id, start, end);
3360
3361
0
    if (!(seq = load_ref_portion(r->fp, e, start, end))) {
3362
0
        return NULL;
3363
0
    }
3364
3365
0
    RP("%d Loaded ref %d (%d..%d) = %p\n", gettid(), id, start, end, seq);
3366
3367
0
    RP("%d INC REF %d, %"PRId64"\n", gettid(), id, (e->count+1));
3368
0
    e->seq = seq;
3369
0
    e->mf = NULL;
3370
0
    e->count++;
3371
3372
    /*
3373
     * Also keep track of last used ref so incr/decr loops on the same
3374
     * sequence don't cause load/free loops.
3375
     */
3376
0
    RP("%d cram_ref_load INCR %d => %"PRId64"\n", gettid(), id, e->count+1);
3377
0
    r->last = e;
3378
0
    e->count++;
3379
3380
0
    return e;
3381
0
}
3382
3383
/*
3384
 * Returns a portion of a reference sequence from start to end inclusive.
3385
 * The returned pointer is owned by either the cram_file fd or by the
3386
 * internal refs_t structure and should not be freed  by the caller.
3387
 *
3388
 * The difference is whether or not this refs_t is in use by just the one
3389
 * cram_fd or by multiples, or whether we have multiple threads accessing
3390
 * references. In either case fd->shared will be true and we start using
3391
 * reference counting to track the number of users of a specific reference
3392
 * sequence.
3393
 *
3394
 * Otherwise the ref seq returned is allocated as part of cram_fd itself
3395
 * and will be freed up on the next call to cram_get_ref or cram_close.
3396
 *
3397
 * To return the entire reference sequence, specify start as 1 and end
3398
 * as 0.
3399
 *
3400
 * To cease using a reference, call cram_ref_decr().
3401
 *
3402
 * Returns reference on success,
3403
 *         NULL on failure
3404
 */
3405
3.42k
char *cram_get_ref(cram_fd *fd, int id, hts_pos_t start, hts_pos_t end) {
3406
3.42k
    ref_entry *r;
3407
3.42k
    char *seq;
3408
3.42k
    int ostart = start;
3409
3410
3.42k
    if (id == -1 || start < 1)
3411
1.83k
        return NULL;
3412
3413
    /* FIXME: axiomatic query of r->seq being true?
3414
     * Or shortcut for unsorted data where we load once and never free?
3415
     */
3416
3417
    //fd->shared_ref = 1; // hard code for now to simplify things
3418
3419
1.59k
    pthread_mutex_lock(&fd->ref_lock);
3420
3421
1.59k
    RP("%d cram_get_ref on fd %p, id %d, range %d..%d\n", gettid(), fd, id, start, end);
3422
3423
    /*
3424
     * Unsorted data implies we want to fetch an entire reference at a time.
3425
     * We just deal with this at the moment by claiming we're sharing
3426
     * references instead, which has the same requirement.
3427
     */
3428
1.59k
    if (fd->unsorted)
3429
0
        fd->shared_ref = 1;
3430
3431
3432
    /* Sanity checking: does this ID exist? */
3433
1.59k
    if (id >= fd->refs->nref) {
3434
0
        hts_log_error("No reference found for id %d", id);
3435
0
        pthread_mutex_unlock(&fd->ref_lock);
3436
0
        return NULL;
3437
0
    }
3438
3439
1.59k
    if (!fd->refs || !fd->refs->ref_id[id]) {
3440
0
        hts_log_error("No reference found for id %d", id);
3441
0
        pthread_mutex_unlock(&fd->ref_lock);
3442
0
        return NULL;
3443
0
    }
3444
3445
1.59k
    if (!(r = fd->refs->ref_id[id])) {
3446
0
        hts_log_error("No reference found for id %d", id);
3447
0
        pthread_mutex_unlock(&fd->ref_lock);
3448
0
        return NULL;
3449
0
    }
3450
3451
3452
    /*
3453
     * It has an entry, but may not have been populated yet.
3454
     * Any manually loaded .fai files have their lengths known.
3455
     * A ref entry computed from @SQ lines (M5 or UR field) will have
3456
     * r->length == 0 unless it's been loaded once and verified that we have
3457
     * an on-disk filename for it.
3458
     *
3459
     * 19 Sep 2013: Moved the lock here as the cram_populate_ref code calls
3460
     * open_path_mfile and libcurl, which isn't multi-thread safe unless I
3461
     * rewrite my code to have one curl handle per thread.
3462
     */
3463
1.59k
    pthread_mutex_lock(&fd->refs->lock);
3464
1.59k
    if (r->length == 0) {
3465
1.59k
        if (fd->ref_fn)
3466
0
            hts_log_warning("Reference file given, but ref '%s' not present",
3467
1.59k
                            r->name);
3468
1.59k
        if (cram_populate_ref(fd, id, r) == -1) {
3469
1.59k
            hts_log_warning("Failed to populate reference \"%s\"",
3470
1.59k
                            r->name);
3471
1.59k
            hts_log_warning("See https://www.htslib.org/doc/reference_seqs.html for further suggestions");
3472
1.59k
            pthread_mutex_unlock(&fd->refs->lock);
3473
1.59k
            pthread_mutex_unlock(&fd->ref_lock);
3474
1.59k
            return NULL;
3475
1.59k
        }
3476
0
        r = fd->refs->ref_id[id];
3477
0
        if (fd->unsorted)
3478
0
            cram_ref_incr_locked(fd->refs, id);
3479
0
    }
3480
3481
3482
    /*
3483
     * We now know that we the filename containing the reference, so check
3484
     * for limits. If it's over half the reference we'll load all of it in
3485
     * memory as this will speed up subsequent calls.
3486
     */
3487
0
    if (end < 1)
3488
0
        end = r->length;
3489
0
    if (end >= r->length)
3490
0
        end  = r->length;
3491
3492
0
    if (end - start >= 0.5*r->length || fd->shared_ref) {
3493
0
        start = 1;
3494
0
        end = r->length;
3495
0
    }
3496
3497
    /*
3498
     * Maybe we have it cached already? If so use it.
3499
     *
3500
     * Alternatively if we don't have the sequence but we're sharing
3501
     * references and/or are asking for the entire length of it, then
3502
     * load the full reference into the refs structure and return
3503
     * a pointer to that one instead.
3504
     */
3505
0
    if (fd->shared_ref || r->seq || (start == 1 && end == r->length)) {
3506
0
        char *cp;
3507
3508
0
        if (id >= 0) {
3509
0
            if (r->seq) {
3510
0
                cram_ref_incr_locked(fd->refs, id);
3511
0
            } else {
3512
0
                ref_entry *e;
3513
0
                if (!(e = cram_ref_load(fd->refs, id, r->is_md5))) {
3514
0
                    pthread_mutex_unlock(&fd->refs->lock);
3515
0
                    pthread_mutex_unlock(&fd->ref_lock);
3516
0
                    return NULL;
3517
0
                }
3518
3519
                /* unsorted data implies cache ref indefinitely, to avoid
3520
                 * continually loading and unloading.
3521
                 */
3522
0
                if (fd->unsorted)
3523
0
                    cram_ref_incr_locked(fd->refs, id);
3524
0
            }
3525
3526
0
            fd->ref = NULL; /* We never access it directly */
3527
0
            fd->ref_start = 1;
3528
0
            fd->ref_end   = r->length;
3529
0
            fd->ref_id    = id;
3530
3531
0
            cp = fd->refs->ref_id[id]->seq + ostart-1;
3532
0
        } else {
3533
0
            fd->ref = NULL;
3534
0
            cp = NULL;
3535
0
        }
3536
3537
0
        RP("%d cram_get_ref returning for id %d, count %d\n", gettid(), id, (int)r->count);
3538
3539
0
        pthread_mutex_unlock(&fd->refs->lock);
3540
0
        pthread_mutex_unlock(&fd->ref_lock);
3541
0
        return cp;
3542
0
    }
3543
3544
    /*
3545
     * Otherwise we're not sharing, we don't have a copy of it already and
3546
     * we're only asking for a small portion of it.
3547
     *
3548
     * In this case load up just that segment ourselves, freeing any old
3549
     * small segments in the process.
3550
     */
3551
3552
    /* Unmapped ref ID */
3553
0
    if (id < 0 || !fd->refs->fn) {
3554
0
        if (fd->ref_free) {
3555
0
            free(fd->ref_free);
3556
0
            fd->ref_free = NULL;
3557
0
        }
3558
0
        fd->ref = NULL;
3559
0
        fd->ref_id = id;
3560
0
        pthread_mutex_unlock(&fd->refs->lock);
3561
0
        pthread_mutex_unlock(&fd->ref_lock);
3562
0
        return NULL;
3563
0
    }
3564
3565
    /* Open file if it's not already the current open reference */
3566
0
    if (strcmp(fd->refs->fn, r->fn) || fd->refs->fp == NULL) {
3567
0
        if (fd->refs->fp)
3568
0
            if (bgzf_close(fd->refs->fp) != 0)
3569
0
                return NULL;
3570
0
        fd->refs->fn = r->fn;
3571
0
        if (!(fd->refs->fp = bgzf_open_ref(fd->refs->fn, "r", r->is_md5))) {
3572
0
            pthread_mutex_unlock(&fd->refs->lock);
3573
0
            pthread_mutex_unlock(&fd->ref_lock);
3574
0
            return NULL;
3575
0
        }
3576
0
    }
3577
3578
0
    if (!(fd->ref = load_ref_portion(fd->refs->fp, r, start, end))) {
3579
0
        pthread_mutex_unlock(&fd->refs->lock);
3580
0
        pthread_mutex_unlock(&fd->ref_lock);
3581
0
        return NULL;
3582
0
    }
3583
3584
0
    if (fd->ref_free)
3585
0
        free(fd->ref_free);
3586
3587
0
    fd->ref_id    = id;
3588
0
    fd->ref_start = start;
3589
0
    fd->ref_end   = end;
3590
0
    fd->ref_free = fd->ref;
3591
0
    seq = fd->ref;
3592
3593
0
    pthread_mutex_unlock(&fd->refs->lock);
3594
0
    pthread_mutex_unlock(&fd->ref_lock);
3595
3596
0
    return seq ? seq + ostart - start : NULL;
3597
0
}
3598
3599
/*
3600
 * If fd has been opened for reading, it may be permitted to specify 'fn'
3601
 * as NULL and let the code auto-detect the reference by parsing the
3602
 * SAM header @SQ lines.
3603
 */
3604
0
int cram_load_reference(cram_fd *fd, char *fn) {
3605
0
    int ret = 0;
3606
3607
0
    if (fn) {
3608
0
        fd->refs = refs_load_fai(fd->refs, fn,
3609
0
                                 !(fd->embed_ref>0 && fd->mode == 'r'));
3610
0
        fn = fd->refs ? fd->refs->fn : NULL;
3611
0
        if (!fn)
3612
0
            ret = -1;
3613
0
        sanitise_SQ_lines(fd);
3614
0
    }
3615
0
    fd->ref_fn = fn;
3616
3617
0
    if ((!fd->refs || (fd->refs->nref == 0 && !fn)) && fd->header) {
3618
0
        if (fd->refs)
3619
0
            refs_free(fd->refs);
3620
0
        if (!(fd->refs = refs_create()))
3621
0
            return -1;
3622
0
        if (-1 == refs_from_header(fd))
3623
0
            return -1;
3624
0
    }
3625
3626
0
    if (fd->header)
3627
0
        if (-1 == refs2id(fd->refs, fd->header))
3628
0
            return -1;
3629
3630
0
    return ret;
3631
0
}
3632
3633
/* ----------------------------------------------------------------------
3634
 * Containers
3635
 */
3636
3637
/*
3638
 * Creates a new container, specifying the maximum number of slices
3639
 * and records permitted.
3640
 *
3641
 * Returns cram_container ptr on success
3642
 *         NULL on failure
3643
 */
3644
47.2k
cram_container *cram_new_container(int nrec, int nslice) {
3645
47.2k
    cram_container *c = calloc(1, sizeof(*c));
3646
47.2k
    enum cram_DS_ID id;
3647
3648
47.2k
    if (!c)
3649
0
        return NULL;
3650
3651
47.2k
    c->curr_ref = -2;
3652
3653
47.2k
    c->max_c_rec = nrec * nslice;
3654
47.2k
    c->curr_c_rec = 0;
3655
3656
47.2k
    c->max_rec = nrec;
3657
47.2k
    c->record_counter = 0;
3658
47.2k
    c->num_bases = 0;
3659
47.2k
    c->s_num_bases = 0;
3660
3661
47.2k
    c->max_slice = nslice;
3662
47.2k
    c->curr_slice = 0;
3663
3664
47.2k
    c->pos_sorted = 1;
3665
47.2k
    c->max_apos   = 0;
3666
47.2k
    c->multi_seq  = 0;
3667
47.2k
    c->qs_seq_orient = 1;
3668
47.2k
    c->no_ref = 0;
3669
47.2k
    c->embed_ref = -1; // automatic selection
3670
3671
47.2k
    c->bams = NULL;
3672
3673
47.2k
    if (!(c->slices = calloc(nslice != 0 ? nslice : 1, sizeof(cram_slice *))))
3674
0
        goto err;
3675
47.2k
    c->slice = NULL;
3676
3677
47.2k
    if (!(c->comp_hdr = cram_new_compression_header()))
3678
0
        goto err;
3679
47.2k
    c->comp_hdr_block = NULL;
3680
3681
1.36M
    for (id = DS_RN; id < DS_TN; id++)
3682
1.32M
        if (!(c->stats[id] = cram_stats_create())) goto err;
3683
3684
    //c->aux_B_stats = cram_stats_create();
3685
3686
47.2k
    if (!(c->tags_used = kh_init(m_tagmap)))
3687
0
        goto err;
3688
47.2k
    c->refs_used = 0;
3689
47.2k
    c->ref_free = 0;
3690
3691
47.2k
    return c;
3692
3693
0
 err:
3694
0
    if (c) {
3695
0
        if (c->slices)
3696
0
            free(c->slices);
3697
0
        free(c);
3698
0
    }
3699
0
    return NULL;
3700
47.2k
}
3701
3702
1.70k
static void free_bam_list(bam_seq_t **bams, int max_rec) {
3703
1.70k
    int i;
3704
17.0M
    for (i = 0; i < max_rec; i++)
3705
17.0M
        bam_free(bams[i]);
3706
3707
1.70k
    free(bams);
3708
1.70k
}
3709
3710
64.0k
void cram_free_container(cram_container *c) {
3711
64.0k
    enum cram_DS_ID id;
3712
64.0k
    int i;
3713
3714
64.0k
    if (!c)
3715
0
        return;
3716
3717
64.0k
    if (c->refs_used)
3718
873
        free(c->refs_used);
3719
3720
64.0k
    if (c->landmark)
3721
47.8k
        free(c->landmark);
3722
3723
64.0k
    if (c->comp_hdr)
3724
47.3k
        cram_free_compression_header(c->comp_hdr);
3725
3726
64.0k
    if (c->comp_hdr_block)
3727
45.6k
        cram_free_block(c->comp_hdr_block);
3728
3729
    // Free the slices; filled out by encoder only
3730
64.0k
    if (c->slices) {
3731
91.9k
        for (i = 0; i < c->max_slice; i++) {
3732
44.7k
            if (c->slices[i])
3733
1.70k
                cram_free_slice(c->slices[i]);
3734
44.7k
            if (c->slices[i] == c->slice)
3735
44.7k
                c->slice = NULL;
3736
44.7k
        }
3737
47.2k
        free(c->slices);
3738
47.2k
    }
3739
3740
    // Free the current slice; set by both encoder & decoder
3741
64.0k
    if (c->slice) {
3742
0
        cram_free_slice(c->slice);
3743
0
        c->slice = NULL;
3744
0
    }
3745
3746
1.85M
    for (id = DS_RN; id < DS_TN; id++)
3747
1.79M
        if (c->stats[id]) cram_stats_free(c->stats[id]);
3748
3749
    //if (c->aux_B_stats) cram_stats_free(c->aux_B_stats);
3750
3751
64.0k
    if (c->tags_used) {
3752
47.2k
        khint_t k;
3753
3754
536k
        for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
3755
489k
            if (!kh_exist(c->tags_used, k))
3756
241k
                continue;
3757
3758
248k
            cram_tag_map *tm = (cram_tag_map *)kh_val(c->tags_used, k);
3759
248k
            if (tm) {
3760
248k
                cram_codec *c = tm->codec;
3761
3762
248k
                if (c) c->free(c);
3763
3764
                // If tm->blk or tm->blk2 is set, then we haven't yet got to
3765
                // cram_encode_container which copies the blocks to s->aux_block
3766
                // and NULLifies tm->blk*.  In this case we failed to complete
3767
                // the container construction, so we have to free up our partially
3768
                // converted CRAM.
3769
248k
                cram_free_block(tm->blk);
3770
248k
                cram_free_block(tm->blk2);
3771
248k
                free(tm);
3772
248k
            }
3773
248k
        }
3774
3775
47.2k
        kh_destroy(m_tagmap, c->tags_used);
3776
47.2k
    }
3777
3778
64.0k
    if (c->ref_free)
3779
21.4k
        free(c->ref);
3780
3781
64.0k
    if (c->bams)
3782
166
        free_bam_list(c->bams, c->max_c_rec);
3783
3784
64.0k
    free(c);
3785
64.0k
}
3786
3787
/*
3788
 * Reads a container header.
3789
 *
3790
 * Returns cram_container on success
3791
 *         NULL on failure or no container left (fd->err == 0).
3792
 */
3793
16.8k
cram_container *cram_read_container(cram_fd *fd) {
3794
16.8k
    cram_container c2, *c;
3795
16.8k
    int i, s;
3796
16.8k
    size_t rd = 0;
3797
16.8k
    uint32_t crc = 0;
3798
3799
16.8k
    fd->err = 0;
3800
16.8k
    fd->eof = 0;
3801
3802
16.8k
    memset(&c2, 0, sizeof(c2));
3803
16.8k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
3804
13.5k
        if ((s = fd->vv.varint_decode32_crc(fd, &c2.length, &crc)) == -1) {
3805
0
            fd->eof = fd->empty_container ? 1 : 2;
3806
0
            return NULL;
3807
13.5k
        } else {
3808
13.5k
            rd+=s;
3809
13.5k
        }
3810
13.5k
    } else if (CRAM_MAJOR_VERS(fd->version) < 4) {
3811
931
        uint32_t len;
3812
931
        if ((s = int32_decode(fd, &c2.length)) == -1) {
3813
3
            if (CRAM_MAJOR_VERS(fd->version) == 2 &&
3814
0
                CRAM_MINOR_VERS(fd->version) == 0)
3815
0
                fd->eof = 1; // EOF blocks arrived in v2.1
3816
3
            else
3817
3
                fd->eof = fd->empty_container ? 1 : 2;
3818
3
            return NULL;
3819
928
        } else {
3820
928
            rd+=s;
3821
928
        }
3822
928
        len = le_int4(c2.length);
3823
928
        crc = crc32(0L, (unsigned char *)&len, 4);
3824
2.39k
    } else {
3825
2.39k
        if ((s = fd->vv.varint_decode32_crc(fd, &c2.length, &crc))   == -1) {
3826
0
            fd->eof = fd->empty_container ? 1 : 2;
3827
0
            return NULL;
3828
2.39k
        } else {
3829
2.39k
            rd+=s;
3830
2.39k
        }
3831
2.39k
    }
3832
16.8k
    if ((s = fd->vv.varint_decode32s_crc(fd, &c2.ref_seq_id, &crc))   == -1) return NULL; else rd+=s;
3833
16.8k
    if (CRAM_MAJOR_VERS(fd->version) >= 4) {
3834
2.39k
        int64_t i64;
3835
2.39k
        if ((s = fd->vv.varint_decode64_crc(fd, &i64, &crc))== -1) return NULL; else rd+=s;
3836
2.39k
        c2.ref_seq_start = i64;
3837
2.39k
        if ((s = fd->vv.varint_decode64_crc(fd, &i64, &crc)) == -1) return NULL; else rd+=s;
3838
2.39k
        c2.ref_seq_span = i64;
3839
14.4k
    } else {
3840
14.4k
        int32_t i32;
3841
14.4k
        if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc))== -1) return NULL; else rd+=s;
3842
14.4k
        c2.ref_seq_start = i32;
3843
14.4k
        if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc)) == -1) return NULL; else rd+=s;
3844
14.4k
        c2.ref_seq_span = i32;
3845
14.4k
    }
3846
16.8k
    if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_records, &crc))  == -1) return NULL; else rd+=s;
3847
3848
16.8k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
3849
13.5k
        c2.record_counter = 0;
3850
13.5k
        c2.num_bases = 0;
3851
13.5k
    } else {
3852
3.32k
        if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3853
3.31k
            if ((s = fd->vv.varint_decode64_crc(fd, &c2.record_counter, &crc)) == -1)
3854
0
                return NULL;
3855
3.31k
            else
3856
3.31k
                rd += s;
3857
3.31k
        } else {
3858
8
            int32_t i32;
3859
8
            if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc)) == -1)
3860
0
                return NULL;
3861
8
            else
3862
8
                rd += s;
3863
8
            c2.record_counter = i32;
3864
8
        }
3865
3866
3.32k
        if ((s = fd->vv.varint_decode64_crc(fd, &c2.num_bases, &crc))== -1)
3867
0
            return NULL;
3868
3.32k
        else
3869
3.32k
            rd += s;
3870
3.32k
    }
3871
16.8k
    if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_blocks, &crc))   == -1)
3872
0
        return NULL;
3873
16.8k
    else
3874
16.8k
        rd+=s;
3875
16.8k
    if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_landmarks, &crc))== -1)
3876
0
        return NULL;
3877
16.8k
    else
3878
16.8k
        rd+=s;
3879
3880
16.8k
    if (c2.num_landmarks < 0 || c2.num_landmarks >= SIZE_MAX / sizeof(int32_t))
3881
18
        return NULL;
3882
3883
16.8k
    if (!(c = calloc(1, sizeof(*c))))
3884
0
        return NULL;
3885
3886
16.8k
    *c = c2;
3887
16.8k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
3888
16.8k
    if (c->num_landmarks > FUZZ_ALLOC_LIMIT/sizeof(int32_t)) {
3889
0
        fd->err = errno = ENOMEM;
3890
0
        cram_free_container(c);
3891
0
        return NULL;
3892
0
    }
3893
16.8k
#endif
3894
16.8k
    if (c->num_landmarks && !(c->landmark = malloc(c->num_landmarks * sizeof(int32_t)))) {
3895
0
        fd->err = errno;
3896
0
        cram_free_container(c);
3897
0
        return NULL;
3898
0
    }
3899
72.3k
    for (i = 0; i < c->num_landmarks; i++) {
3900
55.5k
        if ((s = fd->vv.varint_decode32_crc(fd, &c->landmark[i], &crc)) == -1) {
3901
24
            cram_free_container(c);
3902
24
            return NULL;
3903
55.5k
        } else {
3904
55.5k
            rd += s;
3905
55.5k
        }
3906
55.5k
    }
3907
3908
16.7k
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3909
3.29k
        if (-1 == int32_decode(fd, (int32_t *)&c->crc32)) {
3910
0
            cram_free_container(c);
3911
0
            return NULL;
3912
3.29k
        } else {
3913
3.29k
            rd+=4;
3914
3.29k
        }
3915
3916
3.29k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
3917
        // Pretend the CRC was OK so the fuzzer doesn't have to get it right
3918
3.29k
        crc = c->crc32;
3919
3.29k
#endif
3920
3921
3.29k
        if (crc != c->crc32) {
3922
0
            hts_log_error("Container header CRC32 failure");
3923
0
            cram_free_container(c);
3924
0
            return NULL;
3925
0
        }
3926
3.29k
    }
3927
3928
16.7k
    c->offset = rd;
3929
16.7k
    c->slices = NULL;
3930
16.7k
    c->slice = NULL;
3931
16.7k
    c->curr_slice = 0;
3932
16.7k
    c->max_slice = c->num_landmarks;
3933
16.7k
    c->slice_rec = 0;
3934
16.7k
    c->curr_rec = 0;
3935
16.7k
    c->max_rec = 0;
3936
3937
16.7k
    if (c->ref_seq_id == -2) {
3938
0
        c->multi_seq = 1;
3939
0
        fd->multi_seq = 1;
3940
0
    }
3941
3942
16.7k
    fd->empty_container =
3943
16.7k
        (c->num_records == 0 &&
3944
15.4k
         c->ref_seq_id == -1 &&
3945
16.7k
         c->ref_seq_start == 0x454f46 /* EOF */) ? 1 : 0;
3946
3947
16.7k
    return c;
3948
16.7k
}
3949
3950
3951
/* MAXIMUM storage size needed for the container. */
3952
0
int cram_container_size(cram_container *c) {
3953
0
    return 55 + 5*c->num_landmarks;
3954
0
}
3955
3956
3957
/*
3958
 * Stores the container structure in dat and returns *size as the
3959
 * number of bytes written to dat[].  The input size of dat is also
3960
 * held in *size and should be initialised to cram_container_size(c).
3961
 *
3962
 * Returns 0 on success;
3963
 *        -1 on failure
3964
 */
3965
int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size)
3966
0
{
3967
0
    char *cp = (char *)dat;
3968
0
    int i;
3969
3970
    // Check the input buffer is large enough according to our stated
3971
    // requirements. (NOTE: it may actually take less.)
3972
0
    if (cram_container_size(c) > *size)
3973
0
        return -1;
3974
3975
0
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
3976
0
        cp += itf8_put(cp, c->length);
3977
0
    } else {
3978
0
        *(int32_t *)cp = le_int4(c->length);
3979
0
        cp += 4;
3980
0
    }
3981
0
    if (c->multi_seq) {
3982
0
        cp += fd->vv.varint_put32(cp, NULL, -2);
3983
0
        cp += fd->vv.varint_put32(cp, NULL, 0);
3984
0
        cp += fd->vv.varint_put32(cp, NULL, 0);
3985
0
    } else {
3986
0
        cp += fd->vv.varint_put32s(cp, NULL, c->ref_seq_id);
3987
0
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
3988
0
            cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_start);
3989
0
            cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_span);
3990
0
        } else {
3991
0
            cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_start);
3992
0
            cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_span);
3993
0
        }
3994
0
    }
3995
0
    cp += fd->vv.varint_put32(cp, NULL, c->num_records);
3996
0
    if (CRAM_MAJOR_VERS(fd->version) == 2) {
3997
0
        cp += fd->vv.varint_put64(cp, NULL, c->record_counter);
3998
0
    } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3999
0
        cp += fd->vv.varint_put32(cp, NULL, c->record_counter);
4000
0
    }
4001
0
    cp += fd->vv.varint_put64(cp, NULL, c->num_bases);
4002
0
    cp += fd->vv.varint_put32(cp, NULL, c->num_blocks);
4003
0
    cp += fd->vv.varint_put32(cp, NULL, c->num_landmarks);
4004
0
    for (i = 0; i < c->num_landmarks; i++)
4005
0
        cp += fd->vv.varint_put32(cp, NULL, c->landmark[i]);
4006
4007
0
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
4008
0
        c->crc32 = crc32(0L, (uc *)dat, cp-dat);
4009
0
        cp[0] =  c->crc32        & 0xff;
4010
0
        cp[1] = (c->crc32 >>  8) & 0xff;
4011
0
        cp[2] = (c->crc32 >> 16) & 0xff;
4012
0
        cp[3] = (c->crc32 >> 24) & 0xff;
4013
0
        cp += 4;
4014
0
    }
4015
4016
0
    *size = cp-dat; // actual used size
4017
4018
0
    return 0;
4019
0
}
4020
4021
4022
/*
4023
 * Writes a container structure.
4024
 *
4025
 * Returns 0 on success
4026
 *        -1 on failure
4027
 */
4028
49.6k
int cram_write_container(cram_fd *fd, cram_container *c) {
4029
49.6k
    char buf_a[1024], *buf = buf_a, *cp;
4030
49.6k
    int i;
4031
4032
49.6k
    if (61 + c->num_landmarks * 10 >= 1024) {
4033
0
        buf = malloc(61 + c->num_landmarks * 10);
4034
0
        if (!buf)
4035
0
            return -1;
4036
0
    }
4037
49.6k
    cp = buf;
4038
4039
49.6k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
4040
0
        cp += itf8_put(cp, c->length);
4041
49.6k
    } else if (CRAM_MAJOR_VERS(fd->version) <= 3) {
4042
49.6k
        *(int32_t *)cp = le_int4(c->length);
4043
49.6k
        cp += 4;
4044
49.6k
    } else {
4045
0
        cp += fd->vv.varint_put32(cp, NULL, c->length);
4046
0
    }
4047
49.6k
    if (c->multi_seq) {
4048
871
        cp += fd->vv.varint_put32(cp, NULL, (uint32_t)-2);
4049
871
        cp += fd->vv.varint_put32(cp, NULL, 0);
4050
871
        cp += fd->vv.varint_put32(cp, NULL, 0);
4051
48.7k
    } else {
4052
48.7k
        cp += fd->vv.varint_put32s(cp, NULL, c->ref_seq_id);
4053
48.7k
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
4054
0
            cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_start);
4055
0
            cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_span);
4056
48.7k
        } else {
4057
48.7k
            cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_start);
4058
48.7k
            cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_span);
4059
48.7k
        }
4060
48.7k
    }
4061
49.6k
    cp += fd->vv.varint_put32(cp, NULL, c->num_records);
4062
49.6k
    if (CRAM_MAJOR_VERS(fd->version) >= 3)
4063
49.6k
        cp += fd->vv.varint_put64(cp, NULL, c->record_counter);
4064
0
    else
4065
0
        cp += fd->vv.varint_put32(cp, NULL, c->record_counter);
4066
49.6k
    cp += fd->vv.varint_put64(cp, NULL, c->num_bases);
4067
49.6k
    cp += fd->vv.varint_put32(cp, NULL, c->num_blocks);
4068
49.6k
    cp += fd->vv.varint_put32(cp, NULL, c->num_landmarks);
4069
99.1k
    for (i = 0; i < c->num_landmarks; i++)
4070
49.5k
        cp += fd->vv.varint_put32(cp, NULL, c->landmark[i]);
4071
4072
49.6k
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
4073
49.6k
        c->crc32 = crc32(0L, (uc *)buf, cp-buf);
4074
49.6k
        cp[0] =  c->crc32        & 0xff;
4075
49.6k
        cp[1] = (c->crc32 >>  8) & 0xff;
4076
49.6k
        cp[2] = (c->crc32 >> 16) & 0xff;
4077
49.6k
        cp[3] = (c->crc32 >> 24) & 0xff;
4078
49.6k
        cp += 4;
4079
49.6k
    }
4080
4081
49.6k
    if (cp-buf != hwrite(fd->fp, buf, cp-buf)) {
4082
0
        if (buf != buf_a)
4083
0
            free(buf);
4084
0
        return -1;
4085
0
    }
4086
4087
49.6k
    if (buf != buf_a)
4088
0
        free(buf);
4089
4090
49.6k
    return 0;
4091
49.6k
}
4092
4093
// common component shared by cram_flush_container{,_mt}
4094
44.5k
static int cram_flush_container2(cram_fd *fd, cram_container *c) {
4095
44.5k
    int i, j;
4096
4097
44.5k
    if (c->curr_slice > 0 && !c->slices)
4098
0
        return -1;
4099
4100
    //fprintf(stderr, "Writing container %d, sum %u\n", c->record_counter, sum);
4101
4102
44.5k
    off_t c_offset = htell(fd->fp); // File offset of container
4103
4104
    /* Write the container struct itself */
4105
44.5k
    if (0 != cram_write_container(fd, c))
4106
0
        return -1;
4107
4108
44.5k
    off_t hdr_size = htell(fd->fp) - c_offset;
4109
4110
    /* And the compression header */
4111
44.5k
    if (0 != cram_write_block(fd, c->comp_hdr_block))
4112
0
        return -1;
4113
4114
    /* Followed by the slice blocks */
4115
44.5k
    off_t file_offset = htell(fd->fp);
4116
89.0k
    for (i = 0; i < c->curr_slice; i++) {
4117
44.5k
        cram_slice *s = c->slices[i];
4118
44.5k
        off_t spos = file_offset - c_offset - hdr_size;
4119
4120
44.5k
        if (0 != cram_write_block(fd, s->hdr_block))
4121
0
            return -1;
4122
4123
463k
        for (j = 0; j < s->hdr->num_blocks; j++) {
4124
419k
            if (0 != cram_write_block(fd, s->block[j]))
4125
0
                return -1;
4126
419k
        }
4127
4128
44.5k
        file_offset = htell(fd->fp);
4129
44.5k
        off_t sz = file_offset - c_offset - hdr_size - spos;
4130
4131
44.5k
        if (fd->idxfp) {
4132
0
            if (cram_index_slice(fd, c, s, fd->idxfp, c_offset, spos, sz) < 0)
4133
0
                return -1;
4134
0
        }
4135
44.5k
    }
4136
4137
44.5k
    return 0;
4138
44.5k
}
4139
4140
/*
4141
 * Flushes a completely or partially full container to disk, writing
4142
 * container structure, header and blocks. This also calls the encoder
4143
 * functions.
4144
 *
4145
 * Returns 0 on success
4146
 *        -1 on failure
4147
 */
4148
44.7k
int cram_flush_container(cram_fd *fd, cram_container *c) {
4149
    /* Encode the container blocks and generate compression header */
4150
44.7k
    if (0 != cram_encode_container(fd, c))
4151
189
        return -1;
4152
4153
44.5k
    return cram_flush_container2(fd, c);
4154
44.7k
}
4155
4156
typedef struct {
4157
    cram_fd *fd;
4158
    cram_container *c;
4159
} cram_job;
4160
4161
0
void *cram_flush_thread(void *arg) {
4162
0
    cram_job *j = (cram_job *)arg;
4163
4164
    /* Encode the container blocks and generate compression header */
4165
0
    if (0 != cram_encode_container(j->fd, j->c)) {
4166
0
        hts_log_error("Call to cram_encode_container failed");
4167
0
        return NULL;
4168
0
    }
4169
4170
0
    return arg;
4171
0
}
4172
4173
0
static int cram_flush_result(cram_fd *fd) {
4174
0
    int i, ret = 0;
4175
0
    hts_tpool_result *r;
4176
0
    cram_container *lc = NULL;
4177
4178
    // NB: we can have one result per slice, not per container,
4179
    // so we need to free the container only after all slices
4180
    // within it have been freed.  (Automatic via reference counting.)
4181
0
    while ((r = hts_tpool_next_result(fd->rqueue))) {
4182
0
        cram_job *j = (cram_job *)hts_tpool_result_data(r);
4183
0
        cram_container *c;
4184
4185
0
        if (!j) {
4186
0
            hts_tpool_delete_result(r, 0);
4187
0
            return -1;
4188
0
        }
4189
4190
0
        fd = j->fd;
4191
0
        c = j->c;
4192
4193
0
        if (fd->mode == 'w')
4194
0
            if (0 != cram_flush_container2(fd, c))
4195
0
                return -1;
4196
4197
        // Free the slices; filled out by encoder only
4198
0
        if (c->slices) {
4199
0
            for (i = 0; i < c->max_slice; i++) {
4200
0
                if (c->slices[i])
4201
0
                    cram_free_slice(c->slices[i]);
4202
0
                if (c->slices[i] == c->slice)
4203
0
                    c->slice = NULL;
4204
0
                c->slices[i] = NULL;
4205
0
            }
4206
0
        }
4207
4208
        // Free the current slice; set by both encoder & decoder
4209
0
        if (c->slice) {
4210
0
            cram_free_slice(c->slice);
4211
0
            c->slice = NULL;
4212
0
        }
4213
0
        c->curr_slice = 0;
4214
4215
        // Our jobs will be in order, so we free the last
4216
        // container when our job has switched to a new one.
4217
0
        if (c != lc) {
4218
0
            if (lc) {
4219
0
                if (fd->ctr == lc)
4220
0
                    fd->ctr = NULL;
4221
0
                if (fd->ctr_mt == lc)
4222
0
                    fd->ctr_mt = NULL;
4223
0
                cram_free_container(lc);
4224
0
            }
4225
0
            lc = c;
4226
0
        }
4227
4228
0
        hts_tpool_delete_result(r, 1);
4229
0
    }
4230
0
    if (lc) {
4231
0
        if (fd->ctr == lc)
4232
0
            fd->ctr = NULL;
4233
0
        if (fd->ctr_mt == lc)
4234
0
            fd->ctr_mt = NULL;
4235
0
        cram_free_container(lc);
4236
0
    }
4237
4238
0
    return ret;
4239
0
}
4240
4241
// Note: called while metrics_lock is held.
4242
// Will be left in this state too, but may temporarily unlock.
4243
6.27k
void reset_metrics(cram_fd *fd) {
4244
6.27k
    int i;
4245
4246
6.27k
    if (fd->pool) {
4247
        // If multi-threaded we have multiple blocks being
4248
        // compressed already and several on the to-do list
4249
        // (fd->rqueue->pending).  It's tricky to reset the
4250
        // metrics exactly the correct point, so instead we
4251
        // just flush the pool, reset, and then continue again.
4252
4253
        // Don't bother starting a new trial before then though.
4254
0
        for (i = 0; i < DS_END; i++) {
4255
0
            cram_metrics *m = fd->m[i];
4256
0
            if (!m)
4257
0
                continue;
4258
0
            m->next_trial = 999;
4259
0
        }
4260
4261
0
        pthread_mutex_unlock(&fd->metrics_lock);
4262
0
        hts_tpool_process_flush(fd->rqueue);
4263
0
        pthread_mutex_lock(&fd->metrics_lock);
4264
0
    }
4265
4266
301k
    for (i = 0; i < DS_END; i++) {
4267
295k
        cram_metrics *m = fd->m[i];
4268
295k
        if (!m)
4269
0
            continue;
4270
4271
295k
        m->trial = NTRIALS;
4272
295k
        m->next_trial = TRIAL_SPAN;
4273
295k
        m->revised_method = 0;
4274
295k
        m->unpackable = 0;
4275
4276
295k
        memset(m->sz, 0, sizeof(m->sz));
4277
295k
    }
4278
6.27k
}
4279
4280
44.7k
int cram_flush_container_mt(cram_fd *fd, cram_container *c) {
4281
44.7k
    cram_job *j;
4282
4283
    // At the junction of mapped to unmapped data the compression
4284
    // methods may need to change due to very different statistical
4285
    // properties; particularly BA if minhash sorted.
4286
    //
4287
    // However with threading we'll have several in-flight blocks
4288
    // arriving out of order.
4289
    //
4290
    // So we do one trial reset of NThreads to last for NThreads
4291
    // duration to get us over this transition period, followed
4292
    // by another retrial of the usual ntrials & trial span.
4293
44.7k
    pthread_mutex_lock(&fd->metrics_lock);
4294
44.7k
    if (c->n_mapped < 0.3*c->curr_rec &&
4295
23.9k
        fd->last_mapped > 0.7*c->max_rec) {
4296
6.27k
        reset_metrics(fd);
4297
6.27k
    }
4298
44.7k
    fd->last_mapped = c->n_mapped * (c->max_rec+1)/(c->curr_rec+1) ;
4299
44.7k
    pthread_mutex_unlock(&fd->metrics_lock);
4300
4301
44.7k
    if (!fd->pool)
4302
44.7k
        return cram_flush_container(fd, c);
4303
4304
0
    if (!(j = malloc(sizeof(*j))))
4305
0
        return -1;
4306
0
    j->fd = fd;
4307
0
    j->c = c;
4308
4309
    // Flush the job.  Note our encoder queue may be full, so we
4310
    // either have to keep trying in non-blocking mode (what we do) or
4311
    // use a dedicated separate thread for draining the queue.
4312
0
    for (;;) {
4313
0
        errno = 0;
4314
0
        hts_tpool_dispatch2(fd->pool, fd->rqueue, cram_flush_thread, j, 1);
4315
0
        int pending = (errno == EAGAIN);
4316
0
        if (cram_flush_result(fd) != 0)
4317
0
            return -1;
4318
0
        if (!pending)
4319
0
            break;
4320
4321
0
        hts_usleep(1000);
4322
0
    }
4323
4324
0
    return 0;
4325
0
}
4326
4327
/* ----------------------------------------------------------------------
4328
 * Compression headers; the first part of the container
4329
 */
4330
4331
/*
4332
 * Creates a new blank container compression header
4333
 *
4334
 * Returns header ptr on success
4335
 *         NULL on failure
4336
 */
4337
47.2k
cram_block_compression_hdr *cram_new_compression_header(void) {
4338
47.2k
    cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr));
4339
47.2k
    if (!hdr)
4340
0
        return NULL;
4341
4342
47.2k
    if (!(hdr->TD_blk = cram_new_block(CORE, 0))) {
4343
0
        free(hdr);
4344
0
        return NULL;
4345
0
    }
4346
4347
47.2k
    if (!(hdr->TD_hash = kh_init(m_s2i))) {
4348
0
        cram_free_block(hdr->TD_blk);
4349
0
        free(hdr);
4350
0
        return NULL;
4351
0
    }
4352
4353
47.2k
    if (!(hdr->TD_keys = string_pool_create(8192))) {
4354
0
        kh_destroy(m_s2i, hdr->TD_hash);
4355
0
        cram_free_block(hdr->TD_blk);
4356
0
        free(hdr);
4357
0
        return NULL;
4358
0
    }
4359
4360
47.2k
    return hdr;
4361
47.2k
}
4362
4363
47.9k
void cram_free_compression_header(cram_block_compression_hdr *hdr) {
4364
47.9k
    int i;
4365
4366
47.9k
    if (hdr->landmark)
4367
689
        free(hdr->landmark);
4368
4369
47.9k
    if (hdr->preservation_map)
4370
45.2k
        kh_destroy(map, hdr->preservation_map);
4371
4372
1.58M
    for (i = 0; i < CRAM_MAP_HASH; i++) {
4373
1.53M
        cram_map *m, *m2;
4374
1.55M
        for (m = hdr->rec_encoding_map[i]; m; m = m2) {
4375
22.3k
            m2 = m->next;
4376
22.3k
            if (m->codec)
4377
0
                m->codec->free(m->codec);
4378
22.3k
            free(m);
4379
22.3k
        }
4380
1.53M
    }
4381
4382
1.58M
    for (i = 0; i < CRAM_MAP_HASH; i++) {
4383
1.53M
        cram_map *m, *m2;
4384
1.53M
        for (m = hdr->tag_encoding_map[i]; m; m = m2) {
4385
701
            m2 = m->next;
4386
701
            if (m->codec)
4387
701
                m->codec->free(m->codec);
4388
701
            free(m);
4389
701
        }
4390
1.53M
    }
4391
4392
2.29M
    for (i = 0; i < DS_END; i++) {
4393
2.25M
        if (hdr->codecs[i])
4394
832k
            hdr->codecs[i]->free(hdr->codecs[i]);
4395
2.25M
    }
4396
4397
47.9k
    if (hdr->TL)
4398
56
        free(hdr->TL);
4399
47.9k
    if (hdr->TD_blk)
4400
47.2k
        cram_free_block(hdr->TD_blk);
4401
47.9k
    if (hdr->TD_hash)
4402
47.2k
        kh_destroy(m_s2i, hdr->TD_hash);
4403
47.9k
    if (hdr->TD_keys)
4404
47.2k
        string_pool_destroy(hdr->TD_keys);
4405
4406
47.9k
    free(hdr);
4407
47.9k
}
4408
4409
4410
/* ----------------------------------------------------------------------
4411
 * Slices and slice headers
4412
 */
4413
4414
45.2k
void cram_free_slice_header(cram_block_slice_hdr *hdr) {
4415
45.2k
    if (!hdr)
4416
0
        return;
4417
4418
45.2k
    if (hdr->block_content_ids)
4419
45.0k
        free(hdr->block_content_ids);
4420
4421
45.2k
    free(hdr);
4422
4423
45.2k
    return;
4424
45.2k
}
4425
4426
45.2k
void cram_free_slice(cram_slice *s) {
4427
45.2k
    if (!s)
4428
0
        return;
4429
4430
45.2k
    if (s->hdr_block)
4431
45.0k
        cram_free_block(s->hdr_block);
4432
4433
45.2k
    if (s->block) {
4434
45.0k
        int i;
4435
4436
45.0k
        if (s->hdr) {
4437
466k
            for (i = 0; i < s->hdr->num_blocks; i++) {
4438
421k
                if (i > 0 && s->block[i] == s->block[0])
4439
1.66k
                    continue;
4440
420k
                cram_free_block(s->block[i]);
4441
420k
            }
4442
45.0k
        }
4443
45.0k
        free(s->block);
4444
45.0k
    }
4445
4446
45.2k
    {
4447
        // Normally already copied into s->block[], but potentially still
4448
        // here if we error part way through cram_encode_slice.
4449
45.2k
        int i;
4450
293k
        for (i = 0; i < s->naux_block; i++)
4451
247k
            cram_free_block(s->aux_block[i]);
4452
45.2k
    }
4453
4454
45.2k
    if (s->block_by_id)
4455
492
        free(s->block_by_id);
4456
4457
45.2k
    if (s->hdr)
4458
45.2k
        cram_free_slice_header(s->hdr);
4459
4460
45.2k
    if (s->seqs_blk)
4461
45.2k
        cram_free_block(s->seqs_blk);
4462
4463
45.2k
    if (s->qual_blk)
4464
673
        cram_free_block(s->qual_blk);
4465
4466
45.2k
    if (s->name_blk)
4467
673
        cram_free_block(s->name_blk);
4468
4469
45.2k
    if (s->aux_blk)
4470
45.2k
        cram_free_block(s->aux_blk);
4471
4472
45.2k
    if (s->base_blk)
4473
673
        cram_free_block(s->base_blk);
4474
4475
45.2k
    if (s->soft_blk)
4476
673
        cram_free_block(s->soft_blk);
4477
4478
45.2k
    if (s->cigar)
4479
45.2k
        free(s->cigar);
4480
4481
45.2k
    if (s->crecs)
4482
45.2k
        free(s->crecs);
4483
4484
45.2k
    if (s->features)
4485
20.7k
        free(s->features);
4486
4487
45.2k
    if (s->TN)
4488
0
        free(s->TN);
4489
4490
45.2k
    if (s->pair_keys)
4491
44.7k
        string_pool_destroy(s->pair_keys);
4492
4493
45.2k
    if (s->pair[0])
4494
44.7k
        kh_destroy(m_s2i, s->pair[0]);
4495
45.2k
    if (s->pair[1])
4496
44.7k
        kh_destroy(m_s2i, s->pair[1]);
4497
4498
45.2k
    if (s->aux_block)
4499
39.7k
        free(s->aux_block);
4500
4501
45.2k
    free(s);
4502
45.2k
}
4503
4504
/*
4505
 * Creates a new empty slice in memory, for subsequent writing to
4506
 * disk.
4507
 *
4508
 * Returns cram_slice ptr on success
4509
 *         NULL on failure
4510
 */
4511
44.7k
cram_slice *cram_new_slice(enum cram_content_type type, int nrecs) {
4512
44.7k
    cram_slice *s = calloc(1, sizeof(*s));
4513
44.7k
    if (!s)
4514
0
        return NULL;
4515
4516
44.7k
    if (!(s->hdr = (cram_block_slice_hdr *)calloc(1, sizeof(*s->hdr))))
4517
0
        goto err;
4518
44.7k
    s->hdr->content_type = type;
4519
4520
44.7k
    s->hdr_block = NULL;
4521
44.7k
    s->block = NULL;
4522
44.7k
    s->block_by_id = NULL;
4523
44.7k
    s->last_apos = 0;
4524
44.7k
    if (!(s->crecs = malloc(nrecs * sizeof(cram_record))))  goto err;
4525
44.7k
    s->cigar_alloc = 1024;
4526
44.7k
    if (!(s->cigar = malloc(s->cigar_alloc * sizeof(*s->cigar)))) goto err;
4527
44.7k
    s->ncigar = 0;
4528
4529
44.7k
    if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0)))       goto err;
4530
44.7k
    if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS)))   goto err;
4531
44.7k
    if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN)))   goto err;
4532
44.7k
    if (!(s->aux_blk  = cram_new_block(EXTERNAL, DS_aux)))  goto err;
4533
44.7k
    if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN)))   goto err;
4534
44.7k
    if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC)))   goto err;
4535
4536
44.7k
    s->features = NULL;
4537
44.7k
    s->nfeatures = s->afeatures = 0;
4538
4539
44.7k
#ifndef TN_external
4540
44.7k
    s->TN = NULL;
4541
44.7k
    s->nTN = s->aTN = 0;
4542
44.7k
#endif
4543
4544
    // Volatile keys as we do realloc in dstring
4545
44.7k
    if (!(s->pair_keys = string_pool_create(8192))) goto err;
4546
44.7k
    if (!(s->pair[0] = kh_init(m_s2i)))             goto err;
4547
44.7k
    if (!(s->pair[1] = kh_init(m_s2i)))             goto err;
4548
4549
#ifdef BA_external
4550
    s->BA_len = 0;
4551
#endif
4552
4553
44.7k
    return s;
4554
4555
0
 err:
4556
0
    if (s)
4557
0
        cram_free_slice(s);
4558
4559
0
    return NULL;
4560
44.7k
}
4561
4562
/*
4563
 * Loads an entire slice.
4564
 * FIXME: In 1.0 the native unit of slices within CRAM is broken
4565
 * as slices contain references to objects in other slices.
4566
 * To work around this while keeping the slice oriented outer loop
4567
 * we read all slices and stitch them together into a fake large
4568
 * slice instead.
4569
 *
4570
 * Returns cram_slice ptr on success
4571
 *         NULL on failure
4572
 */
4573
526
cram_slice *cram_read_slice(cram_fd *fd) {
4574
526
    cram_block *b = cram_read_block(fd);
4575
526
    cram_slice *s = calloc(1, sizeof(*s));
4576
526
    int i, n, max_id, min_id;
4577
4578
526
    if (!b || !s)
4579
5
        goto err;
4580
4581
521
    s->hdr_block = b;
4582
521
    switch (b->content_type) {
4583
504
    case MAPPED_SLICE:
4584
508
    case UNMAPPED_SLICE:
4585
508
        if (!(s->hdr = cram_decode_slice_header(fd, b)))
4586
4
            goto err;
4587
504
        break;
4588
4589
504
    default:
4590
13
        hts_log_error("Unexpected block of type %s",
4591
13
                      cram_content_type2str(b->content_type));
4592
13
        goto err;
4593
521
    }
4594
4595
504
    if (s->hdr->num_blocks < 1) {
4596
0
        hts_log_error("Slice does not include any data blocks");
4597
0
        goto err;
4598
0
    }
4599
4600
504
    s->block = calloc(n = s->hdr->num_blocks, sizeof(*s->block));
4601
504
    if (!s->block)
4602
0
        goto err;
4603
4604
2.83k
    for (max_id = i = 0, min_id = INT_MAX; i < n; i++) {
4605
2.34k
        if (!(s->block[i] = cram_read_block(fd)))
4606
12
            goto err;
4607
4608
2.33k
        if (s->block[i]->content_type == EXTERNAL) {
4609
537
            if (max_id < s->block[i]->content_id)
4610
0
                max_id = s->block[i]->content_id;
4611
537
            if (min_id > s->block[i]->content_id)
4612
492
                min_id = s->block[i]->content_id;
4613
537
        }
4614
2.33k
    }
4615
4616
492
    if (!(s->block_by_id = calloc(512, sizeof(s->block[0]))))
4617
0
        goto err;
4618
4619
2.74k
    for (i = 0; i < n; i++) {
4620
2.25k
        if (s->block[i]->content_type != EXTERNAL)
4621
1.72k
            continue;
4622
528
        uint32_t v = s->block[i]->content_id;
4623
528
        if (v >= 256)
4624
0
            v = 256 + v % 251;
4625
528
        s->block_by_id[v] = s->block[i];
4626
528
    }
4627
4628
    /* Initialise encoding/decoding tables */
4629
492
    s->cigar_alloc = 1024;
4630
492
    if (!(s->cigar = malloc(s->cigar_alloc * sizeof(*s->cigar)))) goto err;
4631
492
    s->ncigar = 0;
4632
4633
492
    if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0)))      goto err;
4634
492
    if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS)))  goto err;
4635
492
    if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN)))  goto err;
4636
492
    if (!(s->aux_blk  = cram_new_block(EXTERNAL, DS_aux))) goto err;
4637
492
    if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN)))  goto err;
4638
492
    if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC)))  goto err;
4639
4640
492
    s->crecs = NULL;
4641
4642
492
    s->last_apos = s->hdr->ref_seq_start;
4643
492
    s->decode_md = fd->decode_md;
4644
4645
492
    return s;
4646
4647
34
 err:
4648
34
    if (b)
4649
29
        cram_free_block(b);
4650
34
    if (s) {
4651
34
        s->hdr_block = NULL;
4652
34
        cram_free_slice(s);
4653
34
    }
4654
34
    return NULL;
4655
492
}
4656
4657
4658
/* ----------------------------------------------------------------------
4659
 * CRAM file definition (header)
4660
 */
4661
4662
/*
4663
 * Reads a CRAM file definition structure.
4664
 * Returns file_def ptr on success
4665
 *         NULL on failure
4666
 */
4667
1.78k
cram_file_def *cram_read_file_def(cram_fd *fd) {
4668
1.78k
    cram_file_def *def = malloc(sizeof(*def));
4669
1.78k
    if (!def)
4670
0
        return NULL;
4671
4672
1.78k
    if (26 != hread(fd->fp, &def->magic[0], 26)) {
4673
0
        free(def);
4674
0
        return NULL;
4675
0
    }
4676
4677
1.78k
    if (memcmp(def->magic, "CRAM", 4) != 0) {
4678
0
        free(def);
4679
0
        return NULL;
4680
0
    }
4681
4682
1.78k
    if (def->major_version > 4) {
4683
0
        hts_log_error("CRAM version number mismatch. Expected 1.x, 2.x, 3.x or 4.x, got %d.%d",
4684
0
                      def->major_version, def->minor_version);
4685
0
        free(def);
4686
0
        return NULL;
4687
0
    }
4688
4689
1.78k
    fd->first_container += 26;
4690
1.78k
    fd->curr_position = fd->first_container;
4691
1.78k
    fd->last_slice = 0;
4692
4693
1.78k
    return def;
4694
1.78k
}
4695
4696
/*
4697
 * Writes a cram_file_def structure to cram_fd.
4698
 * Returns 0 on success
4699
 *        -1 on failure
4700
 */
4701
2.50k
int cram_write_file_def(cram_fd *fd, cram_file_def *def) {
4702
2.50k
    return (hwrite(fd->fp, &def->magic[0], 26) == 26) ? 0 : -1;
4703
2.50k
}
4704
4705
4.57k
void cram_free_file_def(cram_file_def *def) {
4706
4.57k
    if (def) free(def);
4707
4.57k
}
4708
4709
/* ----------------------------------------------------------------------
4710
 * SAM header I/O
4711
 */
4712
4713
4714
/*
4715
 * Reads the SAM header from the first CRAM data block.
4716
 * Also performs minimal parsing to extract read-group
4717
 * and sample information.
4718
4719
 * Returns SAM hdr ptr on success
4720
 *         NULL on failure
4721
 */
4722
1.78k
sam_hdr_t *cram_read_SAM_hdr(cram_fd *fd) {
4723
1.78k
    int32_t header_len;
4724
1.78k
    char *header;
4725
1.78k
    sam_hdr_t *hdr;
4726
4727
    /* 1.1 onwards stores the header in the first block of a container */
4728
1.78k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
4729
        /* Length */
4730
1.69k
        if (-1 == int32_decode(fd, &header_len))
4731
0
            return NULL;
4732
4733
1.69k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
4734
1.69k
        if (header_len > FUZZ_ALLOC_LIMIT)
4735
0
            return NULL;
4736
1.69k
#endif
4737
4738
        /* Alloc and read */
4739
1.69k
        if (header_len < 0 || NULL == (header = malloc((size_t) header_len+1)))
4740
0
            return NULL;
4741
4742
1.69k
        if (header_len != hread(fd->fp, header, header_len)) {
4743
0
            free(header);
4744
0
            return NULL;
4745
0
        }
4746
1.69k
        header[header_len] = '\0';
4747
4748
1.69k
        fd->first_container += 4 + header_len;
4749
1.69k
    } else {
4750
86
        cram_container *c = cram_read_container(fd);
4751
86
        cram_block *b;
4752
86
        int i;
4753
86
        int64_t len;
4754
4755
86
        if (!c)
4756
1
            return NULL;
4757
4758
85
        fd->first_container += c->length + c->offset;
4759
85
        fd->curr_position = fd->first_container;
4760
4761
85
        if (c->num_blocks < 1) {
4762
0
            cram_free_container(c);
4763
0
            return NULL;
4764
0
        }
4765
4766
85
        if (!(b = cram_read_block(fd))) {
4767
3
            cram_free_container(c);
4768
3
            return NULL;
4769
3
        }
4770
82
        if (cram_uncompress_block(b) != 0) {
4771
24
            cram_free_container(c);
4772
24
            cram_free_block(b);
4773
24
            return NULL;
4774
24
        }
4775
4776
58
        len = b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
4777
58
            fd->vv.varint_size(b->content_id) +
4778
58
            fd->vv.varint_size(b->uncomp_size) +
4779
58
            fd->vv.varint_size(b->comp_size);
4780
4781
        /* Extract header from 1st block */
4782
58
        if (-1 == int32_get_blk(b, &header_len) ||
4783
57
            header_len < 0 || /* Spec. says signed...  why? */
4784
57
            b->uncomp_size - 4 < header_len) {
4785
2
            cram_free_container(c);
4786
2
            cram_free_block(b);
4787
2
            return NULL;
4788
2
        }
4789
56
        if (NULL == (header = malloc((size_t) header_len+1))) {
4790
0
            cram_free_container(c);
4791
0
            cram_free_block(b);
4792
0
            return NULL;
4793
0
        }
4794
56
        memcpy(header, BLOCK_END(b), header_len);
4795
56
        header[header_len] = '\0';
4796
56
        cram_free_block(b);
4797
4798
        /* Consume any remaining blocks */
4799
680
        for (i = 1; i < c->num_blocks; i++) {
4800
624
            if (!(b = cram_read_block(fd))) {
4801
0
                cram_free_container(c);
4802
0
                free(header);
4803
0
                return NULL;
4804
0
            }
4805
624
            len += b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
4806
624
                fd->vv.varint_size(b->content_id) +
4807
624
                fd->vv.varint_size(b->uncomp_size) +
4808
624
                fd->vv.varint_size(b->comp_size);
4809
624
            cram_free_block(b);
4810
624
        }
4811
4812
56
        if (c->length > 0 && len > 0 && c->length > len) {
4813
            // Consume padding
4814
0
            char *pads = malloc(c->length - len);
4815
0
            if (!pads) {
4816
0
                cram_free_container(c);
4817
0
                free(header);
4818
0
                return NULL;
4819
0
            }
4820
4821
0
            if (c->length - len != hread(fd->fp, pads, c->length - len)) {
4822
0
                cram_free_container(c);
4823
0
                free(header);
4824
0
                free(pads);
4825
0
                return NULL;
4826
0
            }
4827
0
            free(pads);
4828
0
        }
4829
4830
56
        cram_free_container(c);
4831
56
    }
4832
4833
    /* Parse */
4834
1.75k
    hdr = sam_hdr_init();
4835
1.75k
    if (!hdr) {
4836
0
        free(header);
4837
0
        return NULL;
4838
0
    }
4839
4840
1.75k
    if (-1 == sam_hdr_add_lines(hdr, header, header_len)) {
4841
6
        free(header);
4842
6
        sam_hdr_destroy(hdr);
4843
6
        return NULL;
4844
6
    }
4845
4846
1.74k
    hdr->l_text = header_len;
4847
1.74k
    hdr->text = header;
4848
4849
1.74k
    return hdr;
4850
4851
1.75k
}
4852
4853
/*
4854
 * Converts 'in' to a full pathname to store in out.
4855
 * Out must be at least PATH_MAX bytes long.
4856
 */
4857
0
static void full_path(char *out, char *in) {
4858
0
    size_t in_l = strlen(in);
4859
0
    if (hisremote(in)) {
4860
0
        if (in_l > PATH_MAX) {
4861
0
            hts_log_error("Reference path is longer than %d", PATH_MAX);
4862
0
            return;
4863
0
        }
4864
0
        strncpy(out, in, PATH_MAX-1);
4865
0
        out[PATH_MAX-1] = 0;
4866
0
        return;
4867
0
    }
4868
0
    if (*in == '/' ||
4869
        // Windows paths
4870
0
        (in_l > 3 && toupper_c(*in) >= 'A'  && toupper_c(*in) <= 'Z' &&
4871
0
         in[1] == ':' && (in[2] == '/' || in[2] == '\\'))) {
4872
0
        strncpy(out, in, PATH_MAX-1);
4873
0
        out[PATH_MAX-1] = 0;
4874
0
    } else {
4875
0
        size_t len;
4876
4877
        // unable to get dir or out+in is too long
4878
0
        if (!getcwd(out, PATH_MAX) ||
4879
0
            (len = strlen(out))+1+strlen(in) >= PATH_MAX) {
4880
0
            strncpy(out, in, PATH_MAX-1);
4881
0
            out[PATH_MAX-1] = 0;
4882
0
            return;
4883
0
        }
4884
4885
0
        snprintf(out+len, PATH_MAX - len, "/%s", in);
4886
4887
        // FIXME: cope with `pwd`/../../../foo.fa ?
4888
0
    }
4889
0
}
4890
4891
/*
4892
 * Writes a CRAM SAM header.
4893
 * Returns 0 on success
4894
 *        -1 on failure
4895
 */
4896
2.50k
int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) {
4897
2.50k
    size_t header_len;
4898
2.50k
    int blank_block = (CRAM_MAJOR_VERS(fd->version) >= 3);
4899
4900
    /* Write CRAM MAGIC if not yet written. */
4901
2.50k
    if (fd->file_def->major_version == 0) {
4902
2.50k
        fd->file_def->major_version = CRAM_MAJOR_VERS(fd->version);
4903
2.50k
        fd->file_def->minor_version = CRAM_MINOR_VERS(fd->version);
4904
2.50k
        if (0 != cram_write_file_def(fd, fd->file_def))
4905
0
            return -1;
4906
2.50k
    }
4907
4908
    /* 1.0 requires an UNKNOWN read-group */
4909
2.50k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
4910
0
        if (!sam_hrecs_find_rg(hdr->hrecs, "UNKNOWN"))
4911
0
            if (sam_hdr_add_line(hdr, "RG",
4912
0
                            "ID", "UNKNOWN", "SM", "UNKNOWN", NULL))
4913
0
                return -1;
4914
0
    }
4915
4916
2.50k
    if (-1 == refs_from_header(fd))
4917
0
        return -1;
4918
2.50k
    if (-1 == refs2id(fd->refs, fd->header))
4919
0
        return -1;
4920
4921
    /* Fix M5 strings */
4922
2.50k
    if (fd->refs && !fd->no_ref && fd->embed_ref <= 1) {
4923
2.50k
        int i;
4924
2.60k
        for (i = 0; i < hdr->hrecs->nref; i++) {
4925
1.50k
            sam_hrec_type_t *ty;
4926
1.50k
            char *ref;
4927
4928
1.50k
            if (!(ty = sam_hrecs_find_type_id(hdr->hrecs, "SQ", "SN", hdr->hrecs->ref[i].name)))
4929
0
                return -1;
4930
4931
1.50k
            if (!sam_hrecs_find_key(ty, "M5", NULL)) {
4932
1.40k
                char unsigned buf[16];
4933
1.40k
                char buf2[33];
4934
1.40k
                hts_pos_t rlen;
4935
1.40k
                hts_md5_context *md5;
4936
4937
1.40k
                if (!fd->refs ||
4938
1.40k
                    !fd->refs->ref_id ||
4939
1.40k
                    !fd->refs->ref_id[i]) {
4940
0
                    return -1;
4941
0
                }
4942
1.40k
                rlen = fd->refs->ref_id[i]->length;
4943
1.40k
                ref = cram_get_ref(fd, i, 1, rlen);
4944
1.40k
                if (NULL == ref) {
4945
1.40k
                    if (fd->embed_ref == -1) {
4946
                        // auto embed-ref
4947
1.40k
                        hts_log_warning("No M5 tags present and could not "
4948
1.40k
                                        "find reference");
4949
1.40k
                        hts_log_warning("Enabling embed_ref=2 option");
4950
1.40k
                        hts_log_warning("NOTE: the CRAM file will be bigger "
4951
1.40k
                                        "than using an external reference");
4952
1.40k
                        pthread_mutex_lock(&fd->ref_lock);
4953
                        // Best guess.  It may be unmapped data with broken
4954
                        // headers, in which case this will get ignored.
4955
1.40k
                        fd->embed_ref = 2;
4956
1.40k
                        pthread_mutex_unlock(&fd->ref_lock);
4957
1.40k
                        break;
4958
1.40k
                    }
4959
0
                    return -1;
4960
1.40k
                }
4961
0
                rlen = fd->refs->ref_id[i]->length; /* In case it just loaded */
4962
0
                if (!(md5 = hts_md5_init()))
4963
0
                    return -1;
4964
0
                if (HTS_POS_MAX <= ULONG_MAX) {
4965
                    // Platforms with 64-bit unsigned long update in one go
4966
0
                    hts_md5_update(md5, ref, rlen);
4967
0
                } else {
4968
                    // Those with 32-bit ulong (Windows) may have to loop
4969
                    // over epic references
4970
0
                    hts_pos_t pos = 0;
4971
0
                    while (rlen - pos > ULONG_MAX) {
4972
0
                        hts_md5_update(md5, ref + pos, ULONG_MAX);
4973
0
                        pos += ULONG_MAX;
4974
0
                    }
4975
0
                    hts_md5_update(md5, ref + pos, (unsigned long)(rlen - pos));
4976
0
                }
4977
0
                hts_md5_final(buf, md5);
4978
0
                hts_md5_destroy(md5);
4979
0
                cram_ref_decr(fd->refs, i);
4980
4981
0
                hts_md5_hex(buf2, buf);
4982
0
                fd->refs->ref_id[i]->validated_md5 = 1;
4983
0
                if (sam_hdr_update_line(hdr, "SQ", "SN", hdr->hrecs->ref[i].name, "M5", buf2, NULL))
4984
0
                    return -1;
4985
0
            }
4986
4987
95
            if (fd->ref_fn) {
4988
0
                char ref_fn[PATH_MAX];
4989
0
                full_path(ref_fn, fd->ref_fn);
4990
0
                if (sam_hdr_update_line(hdr, "SQ", "SN", hdr->hrecs->ref[i].name, "UR", ref_fn, NULL))
4991
0
                    return -1;
4992
0
            }
4993
95
        }
4994
2.50k
    }
4995
4996
    /* Length */
4997
2.50k
    header_len = sam_hdr_length(hdr);
4998
2.50k
    if (header_len > INT32_MAX) {
4999
0
        hts_log_error("Header is too long for CRAM format");
5000
0
        return -1;
5001
0
    }
5002
2.50k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
5003
0
        if (-1 == int32_encode(fd, header_len))
5004
0
            return -1;
5005
5006
        /* Text data */
5007
0
        if (header_len != hwrite(fd->fp, sam_hdr_str(hdr), header_len))
5008
0
            return -1;
5009
2.50k
    } else {
5010
        /* Create block(s) inside a container */
5011
2.50k
        cram_block *b = cram_new_block(FILE_HEADER, 0);
5012
2.50k
        cram_container *c = cram_new_container(0, 0);
5013
2.50k
        int padded_length;
5014
2.50k
        char *pads;
5015
2.50k
        int is_cram_3 = (CRAM_MAJOR_VERS(fd->version) >= 3);
5016
5017
2.50k
        if (!b || !c) {
5018
0
            if (b) cram_free_block(b);
5019
0
            if (c) cram_free_container(c);
5020
0
            return -1;
5021
0
        }
5022
5023
2.50k
        if (int32_put_blk(b, header_len) < 0)
5024
0
            return -1;
5025
2.50k
        if (header_len)
5026
1.57k
            BLOCK_APPEND(b, sam_hdr_str(hdr), header_len);
5027
2.50k
        BLOCK_UPLEN(b);
5028
5029
        // Compress header block if V3.0 and above
5030
2.50k
        if (CRAM_MAJOR_VERS(fd->version) >= 3)
5031
2.50k
            if (cram_compress_block(fd, b, NULL, -1, -1) < 0)
5032
0
                return -1;
5033
5034
2.50k
        if (blank_block) {
5035
2.50k
            c->length = b->comp_size + 2 + 4*is_cram_3 +
5036
2.50k
                fd->vv.varint_size(b->content_id)   +
5037
2.50k
                fd->vv.varint_size(b->uncomp_size)  +
5038
2.50k
                fd->vv.varint_size(b->comp_size);
5039
5040
2.50k
            c->num_blocks = 2;
5041
2.50k
            c->num_landmarks = 2;
5042
2.50k
            if (!(c->landmark = malloc(2*sizeof(*c->landmark)))) {
5043
0
                cram_free_block(b);
5044
0
                cram_free_container(c);
5045
0
                return -1;
5046
0
            }
5047
2.50k
            c->landmark[0] = 0;
5048
2.50k
            c->landmark[1] = c->length;
5049
5050
            // Plus extra storage for uncompressed secondary blank block
5051
2.50k
            padded_length = MIN(c->length*.5, 10000);
5052
2.50k
            c->length += padded_length + 2 + 4*is_cram_3 +
5053
2.50k
                fd->vv.varint_size(b->content_id) +
5054
2.50k
                fd->vv.varint_size(padded_length)*2;
5055
2.50k
        } else {
5056
            // Pad the block instead.
5057
0
            c->num_blocks = 1;
5058
0
            c->num_landmarks = 1;
5059
0
            if (!(c->landmark = malloc(sizeof(*c->landmark))))
5060
0
                return -1;
5061
0
            c->landmark[0] = 0;
5062
5063
0
            padded_length = MAX(c->length*1.5, 10000) - c->length;
5064
5065
0
            c->length = b->comp_size + padded_length +
5066
0
                2 + 4*is_cram_3 +
5067
0
                fd->vv.varint_size(b->content_id)   +
5068
0
                fd->vv.varint_size(b->uncomp_size)  +
5069
0
                fd->vv.varint_size(b->comp_size);
5070
5071
0
            if (NULL == (pads = calloc(1, padded_length))) {
5072
0
                cram_free_block(b);
5073
0
                cram_free_container(c);
5074
0
                return -1;
5075
0
            }
5076
0
            BLOCK_APPEND(b, pads, padded_length);
5077
0
            BLOCK_UPLEN(b);
5078
0
            free(pads);
5079
0
        }
5080
5081
2.50k
        if (-1 == cram_write_container(fd, c)) {
5082
0
            cram_free_block(b);
5083
0
            cram_free_container(c);
5084
0
            return -1;
5085
0
        }
5086
5087
2.50k
        if (-1 == cram_write_block(fd, b)) {
5088
0
            cram_free_block(b);
5089
0
            cram_free_container(c);
5090
0
            return -1;
5091
0
        }
5092
5093
2.50k
        if (blank_block) {
5094
2.50k
            BLOCK_RESIZE(b, padded_length);
5095
2.50k
            memset(BLOCK_DATA(b), 0, padded_length);
5096
2.50k
            BLOCK_SIZE(b) = padded_length;
5097
2.50k
            BLOCK_UPLEN(b);
5098
2.50k
            b->method = RAW;
5099
2.50k
            if (-1 == cram_write_block(fd, b)) {
5100
0
                cram_free_block(b);
5101
0
                cram_free_container(c);
5102
0
                return -1;
5103
0
            }
5104
2.50k
        }
5105
5106
2.50k
        cram_free_block(b);
5107
2.50k
        cram_free_container(c);
5108
2.50k
    }
5109
5110
2.50k
    if (0 != hflush(fd->fp))
5111
0
        return -1;
5112
5113
2.50k
    RP("=== Finishing saving header ===\n");
5114
5115
2.50k
    return 0;
5116
5117
0
 block_err:
5118
0
    return -1;
5119
2.50k
}
5120
5121
/* ----------------------------------------------------------------------
5122
 * The top-level cram opening, closing and option handling
5123
 */
5124
5125
/*
5126
 * Sets CRAM variable sized integer decode function tables.
5127
 * CRAM 1, 2, and 3.x all used ITF8 for uint32 and UTF8 for uint64.
5128
 * CRAM 4.x uses the same encoding mechanism for 32-bit and 64-bit
5129
 * (or anything inbetween), but also now supports signed values.
5130
 *
5131
 * Version is the CRAM major version number.
5132
 * vv is the vector table (probably &cram_fd->vv)
5133
 */
5134
4.57k
static void cram_init_varint(varint_vec *vv, int version) {
5135
4.57k
    if (version >= 4) {
5136
40
        vv->varint_get32 = uint7_get_32; // FIXME: varint.h API should be size agnostic
5137
40
        vv->varint_get32s = sint7_get_32;
5138
40
        vv->varint_get64 = uint7_get_64;
5139
40
        vv->varint_get64s = sint7_get_64;
5140
40
        vv->varint_put32 = uint7_put_32;
5141
40
        vv->varint_put32s = sint7_put_32;
5142
40
        vv->varint_put64 = uint7_put_64;
5143
40
        vv->varint_put64s = sint7_put_64;
5144
40
        vv->varint_put32_blk = uint7_put_blk_32;
5145
40
        vv->varint_put32s_blk = sint7_put_blk_32;
5146
40
        vv->varint_put64_blk = uint7_put_blk_64;
5147
40
        vv->varint_put64s_blk = sint7_put_blk_64;
5148
40
        vv->varint_size = uint7_size;
5149
40
        vv->varint_decode32_crc = uint7_decode_crc32;
5150
40
        vv->varint_decode32s_crc = sint7_decode_crc32;
5151
40
        vv->varint_decode64_crc = uint7_decode_crc64;
5152
4.53k
    } else {
5153
4.53k
        vv->varint_get32 = safe_itf8_get;
5154
4.53k
        vv->varint_get32s = safe_itf8_get;
5155
4.53k
        vv->varint_get64 = safe_ltf8_get;
5156
4.53k
        vv->varint_get64s = safe_ltf8_get;
5157
4.53k
        vv->varint_put32 = safe_itf8_put;
5158
4.53k
        vv->varint_put32s = safe_itf8_put;
5159
4.53k
        vv->varint_put64 = safe_ltf8_put;
5160
4.53k
        vv->varint_put64s = safe_ltf8_put;
5161
4.53k
        vv->varint_put32_blk = itf8_put_blk;
5162
4.53k
        vv->varint_put32s_blk = itf8_put_blk;
5163
4.53k
        vv->varint_put64_blk = ltf8_put_blk;
5164
4.53k
        vv->varint_put64s_blk = ltf8_put_blk;
5165
4.53k
        vv->varint_size = itf8_size;
5166
4.53k
        vv->varint_decode32_crc = itf8_decode_crc;
5167
4.53k
        vv->varint_decode32s_crc = itf8_decode_crc;
5168
4.53k
        vv->varint_decode64_crc = ltf8_decode_crc;
5169
4.53k
    }
5170
4.57k
}
5171
5172
/*
5173
 * Initialises the lookup tables. These could be global statics, but they're
5174
 * clumsy to setup in a multi-threaded environment unless we generate
5175
 * verbatim code and include that.
5176
 */
5177
4.57k
static void cram_init_tables(cram_fd *fd) {
5178
4.57k
    int i;
5179
5180
4.57k
    memset(fd->L1, 4, 256);
5181
4.57k
    fd->L1['A'] = 0; fd->L1['a'] = 0;
5182
4.57k
    fd->L1['C'] = 1; fd->L1['c'] = 1;
5183
4.57k
    fd->L1['G'] = 2; fd->L1['g'] = 2;
5184
4.57k
    fd->L1['T'] = 3; fd->L1['t'] = 3;
5185
5186
4.57k
    memset(fd->L2, 5, 256);
5187
4.57k
    fd->L2['A'] = 0; fd->L2['a'] = 0;
5188
4.57k
    fd->L2['C'] = 1; fd->L2['c'] = 1;
5189
4.57k
    fd->L2['G'] = 2; fd->L2['g'] = 2;
5190
4.57k
    fd->L2['T'] = 3; fd->L2['t'] = 3;
5191
4.57k
    fd->L2['N'] = 4; fd->L2['n'] = 4;
5192
5193
4.57k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
5194
869k
        for (i = 0; i < 0x200; i++) {
5195
867k
            int f = 0;
5196
5197
867k
            if (i & CRAM_FPAIRED)      f |= BAM_FPAIRED;
5198
867k
            if (i & CRAM_FPROPER_PAIR) f |= BAM_FPROPER_PAIR;
5199
867k
            if (i & CRAM_FUNMAP)       f |= BAM_FUNMAP;
5200
867k
            if (i & CRAM_FREVERSE)     f |= BAM_FREVERSE;
5201
867k
            if (i & CRAM_FREAD1)       f |= BAM_FREAD1;
5202
867k
            if (i & CRAM_FREAD2)       f |= BAM_FREAD2;
5203
867k
            if (i & CRAM_FSECONDARY)   f |= BAM_FSECONDARY;
5204
867k
            if (i & CRAM_FQCFAIL)      f |= BAM_FQCFAIL;
5205
867k
            if (i & CRAM_FDUP)         f |= BAM_FDUP;
5206
5207
867k
            fd->bam_flag_swap[i]  = f;
5208
867k
        }
5209
5210
6.94M
        for (i = 0; i < 0x1000; i++) {
5211
6.93M
            int g = 0;
5212
5213
6.93M
            if (i & BAM_FPAIRED)           g |= CRAM_FPAIRED;
5214
6.93M
            if (i & BAM_FPROPER_PAIR)  g |= CRAM_FPROPER_PAIR;
5215
6.93M
            if (i & BAM_FUNMAP)        g |= CRAM_FUNMAP;
5216
6.93M
            if (i & BAM_FREVERSE)      g |= CRAM_FREVERSE;
5217
6.93M
            if (i & BAM_FREAD1)        g |= CRAM_FREAD1;
5218
6.93M
            if (i & BAM_FREAD2)        g |= CRAM_FREAD2;
5219
6.93M
            if (i & BAM_FSECONDARY)    g |= CRAM_FSECONDARY;
5220
6.93M
            if (i & BAM_FQCFAIL)       g |= CRAM_FQCFAIL;
5221
6.93M
            if (i & BAM_FDUP)          g |= CRAM_FDUP;
5222
5223
6.93M
            fd->cram_flag_swap[i] = g;
5224
6.93M
        }
5225
2.87k
    } else {
5226
        /* NOP */
5227
11.7M
        for (i = 0; i < 0x1000; i++)
5228
11.7M
            fd->bam_flag_swap[i] = i;
5229
11.7M
        for (i = 0; i < 0x1000; i++)
5230
11.7M
            fd->cram_flag_swap[i] = i;
5231
2.87k
    }
5232
5233
4.57k
    memset(fd->cram_sub_matrix, 4, 32*32);
5234
150k
    for (i = 0; i < 32; i++) {
5235
146k
        fd->cram_sub_matrix[i]['A'&0x1f]=0;
5236
146k
        fd->cram_sub_matrix[i]['C'&0x1f]=1;
5237
146k
        fd->cram_sub_matrix[i]['G'&0x1f]=2;
5238
146k
        fd->cram_sub_matrix[i]['T'&0x1f]=3;
5239
146k
        fd->cram_sub_matrix[i]['N'&0x1f]=4;
5240
146k
    }
5241
27.4k
    for (i = 0; i < 20; i+=4) {
5242
22.8k
        int j;
5243
479k
        for (j = 0; j < 20; j++) {
5244
457k
            fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
5245
457k
            fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
5246
457k
            fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
5247
457k
            fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
5248
457k
        }
5249
22.8k
        fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+0]&0x1f]=0;
5250
22.8k
        fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+1]&0x1f]=1;
5251
22.8k
        fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+2]&0x1f]=2;
5252
22.8k
        fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+3]&0x1f]=3;
5253
22.8k
    }
5254
5255
4.57k
    cram_init_varint(&fd->vv, CRAM_MAJOR_VERS(fd->version));
5256
4.57k
}
5257
5258
// Default version numbers for CRAM
5259
static int major_version = 3;
5260
static int minor_version = 1;
5261
5262
/*
5263
 * Opens a CRAM file for read (mode "rb") or write ("wb").
5264
 * The filename may be "-" to indicate stdin or stdout.
5265
 *
5266
 * Returns file handle on success
5267
 *         NULL on failure.
5268
 */
5269
0
cram_fd *cram_open(const char *filename, const char *mode) {
5270
0
    hFILE *fp;
5271
0
    cram_fd *fd;
5272
0
    char fmode[3]= { mode[0], '\0', '\0' };
5273
5274
0
    if (strlen(mode) > 1 && (mode[1] == 'b' || mode[1] == 'c')) {
5275
0
        fmode[1] = 'b';
5276
0
    }
5277
5278
0
    fp = hopen(filename, fmode);
5279
0
    if (!fp)
5280
0
        return NULL;
5281
5282
0
    fd = cram_dopen(fp, filename, mode);
5283
0
    if (!fd)
5284
0
        hclose_abruptly(fp);
5285
5286
0
    return fd;
5287
0
}
5288
5289
/* Opens an existing stream for reading or writing.
5290
 *
5291
 * Returns file handle on success;
5292
 *         NULL on failure.
5293
 */
5294
4.57k
cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) {
5295
4.57k
    int i;
5296
4.57k
    char *cp;
5297
4.57k
    cram_fd *fd = calloc(1, sizeof(*fd));
5298
4.57k
    if (!fd)
5299
0
        return NULL;
5300
5301
4.57k
    fd->level = CRAM_DEFAULT_LEVEL;
5302
13.7k
    for (i = 0; mode[i]; i++) {
5303
9.14k
        if (mode[i] >= '0' && mode[i] <= '9') {
5304
0
            fd->level = mode[i] - '0';
5305
0
            break;
5306
0
        }
5307
9.14k
    }
5308
5309
4.57k
    fd->fp = fp;
5310
4.57k
    fd->mode = *mode;
5311
4.57k
    fd->first_container = 0;
5312
4.57k
    fd->curr_position = 0;
5313
5314
4.57k
    if (fd->mode == 'r') {
5315
        /* Reader */
5316
5317
1.78k
        if (!(fd->file_def = cram_read_file_def(fd)))
5318
0
            goto err;
5319
5320
1.78k
        fd->version = fd->file_def->major_version * 256 +
5321
1.78k
            fd->file_def->minor_version;
5322
5323
1.78k
        cram_init_tables(fd);
5324
5325
1.78k
        if (!(fd->header = cram_read_SAM_hdr(fd))) {
5326
36
            cram_free_file_def(fd->file_def);
5327
36
            goto err;
5328
36
        }
5329
5330
2.79k
    } else {
5331
        /* Writer */
5332
2.79k
        cram_file_def *def = calloc(1, sizeof(*def));
5333
2.79k
        if (!def)
5334
0
            return NULL;
5335
5336
2.79k
        fd->file_def = def;
5337
5338
2.79k
        def->magic[0] = 'C';
5339
2.79k
        def->magic[1] = 'R';
5340
2.79k
        def->magic[2] = 'A';
5341
2.79k
        def->magic[3] = 'M';
5342
2.79k
        def->major_version = 0; // Indicator to write file def later.
5343
2.79k
        def->minor_version = 0;
5344
2.79k
        memset(def->file_id, 0, 20);
5345
2.79k
        strncpy(def->file_id, filename, 20);
5346
5347
2.79k
        fd->version = major_version * 256 + minor_version;
5348
2.79k
        cram_init_tables(fd);
5349
5350
        /* SAM header written later along with this file_def */
5351
2.79k
    }
5352
5353
4.53k
    fd->prefix = strdup((cp = strrchr(filename, '/')) ? cp+1 : filename);
5354
4.53k
    if (!fd->prefix)
5355
0
        goto err;
5356
4.53k
    fd->first_base = fd->last_base = -1;
5357
4.53k
    fd->record_counter = 0;
5358
5359
4.53k
    fd->ctr = NULL;
5360
4.53k
    fd->ctr_mt = NULL;
5361
4.53k
    fd->refs  = refs_create();
5362
4.53k
    if (!fd->refs)
5363
0
        goto err;
5364
4.53k
    fd->ref_id = -2;
5365
4.53k
    fd->ref = NULL;
5366
5367
4.53k
    fd->decode_md = 0;
5368
4.53k
    fd->seqs_per_slice = SEQS_PER_SLICE;
5369
4.53k
    fd->bases_per_slice = BASES_PER_SLICE;
5370
4.53k
    fd->slices_per_container = SLICE_PER_CNT;
5371
4.53k
    fd->embed_ref = -1; // automatic selection
5372
4.53k
    fd->no_ref = 0;
5373
4.53k
    fd->no_ref_counter = 0;
5374
4.53k
    fd->ap_delta = 0;
5375
4.53k
    fd->ignore_md5 = 0;
5376
4.53k
    fd->lossy_read_names = 0;
5377
4.53k
    fd->use_bz2 = 0;
5378
4.53k
    fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3);
5379
4.53k
    fd->use_tok = (CRAM_MAJOR_VERS(fd->version) >= 3) && (CRAM_MINOR_VERS(fd->version) >= 1);
5380
4.53k
    fd->use_lzma = 0;
5381
4.53k
    fd->multi_seq = -1;
5382
4.53k
    fd->multi_seq_user = -1;
5383
4.53k
    fd->unsorted   = 0;
5384
4.53k
    fd->shared_ref = 0;
5385
4.53k
    fd->store_md = 0;
5386
4.53k
    fd->store_nm = 0;
5387
4.53k
    fd->last_RI_count = 0;
5388
5389
4.53k
    fd->index       = NULL;
5390
4.53k
    fd->own_pool    = 0;
5391
4.53k
    fd->pool        = NULL;
5392
4.53k
    fd->rqueue      = NULL;
5393
4.53k
    fd->job_pending = NULL;
5394
4.53k
    fd->ooc         = 0;
5395
4.53k
    fd->required_fields = INT_MAX;
5396
5397
4.53k
    pthread_mutex_init(&fd->metrics_lock, NULL);
5398
4.53k
    pthread_mutex_init(&fd->ref_lock, NULL);
5399
4.53k
    pthread_mutex_init(&fd->range_lock, NULL);
5400
4.53k
    pthread_mutex_init(&fd->bam_list_lock, NULL);
5401
5402
217k
    for (i = 0; i < DS_END; i++) {
5403
213k
        fd->m[i] = cram_new_metrics();
5404
213k
        if (!fd->m[i])
5405
0
            goto err;
5406
213k
    }
5407
5408
4.53k
    if (!(fd->tags_used = kh_init(m_metrics)))
5409
0
        goto err;
5410
5411
4.53k
    fd->range.refid = -2; // no ref.
5412
4.53k
    fd->eof = 1;          // See samtools issue #150
5413
4.53k
    fd->ref_fn = NULL;
5414
5415
4.53k
    fd->bl = NULL;
5416
5417
    /* Initialise dummy refs from the @SQ headers */
5418
4.53k
    if (-1 == refs_from_header(fd))
5419
0
        goto err;
5420
5421
4.53k
    return fd;
5422
5423
36
 err:
5424
36
    if (fd)
5425
36
        free(fd);
5426
5427
36
    return NULL;
5428
4.53k
}
5429
5430
/*
5431
 * Seek within a CRAM file.
5432
 *
5433
 * Returns 0 on success
5434
 *        -1 on failure
5435
 */
5436
0
int cram_seek(cram_fd *fd, off_t offset, int whence) {
5437
0
    fd->ooc = 0;
5438
5439
0
    cram_drain_rqueue(fd);
5440
5441
0
    return hseek(fd->fp, offset, whence) >= 0 ? 0 : -1;
5442
0
}
5443
5444
/*
5445
 * Flushes a CRAM file.
5446
 * Useful for when writing to stdout without wishing to close the stream.
5447
 *
5448
 * Returns 0 on success
5449
 *        -1 on failure
5450
 */
5451
0
int cram_flush(cram_fd *fd) {
5452
0
    if (!fd)
5453
0
        return -1;
5454
5455
0
    int ret = 0;
5456
5457
0
    if (fd->mode == 'w' && fd->ctr) {
5458
0
        if(fd->ctr->slice)
5459
0
            cram_update_curr_slice(fd->ctr, fd->version);
5460
5461
0
        if (-1 == cram_flush_container_mt(fd, fd->ctr))
5462
0
            ret = -1;
5463
5464
0
        cram_free_container(fd->ctr);
5465
0
        if (fd->ctr_mt == fd->ctr)
5466
0
            fd->ctr_mt = NULL;
5467
0
        fd->ctr = NULL;
5468
0
    }
5469
5470
0
    return ret;
5471
0
}
5472
5473
/*
5474
 * Writes an EOF block to a CRAM file.
5475
 *
5476
 * Returns 0 on success
5477
 *        -1 on failure
5478
 */
5479
2.61k
int cram_write_eof_block(cram_fd *fd) {
5480
    // EOF block is a container with special values to aid detection
5481
2.61k
    if (CRAM_MAJOR_VERS(fd->version) >= 2) {
5482
        // Empty container with
5483
        //   ref_seq_id -1
5484
        //   start pos 0x454f46 ("EOF")
5485
        //   span 0
5486
        //   nrec 0
5487
        //   counter 0
5488
        //   nbases 0
5489
        //   1 block (landmark 0)
5490
        //   (CRC32)
5491
2.61k
        cram_container c;
5492
2.61k
        memset(&c, 0, sizeof(c));
5493
2.61k
        c.ref_seq_id = -1;
5494
2.61k
        c.ref_seq_start = 0x454f46; // "EOF"
5495
2.61k
        c.ref_seq_span = 0;
5496
2.61k
        c.record_counter = 0;
5497
2.61k
        c.num_bases = 0;
5498
2.61k
        c.num_blocks = 1;
5499
2.61k
        int32_t land[1] = {0};
5500
2.61k
        c.landmark = land;
5501
5502
        // An empty compression header block with
5503
        //   method raw (0)
5504
        //   type comp header (1)
5505
        //   content id 0
5506
        //   block contents size 6
5507
        //   raw size 6
5508
        //     empty preservation map (01 00)
5509
        //     empty data series map (01 00)
5510
        //     empty tag map (01 00)
5511
        //   block CRC
5512
2.61k
        cram_block_compression_hdr ch;
5513
2.61k
        memset(&ch, 0, sizeof(ch));
5514
2.61k
        c.comp_hdr_block = cram_encode_compression_header(fd, &c, &ch, 0);
5515
5516
2.61k
        c.length = c.comp_hdr_block->byte            // Landmark[0]
5517
2.61k
            + 5                                      // block struct
5518
2.61k
            + 4*(CRAM_MAJOR_VERS(fd->version) >= 3); // CRC
5519
2.61k
        if (cram_write_container(fd, &c) < 0 ||
5520
2.61k
            cram_write_block(fd, c.comp_hdr_block) < 0) {
5521
0
            cram_close(fd);
5522
0
            cram_free_block(c.comp_hdr_block);
5523
0
            return -1;
5524
0
        }
5525
2.61k
        if (ch.preservation_map)
5526
0
            kh_destroy(map, ch.preservation_map);
5527
2.61k
        cram_free_block(c.comp_hdr_block);
5528
5529
        // V2.1 bytes
5530
        // 0b 00 00 00 ff ff ff ff 0f // Cont HDR: size, ref seq id
5531
        // e0 45 4f 46 00 00 00       // Cont HDR: pos, span, nrec, counter
5532
        // 00 01 00                   // Cont HDR: nbase, nblk, landmark
5533
        // 00 01 00 06 06             // Comp.HDR blk
5534
        // 01 00 01 00 01 00          // Comp.HDR blk
5535
5536
        // V3.0 bytes:
5537
        // 0f 00 00 00 ff ff ff ff 0f // Cont HDR: size, ref seq id
5538
        // e0 45 4f 46 00 00 00       // Cont HDR: pos, span, nrec, counter
5539
        // 00 01 00                   // Cont HDR: nbase, nblk, landmark
5540
        // 05 bd d9 4f                // CRC32
5541
        // 00 01 00 06 06             // Comp.HDR blk
5542
        // 01 00 01 00 01 00          // Comp.HDR blk
5543
        // ee 63 01 4b                // CRC32
5544
5545
        // V4.0 bytes:
5546
        // 0f 00 00 00 8f ff ff ff    // Cont HDR: size, ref seq id
5547
        // 82 95 9e 46 00 00 00       // Cont HDR: pos, span, nrec, counter
5548
        // 00 01 00                   // Cont HDR: nbase, nblk, landmark
5549
        // ac d6 05 bc                // CRC32
5550
        // 00 01 00 06 06             // Comp.HDR blk
5551
        // 01 00 01 00 01 00          // Comp.HDR blk
5552
        // ee 63 01 4b                // CRC32
5553
2.61k
    }
5554
5555
2.61k
    return 0;
5556
2.61k
}
5557
5558
/*
5559
 * Closes a CRAM file.
5560
 * Returns 0 on success
5561
 *        -1 on failure
5562
 */
5563
4.53k
int cram_close(cram_fd *fd) {
5564
4.53k
    spare_bams *bl, *next;
5565
4.53k
    int i, ret = 0;
5566
5567
4.53k
    if (!fd)
5568
0
        return -1;
5569
5570
4.53k
    if (fd->mode == 'w' && fd->ctr) {
5571
1.68k
        if(fd->ctr->slice)
5572
1.68k
            cram_update_curr_slice(fd->ctr, fd->version);
5573
5574
1.68k
        if (-1 == cram_flush_container_mt(fd, fd->ctr))
5575
171
            ret = -1;
5576
1.68k
    }
5577
5578
4.53k
    if (fd->mode != 'w')
5579
1.74k
        cram_drain_rqueue(fd);
5580
5581
4.53k
    if (fd->pool && fd->eof >= 0 && fd->rqueue) {
5582
0
        hts_tpool_process_flush(fd->rqueue);
5583
5584
0
        if (0 != cram_flush_result(fd))
5585
0
            ret = -1;
5586
5587
0
        if (fd->mode == 'w')
5588
0
            fd->ctr = NULL; // prevent double freeing
5589
5590
        //fprintf(stderr, "CRAM: destroy queue %p\n", fd->rqueue);
5591
5592
0
        hts_tpool_process_destroy(fd->rqueue);
5593
0
    }
5594
5595
4.53k
    pthread_mutex_destroy(&fd->metrics_lock);
5596
4.53k
    pthread_mutex_destroy(&fd->ref_lock);
5597
4.53k
    pthread_mutex_destroy(&fd->range_lock);
5598
4.53k
    pthread_mutex_destroy(&fd->bam_list_lock);
5599
5600
4.53k
    if (ret == 0 && fd->mode == 'w') {
5601
        /* Write EOF block */
5602
2.61k
        if (0 != cram_write_eof_block(fd))
5603
0
            ret = -1;
5604
2.61k
    }
5605
5606
6.07k
    for (bl = fd->bl; bl; bl = next) {
5607
1.53k
        int max_rec = fd->seqs_per_slice * fd->slices_per_container;
5608
5609
1.53k
        next = bl->next;
5610
1.53k
        free_bam_list(bl->bams, max_rec);
5611
1.53k
        free(bl);
5612
1.53k
    }
5613
5614
4.53k
    if (hclose(fd->fp) != 0)
5615
0
        ret = -1;
5616
5617
4.53k
    if (fd->file_def)
5618
4.53k
        cram_free_file_def(fd->file_def);
5619
5620
4.53k
    if (fd->header)
5621
4.38k
        sam_hdr_destroy(fd->header);
5622
5623
4.53k
    free(fd->prefix);
5624
5625
4.53k
    if (fd->ctr)
5626
2.80k
        cram_free_container(fd->ctr);
5627
5628
4.53k
    if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
5629
27
        cram_free_container(fd->ctr_mt);
5630
5631
4.53k
    if (fd->refs)
5632
4.53k
        refs_free(fd->refs);
5633
4.53k
    if (fd->ref_free)
5634
0
        free(fd->ref_free);
5635
5636
217k
    for (i = 0; i < DS_END; i++)
5637
213k
        if (fd->m[i])
5638
213k
            free(fd->m[i]);
5639
5640
4.53k
    if (fd->tags_used) {
5641
4.53k
        khint_t k;
5642
5643
21.7k
        for (k = kh_begin(fd->tags_used); k != kh_end(fd->tags_used); k++) {
5644
17.2k
            if (kh_exist(fd->tags_used, k))
5645
9.27k
                free(kh_val(fd->tags_used, k));
5646
17.2k
        }
5647
5648
4.53k
        kh_destroy(m_metrics, fd->tags_used);
5649
4.53k
    }
5650
5651
4.53k
    if (fd->index)
5652
0
        cram_index_free(fd);
5653
5654
4.53k
    if (fd->own_pool && fd->pool)
5655
0
        hts_tpool_destroy(fd->pool);
5656
5657
4.53k
    if (fd->idxfp)
5658
0
        if (bgzf_close(fd->idxfp) < 0)
5659
0
            ret = -1;
5660
5661
4.53k
    free(fd);
5662
5663
4.53k
    return ret;
5664
4.53k
}
5665
5666
/*
5667
 * Returns 1 if we hit an EOF while reading.
5668
 */
5669
2.92k
int cram_eof(cram_fd *fd) {
5670
2.92k
    return fd->eof;
5671
2.92k
}
5672
5673
5674
/*
5675
 * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
5676
 * Use this immediately after opening.
5677
 *
5678
 * Returns 0 on success
5679
 *        -1 on failure
5680
 */
5681
1.74k
int cram_set_option(cram_fd *fd, enum hts_fmt_option opt, ...) {
5682
1.74k
    int r;
5683
1.74k
    va_list args;
5684
5685
1.74k
    va_start(args, opt);
5686
1.74k
    r = cram_set_voption(fd, opt, args);
5687
1.74k
    va_end(args);
5688
5689
1.74k
    return r;
5690
1.74k
}
5691
5692
/*
5693
 * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
5694
 * Use this immediately after opening.
5695
 *
5696
 * Returns 0 on success
5697
 *        -1 on failure
5698
 */
5699
1.74k
int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) {
5700
1.74k
    refs_t *refs;
5701
5702
1.74k
    if (!fd) {
5703
0
        errno = EBADF;
5704
0
        return -1;
5705
0
    }
5706
5707
1.74k
    switch (opt) {
5708
1.74k
    case CRAM_OPT_DECODE_MD:
5709
1.74k
        fd->decode_md = va_arg(args, int);
5710
1.74k
        break;
5711
5712
0
    case CRAM_OPT_PREFIX:
5713
0
        if (fd->prefix)
5714
0
            free(fd->prefix);
5715
0
        if (!(fd->prefix = strdup(va_arg(args, char *))))
5716
0
            return -1;
5717
0
        break;
5718
5719
0
    case CRAM_OPT_VERBOSITY:
5720
0
        break;
5721
5722
0
    case CRAM_OPT_SEQS_PER_SLICE:
5723
0
        fd->seqs_per_slice = va_arg(args, int);
5724
0
        if (fd->bases_per_slice == BASES_PER_SLICE)
5725
0
            fd->bases_per_slice = fd->seqs_per_slice * 500;
5726
0
        break;
5727
5728
0
    case CRAM_OPT_BASES_PER_SLICE:
5729
0
        fd->bases_per_slice = va_arg(args, int);
5730
0
        break;
5731
5732
0
    case CRAM_OPT_SLICES_PER_CONTAINER:
5733
0
        fd->slices_per_container = va_arg(args, int);
5734
0
        break;
5735
5736
0
    case CRAM_OPT_EMBED_REF:
5737
0
        fd->embed_ref = va_arg(args, int);
5738
0
        break;
5739
5740
0
    case CRAM_OPT_NO_REF:
5741
0
        fd->no_ref = va_arg(args, int);
5742
0
        break;
5743
5744
0
    case CRAM_OPT_POS_DELTA:
5745
0
        fd->ap_delta = va_arg(args, int);
5746
0
        break;
5747
5748
0
    case CRAM_OPT_IGNORE_MD5:
5749
0
        fd->ignore_md5 = va_arg(args, int);
5750
0
        break;
5751
5752
0
    case CRAM_OPT_LOSSY_NAMES:
5753
0
        fd->lossy_read_names = va_arg(args, int);
5754
        // Currently lossy read names required paired (attached) reads.
5755
        // TLEN 0 or being 1 out causes read pairs to be detached, breaking
5756
        // the lossy read name compression, so we have extra options to
5757
        // slacken the exact TLEN round-trip checks.
5758
0
        fd->tlen_approx = fd->lossy_read_names;
5759
0
        fd->tlen_zero = fd->lossy_read_names;
5760
0
        break;
5761
5762
0
    case CRAM_OPT_USE_BZIP2:
5763
0
        fd->use_bz2 = va_arg(args, int);
5764
0
        break;
5765
5766
0
    case CRAM_OPT_USE_RANS:
5767
0
        fd->use_rans = va_arg(args, int);
5768
0
        break;
5769
5770
0
    case CRAM_OPT_USE_TOK:
5771
0
        fd->use_tok = va_arg(args, int);
5772
0
        break;
5773
5774
0
    case CRAM_OPT_USE_FQZ:
5775
0
        fd->use_fqz = va_arg(args, int);
5776
0
        break;
5777
5778
0
    case CRAM_OPT_USE_ARITH:
5779
0
        fd->use_arith = va_arg(args, int);
5780
0
        break;
5781
5782
0
    case CRAM_OPT_USE_LZMA:
5783
0
        fd->use_lzma = va_arg(args, int);
5784
0
        break;
5785
5786
0
    case CRAM_OPT_SHARED_REF:
5787
0
        fd->shared_ref = 1;
5788
0
        refs = va_arg(args, refs_t *);
5789
0
        if (refs != fd->refs) {
5790
0
            if (fd->refs)
5791
0
                refs_free(fd->refs);
5792
0
            fd->refs = refs;
5793
0
            fd->refs->count++;
5794
0
        }
5795
0
        break;
5796
5797
0
    case CRAM_OPT_RANGE: {
5798
0
        int r = cram_seek_to_refpos(fd, va_arg(args, cram_range *));
5799
0
        pthread_mutex_lock(&fd->range_lock);
5800
//        printf("opt range noseek to %p %d:%ld-%ld\n",
5801
//               fd, fd->range.refid, fd->range.start, fd->range.end);
5802
0
        if (fd->range.refid != -2)
5803
0
            fd->required_fields |= SAM_POS;
5804
0
        pthread_mutex_unlock(&fd->range_lock);
5805
0
        return r;
5806
0
    }
5807
5808
0
    case CRAM_OPT_RANGE_NOSEEK: {
5809
        // As per CRAM_OPT_RANGE, but no seeking
5810
0
        pthread_mutex_lock(&fd->range_lock);
5811
0
        cram_range *r = va_arg(args, cram_range *);
5812
0
        fd->range = *r;
5813
0
        if (r->refid == HTS_IDX_NOCOOR) {
5814
0
            fd->range.refid = -1;
5815
0
            fd->range.start = 0;
5816
0
        } else if (r->refid == HTS_IDX_START || r->refid == HTS_IDX_REST) {
5817
0
            fd->range.refid = -2; // special case in cram_next_slice
5818
0
        }
5819
0
        if (fd->range.refid != -2)
5820
0
            fd->required_fields |= SAM_POS;
5821
0
        fd->ooc = 0;
5822
0
        fd->eof = 0;
5823
0
        pthread_mutex_unlock(&fd->range_lock);
5824
0
        return 0;
5825
0
    }
5826
5827
0
    case CRAM_OPT_REFERENCE:
5828
0
        return cram_load_reference(fd, va_arg(args, char *));
5829
5830
0
    case CRAM_OPT_VERSION: {
5831
0
        int major, minor;
5832
0
        char *s = va_arg(args, char *);
5833
0
        if (2 != sscanf(s, "%d.%d", &major, &minor)) {
5834
0
            hts_log_error("Malformed version string %s", s);
5835
0
            return -1;
5836
0
        }
5837
0
        if (!((major == 1 &&  minor == 0) ||
5838
0
              (major == 2 && (minor == 0 || minor == 1)) ||
5839
0
              (major == 3 && (minor == 0 || minor == 1)) ||
5840
0
              (major == 4 &&  minor == 0))) {
5841
0
            hts_log_error("Unknown version string; use 1.0, 2.0, 2.1, 3.0, 3.1 or 4.0");
5842
0
            errno = EINVAL;
5843
0
            return -1;
5844
0
        }
5845
5846
0
        if (major > 3 || (major == 3 && minor > 1)) {
5847
0
            hts_log_warning(
5848
0
                "CRAM version %s is still a draft and subject to change.\n"
5849
0
                "This is a technology demonstration that should not be "
5850
0
                "used for archival data.", s);
5851
0
        }
5852
5853
0
        fd->version = major*256 + minor;
5854
5855
0
        fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3) ? 1 : 0;
5856
5857
0
        fd->use_tok = ((CRAM_MAJOR_VERS(fd->version) == 3 &&
5858
0
                        CRAM_MINOR_VERS(fd->version) >= 1) ||
5859
0
                        CRAM_MAJOR_VERS(fd->version) >= 4) ? 1 : 0;
5860
0
        cram_init_tables(fd);
5861
5862
0
        break;
5863
0
    }
5864
5865
0
    case CRAM_OPT_MULTI_SEQ_PER_SLICE:
5866
0
        fd->multi_seq_user = fd->multi_seq = va_arg(args, int);
5867
0
        break;
5868
5869
0
    case CRAM_OPT_NTHREADS: {
5870
0
        int nthreads =  va_arg(args, int);
5871
0
        if (nthreads >= 1) {
5872
0
            if (!(fd->pool = hts_tpool_init(nthreads)))
5873
0
                return -1;
5874
5875
0
            fd->rqueue = hts_tpool_process_init(fd->pool, nthreads*2, 0);
5876
0
            fd->shared_ref = 1;
5877
0
            fd->own_pool = 1;
5878
0
        }
5879
0
        break;
5880
0
    }
5881
5882
0
    case CRAM_OPT_THREAD_POOL: {
5883
0
        htsThreadPool *p = va_arg(args, htsThreadPool *);
5884
0
        fd->pool = p ? p->pool : NULL;
5885
0
        if (fd->pool) {
5886
0
            fd->rqueue = hts_tpool_process_init(fd->pool,
5887
0
                                                p->qsize ? p->qsize : hts_tpool_size(fd->pool)*2,
5888
0
                                                0);
5889
0
        }
5890
0
        fd->shared_ref = 1; // Needed to avoid clobbering ref between threads
5891
0
        fd->own_pool = 0;
5892
5893
        //fd->qsize = 1;
5894
        //fd->decoded = calloc(fd->qsize, sizeof(cram_container *));
5895
        //hts_tpool_dispatch(fd->pool, cram_decoder_thread, fd);
5896
0
        break;
5897
0
    }
5898
5899
0
    case CRAM_OPT_REQUIRED_FIELDS:
5900
0
        fd->required_fields = va_arg(args, int);
5901
0
        if (fd->range.refid != -2)
5902
0
            fd->required_fields |= SAM_POS;
5903
0
        break;
5904
5905
0
    case CRAM_OPT_STORE_MD:
5906
0
        fd->store_md = va_arg(args, int);
5907
0
        break;
5908
5909
0
    case CRAM_OPT_STORE_NM:
5910
0
        fd->store_nm = va_arg(args, int);
5911
0
        break;
5912
5913
0
    case HTS_OPT_COMPRESSION_LEVEL:
5914
0
        fd->level = va_arg(args, int);
5915
0
        break;
5916
5917
0
    case HTS_OPT_PROFILE: {
5918
0
        enum hts_profile_option prof = va_arg(args, int);
5919
0
        switch (prof) {
5920
0
        case HTS_PROFILE_FAST:
5921
0
            if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 1;
5922
0
            fd->use_tok = 0;
5923
0
            fd->seqs_per_slice = 10000;
5924
0
            break;
5925
5926
0
        case HTS_PROFILE_NORMAL:
5927
0
            break;
5928
5929
0
        case HTS_PROFILE_SMALL:
5930
0
            if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 6;
5931
0
            fd->use_bz2 = 1;
5932
0
            fd->use_fqz = 1;
5933
0
            fd->seqs_per_slice = 25000;
5934
0
            break;
5935
5936
0
        case HTS_PROFILE_ARCHIVE:
5937
0
            if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 7;
5938
0
            fd->use_bz2 = 1;
5939
0
            fd->use_fqz = 1;
5940
0
            fd->use_arith = 1;
5941
0
            if (fd->level > 7)
5942
0
                fd->use_lzma = 1;
5943
0
            fd->seqs_per_slice = 100000;
5944
0
            break;
5945
0
        }
5946
5947
0
        if (fd->bases_per_slice == BASES_PER_SLICE)
5948
0
            fd->bases_per_slice = fd->seqs_per_slice * 500;
5949
0
        break;
5950
0
    }
5951
5952
0
    default:
5953
0
        hts_log_error("Unknown CRAM option code %d", opt);
5954
0
        errno = EINVAL;
5955
0
        return -1;
5956
1.74k
    }
5957
5958
1.74k
    return 0;
5959
1.74k
}
5960
5961
int cram_check_EOF(cram_fd *fd)
5962
0
{
5963
    // Byte 9 in these templates is & with 0x0f to resolve differences
5964
    // between ITF-8 interpretations between early Java and C
5965
    // implementations of CRAM
5966
0
    static const unsigned char TEMPLATE_2_1[30] = {
5967
0
        0x0b, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0f, 0xe0,
5968
0
        0x45, 0x4f, 0x46, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
5969
0
        0x01, 0x00, 0x06, 0x06, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00
5970
0
    };
5971
0
    static const unsigned char TEMPLATE_3[38] = {
5972
0
        0x0f, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0f, 0xe0,
5973
0
        0x45, 0x4f, 0x46, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x05,
5974
0
        0xbd, 0xd9, 0x4f, 0x00, 0x01, 0x00, 0x06, 0x06, 0x01, 0x00,
5975
0
        0x01, 0x00, 0x01, 0x00, 0xee, 0x63, 0x01, 0x4b
5976
0
    };
5977
5978
0
    unsigned char buf[38]; // max(sizeof TEMPLATE_*)
5979
5980
0
    uint8_t major = CRAM_MAJOR_VERS(fd->version);
5981
0
    uint8_t minor = CRAM_MINOR_VERS(fd->version);
5982
5983
0
    const unsigned char *template;
5984
0
    ssize_t template_len;
5985
0
    if ((major < 2) ||
5986
0
        (major == 2 && minor == 0)) {
5987
0
        return 3; // No EOF support in cram versions less than 2.1
5988
0
    } else if (major == 2 && minor == 1) {
5989
0
        template = TEMPLATE_2_1;
5990
0
        template_len = sizeof TEMPLATE_2_1;
5991
0
    } else {
5992
0
        template = TEMPLATE_3;
5993
0
        template_len = sizeof TEMPLATE_3;
5994
0
    }
5995
5996
0
    off_t offset = htell(fd->fp);
5997
0
    if (hseek(fd->fp, -template_len, SEEK_END) < 0) {
5998
0
        if (errno == ESPIPE) {
5999
0
            hclearerr(fd->fp);
6000
0
            return 2;
6001
0
        }
6002
0
        else {
6003
0
            return -1;
6004
0
        }
6005
0
    }
6006
0
    if (hread(fd->fp, buf, template_len) != template_len) return -1;
6007
0
    if (hseek(fd->fp, offset, SEEK_SET) < 0) return -1;
6008
0
    buf[8] &= 0x0f;
6009
0
    return (memcmp(template, buf, template_len) == 0)? 1 : 0;
6010
0
}