Coverage Report

Created: 2025-08-08 07:03

/src/htslib/cram/cram_io.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
Copyright (c) 2012-2025 Genome Research Ltd.
3
Author: James Bonfield <jkb@sanger.ac.uk>
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
   1. Redistributions of source code must retain the above copyright notice,
9
this list of conditions and the following disclaimer.
10
11
   2. Redistributions in binary form must reproduce the above copyright notice,
12
this list of conditions and the following disclaimer in the documentation
13
and/or other materials provided with the distribution.
14
15
   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16
Institute nor the names of its contributors may be used to endorse or promote
17
products derived from this software without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
/*
32
 * CRAM I/O primitives.
33
 *
34
 * - ITF8 encoding and decoding.
35
 * - Block based I/O
36
 * - Zlib inflating and deflating (memory)
37
 * - CRAM basic data structure reading and writing
38
 * - File opening / closing
39
 * - Reference sequence handling
40
 */
41
42
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
43
#include <config.h>
44
45
#include <stdio.h>
46
#include <errno.h>
47
#include <assert.h>
48
#include <stdlib.h>
49
#include <string.h>
50
#include <unistd.h>
51
#include <zlib.h>
52
#ifdef HAVE_LIBBZ2
53
#include <bzlib.h>
54
#endif
55
#ifdef HAVE_LIBLZMA
56
#ifdef HAVE_LZMA_H
57
#include <lzma.h>
58
#else
59
#include "../os/lzma_stub.h"
60
#endif
61
#endif
62
#include <sys/types.h>
63
#include <sys/stat.h>
64
#include <math.h>
65
#include <stdint.h>
66
67
#ifdef HAVE_LIBDEFLATE
68
#include <libdeflate.h>
69
#define crc32(a,b,c) libdeflate_crc32((a),(b),(c))
70
#endif
71
72
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
73
#include "../fuzz_settings.h"
74
#endif
75
76
#include "cram.h"
77
#include "os.h"
78
#include "../htslib/hts.h"
79
#include "../hts_internal.h"
80
#include "open_trace_file.h"
81
82
#if defined(HAVE_EXTERNAL_LIBHTSCODECS)
83
#include <htscodecs/rANS_static.h>
84
#include <htscodecs/rANS_static4x16.h>
85
#include <htscodecs/arith_dynamic.h>
86
#include <htscodecs/tokenise_name3.h>
87
#include <htscodecs/fqzcomp_qual.h>
88
#include <htscodecs/varint.h> // CRAM v4.0 variable-size integers
89
#else
90
#include "../htscodecs/htscodecs/rANS_static.h"
91
#include "../htscodecs/htscodecs/rANS_static4x16.h"
92
#include "../htscodecs/htscodecs/arith_dynamic.h"
93
#include "../htscodecs/htscodecs/tokenise_name3.h"
94
#include "../htscodecs/htscodecs/fqzcomp_qual.h"
95
#include "../htscodecs/htscodecs/varint.h"
96
#endif
97
98
//#define REF_DEBUG
99
100
#ifdef REF_DEBUG
101
#include <sys/syscall.h>
102
#define gettid() (int)syscall(SYS_gettid)
103
104
#define RP(...) fprintf (stderr, __VA_ARGS__)
105
#else
106
#define RP(...)
107
#endif
108
109
#include "../htslib/hfile.h"
110
#include "../htslib/bgzf.h"
111
#include "../htslib/faidx.h"
112
#include "../hts_internal.h"
113
114
#ifndef PATH_MAX
115
#define PATH_MAX FILENAME_MAX
116
#endif
117
118
1.42M
#define TRIAL_SPAN 70
119
1.42M
#define NTRIALS 3
120
121
25.1k
#define CRAM_DEFAULT_LEVEL 5
122
123
/* ----------------------------------------------------------------------
124
 * ITF8 encoding and decoding.
125
 *
126
 * Also see the itf8_get and itf8_put macros in cram_io.h
127
 */
128
129
/*
130
 * LEGACY: consider using itf8_decode_crc.
131
 *
132
 * Reads an integer in ITF-8 encoding from 'cp' and stores it in
133
 * *val.
134
 *
135
 * Returns the number of bytes read on success
136
 *        -1 on failure
137
 */
138
0
int itf8_decode(cram_fd *fd, int32_t *val_p) {
139
0
    static int nbytes[16] = {
140
0
        0,0,0,0, 0,0,0,0,                               // 0000xxxx - 0111xxxx
141
0
        1,1,1,1,                                        // 1000xxxx - 1011xxxx
142
0
        2,2,                                            // 1100xxxx - 1101xxxx
143
0
        3,                                              // 1110xxxx
144
0
        4,                                              // 1111xxxx
145
0
    };
146
147
0
    static int nbits[16] = {
148
0
        0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
149
0
        0x3f, 0x3f, 0x3f, 0x3f,                         // 1000xxxx - 1011xxxx
150
0
        0x1f, 0x1f,                                     // 1100xxxx - 1101xxxx
151
0
        0x0f,                                           // 1110xxxx
152
0
        0x0f,                                           // 1111xxxx
153
0
    };
154
155
0
    int32_t val = hgetc(fd->fp);
156
0
    if (val == -1)
157
0
        return -1;
158
159
0
    int i = nbytes[val>>4];
160
0
    val &= nbits[val>>4];
161
162
0
    switch(i) {
163
0
    case 0:
164
0
        *val_p = val;
165
0
        return 1;
166
167
0
    case 1:
168
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
169
0
        *val_p = val;
170
0
        return 2;
171
172
0
    case 2:
173
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
174
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
175
0
        *val_p = val;
176
0
        return 3;
177
178
0
    case 3:
179
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
180
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
181
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
182
0
        *val_p = val;
183
0
        return 4;
184
185
0
    case 4: // really 3.5 more, why make it different?
186
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
187
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
188
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
189
0
        val = (val<<4) | (((unsigned char)hgetc(fd->fp)) & 0x0f);
190
0
        *val_p = val;
191
0
    }
192
193
0
    return 5;
194
0
}
195
196
261k
int itf8_decode_crc(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
197
261k
    static int nbytes[16] = {
198
261k
        0,0,0,0, 0,0,0,0,                               // 0000xxxx - 0111xxxx
199
261k
        1,1,1,1,                                        // 1000xxxx - 1011xxxx
200
261k
        2,2,                                            // 1100xxxx - 1101xxxx
201
261k
        3,                                              // 1110xxxx
202
261k
        4,                                              // 1111xxxx
203
261k
    };
204
205
261k
    static int nbits[16] = {
206
261k
        0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
207
261k
        0x3f, 0x3f, 0x3f, 0x3f,                         // 1000xxxx - 1011xxxx
208
261k
        0x1f, 0x1f,                                     // 1100xxxx - 1101xxxx
209
261k
        0x0f,                                           // 1110xxxx
210
261k
        0x0f,                                           // 1111xxxx
211
261k
    };
212
261k
    unsigned char c[5];
213
214
261k
    int32_t val = hgetc(fd->fp);
215
261k
    if (val == -1)
216
402
        return -1;
217
218
260k
    c[0]=val;
219
220
260k
    int i = nbytes[val>>4];
221
260k
    val &= nbits[val>>4];
222
223
260k
    if (i > 0) {
224
20.2k
        if (hread(fd->fp, &c[1], i) < i)
225
143
            return -1;
226
20.2k
    }
227
228
260k
    switch(i) {
229
240k
    case 0:
230
240k
        *val_p = val;
231
240k
        *crc = crc32(*crc, c, 1);
232
240k
        return 1;
233
234
9.39k
    case 1:
235
9.39k
        val = (val<<8) | c[1];
236
9.39k
        *val_p = val;
237
9.39k
        *crc = crc32(*crc, c, 2);
238
9.39k
        return 2;
239
240
4.84k
    case 2:
241
4.84k
        val = (val<<8) | c[1];
242
4.84k
        val = (val<<8) | c[2];
243
4.84k
        *val_p = val;
244
4.84k
        *crc = crc32(*crc, c, 3);
245
4.84k
        return 3;
246
247
1.48k
    case 3:
248
1.48k
        val = (val<<8) | c[1];
249
1.48k
        val = (val<<8) | c[2];
250
1.48k
        val = (val<<8) | c[3];
251
1.48k
        *val_p = val;
252
1.48k
        *crc = crc32(*crc, c, 4);
253
1.48k
        return 4;
254
255
4.41k
    case 4: // really 3.5 more, why make it different?
256
4.41k
        {
257
4.41k
            uint32_t uv = val;
258
4.41k
            uv = (uv<<8) |  c[1];
259
4.41k
            uv = (uv<<8) |  c[2];
260
4.41k
            uv = (uv<<8) |  c[3];
261
4.41k
            uv = (uv<<4) | (c[4] & 0x0f);
262
            // Avoid implementation-defined behaviour on negative values
263
4.41k
            *val_p = uv < 0x80000000UL ? (int32_t) uv : -((int32_t) (0xffffffffUL - uv)) - 1;
264
4.41k
            *crc = crc32(*crc, c, 5);
265
4.41k
        }
266
260k
    }
267
268
4.41k
    return 5;
269
260k
}
270
271
/*
272
 * Stores a value to memory in ITF-8 format.
273
 *
274
 * Returns the number of bytes required to store the number.
275
 * This is a maximum of 5 bytes.
276
 */
277
27.3M
static inline int itf8_put(char *cp, int32_t val) {
278
27.3M
    unsigned char *up = (unsigned char *)cp;
279
27.3M
    if        (!(val & ~0x00000007f)) { // 1 byte
280
25.4M
        *up = val;
281
25.4M
        return 1;
282
25.4M
    } else if (!(val & ~0x00003fff)) { // 2 byte
283
514k
        *up++ = (val >> 8 ) | 0x80;
284
514k
        *up   = val & 0xff;
285
514k
        return 2;
286
1.29M
    } else if (!(val & ~0x01fffff)) { // 3 byte
287
20.9k
        *up++ = (val >> 16) | 0xc0;
288
20.9k
        *up++ = (val >> 8 ) & 0xff;
289
20.9k
        *up   = val & 0xff;
290
20.9k
        return 3;
291
1.27M
    } else if (!(val & ~0x0fffffff)) { // 4 byte
292
970k
        *up++ = (val >> 24) | 0xe0;
293
970k
        *up++ = (val >> 16) & 0xff;
294
970k
        *up++ = (val >> 8 ) & 0xff;
295
970k
        *up   = val & 0xff;
296
970k
        return 4;
297
970k
    } else {                           // 5 byte
298
303k
        *up++ = 0xf0 | ((val>>28) & 0xff);
299
303k
        *up++ = (val >> 20) & 0xff;
300
303k
        *up++ = (val >> 12) & 0xff;
301
303k
        *up++ = (val >> 4 ) & 0xff;
302
303k
        *up = val & 0x0f;
303
303k
        return 5;
304
303k
    }
305
27.3M
}
306
307
308
/* 64-bit itf8 variant */
309
233k
static inline int ltf8_put(char *cp, int64_t val) {
310
233k
    unsigned char *up = (unsigned char *)cp;
311
233k
    if        (!(val & ~((1LL<<7)-1))) {
312
160k
        *up = val;
313
160k
        return 1;
314
160k
    } else if (!(val & ~((1LL<<(6+8))-1))) {
315
72.3k
        *up++ = (val >> 8 ) | 0x80;
316
72.3k
        *up   = val & 0xff;
317
72.3k
        return 2;
318
72.3k
    } else if (!(val & ~((1LL<<(5+2*8))-1))) {
319
1.36k
        *up++ = (val >> 16) | 0xc0;
320
1.36k
        *up++ = (val >> 8 ) & 0xff;
321
1.36k
        *up   = val & 0xff;
322
1.36k
        return 3;
323
1.36k
    } else if (!(val & ~((1LL<<(4+3*8))-1))) {
324
51
        *up++ = (val >> 24) | 0xe0;
325
51
        *up++ = (val >> 16) & 0xff;
326
51
        *up++ = (val >> 8 ) & 0xff;
327
51
        *up   = val & 0xff;
328
51
        return 4;
329
51
    } else if (!(val & ~((1LL<<(3+4*8))-1))) {
330
0
        *up++ = (val >> 32) | 0xf0;
331
0
        *up++ = (val >> 24) & 0xff;
332
0
        *up++ = (val >> 16) & 0xff;
333
0
        *up++ = (val >> 8 ) & 0xff;
334
0
        *up   = val & 0xff;
335
0
        return 5;
336
0
    } else if (!(val & ~((1LL<<(2+5*8))-1))) {
337
0
        *up++ = (val >> 40) | 0xf8;
338
0
        *up++ = (val >> 32) & 0xff;
339
0
        *up++ = (val >> 24) & 0xff;
340
0
        *up++ = (val >> 16) & 0xff;
341
0
        *up++ = (val >> 8 ) & 0xff;
342
0
        *up   = val & 0xff;
343
0
        return 6;
344
0
    } else if (!(val & ~((1LL<<(1+6*8))-1))) {
345
0
        *up++ = (val >> 48) | 0xfc;
346
0
        *up++ = (val >> 40) & 0xff;
347
0
        *up++ = (val >> 32) & 0xff;
348
0
        *up++ = (val >> 24) & 0xff;
349
0
        *up++ = (val >> 16) & 0xff;
350
0
        *up++ = (val >> 8 ) & 0xff;
351
0
        *up   = val & 0xff;
352
0
        return 7;
353
0
    } else if (!(val & ~((1LL<<(7*8))-1))) {
354
0
        *up++ = (val >> 56) | 0xfe;
355
0
        *up++ = (val >> 48) & 0xff;
356
0
        *up++ = (val >> 40) & 0xff;
357
0
        *up++ = (val >> 32) & 0xff;
358
0
        *up++ = (val >> 24) & 0xff;
359
0
        *up++ = (val >> 16) & 0xff;
360
0
        *up++ = (val >> 8 ) & 0xff;
361
0
        *up   = val & 0xff;
362
0
        return 8;
363
0
    } else {
364
0
        *up++ = 0xff;
365
0
        *up++ = (val >> 56) & 0xff;
366
0
        *up++ = (val >> 48) & 0xff;
367
0
        *up++ = (val >> 40) & 0xff;
368
0
        *up++ = (val >> 32) & 0xff;
369
0
        *up++ = (val >> 24) & 0xff;
370
0
        *up++ = (val >> 16) & 0xff;
371
0
        *up++ = (val >> 8 ) & 0xff;
372
0
        *up   = val & 0xff;
373
0
        return 9;
374
0
    }
375
233k
}
376
377
/*
378
 * Encodes and writes a single integer in ITF-8 format.
379
 * Returns 0 on success
380
 *        -1 on failure
381
 */
382
0
int itf8_encode(cram_fd *fd, int32_t val) {
383
0
    char buf[5];
384
0
    int len = itf8_put(buf, val);
385
0
    return hwrite(fd->fp, buf, len) == len ? 0 : -1;
386
0
}
387
388
const int itf8_bytes[16] = {
389
    1, 1, 1, 1,  1, 1, 1, 1,
390
    2, 2, 2, 2,  3, 3, 4, 5
391
};
392
393
const int ltf8_bytes[256] = {
394
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
395
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
396
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
397
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
398
399
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
400
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
401
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
402
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
403
404
    2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
405
    2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
406
    2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
407
    2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
408
409
    3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,
410
    3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,
411
412
    4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
413
414
    5, 5, 5, 5,  5, 5, 5, 5,  6, 6, 6, 6,  7, 7, 8, 9
415
};
416
417
/*
418
 * LEGACY: consider using ltf8_decode_crc.
419
 */
420
0
int ltf8_decode(cram_fd *fd, int64_t *val_p) {
421
0
    int c = hgetc(fd->fp);
422
0
    int64_t val = (unsigned char)c;
423
0
    if (c == -1)
424
0
        return -1;
425
426
0
    if (val < 0x80) {
427
0
        *val_p =   val;
428
0
        return 1;
429
430
0
    } else if (val < 0xc0) {
431
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
432
0
        *val_p = val & (((1LL<<(6+8)))-1);
433
0
        return 2;
434
435
0
    } else if (val < 0xe0) {
436
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
437
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
438
0
        *val_p = val & ((1LL<<(5+2*8))-1);
439
0
        return 3;
440
441
0
    } else if (val < 0xf0) {
442
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
443
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
444
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
445
0
        *val_p = val & ((1LL<<(4+3*8))-1);
446
0
        return 4;
447
448
0
    } else if (val < 0xf8) {
449
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
450
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
451
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
452
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
453
0
        *val_p = val & ((1LL<<(3+4*8))-1);
454
0
        return 5;
455
456
0
    } else if (val < 0xfc) {
457
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
458
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
459
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
460
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
461
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
462
0
        *val_p = val & ((1LL<<(2+5*8))-1);
463
0
        return 6;
464
465
0
    } else if (val < 0xfe) {
466
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
467
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
468
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
469
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
470
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
471
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
472
0
        *val_p = val & ((1LL<<(1+6*8))-1);
473
0
        return 7;
474
475
0
    } else if (val < 0xff) {
476
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
477
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
478
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
479
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
480
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
481
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
482
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
483
0
        *val_p = val & ((1LL<<(7*8))-1);
484
0
        return 8;
485
486
0
    } else {
487
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
488
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
489
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
490
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
491
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
492
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
493
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
494
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
495
0
        *val_p = val;
496
0
    }
497
498
0
    return 9;
499
0
}
500
501
3.09k
int ltf8_decode_crc(cram_fd *fd, int64_t *val_p, uint32_t *crc) {
502
3.09k
    unsigned char c[9];
503
3.09k
    int64_t val = hgetc(fd->fp);
504
3.09k
    if (val < 0)
505
13
        return -1;
506
507
3.07k
    c[0] = val;
508
509
3.07k
    if (val < 0x80) {
510
2.73k
        *val_p =   val;
511
2.73k
        *crc = crc32(*crc, c, 1);
512
2.73k
        return 1;
513
514
2.73k
    } else if (val < 0xc0) {
515
113
        int v = hgetc(fd->fp);
516
113
        if (v < 0)
517
12
            return -1;
518
101
        val = (val<<8) | (c[1]=v);
519
101
        *val_p = val & (((1LL<<(6+8)))-1);
520
101
        *crc = crc32(*crc, c, 2);
521
101
        return 2;
522
523
230
    } else if (val < 0xe0) {
524
36
        if (hread(fd->fp, &c[1], 2) < 2)
525
4
            return -1;
526
32
        val = (val<<8) | c[1];
527
32
        val = (val<<8) | c[2];
528
32
        *val_p = val & ((1LL<<(5+2*8))-1);
529
32
        *crc = crc32(*crc, c, 3);
530
32
        return 3;
531
532
194
    } else if (val < 0xf0) {
533
31
        if (hread(fd->fp, &c[1], 3) < 3)
534
4
            return -1;
535
27
        val = (val<<8) | c[1];
536
27
        val = (val<<8) | c[2];
537
27
        val = (val<<8) | c[3];
538
27
        *val_p = val & ((1LL<<(4+3*8))-1);
539
27
        *crc = crc32(*crc, c, 4);
540
27
        return 4;
541
542
163
    } else if (val < 0xf8) {
543
47
        if (hread(fd->fp, &c[1], 4) < 4)
544
4
            return -1;
545
43
        val = (val<<8) | c[1];
546
43
        val = (val<<8) | c[2];
547
43
        val = (val<<8) | c[3];
548
43
        val = (val<<8) | c[4];
549
43
        *val_p = val & ((1LL<<(3+4*8))-1);
550
43
        *crc = crc32(*crc, c, 5);
551
43
        return 5;
552
553
116
    } else if (val < 0xfc) {
554
16
        if (hread(fd->fp, &c[1], 5) < 5)
555
4
            return -1;
556
12
        val = (val<<8) | c[1];
557
12
        val = (val<<8) | c[2];
558
12
        val = (val<<8) | c[3];
559
12
        val = (val<<8) | c[4];
560
12
        val = (val<<8) | c[5];
561
12
        *val_p = val & ((1LL<<(2+5*8))-1);
562
12
        *crc = crc32(*crc, c, 6);
563
12
        return 6;
564
565
100
    } else if (val < 0xfe) {
566
16
        if (hread(fd->fp, &c[1], 6) < 6)
567
3
            return -1;
568
13
        val = (val<<8) | c[1];
569
13
        val = (val<<8) | c[2];
570
13
        val = (val<<8) | c[3];
571
13
        val = (val<<8) | c[4];
572
13
        val = (val<<8) | c[5];
573
13
        val = (val<<8) | c[6];
574
13
        *val_p = val & ((1LL<<(1+6*8))-1);
575
13
        *crc = crc32(*crc, c, 7);
576
13
        return 7;
577
578
84
    } else if (val < 0xff) {
579
12
        uint64_t uval = val;
580
12
        if (hread(fd->fp, &c[1], 7) < 7)
581
1
            return -1;
582
11
        uval = (uval<<8) | c[1];
583
11
        uval = (uval<<8) | c[2];
584
11
        uval = (uval<<8) | c[3];
585
11
        uval = (uval<<8) | c[4];
586
11
        uval = (uval<<8) | c[5];
587
11
        uval = (uval<<8) | c[6];
588
11
        uval = (uval<<8) | c[7];
589
11
        *val_p = uval & ((1ULL<<(7*8))-1);
590
11
        *crc = crc32(*crc, c, 8);
591
11
        return 8;
592
593
72
    } else {
594
72
        uint64_t uval;
595
72
        if (hread(fd->fp, &c[1], 8) < 8)
596
5
            return -1;
597
67
        uval =             c[1];
598
67
        uval = (uval<<8) | c[2];
599
67
        uval = (uval<<8) | c[3];
600
67
        uval = (uval<<8) | c[4];
601
67
        uval = (uval<<8) | c[5];
602
67
        uval = (uval<<8) | c[6];
603
67
        uval = (uval<<8) | c[7];
604
67
        uval = (uval<<8) | c[8];
605
67
        *crc = crc32(*crc, c, 9);
606
        // Avoid implementation-defined behaviour on negative values
607
67
        *val_p = c[1] < 0x80 ? (int64_t) uval : -((int64_t) (0xffffffffffffffffULL - uval)) - 1;
608
67
    }
609
610
67
    return 9;
611
3.07k
}
612
613
/*
614
 * Pushes a value in ITF8 format onto the end of a block.
615
 * This shouldn't be used for high-volume data as it is not the fastest
616
 * method.
617
 *
618
 * Returns the number of bytes written
619
 */
620
17.7M
int itf8_put_blk(cram_block *blk, int32_t val) {
621
17.7M
    char buf[5];
622
17.7M
    int sz;
623
624
17.7M
    sz = itf8_put(buf, val);
625
17.7M
    BLOCK_APPEND(blk, buf, sz);
626
17.7M
    return sz;
627
628
0
 block_err:
629
0
    return -1;
630
17.7M
}
631
632
0
int ltf8_put_blk(cram_block *blk, int64_t val) {
633
0
    char buf[9];
634
0
    int sz;
635
636
0
    sz = ltf8_put(buf, val);
637
0
    BLOCK_APPEND(blk, buf, sz);
638
0
    return sz;
639
640
0
 block_err:
641
0
    return -1;
642
0
}
643
644
289k
static int64_t safe_itf8_get(char **cp, const char *endp, int *err) {
645
289k
    const unsigned char *up = (unsigned char *)*cp;
646
647
289k
    if (endp && endp - *cp < 5 &&
648
289k
        (*cp >= endp || endp - *cp < itf8_bytes[up[0]>>4])) {
649
34.4k
        if (err) *err = 1;
650
34.4k
        return 0;
651
34.4k
    }
652
653
255k
    if (up[0] < 0x80) {
654
231k
        (*cp)++;
655
231k
        return up[0];
656
231k
    } else if (up[0] < 0xc0) {
657
11.2k
        (*cp)+=2;
658
11.2k
        return ((up[0] <<8) |  up[1])                           & 0x3fff;
659
12.4k
    } else if (up[0] < 0xe0) {
660
2.29k
        (*cp)+=3;
661
2.29k
        return ((up[0]<<16) | (up[1]<< 8) |  up[2])             & 0x1fffff;
662
10.1k
    } else if (up[0] < 0xf0) {
663
2.67k
        (*cp)+=4;
664
2.67k
        uint32_t uv = (((uint32_t)up[0]<<24) | (up[1]<<16) | (up[2]<<8) | up[3]) & 0x0fffffff;
665
2.67k
        return (int32_t)uv;
666
7.44k
    } else {
667
7.44k
        (*cp)+=5;
668
7.44k
        uint32_t uv = (((uint32_t)up[0] & 0x0f)<<28) | (up[1]<<20) | (up[2]<<12) | (up[3]<<4) | (up[4] & 0x0f);
669
7.44k
        return (int32_t)uv;
670
7.44k
    }
671
255k
}
672
673
1.20k
static int64_t safe_ltf8_get(char **cp, const char *endp, int *err) {
674
1.20k
    unsigned char *up = (unsigned char *)*cp;
675
676
1.20k
    if (endp && endp - *cp < 9 &&
677
1.20k
        (*cp >= endp || endp - *cp < ltf8_bytes[up[0]])) {
678
468
        if (err) *err = 1;
679
468
        return 0;
680
468
    }
681
682
740
    if (up[0] < 0x80) {
683
336
        (*cp)++;
684
336
        return up[0];
685
404
    } else if (up[0] < 0xc0) {
686
54
        (*cp)+=2;
687
54
        return (((uint64_t)up[0]<< 8) |
688
54
                 (uint64_t)up[1]) & (((1LL<<(6+8)))-1);
689
350
    } else if (up[0] < 0xe0) {
690
89
        (*cp)+=3;
691
89
        return (((uint64_t)up[0]<<16) |
692
89
                ((uint64_t)up[1]<< 8) |
693
89
                 (uint64_t)up[2]) & ((1LL<<(5+2*8))-1);
694
261
    } else if (up[0] < 0xf0) {
695
60
        (*cp)+=4;
696
60
        return (((uint64_t)up[0]<<24) |
697
60
                ((uint64_t)up[1]<<16) |
698
60
                ((uint64_t)up[2]<< 8) |
699
60
                 (uint64_t)up[3]) & ((1LL<<(4+3*8))-1);
700
201
    } else if (up[0] < 0xf8) {
701
123
        (*cp)+=5;
702
123
        return (((uint64_t)up[0]<<32) |
703
123
                ((uint64_t)up[1]<<24) |
704
123
                ((uint64_t)up[2]<<16) |
705
123
                ((uint64_t)up[3]<< 8) |
706
123
                 (uint64_t)up[4]) & ((1LL<<(3+4*8))-1);
707
123
    } else if (up[0] < 0xfc) {
708
9
        (*cp)+=6;
709
9
        return (((uint64_t)up[0]<<40) |
710
9
                ((uint64_t)up[1]<<32) |
711
9
                ((uint64_t)up[2]<<24) |
712
9
                ((uint64_t)up[3]<<16) |
713
9
                ((uint64_t)up[4]<< 8) |
714
9
                 (uint64_t)up[5]) & ((1LL<<(2+5*8))-1);
715
69
    } else if (up[0] < 0xfe) {
716
6
        (*cp)+=7;
717
6
        return (((uint64_t)up[0]<<48) |
718
6
                ((uint64_t)up[1]<<40) |
719
6
                ((uint64_t)up[2]<<32) |
720
6
                ((uint64_t)up[3]<<24) |
721
6
                ((uint64_t)up[4]<<16) |
722
6
                ((uint64_t)up[5]<< 8) |
723
6
                 (uint64_t)up[6]) & ((1LL<<(1+6*8))-1);
724
63
    } else if (up[0] < 0xff) {
725
3
        (*cp)+=8;
726
3
        return (((uint64_t)up[1]<<48) |
727
3
                ((uint64_t)up[2]<<40) |
728
3
                ((uint64_t)up[3]<<32) |
729
3
                ((uint64_t)up[4]<<24) |
730
3
                ((uint64_t)up[5]<<16) |
731
3
                ((uint64_t)up[6]<< 8) |
732
3
                 (uint64_t)up[7]) & ((1LL<<(7*8))-1);
733
60
    } else {
734
60
        (*cp)+=9;
735
60
        return (((uint64_t)up[1]<<56) |
736
60
                ((uint64_t)up[2]<<48) |
737
60
                ((uint64_t)up[3]<<40) |
738
60
                ((uint64_t)up[4]<<32) |
739
60
                ((uint64_t)up[5]<<24) |
740
60
                ((uint64_t)up[6]<<16) |
741
60
                ((uint64_t)up[7]<< 8) |
742
60
                 (uint64_t)up[8]);
743
60
    }
744
740
}
745
746
// Wrapper for now
747
9.52M
static int safe_itf8_put(char *cp, char *cp_end, int32_t val) {
748
9.52M
    return itf8_put(cp, val);
749
9.52M
}
750
751
233k
static int safe_ltf8_put(char *cp, char *cp_end, int64_t val) {
752
233k
    return ltf8_put(cp, val);
753
233k
}
754
755
2.11M
static int itf8_size(int64_t v) {
756
2.11M
    return ((!((v)&~0x7f))?1:(!((v)&~0x3fff))?2:(!((v)&~0x1fffff))?3:(!((v)&~0xfffffff))?4:5);
757
2.11M
}
758
759
//-----------------------------------------------------------------------------
760
761
// CRAM v4.0 onwards uses a different variable sized integer encoding
762
// that is size agnostic.
763
764
// Local interface to varint.h inline version, so we can use in func ptr.
765
// Note a lot of these use the unsigned interface but take signed int64_t.
766
// This is because the old CRAM ITF8 inteface had signed -1 as unsigned
767
// 0xffffffff.
768
6.99k
static int uint7_size(int64_t v) {
769
6.99k
    return var_size_u64(v);
770
6.99k
}
771
772
4.07k
static int64_t uint7_get_32(char **cp, const char *endp, int *err) {
773
4.07k
    uint32_t val;
774
4.07k
    int nb = var_get_u32((uint8_t *)(*cp), (const uint8_t *)endp, &val);
775
4.07k
    (*cp) += nb;
776
4.07k
    if (!nb && err) *err = 1;
777
4.07k
    return val;
778
4.07k
}
779
780
0
static int64_t sint7_get_32(char **cp, const char *endp, int *err) {
781
0
    int32_t val;
782
0
    int nb = var_get_s32((uint8_t *)(*cp), (const uint8_t *)endp, &val);
783
0
    (*cp) += nb;
784
0
    if (!nb && err) *err = 1;
785
0
    return val;
786
0
}
787
788
6.35M
static int64_t uint7_get_64(char **cp, const char *endp, int *err) {
789
6.35M
    uint64_t val;
790
6.35M
    int nb = var_get_u64((uint8_t *)(*cp), (const uint8_t *)endp, &val);
791
6.35M
    (*cp) += nb;
792
6.35M
    if (!nb && err) *err = 1;
793
6.35M
    return val;
794
6.35M
}
795
796
84
static int64_t sint7_get_64(char **cp, const char *endp, int *err) {
797
84
    int64_t val;
798
84
    int nb = var_get_s64((uint8_t *)(*cp), (const uint8_t *)endp, &val);
799
84
    (*cp) += nb;
800
84
    if (!nb && err) *err = 1;
801
84
    return val;
802
84
}
803
804
0
static int uint7_put_32(char *cp, char *endp, int32_t val) {
805
0
    return var_put_u32((uint8_t *)cp, (uint8_t *)endp, val);
806
0
}
807
808
0
static int sint7_put_32(char *cp, char *endp, int32_t val) {
809
0
    return var_put_s32((uint8_t *)cp, (uint8_t *)endp, val);
810
0
}
811
812
0
static int uint7_put_64(char *cp, char *endp, int64_t val) {
813
0
    return var_put_u64((uint8_t *)cp, (uint8_t *)endp, val);
814
0
}
815
816
0
static int sint7_put_64(char *cp, char *endp, int64_t val) {
817
0
    return var_put_s64((uint8_t *)cp, (uint8_t *)endp, val);
818
0
}
819
820
// Put direct to to cram_block
821
0
static int uint7_put_blk_32(cram_block *blk, int32_t v) {
822
0
    uint8_t buf[10];
823
0
    int sz = var_put_u32(buf, buf+10, v);
824
0
    BLOCK_APPEND(blk, buf, sz);
825
0
    return sz;
826
827
0
 block_err:
828
0
    return -1;
829
0
}
830
831
0
static int sint7_put_blk_32(cram_block *blk, int32_t v) {
832
0
    uint8_t buf[10];
833
0
    int sz = var_put_s32(buf, buf+10, v);
834
0
    BLOCK_APPEND(blk, buf, sz);
835
0
    return sz;
836
837
0
 block_err:
838
0
    return -1;
839
0
}
840
841
0
static int uint7_put_blk_64(cram_block *blk, int64_t v) {
842
0
    uint8_t buf[10];
843
0
    int sz = var_put_u64(buf, buf+10, v);
844
0
    BLOCK_APPEND(blk, buf, sz);
845
0
    return sz;
846
847
0
 block_err:
848
0
    return -1;
849
0
}
850
851
0
static int sint7_put_blk_64(cram_block *blk, int64_t v) {
852
0
    uint8_t buf[10];
853
0
    int sz = var_put_s64(buf, buf+10, v);
854
0
    BLOCK_APPEND(blk, buf, sz);
855
0
    return sz;
856
857
0
 block_err:
858
0
    return -1;
859
0
}
860
861
// Decode 32-bits with CRC update from cram_fd
862
27.8k
static int uint7_decode_crc32(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
863
27.8k
    uint8_t b[5], i = 0;
864
27.8k
    int c;
865
27.8k
    uint32_t v = 0;
866
867
#ifdef VARINT2
868
    b[0] = hgetc(fd->fp);
869
    if (b[0] < 177) {
870
    } else if (b[0] < 241) {
871
        b[1] = hgetc(fd->fp);
872
    } else if (b[0] < 249) {
873
        b[1] = hgetc(fd->fp);
874
        b[2] = hgetc(fd->fp);
875
    } else {
876
        int n = b[0]+2, z = 1;
877
        while (n-- >= 249)
878
            b[z++] = hgetc(fd->fp);
879
    }
880
    i = var_get_u32(b, NULL, &v);
881
#else
882
//    // Little endian
883
//    int s = 0;
884
//    do {
885
//        b[i++] = c = hgetc(fd->fp);
886
//        if (c < 0)
887
//            return -1;
888
//        v |= (c & 0x7f) << s;
889
//      s += 7;
890
//    } while (i < 5 && (c & 0x80));
891
892
    // Big endian, see also htscodecs/varint.h
893
31.8k
    do {
894
31.8k
        b[i++] = c = hgetc(fd->fp);
895
31.8k
        if (c < 0)
896
99
            return -1;
897
31.7k
        v = (v<<7) | (c & 0x7f);
898
31.7k
    } while (i < 5 && (c & 0x80));
899
27.7k
#endif
900
27.7k
    *crc = crc32(*crc, b, i);
901
902
27.7k
    *val_p = v;
903
27.7k
    return i;
904
27.8k
}
905
906
// Decode 32-bits with CRC update from cram_fd
907
3.30k
static int sint7_decode_crc32(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
908
3.30k
    uint8_t b[5], i = 0;
909
3.30k
    int c;
910
3.30k
    uint32_t v = 0;
911
912
#ifdef VARINT2
913
    b[0] = hgetc(fd->fp);
914
    if (b[0] < 177) {
915
    } else if (b[0] < 241) {
916
        b[1] = hgetc(fd->fp);
917
    } else if (b[0] < 249) {
918
        b[1] = hgetc(fd->fp);
919
        b[2] = hgetc(fd->fp);
920
    } else {
921
        int n = b[0]+2, z = 1;
922
        while (n-- >= 249)
923
            b[z++] = hgetc(fd->fp);
924
    }
925
    i = var_get_u32(b, NULL, &v);
926
#else
927
//    // Little endian
928
//    int s = 0;
929
//    do {
930
//        b[i++] = c = hgetc(fd->fp);
931
//        if (c < 0)
932
//            return -1;
933
//        v |= (c & 0x7f) << s;
934
//      s += 7;
935
//    } while (i < 5 && (c & 0x80));
936
937
    // Big endian, see also htscodecs/varint.h
938
3.97k
    do {
939
3.97k
        b[i++] = c = hgetc(fd->fp);
940
3.97k
        if (c < 0)
941
16
            return -1;
942
3.95k
        v = (v<<7) | (c & 0x7f);
943
3.95k
    } while (i < 5 && (c & 0x80));
944
3.29k
#endif
945
3.29k
    *crc = crc32(*crc, b, i);
946
947
3.29k
    *val_p = (v>>1) ^ -(v&1);
948
3.29k
    return i;
949
3.30k
}
950
951
952
// Decode 64-bits with CRC update from cram_fd
953
13.0k
static int uint7_decode_crc64(cram_fd *fd, int64_t *val_p, uint32_t *crc) {
954
13.0k
    uint8_t b[10], i = 0;
955
13.0k
    int c;
956
13.0k
    uint64_t v = 0;
957
958
#ifdef VARINT2
959
    b[0] = hgetc(fd->fp);
960
    if (b[0] < 177) {
961
    } else if (b[0] < 241) {
962
        b[1] = hgetc(fd->fp);
963
    } else if (b[0] < 249) {
964
        b[1] = hgetc(fd->fp);
965
        b[2] = hgetc(fd->fp);
966
    } else {
967
        int n = b[0]+2, z = 1;
968
        while (n-- >= 249)
969
            b[z++] = hgetc(fd->fp);
970
    }
971
    i = var_get_u64(b, NULL, &v);
972
#else
973
//    // Little endian
974
//    int s = 0;
975
//    do {
976
//        b[i++] = c = hgetc(fd->fp);
977
//        if (c < 0)
978
//            return -1;
979
//        v |= (c & 0x7f) << s;
980
//      s += 7;
981
//    } while (i < 10 && (c & 0x80));
982
983
    // Big endian, see also htscodecs/varint.h
984
14.1k
    do {
985
14.1k
        b[i++] = c = hgetc(fd->fp);
986
14.1k
        if (c < 0)
987
40
            return -1;
988
14.1k
        v = (v<<7) | (c & 0x7f);
989
14.1k
    } while (i < 5 && (c & 0x80));
990
13.0k
#endif
991
13.0k
    *crc = crc32(*crc, b, i);
992
993
13.0k
    *val_p = v;
994
13.0k
    return i;
995
13.0k
}
996
997
//-----------------------------------------------------------------------------
998
999
/*
1000
 * Decodes a 32-bit little endian value from fd and stores in val.
1001
 *
1002
 * Returns the number of bytes read on success
1003
 *         -1 on failure
1004
 */
1005
19.0k
static int int32_decode(cram_fd *fd, int32_t *val) {
1006
19.0k
    int32_t i;
1007
19.0k
    if (4 != hread(fd->fp, &i, 4))
1008
52
        return -1;
1009
1010
19.0k
    *val = le_int4(i);
1011
19.0k
    return 4;
1012
19.0k
}
1013
1014
/*
1015
 * Encodes a 32-bit little endian value 'val' and writes to fd.
1016
 *
1017
 * Returns the number of bytes written on success
1018
 *         -1 on failure
1019
 */
1020
555k
static int int32_encode(cram_fd *fd, int32_t val) {
1021
555k
    uint32_t v = le_int4(val);
1022
555k
    if (4 != hwrite(fd->fp, &v, 4))
1023
0
        return -1;
1024
1025
555k
    return 4;
1026
555k
}
1027
1028
/* As int32_decoded/encode, but from/to blocks instead of cram_fd */
1029
859
int int32_get_blk(cram_block *b, int32_t *val) {
1030
859
    if (b->uncomp_size - BLOCK_SIZE(b) < 4)
1031
84
        return -1;
1032
1033
775
    uint32_t v =
1034
775
         ((uint32_t) b->data[b->byte  ])        |
1035
775
        (((uint32_t) b->data[b->byte+1]) <<  8) |
1036
775
        (((uint32_t) b->data[b->byte+2]) << 16) |
1037
775
        (((uint32_t) b->data[b->byte+3]) << 24);
1038
    // Avoid implementation-defined behaviour on negative values
1039
775
    *val = v < 0x80000000U ? (int32_t) v : -((int32_t) (0xffffffffU - v)) - 1;
1040
775
    BLOCK_SIZE(b) += 4;
1041
775
    return 4;
1042
859
}
1043
1044
/* As int32_decoded/encode, but from/to blocks instead of cram_fd */
1045
10.7k
int int32_put_blk(cram_block *b, int32_t val) {
1046
10.7k
    unsigned char cp[4];
1047
10.7k
    uint32_t v = val;
1048
10.7k
    cp[0] = ( v      & 0xff);
1049
10.7k
    cp[1] = ((v>>8)  & 0xff);
1050
10.7k
    cp[2] = ((v>>16) & 0xff);
1051
10.7k
    cp[3] = ((v>>24) & 0xff);
1052
1053
10.7k
    BLOCK_APPEND(b, cp, 4);
1054
10.7k
    return 0;
1055
1056
0
 block_err:
1057
0
    return -1;
1058
10.7k
}
1059
1060
#ifdef HAVE_LIBDEFLATE
1061
/* ----------------------------------------------------------------------
1062
 * libdeflate compression code, with interface to match
1063
 * zlib_mem_{in,de}flate for simplicity elsewhere.
1064
 */
1065
1066
// Named the same as the version that uses zlib as we always use libdeflate for
1067
// decompression when available.
1068
char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) {
1069
    struct libdeflate_decompressor *z = libdeflate_alloc_decompressor();
1070
    if (!z) {
1071
        hts_log_error("Call to libdeflate_alloc_decompressor failed");
1072
        return NULL;
1073
    }
1074
1075
    uint8_t *data = NULL, *new_data;
1076
    if (!*size)
1077
        *size = csize*2;
1078
    for(;;) {
1079
        new_data = realloc(data, *size);
1080
        if (!new_data) {
1081
            hts_log_error("Memory allocation failure");
1082
            goto fail;
1083
        }
1084
        data = new_data;
1085
1086
        int ret = libdeflate_gzip_decompress(z, cdata, csize, data, *size, size);
1087
1088
        // Auto grow output buffer size if needed and try again.
1089
        // Fortunately for all bar one call of this we know the size already.
1090
        if (ret == LIBDEFLATE_INSUFFICIENT_SPACE) {
1091
            (*size) *= 1.5;
1092
            continue;
1093
        }
1094
1095
        if (ret != LIBDEFLATE_SUCCESS) {
1096
            hts_log_error("Inflate operation failed: %d", ret);
1097
            goto fail;
1098
        } else {
1099
            break;
1100
        }
1101
    }
1102
1103
    libdeflate_free_decompressor(z);
1104
    return (char *)data;
1105
1106
 fail:
1107
    libdeflate_free_decompressor(z);
1108
    free(data);
1109
    return NULL;
1110
}
1111
1112
// Named differently as we use both zlib/libdeflate for compression.
1113
static char *libdeflate_deflate(char *data, size_t size, size_t *cdata_size,
1114
                                int level, int strat) {
1115
    level = level > 0 ? level : 6; // libdeflate doesn't honour -1 as default
1116
    level *= 1.23;     // NB levels go up to 12 here; 5 onwards is +1
1117
    level += level>=8; // 5,6,7->6,7,8  8->10  9->12
1118
    if (level > 12) level = 12;
1119
1120
    if (strat == Z_RLE) // not supported by libdeflate
1121
        level = 1;
1122
1123
    struct libdeflate_compressor *z = libdeflate_alloc_compressor(level);
1124
    if (!z) {
1125
        hts_log_error("Call to libdeflate_alloc_compressor failed");
1126
        return NULL;
1127
    }
1128
1129
    unsigned char *cdata = NULL; /* Compressed output */
1130
    size_t cdata_alloc;
1131
    cdata = malloc(cdata_alloc = size*1.05+100);
1132
    if (!cdata) {
1133
        hts_log_error("Memory allocation failure");
1134
        libdeflate_free_compressor(z);
1135
        return NULL;
1136
    }
1137
1138
    *cdata_size = libdeflate_gzip_compress(z, data, size, cdata, cdata_alloc);
1139
    libdeflate_free_compressor(z);
1140
1141
    if (*cdata_size == 0) {
1142
        hts_log_error("Call to libdeflate_gzip_compress failed");
1143
        free(cdata);
1144
        return NULL;
1145
    }
1146
1147
    return (char *)cdata;
1148
}
1149
1150
#else
1151
1152
/* ----------------------------------------------------------------------
1153
 * zlib compression code - from Gap5's tg_iface_g.c
1154
 * They're static here as they're only used within the cram_compress_block
1155
 * and cram_uncompress_block functions, which are the external interface.
1156
 */
1157
15
char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) {
1158
15
    z_stream s;
1159
15
    unsigned char *data = NULL; /* Uncompressed output */
1160
15
    int data_alloc = 0;
1161
15
    int err;
1162
1163
    /* Starting point at uncompressed size, and scale after that */
1164
15
    data = malloc(data_alloc = csize*1.2+100);
1165
15
    if (!data)
1166
0
        return NULL;
1167
1168
    /* Initialise zlib stream */
1169
15
    s.zalloc = Z_NULL; /* use default allocation functions */
1170
15
    s.zfree  = Z_NULL;
1171
15
    s.opaque = Z_NULL;
1172
15
    s.next_in  = (unsigned char *)cdata;
1173
15
    s.avail_in = csize;
1174
15
    s.total_in = 0;
1175
15
    s.next_out  = data;
1176
15
    s.avail_out = data_alloc;
1177
15
    s.total_out = 0;
1178
1179
    //err = inflateInit(&s);
1180
15
    err = inflateInit2(&s, 15 + 32);
1181
15
    if (err != Z_OK) {
1182
0
        hts_log_error("Call to zlib inflateInit failed: %s", s.msg);
1183
0
        free(data);
1184
0
        return NULL;
1185
0
    }
1186
1187
    /* Decode to 'data' array */
1188
22
    for (;s.avail_in;) {
1189
13
        unsigned char *data_tmp;
1190
13
        int alloc_inc;
1191
1192
13
        s.next_out = &data[s.total_out];
1193
13
        err = inflate(&s, Z_NO_FLUSH);
1194
13
        if (err == Z_STREAM_END)
1195
0
            break;
1196
1197
13
        if (err != Z_OK) {
1198
6
            hts_log_error("Call to zlib inflate failed: %s", s.msg);
1199
6
            free(data);
1200
6
            inflateEnd(&s);
1201
6
            return NULL;
1202
6
        }
1203
1204
        /* More to come, so realloc based on growth so far */
1205
7
        alloc_inc = (double)s.avail_in/s.total_in * s.total_out + 100;
1206
7
        data = realloc((data_tmp = data), data_alloc += alloc_inc);
1207
7
        if (!data) {
1208
0
            free(data_tmp);
1209
0
            inflateEnd(&s);
1210
0
            return NULL;
1211
0
        }
1212
7
        s.avail_out += alloc_inc;
1213
7
    }
1214
9
    inflateEnd(&s);
1215
1216
9
    *size = s.total_out;
1217
9
    return (char *)data;
1218
15
}
1219
#endif
1220
1221
#if !defined(HAVE_LIBDEFLATE) || LIBDEFLATE_VERSION_MAJOR < 1 || (LIBDEFLATE_VERSION_MAJOR ==  1 && LIBDEFLATE_VERSION_MINOR <= 8)
1222
static char *zlib_mem_deflate(char *data, size_t size, size_t *cdata_size,
1223
409k
                              int level, int strat) {
1224
409k
    z_stream s;
1225
409k
    unsigned char *cdata = NULL; /* Compressed output */
1226
409k
    int cdata_alloc = 0;
1227
409k
    int cdata_pos = 0;
1228
409k
    int err;
1229
1230
409k
    cdata = malloc(cdata_alloc = size*1.05+100);
1231
409k
    if (!cdata)
1232
0
        return NULL;
1233
409k
    cdata_pos = 0;
1234
1235
    /* Initialise zlib stream */
1236
409k
    s.zalloc = Z_NULL; /* use default allocation functions */
1237
409k
    s.zfree  = Z_NULL;
1238
409k
    s.opaque = Z_NULL;
1239
409k
    s.next_in  = (unsigned char *)data;
1240
409k
    s.avail_in = size;
1241
409k
    s.total_in = 0;
1242
409k
    s.next_out  = cdata;
1243
409k
    s.avail_out = cdata_alloc;
1244
409k
    s.total_out = 0;
1245
409k
    s.data_type = Z_BINARY;
1246
1247
409k
    err = deflateInit2(&s, level, Z_DEFLATED, 15|16, 9, strat);
1248
409k
    if (err != Z_OK) {
1249
0
        hts_log_error("Call to zlib deflateInit2 failed: %s", s.msg);
1250
0
        return NULL;
1251
0
    }
1252
1253
    /* Encode to 'cdata' array */
1254
818k
    for (;s.avail_in;) {
1255
409k
        s.next_out = &cdata[cdata_pos];
1256
409k
        s.avail_out = cdata_alloc - cdata_pos;
1257
409k
        if (cdata_alloc - cdata_pos <= 0) {
1258
0
            hts_log_error("Deflate produced larger output than expected");
1259
0
            return NULL;
1260
0
        }
1261
409k
        err = deflate(&s, Z_NO_FLUSH);
1262
409k
        cdata_pos = cdata_alloc - s.avail_out;
1263
409k
        if (err != Z_OK) {
1264
0
            hts_log_error("Call to zlib deflate failed: %s", s.msg);
1265
0
            break;
1266
0
        }
1267
409k
    }
1268
409k
    if (deflate(&s, Z_FINISH) != Z_STREAM_END) {
1269
0
        hts_log_error("Call to zlib deflate failed: %s", s.msg);
1270
0
    }
1271
409k
    *cdata_size = s.total_out;
1272
1273
409k
    if (deflateEnd(&s) != Z_OK) {
1274
0
        hts_log_error("Call to zlib deflate failed: %s", s.msg);
1275
0
    }
1276
409k
    return (char *)cdata;
1277
409k
}
1278
#endif
1279
1280
#ifdef HAVE_LIBLZMA
1281
/* ------------------------------------------------------------------------ */
1282
/*
1283
 * Data compression routines using liblzma (xz)
1284
 *
1285
 * On a test set this shrunk the main db from 136157104 bytes to 114796168, but
1286
 * caused tg_index to grow from 2m43.707s to 15m3.961s. Exporting as bfastq
1287
 * went from 18.3s to 36.3s. So decompression suffers too, but not as bad
1288
 * as compression times.
1289
 *
1290
 * For now we disable this functionality. If it's to be reenabled make sure you
1291
 * improve the mem_inflate implementation as it's just a test hack at the
1292
 * moment.
1293
 */
1294
1295
static char *lzma_mem_deflate(char *data, size_t size, size_t *cdata_size,
1296
0
                              int level) {
1297
0
    char *out;
1298
0
    size_t out_size = lzma_stream_buffer_bound(size);
1299
0
    *cdata_size = 0;
1300
1301
0
    out = malloc(out_size);
1302
1303
    /* Single call compression */
1304
0
    if (LZMA_OK != lzma_easy_buffer_encode(level, LZMA_CHECK_CRC32, NULL,
1305
0
                                           (uint8_t *)data, size,
1306
0
                                           (uint8_t *)out, cdata_size,
1307
0
                                           out_size))
1308
0
        return NULL;
1309
1310
0
    return out;
1311
0
}
1312
1313
14
static char *lzma_mem_inflate(char *cdata, size_t csize, size_t *size) {
1314
14
    lzma_stream strm = LZMA_STREAM_INIT;
1315
14
    size_t out_size = 0, out_pos = 0;
1316
14
    char *out = NULL, *new_out;
1317
14
    int r;
1318
1319
    /* Initiate the decoder */
1320
14
    if (LZMA_OK != lzma_stream_decoder(&strm, lzma_easy_decoder_memusage(9), 0))
1321
0
        return NULL;
1322
1323
    /* Decode loop */
1324
14
    strm.avail_in = csize;
1325
14
    strm.next_in = (uint8_t *)cdata;
1326
1327
20
    for (;strm.avail_in;) {
1328
12
        if (strm.avail_in > out_size - out_pos) {
1329
12
            out_size += strm.avail_in * 4 + 32768;
1330
12
            new_out = realloc(out, out_size);
1331
12
            if (!new_out)
1332
0
                goto fail;
1333
12
            out = new_out;
1334
12
        }
1335
12
        strm.avail_out = out_size - out_pos;
1336
12
        strm.next_out = (uint8_t *)&out[out_pos];
1337
1338
12
        r = lzma_code(&strm, LZMA_RUN);
1339
12
        if (LZMA_OK != r && LZMA_STREAM_END != r) {
1340
6
            hts_log_error("LZMA decode failure (error %d)", r);
1341
6
            goto fail;
1342
6
        }
1343
1344
6
        out_pos = strm.total_out;
1345
1346
6
        if (r == LZMA_STREAM_END)
1347
0
            break;
1348
6
    }
1349
1350
    /* finish up any unflushed data; necessary? */
1351
8
    r = lzma_code(&strm, LZMA_FINISH);
1352
8
    if (r != LZMA_OK && r != LZMA_STREAM_END) {
1353
0
        hts_log_error("Call to lzma_code failed with error %d", r);
1354
0
        goto fail;
1355
0
    }
1356
1357
8
    new_out = realloc(out, strm.total_out > 0 ? strm.total_out : 1);
1358
8
    if (new_out)
1359
8
        out = new_out;
1360
8
    *size = strm.total_out;
1361
1362
8
    lzma_end(&strm);
1363
1364
8
    return out;
1365
1366
6
 fail:
1367
6
    lzma_end(&strm);
1368
6
    free(out);
1369
6
    return NULL;
1370
8
}
1371
#endif
1372
1373
/* ----------------------------------------------------------------------
1374
 * CRAM blocks - the dynamically growable data block. We have code to
1375
 * create, update, (un)compress and read/write.
1376
 *
1377
 * These are derived from the deflate_interlaced.c blocks, but with the
1378
 * CRAM extension of content types and IDs.
1379
 */
1380
1381
/*
1382
 * Allocates a new cram_block structure with a specified content_type and
1383
 * id.
1384
 *
1385
 * Returns block pointer on success
1386
 *         NULL on failure
1387
 */
1388
cram_block *cram_new_block(enum cram_content_type content_type,
1389
1.67M
                           int content_id) {
1390
1.67M
    cram_block *b = malloc(sizeof(*b));
1391
1.67M
    if (!b)
1392
0
        return NULL;
1393
1.67M
    b->method = b->orig_method = RAW;
1394
1.67M
    b->content_type = content_type;
1395
1.67M
    b->content_id = content_id;
1396
1.67M
    b->comp_size = 0;
1397
1.67M
    b->uncomp_size = 0;
1398
1.67M
    b->data = NULL;
1399
1.67M
    b->alloc = 0;
1400
1.67M
    b->byte = 0;
1401
1.67M
    b->bit = 7; // MSB
1402
1.67M
    b->crc32 = 0;
1403
1.67M
    b->idx = 0;
1404
1.67M
    b->m = NULL;
1405
1406
1.67M
    return b;
1407
1.67M
}
1408
1409
/*
1410
 * Reads a block from a cram file.
1411
 * Returns cram_block pointer on success.
1412
 *         NULL on failure
1413
 */
1414
16.0k
cram_block *cram_read_block(cram_fd *fd) {
1415
16.0k
    cram_block *b = malloc(sizeof(*b));
1416
16.0k
    unsigned char c;
1417
16.0k
    uint32_t crc = 0;
1418
16.0k
    if (!b)
1419
0
        return NULL;
1420
1421
    //fprintf(stderr, "Block at %d\n", (int)ftell(fd->fp));
1422
1423
16.0k
    if (-1 == (b->method      = hgetc(fd->fp))) { free(b); return NULL; }
1424
16.0k
    c = b->method; crc = crc32(crc, &c, 1);
1425
16.0k
    if (-1 == (b->content_type= hgetc(fd->fp))) { free(b); return NULL; }
1426
15.9k
    c = b->content_type; crc = crc32(crc, &c, 1);
1427
15.9k
    if (-1 == fd->vv.varint_decode32_crc(fd, &b->content_id, &crc))  { free(b); return NULL; }
1428
15.9k
    if (-1 == fd->vv.varint_decode32_crc(fd, &b->comp_size, &crc))   { free(b); return NULL; }
1429
15.9k
    if (-1 == fd->vv.varint_decode32_crc(fd, &b->uncomp_size, &crc)) { free(b); return NULL; }
1430
1431
    //fprintf(stderr, "  method %d, ctype %d, cid %d, csize %d, ucsize %d\n",
1432
    //      b->method, b->content_type, b->content_id, b->comp_size, b->uncomp_size);
1433
1434
15.9k
    if (b->method == RAW) {
1435
6.84k
        if (b->uncomp_size < 0 || b->comp_size != b->uncomp_size) {
1436
125
            free(b);
1437
125
            return NULL;
1438
125
        }
1439
6.71k
        b->alloc = b->uncomp_size;
1440
6.71k
        if (!(b->data = malloc(b->uncomp_size))){ free(b); return NULL; }
1441
6.71k
        if (b->uncomp_size != hread(fd->fp, b->data, b->uncomp_size)) {
1442
47
            free(b->data);
1443
47
            free(b);
1444
47
            return NULL;
1445
47
        }
1446
9.08k
    } else {
1447
9.08k
        if (b->comp_size < 0 || b->uncomp_size < 0) {
1448
74
            free(b);
1449
74
            return NULL;
1450
74
        }
1451
9.01k
        b->alloc = b->comp_size;
1452
9.01k
        if (!(b->data = malloc(b->comp_size)))  { free(b); return NULL; }
1453
9.01k
        if (b->comp_size != hread(fd->fp, b->data, b->comp_size)) {
1454
92
            free(b->data);
1455
92
            free(b);
1456
92
            return NULL;
1457
92
        }
1458
9.01k
    }
1459
1460
15.5k
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
1461
3.00k
        if (-1 == int32_decode(fd, (int32_t *)&b->crc32)) {
1462
7
            free(b->data);
1463
7
            free(b);
1464
7
            return NULL;
1465
7
        }
1466
1467
3.00k
        b->crc32_checked = fd->ignore_md5;
1468
3.00k
        b->crc_part = crc;
1469
12.5k
    } else {
1470
12.5k
        b->crc32_checked = 1; // CRC not present
1471
12.5k
    }
1472
1473
15.5k
    b->orig_method = b->method;
1474
15.5k
    b->idx = 0;
1475
15.5k
    b->byte = 0;
1476
15.5k
    b->bit = 7; // MSB
1477
1478
15.5k
    return b;
1479
15.5k
}
1480
1481
1482
/*
1483
 * Computes the size of a cram block, including the block
1484
 * header itself.
1485
 */
1486
0
uint32_t cram_block_size(cram_block *b) {
1487
0
    unsigned char dat[100], *cp = dat;;
1488
0
    uint32_t sz;
1489
1490
0
    *cp++ = b->method;
1491
0
    *cp++ = b->content_type;
1492
0
    cp += itf8_put((char*)cp, b->content_id);
1493
0
    cp += itf8_put((char*)cp, b->comp_size);
1494
0
    cp += itf8_put((char*)cp, b->uncomp_size);
1495
1496
0
    sz = cp-dat + 4;
1497
0
    sz += b->method == RAW ? b->uncomp_size : b->comp_size;
1498
1499
0
    return sz;
1500
0
}
1501
1502
/*
1503
 * Writes a CRAM block.
1504
 * Returns 0 on success
1505
 *        -1 on failure
1506
 */
1507
555k
int cram_write_block(cram_fd *fd, cram_block *b) {
1508
555k
    char vardata[100];
1509
555k
    int vardata_o = 0;
1510
1511
555k
    assert(b->method != RAW || (b->comp_size == b->uncomp_size));
1512
1513
555k
    if (hputc(b->method,       fd->fp)  == EOF) return -1;
1514
555k
    if (hputc(b->content_type, fd->fp)  == EOF) return -1;
1515
555k
    vardata_o += fd->vv.varint_put32(vardata          , vardata+100, b->content_id);
1516
555k
    vardata_o += fd->vv.varint_put32(vardata+vardata_o, vardata+100, b->comp_size);
1517
555k
    vardata_o += fd->vv.varint_put32(vardata+vardata_o, vardata+100, b->uncomp_size);
1518
555k
    if (vardata_o != hwrite(fd->fp, vardata, vardata_o))
1519
0
        return -1;
1520
1521
555k
    if (b->data) {
1522
494k
        if (b->method == RAW) {
1523
427k
            if (b->uncomp_size != hwrite(fd->fp, b->data, b->uncomp_size))
1524
0
                return -1;
1525
427k
        } else {
1526
66.5k
            if (b->comp_size != hwrite(fd->fp, b->data, b->comp_size))
1527
0
                return -1;
1528
66.5k
        }
1529
494k
    } else {
1530
        // Absent blocks should be size 0
1531
61.6k
        assert(b->method == RAW && b->uncomp_size == 0);
1532
61.6k
    }
1533
1534
555k
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
1535
555k
        char dat[100], *cp = (char *)dat;
1536
555k
        uint32_t crc;
1537
1538
555k
        *cp++ = b->method;
1539
555k
        *cp++ = b->content_type;
1540
555k
        cp += fd->vv.varint_put32(cp, dat+100, b->content_id);
1541
555k
        cp += fd->vv.varint_put32(cp, dat+100, b->comp_size);
1542
555k
        cp += fd->vv.varint_put32(cp, dat+100, b->uncomp_size);
1543
555k
        crc = crc32(0L, (uc *)dat, cp-dat);
1544
1545
555k
        if (b->method == RAW) {
1546
489k
            b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->uncomp_size);
1547
489k
        } else {
1548
66.5k
            b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->comp_size);
1549
66.5k
        }
1550
1551
555k
        if (-1 == int32_encode(fd, b->crc32))
1552
0
            return -1;
1553
555k
    }
1554
1555
555k
    return 0;
1556
555k
}
1557
1558
/*
1559
 * Frees a CRAM block, deallocating internal data too.
1560
 */
1561
2.33M
void cram_free_block(cram_block *b) {
1562
2.33M
    if (!b)
1563
643k
        return;
1564
1.69M
    if (b->data)
1565
1.12M
        free(b->data);
1566
1.69M
    free(b);
1567
1.69M
}
1568
1569
/*
1570
 * Uncompresses a CRAM block, if compressed.
1571
 */
1572
10.3k
int cram_uncompress_block(cram_block *b) {
1573
10.3k
    char *uncomp;
1574
10.3k
    size_t uncomp_size = 0;
1575
1576
10.3k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
1577
    // Pretend the CRC was OK so the fuzzer doesn't have to get it right
1578
10.3k
    b->crc32_checked = 1;
1579
10.3k
#endif
1580
1581
10.3k
    if (b->crc32_checked == 0) {
1582
0
        uint32_t crc = crc32(b->crc_part, b->data ? b->data : (uc *)"", b->alloc);
1583
0
        b->crc32_checked = 1;
1584
0
        if (crc != b->crc32) {
1585
0
            hts_log_error("Block CRC32 failure");
1586
0
            return -1;
1587
0
        }
1588
0
    }
1589
1590
10.3k
    if (b->uncomp_size == 0) {
1591
        // blank block
1592
3.31k
        b->method = RAW;
1593
3.31k
        return 0;
1594
3.31k
    }
1595
7.04k
    assert(b->uncomp_size >= 0); // cram_read_block should ensure this
1596
1597
7.04k
    switch (b->method) {
1598
706
    case RAW:
1599
706
        return 0;
1600
1601
15
    case GZIP:
1602
15
        uncomp_size = b->uncomp_size;
1603
15
        uncomp = zlib_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
1604
1605
15
        if (!uncomp)
1606
6
            return -1;
1607
9
        if (uncomp_size != b->uncomp_size) {
1608
9
            free(uncomp);
1609
9
            return -1;
1610
9
        }
1611
0
        free(b->data);
1612
0
        b->data = (unsigned char *)uncomp;
1613
0
        b->alloc = uncomp_size;
1614
0
        b->method = RAW;
1615
0
        break;
1616
1617
0
#ifdef HAVE_LIBBZ2
1618
6
    case BZIP2: {
1619
6
        unsigned int usize = b->uncomp_size;
1620
6
        if (!(uncomp = malloc(usize)))
1621
0
            return -1;
1622
6
        if (BZ_OK != BZ2_bzBuffToBuffDecompress(uncomp, &usize,
1623
6
                                                (char *)b->data, b->comp_size,
1624
6
                                                0, 0)) {
1625
6
            free(uncomp);
1626
6
            return -1;
1627
6
        }
1628
0
        free(b->data);
1629
0
        b->data = (unsigned char *)uncomp;
1630
0
        b->alloc = usize;
1631
0
        b->method = RAW;
1632
0
        b->uncomp_size = usize; // Just in case it differs
1633
0
        break;
1634
6
    }
1635
#else
1636
    case BZIP2:
1637
        hts_log_error("Bzip2 compression is not compiled into this version. Please rebuild and try again");
1638
        return -1;
1639
#endif
1640
1641
0
#ifdef HAVE_LIBLZMA
1642
14
    case LZMA:
1643
14
        uncomp = lzma_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
1644
14
        if (!uncomp)
1645
6
            return -1;
1646
8
        if (uncomp_size != b->uncomp_size) {
1647
8
            free(uncomp);
1648
8
            return -1;
1649
8
        }
1650
0
        free(b->data);
1651
0
        b->data = (unsigned char *)uncomp;
1652
0
        b->alloc = uncomp_size;
1653
0
        b->method = RAW;
1654
0
        break;
1655
#else
1656
    case LZMA:
1657
        hts_log_error("Lzma compression is not compiled into this version. Please rebuild and try again");
1658
        return -1;
1659
        break;
1660
#endif
1661
1662
569
    case RANS: {
1663
569
        unsigned int usize = b->uncomp_size, usize2;
1664
569
        uncomp = (char *)rans_uncompress(b->data, b->comp_size, &usize2);
1665
569
        if (!uncomp)
1666
268
            return -1;
1667
301
        if (usize != usize2) {
1668
297
            free(uncomp);
1669
297
            return -1;
1670
297
        }
1671
4
        free(b->data);
1672
4
        b->data = (unsigned char *)uncomp;
1673
4
        b->alloc = usize2;
1674
4
        b->method = RAW;
1675
4
        b->uncomp_size = usize2; // Just in case it differs
1676
        //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1677
4
        break;
1678
301
    }
1679
1680
1.02k
    case FQZ: {
1681
1.02k
        uncomp_size = b->uncomp_size;
1682
1.02k
        uncomp = fqz_decompress((char *)b->data, b->comp_size, &uncomp_size, NULL, 0);
1683
1.02k
        if (!uncomp)
1684
742
            return -1;
1685
283
        free(b->data);
1686
283
        b->data = (unsigned char *)uncomp;
1687
283
        b->alloc = uncomp_size;
1688
283
        b->method = RAW;
1689
283
        b->uncomp_size = uncomp_size;
1690
283
        break;
1691
1.02k
    }
1692
1693
1.37k
    case RANS_PR0: {
1694
1.37k
        unsigned int usize = b->uncomp_size, usize2;
1695
1.37k
        uncomp = (char *)rans_uncompress_4x16(b->data, b->comp_size, &usize2);
1696
1.37k
        if (!uncomp)
1697
1.03k
            return -1;
1698
341
        if (usize != usize2) {
1699
235
            free(uncomp);
1700
235
            return -1;
1701
235
        }
1702
106
        b->orig_method = RANSPR;
1703
106
        free(b->data);
1704
106
        b->data = (unsigned char *)uncomp;
1705
106
        b->alloc = usize2;
1706
106
        b->method = RAW;
1707
106
        b->uncomp_size = usize2; // Just incase it differs
1708
        //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1709
106
        break;
1710
341
    }
1711
1712
732
    case ARITH_PR0: {
1713
732
        unsigned int usize = b->uncomp_size, usize2;
1714
732
        uncomp = (char *)arith_uncompress_to(b->data, b->comp_size, NULL, &usize2);
1715
732
        if (!uncomp)
1716
464
            return -1;
1717
268
        if (usize != usize2) {
1718
75
            free(uncomp);
1719
75
            return -1;
1720
75
        }
1721
193
        b->orig_method = ARITH;
1722
193
        free(b->data);
1723
193
        b->data = (unsigned char *)uncomp;
1724
193
        b->alloc = usize2;
1725
193
        b->method = RAW;
1726
193
        b->uncomp_size = usize2; // Just incase it differs
1727
        //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1728
193
        break;
1729
268
    }
1730
1731
2.58k
    case TOK3: {
1732
2.58k
        uint32_t out_len;
1733
2.58k
        uint8_t *cp = tok3_decode_names(b->data, b->comp_size, &out_len);
1734
2.58k
        if (!cp)
1735
2.44k
            return -1;
1736
137
        b->orig_method = TOK3;
1737
137
        b->method = RAW;
1738
137
        free(b->data);
1739
137
        b->data = cp;
1740
137
        b->alloc = out_len;
1741
137
        b->uncomp_size = out_len;
1742
137
        break;
1743
2.58k
    }
1744
1745
14
    default:
1746
14
        return -1;
1747
7.04k
    }
1748
1749
723
    return 0;
1750
7.04k
}
1751
1752
static char *cram_compress_by_method(cram_slice *s, char *in, size_t in_size,
1753
                                     int content_id, size_t *out_size,
1754
                                     enum cram_block_method_int method,
1755
1.08M
                                     int level, int strat) {
1756
1.08M
    switch (method) {
1757
128k
    case GZIP:
1758
204k
    case GZIP_RLE:
1759
409k
    case GZIP_1:
1760
        // Read names bizarrely benefit from zlib over libdeflate for
1761
        // mid-range compression levels.  Focusing purely of ratio or
1762
        // speed, libdeflate still wins.  It also seems to win for
1763
        // other data series too.
1764
        //
1765
        // Eg RN at level 5;  libdeflate=55.9MB  zlib=51.6MB
1766
#ifdef HAVE_LIBDEFLATE
1767
#  if (LIBDEFLATE_VERSION_MAJOR < 1 || (LIBDEFLATE_VERSION_MAJOR == 1 && LIBDEFLATE_VERSION_MINOR <= 8))
1768
        if (content_id == DS_RN && level >= 4 && level <= 7)
1769
            return zlib_mem_deflate(in, in_size, out_size, level, strat);
1770
        else
1771
#  endif
1772
            return libdeflate_deflate(in, in_size, out_size, level, strat);
1773
#else
1774
409k
        return zlib_mem_deflate(in, in_size, out_size, level, strat);
1775
0
#endif
1776
1777
0
    case BZIP2: {
1778
0
#ifdef HAVE_LIBBZ2
1779
0
        unsigned int comp_size = in_size*1.01 + 600;
1780
0
        char *comp = malloc(comp_size);
1781
0
        if (!comp)
1782
0
            return NULL;
1783
1784
0
        if (BZ_OK != BZ2_bzBuffToBuffCompress(comp, &comp_size,
1785
0
                                              in, in_size,
1786
0
                                              level, 0, 30)) {
1787
0
            free(comp);
1788
0
            return NULL;
1789
0
        }
1790
0
        *out_size = comp_size;
1791
0
        return comp;
1792
#else
1793
        return NULL;
1794
#endif
1795
0
    }
1796
1797
0
    case FQZ:
1798
0
    case FQZ_b:
1799
0
    case FQZ_c:
1800
0
    case FQZ_d: {
1801
        // Extract the necessary portion of the slice into an fqz_slice struct.
1802
        // These previously were the same thing, but this permits us to detach
1803
        // the codec from the rest of this CRAM implementation.
1804
0
        fqz_slice *f = malloc(2*s->hdr->num_records * sizeof(uint32_t) + sizeof(fqz_slice));
1805
0
        if (!f)
1806
0
            return NULL;
1807
0
        f->num_records = s->hdr->num_records;
1808
0
        f->len = (uint32_t *)(((char *)f) + sizeof(fqz_slice));
1809
0
        f->flags = f->len + s->hdr->num_records;
1810
0
        int i;
1811
0
        for (i = 0; i < s->hdr->num_records; i++) {
1812
0
            f->flags[i] = s->crecs[i].flags;
1813
0
            f->len[i] = (i+1 < s->hdr->num_records
1814
0
                         ? s->crecs[i+1].qual - s->crecs[i].qual
1815
0
                         : s->block[DS_QS]->uncomp_size - s->crecs[i].qual);
1816
0
        }
1817
0
        char *comp = fqz_compress(strat & 0xff /* cram vers */, f,
1818
0
                                  in, in_size, out_size, strat >> 8, NULL);
1819
0
        free(f);
1820
0
        return comp;
1821
0
    }
1822
1823
0
    case LZMA:
1824
0
#ifdef HAVE_LIBLZMA
1825
0
        return lzma_mem_deflate(in, in_size, out_size, level);
1826
#else
1827
        return NULL;
1828
#endif
1829
1830
0
    case RANS0:
1831
0
    case RANS1: {
1832
0
        unsigned int out_size_i;
1833
0
        unsigned char *cp;
1834
0
        cp = rans_compress((unsigned char *)in, in_size, &out_size_i,
1835
0
                           method == RANS0 ? 0 : 1);
1836
0
        *out_size = out_size_i;
1837
0
        return (char *)cp;
1838
0
    }
1839
1840
206k
    case RANS_PR0:
1841
283k
    case RANS_PR1:
1842
386k
    case RANS_PR64:
1843
462k
    case RANS_PR9:
1844
566k
    case RANS_PR128:
1845
566k
    case RANS_PR129:
1846
566k
    case RANS_PR192:
1847
642k
    case RANS_PR193: {
1848
642k
        unsigned int out_size_i;
1849
642k
        unsigned char *cp;
1850
1851
        // see enum cram_block. We map RANS_* methods to order bit-fields
1852
642k
        static int methmap[] = { 1, 64,9, 128,129, 192,193 };
1853
1854
642k
        int m = method == RANS_PR0 ? 0 : methmap[method - RANS_PR1];
1855
642k
        cp = rans_compress_4x16((unsigned char *)in, in_size, &out_size_i,
1856
642k
                                m | RANS_ORDER_SIMD_AUTO);
1857
642k
        *out_size = out_size_i;
1858
642k
        return (char *)cp;
1859
566k
    }
1860
1861
0
    case ARITH_PR0:
1862
0
    case ARITH_PR1:
1863
0
    case ARITH_PR64:
1864
0
    case ARITH_PR9:
1865
0
    case ARITH_PR128:
1866
0
    case ARITH_PR129:
1867
0
    case ARITH_PR192:
1868
0
    case ARITH_PR193: {
1869
0
        unsigned int out_size_i;
1870
0
        unsigned char *cp;
1871
1872
        // see enum cram_block. We map ARITH_* methods to order bit-fields
1873
0
        static int methmap[] = { 1, 64,9, 128,129, 192,193 };
1874
1875
0
        cp = arith_compress_to((unsigned char *)in, in_size, NULL, &out_size_i,
1876
0
                               method == ARITH_PR0 ? 0 : methmap[method - ARITH_PR1]);
1877
0
        *out_size = out_size_i;
1878
0
        return (char *)cp;
1879
0
    }
1880
1881
32.5k
    case TOK3:
1882
32.5k
    case TOKA: {
1883
32.5k
        int out_len;
1884
32.5k
        int lev = level;
1885
32.5k
        if (method == TOK3 && lev > 3)
1886
32.5k
            lev = 3;
1887
32.5k
        uint8_t *cp = tok3_encode_names(in, in_size, lev, strat, &out_len, NULL);
1888
32.5k
        *out_size = out_len;
1889
32.5k
        return (char *)cp;
1890
32.5k
    }
1891
1892
0
    case RAW:
1893
0
        break;
1894
1895
0
    default:
1896
0
        return NULL;
1897
1.08M
    }
1898
1899
0
    return NULL;
1900
1.08M
}
1901
1902
/*
1903
 * A copy of cram_compress_block2 with added recursion detection.
1904
 * This is only called for error handling where the auto-tuning has failed.
1905
 * The simplest way of doing this is recusion + an additional argument, but
1906
 * we didn't want to complicate the existing code hence this is static.
1907
 */
1908
static int cram_compress_block3(cram_fd *fd, cram_slice *s,
1909
                                cram_block *b, cram_metrics *metrics,
1910
                                int method, int level,
1911
953k
                                int recurse) {
1912
1913
953k
    if (!b)
1914
60.8k
        return 0;
1915
1916
892k
    int orig_method = method;
1917
892k
    char *comp = NULL;
1918
892k
    size_t comp_size = 0;
1919
892k
    int strat;
1920
1921
    // Internally we have parameterised methods that externally map
1922
    // to the same CRAM method value.
1923
    // See enum_cram_block_method_int in cram_structs.h.
1924
892k
    int methmap[] = {
1925
        // Externally defined values
1926
892k
        RAW, GZIP, BZIP2, LZMA, RANS, RANSPR, ARITH, FQZ, TOK3,
1927
1928
        // Reserved for possible expansion
1929
892k
        0, 0,
1930
1931
        // Internally parameterised versions matching back to above
1932
        // external values
1933
892k
        GZIP, GZIP,
1934
892k
        FQZ, FQZ, FQZ,
1935
892k
        RANS,
1936
892k
        RANSPR, RANSPR, RANSPR, RANSPR, RANSPR, RANSPR, RANSPR,
1937
892k
        TOK3,
1938
892k
        ARITH,  ARITH,  ARITH,  ARITH,  ARITH,  ARITH,  ARITH,
1939
892k
    };
1940
1941
892k
    if (b->method != RAW) {
1942
        // Maybe already compressed if s->block[0] was compressed and
1943
        // we have e.g. s->block[DS_BA] set to s->block[0] due to only
1944
        // one base type present and hence using E_HUFFMAN on block 0.
1945
        // A second explicit attempt to compress the same block then
1946
        // occurs.
1947
0
        return 0;
1948
0
    }
1949
1950
892k
    if (method == -1) {
1951
10.7k
        method = 1<<GZIP;
1952
10.7k
        if (fd->use_bz2)
1953
0
            method |= 1<<BZIP2;
1954
10.7k
        if (fd->use_lzma)
1955
0
            method |= 1<<LZMA;
1956
10.7k
    }
1957
1958
892k
    if (level == -1)
1959
10.7k
        level = fd->level;
1960
1961
    //fprintf(stderr, "IN: block %d, sz %d\n", b->content_id, b->uncomp_size);
1962
1963
892k
    if (method == RAW || level == 0 || b->uncomp_size == 0) {
1964
483k
        b->method = RAW;
1965
483k
        b->comp_size = b->uncomp_size;
1966
        //fprintf(stderr, "Skip block id %d\n", b->content_id);
1967
483k
        return 0;
1968
483k
    }
1969
1970
409k
#ifndef ABS
1971
409k
#    define ABS(a) ((a)>=0?(a):-(a))
1972
409k
#endif
1973
1974
409k
    if (metrics) {
1975
389k
        pthread_mutex_lock(&fd->metrics_lock);
1976
        // Sudden changes in size trigger a retrial.  These are mainly
1977
        // triggered when switching to sorted / unsorted, where the number
1978
        // of elements in a slice radically changes.
1979
        //
1980
        // We also get large fluctuations based on genome coordinate for
1981
        // e.g. SA:Z and SC series, but we consider the typical scale of
1982
        // delta between blocks and use this to look for abnormality.
1983
1984
        // Equivalent to (but minus possible integer overflow)
1985
        //   (b->uncomp_size + 1000)/4 > metrics->input_avg_sz+1000 ||
1986
        //    b->uncomp_size + 1000    < (metrics->input_avg_sz+1000)/4)
1987
389k
        if (metrics->input_avg_sz &&
1988
389k
            (b->uncomp_size/4 - 750 > metrics->input_avg_sz ||
1989
113k
             b->uncomp_size         < metrics->input_avg_sz/4 - 750) &&
1990
389k
            ABS(b->uncomp_size-metrics->input_avg_sz)/10
1991
2.31k
                > metrics->input_avg_delta) {
1992
141
            metrics->next_trial = 0;
1993
141
        }
1994
1995
389k
        if (metrics->trial > 0 || --metrics->next_trial <= 0) {
1996
108k
            int m, unpackable = metrics->unpackable;
1997
108k
            size_t sz_best = b->uncomp_size;
1998
108k
            size_t sz[CRAM_MAX_METHOD] = {0};
1999
108k
            int method_best = 0; // RAW
2000
108k
            char *c_best = NULL, *c = NULL;
2001
2002
108k
            metrics->input_avg_delta =
2003
108k
                0.9 * (metrics->input_avg_delta +
2004
108k
                       ABS(b->uncomp_size - metrics->input_avg_sz));
2005
2006
108k
            metrics->input_avg_sz += b->uncomp_size*.2;
2007
108k
            metrics->input_avg_sz *= 0.8;
2008
2009
108k
            if (metrics->revised_method)
2010
51.6k
                method = metrics->revised_method;
2011
57.1k
            else
2012
57.1k
                metrics->revised_method = method;
2013
2014
108k
            if (metrics->next_trial <= 0) {
2015
2.42k
                metrics->next_trial = TRIAL_SPAN;
2016
2.42k
                metrics->trial = NTRIALS;
2017
80.0k
                for (m = 0; m < CRAM_MAX_METHOD; m++)
2018
77.6k
                    metrics->sz[m] /= 2;
2019
2.42k
                metrics->unpackable = 0;
2020
2.42k
            }
2021
2022
            // Compress this block using the best method
2023
108k
            if (unpackable && CRAM_MAJOR_VERS(fd->version) > 3) {
2024
                // No point trying bit-pack if 17+ symbols.
2025
0
                if (method & (1<<RANS_PR128))
2026
0
                    method = (method|(1<<RANS_PR0))&~(1<<RANS_PR128);
2027
0
                if (method & (1<<RANS_PR129))
2028
0
                    method = (method|(1<<RANS_PR1))&~(1<<RANS_PR129);
2029
0
                if (method & (1<<RANS_PR192))
2030
0
                    method = (method|(1<<RANS_PR64))&~(1<<RANS_PR192);
2031
0
                if (method & (1<<RANS_PR193))
2032
0
                    method = (method|(1<<RANS_PR64)|(1<<RANS_PR1))&~(1<<RANS_PR193);
2033
2034
0
                if (method & (1<<ARITH_PR128))
2035
0
                    method = (method|(1<<ARITH_PR0))&~(1<<ARITH_PR128);
2036
0
                if (method & (1<<ARITH_PR129))
2037
0
                    method = (method|(1<<ARITH_PR1))&~(1<<ARITH_PR129);
2038
0
                if (method & (1<<ARITH_PR192))
2039
0
                    method = (method|(1<<ARITH_PR64))&~(1<<ARITH_PR192);
2040
0
                if (method & (1u<<ARITH_PR193))
2041
0
                    method = (method|(1<<ARITH_PR64)|(1<<ARITH_PR1))&~(1u<<ARITH_PR193);
2042
0
            }
2043
2044
            // Libdeflate doesn't have a Z_RLE strategy.
2045
            // We treat it as level 1, but iff we haven't also
2046
            // explicitly listed that in the method list.
2047
#ifdef HAVE_LIBDEFLATE
2048
            if ((method & (1<<GZIP_RLE)) && (method & (1<<GZIP_1)))
2049
                method &= ~(1<<GZIP_RLE);
2050
#endif
2051
2052
108k
            pthread_mutex_unlock(&fd->metrics_lock);
2053
2054
3.58M
            for (m = 0; m < CRAM_MAX_METHOD; m++) {
2055
3.48M
                if (method & (1u<<m)) {
2056
783k
                    int lvl = level;
2057
783k
                    switch (m) {
2058
108k
                    case GZIP:     strat = Z_FILTERED; break;
2059
108k
                    case GZIP_1:   strat = Z_DEFAULT_STRATEGY; lvl = 1; break;
2060
76.2k
                    case GZIP_RLE: strat = Z_RLE; break;
2061
0
                    case FQZ:      strat = CRAM_MAJOR_VERS(fd->version); break;
2062
0
                    case FQZ_b:    strat = CRAM_MAJOR_VERS(fd->version)+256; break;
2063
0
                    case FQZ_c:    strat = CRAM_MAJOR_VERS(fd->version)+2*256; break;
2064
0
                    case FQZ_d:    strat = CRAM_MAJOR_VERS(fd->version)+3*256; break;
2065
32.2k
                    case TOK3:     strat = 0; break;
2066
0
                    case TOKA:     strat = 1; break;
2067
457k
                    default:       strat = 0;
2068
783k
                    }
2069
2070
783k
                    c = cram_compress_by_method(s, (char *)b->data, b->uncomp_size,
2071
783k
                                                b->content_id, &sz[m], m, lvl, strat);
2072
2073
783k
                    if (c && sz_best > sz[m]) {
2074
52.3k
                        sz_best = sz[m];
2075
52.3k
                        method_best = m;
2076
52.3k
                        if (c_best)
2077
26.9k
                            free(c_best);
2078
52.3k
                        c_best = c;
2079
731k
                    } else if (c) {
2080
727k
                        free(c);
2081
727k
                    } else {
2082
3.27k
                        sz[m] = UINT_MAX; // arbitrarily worse than raw
2083
3.27k
                    }
2084
2.69M
                } else {
2085
2.69M
                    sz[m] = UINT_MAX; // arbitrarily worse than raw
2086
2.69M
                }
2087
3.48M
            }
2088
2089
108k
            if (c_best) {
2090
25.3k
                free(b->data);
2091
25.3k
                b->data = (unsigned char *)c_best;
2092
25.3k
                b->method = method_best; // adjusted to methmap[method_best] later
2093
25.3k
                b->comp_size = sz_best;
2094
25.3k
            }
2095
2096
            // Accumulate stats for all methods tried
2097
108k
            pthread_mutex_lock(&fd->metrics_lock);
2098
3.58M
            for (m = 0; m < CRAM_MAX_METHOD; m++)
2099
                // don't be overly sure on small blocks.
2100
                // +2000 means eg bzip2 vs gzip (1.07 to 1.04) or gz vs rans1
2101
                // needs to be at least 60 bytes smaller to overcome the
2102
                // fixed size addition.
2103
3.48M
                metrics->sz[m] += sz[m]+2000;
2104
2105
            // When enough trials performed, find the best on average
2106
108k
            if (--metrics->trial == 0) {
2107
29.1k
                int best_method = RAW;
2108
29.1k
                int best_sz = INT_MAX;
2109
2110
                // Relative costs of methods. See enum_cram_block_method_int
2111
                // and methmap
2112
29.1k
                double meth_cost[32] = {
2113
                    // Externally defined methods
2114
29.1k
                    1,    // 0  raw
2115
29.1k
                    1.04, // 1  gzip (Z_FILTERED)
2116
29.1k
                    1.07, // 2  bzip2
2117
29.1k
                    1.08, // 3  lzma
2118
29.1k
                    1.00, // 4  rans    (O0)
2119
29.1k
                    1.00, // 5  ranspr  (O0)
2120
29.1k
                    1.04, // 6  arithpr (O0)
2121
29.1k
                    1.05, // 7  fqz
2122
29.1k
                    1.05, // 8  tok3 (rans)
2123
29.1k
                    1.00, 1.00, // 9,10 reserved
2124
2125
                    // Paramterised versions of above
2126
29.1k
                    1.01, // gzip rle
2127
29.1k
                    1.01, // gzip -1
2128
2129
29.1k
                    1.05, 1.05, 1.05, // FQZ_b,c,d
2130
2131
29.1k
                    1.01, // rans O1
2132
2133
29.1k
                    1.01, // rans_pr1
2134
29.1k
                    1.00, // rans_pr64; if smaller, usually fast
2135
29.1k
                    1.03, // rans_pr65/9
2136
29.1k
                    1.00, // rans_pr128
2137
29.1k
                    1.01, // rans_pr129
2138
29.1k
                    1.00, // rans_pr192
2139
29.1k
                    1.01, // rans_pr193
2140
2141
29.1k
                    1.07, // tok3 arith
2142
2143
29.1k
                    1.04, // arith_pr1
2144
29.1k
                    1.04, // arith_pr64
2145
29.1k
                    1.04, // arith_pr9
2146
29.1k
                    1.03, // arith_pr128
2147
29.1k
                    1.04, // arith_pr129
2148
29.1k
                    1.04, // arith_pr192
2149
29.1k
                    1.04, // arith_pr193
2150
29.1k
                };
2151
2152
                // Scale methods by cost based on compression level
2153
29.1k
                if (fd->level <= 1) {
2154
0
                    for (m = 0; m < CRAM_MAX_METHOD; m++)
2155
0
                        metrics->sz[m] *= 1+(meth_cost[m]-1)*4;
2156
29.1k
                } else if (fd->level <= 3) {
2157
0
                    for (m = 0; m < CRAM_MAX_METHOD; m++)
2158
0
                        metrics->sz[m] *= 1+(meth_cost[m]-1);
2159
29.1k
                } else if (fd->level <= 6) {
2160
961k
                    for (m = 0; m < CRAM_MAX_METHOD; m++)
2161
932k
                        metrics->sz[m] *= 1+(meth_cost[m]-1)/2;
2162
29.1k
                } else if (fd->level <= 7) {
2163
0
                    for (m = 0; m < CRAM_MAX_METHOD; m++)
2164
0
                        metrics->sz[m] *= 1+(meth_cost[m]-1)/3;
2165
0
                } // else cost is ignored
2166
2167
                // Ensure these are never used; BSC and ZSTD
2168
29.1k
                metrics->sz[9] = metrics->sz[10] = INT_MAX;
2169
2170
961k
                for (m = 0; m < CRAM_MAX_METHOD; m++) {
2171
932k
                    if ((!metrics->sz[m]) || (!(method & (1u<<m))))
2172
741k
                        continue;
2173
2174
191k
                    if (best_sz > metrics->sz[m])
2175
65.6k
                        best_sz = metrics->sz[m], best_method = m;
2176
191k
                }
2177
2178
29.1k
                if (best_method != metrics->method) {
2179
                    //metrics->trial = (NTRIALS+1)/2; // be sure
2180
                    //metrics->next_trial /= 1.5;
2181
13.7k
                    metrics->consistency = 0;
2182
15.4k
                } else {
2183
15.4k
                    metrics->next_trial *= MIN(2, 1+metrics->consistency/4.0);
2184
15.4k
                    metrics->consistency++;
2185
15.4k
                }
2186
2187
29.1k
                metrics->method = best_method;
2188
29.1k
                switch (best_method) {
2189
154
                case GZIP:     strat = Z_FILTERED; break;
2190
11.7k
                case GZIP_1:   strat = Z_DEFAULT_STRATEGY; break;
2191
3
                case GZIP_RLE: strat = Z_RLE; break;
2192
0
                case FQZ:      strat = CRAM_MAJOR_VERS(fd->version); break;
2193
0
                case FQZ_b:    strat = CRAM_MAJOR_VERS(fd->version)+256; break;
2194
0
                case FQZ_c:    strat = CRAM_MAJOR_VERS(fd->version)+2*256; break;
2195
0
                case FQZ_d:    strat = CRAM_MAJOR_VERS(fd->version)+3*256; break;
2196
315
                case TOK3:     strat = 0; break;
2197
0
                case TOKA:     strat = 1; break;
2198
16.9k
                default:       strat = 0;
2199
29.1k
                }
2200
29.1k
                metrics->strat  = strat;
2201
2202
                // If we see at least MAXFAIL trials in a row for a specific
2203
                // compression method with more than MAXDELTA aggregate
2204
                // size then we drop this from the list of methods used
2205
                // for this block type.
2206
284k
#define MAXDELTA 0.20
2207
655k
#define MAXFAILS 4
2208
961k
                for (m = 0; m < CRAM_MAX_METHOD; m++) {
2209
932k
                    if (best_method == m) {
2210
29.1k
                        metrics->cnt[m] = 0;
2211
29.1k
                        metrics->extra[m] = 0;
2212
903k
                    } else if (best_sz < metrics->sz[m]) {
2213
655k
                        double r = (double)metrics->sz[m] / best_sz - 1;
2214
655k
                        int mul = 1+(fd->level>=7);
2215
655k
                        if (++metrics->cnt[m] >= MAXFAILS*mul &&
2216
655k
                            (metrics->extra[m] += r) >= MAXDELTA*mul)
2217
148k
                            method &= ~(1u<<m);
2218
2219
                        // Special case for fqzcomp as it rarely changes
2220
655k
                        if (m == FQZ || m == FQZ_b || m == FQZ_c || m == FQZ_d) {
2221
110k
                            if (metrics->sz[m] > best_sz)
2222
110k
                                method &= ~(1u<<m);
2223
110k
                        }
2224
655k
                    }
2225
932k
                }
2226
2227
                //if (fd->verbose > 1 && method != metrics->revised_method)
2228
                //    fprintf(stderr, "%d: revising method from %x to %x\n",
2229
                //          b->content_id, metrics->revised_method, method);
2230
29.1k
                metrics->revised_method = method;
2231
29.1k
            }
2232
108k
            pthread_mutex_unlock(&fd->metrics_lock);
2233
280k
        } else {
2234
280k
            metrics->input_avg_delta =
2235
280k
                0.9 * (metrics->input_avg_delta +
2236
280k
                       ABS(b->uncomp_size - metrics->input_avg_sz));
2237
2238
280k
            metrics->input_avg_sz += b->uncomp_size*.2;
2239
280k
            metrics->input_avg_sz *= 0.8;
2240
2241
280k
            strat = metrics->strat;
2242
280k
            method = metrics->method;
2243
2244
280k
            pthread_mutex_unlock(&fd->metrics_lock);
2245
280k
            comp = cram_compress_by_method(s, (char *)b->data, b->uncomp_size,
2246
280k
                                           b->content_id, &comp_size, method,
2247
280k
                                           method == GZIP_1 ? 1 : level,
2248
280k
                                           strat);
2249
280k
            if (!comp) {
2250
                // Our cached best method failed, but maybe another works?
2251
                // Rerun with trial mode engaged again.
2252
204
                if (!recurse) {
2253
204
                    hts_log_warning("Compressed block ID %d method %s failed, "
2254
204
                                    "redoing trial", b->content_id,
2255
204
                                    cram_block_method2str(method));
2256
204
                    pthread_mutex_lock(&fd->metrics_lock);
2257
204
                    metrics->trial = NTRIALS;
2258
204
                    metrics->next_trial = TRIAL_SPAN;
2259
204
                    metrics->revised_method = orig_method;
2260
204
                    pthread_mutex_unlock(&fd->metrics_lock);
2261
204
                    return cram_compress_block3(fd, s, b, metrics, method,
2262
204
                                                level, 1);
2263
204
                }
2264
0
                return -1;
2265
204
            }
2266
2267
280k
            if (comp_size < b->uncomp_size) {
2268
39.3k
                free(b->data);
2269
39.3k
                b->data = (unsigned char *)comp;
2270
39.3k
                b->comp_size = comp_size;
2271
39.3k
                b->method = method;
2272
241k
            } else {
2273
241k
                free(comp);
2274
241k
            }
2275
280k
        }
2276
2277
389k
    } else {
2278
        // no cached metrics, so just do zlib?
2279
20.0k
        comp = cram_compress_by_method(s, (char *)b->data, b->uncomp_size,
2280
20.0k
                                       b->content_id, &comp_size, GZIP, level, Z_FILTERED);
2281
20.0k
        if (!comp) {
2282
0
            hts_log_error("Compression failed!");
2283
0
            return -1;
2284
0
        }
2285
2286
20.0k
        if (comp_size < b->uncomp_size) {
2287
1.97k
            free(b->data);
2288
1.97k
            b->data = (unsigned char *)comp;
2289
1.97k
            b->comp_size = comp_size;
2290
1.97k
            b->method = GZIP;
2291
18.0k
        } else {
2292
18.0k
            free(comp);
2293
18.0k
        }
2294
20.0k
        strat = Z_FILTERED;
2295
20.0k
    }
2296
2297
409k
    hts_log_info("Compressed block ID %d from %d to %d by method %s",
2298
409k
                 b->content_id, b->uncomp_size, b->comp_size,
2299
409k
                 cram_block_method2str(b->method));
2300
2301
409k
    b->method = methmap[b->method];
2302
2303
409k
    return 0;
2304
409k
}
2305
2306
/*
2307
 * Compresses a block using a selection of compression codecs and options.
2308
 * The best is learnt and used for subsequent slices, periodically resampling.
2309
 *
2310
 * Method and level -1 implies defaults, as specified in cram_fd.
2311
 */
2312
int cram_compress_block2(cram_fd *fd, cram_slice *s,
2313
                         cram_block *b, cram_metrics *metrics,
2314
953k
                         int method, int level) {
2315
953k
    return cram_compress_block3(fd, s, b, metrics, method, level, 0);
2316
953k
}
2317
2318
int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics,
2319
10.7k
                        int method, int level) {
2320
10.7k
    return cram_compress_block2(fd, NULL, b, metrics, method, level);
2321
10.7k
}
2322
2323
1.08M
cram_metrics *cram_new_metrics(void) {
2324
1.08M
    cram_metrics *m = calloc(1, sizeof(*m));
2325
1.08M
    if (!m)
2326
0
        return NULL;
2327
1.08M
    m->trial = NTRIALS-1;
2328
1.08M
    m->next_trial = TRIAL_SPAN/2; // learn quicker at start
2329
1.08M
    m->method = RAW;
2330
1.08M
    m->strat = 0;
2331
1.08M
    m->revised_method = 0;
2332
1.08M
    m->unpackable = 0;
2333
2334
1.08M
    return m;
2335
1.08M
}
2336
2337
409k
char *cram_block_method2str(enum cram_block_method_int m) {
2338
409k
    switch(m) {
2339
342k
    case RAW:         return "RAW";
2340
6.98k
    case GZIP:        return "GZIP";
2341
0
    case BZIP2:       return "BZIP2";
2342
0
    case LZMA:        return "LZMA";
2343
0
    case RANS0:       return "RANS0";
2344
0
    case RANS1:       return "RANS1";
2345
35
    case GZIP_RLE:    return "GZIP_RLE";
2346
8.19k
    case GZIP_1:      return "GZIP_1";
2347
0
    case FQZ:         return "FQZ";
2348
0
    case FQZ_b:       return "FQZ_b";
2349
0
    case FQZ_c:       return "FQZ_c";
2350
0
    case FQZ_d:       return "FQZ_d";
2351
1.62k
    case RANS_PR0:    return "RANS_PR0";
2352
451
    case RANS_PR1:    return "RANS_PR1";
2353
20.1k
    case RANS_PR64:   return "RANS_PR64";
2354
4
    case RANS_PR9:    return "RANS_PR9";
2355
28.8k
    case RANS_PR128:  return "RANS_PR128";
2356
0
    case RANS_PR129:  return "RANS_PR129";
2357
0
    case RANS_PR192:  return "RANS_PR192";
2358
449
    case RANS_PR193:  return "RANS_PR193";
2359
211
    case TOK3:        return "TOK3_R";
2360
0
    case TOKA:        return "TOK3_A";
2361
0
    case ARITH_PR0:   return "ARITH_PR0";
2362
0
    case ARITH_PR1:   return "ARITH_PR1";
2363
0
    case ARITH_PR64:  return "ARITH_PR64";
2364
0
    case ARITH_PR9:   return "ARITH_PR9";
2365
0
    case ARITH_PR128: return "ARITH_PR128";
2366
0
    case ARITH_PR129: return "ARITH_PR129";
2367
0
    case ARITH_PR192: return "ARITH_PR192";
2368
0
    case ARITH_PR193: return "ARITH_PR193";
2369
0
    case BM_ERROR: break;
2370
409k
    }
2371
0
    return "?";
2372
409k
}
2373
2374
16
char *cram_content_type2str(enum cram_content_type t) {
2375
16
    switch (t) {
2376
6
    case FILE_HEADER:         return "FILE_HEADER";
2377
2
    case COMPRESSION_HEADER:  return "COMPRESSION_HEADER";
2378
0
    case MAPPED_SLICE:        return "MAPPED_SLICE";
2379
0
    case UNMAPPED_SLICE:      return "UNMAPPED_SLICE";
2380
0
    case EXTERNAL:            return "EXTERNAL";
2381
3
    case CORE:                return "CORE";
2382
0
    case CT_ERROR:            break;
2383
16
    }
2384
5
    return "?";
2385
16
}
2386
2387
/* ----------------------------------------------------------------------
2388
 * Reference sequence handling
2389
 *
2390
 * These revolve around the refs_t structure, which may potentially be
2391
 * shared between multiple cram_fd.
2392
 *
2393
 * We start with refs_create() to allocate an empty refs_t and then
2394
 * populate it with @SQ line data using refs_from_header(). This is done on
2395
 * cram_open().  Also at start up we can call cram_load_reference() which
2396
 * is used with "scramble -r foo.fa". This replaces the fd->refs with the
2397
 * new one specified. In either case refs2id() is then called which
2398
 * maps ref_entry names to @SQ ids (refs_t->ref_id[]).
2399
 *
2400
 * Later, possibly within a thread, we will want to know the actual ref
2401
 * seq itself, obtained by calling cram_get_ref().  This may use the
2402
 * UR: or M5: fields or the filename specified in the original
2403
 * cram_load_reference() call.
2404
 *
2405
 * Given the potential for multi-threaded reference usage, we have
2406
 * reference counting (sorry for the confusing double use of "ref") to
2407
 * track the number of callers interested in any specific reference.
2408
 */
2409
2410
/*
2411
 * Frees/unmaps a reference sequence and associated file handles.
2412
 */
2413
10.3k
static void ref_entry_free_seq(ref_entry *e) {
2414
10.3k
    if (e->mf)
2415
0
        mfclose(e->mf);
2416
10.3k
    if (e->seq && !e->mf)
2417
0
        free(e->seq);
2418
2419
10.3k
    e->seq = NULL;
2420
10.3k
    e->mf = NULL;
2421
10.3k
}
2422
2423
22.8k
void refs_free(refs_t *r) {
2424
22.8k
    RP("refs_free()\n");
2425
2426
22.8k
    if (--r->count > 0)
2427
0
        return;
2428
2429
22.8k
    if (!r)
2430
0
        return;
2431
2432
22.8k
    if (r->pool)
2433
22.8k
        string_pool_destroy(r->pool);
2434
2435
22.8k
    if (r->h_meta) {
2436
22.8k
        khint_t k;
2437
2438
50.4k
        for (k = kh_begin(r->h_meta); k != kh_end(r->h_meta); k++) {
2439
27.5k
            ref_entry *e;
2440
2441
27.5k
            if (!kh_exist(r->h_meta, k))
2442
17.2k
                continue;
2443
10.3k
            if (!(e = kh_val(r->h_meta, k)))
2444
0
                continue;
2445
10.3k
            ref_entry_free_seq(e);
2446
10.3k
            free(e);
2447
10.3k
        }
2448
2449
22.8k
        kh_destroy(refs, r->h_meta);
2450
22.8k
    }
2451
2452
22.8k
    if (r->ref_id)
2453
11.6k
        free(r->ref_id);
2454
2455
22.8k
    if (r->fp)
2456
0
        bgzf_close(r->fp);
2457
2458
22.8k
    pthread_mutex_destroy(&r->lock);
2459
2460
22.8k
    free(r);
2461
22.8k
}
2462
2463
22.8k
static refs_t *refs_create(void) {
2464
22.8k
    refs_t *r = calloc(1, sizeof(*r));
2465
2466
22.8k
    RP("refs_create()\n");
2467
2468
22.8k
    if (!r)
2469
0
        return NULL;
2470
2471
22.8k
    if (!(r->pool = string_pool_create(8192)))
2472
0
        goto err;
2473
2474
22.8k
    r->ref_id = NULL; // see refs2id() to populate.
2475
22.8k
    r->count = 1;
2476
22.8k
    r->last = NULL;
2477
22.8k
    r->last_id = -1;
2478
2479
22.8k
    if (!(r->h_meta = kh_init(refs)))
2480
0
        goto err;
2481
2482
22.8k
    pthread_mutex_init(&r->lock, NULL);
2483
2484
22.8k
    return r;
2485
2486
0
 err:
2487
0
    refs_free(r);
2488
0
    return NULL;
2489
22.8k
}
2490
2491
/*
2492
 * Opens a reference fasta file as a BGZF stream, allowing for
2493
 * compressed files.  It automatically builds a .fai file if
2494
 * required and if compressed a .gzi bgzf index too.
2495
 *
2496
 * Returns a BGZF handle on success;
2497
 *         NULL on failure.
2498
 */
2499
2.14k
static BGZF *bgzf_open_ref(char *fn, char *mode, int is_md5) {
2500
2.14k
    BGZF *fp;
2501
2502
2.14k
    if (strncmp(fn, "file://", 7) == 0)
2503
1
        fn += 7;
2504
2505
2.14k
    if (!is_md5 && !hisremote(fn)) {
2506
1.24k
        char fai_file[PATH_MAX];
2507
2508
1.24k
        snprintf(fai_file, PATH_MAX, "%s.fai", fn);
2509
1.24k
        if (access(fai_file, R_OK) != 0)
2510
1.24k
            if (fai_build(fn) != 0)
2511
1.24k
                return NULL;
2512
1.24k
    }
2513
2514
898
    if (!(fp = bgzf_open(fn, mode))) {
2515
898
        perror(fn);
2516
898
        return NULL;
2517
898
    }
2518
2519
0
    if (fp->is_compressed == 1 && bgzf_index_load(fp, fn, ".gzi") < 0) {
2520
0
        hts_log_error("Unable to load .gzi index '%s.gzi'", fn);
2521
0
        bgzf_close(fp);
2522
0
        return NULL;
2523
0
    }
2524
2525
0
    return fp;
2526
0
}
2527
2528
/*
2529
 * Loads a FAI file for a reference.fasta.
2530
 * "is_err" indicates whether failure to load is worthy of emitting an
2531
 * error message. In some cases (eg with embedded references) we
2532
 * speculatively load, just in case, and silently ignore errors.
2533
 *
2534
 * Returns the refs_t struct on success (maybe newly allocated);
2535
 *         NULL on failure
2536
 */
2537
2.14k
static refs_t *refs_load_fai(refs_t *r_orig, const char *fn, int is_err) {
2538
2.14k
    hFILE *fp = NULL;
2539
2.14k
    char fai_fn[PATH_MAX];
2540
2.14k
    char line[8192];
2541
2.14k
    refs_t *r = r_orig;
2542
2.14k
    size_t fn_l = strlen(fn);
2543
2.14k
    int id = 0, id_alloc = 0;
2544
2545
2.14k
    RP("refs_load_fai %s\n", fn);
2546
2547
2.14k
    if (!r)
2548
0
        if (!(r = refs_create()))
2549
0
            goto err;
2550
2551
2.14k
    if (r->fp)
2552
0
        if (bgzf_close(r->fp) != 0)
2553
0
            goto err;
2554
2.14k
    r->fp = NULL;
2555
2556
    /* Look for a FASTA##idx##FAI format */
2557
2.14k
    char *fn_delim = strstr(fn, HTS_IDX_DELIM);
2558
2.14k
    if (fn_delim) {
2559
34
        if (!(r->fn = string_ndup(r->pool, fn, fn_delim - fn)))
2560
0
            goto err;
2561
34
        fn_delim += strlen(HTS_IDX_DELIM);
2562
34
        snprintf(fai_fn, PATH_MAX, "%s", fn_delim);
2563
2.11k
    } else {
2564
        /* An index file was provided, instead of the actual reference file */
2565
2.11k
        if (fn_l > 4 && strcmp(&fn[fn_l-4], ".fai") == 0) {
2566
48
            if (!r->fn) {
2567
44
                if (!(r->fn = string_ndup(r->pool, fn, fn_l-4)))
2568
0
                    goto err;
2569
44
            }
2570
48
            snprintf(fai_fn, PATH_MAX, "%s", fn);
2571
2.06k
        } else {
2572
        /* Only the reference file provided. Get the index file name from it */
2573
2.06k
            if (!(r->fn = string_dup(r->pool, fn)))
2574
0
                goto err;
2575
2.06k
            snprintf(fai_fn, PATH_MAX, "%.*s.fai", PATH_MAX-5, fn);
2576
2.06k
        }
2577
2.11k
    }
2578
2579
2.14k
    if (!(r->fp = bgzf_open_ref(r->fn, "r", 0))) {
2580
2.14k
        hts_log_error("Failed to open reference file '%s'", r->fn);
2581
2.14k
        goto err;
2582
2.14k
    }
2583
2584
0
    if (!(fp = hopen(fai_fn, "r"))) {
2585
0
        hts_log_error("Failed to open index file '%s'", fai_fn);
2586
0
        if (is_err)
2587
0
            perror(fai_fn);
2588
0
        goto err;
2589
0
    }
2590
0
    while (hgets(line, 8192, fp) != NULL) {
2591
0
        ref_entry *e = malloc(sizeof(*e));
2592
0
        char *cp;
2593
0
        int n;
2594
0
        khint_t k;
2595
2596
0
        if (!e)
2597
0
            return NULL;
2598
2599
        // id
2600
0
        for (cp = line; *cp && !isspace_c(*cp); cp++)
2601
0
            ;
2602
0
        *cp++ = 0;
2603
0
        e->name = string_dup(r->pool, line);
2604
2605
        // length
2606
0
        while (*cp && isspace_c(*cp))
2607
0
            cp++;
2608
0
        e->length = strtoll(cp, &cp, 10);
2609
2610
        // offset
2611
0
        while (*cp && isspace_c(*cp))
2612
0
            cp++;
2613
0
        e->offset = strtoll(cp, &cp, 10);
2614
2615
        // bases per line
2616
0
        while (*cp && isspace_c(*cp))
2617
0
            cp++;
2618
0
        e->bases_per_line = strtol(cp, &cp, 10);
2619
2620
        // line length
2621
0
        while (*cp && isspace_c(*cp))
2622
0
            cp++;
2623
0
        e->line_length = strtol(cp, &cp, 10);
2624
2625
        // filename
2626
0
        e->fn = r->fn;
2627
2628
0
        e->count = 0;
2629
0
        e->seq = NULL;
2630
0
        e->mf = NULL;
2631
0
        e->is_md5 = 0;
2632
0
        e->validated_md5 = 0;
2633
2634
0
        k = kh_put(refs, r->h_meta, e->name, &n);
2635
0
        if (-1 == n)  {
2636
0
            free(e);
2637
0
            return NULL;
2638
0
        }
2639
2640
0
        if (n) {
2641
0
            kh_val(r->h_meta, k) = e;
2642
0
        } else {
2643
0
            ref_entry *re = kh_val(r->h_meta, k);
2644
0
            if (re && (re->count != 0 || re->length != 0)) {
2645
                /* Keep old */
2646
0
                free(e);
2647
0
            } else {
2648
                /* Replace old */
2649
0
                if (re)
2650
0
                    free(re);
2651
0
                kh_val(r->h_meta, k) = e;
2652
0
            }
2653
0
        }
2654
2655
0
        if (id >= id_alloc) {
2656
0
            ref_entry **new_refs;
2657
0
            int x;
2658
2659
0
            id_alloc = id_alloc ?id_alloc*2 : 16;
2660
0
            new_refs = realloc(r->ref_id, id_alloc * sizeof(*r->ref_id));
2661
0
            if (!new_refs)
2662
0
                goto err;
2663
0
            r->ref_id = new_refs;
2664
2665
0
            for (x = id; x < id_alloc; x++)
2666
0
                r->ref_id[x] = NULL;
2667
0
        }
2668
0
        r->ref_id[id] = e;
2669
0
        r->nref = ++id;
2670
0
    }
2671
2672
0
    if(hclose(fp) < 0)
2673
0
        goto err;
2674
0
    return r;
2675
2676
2.14k
 err:
2677
2.14k
    if (fp)
2678
0
        hclose_abruptly(fp);
2679
2680
2.14k
    if (!r_orig)
2681
0
        refs_free(r);
2682
2683
2.14k
    return NULL;
2684
0
}
2685
2686
/*
2687
 * Verifies that the CRAM @SQ lines and .fai files match.
2688
 */
2689
0
static void sanitise_SQ_lines(cram_fd *fd) {
2690
0
    int i;
2691
2692
0
    if (!fd->header || !fd->header->hrecs)
2693
0
        return;
2694
2695
0
    if (!fd->refs || !fd->refs->h_meta)
2696
0
        return;
2697
2698
0
    for (i = 0; i < fd->header->hrecs->nref; i++) {
2699
0
        const char *name = fd->header->hrecs->ref[i].name;
2700
0
        khint_t k = kh_get(refs, fd->refs->h_meta, name);
2701
0
        ref_entry *r;
2702
2703
        // We may have @SQ lines which have no known .fai, but do not
2704
        // in themselves pose a problem because they are unused in the file.
2705
0
        if (k == kh_end(fd->refs->h_meta))
2706
0
            continue;
2707
2708
0
        if (!(r = (ref_entry *)kh_val(fd->refs->h_meta, k)))
2709
0
            continue;
2710
2711
0
        if (r->length && r->length != fd->header->hrecs->ref[i].len) {
2712
0
            assert(strcmp(r->name, fd->header->hrecs->ref[i].name) == 0);
2713
2714
            // Should we also check MD5sums here to ensure the correct
2715
            // reference was given?
2716
0
            hts_log_warning("Header @SQ length mismatch for ref %s, %"PRIhts_pos" vs %d",
2717
0
                            r->name, fd->header->hrecs->ref[i].len, (int)r->length);
2718
2719
            // Fixing the parsed @SQ header will make MD:Z: strings work
2720
            // and also stop it producing N for the sequence.
2721
0
            fd->header->hrecs->ref[i].len = r->length;
2722
0
        }
2723
0
    }
2724
0
}
2725
2726
/*
2727
 * Indexes references by the order they appear in a BAM file. This may not
2728
 * necessarily be the same order they appear in the fasta reference file.
2729
 *
2730
 * Returns 0 on success
2731
 *        -1 on failure
2732
 */
2733
10.7k
int refs2id(refs_t *r, sam_hdr_t *hdr) {
2734
10.7k
    int i;
2735
10.7k
    sam_hrecs_t *h = hdr->hrecs;
2736
2737
10.7k
    if (r->ref_id)
2738
4.01k
        free(r->ref_id);
2739
10.7k
    if (r->last)
2740
0
        r->last = NULL;
2741
2742
10.7k
    r->ref_id = calloc(h->nref, sizeof(*r->ref_id));
2743
10.7k
    if (!r->ref_id)
2744
0
        return -1;
2745
2746
10.7k
    r->nref = h->nref;
2747
18.8k
    for (i = 0; i < h->nref; i++) {
2748
8.16k
        khint_t k = kh_get(refs, r->h_meta, h->ref[i].name);
2749
8.16k
        if (k != kh_end(r->h_meta)) {
2750
8.16k
            r->ref_id[i] = kh_val(r->h_meta, k);
2751
8.16k
        } else {
2752
0
            hts_log_warning("Unable to find ref name '%s'", h->ref[i].name);
2753
0
        }
2754
8.16k
    }
2755
2756
10.7k
    return 0;
2757
10.7k
}
2758
2759
/*
2760
 * Generates refs_t entries based on @SQ lines in the header.
2761
 * Returns 0 on success
2762
 *         -1 on failure
2763
 */
2764
45.7k
static int refs_from_header(cram_fd *fd) {
2765
45.7k
    if (!fd)
2766
0
        return -1;
2767
2768
45.7k
    refs_t *r = fd->refs;
2769
45.7k
    if (!r)
2770
0
        return -1;
2771
2772
45.7k
    sam_hdr_t *h = fd->header;
2773
45.7k
    if (!h)
2774
12.5k
        return 0;
2775
2776
33.1k
    if (!h->hrecs) {
2777
21.0k
        if (-1 == sam_hdr_fill_hrecs(h))
2778
1.43k
            return -1;
2779
21.0k
    }
2780
2781
31.7k
    if (h->hrecs->nref == 0)
2782
22.8k
        return 0;
2783
2784
    //fprintf(stderr, "refs_from_header for %p mode %c\n", fd, fd->mode);
2785
2786
    /* Existing refs are fine, as long as they're compatible with the hdr. */
2787
8.92k
    ref_entry **new_ref_id = realloc(r->ref_id, (r->nref + h->hrecs->nref) * sizeof(*r->ref_id));
2788
8.92k
    if (!new_ref_id)
2789
0
        return -1;
2790
8.92k
    r->ref_id = new_ref_id;
2791
2792
8.92k
    int i, j;
2793
    /* Copy info from h->ref[i] over to r */
2794
27.4k
    for (i = 0, j = r->nref; i < h->hrecs->nref; i++) {
2795
18.5k
        sam_hrec_type_t *ty;
2796
18.5k
        sam_hrec_tag_t *tag;
2797
18.5k
        khint_t k;
2798
18.5k
        int n;
2799
2800
18.5k
        k = kh_get(refs, r->h_meta, h->hrecs->ref[i].name);
2801
18.5k
        if (k != kh_end(r->h_meta))
2802
            // Ref already known about
2803
8.16k
            continue;
2804
2805
10.3k
        if (!(r->ref_id[j] = calloc(1, sizeof(ref_entry))))
2806
0
            return -1;
2807
2808
10.3k
        if (!h->hrecs->ref[i].name)
2809
0
            return -1;
2810
2811
10.3k
        r->ref_id[j]->name = string_dup(r->pool, h->hrecs->ref[i].name);
2812
10.3k
        if (!r->ref_id[j]->name) return -1;
2813
10.3k
        r->ref_id[j]->length = 0; // marker for not yet loaded
2814
2815
        /* Initialise likely filename if known */
2816
10.3k
        if ((ty = sam_hrecs_find_type_id(h->hrecs, "SQ", "SN", h->hrecs->ref[i].name))) {
2817
10.3k
            if ((tag = sam_hrecs_find_key(ty, "M5", NULL)))
2818
64
                r->ref_id[j]->fn = string_dup(r->pool, tag->str+3);
2819
2820
10.3k
            if ((tag = sam_hrecs_find_key(ty, "LN", NULL))) {
2821
                // LN tag used when constructing consensus reference
2822
10.3k
                r->ref_id[j]->LN_length = strtoll(tag->str+3, NULL, 0);
2823
                // See fuzz 382922241
2824
10.3k
                if (r->ref_id[j]->LN_length < 0)
2825
817
                    r->ref_id[j]->LN_length = 0;
2826
10.3k
            }
2827
10.3k
        }
2828
2829
10.3k
        k = kh_put(refs, r->h_meta, r->ref_id[j]->name, &n);
2830
10.3k
        if (n <= 0) // already exists or error
2831
0
            return -1;
2832
10.3k
        kh_val(r->h_meta, k) = r->ref_id[j];
2833
2834
10.3k
        j++;
2835
10.3k
    }
2836
8.92k
    r->nref = j;
2837
2838
8.92k
    return 0;
2839
8.92k
}
2840
2841
/*
2842
 * Attaches a header to a cram_fd.
2843
 *
2844
 * This should be used when creating a new cram_fd for writing where
2845
 * we have a header already constructed (eg from a file we've read
2846
 * in).
2847
 */
2848
12.1k
int cram_set_header2(cram_fd *fd, const sam_hdr_t *hdr) {
2849
12.1k
    if (!fd || !hdr )
2850
0
        return -1;
2851
2852
12.1k
    if (fd->header != hdr) {
2853
12.1k
        if (fd->header)
2854
0
            sam_hdr_destroy(fd->header);
2855
12.1k
        fd->header = sam_hdr_dup(hdr);
2856
12.1k
        if (!fd->header)
2857
0
            return -1;
2858
12.1k
    }
2859
12.1k
    return refs_from_header(fd);
2860
12.1k
}
2861
2862
0
int cram_set_header(cram_fd *fd, sam_hdr_t *hdr) {
2863
0
    return cram_set_header2(fd, hdr);
2864
0
}
2865
2866
/*
2867
 * Returns whether the path refers to a directory.
2868
 */
2869
0
static int is_directory(char *fn) {
2870
0
    struct stat buf;
2871
0
    if ( stat(fn,&buf) ) return 0;
2872
0
    return S_ISDIR(buf.st_mode);
2873
0
}
2874
2875
/*
2876
 * Converts a directory and a filename into an expanded path, replacing %s
2877
 * in directory with the filename and %[0-9]+s with portions of the filename
2878
 * Any remaining parts of filename are added to the end with /%s.
2879
 */
2880
0
static int expand_cache_path(char *path, char *dir, const char *fn) {
2881
0
    char *cp, *start = path;
2882
0
    size_t len;
2883
0
    size_t sz = PATH_MAX;
2884
2885
0
    while ((cp = strchr(dir, '%'))) {
2886
0
        if (cp-dir >= sz) return -1;
2887
0
        strncpy(path, dir, cp-dir);
2888
0
        path += cp-dir;
2889
0
        sz -= cp-dir;
2890
2891
0
        if (*++cp == 's') {
2892
0
            len = strlen(fn);
2893
0
            if (len >= sz) return -1;
2894
0
            strcpy(path, fn);
2895
0
            path += len;
2896
0
            sz -= len;
2897
0
            fn += len;
2898
0
            cp++;
2899
0
        } else if (*cp >= '0' && *cp <= '9') {
2900
0
            char *endp;
2901
0
            long l;
2902
2903
0
            l = strtol(cp, &endp, 10);
2904
0
            l = MIN(l, strlen(fn));
2905
0
            if (*endp == 's') {
2906
0
                if (l >= sz) return -1;
2907
0
                strncpy(path, fn, l);
2908
0
                path += l;
2909
0
                fn += l;
2910
0
                sz -= l;
2911
0
                *path = 0;
2912
0
                cp = endp+1;
2913
0
            } else {
2914
0
                if (sz < 3) return -1;
2915
0
                *path++ = '%';
2916
0
                *path++ = *cp++;
2917
0
            }
2918
0
        } else {
2919
0
            if (sz < 3) return -1;
2920
0
            *path++ = '%';
2921
0
            *path++ = *cp++;
2922
0
        }
2923
0
        dir = cp;
2924
0
    }
2925
2926
0
    len = strlen(dir);
2927
0
    if (len >= sz) return -1;
2928
0
    strcpy(path, dir);
2929
0
    path += len;
2930
0
    sz -= len;
2931
2932
0
    len = strlen(fn) + ((*fn && path > start && path[-1] != '/') ? 1 : 0);
2933
0
    if (len >= sz) return -1;
2934
0
    if (*fn && path > start && path[-1] != '/')
2935
0
        *path++ = '/';
2936
0
    strcpy(path, fn);
2937
0
    return 0;
2938
0
}
2939
2940
/*
2941
 * Make the directory containing path and any prefix directories.
2942
 */
2943
0
static void mkdir_prefix(char *path, int mode) {
2944
0
    char *cp = strrchr(path, '/');
2945
0
    if (!cp)
2946
0
        return;
2947
2948
0
    *cp = 0;
2949
0
    if (is_directory(path)) {
2950
0
        *cp = '/';
2951
0
        return;
2952
0
    }
2953
2954
0
    if (mkdir(path, mode) == 0) {
2955
0
        chmod(path, mode);
2956
0
        *cp = '/';
2957
0
        return;
2958
0
    }
2959
2960
0
    mkdir_prefix(path, mode);
2961
0
    mkdir(path, mode);
2962
0
    chmod(path, mode);
2963
0
    *cp = '/';
2964
0
}
2965
2966
/*
2967
 * Queries the M5 string from the header and attempts to populate the
2968
 * reference from this using the REF_PATH environment.
2969
 *
2970
 * Returns 0 on success
2971
 *        -1 on failure
2972
 */
2973
4.43k
static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
2974
4.43k
    char *ref_path = getenv("REF_PATH");
2975
4.43k
    sam_hrec_type_t *ty;
2976
4.43k
    sam_hrec_tag_t *tag;
2977
4.43k
    char path[PATH_MAX];
2978
4.43k
    kstring_t path_tmp = KS_INITIALIZE;
2979
4.43k
    char *local_cache = getenv("REF_CACHE");
2980
4.43k
    mFILE *mf;
2981
4.43k
    int local_path = 0;
2982
2983
4.43k
    hts_log_info("Running cram_populate_ref on fd %p, id %d", (void *)fd, id);
2984
2985
4.43k
    if (!r->name)
2986
0
        return -1;
2987
2988
4.43k
    if (!(ty = sam_hrecs_find_type_id(fd->header->hrecs, "SQ", "SN", r->name)))
2989
0
        return -1;
2990
2991
4.43k
    if (!(tag = sam_hrecs_find_key(ty, "M5", NULL)))
2992
4.40k
        goto no_M5;
2993
2994
31
    hts_log_info("Querying ref %s", tag->str+3);
2995
2996
    /* Use cache if available */
2997
31
    if (local_cache && *local_cache) {
2998
0
        struct stat sb;
2999
0
        if (expand_cache_path(path, local_cache, tag->str+3) == 0 &&
3000
0
            stat(path, &sb) == 0)
3001
            // Found it in the local cache
3002
0
            local_path = 1;
3003
0
    }
3004
3005
#ifndef HAVE_MMAP
3006
    char *path2;
3007
    /* Search local files in REF_PATH; we can open them and return as above */
3008
    if (!local_path && (path2 = find_path(tag->str+3, ref_path))) {
3009
        int len = snprintf(path, PATH_MAX, "%s", path2);
3010
        free(path2);
3011
        if (len > 0 && len < PATH_MAX) // in case it's too long
3012
            local_path = 1;
3013
    }
3014
#endif
3015
3016
    /* Found via REF_CACHE or local REF_PATH file */
3017
31
    if (local_path) {
3018
0
        struct stat sb;
3019
0
        BGZF *fp;
3020
3021
0
        if (0 == stat(path, &sb)
3022
0
            && S_ISREG(sb.st_mode)
3023
0
            && (fp = bgzf_open(path, "r"))) {
3024
0
            r->length = sb.st_size;
3025
0
            r->offset = r->line_length = r->bases_per_line = 0;
3026
3027
0
            r->fn = string_dup(fd->refs->pool, path);
3028
3029
0
            if (fd->refs->fp)
3030
0
                if (bgzf_close(fd->refs->fp) != 0)
3031
0
                    return -1;
3032
0
            fd->refs->fp = fp;
3033
0
            fd->refs->fn = r->fn;
3034
0
            r->is_md5 = 1;
3035
0
            r->validated_md5 = 1;
3036
3037
            // Fall back to cram_get_ref() where it'll do the actual
3038
            // reading of the file.
3039
0
            return 0;
3040
0
        }
3041
0
    }
3042
3043
3044
    /* Otherwise search full REF_PATH; slower as loads entire file */
3045
31
    int is_local = 0;
3046
31
    if ((mf = open_path_mfile(tag->str+3, ref_path, NULL, &is_local))) {
3047
0
        size_t sz;
3048
0
        r->seq = mfsteal(mf, &sz);
3049
0
        if (r->seq) {
3050
0
            r->mf = NULL;
3051
0
        } else {
3052
            // keep mf around as we couldn't detach
3053
0
            r->seq = mf->data;
3054
0
            r->mf = mf;
3055
0
        }
3056
0
        r->length = sz;
3057
0
        r->is_md5 = 1;
3058
0
        r->validated_md5 = 1;
3059
31
    } else {
3060
31
        refs_t *refs;
3061
31
        const char *fn;
3062
31
        sam_hrec_tag_t *UR_tag;
3063
3064
4.43k
    no_M5:
3065
        /* Failed to find in search path or M5 cache, see if @SQ UR: tag? */
3066
4.43k
        if (!(UR_tag = sam_hrecs_find_key(ty, "UR", NULL)))
3067
2.26k
            return -1;
3068
3069
2.16k
        if (strstr(UR_tag->str+3, "://") &&
3070
2.16k
            strncmp(UR_tag->str+3, "file:", 5) != 0) {
3071
            // Documented as omitted, but accidentally supported until now
3072
24
            hts_log_error("UR tags pointing to remote files are not supported");
3073
24
            return -1;
3074
24
        }
3075
3076
2.14k
        fn = (strncmp(UR_tag->str+3, "file:", 5) == 0)
3077
2.14k
            ? UR_tag->str+8
3078
2.14k
            : UR_tag->str+3;
3079
3080
2.14k
        if (fd->refs->fp) {
3081
0
            if (bgzf_close(fd->refs->fp) != 0)
3082
0
                return -1;
3083
0
            fd->refs->fp = NULL;
3084
0
        }
3085
2.14k
        if (!(refs = refs_load_fai(fd->refs, fn, 0)))
3086
2.14k
            return -1;
3087
0
        sanitise_SQ_lines(fd);
3088
3089
0
        fd->refs = refs;
3090
0
        if (fd->refs->fp) {
3091
0
            if (bgzf_close(fd->refs->fp) != 0)
3092
0
                return -1;
3093
0
            fd->refs->fp = NULL;
3094
0
        }
3095
3096
0
        if (!fd->refs->fn)
3097
0
            return -1;
3098
3099
0
        if (-1 == refs2id(fd->refs, fd->header))
3100
0
            return -1;
3101
0
        if (!fd->refs->ref_id || !fd->refs->ref_id[id])
3102
0
            return -1;
3103
3104
        // Local copy already, so fall back to cram_get_ref().
3105
0
        return 0;
3106
0
    }
3107
3108
    /* Populate the local disk cache if required */
3109
0
    if (!is_local && local_cache && *local_cache) {
3110
0
        hFILE *fp;
3111
3112
0
        if (expand_cache_path(path, local_cache, tag->str+3) < 0) {
3113
0
            return 0; // Not fatal - we have the data already so keep going.
3114
0
        }
3115
0
        hts_log_info("Writing cache file '%s'", path);
3116
0
        mkdir_prefix(path, 01777);
3117
3118
0
        fp = hts_open_tmpfile(path, "wx", &path_tmp);
3119
0
        if (!fp) {
3120
0
            perror(path_tmp.s);
3121
0
            free(path_tmp.s);
3122
3123
            // Not fatal - we have the data already so keep going.
3124
0
            return 0;
3125
0
        }
3126
3127
        // Check md5sum
3128
0
        hts_md5_context *md5;
3129
0
        char unsigned md5_buf1[16];
3130
0
        char md5_buf2[33];
3131
3132
0
        if (!(md5 = hts_md5_init())) {
3133
0
            hclose_abruptly(fp);
3134
0
            unlink(path_tmp.s);
3135
0
            free(path_tmp.s);
3136
0
            return -1;
3137
0
        }
3138
0
        hts_md5_update(md5, r->seq, r->length);
3139
0
        hts_md5_final(md5_buf1, md5);
3140
0
        hts_md5_destroy(md5);
3141
0
        hts_md5_hex(md5_buf2, md5_buf1);
3142
3143
0
        if (strncmp(tag->str+3, md5_buf2, 32) != 0) {
3144
0
            hts_log_error("Mismatching md5sum for downloaded reference");
3145
0
            hclose_abruptly(fp);
3146
0
            unlink(path_tmp.s);
3147
0
            free(path_tmp.s);
3148
0
            return -1;
3149
0
        }
3150
3151
0
        ssize_t length_written = hwrite(fp, r->seq, r->length);
3152
0
        if (hclose(fp) < 0 || length_written != r->length ||
3153
0
            chmod(path_tmp.s, 0444) < 0 ||
3154
0
            rename(path_tmp.s, path) < 0) {
3155
0
            hts_log_error("Creating reference at %s failed: %s",
3156
0
                          path, strerror(errno));
3157
0
            unlink(path_tmp.s);
3158
0
        }
3159
0
    }
3160
3161
0
    free(path_tmp.s);
3162
0
    return 0;
3163
0
}
3164
3165
1.98k
static void cram_ref_incr_locked(refs_t *r, int id) {
3166
1.98k
    RP("%d INC REF %d, %d %p\n", gettid(), id,
3167
1.98k
       (int)(id>=0 && r->ref_id[id]?r->ref_id[id]->count+1:-999),
3168
1.98k
       id>=0 && r->ref_id[id]?r->ref_id[id]->seq:(char *)1);
3169
3170
1.98k
    if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq)
3171
1.98k
        return;
3172
3173
0
    if (r->last_id == id)
3174
0
        r->last_id = -1;
3175
3176
0
    ++r->ref_id[id]->count;
3177
0
}
3178
3179
1.98k
void cram_ref_incr(refs_t *r, int id) {
3180
1.98k
    pthread_mutex_lock(&r->lock);
3181
1.98k
    cram_ref_incr_locked(r, id);
3182
1.98k
    pthread_mutex_unlock(&r->lock);
3183
1.98k
}
3184
3185
144
static void cram_ref_decr_locked(refs_t *r, int id) {
3186
144
    RP("%d DEC REF %d, %d %p\n", gettid(), id,
3187
144
       (int)(id>=0 && r->ref_id[id]?r->ref_id[id]->count-1:-999),
3188
144
       id>=0 && r->ref_id[id]?r->ref_id[id]->seq:(char *)1);
3189
3190
144
    if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq) {
3191
144
        return;
3192
144
    }
3193
3194
0
    if (--r->ref_id[id]->count <= 0) {
3195
0
        assert(r->ref_id[id]->count == 0);
3196
0
        if (r->last_id >= 0) {
3197
0
            if (r->ref_id[r->last_id]->count <= 0 &&
3198
0
                r->ref_id[r->last_id]->seq) {
3199
0
                RP("%d FREE REF %d (%p)\n", gettid(),
3200
0
                   r->last_id, r->ref_id[r->last_id]->seq);
3201
0
                ref_entry_free_seq(r->ref_id[r->last_id]);
3202
0
                if (r->ref_id[r->last_id]->is_md5) r->ref_id[r->last_id]->length = 0;
3203
0
            }
3204
0
        }
3205
0
        r->last_id = id;
3206
0
    }
3207
0
}
3208
3209
144
void cram_ref_decr(refs_t *r, int id) {
3210
144
    pthread_mutex_lock(&r->lock);
3211
144
    cram_ref_decr_locked(r, id);
3212
144
    pthread_mutex_unlock(&r->lock);
3213
144
}
3214
3215
/*
3216
 * Used by cram_ref_load and cram_get_ref. The file handle will have
3217
 * already been opened, so we can catch it. The ref_entry *e informs us
3218
 * of whether this is a multi-line fasta file or a raw MD5 style file.
3219
 * Either way we create a single contiguous sequence.
3220
 *
3221
 * Returns all or part of a reference sequence on success (malloced);
3222
 *         NULL on failure.
3223
 */
3224
static char *load_ref_portion(BGZF *fp, ref_entry *e,
3225
0
                              hts_pos_t start, hts_pos_t end) {
3226
0
    off_t offset, len;
3227
0
    char *seq;
3228
3229
0
    if (end < start)
3230
0
        end = start;
3231
3232
    /*
3233
     * Compute locations in file. This is trivial for the MD5 files, but
3234
     * is still necessary for the fasta variants.
3235
     *
3236
     * Note the offset here, as with faidx, has the assumption that white-
3237
     * space (the diff between line_length and bases_per_line) only occurs
3238
     * at the end of a line of text.
3239
     */
3240
0
    offset = e->line_length
3241
0
        ? e->offset + (start-1)/e->bases_per_line * e->line_length +
3242
0
          (start-1) % e->bases_per_line
3243
0
        : start-1;
3244
3245
0
    len = (e->line_length
3246
0
           ? e->offset + (end-1)/e->bases_per_line * e->line_length +
3247
0
             (end-1) % e->bases_per_line
3248
0
           : end-1) - offset + 1;
3249
3250
0
    if (bgzf_useek(fp, offset, SEEK_SET) < 0) {
3251
0
        perror("bgzf_useek() on reference file");
3252
0
        return NULL;
3253
0
    }
3254
3255
0
    if (len == 0 || !(seq = malloc(len))) {
3256
0
        return NULL;
3257
0
    }
3258
3259
0
    if (len != bgzf_read(fp, seq, len)) {
3260
0
        perror("bgzf_read() on reference file");
3261
0
        free(seq);
3262
0
        return NULL;
3263
0
    }
3264
3265
    /* Strip white-space if required. */
3266
0
    if (len != end-start+1) {
3267
0
        hts_pos_t i, j;
3268
0
        char *cp = seq;
3269
0
        char *cp_to;
3270
3271
        // Copy up to the first white-space, and then repeatedly just copy
3272
        // bases_per_line verbatim, and use the slow method to end again.
3273
        //
3274
        // This may seem excessive, but this code can be a significant
3275
        // portion of total CRAM decode CPU time for shallow data sets.
3276
0
        for (i = j = 0; i < len; i++) {
3277
0
            if (!isspace_c(cp[i]))
3278
0
                cp[j++] = cp[i] & ~0x20;
3279
0
            else
3280
0
                break;
3281
0
        }
3282
0
        while (i < len && isspace_c(cp[i]))
3283
0
            i++;
3284
0
        while (i < len - e->line_length) {
3285
0
            hts_pos_t j_end = j + e->bases_per_line;
3286
0
            while (j < j_end)
3287
0
                cp[j++] = cp[i++] & ~0x20; // toupper equiv
3288
0
            i += e->line_length - e->bases_per_line;
3289
0
        }
3290
0
        for (; i < len; i++) {
3291
0
            if (!isspace_c(cp[i]))
3292
0
                cp[j++] = cp[i] & ~0x20;
3293
0
        }
3294
3295
0
        cp_to = cp+j;
3296
3297
0
        if (cp_to - seq != end-start+1) {
3298
0
            hts_log_error("Malformed reference file");
3299
0
            free(seq);
3300
0
            return NULL;
3301
0
        }
3302
0
    } else {
3303
0
        int i;
3304
0
        for (i = 0; i < len; i++) {
3305
0
            seq[i] = toupper_c(seq[i]);
3306
0
        }
3307
0
    }
3308
3309
0
    return seq;
3310
0
}
3311
3312
/*
3313
 * Load the entire reference 'id'.
3314
 * This also increments the reference count by 1.
3315
 *
3316
 * Returns ref_entry on success;
3317
 *         NULL on failure
3318
 */
3319
0
ref_entry *cram_ref_load(refs_t *r, int id, int is_md5) {
3320
0
    ref_entry *e = r->ref_id[id];
3321
0
    hts_pos_t start = 1, end = e->length;
3322
0
    char *seq;
3323
3324
0
    if (e->seq) {
3325
0
        return e;
3326
0
    }
3327
3328
0
    assert(e->count == 0);
3329
3330
0
    if (r->last) {
3331
#ifdef REF_DEBUG
3332
        int idx = 0;
3333
        for (idx = 0; idx < r->nref; idx++)
3334
            if (r->last == r->ref_id[idx])
3335
                break;
3336
        RP("%d cram_ref_load DECR %d\n", gettid(), idx);
3337
#endif
3338
0
        assert(r->last->count > 0);
3339
0
        if (--r->last->count <= 0) {
3340
0
            RP("%d FREE REF %d (%p)\n", gettid(), id, r->ref_id[id]->seq);
3341
0
            if (r->last->seq)
3342
0
                ref_entry_free_seq(r->last);
3343
0
        }
3344
0
    }
3345
3346
0
    if (!r->fn)
3347
0
        return NULL;
3348
3349
    /* Open file if it's not already the current open reference */
3350
0
    if (strcmp(r->fn, e->fn) || r->fp == NULL) {
3351
0
        if (r->fp)
3352
0
            if (bgzf_close(r->fp) != 0)
3353
0
                return NULL;
3354
0
        r->fn = e->fn;
3355
0
        if (!(r->fp = bgzf_open_ref(r->fn, "r", is_md5)))
3356
0
            return NULL;
3357
0
    }
3358
3359
0
    RP("%d Loading ref %d (%d..%d)\n", gettid(), id, start, end);
3360
3361
0
    if (!(seq = load_ref_portion(r->fp, e, start, end))) {
3362
0
        return NULL;
3363
0
    }
3364
3365
0
    RP("%d Loaded ref %d (%d..%d) = %p\n", gettid(), id, start, end, seq);
3366
3367
0
    RP("%d INC REF %d, %"PRId64"\n", gettid(), id, (e->count+1));
3368
0
    e->seq = seq;
3369
0
    e->mf = NULL;
3370
0
    e->count++;
3371
3372
    /*
3373
     * Also keep track of last used ref so incr/decr loops on the same
3374
     * sequence don't cause load/free loops.
3375
     */
3376
0
    RP("%d cram_ref_load INCR %d => %"PRId64"\n", gettid(), id, e->count+1);
3377
0
    r->last = e;
3378
0
    e->count++;
3379
3380
0
    return e;
3381
0
}
3382
3383
/*
3384
 * Returns a portion of a reference sequence from start to end inclusive.
3385
 * The returned pointer is owned by either the cram_file fd or by the
3386
 * internal refs_t structure and should not be freed  by the caller.
3387
 *
3388
 * The difference is whether or not this refs_t is in use by just the one
3389
 * cram_fd or by multiples, or whether we have multiple threads accessing
3390
 * references. In either case fd->shared will be true and we start using
3391
 * reference counting to track the number of users of a specific reference
3392
 * sequence.
3393
 *
3394
 * Otherwise the ref seq returned is allocated as part of cram_fd itself
3395
 * and will be freed up on the next call to cram_get_ref or cram_close.
3396
 *
3397
 * To return the entire reference sequence, specify start as 1 and end
3398
 * as 0.
3399
 *
3400
 * To cease using a reference, call cram_ref_decr().
3401
 *
3402
 * Returns reference on success,
3403
 *         NULL on failure
3404
 */
3405
8.52k
char *cram_get_ref(cram_fd *fd, int id, hts_pos_t start, hts_pos_t end) {
3406
8.52k
    ref_entry *r;
3407
8.52k
    char *seq;
3408
8.52k
    int ostart = start;
3409
3410
8.52k
    if (id == -1 || start < 1)
3411
4.08k
        return NULL;
3412
3413
    /* FIXME: axiomatic query of r->seq being true?
3414
     * Or shortcut for unsorted data where we load once and never free?
3415
     */
3416
3417
    //fd->shared_ref = 1; // hard code for now to simplify things
3418
3419
4.44k
    pthread_mutex_lock(&fd->ref_lock);
3420
3421
4.44k
    RP("%d cram_get_ref on fd %p, id %d, range %d..%d\n", gettid(), fd, id, start, end);
3422
3423
    /*
3424
     * Unsorted data implies we want to fetch an entire reference at a time.
3425
     * We just deal with this at the moment by claiming we're sharing
3426
     * references instead, which has the same requirement.
3427
     */
3428
4.44k
    if (fd->unsorted)
3429
0
        fd->shared_ref = 1;
3430
3431
3432
    /* Sanity checking: does this ID exist? */
3433
4.44k
    if (id >= fd->refs->nref) {
3434
12
        hts_log_error("No reference found for id %d", id);
3435
12
        pthread_mutex_unlock(&fd->ref_lock);
3436
12
        return NULL;
3437
12
    }
3438
3439
4.43k
    if (!fd->refs || !fd->refs->ref_id[id]) {
3440
0
        hts_log_error("No reference found for id %d", id);
3441
0
        pthread_mutex_unlock(&fd->ref_lock);
3442
0
        return NULL;
3443
0
    }
3444
3445
4.43k
    if (!(r = fd->refs->ref_id[id])) {
3446
0
        hts_log_error("No reference found for id %d", id);
3447
0
        pthread_mutex_unlock(&fd->ref_lock);
3448
0
        return NULL;
3449
0
    }
3450
3451
3452
    /*
3453
     * It has an entry, but may not have been populated yet.
3454
     * Any manually loaded .fai files have their lengths known.
3455
     * A ref entry computed from @SQ lines (M5 or UR field) will have
3456
     * r->length == 0 unless it's been loaded once and verified that we have
3457
     * an on-disk filename for it.
3458
     *
3459
     * 19 Sep 2013: Moved the lock here as the cram_populate_ref code calls
3460
     * open_path_mfile and libcurl, which isn't multi-thread safe unless I
3461
     * rewrite my code to have one curl handle per thread.
3462
     */
3463
4.43k
    pthread_mutex_lock(&fd->refs->lock);
3464
4.43k
    if (r->length == 0) {
3465
4.43k
        if (fd->ref_fn)
3466
0
            hts_log_warning("Reference file given, but ref '%s' not present",
3467
4.43k
                            r->name);
3468
4.43k
        if (cram_populate_ref(fd, id, r) == -1) {
3469
4.43k
            hts_log_warning("Failed to populate reference \"%s\"",
3470
4.43k
                            r->name);
3471
4.43k
            hts_log_warning("See https://www.htslib.org/doc/reference_seqs.html for further suggestions");
3472
4.43k
            pthread_mutex_unlock(&fd->refs->lock);
3473
4.43k
            pthread_mutex_unlock(&fd->ref_lock);
3474
4.43k
            return NULL;
3475
4.43k
        }
3476
0
        r = fd->refs->ref_id[id];
3477
0
        if (fd->unsorted)
3478
0
            cram_ref_incr_locked(fd->refs, id);
3479
0
    }
3480
3481
3482
    /*
3483
     * We now know that we the filename containing the reference, so check
3484
     * for limits. If it's over half the reference we'll load all of it in
3485
     * memory as this will speed up subsequent calls.
3486
     */
3487
0
    if (end < 1)
3488
0
        end = r->length;
3489
0
    if (end >= r->length)
3490
0
        end  = r->length;
3491
3492
0
    if (end - start >= 0.5*r->length || fd->shared_ref) {
3493
0
        start = 1;
3494
0
        end = r->length;
3495
0
    }
3496
3497
    /*
3498
     * Maybe we have it cached already? If so use it.
3499
     *
3500
     * Alternatively if we don't have the sequence but we're sharing
3501
     * references and/or are asking for the entire length of it, then
3502
     * load the full reference into the refs structure and return
3503
     * a pointer to that one instead.
3504
     */
3505
0
    if (fd->shared_ref || r->seq || (start == 1 && end == r->length)) {
3506
0
        char *cp;
3507
3508
0
        if (id >= 0) {
3509
0
            if (r->seq) {
3510
0
                cram_ref_incr_locked(fd->refs, id);
3511
0
            } else {
3512
0
                ref_entry *e;
3513
0
                if (!(e = cram_ref_load(fd->refs, id, r->is_md5))) {
3514
0
                    pthread_mutex_unlock(&fd->refs->lock);
3515
0
                    pthread_mutex_unlock(&fd->ref_lock);
3516
0
                    return NULL;
3517
0
                }
3518
3519
                /* unsorted data implies cache ref indefinitely, to avoid
3520
                 * continually loading and unloading.
3521
                 */
3522
0
                if (fd->unsorted)
3523
0
                    cram_ref_incr_locked(fd->refs, id);
3524
0
            }
3525
3526
0
            fd->ref = NULL; /* We never access it directly */
3527
0
            fd->ref_start = 1;
3528
0
            fd->ref_end   = r->length;
3529
0
            fd->ref_id    = id;
3530
3531
0
            cp = fd->refs->ref_id[id]->seq + ostart-1;
3532
0
        } else {
3533
0
            fd->ref = NULL;
3534
0
            cp = NULL;
3535
0
        }
3536
3537
0
        RP("%d cram_get_ref returning for id %d, count %d\n", gettid(), id, (int)r->count);
3538
3539
0
        pthread_mutex_unlock(&fd->refs->lock);
3540
0
        pthread_mutex_unlock(&fd->ref_lock);
3541
0
        return cp;
3542
0
    }
3543
3544
    /*
3545
     * Otherwise we're not sharing, we don't have a copy of it already and
3546
     * we're only asking for a small portion of it.
3547
     *
3548
     * In this case load up just that segment ourselves, freeing any old
3549
     * small segments in the process.
3550
     */
3551
3552
    /* Unmapped ref ID */
3553
0
    if (id < 0 || !fd->refs->fn) {
3554
0
        if (fd->ref_free) {
3555
0
            free(fd->ref_free);
3556
0
            fd->ref_free = NULL;
3557
0
        }
3558
0
        fd->ref = NULL;
3559
0
        fd->ref_id = id;
3560
0
        pthread_mutex_unlock(&fd->refs->lock);
3561
0
        pthread_mutex_unlock(&fd->ref_lock);
3562
0
        return NULL;
3563
0
    }
3564
3565
    /* Open file if it's not already the current open reference */
3566
0
    if (strcmp(fd->refs->fn, r->fn) || fd->refs->fp == NULL) {
3567
0
        if (fd->refs->fp)
3568
0
            if (bgzf_close(fd->refs->fp) != 0)
3569
0
                return NULL;
3570
0
        fd->refs->fn = r->fn;
3571
0
        if (!(fd->refs->fp = bgzf_open_ref(fd->refs->fn, "r", r->is_md5))) {
3572
0
            pthread_mutex_unlock(&fd->refs->lock);
3573
0
            pthread_mutex_unlock(&fd->ref_lock);
3574
0
            return NULL;
3575
0
        }
3576
0
    }
3577
3578
0
    if (!(fd->ref = load_ref_portion(fd->refs->fp, r, start, end))) {
3579
0
        pthread_mutex_unlock(&fd->refs->lock);
3580
0
        pthread_mutex_unlock(&fd->ref_lock);
3581
0
        return NULL;
3582
0
    }
3583
3584
0
    if (fd->ref_free)
3585
0
        free(fd->ref_free);
3586
3587
0
    fd->ref_id    = id;
3588
0
    fd->ref_start = start;
3589
0
    fd->ref_end   = end;
3590
0
    fd->ref_free = fd->ref;
3591
0
    seq = fd->ref;
3592
3593
0
    pthread_mutex_unlock(&fd->refs->lock);
3594
0
    pthread_mutex_unlock(&fd->ref_lock);
3595
3596
0
    return seq ? seq + ostart - start : NULL;
3597
0
}
3598
3599
/*
3600
 * If fd has been opened for reading, it may be permitted to specify 'fn'
3601
 * as NULL and let the code auto-detect the reference by parsing the
3602
 * SAM header @SQ lines.
3603
 */
3604
0
int cram_load_reference(cram_fd *fd, char *fn) {
3605
0
    int ret = 0;
3606
3607
0
    if (fn) {
3608
0
        fd->refs = refs_load_fai(fd->refs, fn,
3609
0
                                 !(fd->embed_ref>0 && fd->mode == 'r'));
3610
0
        fn = fd->refs ? fd->refs->fn : NULL;
3611
0
        if (!fn)
3612
0
            ret = -1;
3613
0
        sanitise_SQ_lines(fd);
3614
0
    }
3615
0
    fd->ref_fn = fn;
3616
3617
0
    if ((!fd->refs || (fd->refs->nref == 0 && !fn)) && fd->header) {
3618
0
        if (fd->refs)
3619
0
            refs_free(fd->refs);
3620
0
        if (!(fd->refs = refs_create()))
3621
0
            return -1;
3622
0
        if (-1 == refs_from_header(fd))
3623
0
            return -1;
3624
0
    }
3625
3626
0
    if (fd->header)
3627
0
        if (-1 == refs2id(fd->refs, fd->header))
3628
0
            return -1;
3629
3630
0
    return ret;
3631
0
}
3632
3633
/* ----------------------------------------------------------------------
3634
 * Containers
3635
 */
3636
3637
/*
3638
 * Creates a new container, specifying the maximum number of slices
3639
 * and records permitted.
3640
 *
3641
 * Returns cram_container ptr on success
3642
 *         NULL on failure
3643
 */
3644
74.2k
cram_container *cram_new_container(int nrec, int nslice) {
3645
74.2k
    cram_container *c = calloc(1, sizeof(*c));
3646
74.2k
    enum cram_DS_ID id;
3647
3648
74.2k
    if (!c)
3649
0
        return NULL;
3650
3651
74.2k
    c->curr_ref = -2;
3652
3653
74.2k
    c->max_c_rec = nrec * nslice;
3654
74.2k
    c->curr_c_rec = 0;
3655
3656
74.2k
    c->max_rec = nrec;
3657
74.2k
    c->record_counter = 0;
3658
74.2k
    c->num_bases = 0;
3659
74.2k
    c->s_num_bases = 0;
3660
3661
74.2k
    c->max_slice = nslice;
3662
74.2k
    c->curr_slice = 0;
3663
3664
74.2k
    c->pos_sorted = 1;
3665
74.2k
    c->max_apos   = 0;
3666
74.2k
    c->multi_seq  = 0;
3667
74.2k
    c->qs_seq_orient = 1;
3668
74.2k
    c->no_ref = 0;
3669
74.2k
    c->embed_ref = -1; // automatic selection
3670
3671
74.2k
    c->bams = NULL;
3672
3673
74.2k
    if (!(c->slices = calloc(nslice != 0 ? nslice : 1, sizeof(cram_slice *))))
3674
0
        goto err;
3675
74.2k
    c->slice = NULL;
3676
3677
74.2k
    if (!(c->comp_hdr = cram_new_compression_header()))
3678
0
        goto err;
3679
74.2k
    c->comp_hdr_block = NULL;
3680
3681
2.15M
    for (id = DS_RN; id < DS_TN; id++)
3682
2.07M
        if (!(c->stats[id] = cram_stats_create())) goto err;
3683
3684
    //c->aux_B_stats = cram_stats_create();
3685
3686
74.2k
    if (!(c->tags_used = kh_init(m_tagmap)))
3687
0
        goto err;
3688
74.2k
    c->refs_used = 0;
3689
74.2k
    c->ref_free = 0;
3690
3691
74.2k
    return c;
3692
3693
0
 err:
3694
0
    if (c) {
3695
0
        if (c->slices)
3696
0
            free(c->slices);
3697
0
        free(c);
3698
0
    }
3699
0
    return NULL;
3700
74.2k
}
3701
3702
5.74k
static void free_bam_list(bam_seq_t **bams, int max_rec) {
3703
5.74k
    int i;
3704
57.4M
    for (i = 0; i < max_rec; i++)
3705
57.4M
        bam_free(bams[i]);
3706
3707
5.74k
    free(bams);
3708
5.74k
}
3709
3710
102k
void cram_free_container(cram_container *c) {
3711
102k
    enum cram_DS_ID id;
3712
102k
    int i;
3713
3714
102k
    if (!c)
3715
0
        return;
3716
3717
102k
    if (c->refs_used)
3718
975
        free(c->refs_used);
3719
3720
102k
    if (c->landmark)
3721
75.7k
        free(c->landmark);
3722
3723
102k
    if (c->comp_hdr)
3724
74.7k
        cram_free_compression_header(c->comp_hdr);
3725
3726
102k
    if (c->comp_hdr_block)
3727
69.7k
        cram_free_block(c->comp_hdr_block);
3728
3729
    // Free the slices; filled out by encoder only
3730
102k
    if (c->slices) {
3731
137k
        for (i = 0; i < c->max_slice; i++) {
3732
63.4k
            if (c->slices[i])
3733
5.74k
                cram_free_slice(c->slices[i]);
3734
63.4k
            if (c->slices[i] == c->slice)
3735
63.4k
                c->slice = NULL;
3736
63.4k
        }
3737
74.2k
        free(c->slices);
3738
74.2k
    }
3739
3740
    // Free the current slice; set by both encoder & decoder
3741
102k
    if (c->slice) {
3742
0
        cram_free_slice(c->slice);
3743
0
        c->slice = NULL;
3744
0
    }
3745
3746
2.97M
    for (id = DS_RN; id < DS_TN; id++)
3747
2.87M
        if (c->stats[id]) cram_stats_free(c->stats[id]);
3748
3749
    //if (c->aux_B_stats) cram_stats_free(c->aux_B_stats);
3750
3751
102k
    if (c->tags_used) {
3752
74.2k
        khint_t k;
3753
3754
488k
        for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
3755
414k
            if (!kh_exist(c->tags_used, k))
3756
224k
                continue;
3757
3758
189k
            cram_tag_map *tm = (cram_tag_map *)kh_val(c->tags_used, k);
3759
189k
            if (tm) {
3760
189k
                cram_codec *c = tm->codec;
3761
3762
189k
                if (c) c->free(c);
3763
3764
                // If tm->blk or tm->blk2 is set, then we haven't yet got to
3765
                // cram_encode_container which copies the blocks to s->aux_block
3766
                // and NULLifies tm->blk*.  In this case we failed to complete
3767
                // the container construction, so we have to free up our partially
3768
                // converted CRAM.
3769
189k
                cram_free_block(tm->blk);
3770
189k
                cram_free_block(tm->blk2);
3771
189k
                free(tm);
3772
189k
            }
3773
189k
        }
3774
3775
74.2k
        kh_destroy(m_tagmap, c->tags_used);
3776
74.2k
    }
3777
3778
102k
    if (c->ref_free)
3779
30.1k
        free(c->ref);
3780
3781
102k
    if (c->bams)
3782
421
        free_bam_list(c->bams, c->max_c_rec);
3783
3784
102k
    free(c);
3785
102k
}
3786
3787
/*
3788
 * Reads a container header.
3789
 *
3790
 * Returns cram_container on success
3791
 *         NULL on failure or no container left (fd->err == 0).
3792
 */
3793
29.0k
cram_container *cram_read_container(cram_fd *fd) {
3794
29.0k
    cram_container c2, *c;
3795
29.0k
    int i, s;
3796
29.0k
    size_t rd = 0;
3797
29.0k
    uint32_t crc = 0;
3798
3799
29.0k
    fd->err = 0;
3800
29.0k
    fd->eof = 0;
3801
3802
29.0k
    memset(&c2, 0, sizeof(c2));
3803
29.0k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
3804
23.1k
        if ((s = fd->vv.varint_decode32_crc(fd, &c2.length, &crc)) == -1) {
3805
62
            fd->eof = fd->empty_container ? 1 : 2;
3806
62
            return NULL;
3807
23.1k
        } else {
3808
23.1k
            rd+=s;
3809
23.1k
        }
3810
23.1k
    } else if (CRAM_MAJOR_VERS(fd->version) < 4) {
3811
2.59k
        uint32_t len;
3812
2.59k
        if ((s = int32_decode(fd, &c2.length)) == -1) {
3813
32
            if (CRAM_MAJOR_VERS(fd->version) == 2 &&
3814
32
                CRAM_MINOR_VERS(fd->version) == 0)
3815
16
                fd->eof = 1; // EOF blocks arrived in v2.1
3816
16
            else
3817
16
                fd->eof = fd->empty_container ? 1 : 2;
3818
32
            return NULL;
3819
2.56k
        } else {
3820
2.56k
            rd+=s;
3821
2.56k
        }
3822
2.56k
        len = le_int4(c2.length);
3823
2.56k
        crc = crc32(0L, (unsigned char *)&len, 4);
3824
3.32k
    } else {
3825
3.32k
        if ((s = fd->vv.varint_decode32_crc(fd, &c2.length, &crc))   == -1) {
3826
19
            fd->eof = fd->empty_container ? 1 : 2;
3827
19
            return NULL;
3828
3.30k
        } else {
3829
3.30k
            rd+=s;
3830
3.30k
        }
3831
3.32k
    }
3832
28.9k
    if ((s = fd->vv.varint_decode32s_crc(fd, &c2.ref_seq_id, &crc))   == -1) return NULL; else rd+=s;
3833
28.8k
    if (CRAM_MAJOR_VERS(fd->version) >= 4) {
3834
3.29k
        int64_t i64;
3835
3.29k
        if ((s = fd->vv.varint_decode64_crc(fd, &i64, &crc))== -1) return NULL; else rd+=s;
3836
3.28k
        c2.ref_seq_start = i64;
3837
3.28k
        if ((s = fd->vv.varint_decode64_crc(fd, &i64, &crc)) == -1) return NULL; else rd+=s;
3838
3.27k
        c2.ref_seq_span = i64;
3839
25.6k
    } else {
3840
25.6k
        int32_t i32;
3841
25.6k
        if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc))== -1) return NULL; else rd+=s;
3842
25.5k
        c2.ref_seq_start = i32;
3843
25.5k
        if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc)) == -1) return NULL; else rd+=s;
3844
25.5k
        c2.ref_seq_span = i32;
3845
25.5k
    }
3846
28.7k
    if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_records, &crc))  == -1) return NULL; else rd+=s;
3847
3848
28.7k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
3849
22.9k
        c2.record_counter = 0;
3850
22.9k
        c2.num_bases = 0;
3851
22.9k
    } else {
3852
5.78k
        if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3853
3.83k
            if ((s = fd->vv.varint_decode64_crc(fd, &c2.record_counter, &crc)) == -1)
3854
19
                return NULL;
3855
3.81k
            else
3856
3.81k
                rd += s;
3857
3.83k
        } else {
3858
1.95k
            int32_t i32;
3859
1.95k
            if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc)) == -1)
3860
12
                return NULL;
3861
1.94k
            else
3862
1.94k
                rd += s;
3863
1.94k
            c2.record_counter = i32;
3864
1.94k
        }
3865
3866
5.75k
        if ((s = fd->vv.varint_decode64_crc(fd, &c2.num_bases, &crc))== -1)
3867
49
            return NULL;
3868
5.70k
        else
3869
5.70k
            rd += s;
3870
5.75k
    }
3871
28.6k
    if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_blocks, &crc))   == -1)
3872
52
        return NULL;
3873
28.6k
    else
3874
28.6k
        rd+=s;
3875
28.6k
    if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_landmarks, &crc))== -1)
3876
22
        return NULL;
3877
28.5k
    else
3878
28.5k
        rd+=s;
3879
3880
28.5k
    if (c2.num_landmarks < 0 || c2.num_landmarks >= SIZE_MAX / sizeof(int32_t))
3881
81
        return NULL;
3882
3883
28.5k
    if (!(c = calloc(1, sizeof(*c))))
3884
0
        return NULL;
3885
3886
28.5k
    *c = c2;
3887
28.5k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
3888
28.5k
    if (c->num_landmarks > FUZZ_ALLOC_LIMIT/sizeof(int32_t)) {
3889
11
        fd->err = errno = ENOMEM;
3890
11
        cram_free_container(c);
3891
11
        return NULL;
3892
11
    }
3893
28.4k
#endif
3894
28.4k
    if (c->num_landmarks && !(c->landmark = malloc(c->num_landmarks * sizeof(int32_t)))) {
3895
0
        fd->err = errno;
3896
0
        cram_free_container(c);
3897
0
        return NULL;
3898
0
    }
3899
78.2k
    for (i = 0; i < c->num_landmarks; i++) {
3900
49.9k
        if ((s = fd->vv.varint_decode32_crc(fd, &c->landmark[i], &crc)) == -1) {
3901
207
            cram_free_container(c);
3902
207
            return NULL;
3903
49.7k
        } else {
3904
49.7k
            rd += s;
3905
49.7k
        }
3906
49.9k
    }
3907
3908
28.2k
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3909
3.67k
        if (-1 == int32_decode(fd, (int32_t *)&c->crc32)) {
3910
11
            cram_free_container(c);
3911
11
            return NULL;
3912
3.66k
        } else {
3913
3.66k
            rd+=4;
3914
3.66k
        }
3915
3916
3.66k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
3917
        // Pretend the CRC was OK so the fuzzer doesn't have to get it right
3918
3.66k
        crc = c->crc32;
3919
3.66k
#endif
3920
3921
3.66k
        if (crc != c->crc32) {
3922
0
            hts_log_error("Container header CRC32 failure");
3923
0
            cram_free_container(c);
3924
0
            return NULL;
3925
0
        }
3926
3.66k
    }
3927
3928
28.2k
    c->offset = rd;
3929
28.2k
    c->slices = NULL;
3930
28.2k
    c->slice = NULL;
3931
28.2k
    c->curr_slice = 0;
3932
28.2k
    c->max_slice = c->num_landmarks;
3933
28.2k
    c->slice_rec = 0;
3934
28.2k
    c->curr_rec = 0;
3935
28.2k
    c->max_rec = 0;
3936
3937
28.2k
    if (c->ref_seq_id == -2) {
3938
35
        c->multi_seq = 1;
3939
35
        fd->multi_seq = 1;
3940
35
    }
3941
3942
28.2k
    fd->empty_container =
3943
28.2k
        (c->num_records == 0 &&
3944
28.2k
         c->ref_seq_id == -1 &&
3945
28.2k
         c->ref_seq_start == 0x454f46 /* EOF */) ? 1 : 0;
3946
3947
28.2k
    return c;
3948
28.2k
}
3949
3950
3951
/* MAXIMUM storage size needed for the container. */
3952
0
int cram_container_size(cram_container *c) {
3953
0
    return 55 + 5*c->num_landmarks;
3954
0
}
3955
3956
3957
/*
3958
 * Stores the container structure in dat and returns *size as the
3959
 * number of bytes written to dat[].  The input size of dat is also
3960
 * held in *size and should be initialised to cram_container_size(c).
3961
 *
3962
 * Returns 0 on success;
3963
 *        -1 on failure
3964
 */
3965
int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size)
3966
0
{
3967
0
    char *cp = (char *)dat;
3968
0
    int i;
3969
3970
    // Check the input buffer is large enough according to our stated
3971
    // requirements. (NOTE: it may actually take less.)
3972
0
    if (cram_container_size(c) > *size)
3973
0
        return -1;
3974
3975
0
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
3976
0
        cp += itf8_put(cp, c->length);
3977
0
    } else {
3978
0
        *(int32_t *)cp = le_int4(c->length);
3979
0
        cp += 4;
3980
0
    }
3981
0
    if (c->multi_seq) {
3982
0
        cp += fd->vv.varint_put32(cp, NULL, -2);
3983
0
        cp += fd->vv.varint_put32(cp, NULL, 0);
3984
0
        cp += fd->vv.varint_put32(cp, NULL, 0);
3985
0
    } else {
3986
0
        cp += fd->vv.varint_put32s(cp, NULL, c->ref_seq_id);
3987
0
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
3988
0
            cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_start);
3989
0
            cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_span);
3990
0
        } else {
3991
0
            cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_start);
3992
0
            cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_span);
3993
0
        }
3994
0
    }
3995
0
    cp += fd->vv.varint_put32(cp, NULL, c->num_records);
3996
0
    if (CRAM_MAJOR_VERS(fd->version) == 2) {
3997
0
        cp += fd->vv.varint_put64(cp, NULL, c->record_counter);
3998
0
    } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3999
0
        cp += fd->vv.varint_put32(cp, NULL, c->record_counter);
4000
0
    }
4001
0
    cp += fd->vv.varint_put64(cp, NULL, c->num_bases);
4002
0
    cp += fd->vv.varint_put32(cp, NULL, c->num_blocks);
4003
0
    cp += fd->vv.varint_put32(cp, NULL, c->num_landmarks);
4004
0
    for (i = 0; i < c->num_landmarks; i++)
4005
0
        cp += fd->vv.varint_put32(cp, NULL, c->landmark[i]);
4006
4007
0
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
4008
0
        c->crc32 = crc32(0L, (uc *)dat, cp-dat);
4009
0
        cp[0] =  c->crc32        & 0xff;
4010
0
        cp[1] = (c->crc32 >>  8) & 0xff;
4011
0
        cp[2] = (c->crc32 >> 16) & 0xff;
4012
0
        cp[3] = (c->crc32 >> 24) & 0xff;
4013
0
        cp += 4;
4014
0
    }
4015
4016
0
    *size = cp-dat; // actual used size
4017
4018
0
    return 0;
4019
0
}
4020
4021
4022
/*
4023
 * Writes a container structure.
4024
 *
4025
 * Returns 0 on success
4026
 *        -1 on failure
4027
 */
4028
85.5k
int cram_write_container(cram_fd *fd, cram_container *c) {
4029
85.5k
    char buf_a[1024], *buf = buf_a, *cp;
4030
85.5k
    int i;
4031
4032
85.5k
    if (61 + c->num_landmarks * 10 >= 1024) {
4033
0
        buf = malloc(61 + c->num_landmarks * 10);
4034
0
        if (!buf)
4035
0
            return -1;
4036
0
    }
4037
85.5k
    cp = buf;
4038
4039
85.5k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
4040
0
        cp += itf8_put(cp, c->length);
4041
85.5k
    } else if (CRAM_MAJOR_VERS(fd->version) <= 3) {
4042
85.5k
        *(int32_t *)cp = le_int4(c->length);
4043
85.5k
        cp += 4;
4044
85.5k
    } else {
4045
0
        cp += fd->vv.varint_put32(cp, NULL, c->length);
4046
0
    }
4047
85.5k
    if (c->multi_seq) {
4048
971
        cp += fd->vv.varint_put32(cp, NULL, (uint32_t)-2);
4049
971
        cp += fd->vv.varint_put32(cp, NULL, 0);
4050
971
        cp += fd->vv.varint_put32(cp, NULL, 0);
4051
84.5k
    } else {
4052
84.5k
        cp += fd->vv.varint_put32s(cp, NULL, c->ref_seq_id);
4053
84.5k
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
4054
0
            cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_start);
4055
0
            cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_span);
4056
84.5k
        } else {
4057
84.5k
            cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_start);
4058
84.5k
            cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_span);
4059
84.5k
        }
4060
84.5k
    }
4061
85.5k
    cp += fd->vv.varint_put32(cp, NULL, c->num_records);
4062
85.5k
    if (CRAM_MAJOR_VERS(fd->version) >= 3)
4063
85.5k
        cp += fd->vv.varint_put64(cp, NULL, c->record_counter);
4064
0
    else
4065
0
        cp += fd->vv.varint_put32(cp, NULL, c->record_counter);
4066
85.5k
    cp += fd->vv.varint_put64(cp, NULL, c->num_bases);
4067
85.5k
    cp += fd->vv.varint_put32(cp, NULL, c->num_blocks);
4068
85.5k
    cp += fd->vv.varint_put32(cp, NULL, c->num_landmarks);
4069
169k
    for (i = 0; i < c->num_landmarks; i++)
4070
84.2k
        cp += fd->vv.varint_put32(cp, NULL, c->landmark[i]);
4071
4072
85.5k
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
4073
85.5k
        c->crc32 = crc32(0L, (uc *)buf, cp-buf);
4074
85.5k
        cp[0] =  c->crc32        & 0xff;
4075
85.5k
        cp[1] = (c->crc32 >>  8) & 0xff;
4076
85.5k
        cp[2] = (c->crc32 >> 16) & 0xff;
4077
85.5k
        cp[3] = (c->crc32 >> 24) & 0xff;
4078
85.5k
        cp += 4;
4079
85.5k
    }
4080
4081
85.5k
    if (cp-buf != hwrite(fd->fp, buf, cp-buf)) {
4082
0
        if (buf != buf_a)
4083
0
            free(buf);
4084
0
        return -1;
4085
0
    }
4086
4087
85.5k
    if (buf != buf_a)
4088
0
        free(buf);
4089
4090
85.5k
    return 0;
4091
85.5k
}
4092
4093
// common component shared by cram_flush_container{,_mt}
4094
62.8k
static int cram_flush_container2(cram_fd *fd, cram_container *c) {
4095
62.8k
    int i, j;
4096
4097
62.8k
    if (c->curr_slice > 0 && !c->slices)
4098
0
        return -1;
4099
4100
    //fprintf(stderr, "Writing container %d, sum %u\n", c->record_counter, sum);
4101
4102
62.8k
    off_t c_offset = htell(fd->fp); // File offset of container
4103
4104
    /* Write the container struct itself */
4105
62.8k
    if (0 != cram_write_container(fd, c))
4106
0
        return -1;
4107
4108
62.8k
    off_t hdr_size = htell(fd->fp) - c_offset;
4109
4110
    /* And the compression header */
4111
62.8k
    if (0 != cram_write_block(fd, c->comp_hdr_block))
4112
0
        return -1;
4113
4114
    /* Followed by the slice blocks */
4115
62.8k
    off_t file_offset = htell(fd->fp);
4116
125k
    for (i = 0; i < c->curr_slice; i++) {
4117
62.8k
        cram_slice *s = c->slices[i];
4118
62.8k
        off_t spos = file_offset - c_offset - hdr_size;
4119
4120
62.8k
        if (0 != cram_write_block(fd, s->hdr_block))
4121
0
            return -1;
4122
4123
459k
        for (j = 0; j < s->hdr->num_blocks; j++) {
4124
396k
            if (0 != cram_write_block(fd, s->block[j]))
4125
0
                return -1;
4126
396k
        }
4127
4128
62.8k
        file_offset = htell(fd->fp);
4129
62.8k
        off_t sz = file_offset - c_offset - hdr_size - spos;
4130
4131
62.8k
        if (fd->idxfp) {
4132
0
            if (cram_index_slice(fd, c, s, fd->idxfp, c_offset, spos, sz) < 0)
4133
0
                return -1;
4134
0
        }
4135
62.8k
    }
4136
4137
62.8k
    return 0;
4138
62.8k
}
4139
4140
/*
4141
 * Flushes a completely or partially full container to disk, writing
4142
 * container structure, header and blocks. This also calls the encoder
4143
 * functions.
4144
 *
4145
 * Returns 0 on success
4146
 *        -1 on failure
4147
 */
4148
63.4k
int cram_flush_container(cram_fd *fd, cram_container *c) {
4149
    /* Encode the container blocks and generate compression header */
4150
63.4k
    if (0 != cram_encode_container(fd, c))
4151
636
        return -1;
4152
4153
62.8k
    return cram_flush_container2(fd, c);
4154
63.4k
}
4155
4156
typedef struct {
4157
    cram_fd *fd;
4158
    cram_container *c;
4159
} cram_job;
4160
4161
0
void *cram_flush_thread(void *arg) {
4162
0
    cram_job *j = (cram_job *)arg;
4163
4164
    /* Encode the container blocks and generate compression header */
4165
0
    if (0 != cram_encode_container(j->fd, j->c)) {
4166
0
        hts_log_error("Call to cram_encode_container failed");
4167
0
        return NULL;
4168
0
    }
4169
4170
0
    return arg;
4171
0
}
4172
4173
0
static int cram_flush_result(cram_fd *fd) {
4174
0
    int i, ret = 0;
4175
0
    hts_tpool_result *r;
4176
0
    cram_container *lc = NULL;
4177
4178
    // NB: we can have one result per slice, not per container,
4179
    // so we need to free the container only after all slices
4180
    // within it have been freed.  (Automatic via reference counting.)
4181
0
    while ((r = hts_tpool_next_result(fd->rqueue))) {
4182
0
        cram_job *j = (cram_job *)hts_tpool_result_data(r);
4183
0
        cram_container *c;
4184
4185
0
        if (!j) {
4186
0
            hts_tpool_delete_result(r, 0);
4187
0
            return -1;
4188
0
        }
4189
4190
0
        fd = j->fd;
4191
0
        c = j->c;
4192
4193
0
        if (fd->mode == 'w')
4194
0
            if (0 != cram_flush_container2(fd, c))
4195
0
                return -1;
4196
4197
        // Free the slices; filled out by encoder only
4198
0
        if (c->slices) {
4199
0
            for (i = 0; i < c->max_slice; i++) {
4200
0
                if (c->slices[i])
4201
0
                    cram_free_slice(c->slices[i]);
4202
0
                if (c->slices[i] == c->slice)
4203
0
                    c->slice = NULL;
4204
0
                c->slices[i] = NULL;
4205
0
            }
4206
0
        }
4207
4208
        // Free the current slice; set by both encoder & decoder
4209
0
        if (c->slice) {
4210
0
            cram_free_slice(c->slice);
4211
0
            c->slice = NULL;
4212
0
        }
4213
0
        c->curr_slice = 0;
4214
4215
        // Our jobs will be in order, so we free the last
4216
        // container when our job has switched to a new one.
4217
0
        if (c != lc) {
4218
0
            if (lc) {
4219
0
                if (fd->ctr == lc)
4220
0
                    fd->ctr = NULL;
4221
0
                if (fd->ctr_mt == lc)
4222
0
                    fd->ctr_mt = NULL;
4223
0
                cram_free_container(lc);
4224
0
            }
4225
0
            lc = c;
4226
0
        }
4227
4228
0
        hts_tpool_delete_result(r, 1);
4229
0
    }
4230
0
    if (lc) {
4231
0
        if (fd->ctr == lc)
4232
0
            fd->ctr = NULL;
4233
0
        if (fd->ctr_mt == lc)
4234
0
            fd->ctr_mt = NULL;
4235
0
        cram_free_container(lc);
4236
0
    }
4237
4238
0
    return ret;
4239
0
}
4240
4241
// Note: called while metrics_lock is held.
4242
// Will be left in this state too, but may temporarily unlock.
4243
7.20k
void reset_metrics(cram_fd *fd) {
4244
7.20k
    int i;
4245
4246
7.20k
    if (fd->pool) {
4247
        // If multi-threaded we have multiple blocks being
4248
        // compressed already and several on the to-do list
4249
        // (fd->rqueue->pending).  It's tricky to reset the
4250
        // metrics exactly the correct point, so instead we
4251
        // just flush the pool, reset, and then continue again.
4252
4253
        // Don't bother starting a new trial before then though.
4254
0
        for (i = 0; i < DS_END; i++) {
4255
0
            cram_metrics *m = fd->m[i];
4256
0
            if (!m)
4257
0
                continue;
4258
0
            m->next_trial = 999;
4259
0
        }
4260
4261
0
        pthread_mutex_unlock(&fd->metrics_lock);
4262
0
        hts_tpool_process_flush(fd->rqueue);
4263
0
        pthread_mutex_lock(&fd->metrics_lock);
4264
0
    }
4265
4266
345k
    for (i = 0; i < DS_END; i++) {
4267
338k
        cram_metrics *m = fd->m[i];
4268
338k
        if (!m)
4269
0
            continue;
4270
4271
338k
        m->trial = NTRIALS;
4272
338k
        m->next_trial = TRIAL_SPAN;
4273
338k
        m->revised_method = 0;
4274
338k
        m->unpackable = 0;
4275
4276
338k
        memset(m->sz, 0, sizeof(m->sz));
4277
338k
    }
4278
7.20k
}
4279
4280
63.4k
int cram_flush_container_mt(cram_fd *fd, cram_container *c) {
4281
63.4k
    cram_job *j;
4282
4283
    // At the junction of mapped to unmapped data the compression
4284
    // methods may need to change due to very different statistical
4285
    // properties; particularly BA if minhash sorted.
4286
    //
4287
    // However with threading we'll have several in-flight blocks
4288
    // arriving out of order.
4289
    //
4290
    // So we do one trial reset of NThreads to last for NThreads
4291
    // duration to get us over this transition period, followed
4292
    // by another retrial of the usual ntrials & trial span.
4293
63.4k
    pthread_mutex_lock(&fd->metrics_lock);
4294
63.4k
    if (c->n_mapped < 0.3*c->curr_rec &&
4295
63.4k
        fd->last_mapped > 0.7*c->max_rec) {
4296
7.20k
        reset_metrics(fd);
4297
7.20k
    }
4298
63.4k
    fd->last_mapped = c->n_mapped * (c->max_rec+1)/(c->curr_rec+1) ;
4299
63.4k
    pthread_mutex_unlock(&fd->metrics_lock);
4300
4301
63.4k
    if (!fd->pool)
4302
63.4k
        return cram_flush_container(fd, c);
4303
4304
0
    if (!(j = malloc(sizeof(*j))))
4305
0
        return -1;
4306
0
    j->fd = fd;
4307
0
    j->c = c;
4308
4309
    // Flush the job.  Note our encoder queue may be full, so we
4310
    // either have to keep trying in non-blocking mode (what we do) or
4311
    // use a dedicated separate thread for draining the queue.
4312
0
    for (;;) {
4313
0
        errno = 0;
4314
0
        hts_tpool_dispatch2(fd->pool, fd->rqueue, cram_flush_thread, j, 1);
4315
0
        int pending = (errno == EAGAIN);
4316
0
        if (cram_flush_result(fd) != 0)
4317
0
            return -1;
4318
0
        if (!pending)
4319
0
            break;
4320
4321
0
        hts_usleep(1000);
4322
0
    }
4323
4324
0
    return 0;
4325
0
}
4326
4327
/* ----------------------------------------------------------------------
4328
 * Compression headers; the first part of the container
4329
 */
4330
4331
/*
4332
 * Creates a new blank container compression header
4333
 *
4334
 * Returns header ptr on success
4335
 *         NULL on failure
4336
 */
4337
74.2k
cram_block_compression_hdr *cram_new_compression_header(void) {
4338
74.2k
    cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr));
4339
74.2k
    if (!hdr)
4340
0
        return NULL;
4341
4342
74.2k
    if (!(hdr->TD_blk = cram_new_block(CORE, 0))) {
4343
0
        free(hdr);
4344
0
        return NULL;
4345
0
    }
4346
4347
74.2k
    if (!(hdr->TD_hash = kh_init(m_s2i))) {
4348
0
        cram_free_block(hdr->TD_blk);
4349
0
        free(hdr);
4350
0
        return NULL;
4351
0
    }
4352
4353
74.2k
    if (!(hdr->TD_keys = string_pool_create(8192))) {
4354
0
        kh_destroy(m_s2i, hdr->TD_hash);
4355
0
        cram_free_block(hdr->TD_blk);
4356
0
        free(hdr);
4357
0
        return NULL;
4358
0
    }
4359
4360
74.2k
    return hdr;
4361
74.2k
}
4362
4363
76.8k
void cram_free_compression_header(cram_block_compression_hdr *hdr) {
4364
76.8k
    int i;
4365
4366
76.8k
    if (hdr->landmark)
4367
2.41k
        free(hdr->landmark);
4368
4369
76.8k
    if (hdr->preservation_map)
4370
76.8k
        kh_destroy(map, hdr->preservation_map);
4371
4372
2.53M
    for (i = 0; i < CRAM_MAP_HASH; i++) {
4373
2.45M
        cram_map *m, *m2;
4374
2.50M
        for (m = hdr->rec_encoding_map[i]; m; m = m2) {
4375
41.5k
            m2 = m->next;
4376
41.5k
            if (m->codec)
4377
0
                m->codec->free(m->codec);
4378
41.5k
            free(m);
4379
41.5k
        }
4380
2.45M
    }
4381
4382
2.53M
    for (i = 0; i < CRAM_MAP_HASH; i++) {
4383
2.45M
        cram_map *m, *m2;
4384
2.46M
        for (m = hdr->tag_encoding_map[i]; m; m = m2) {
4385
1.33k
            m2 = m->next;
4386
1.33k
            if (m->codec)
4387
1.33k
                m->codec->free(m->codec);
4388
1.33k
            free(m);
4389
1.33k
        }
4390
2.45M
    }
4391
4392
3.68M
    for (i = 0; i < DS_END; i++) {
4393
3.61M
        if (hdr->codecs[i])
4394
1.17M
            hdr->codecs[i]->free(hdr->codecs[i]);
4395
3.61M
    }
4396
4397
76.8k
    if (hdr->TL)
4398
90
        free(hdr->TL);
4399
76.8k
    if (hdr->TD_blk)
4400
74.3k
        cram_free_block(hdr->TD_blk);
4401
76.8k
    if (hdr->TD_hash)
4402
76.8k
        kh_destroy(m_s2i, hdr->TD_hash);
4403
76.8k
    if (hdr->TD_keys)
4404
74.2k
        string_pool_destroy(hdr->TD_keys);
4405
4406
76.8k
    free(hdr);
4407
76.8k
}
4408
4409
4410
/* ----------------------------------------------------------------------
4411
 * Slices and slice headers
4412
 */
4413
4414
64.2k
void cram_free_slice_header(cram_block_slice_hdr *hdr) {
4415
64.2k
    if (!hdr)
4416
0
        return;
4417
4418
64.2k
    if (hdr->block_content_ids)
4419
63.7k
        free(hdr->block_content_ids);
4420
4421
64.2k
    free(hdr);
4422
4423
64.2k
    return;
4424
64.2k
}
4425
4426
64.3k
void cram_free_slice(cram_slice *s) {
4427
64.3k
    if (!s)
4428
0
        return;
4429
4430
64.3k
    if (s->hdr_block)
4431
63.5k
        cram_free_block(s->hdr_block);
4432
4433
64.3k
    if (s->block) {
4434
63.6k
        int i;
4435
4436
63.6k
        if (s->hdr) {
4437
6.68M
            for (i = 0; i < s->hdr->num_blocks; i++) {
4438
6.62M
                if (i > 0 && s->block[i] == s->block[0])
4439
6.14M
                    continue;
4440
472k
                cram_free_block(s->block[i]);
4441
472k
            }
4442
63.6k
        }
4443
63.6k
        free(s->block);
4444
63.6k
    }
4445
4446
64.3k
    {
4447
        // Normally already copied into s->block[], but potentially still
4448
        // here if we error part way through cram_encode_slice.
4449
64.3k
        int i;
4450
253k
        for (i = 0; i < s->naux_block; i++)
4451
189k
            cram_free_block(s->aux_block[i]);
4452
64.3k
    }
4453
4454
64.3k
    if (s->block_by_id)
4455
660
        free(s->block_by_id);
4456
4457
64.3k
    if (s->hdr)
4458
64.2k
        cram_free_slice_header(s->hdr);
4459
4460
64.3k
    if (s->seqs_blk)
4461
64.1k
        cram_free_block(s->seqs_blk);
4462
4463
64.3k
    if (s->qual_blk)
4464
1.17k
        cram_free_block(s->qual_blk);
4465
4466
64.3k
    if (s->name_blk)
4467
1.17k
        cram_free_block(s->name_blk);
4468
4469
64.3k
    if (s->aux_blk)
4470
64.1k
        cram_free_block(s->aux_blk);
4471
4472
64.3k
    if (s->base_blk)
4473
1.17k
        cram_free_block(s->base_blk);
4474
4475
64.3k
    if (s->soft_blk)
4476
1.17k
        cram_free_block(s->soft_blk);
4477
4478
64.3k
    if (s->cigar)
4479
64.1k
        free(s->cigar);
4480
4481
64.3k
    if (s->crecs)
4482
64.1k
        free(s->crecs);
4483
4484
64.3k
    if (s->features)
4485
28.3k
        free(s->features);
4486
4487
64.3k
    if (s->TN)
4488
0
        free(s->TN);
4489
4490
64.3k
    if (s->pair_keys)
4491
63.4k
        string_pool_destroy(s->pair_keys);
4492
4493
64.3k
    if (s->pair[0])
4494
64.3k
        kh_destroy(m_s2i, s->pair[0]);
4495
64.3k
    if (s->pair[1])
4496
64.3k
        kh_destroy(m_s2i, s->pair[1]);
4497
4498
64.3k
    if (s->aux_block)
4499
53.6k
        free(s->aux_block);
4500
4501
64.3k
    free(s);
4502
64.3k
}
4503
4504
/*
4505
 * Creates a new empty slice in memory, for subsequent writing to
4506
 * disk.
4507
 *
4508
 * Returns cram_slice ptr on success
4509
 *         NULL on failure
4510
 */
4511
63.4k
cram_slice *cram_new_slice(enum cram_content_type type, int nrecs) {
4512
63.4k
    cram_slice *s = calloc(1, sizeof(*s));
4513
63.4k
    if (!s)
4514
0
        return NULL;
4515
4516
63.4k
    if (!(s->hdr = (cram_block_slice_hdr *)calloc(1, sizeof(*s->hdr))))
4517
0
        goto err;
4518
63.4k
    s->hdr->content_type = type;
4519
4520
63.4k
    s->hdr_block = NULL;
4521
63.4k
    s->block = NULL;
4522
63.4k
    s->block_by_id = NULL;
4523
63.4k
    s->last_apos = 0;
4524
63.4k
    if (!(s->crecs = malloc(nrecs * sizeof(cram_record))))  goto err;
4525
63.4k
    s->cigar_alloc = 1024;
4526
63.4k
    if (!(s->cigar = malloc(s->cigar_alloc * sizeof(*s->cigar)))) goto err;
4527
63.4k
    s->ncigar = 0;
4528
4529
63.4k
    if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0)))       goto err;
4530
63.4k
    if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS)))   goto err;
4531
63.4k
    if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN)))   goto err;
4532
63.4k
    if (!(s->aux_blk  = cram_new_block(EXTERNAL, DS_aux)))  goto err;
4533
63.4k
    if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN)))   goto err;
4534
63.4k
    if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC)))   goto err;
4535
4536
63.4k
    s->features = NULL;
4537
63.4k
    s->nfeatures = s->afeatures = 0;
4538
4539
63.4k
#ifndef TN_external
4540
63.4k
    s->TN = NULL;
4541
63.4k
    s->nTN = s->aTN = 0;
4542
63.4k
#endif
4543
4544
    // Volatile keys as we do realloc in dstring
4545
63.4k
    if (!(s->pair_keys = string_pool_create(8192))) goto err;
4546
63.4k
    if (!(s->pair[0] = kh_init(m_s2i)))             goto err;
4547
63.4k
    if (!(s->pair[1] = kh_init(m_s2i)))             goto err;
4548
4549
#ifdef BA_external
4550
    s->BA_len = 0;
4551
#endif
4552
4553
63.4k
    return s;
4554
4555
0
 err:
4556
0
    if (s)
4557
0
        cram_free_slice(s);
4558
4559
0
    return NULL;
4560
63.4k
}
4561
4562
/*
4563
 * Loads an entire slice.
4564
 * FIXME: In 1.0 the native unit of slices within CRAM is broken
4565
 * as slices contain references to objects in other slices.
4566
 * To work around this while keeping the slice oriented outer loop
4567
 * we read all slices and stitch them together into a fake large
4568
 * slice instead.
4569
 *
4570
 * Returns cram_slice ptr on success
4571
 *         NULL on failure
4572
 */
4573
836
cram_slice *cram_read_slice(cram_fd *fd) {
4574
836
    cram_block *b = cram_read_block(fd);
4575
836
    cram_slice *s = calloc(1, sizeof(*s));
4576
836
    int i, n, max_id, min_id;
4577
4578
836
    if (!b || !s)
4579
47
        goto err;
4580
4581
789
    s->hdr_block = b;
4582
789
    switch (b->content_type) {
4583
765
    case MAPPED_SLICE:
4584
773
    case UNMAPPED_SLICE:
4585
773
        if (!(s->hdr = cram_decode_slice_header(fd, b)))
4586
53
            goto err;
4587
720
        break;
4588
4589
720
    default:
4590
16
        hts_log_error("Unexpected block of type %s",
4591
16
                      cram_content_type2str(b->content_type));
4592
16
        goto err;
4593
789
    }
4594
4595
720
    if (s->hdr->num_blocks < 1) {
4596
8
        hts_log_error("Slice does not include any data blocks");
4597
8
        goto err;
4598
8
    }
4599
4600
712
    s->block = calloc(n = s->hdr->num_blocks, sizeof(*s->block));
4601
712
    if (!s->block)
4602
0
        goto err;
4603
4604
3.69k
    for (max_id = i = 0, min_id = INT_MAX; i < n; i++) {
4605
3.03k
        if (!(s->block[i] = cram_read_block(fd)))
4606
52
            goto err;
4607
4608
2.97k
        if (s->block[i]->content_type == EXTERNAL) {
4609
630
            if (max_id < s->block[i]->content_id)
4610
15
                max_id = s->block[i]->content_id;
4611
630
            if (min_id > s->block[i]->content_id)
4612
585
                min_id = s->block[i]->content_id;
4613
630
        }
4614
2.97k
    }
4615
4616
660
    if (!(s->block_by_id = calloc(512, sizeof(s->block[0]))))
4617
0
        goto err;
4618
4619
3.39k
    for (i = 0; i < n; i++) {
4620
2.73k
        if (s->block[i]->content_type != EXTERNAL)
4621
2.10k
            continue;
4622
627
        uint32_t v = s->block[i]->content_id;
4623
627
        if (v >= 256)
4624
17
            v = 256 + v % 251;
4625
627
        s->block_by_id[v] = s->block[i];
4626
627
    }
4627
4628
    /* Initialise encoding/decoding tables */
4629
660
    s->cigar_alloc = 1024;
4630
660
    if (!(s->cigar = malloc(s->cigar_alloc * sizeof(*s->cigar)))) goto err;
4631
660
    s->ncigar = 0;
4632
4633
660
    if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0)))      goto err;
4634
660
    if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS)))  goto err;
4635
660
    if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN)))  goto err;
4636
660
    if (!(s->aux_blk  = cram_new_block(EXTERNAL, DS_aux))) goto err;
4637
660
    if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN)))  goto err;
4638
660
    if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC)))  goto err;
4639
4640
660
    s->crecs = NULL;
4641
4642
660
    s->last_apos = s->hdr->ref_seq_start;
4643
660
    s->decode_md = fd->decode_md;
4644
4645
660
    return s;
4646
4647
176
 err:
4648
176
    if (b)
4649
129
        cram_free_block(b);
4650
176
    if (s) {
4651
176
        s->hdr_block = NULL;
4652
176
        cram_free_slice(s);
4653
176
    }
4654
176
    return NULL;
4655
660
}
4656
4657
4658
/* ----------------------------------------------------------------------
4659
 * CRAM file definition (header)
4660
 */
4661
4662
/*
4663
 * Reads a CRAM file definition structure.
4664
 * Returns file_def ptr on success
4665
 *         NULL on failure
4666
 */
4667
12.5k
cram_file_def *cram_read_file_def(cram_fd *fd) {
4668
12.5k
    cram_file_def *def = malloc(sizeof(*def));
4669
12.5k
    if (!def)
4670
0
        return NULL;
4671
4672
12.5k
    if (26 != hread(fd->fp, &def->magic[0], 26)) {
4673
6
        free(def);
4674
6
        return NULL;
4675
6
    }
4676
4677
12.5k
    if (memcmp(def->magic, "CRAM", 4) != 0) {
4678
1
        free(def);
4679
1
        return NULL;
4680
1
    }
4681
4682
12.5k
    if (def->major_version > 4) {
4683
1
        hts_log_error("CRAM version number mismatch. Expected 1.x, 2.x, 3.x or 4.x, got %d.%d",
4684
1
                      def->major_version, def->minor_version);
4685
1
        free(def);
4686
1
        return NULL;
4687
1
    }
4688
4689
12.5k
    fd->first_container += 26;
4690
12.5k
    fd->curr_position = fd->first_container;
4691
12.5k
    fd->last_slice = 0;
4692
4693
12.5k
    return def;
4694
12.5k
}
4695
4696
/*
4697
 * Writes a cram_file_def structure to cram_fd.
4698
 * Returns 0 on success
4699
 *        -1 on failure
4700
 */
4701
10.7k
int cram_write_file_def(cram_fd *fd, cram_file_def *def) {
4702
10.7k
    return (hwrite(fd->fp, &def->magic[0], 26) == 26) ? 0 : -1;
4703
10.7k
}
4704
4705
25.1k
void cram_free_file_def(cram_file_def *def) {
4706
25.1k
    if (def) free(def);
4707
25.1k
}
4708
4709
/* ----------------------------------------------------------------------
4710
 * SAM header I/O
4711
 */
4712
4713
4714
/*
4715
 * Reads the SAM header from the first CRAM data block.
4716
 * Also performs minimal parsing to extract read-group
4717
 * and sample information.
4718
4719
 * Returns SAM hdr ptr on success
4720
 *         NULL on failure
4721
 */
4722
12.5k
sam_hdr_t *cram_read_SAM_hdr(cram_fd *fd) {
4723
12.5k
    int32_t header_len;
4724
12.5k
    char *header;
4725
12.5k
    sam_hdr_t *hdr;
4726
4727
    /* 1.1 onwards stores the header in the first block of a container */
4728
12.5k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
4729
        /* Length */
4730
9.79k
        if (-1 == int32_decode(fd, &header_len))
4731
2
            return NULL;
4732
4733
9.79k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
4734
9.79k
        if (header_len > FUZZ_ALLOC_LIMIT)
4735
2
            return NULL;
4736
9.78k
#endif
4737
4738
        /* Alloc and read */
4739
9.78k
        if (header_len < 0 || NULL == (header = malloc((size_t) header_len+1)))
4740
0
            return NULL;
4741
4742
9.78k
        if (header_len != hread(fd->fp, header, header_len)) {
4743
34
            free(header);
4744
34
            return NULL;
4745
34
        }
4746
9.75k
        header[header_len] = '\0';
4747
4748
9.75k
        fd->first_container += 4 + header_len;
4749
9.75k
    } else {
4750
2.76k
        cram_container *c = cram_read_container(fd);
4751
2.76k
        cram_block *b;
4752
2.76k
        int i;
4753
2.76k
        int64_t len;
4754
4755
2.76k
        if (!c)
4756
148
            return NULL;
4757
4758
2.61k
        fd->first_container += c->length + c->offset;
4759
2.61k
        fd->curr_position = fd->first_container;
4760
4761
2.61k
        if (c->num_blocks < 1) {
4762
76
            cram_free_container(c);
4763
76
            return NULL;
4764
76
        }
4765
4766
2.54k
        if (!(b = cram_read_block(fd))) {
4767
131
            cram_free_container(c);
4768
131
            return NULL;
4769
131
        }
4770
2.40k
        if (cram_uncompress_block(b) != 0) {
4771
1.55k
            cram_free_container(c);
4772
1.55k
            cram_free_block(b);
4773
1.55k
            return NULL;
4774
1.55k
        }
4775
4776
859
        len = b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
4777
859
            fd->vv.varint_size(b->content_id) +
4778
859
            fd->vv.varint_size(b->uncomp_size) +
4779
859
            fd->vv.varint_size(b->comp_size);
4780
4781
        /* Extract header from 1st block */
4782
859
        if (-1 == int32_get_blk(b, &header_len) ||
4783
859
            header_len < 0 || /* Spec. says signed...  why? */
4784
859
            b->uncomp_size - 4 < header_len) {
4785
131
            cram_free_container(c);
4786
131
            cram_free_block(b);
4787
131
            return NULL;
4788
131
        }
4789
728
        if (NULL == (header = malloc((size_t) header_len+1))) {
4790
0
            cram_free_container(c);
4791
0
            cram_free_block(b);
4792
0
            return NULL;
4793
0
        }
4794
728
        memcpy(header, BLOCK_END(b), header_len);
4795
728
        header[header_len] = '\0';
4796
728
        cram_free_block(b);
4797
4798
        /* Consume any remaining blocks */
4799
3.23k
        for (i = 1; i < c->num_blocks; i++) {
4800
2.52k
            if (!(b = cram_read_block(fd))) {
4801
15
                cram_free_container(c);
4802
15
                free(header);
4803
15
                return NULL;
4804
15
            }
4805
2.50k
            len += b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
4806
2.50k
                fd->vv.varint_size(b->content_id) +
4807
2.50k
                fd->vv.varint_size(b->uncomp_size) +
4808
2.50k
                fd->vv.varint_size(b->comp_size);
4809
2.50k
            cram_free_block(b);
4810
2.50k
        }
4811
4812
713
        if (c->length > 0 && len > 0 && c->length > len) {
4813
            // Consume padding
4814
35
            char *pads = malloc(c->length - len);
4815
35
            if (!pads) {
4816
0
                cram_free_container(c);
4817
0
                free(header);
4818
0
                return NULL;
4819
0
            }
4820
4821
35
            if (c->length - len != hread(fd->fp, pads, c->length - len)) {
4822
22
                cram_free_container(c);
4823
22
                free(header);
4824
22
                free(pads);
4825
22
                return NULL;
4826
22
            }
4827
13
            free(pads);
4828
13
        }
4829
4830
691
        cram_free_container(c);
4831
691
    }
4832
4833
    /* Parse */
4834
10.4k
    hdr = sam_hdr_init();
4835
10.4k
    if (!hdr) {
4836
0
        free(header);
4837
0
        return NULL;
4838
0
    }
4839
4840
10.4k
    if (-1 == sam_hdr_add_lines(hdr, header, header_len)) {
4841
125
        free(header);
4842
125
        sam_hdr_destroy(hdr);
4843
125
        return NULL;
4844
125
    }
4845
4846
10.3k
    hdr->l_text = header_len;
4847
10.3k
    hdr->text = header;
4848
4849
10.3k
    return hdr;
4850
4851
10.4k
}
4852
4853
/*
4854
 * Converts 'in' to a full pathname to store in out.
4855
 * Out must be at least PATH_MAX bytes long.
4856
 */
4857
0
static void full_path(char *out, char *in) {
4858
0
    size_t in_l = strlen(in);
4859
0
    if (hisremote(in)) {
4860
0
        if (in_l > PATH_MAX) {
4861
0
            hts_log_error("Reference path is longer than %d", PATH_MAX);
4862
0
            return;
4863
0
        }
4864
0
        strncpy(out, in, PATH_MAX-1);
4865
0
        out[PATH_MAX-1] = 0;
4866
0
        return;
4867
0
    }
4868
0
    if (*in == '/' ||
4869
        // Windows paths
4870
0
        (in_l > 3 && toupper_c(*in) >= 'A'  && toupper_c(*in) <= 'Z' &&
4871
0
         in[1] == ':' && (in[2] == '/' || in[2] == '\\'))) {
4872
0
        strncpy(out, in, PATH_MAX-1);
4873
0
        out[PATH_MAX-1] = 0;
4874
0
    } else {
4875
0
        size_t len;
4876
4877
        // unable to get dir or out+in is too long
4878
0
        if (!getcwd(out, PATH_MAX) ||
4879
0
            (len = strlen(out))+1+strlen(in) >= PATH_MAX) {
4880
0
            strncpy(out, in, PATH_MAX-1);
4881
0
            out[PATH_MAX-1] = 0;
4882
0
            return;
4883
0
        }
4884
4885
0
        snprintf(out+len, PATH_MAX - len, "/%s", in);
4886
4887
        // FIXME: cope with `pwd`/../../../foo.fa ?
4888
0
    }
4889
0
}
4890
4891
/*
4892
 * Writes a CRAM SAM header.
4893
 * Returns 0 on success
4894
 *        -1 on failure
4895
 */
4896
10.7k
int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) {
4897
10.7k
    size_t header_len;
4898
10.7k
    int blank_block = (CRAM_MAJOR_VERS(fd->version) >= 3);
4899
4900
    /* Write CRAM MAGIC if not yet written. */
4901
10.7k
    if (fd->file_def->major_version == 0) {
4902
10.7k
        fd->file_def->major_version = CRAM_MAJOR_VERS(fd->version);
4903
10.7k
        fd->file_def->minor_version = CRAM_MINOR_VERS(fd->version);
4904
10.7k
        if (0 != cram_write_file_def(fd, fd->file_def))
4905
0
            return -1;
4906
10.7k
    }
4907
4908
    /* 1.0 requires an UNKNOWN read-group */
4909
10.7k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
4910
0
        if (!sam_hrecs_find_rg(hdr->hrecs, "UNKNOWN"))
4911
0
            if (sam_hdr_add_line(hdr, "RG",
4912
0
                            "ID", "UNKNOWN", "SM", "UNKNOWN", NULL))
4913
0
                return -1;
4914
0
    }
4915
4916
10.7k
    if (-1 == refs_from_header(fd))
4917
0
        return -1;
4918
10.7k
    if (-1 == refs2id(fd->refs, fd->header))
4919
0
        return -1;
4920
4921
    /* Fix M5 strings */
4922
10.7k
    if (fd->refs && !fd->no_ref && fd->embed_ref <= 1) {
4923
10.7k
        int i;
4924
10.7k
        for (i = 0; i < hdr->hrecs->nref; i++) {
4925
4.02k
            sam_hrec_type_t *ty;
4926
4.02k
            char *ref;
4927
4928
4.02k
            if (!(ty = sam_hrecs_find_type_id(hdr->hrecs, "SQ", "SN", hdr->hrecs->ref[i].name)))
4929
0
                return -1;
4930
4931
4.02k
            if (!sam_hrecs_find_key(ty, "M5", NULL)) {
4932
3.96k
                char unsigned buf[16];
4933
3.96k
                char buf2[33];
4934
3.96k
                hts_pos_t rlen;
4935
3.96k
                hts_md5_context *md5;
4936
4937
3.96k
                if (!fd->refs ||
4938
3.96k
                    !fd->refs->ref_id ||
4939
3.96k
                    !fd->refs->ref_id[i]) {
4940
0
                    return -1;
4941
0
                }
4942
3.96k
                rlen = fd->refs->ref_id[i]->length;
4943
3.96k
                ref = cram_get_ref(fd, i, 1, rlen);
4944
3.96k
                if (NULL == ref) {
4945
3.96k
                    if (fd->embed_ref == -1) {
4946
                        // auto embed-ref
4947
3.96k
                        hts_log_warning("No M5 tags present and could not "
4948
3.96k
                                        "find reference");
4949
3.96k
                        hts_log_warning("Enabling embed_ref=2 option");
4950
3.96k
                        hts_log_warning("NOTE: the CRAM file will be bigger "
4951
3.96k
                                        "than using an external reference");
4952
3.96k
                        pthread_mutex_lock(&fd->ref_lock);
4953
                        // Best guess.  It may be unmapped data with broken
4954
                        // headers, in which case this will get ignored.
4955
3.96k
                        fd->embed_ref = 2;
4956
3.96k
                        pthread_mutex_unlock(&fd->ref_lock);
4957
3.96k
                        break;
4958
3.96k
                    }
4959
0
                    return -1;
4960
3.96k
                }
4961
0
                rlen = fd->refs->ref_id[i]->length; /* In case it just loaded */
4962
0
                if (!(md5 = hts_md5_init()))
4963
0
                    return -1;
4964
0
                if (HTS_POS_MAX <= ULONG_MAX) {
4965
                    // Platforms with 64-bit unsigned long update in one go
4966
0
                    hts_md5_update(md5, ref, rlen);
4967
0
                } else {
4968
                    // Those with 32-bit ulong (Windows) may have to loop
4969
                    // over epic references
4970
0
                    hts_pos_t pos = 0;
4971
0
                    while (rlen - pos > ULONG_MAX) {
4972
0
                        hts_md5_update(md5, ref + pos, ULONG_MAX);
4973
0
                        pos += ULONG_MAX;
4974
0
                    }
4975
0
                    hts_md5_update(md5, ref + pos, (unsigned long)(rlen - pos));
4976
0
                }
4977
0
                hts_md5_final(buf, md5);
4978
0
                hts_md5_destroy(md5);
4979
0
                cram_ref_decr(fd->refs, i);
4980
4981
0
                hts_md5_hex(buf2, buf);
4982
0
                fd->refs->ref_id[i]->validated_md5 = 1;
4983
0
                if (sam_hdr_update_line(hdr, "SQ", "SN", hdr->hrecs->ref[i].name, "M5", buf2, NULL))
4984
0
                    return -1;
4985
0
            }
4986
4987
58
            if (fd->ref_fn) {
4988
0
                char ref_fn[PATH_MAX];
4989
0
                full_path(ref_fn, fd->ref_fn);
4990
0
                if (sam_hdr_update_line(hdr, "SQ", "SN", hdr->hrecs->ref[i].name, "UR", ref_fn, NULL))
4991
0
                    return -1;
4992
0
            }
4993
58
        }
4994
10.7k
    }
4995
4996
    /* Length */
4997
10.7k
    header_len = sam_hdr_length(hdr);
4998
10.7k
    if (header_len > INT32_MAX) {
4999
0
        hts_log_error("Header is too long for CRAM format");
5000
0
        return -1;
5001
0
    }
5002
10.7k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
5003
0
        if (-1 == int32_encode(fd, header_len))
5004
0
            return -1;
5005
5006
        /* Text data */
5007
0
        if (header_len != hwrite(fd->fp, sam_hdr_str(hdr), header_len))
5008
0
            return -1;
5009
10.7k
    } else {
5010
        /* Create block(s) inside a container */
5011
10.7k
        cram_block *b = cram_new_block(FILE_HEADER, 0);
5012
10.7k
        cram_container *c = cram_new_container(0, 0);
5013
10.7k
        int padded_length;
5014
10.7k
        char *pads;
5015
10.7k
        int is_cram_3 = (CRAM_MAJOR_VERS(fd->version) >= 3);
5016
5017
10.7k
        if (!b || !c) {
5018
0
            if (b) cram_free_block(b);
5019
0
            if (c) cram_free_container(c);
5020
0
            return -1;
5021
0
        }
5022
5023
10.7k
        if (int32_put_blk(b, header_len) < 0)
5024
0
            return -1;
5025
10.7k
        if (header_len)
5026
5.36k
            BLOCK_APPEND(b, sam_hdr_str(hdr), header_len);
5027
10.7k
        BLOCK_UPLEN(b);
5028
5029
        // Compress header block if V3.0 and above
5030
10.7k
        if (CRAM_MAJOR_VERS(fd->version) >= 3)
5031
10.7k
            if (cram_compress_block(fd, b, NULL, -1, -1) < 0)
5032
0
                return -1;
5033
5034
10.7k
        if (blank_block) {
5035
10.7k
            c->length = b->comp_size + 2 + 4*is_cram_3 +
5036
10.7k
                fd->vv.varint_size(b->content_id)   +
5037
10.7k
                fd->vv.varint_size(b->uncomp_size)  +
5038
10.7k
                fd->vv.varint_size(b->comp_size);
5039
5040
10.7k
            c->num_blocks = 2;
5041
10.7k
            c->num_landmarks = 2;
5042
10.7k
            if (!(c->landmark = malloc(2*sizeof(*c->landmark)))) {
5043
0
                cram_free_block(b);
5044
0
                cram_free_container(c);
5045
0
                return -1;
5046
0
            }
5047
10.7k
            c->landmark[0] = 0;
5048
10.7k
            c->landmark[1] = c->length;
5049
5050
            // Plus extra storage for uncompressed secondary blank block
5051
10.7k
            padded_length = MIN(c->length*.5, 10000);
5052
10.7k
            c->length += padded_length + 2 + 4*is_cram_3 +
5053
10.7k
                fd->vv.varint_size(b->content_id) +
5054
10.7k
                fd->vv.varint_size(padded_length)*2;
5055
10.7k
        } else {
5056
            // Pad the block instead.
5057
0
            c->num_blocks = 1;
5058
0
            c->num_landmarks = 1;
5059
0
            if (!(c->landmark = malloc(sizeof(*c->landmark))))
5060
0
                return -1;
5061
0
            c->landmark[0] = 0;
5062
5063
0
            padded_length = MAX(c->length*1.5, 10000) - c->length;
5064
5065
0
            c->length = b->comp_size + padded_length +
5066
0
                2 + 4*is_cram_3 +
5067
0
                fd->vv.varint_size(b->content_id)   +
5068
0
                fd->vv.varint_size(b->uncomp_size)  +
5069
0
                fd->vv.varint_size(b->comp_size);
5070
5071
0
            if (NULL == (pads = calloc(1, padded_length))) {
5072
0
                cram_free_block(b);
5073
0
                cram_free_container(c);
5074
0
                return -1;
5075
0
            }
5076
0
            BLOCK_APPEND(b, pads, padded_length);
5077
0
            BLOCK_UPLEN(b);
5078
0
            free(pads);
5079
0
        }
5080
5081
10.7k
        if (-1 == cram_write_container(fd, c)) {
5082
0
            cram_free_block(b);
5083
0
            cram_free_container(c);
5084
0
            return -1;
5085
0
        }
5086
5087
10.7k
        if (-1 == cram_write_block(fd, b)) {
5088
0
            cram_free_block(b);
5089
0
            cram_free_container(c);
5090
0
            return -1;
5091
0
        }
5092
5093
10.7k
        if (blank_block) {
5094
10.7k
            BLOCK_RESIZE(b, padded_length);
5095
10.7k
            memset(BLOCK_DATA(b), 0, padded_length);
5096
10.7k
            BLOCK_SIZE(b) = padded_length;
5097
10.7k
            BLOCK_UPLEN(b);
5098
10.7k
            b->method = RAW;
5099
10.7k
            if (-1 == cram_write_block(fd, b)) {
5100
0
                cram_free_block(b);
5101
0
                cram_free_container(c);
5102
0
                return -1;
5103
0
            }
5104
10.7k
        }
5105
5106
10.7k
        cram_free_block(b);
5107
10.7k
        cram_free_container(c);
5108
10.7k
    }
5109
5110
10.7k
    if (0 != hflush(fd->fp))
5111
0
        return -1;
5112
5113
10.7k
    RP("=== Finishing saving header ===\n");
5114
5115
10.7k
    return 0;
5116
5117
0
 block_err:
5118
0
    return -1;
5119
10.7k
}
5120
5121
/* ----------------------------------------------------------------------
5122
 * The top-level cram opening, closing and option handling
5123
 */
5124
5125
/*
5126
 * Sets CRAM variable sized integer decode function tables.
5127
 * CRAM 1, 2, and 3.x all used ITF8 for uint32 and UTF8 for uint64.
5128
 * CRAM 4.x uses the same encoding mechanism for 32-bit and 64-bit
5129
 * (or anything inbetween), but also now supports signed values.
5130
 *
5131
 * Version is the CRAM major version number.
5132
 * vv is the vector table (probably &cram_fd->vv)
5133
 */
5134
25.1k
static void cram_init_varint(varint_vec *vv, int version) {
5135
25.1k
    if (version >= 4) {
5136
798
        vv->varint_get32 = uint7_get_32; // FIXME: varint.h API should be size agnostic
5137
798
        vv->varint_get32s = sint7_get_32;
5138
798
        vv->varint_get64 = uint7_get_64;
5139
798
        vv->varint_get64s = sint7_get_64;
5140
798
        vv->varint_put32 = uint7_put_32;
5141
798
        vv->varint_put32s = sint7_put_32;
5142
798
        vv->varint_put64 = uint7_put_64;
5143
798
        vv->varint_put64s = sint7_put_64;
5144
798
        vv->varint_put32_blk = uint7_put_blk_32;
5145
798
        vv->varint_put32s_blk = sint7_put_blk_32;
5146
798
        vv->varint_put64_blk = uint7_put_blk_64;
5147
798
        vv->varint_put64s_blk = sint7_put_blk_64;
5148
798
        vv->varint_size = uint7_size;
5149
798
        vv->varint_decode32_crc = uint7_decode_crc32;
5150
798
        vv->varint_decode32s_crc = sint7_decode_crc32;
5151
798
        vv->varint_decode64_crc = uint7_decode_crc64;
5152
24.3k
    } else {
5153
24.3k
        vv->varint_get32 = safe_itf8_get;
5154
24.3k
        vv->varint_get32s = safe_itf8_get;
5155
24.3k
        vv->varint_get64 = safe_ltf8_get;
5156
24.3k
        vv->varint_get64s = safe_ltf8_get;
5157
24.3k
        vv->varint_put32 = safe_itf8_put;
5158
24.3k
        vv->varint_put32s = safe_itf8_put;
5159
24.3k
        vv->varint_put64 = safe_ltf8_put;
5160
24.3k
        vv->varint_put64s = safe_ltf8_put;
5161
24.3k
        vv->varint_put32_blk = itf8_put_blk;
5162
24.3k
        vv->varint_put32s_blk = itf8_put_blk;
5163
24.3k
        vv->varint_put64_blk = ltf8_put_blk;
5164
24.3k
        vv->varint_put64s_blk = ltf8_put_blk;
5165
24.3k
        vv->varint_size = itf8_size;
5166
24.3k
        vv->varint_decode32_crc = itf8_decode_crc;
5167
24.3k
        vv->varint_decode32s_crc = itf8_decode_crc;
5168
24.3k
        vv->varint_decode64_crc = ltf8_decode_crc;
5169
24.3k
    }
5170
25.1k
}
5171
5172
/*
5173
 * Initialises the lookup tables. These could be global statics, but they're
5174
 * clumsy to setup in a multi-threaded environment unless we generate
5175
 * verbatim code and include that.
5176
 */
5177
25.1k
static void cram_init_tables(cram_fd *fd) {
5178
25.1k
    int i;
5179
5180
25.1k
    memset(fd->L1, 4, 256);
5181
25.1k
    fd->L1['A'] = 0; fd->L1['a'] = 0;
5182
25.1k
    fd->L1['C'] = 1; fd->L1['c'] = 1;
5183
25.1k
    fd->L1['G'] = 2; fd->L1['g'] = 2;
5184
25.1k
    fd->L1['T'] = 3; fd->L1['t'] = 3;
5185
5186
25.1k
    memset(fd->L2, 5, 256);
5187
25.1k
    fd->L2['A'] = 0; fd->L2['a'] = 0;
5188
25.1k
    fd->L2['C'] = 1; fd->L2['c'] = 1;
5189
25.1k
    fd->L2['G'] = 2; fd->L2['g'] = 2;
5190
25.1k
    fd->L2['T'] = 3; fd->L2['t'] = 3;
5191
25.1k
    fd->L2['N'] = 4; fd->L2['n'] = 4;
5192
5193
25.1k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
5194
5.02M
        for (i = 0; i < 0x200; i++) {
5195
5.01M
            int f = 0;
5196
5197
5.01M
            if (i & CRAM_FPAIRED)      f |= BAM_FPAIRED;
5198
5.01M
            if (i & CRAM_FPROPER_PAIR) f |= BAM_FPROPER_PAIR;
5199
5.01M
            if (i & CRAM_FUNMAP)       f |= BAM_FUNMAP;
5200
5.01M
            if (i & CRAM_FREVERSE)     f |= BAM_FREVERSE;
5201
5.01M
            if (i & CRAM_FREAD1)       f |= BAM_FREAD1;
5202
5.01M
            if (i & CRAM_FREAD2)       f |= BAM_FREAD2;
5203
5.01M
            if (i & CRAM_FSECONDARY)   f |= BAM_FSECONDARY;
5204
5.01M
            if (i & CRAM_FQCFAIL)      f |= BAM_FQCFAIL;
5205
5.01M
            if (i & CRAM_FDUP)         f |= BAM_FDUP;
5206
5207
5.01M
            fd->bam_flag_swap[i]  = f;
5208
5.01M
        }
5209
5210
40.1M
        for (i = 0; i < 0x1000; i++) {
5211
40.1M
            int g = 0;
5212
5213
40.1M
            if (i & BAM_FPAIRED)           g |= CRAM_FPAIRED;
5214
40.1M
            if (i & BAM_FPROPER_PAIR)  g |= CRAM_FPROPER_PAIR;
5215
40.1M
            if (i & BAM_FUNMAP)        g |= CRAM_FUNMAP;
5216
40.1M
            if (i & BAM_FREVERSE)      g |= CRAM_FREVERSE;
5217
40.1M
            if (i & BAM_FREAD1)        g |= CRAM_FREAD1;
5218
40.1M
            if (i & BAM_FREAD2)        g |= CRAM_FREAD2;
5219
40.1M
            if (i & BAM_FSECONDARY)    g |= CRAM_FSECONDARY;
5220
40.1M
            if (i & BAM_FQCFAIL)       g |= CRAM_FQCFAIL;
5221
40.1M
            if (i & BAM_FDUP)          g |= CRAM_FDUP;
5222
5223
40.1M
            fd->cram_flag_swap[i] = g;
5224
40.1M
        }
5225
15.3k
    } else {
5226
        /* NOP */
5227
62.8M
        for (i = 0; i < 0x1000; i++)
5228
62.8M
            fd->bam_flag_swap[i] = i;
5229
62.8M
        for (i = 0; i < 0x1000; i++)
5230
62.8M
            fd->cram_flag_swap[i] = i;
5231
15.3k
    }
5232
5233
25.1k
    memset(fd->cram_sub_matrix, 4, 32*32);
5234
829k
    for (i = 0; i < 32; i++) {
5235
804k
        fd->cram_sub_matrix[i]['A'&0x1f]=0;
5236
804k
        fd->cram_sub_matrix[i]['C'&0x1f]=1;
5237
804k
        fd->cram_sub_matrix[i]['G'&0x1f]=2;
5238
804k
        fd->cram_sub_matrix[i]['T'&0x1f]=3;
5239
804k
        fd->cram_sub_matrix[i]['N'&0x1f]=4;
5240
804k
    }
5241
150k
    for (i = 0; i < 20; i+=4) {
5242
125k
        int j;
5243
2.63M
        for (j = 0; j < 20; j++) {
5244
2.51M
            fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
5245
2.51M
            fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
5246
2.51M
            fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
5247
2.51M
            fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
5248
2.51M
        }
5249
125k
        fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+0]&0x1f]=0;
5250
125k
        fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+1]&0x1f]=1;
5251
125k
        fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+2]&0x1f]=2;
5252
125k
        fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+3]&0x1f]=3;
5253
125k
    }
5254
5255
25.1k
    cram_init_varint(&fd->vv, CRAM_MAJOR_VERS(fd->version));
5256
25.1k
}
5257
5258
// Default version numbers for CRAM
5259
static int major_version = 3;
5260
static int minor_version = 1;
5261
5262
/*
5263
 * Opens a CRAM file for read (mode "rb") or write ("wb").
5264
 * The filename may be "-" to indicate stdin or stdout.
5265
 *
5266
 * Returns file handle on success
5267
 *         NULL on failure.
5268
 */
5269
0
cram_fd *cram_open(const char *filename, const char *mode) {
5270
0
    hFILE *fp;
5271
0
    cram_fd *fd;
5272
0
    char fmode[3]= { mode[0], '\0', '\0' };
5273
5274
0
    if (strlen(mode) > 1 && (mode[1] == 'b' || mode[1] == 'c')) {
5275
0
        fmode[1] = 'b';
5276
0
    }
5277
5278
0
    fp = hopen(filename, fmode);
5279
0
    if (!fp)
5280
0
        return NULL;
5281
5282
0
    fd = cram_dopen(fp, filename, mode);
5283
0
    if (!fd)
5284
0
        hclose_abruptly(fp);
5285
5286
0
    return fd;
5287
0
}
5288
5289
/* Opens an existing stream for reading or writing.
5290
 *
5291
 * Returns file handle on success;
5292
 *         NULL on failure.
5293
 */
5294
25.1k
cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) {
5295
25.1k
    int i;
5296
25.1k
    char *cp;
5297
25.1k
    cram_fd *fd = calloc(1, sizeof(*fd));
5298
25.1k
    if (!fd)
5299
0
        return NULL;
5300
5301
25.1k
    fd->level = CRAM_DEFAULT_LEVEL;
5302
75.4k
    for (i = 0; mode[i]; i++) {
5303
50.2k
        if (mode[i] >= '0' && mode[i] <= '9') {
5304
0
            fd->level = mode[i] - '0';
5305
0
            break;
5306
0
        }
5307
50.2k
    }
5308
5309
25.1k
    fd->fp = fp;
5310
25.1k
    fd->mode = *mode;
5311
25.1k
    fd->first_container = 0;
5312
25.1k
    fd->curr_position = 0;
5313
5314
25.1k
    if (fd->mode == 'r') {
5315
        /* Reader */
5316
5317
12.5k
        if (!(fd->file_def = cram_read_file_def(fd)))
5318
8
            goto err;
5319
5320
12.5k
        fd->version = fd->file_def->major_version * 256 +
5321
12.5k
            fd->file_def->minor_version;
5322
5323
12.5k
        cram_init_tables(fd);
5324
5325
12.5k
        if (!(fd->header = cram_read_SAM_hdr(fd))) {
5326
2.23k
            cram_free_file_def(fd->file_def);
5327
2.23k
            goto err;
5328
2.23k
        }
5329
5330
12.5k
    } else {
5331
        /* Writer */
5332
12.5k
        cram_file_def *def = calloc(1, sizeof(*def));
5333
12.5k
        if (!def)
5334
0
            return NULL;
5335
5336
12.5k
        fd->file_def = def;
5337
5338
12.5k
        def->magic[0] = 'C';
5339
12.5k
        def->magic[1] = 'R';
5340
12.5k
        def->magic[2] = 'A';
5341
12.5k
        def->magic[3] = 'M';
5342
12.5k
        def->major_version = 0; // Indicator to write file def later.
5343
12.5k
        def->minor_version = 0;
5344
12.5k
        memset(def->file_id, 0, 20);
5345
12.5k
        strncpy(def->file_id, filename, 20);
5346
5347
12.5k
        fd->version = major_version * 256 + minor_version;
5348
12.5k
        cram_init_tables(fd);
5349
5350
        /* SAM header written later along with this file_def */
5351
12.5k
    }
5352
5353
22.8k
    fd->prefix = strdup((cp = strrchr(filename, '/')) ? cp+1 : filename);
5354
22.8k
    if (!fd->prefix)
5355
0
        goto err;
5356
22.8k
    fd->first_base = fd->last_base = -1;
5357
22.8k
    fd->record_counter = 0;
5358
5359
22.8k
    fd->ctr = NULL;
5360
22.8k
    fd->ctr_mt = NULL;
5361
22.8k
    fd->refs  = refs_create();
5362
22.8k
    if (!fd->refs)
5363
0
        goto err;
5364
22.8k
    fd->ref_id = -2;
5365
22.8k
    fd->ref = NULL;
5366
5367
22.8k
    fd->decode_md = 0;
5368
22.8k
    fd->seqs_per_slice = SEQS_PER_SLICE;
5369
22.8k
    fd->bases_per_slice = BASES_PER_SLICE;
5370
22.8k
    fd->slices_per_container = SLICE_PER_CNT;
5371
22.8k
    fd->embed_ref = -1; // automatic selection
5372
22.8k
    fd->no_ref = 0;
5373
22.8k
    fd->no_ref_counter = 0;
5374
22.8k
    fd->ap_delta = 0;
5375
22.8k
    fd->ignore_md5 = 0;
5376
22.8k
    fd->lossy_read_names = 0;
5377
22.8k
    fd->use_bz2 = 0;
5378
22.8k
    fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3);
5379
22.8k
    fd->use_tok = (CRAM_MAJOR_VERS(fd->version) >= 3) && (CRAM_MINOR_VERS(fd->version) >= 1);
5380
22.8k
    fd->use_lzma = 0;
5381
22.8k
    fd->multi_seq = -1;
5382
22.8k
    fd->multi_seq_user = -1;
5383
22.8k
    fd->unsorted   = 0;
5384
22.8k
    fd->shared_ref = 0;
5385
22.8k
    fd->store_md = 0;
5386
22.8k
    fd->store_nm = 0;
5387
22.8k
    fd->last_RI_count = 0;
5388
5389
22.8k
    fd->index       = NULL;
5390
22.8k
    fd->own_pool    = 0;
5391
22.8k
    fd->pool        = NULL;
5392
22.8k
    fd->rqueue      = NULL;
5393
22.8k
    fd->job_pending = NULL;
5394
22.8k
    fd->ooc         = 0;
5395
22.8k
    fd->required_fields = INT_MAX;
5396
5397
22.8k
    pthread_mutex_init(&fd->metrics_lock, NULL);
5398
22.8k
    pthread_mutex_init(&fd->ref_lock, NULL);
5399
22.8k
    pthread_mutex_init(&fd->range_lock, NULL);
5400
22.8k
    pthread_mutex_init(&fd->bam_list_lock, NULL);
5401
5402
1.09M
    for (i = 0; i < DS_END; i++) {
5403
1.07M
        fd->m[i] = cram_new_metrics();
5404
1.07M
        if (!fd->m[i])
5405
0
            goto err;
5406
1.07M
    }
5407
5408
22.8k
    if (!(fd->tags_used = kh_init(m_metrics)))
5409
0
        goto err;
5410
5411
22.8k
    fd->range.refid = -2; // no ref.
5412
22.8k
    fd->eof = 1;          // See samtools issue #150
5413
22.8k
    fd->ref_fn = NULL;
5414
5415
22.8k
    fd->bl = NULL;
5416
5417
    /* Initialise dummy refs from the @SQ headers */
5418
22.8k
    if (-1 == refs_from_header(fd))
5419
0
        goto err;
5420
5421
22.8k
    return fd;
5422
5423
2.24k
 err:
5424
2.24k
    if (fd)
5425
2.24k
        free(fd);
5426
5427
2.24k
    return NULL;
5428
22.8k
}
5429
5430
/*
5431
 * Seek within a CRAM file.
5432
 *
5433
 * Returns 0 on success
5434
 *        -1 on failure
5435
 */
5436
0
int cram_seek(cram_fd *fd, off_t offset, int whence) {
5437
0
    fd->ooc = 0;
5438
5439
0
    cram_drain_rqueue(fd);
5440
5441
0
    return hseek(fd->fp, offset, whence) >= 0 ? 0 : -1;
5442
0
}
5443
5444
/*
5445
 * Flushes a CRAM file.
5446
 * Useful for when writing to stdout without wishing to close the stream.
5447
 *
5448
 * Returns 0 on success
5449
 *        -1 on failure
5450
 */
5451
0
int cram_flush(cram_fd *fd) {
5452
0
    if (!fd)
5453
0
        return -1;
5454
5455
0
    int ret = 0;
5456
5457
0
    if (fd->mode == 'w' && fd->ctr) {
5458
0
        if(fd->ctr->slice)
5459
0
            cram_update_curr_slice(fd->ctr, fd->version);
5460
5461
0
        if (-1 == cram_flush_container_mt(fd, fd->ctr))
5462
0
            ret = -1;
5463
5464
0
        cram_free_container(fd->ctr);
5465
0
        if (fd->ctr_mt == fd->ctr)
5466
0
            fd->ctr_mt = NULL;
5467
0
        fd->ctr = NULL;
5468
0
    }
5469
5470
0
    return ret;
5471
0
}
5472
5473
/*
5474
 * Writes an EOF block to a CRAM file.
5475
 *
5476
 * Returns 0 on success
5477
 *        -1 on failure
5478
 */
5479
11.9k
int cram_write_eof_block(cram_fd *fd) {
5480
    // EOF block is a container with special values to aid detection
5481
11.9k
    if (CRAM_MAJOR_VERS(fd->version) >= 2) {
5482
        // Empty container with
5483
        //   ref_seq_id -1
5484
        //   start pos 0x454f46 ("EOF")
5485
        //   span 0
5486
        //   nrec 0
5487
        //   counter 0
5488
        //   nbases 0
5489
        //   1 block (landmark 0)
5490
        //   (CRC32)
5491
11.9k
        cram_container c;
5492
11.9k
        memset(&c, 0, sizeof(c));
5493
11.9k
        c.ref_seq_id = -1;
5494
11.9k
        c.ref_seq_start = 0x454f46; // "EOF"
5495
11.9k
        c.ref_seq_span = 0;
5496
11.9k
        c.record_counter = 0;
5497
11.9k
        c.num_bases = 0;
5498
11.9k
        c.num_blocks = 1;
5499
11.9k
        int32_t land[1] = {0};
5500
11.9k
        c.landmark = land;
5501
5502
        // An empty compression header block with
5503
        //   method raw (0)
5504
        //   type comp header (1)
5505
        //   content id 0
5506
        //   block contents size 6
5507
        //   raw size 6
5508
        //     empty preservation map (01 00)
5509
        //     empty data series map (01 00)
5510
        //     empty tag map (01 00)
5511
        //   block CRC
5512
11.9k
        cram_block_compression_hdr ch;
5513
11.9k
        memset(&ch, 0, sizeof(ch));
5514
11.9k
        c.comp_hdr_block = cram_encode_compression_header(fd, &c, &ch, 0);
5515
5516
11.9k
        c.length = c.comp_hdr_block->byte            // Landmark[0]
5517
11.9k
            + 5                                      // block struct
5518
11.9k
            + 4*(CRAM_MAJOR_VERS(fd->version) >= 3); // CRC
5519
11.9k
        if (cram_write_container(fd, &c) < 0 ||
5520
11.9k
            cram_write_block(fd, c.comp_hdr_block) < 0) {
5521
0
            cram_close(fd);
5522
0
            cram_free_block(c.comp_hdr_block);
5523
0
            return -1;
5524
0
        }
5525
11.9k
        if (ch.preservation_map)
5526
11.9k
            kh_destroy(map, ch.preservation_map);
5527
11.9k
        cram_free_block(c.comp_hdr_block);
5528
5529
        // V2.1 bytes
5530
        // 0b 00 00 00 ff ff ff ff 0f // Cont HDR: size, ref seq id
5531
        // e0 45 4f 46 00 00 00       // Cont HDR: pos, span, nrec, counter
5532
        // 00 01 00                   // Cont HDR: nbase, nblk, landmark
5533
        // 00 01 00 06 06             // Comp.HDR blk
5534
        // 01 00 01 00 01 00          // Comp.HDR blk
5535
5536
        // V3.0 bytes:
5537
        // 0f 00 00 00 ff ff ff ff 0f // Cont HDR: size, ref seq id
5538
        // e0 45 4f 46 00 00 00       // Cont HDR: pos, span, nrec, counter
5539
        // 00 01 00                   // Cont HDR: nbase, nblk, landmark
5540
        // 05 bd d9 4f                // CRC32
5541
        // 00 01 00 06 06             // Comp.HDR blk
5542
        // 01 00 01 00 01 00          // Comp.HDR blk
5543
        // ee 63 01 4b                // CRC32
5544
5545
        // V4.0 bytes:
5546
        // 0f 00 00 00 8f ff ff ff    // Cont HDR: size, ref seq id
5547
        // 82 95 9e 46 00 00 00       // Cont HDR: pos, span, nrec, counter
5548
        // 00 01 00                   // Cont HDR: nbase, nblk, landmark
5549
        // ac d6 05 bc                // CRC32
5550
        // 00 01 00 06 06             // Comp.HDR blk
5551
        // 01 00 01 00 01 00          // Comp.HDR blk
5552
        // ee 63 01 4b                // CRC32
5553
11.9k
    }
5554
5555
11.9k
    return 0;
5556
11.9k
}
5557
5558
/*
5559
 * Closes a CRAM file.
5560
 * Returns 0 on success
5561
 *        -1 on failure
5562
 */
5563
22.8k
int cram_close(cram_fd *fd) {
5564
22.8k
    spare_bams *bl, *next;
5565
22.8k
    int i, ret = 0;
5566
5567
22.8k
    if (!fd)
5568
0
        return -1;
5569
5570
22.8k
    if (fd->mode == 'w' && fd->ctr) {
5571
5.71k
        if(fd->ctr->slice)
5572
5.71k
            cram_update_curr_slice(fd->ctr, fd->version);
5573
5574
5.71k
        if (-1 == cram_flush_container_mt(fd, fd->ctr))
5575
607
            ret = -1;
5576
5.71k
    }
5577
5578
22.8k
    if (fd->mode != 'w')
5579
10.3k
        cram_drain_rqueue(fd);
5580
5581
22.8k
    if (fd->pool && fd->eof >= 0 && fd->rqueue) {
5582
0
        hts_tpool_process_flush(fd->rqueue);
5583
5584
0
        if (0 != cram_flush_result(fd))
5585
0
            ret = -1;
5586
5587
0
        if (fd->mode == 'w')
5588
0
            fd->ctr = NULL; // prevent double freeing
5589
5590
        //fprintf(stderr, "CRAM: destroy queue %p\n", fd->rqueue);
5591
5592
0
        hts_tpool_process_destroy(fd->rqueue);
5593
0
    }
5594
5595
22.8k
    pthread_mutex_destroy(&fd->metrics_lock);
5596
22.8k
    pthread_mutex_destroy(&fd->ref_lock);
5597
22.8k
    pthread_mutex_destroy(&fd->range_lock);
5598
22.8k
    pthread_mutex_destroy(&fd->bam_list_lock);
5599
5600
22.8k
    if (ret == 0 && fd->mode == 'w') {
5601
        /* Write EOF block */
5602
11.9k
        if (0 != cram_write_eof_block(fd))
5603
0
            ret = -1;
5604
11.9k
    }
5605
5606
28.2k
    for (bl = fd->bl; bl; bl = next) {
5607
5.32k
        int max_rec = fd->seqs_per_slice * fd->slices_per_container;
5608
5609
5.32k
        next = bl->next;
5610
5.32k
        free_bam_list(bl->bams, max_rec);
5611
5.32k
        free(bl);
5612
5.32k
    }
5613
5614
22.8k
    if (hclose(fd->fp) != 0)
5615
0
        ret = -1;
5616
5617
22.8k
    if (fd->file_def)
5618
22.8k
        cram_free_file_def(fd->file_def);
5619
5620
22.8k
    if (fd->header)
5621
22.4k
        sam_hdr_destroy(fd->header);
5622
5623
22.8k
    free(fd->prefix);
5624
5625
22.8k
    if (fd->ctr)
5626
12.5k
        cram_free_container(fd->ctr);
5627
5628
22.8k
    if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
5629
88
        cram_free_container(fd->ctr_mt);
5630
5631
22.8k
    if (fd->refs)
5632
22.8k
        refs_free(fd->refs);
5633
22.8k
    if (fd->ref_free)
5634
0
        free(fd->ref_free);
5635
5636
1.09M
    for (i = 0; i < DS_END; i++)
5637
1.07M
        if (fd->m[i])
5638
1.07M
            free(fd->m[i]);
5639
5640
22.8k
    if (fd->tags_used) {
5641
22.8k
        khint_t k;
5642
5643
46.0k
        for (k = kh_begin(fd->tags_used); k != kh_end(fd->tags_used); k++) {
5644
23.1k
            if (kh_exist(fd->tags_used, k))
5645
11.0k
                free(kh_val(fd->tags_used, k));
5646
23.1k
        }
5647
5648
22.8k
        kh_destroy(m_metrics, fd->tags_used);
5649
22.8k
    }
5650
5651
22.8k
    if (fd->index)
5652
0
        cram_index_free(fd);
5653
5654
22.8k
    if (fd->own_pool && fd->pool)
5655
0
        hts_tpool_destroy(fd->pool);
5656
5657
22.8k
    if (fd->idxfp)
5658
0
        if (bgzf_close(fd->idxfp) < 0)
5659
0
            ret = -1;
5660
5661
22.8k
    free(fd);
5662
5663
22.8k
    return ret;
5664
22.8k
}
5665
5666
/*
5667
 * Returns 1 if we hit an EOF while reading.
5668
 */
5669
17.8k
int cram_eof(cram_fd *fd) {
5670
17.8k
    return fd->eof;
5671
17.8k
}
5672
5673
5674
/*
5675
 * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
5676
 * Use this immediately after opening.
5677
 *
5678
 * Returns 0 on success
5679
 *        -1 on failure
5680
 */
5681
10.3k
int cram_set_option(cram_fd *fd, enum hts_fmt_option opt, ...) {
5682
10.3k
    int r;
5683
10.3k
    va_list args;
5684
5685
10.3k
    va_start(args, opt);
5686
10.3k
    r = cram_set_voption(fd, opt, args);
5687
10.3k
    va_end(args);
5688
5689
10.3k
    return r;
5690
10.3k
}
5691
5692
/*
5693
 * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
5694
 * Use this immediately after opening.
5695
 *
5696
 * Returns 0 on success
5697
 *        -1 on failure
5698
 */
5699
10.3k
int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) {
5700
10.3k
    refs_t *refs;
5701
5702
10.3k
    if (!fd) {
5703
0
        errno = EBADF;
5704
0
        return -1;
5705
0
    }
5706
5707
10.3k
    switch (opt) {
5708
10.3k
    case CRAM_OPT_DECODE_MD:
5709
10.3k
        fd->decode_md = va_arg(args, int);
5710
10.3k
        break;
5711
5712
0
    case CRAM_OPT_PREFIX:
5713
0
        if (fd->prefix)
5714
0
            free(fd->prefix);
5715
0
        if (!(fd->prefix = strdup(va_arg(args, char *))))
5716
0
            return -1;
5717
0
        break;
5718
5719
0
    case CRAM_OPT_VERBOSITY:
5720
0
        break;
5721
5722
0
    case CRAM_OPT_SEQS_PER_SLICE:
5723
0
        fd->seqs_per_slice = va_arg(args, int);
5724
0
        if (fd->bases_per_slice == BASES_PER_SLICE)
5725
0
            fd->bases_per_slice = fd->seqs_per_slice * 500;
5726
0
        break;
5727
5728
0
    case CRAM_OPT_BASES_PER_SLICE:
5729
0
        fd->bases_per_slice = va_arg(args, int);
5730
0
        break;
5731
5732
0
    case CRAM_OPT_SLICES_PER_CONTAINER:
5733
0
        fd->slices_per_container = va_arg(args, int);
5734
0
        break;
5735
5736
0
    case CRAM_OPT_EMBED_REF:
5737
0
        fd->embed_ref = va_arg(args, int);
5738
0
        break;
5739
5740
0
    case CRAM_OPT_NO_REF:
5741
0
        fd->no_ref = va_arg(args, int);
5742
0
        break;
5743
5744
0
    case CRAM_OPT_POS_DELTA:
5745
0
        fd->ap_delta = va_arg(args, int);
5746
0
        break;
5747
5748
0
    case CRAM_OPT_IGNORE_MD5:
5749
0
        fd->ignore_md5 = va_arg(args, int);
5750
0
        break;
5751
5752
0
    case CRAM_OPT_LOSSY_NAMES:
5753
0
        fd->lossy_read_names = va_arg(args, int);
5754
        // Currently lossy read names required paired (attached) reads.
5755
        // TLEN 0 or being 1 out causes read pairs to be detached, breaking
5756
        // the lossy read name compression, so we have extra options to
5757
        // slacken the exact TLEN round-trip checks.
5758
0
        fd->tlen_approx = fd->lossy_read_names;
5759
0
        fd->tlen_zero = fd->lossy_read_names;
5760
0
        break;
5761
5762
0
    case CRAM_OPT_USE_BZIP2:
5763
0
        fd->use_bz2 = va_arg(args, int);
5764
0
        break;
5765
5766
0
    case CRAM_OPT_USE_RANS:
5767
0
        fd->use_rans = va_arg(args, int);
5768
0
        break;
5769
5770
0
    case CRAM_OPT_USE_TOK:
5771
0
        fd->use_tok = va_arg(args, int);
5772
0
        break;
5773
5774
0
    case CRAM_OPT_USE_FQZ:
5775
0
        fd->use_fqz = va_arg(args, int);
5776
0
        break;
5777
5778
0
    case CRAM_OPT_USE_ARITH:
5779
0
        fd->use_arith = va_arg(args, int);
5780
0
        break;
5781
5782
0
    case CRAM_OPT_USE_LZMA:
5783
0
        fd->use_lzma = va_arg(args, int);
5784
0
        break;
5785
5786
0
    case CRAM_OPT_SHARED_REF:
5787
0
        fd->shared_ref = 1;
5788
0
        refs = va_arg(args, refs_t *);
5789
0
        if (refs != fd->refs) {
5790
0
            if (fd->refs)
5791
0
                refs_free(fd->refs);
5792
0
            fd->refs = refs;
5793
0
            fd->refs->count++;
5794
0
        }
5795
0
        break;
5796
5797
0
    case CRAM_OPT_RANGE: {
5798
0
        int r = cram_seek_to_refpos(fd, va_arg(args, cram_range *));
5799
0
        pthread_mutex_lock(&fd->range_lock);
5800
//        printf("opt range noseek to %p %d:%ld-%ld\n",
5801
//               fd, fd->range.refid, fd->range.start, fd->range.end);
5802
0
        if (fd->range.refid != -2)
5803
0
            fd->required_fields |= SAM_POS;
5804
0
        pthread_mutex_unlock(&fd->range_lock);
5805
0
        return r;
5806
0
    }
5807
5808
0
    case CRAM_OPT_RANGE_NOSEEK: {
5809
        // As per CRAM_OPT_RANGE, but no seeking
5810
0
        pthread_mutex_lock(&fd->range_lock);
5811
0
        cram_range *r = va_arg(args, cram_range *);
5812
0
        fd->range = *r;
5813
0
        if (r->refid == HTS_IDX_NOCOOR) {
5814
0
            fd->range.refid = -1;
5815
0
            fd->range.start = 0;
5816
0
        } else if (r->refid == HTS_IDX_START || r->refid == HTS_IDX_REST) {
5817
0
            fd->range.refid = -2; // special case in cram_next_slice
5818
0
        }
5819
0
        if (fd->range.refid != -2)
5820
0
            fd->required_fields |= SAM_POS;
5821
0
        fd->ooc = 0;
5822
0
        fd->eof = 0;
5823
0
        pthread_mutex_unlock(&fd->range_lock);
5824
0
        return 0;
5825
0
    }
5826
5827
0
    case CRAM_OPT_REFERENCE:
5828
0
        return cram_load_reference(fd, va_arg(args, char *));
5829
5830
0
    case CRAM_OPT_VERSION: {
5831
0
        int major, minor;
5832
0
        char *s = va_arg(args, char *);
5833
0
        if (2 != sscanf(s, "%d.%d", &major, &minor)) {
5834
0
            hts_log_error("Malformed version string %s", s);
5835
0
            return -1;
5836
0
        }
5837
0
        if (!((major == 1 &&  minor == 0) ||
5838
0
              (major == 2 && (minor == 0 || minor == 1)) ||
5839
0
              (major == 3 && (minor == 0 || minor == 1)) ||
5840
0
              (major == 4 &&  minor == 0))) {
5841
0
            hts_log_error("Unknown version string; use 1.0, 2.0, 2.1, 3.0, 3.1 or 4.0");
5842
0
            errno = EINVAL;
5843
0
            return -1;
5844
0
        }
5845
5846
0
        if (major > 3 || (major == 3 && minor > 1)) {
5847
0
            hts_log_warning(
5848
0
                "CRAM version %s is still a draft and subject to change.\n"
5849
0
                "This is a technology demonstration that should not be "
5850
0
                "used for archival data.", s);
5851
0
        }
5852
5853
0
        fd->version = major*256 + minor;
5854
5855
0
        fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3) ? 1 : 0;
5856
5857
0
        fd->use_tok = ((CRAM_MAJOR_VERS(fd->version) == 3 &&
5858
0
                        CRAM_MINOR_VERS(fd->version) >= 1) ||
5859
0
                        CRAM_MAJOR_VERS(fd->version) >= 4) ? 1 : 0;
5860
0
        cram_init_tables(fd);
5861
5862
0
        break;
5863
0
    }
5864
5865
0
    case CRAM_OPT_MULTI_SEQ_PER_SLICE:
5866
0
        fd->multi_seq_user = fd->multi_seq = va_arg(args, int);
5867
0
        break;
5868
5869
0
    case CRAM_OPT_NTHREADS: {
5870
0
        int nthreads =  va_arg(args, int);
5871
0
        if (nthreads >= 1) {
5872
0
            if (!(fd->pool = hts_tpool_init(nthreads)))
5873
0
                return -1;
5874
5875
0
            fd->rqueue = hts_tpool_process_init(fd->pool, nthreads*2, 0);
5876
0
            fd->shared_ref = 1;
5877
0
            fd->own_pool = 1;
5878
0
        }
5879
0
        break;
5880
0
    }
5881
5882
0
    case CRAM_OPT_THREAD_POOL: {
5883
0
        htsThreadPool *p = va_arg(args, htsThreadPool *);
5884
0
        fd->pool = p ? p->pool : NULL;
5885
0
        if (fd->pool) {
5886
0
            fd->rqueue = hts_tpool_process_init(fd->pool,
5887
0
                                                p->qsize ? p->qsize : hts_tpool_size(fd->pool)*2,
5888
0
                                                0);
5889
0
        }
5890
0
        fd->shared_ref = 1; // Needed to avoid clobbering ref between threads
5891
0
        fd->own_pool = 0;
5892
5893
        //fd->qsize = 1;
5894
        //fd->decoded = calloc(fd->qsize, sizeof(cram_container *));
5895
        //hts_tpool_dispatch(fd->pool, cram_decoder_thread, fd);
5896
0
        break;
5897
0
    }
5898
5899
0
    case CRAM_OPT_REQUIRED_FIELDS:
5900
0
        fd->required_fields = va_arg(args, int);
5901
0
        if (fd->range.refid != -2)
5902
0
            fd->required_fields |= SAM_POS;
5903
0
        break;
5904
5905
0
    case CRAM_OPT_STORE_MD:
5906
0
        fd->store_md = va_arg(args, int);
5907
0
        break;
5908
5909
0
    case CRAM_OPT_STORE_NM:
5910
0
        fd->store_nm = va_arg(args, int);
5911
0
        break;
5912
5913
0
    case HTS_OPT_COMPRESSION_LEVEL:
5914
0
        fd->level = va_arg(args, int);
5915
0
        break;
5916
5917
0
    case HTS_OPT_PROFILE: {
5918
0
        enum hts_profile_option prof = va_arg(args, int);
5919
0
        switch (prof) {
5920
0
        case HTS_PROFILE_FAST:
5921
0
            if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 1;
5922
0
            fd->use_tok = 0;
5923
0
            fd->seqs_per_slice = 10000;
5924
0
            break;
5925
5926
0
        case HTS_PROFILE_NORMAL:
5927
0
            break;
5928
5929
0
        case HTS_PROFILE_SMALL:
5930
0
            if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 6;
5931
0
            fd->use_bz2 = 1;
5932
0
            fd->use_fqz = 1;
5933
0
            fd->seqs_per_slice = 25000;
5934
0
            break;
5935
5936
0
        case HTS_PROFILE_ARCHIVE:
5937
0
            if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 7;
5938
0
            fd->use_bz2 = 1;
5939
0
            fd->use_fqz = 1;
5940
0
            fd->use_arith = 1;
5941
0
            if (fd->level > 7)
5942
0
                fd->use_lzma = 1;
5943
0
            fd->seqs_per_slice = 100000;
5944
0
            break;
5945
0
        }
5946
5947
0
        if (fd->bases_per_slice == BASES_PER_SLICE)
5948
0
            fd->bases_per_slice = fd->seqs_per_slice * 500;
5949
0
        break;
5950
0
    }
5951
5952
0
    default:
5953
0
        hts_log_error("Unknown CRAM option code %d", opt);
5954
0
        errno = EINVAL;
5955
0
        return -1;
5956
10.3k
    }
5957
5958
10.3k
    return 0;
5959
10.3k
}
5960
5961
int cram_check_EOF(cram_fd *fd)
5962
0
{
5963
    // Byte 9 in these templates is & with 0x0f to resolve differences
5964
    // between ITF-8 interpretations between early Java and C
5965
    // implementations of CRAM
5966
0
    static const unsigned char TEMPLATE_2_1[30] = {
5967
0
        0x0b, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0f, 0xe0,
5968
0
        0x45, 0x4f, 0x46, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
5969
0
        0x01, 0x00, 0x06, 0x06, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00
5970
0
    };
5971
0
    static const unsigned char TEMPLATE_3[38] = {
5972
0
        0x0f, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0f, 0xe0,
5973
0
        0x45, 0x4f, 0x46, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x05,
5974
0
        0xbd, 0xd9, 0x4f, 0x00, 0x01, 0x00, 0x06, 0x06, 0x01, 0x00,
5975
0
        0x01, 0x00, 0x01, 0x00, 0xee, 0x63, 0x01, 0x4b
5976
0
    };
5977
5978
0
    unsigned char buf[38]; // max(sizeof TEMPLATE_*)
5979
5980
0
    uint8_t major = CRAM_MAJOR_VERS(fd->version);
5981
0
    uint8_t minor = CRAM_MINOR_VERS(fd->version);
5982
5983
0
    const unsigned char *template;
5984
0
    ssize_t template_len;
5985
0
    if ((major < 2) ||
5986
0
        (major == 2 && minor == 0)) {
5987
0
        return 3; // No EOF support in cram versions less than 2.1
5988
0
    } else if (major == 2 && minor == 1) {
5989
0
        template = TEMPLATE_2_1;
5990
0
        template_len = sizeof TEMPLATE_2_1;
5991
0
    } else {
5992
0
        template = TEMPLATE_3;
5993
0
        template_len = sizeof TEMPLATE_3;
5994
0
    }
5995
5996
0
    off_t offset = htell(fd->fp);
5997
0
    if (hseek(fd->fp, -template_len, SEEK_END) < 0) {
5998
0
        if (errno == ESPIPE) {
5999
0
            hclearerr(fd->fp);
6000
0
            return 2;
6001
0
        }
6002
0
        else {
6003
0
            return -1;
6004
0
        }
6005
0
    }
6006
0
    if (hread(fd->fp, buf, template_len) != template_len) return -1;
6007
0
    if (hseek(fd->fp, offset, SEEK_SET) < 0) return -1;
6008
0
    buf[8] &= 0x0f;
6009
0
    return (memcmp(template, buf, template_len) == 0)? 1 : 0;
6010
0
}