Coverage Report

Created: 2026-06-30 06:18

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/cram/cram_io.c
Line
Count
Source
1
/*
2
Copyright (c) 2012-2026 Genome Research Ltd.
3
Author: James Bonfield <jkb@sanger.ac.uk>
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
   1. Redistributions of source code must retain the above copyright notice,
9
this list of conditions and the following disclaimer.
10
11
   2. Redistributions in binary form must reproduce the above copyright notice,
12
this list of conditions and the following disclaimer in the documentation
13
and/or other materials provided with the distribution.
14
15
   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16
Institute nor the names of its contributors may be used to endorse or promote
17
products derived from this software without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
/*
32
 * CRAM I/O primitives.
33
 *
34
 * - ITF8 encoding and decoding.
35
 * - Block based I/O
36
 * - Zlib inflating and deflating (memory)
37
 * - CRAM basic data structure reading and writing
38
 * - File opening / closing
39
 * - Reference sequence handling
40
 */
41
42
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
43
#include <config.h>
44
45
#include <stdio.h>
46
#include <errno.h>
47
#include <assert.h>
48
#include <stdlib.h>
49
#include <string.h>
50
#include <unistd.h>
51
#include <zlib.h>
52
#ifdef HAVE_LIBBZ2
53
#include <bzlib.h>
54
#endif
55
#ifdef HAVE_LIBLZMA
56
#ifdef HAVE_LZMA_H
57
#include <lzma.h>
58
#else
59
#include "../os/lzma_stub.h"
60
#endif
61
#endif
62
#include <sys/types.h>
63
#include <sys/stat.h>
64
#include <math.h>
65
#include <stdint.h>
66
67
#ifdef HAVE_LIBDEFLATE
68
#include <libdeflate.h>
69
#define crc32(a,b,c) libdeflate_crc32((a),(b),(c))
70
#endif
71
72
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
73
#include "../fuzz_settings.h"
74
#endif
75
76
#include "cram.h"
77
#include "os.h"
78
#include "../htslib/hts.h"
79
#include "../htslib/hts_alloc.h"
80
#include "../hts_internal.h"
81
#include "open_trace_file.h"
82
83
#if defined(HAVE_EXTERNAL_LIBHTSCODECS)
84
#include <htscodecs/rANS_static.h>
85
#include <htscodecs/rANS_static4x16.h>
86
#include <htscodecs/arith_dynamic.h>
87
#include <htscodecs/tokenise_name3.h>
88
#include <htscodecs/fqzcomp_qual.h>
89
#else
90
#include "../htscodecs/htscodecs/rANS_static.h"
91
#include "../htscodecs/htscodecs/rANS_static4x16.h"
92
#include "../htscodecs/htscodecs/arith_dynamic.h"
93
#include "../htscodecs/htscodecs/tokenise_name3.h"
94
#include "../htscodecs/htscodecs/fqzcomp_qual.h"
95
#include "../htscodecs/htscodecs/varint.h"
96
#endif
97
98
//#define REF_DEBUG
99
100
#ifdef REF_DEBUG
101
#include <sys/syscall.h>
102
#define gettid() (int)syscall(SYS_gettid)
103
104
#define RP(...) fprintf (stderr, __VA_ARGS__)
105
#else
106
#define RP(...)
107
#endif
108
109
#include "../htslib/hfile.h"
110
#include "../htslib/bgzf.h"
111
#include "../htslib/faidx.h"
112
#include "../hts_internal.h"
113
114
#ifndef PATH_MAX
115
#define PATH_MAX FILENAME_MAX
116
#endif
117
118
756k
#define TRIAL_SPAN 70
119
756k
#define NTRIALS 3
120
121
17.7k
#define CRAM_DEFAULT_LEVEL 5
122
123
/* ----------------------------------------------------------------------
124
 * ITF8 encoding and decoding.
125
 *
126
 * Also see the itf8_get and itf8_put macros in cram_io.h
127
 */
128
129
/*
130
 * LEGACY: consider using itf8_decode_crc.
131
 *
132
 * Reads an integer in ITF-8 encoding from 'cp' and stores it in
133
 * *val.
134
 *
135
 * Returns the number of bytes read on success
136
 *        -1 on failure
137
 */
138
0
int itf8_decode(cram_fd *fd, int32_t *val_p) {
139
0
    static int nbytes[16] = {
140
0
        0,0,0,0, 0,0,0,0,                               // 0000xxxx - 0111xxxx
141
0
        1,1,1,1,                                        // 1000xxxx - 1011xxxx
142
0
        2,2,                                            // 1100xxxx - 1101xxxx
143
0
        3,                                              // 1110xxxx
144
0
        4,                                              // 1111xxxx
145
0
    };
146
147
0
    static int nbits[16] = {
148
0
        0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
149
0
        0x3f, 0x3f, 0x3f, 0x3f,                         // 1000xxxx - 1011xxxx
150
0
        0x1f, 0x1f,                                     // 1100xxxx - 1101xxxx
151
0
        0x0f,                                           // 1110xxxx
152
0
        0x0f,                                           // 1111xxxx
153
0
    };
154
155
0
    int32_t val = hgetc(fd->fp);
156
0
    if (val == -1)
157
0
        return -1;
158
159
0
    int i = nbytes[val>>4];
160
0
    val &= nbits[val>>4];
161
162
0
    switch(i) {
163
0
    case 0:
164
0
        *val_p = val;
165
0
        return 1;
166
167
0
    case 1:
168
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
169
0
        *val_p = val;
170
0
        return 2;
171
172
0
    case 2:
173
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
174
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
175
0
        *val_p = val;
176
0
        return 3;
177
178
0
    case 3:
179
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
180
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
181
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
182
0
        *val_p = val;
183
0
        return 4;
184
185
0
    case 4: // really 3.5 more, why make it different?
186
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
187
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
188
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
189
0
        val = (val<<4) | (((unsigned char)hgetc(fd->fp)) & 0x0f);
190
0
        *val_p = val;
191
0
    }
192
193
0
    return 5;
194
0
}
195
196
133k
int itf8_decode_crc(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
197
133k
    static int nbytes[16] = {
198
133k
        0,0,0,0, 0,0,0,0,                               // 0000xxxx - 0111xxxx
199
133k
        1,1,1,1,                                        // 1000xxxx - 1011xxxx
200
133k
        2,2,                                            // 1100xxxx - 1101xxxx
201
133k
        3,                                              // 1110xxxx
202
133k
        4,                                              // 1111xxxx
203
133k
    };
204
205
133k
    static int nbits[16] = {
206
133k
        0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
207
133k
        0x3f, 0x3f, 0x3f, 0x3f,                         // 1000xxxx - 1011xxxx
208
133k
        0x1f, 0x1f,                                     // 1100xxxx - 1101xxxx
209
133k
        0x0f,                                           // 1110xxxx
210
133k
        0x0f,                                           // 1111xxxx
211
133k
    };
212
133k
    unsigned char c[5];
213
214
133k
    int32_t val = hgetc(fd->fp);
215
133k
    if (val == -1)
216
274
        return -1;
217
218
132k
    c[0]=val;
219
220
132k
    int i = nbytes[val>>4];
221
132k
    val &= nbits[val>>4];
222
223
132k
    if (i > 0) {
224
27.9k
        if (hread(fd->fp, &c[1], i) < i)
225
95
            return -1;
226
27.9k
    }
227
228
132k
    switch(i) {
229
104k
    case 0:
230
104k
        *val_p = val;
231
104k
        *crc = crc32(*crc, c, 1);
232
104k
        return 1;
233
234
20.2k
    case 1:
235
20.2k
        val = (val<<8) | c[1];
236
20.2k
        *val_p = val;
237
20.2k
        *crc = crc32(*crc, c, 2);
238
20.2k
        return 2;
239
240
3.86k
    case 2:
241
3.86k
        val = (val<<8) | c[1];
242
3.86k
        val = (val<<8) | c[2];
243
3.86k
        *val_p = val;
244
3.86k
        *crc = crc32(*crc, c, 3);
245
3.86k
        return 3;
246
247
1.01k
    case 3:
248
1.01k
        val = (val<<8) | c[1];
249
1.01k
        val = (val<<8) | c[2];
250
1.01k
        val = (val<<8) | c[3];
251
1.01k
        *val_p = val;
252
1.01k
        *crc = crc32(*crc, c, 4);
253
1.01k
        return 4;
254
255
2.72k
    case 4: // really 3.5 more, why make it different?
256
2.72k
        {
257
2.72k
            uint32_t uv = val;
258
2.72k
            uv = (uv<<8) |  c[1];
259
2.72k
            uv = (uv<<8) |  c[2];
260
2.72k
            uv = (uv<<8) |  c[3];
261
2.72k
            uv = (uv<<4) | (c[4] & 0x0f);
262
            // Avoid implementation-defined behaviour on negative values
263
2.72k
            *val_p = uv < 0x80000000UL ? (int32_t) uv : -((int32_t) (0xffffffffUL - uv)) - 1;
264
2.72k
            *crc = crc32(*crc, c, 5);
265
2.72k
        }
266
132k
    }
267
268
2.72k
    return 5;
269
132k
}
270
271
/*
272
 * Stores a value to memory in ITF-8 format.
273
 *
274
 * Returns the number of bytes required to store the number.
275
 * This is a maximum of 5 bytes.
276
 */
277
11.8M
static inline int itf8_put(char *cp, int32_t val) {
278
11.8M
    unsigned char *up = (unsigned char *)cp;
279
11.8M
    if        (!(val & ~0x00000007f)) { // 1 byte
280
11.7M
        *up = val;
281
11.7M
        return 1;
282
11.7M
    } else if (!(val & ~0x00003fff)) { // 2 byte
283
72.5k
        *up++ = (val >> 8 ) | 0x80;
284
72.5k
        *up   = val & 0xff;
285
72.5k
        return 2;
286
72.5k
    } else if (!(val & ~0x01fffff)) { // 3 byte
287
2.70k
        *up++ = (val >> 16) | 0xc0;
288
2.70k
        *up++ = (val >> 8 ) & 0xff;
289
2.70k
        *up   = val & 0xff;
290
2.70k
        return 3;
291
42.3k
    } else if (!(val & ~0x0fffffff)) { // 4 byte
292
16.4k
        *up++ = (val >> 24) | 0xe0;
293
16.4k
        *up++ = (val >> 16) & 0xff;
294
16.4k
        *up++ = (val >> 8 ) & 0xff;
295
16.4k
        *up   = val & 0xff;
296
16.4k
        return 4;
297
25.8k
    } else {                           // 5 byte
298
25.8k
        *up++ = 0xf0 | ((val>>28) & 0xff);
299
25.8k
        *up++ = (val >> 20) & 0xff;
300
25.8k
        *up++ = (val >> 12) & 0xff;
301
25.8k
        *up++ = (val >> 4 ) & 0xff;
302
25.8k
        *up = val & 0x0f;
303
25.8k
        return 5;
304
25.8k
    }
305
11.8M
}
306
307
308
/* 64-bit itf8 variant */
309
36.5k
static inline int ltf8_put(char *cp, int64_t val) {
310
36.5k
    unsigned char *up = (unsigned char *)cp;
311
36.5k
    if        (!(val & ~((1LL<<7)-1))) {
312
34.5k
        *up = val;
313
34.5k
        return 1;
314
34.5k
    } else if (!(val & ~((1LL<<(6+8))-1))) {
315
894
        *up++ = (val >> 8 ) | 0x80;
316
894
        *up   = val & 0xff;
317
894
        return 2;
318
1.06k
    } else if (!(val & ~((1LL<<(5+2*8))-1))) {
319
1.03k
        *up++ = (val >> 16) | 0xc0;
320
1.03k
        *up++ = (val >> 8 ) & 0xff;
321
1.03k
        *up   = val & 0xff;
322
1.03k
        return 3;
323
1.03k
    } else if (!(val & ~((1LL<<(4+3*8))-1))) {
324
39
        *up++ = (val >> 24) | 0xe0;
325
39
        *up++ = (val >> 16) & 0xff;
326
39
        *up++ = (val >> 8 ) & 0xff;
327
39
        *up   = val & 0xff;
328
39
        return 4;
329
39
    } else if (!(val & ~((1LL<<(3+4*8))-1))) {
330
0
        *up++ = (val >> 32) | 0xf0;
331
0
        *up++ = (val >> 24) & 0xff;
332
0
        *up++ = (val >> 16) & 0xff;
333
0
        *up++ = (val >> 8 ) & 0xff;
334
0
        *up   = val & 0xff;
335
0
        return 5;
336
0
    } else if (!(val & ~((1LL<<(2+5*8))-1))) {
337
0
        *up++ = (val >> 40) | 0xf8;
338
0
        *up++ = (val >> 32) & 0xff;
339
0
        *up++ = (val >> 24) & 0xff;
340
0
        *up++ = (val >> 16) & 0xff;
341
0
        *up++ = (val >> 8 ) & 0xff;
342
0
        *up   = val & 0xff;
343
0
        return 6;
344
0
    } else if (!(val & ~((1LL<<(1+6*8))-1))) {
345
0
        *up++ = (val >> 48) | 0xfc;
346
0
        *up++ = (val >> 40) & 0xff;
347
0
        *up++ = (val >> 32) & 0xff;
348
0
        *up++ = (val >> 24) & 0xff;
349
0
        *up++ = (val >> 16) & 0xff;
350
0
        *up++ = (val >> 8 ) & 0xff;
351
0
        *up   = val & 0xff;
352
0
        return 7;
353
0
    } else if (!(val & ~((1LL<<(7*8))-1))) {
354
0
        *up++ = (val >> 56) | 0xfe;
355
0
        *up++ = (val >> 48) & 0xff;
356
0
        *up++ = (val >> 40) & 0xff;
357
0
        *up++ = (val >> 32) & 0xff;
358
0
        *up++ = (val >> 24) & 0xff;
359
0
        *up++ = (val >> 16) & 0xff;
360
0
        *up++ = (val >> 8 ) & 0xff;
361
0
        *up   = val & 0xff;
362
0
        return 8;
363
0
    } else {
364
0
        *up++ = 0xff;
365
0
        *up++ = (val >> 56) & 0xff;
366
0
        *up++ = (val >> 48) & 0xff;
367
0
        *up++ = (val >> 40) & 0xff;
368
0
        *up++ = (val >> 32) & 0xff;
369
0
        *up++ = (val >> 24) & 0xff;
370
0
        *up++ = (val >> 16) & 0xff;
371
0
        *up++ = (val >> 8 ) & 0xff;
372
0
        *up   = val & 0xff;
373
0
        return 9;
374
0
    }
375
36.5k
}
376
377
/*
378
 * Encodes and writes a single integer in ITF-8 format.
379
 * Returns 0 on success
380
 *        -1 on failure
381
 */
382
0
int itf8_encode(cram_fd *fd, int32_t val) {
383
0
    char buf[5];
384
0
    int len = itf8_put(buf, val);
385
0
    return hwrite(fd->fp, buf, len) == len ? 0 : -1;
386
0
}
387
388
const int itf8_bytes[16] = {
389
    1, 1, 1, 1,  1, 1, 1, 1,
390
    2, 2, 2, 2,  3, 3, 4, 5
391
};
392
393
const int ltf8_bytes[256] = {
394
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
395
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
396
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
397
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
398
399
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
400
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
401
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
402
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
403
404
    2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
405
    2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
406
    2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
407
    2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
408
409
    3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,
410
    3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,
411
412
    4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
413
414
    5, 5, 5, 5,  5, 5, 5, 5,  6, 6, 6, 6,  7, 7, 8, 9
415
};
416
417
/*
418
 * LEGACY: consider using ltf8_decode_crc.
419
 */
420
0
int ltf8_decode(cram_fd *fd, int64_t *val_p) {
421
0
    int c = hgetc(fd->fp);
422
0
    int64_t val = (unsigned char)c;
423
0
    if (c == -1)
424
0
        return -1;
425
426
0
    if (val < 0x80) {
427
0
        *val_p =   val;
428
0
        return 1;
429
430
0
    } else if (val < 0xc0) {
431
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
432
0
        *val_p = val & (((1LL<<(6+8)))-1);
433
0
        return 2;
434
435
0
    } else if (val < 0xe0) {
436
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
437
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
438
0
        *val_p = val & ((1LL<<(5+2*8))-1);
439
0
        return 3;
440
441
0
    } else if (val < 0xf0) {
442
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
443
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
444
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
445
0
        *val_p = val & ((1LL<<(4+3*8))-1);
446
0
        return 4;
447
448
0
    } else if (val < 0xf8) {
449
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
450
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
451
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
452
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
453
0
        *val_p = val & ((1LL<<(3+4*8))-1);
454
0
        return 5;
455
456
0
    } else if (val < 0xfc) {
457
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
458
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
459
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
460
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
461
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
462
0
        *val_p = val & ((1LL<<(2+5*8))-1);
463
0
        return 6;
464
465
0
    } else if (val < 0xfe) {
466
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
467
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
468
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
469
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
470
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
471
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
472
0
        *val_p = val & ((1LL<<(1+6*8))-1);
473
0
        return 7;
474
475
0
    } else if (val < 0xff) {
476
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
477
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
478
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
479
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
480
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
481
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
482
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
483
0
        *val_p = val & ((1LL<<(7*8))-1);
484
0
        return 8;
485
486
0
    } else {
487
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
488
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
489
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
490
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
491
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
492
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
493
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
494
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
495
0
        *val_p = val;
496
0
    }
497
498
0
    return 9;
499
0
}
500
501
3.79k
int ltf8_decode_crc(cram_fd *fd, int64_t *val_p, uint32_t *crc) {
502
3.79k
    unsigned char c[9];
503
3.79k
    int64_t val = hgetc(fd->fp);
504
3.79k
    if (val < 0)
505
16
        return -1;
506
507
3.77k
    c[0] = val;
508
509
3.77k
    if (val < 0x80) {
510
3.18k
        *val_p =   val;
511
3.18k
        *crc = crc32(*crc, c, 1);
512
3.18k
        return 1;
513
514
3.18k
    } else if (val < 0xc0) {
515
107
        int v = hgetc(fd->fp);
516
107
        if (v < 0)
517
5
            return -1;
518
102
        val = (val<<8) | (c[1]=v);
519
102
        *val_p = val & (((1LL<<(6+8)))-1);
520
102
        *crc = crc32(*crc, c, 2);
521
102
        return 2;
522
523
490
    } else if (val < 0xe0) {
524
36
        if (hread(fd->fp, &c[1], 2) < 2)
525
4
            return -1;
526
32
        val = (val<<8) | c[1];
527
32
        val = (val<<8) | c[2];
528
32
        *val_p = val & ((1LL<<(5+2*8))-1);
529
32
        *crc = crc32(*crc, c, 3);
530
32
        return 3;
531
532
454
    } else if (val < 0xf0) {
533
46
        if (hread(fd->fp, &c[1], 3) < 3)
534
4
            return -1;
535
42
        val = (val<<8) | c[1];
536
42
        val = (val<<8) | c[2];
537
42
        val = (val<<8) | c[3];
538
42
        *val_p = val & ((1LL<<(4+3*8))-1);
539
42
        *crc = crc32(*crc, c, 4);
540
42
        return 4;
541
542
408
    } else if (val < 0xf8) {
543
37
        if (hread(fd->fp, &c[1], 4) < 4)
544
7
            return -1;
545
30
        val = (val<<8) | c[1];
546
30
        val = (val<<8) | c[2];
547
30
        val = (val<<8) | c[3];
548
30
        val = (val<<8) | c[4];
549
30
        *val_p = val & ((1LL<<(3+4*8))-1);
550
30
        *crc = crc32(*crc, c, 5);
551
30
        return 5;
552
553
371
    } else if (val < 0xfc) {
554
98
        if (hread(fd->fp, &c[1], 5) < 5)
555
5
            return -1;
556
93
        val = (val<<8) | c[1];
557
93
        val = (val<<8) | c[2];
558
93
        val = (val<<8) | c[3];
559
93
        val = (val<<8) | c[4];
560
93
        val = (val<<8) | c[5];
561
93
        *val_p = val & ((1LL<<(2+5*8))-1);
562
93
        *crc = crc32(*crc, c, 6);
563
93
        return 6;
564
565
273
    } else if (val < 0xfe) {
566
96
        if (hread(fd->fp, &c[1], 6) < 6)
567
5
            return -1;
568
91
        val = (val<<8) | c[1];
569
91
        val = (val<<8) | c[2];
570
91
        val = (val<<8) | c[3];
571
91
        val = (val<<8) | c[4];
572
91
        val = (val<<8) | c[5];
573
91
        val = (val<<8) | c[6];
574
91
        *val_p = val & ((1LL<<(1+6*8))-1);
575
91
        *crc = crc32(*crc, c, 7);
576
91
        return 7;
577
578
177
    } else if (val < 0xff) {
579
18
        uint64_t uval = val;
580
18
        if (hread(fd->fp, &c[1], 7) < 7)
581
1
            return -1;
582
17
        uval = (uval<<8) | c[1];
583
17
        uval = (uval<<8) | c[2];
584
17
        uval = (uval<<8) | c[3];
585
17
        uval = (uval<<8) | c[4];
586
17
        uval = (uval<<8) | c[5];
587
17
        uval = (uval<<8) | c[6];
588
17
        uval = (uval<<8) | c[7];
589
17
        *val_p = uval & ((1ULL<<(7*8))-1);
590
17
        *crc = crc32(*crc, c, 8);
591
17
        return 8;
592
593
159
    } else {
594
159
        uint64_t uval;
595
159
        if (hread(fd->fp, &c[1], 8) < 8)
596
9
            return -1;
597
150
        uval =             c[1];
598
150
        uval = (uval<<8) | c[2];
599
150
        uval = (uval<<8) | c[3];
600
150
        uval = (uval<<8) | c[4];
601
150
        uval = (uval<<8) | c[5];
602
150
        uval = (uval<<8) | c[6];
603
150
        uval = (uval<<8) | c[7];
604
150
        uval = (uval<<8) | c[8];
605
150
        *crc = crc32(*crc, c, 9);
606
        // Avoid implementation-defined behaviour on negative values
607
150
        *val_p = c[1] < 0x80 ? (int64_t) uval : -((int64_t) (0xffffffffffffffffULL - uval)) - 1;
608
150
    }
609
610
150
    return 9;
611
3.77k
}
612
613
/*
614
 * Pushes a value in ITF8 format onto the end of a block.
615
 * This shouldn't be used for high-volume data as it is not the fastest
616
 * method.
617
 *
618
 * Returns the number of bytes written
619
 */
620
11.2M
int itf8_put_blk(cram_block *blk, int32_t val) {
621
11.2M
    char buf[5];
622
11.2M
    int sz;
623
624
11.2M
    sz = itf8_put(buf, val);
625
11.2M
    BLOCK_APPEND(blk, buf, sz);
626
11.2M
    return sz;
627
628
0
 block_err:
629
0
    return -1;
630
11.2M
}
631
632
0
int ltf8_put_blk(cram_block *blk, int64_t val) {
633
0
    char buf[9];
634
0
    int sz;
635
636
0
    sz = ltf8_put(buf, val);
637
0
    BLOCK_APPEND(blk, buf, sz);
638
0
    return sz;
639
640
0
 block_err:
641
0
    return -1;
642
0
}
643
644
1.43M
static int64_t safe_itf8_get(char **cp, const char *endp, int *err) {
645
1.43M
    const unsigned char *up = (unsigned char *)*cp;
646
647
1.43M
    if (endp && endp - *cp < 5 &&
648
1.31M
        (*cp >= endp || endp - *cp < itf8_bytes[up[0]>>4])) {
649
1.31M
        if (err) *err = 1;
650
1.31M
        return 0;
651
1.31M
    }
652
653
117k
    if (up[0] < 0x80) {
654
107k
        (*cp)++;
655
107k
        return up[0];
656
107k
    } else if (up[0] < 0xc0) {
657
4.33k
        (*cp)+=2;
658
4.33k
        return ((up[0] <<8) |  up[1])                           & 0x3fff;
659
5.46k
    } else if (up[0] < 0xe0) {
660
2.05k
        (*cp)+=3;
661
2.05k
        return ((up[0]<<16) | (up[1]<< 8) |  up[2])             & 0x1fffff;
662
3.41k
    } else if (up[0] < 0xf0) {
663
894
        (*cp)+=4;
664
894
        uint32_t uv = (((uint32_t)up[0]<<24) | (up[1]<<16) | (up[2]<<8) | up[3]) & 0x0fffffff;
665
894
        return (int32_t)uv;
666
2.52k
    } else {
667
2.52k
        (*cp)+=5;
668
2.52k
        uint32_t uv = (((uint32_t)up[0] & 0x0f)<<28) | (up[1]<<20) | (up[2]<<12) | (up[3]<<4) | (up[4] & 0x0f);
669
2.52k
        return (int32_t)uv;
670
2.52k
    }
671
117k
}
672
673
0
static int64_t safe_ltf8_get(char **cp, const char *endp, int *err) {
674
0
    unsigned char *up = (unsigned char *)*cp;
675
676
0
    if (endp && endp - *cp < 9 &&
677
0
        (*cp >= endp || endp - *cp < ltf8_bytes[up[0]])) {
678
0
        if (err) *err = 1;
679
0
        return 0;
680
0
    }
681
682
0
    if (up[0] < 0x80) {
683
0
        (*cp)++;
684
0
        return up[0];
685
0
    } else if (up[0] < 0xc0) {
686
0
        (*cp)+=2;
687
0
        return (((uint64_t)up[0]<< 8) |
688
0
                 (uint64_t)up[1]) & (((1LL<<(6+8)))-1);
689
0
    } else if (up[0] < 0xe0) {
690
0
        (*cp)+=3;
691
0
        return (((uint64_t)up[0]<<16) |
692
0
                ((uint64_t)up[1]<< 8) |
693
0
                 (uint64_t)up[2]) & ((1LL<<(5+2*8))-1);
694
0
    } else if (up[0] < 0xf0) {
695
0
        (*cp)+=4;
696
0
        return (((uint64_t)up[0]<<24) |
697
0
                ((uint64_t)up[1]<<16) |
698
0
                ((uint64_t)up[2]<< 8) |
699
0
                 (uint64_t)up[3]) & ((1LL<<(4+3*8))-1);
700
0
    } else if (up[0] < 0xf8) {
701
0
        (*cp)+=5;
702
0
        return (((uint64_t)up[0]<<32) |
703
0
                ((uint64_t)up[1]<<24) |
704
0
                ((uint64_t)up[2]<<16) |
705
0
                ((uint64_t)up[3]<< 8) |
706
0
                 (uint64_t)up[4]) & ((1LL<<(3+4*8))-1);
707
0
    } else if (up[0] < 0xfc) {
708
0
        (*cp)+=6;
709
0
        return (((uint64_t)up[0]<<40) |
710
0
                ((uint64_t)up[1]<<32) |
711
0
                ((uint64_t)up[2]<<24) |
712
0
                ((uint64_t)up[3]<<16) |
713
0
                ((uint64_t)up[4]<< 8) |
714
0
                 (uint64_t)up[5]) & ((1LL<<(2+5*8))-1);
715
0
    } else if (up[0] < 0xfe) {
716
0
        (*cp)+=7;
717
0
        return (((uint64_t)up[0]<<48) |
718
0
                ((uint64_t)up[1]<<40) |
719
0
                ((uint64_t)up[2]<<32) |
720
0
                ((uint64_t)up[3]<<24) |
721
0
                ((uint64_t)up[4]<<16) |
722
0
                ((uint64_t)up[5]<< 8) |
723
0
                 (uint64_t)up[6]) & ((1LL<<(1+6*8))-1);
724
0
    } else if (up[0] < 0xff) {
725
0
        (*cp)+=8;
726
0
        return (((uint64_t)up[1]<<48) |
727
0
                ((uint64_t)up[2]<<40) |
728
0
                ((uint64_t)up[3]<<32) |
729
0
                ((uint64_t)up[4]<<24) |
730
0
                ((uint64_t)up[5]<<16) |
731
0
                ((uint64_t)up[6]<< 8) |
732
0
                 (uint64_t)up[7]) & ((1LL<<(7*8))-1);
733
0
    } else {
734
0
        (*cp)+=9;
735
0
        return (((uint64_t)up[1]<<56) |
736
0
                ((uint64_t)up[2]<<48) |
737
0
                ((uint64_t)up[3]<<40) |
738
0
                ((uint64_t)up[4]<<32) |
739
0
                ((uint64_t)up[5]<<24) |
740
0
                ((uint64_t)up[6]<<16) |
741
0
                ((uint64_t)up[7]<< 8) |
742
0
                 (uint64_t)up[8]);
743
0
    }
744
0
}
745
746
// Wrapper for now
747
550k
static int safe_itf8_put(char *cp, char *cp_end, int32_t val) {
748
550k
    return itf8_put(cp, val);
749
550k
}
750
751
36.5k
static int safe_ltf8_put(char *cp, char *cp_end, int64_t val) {
752
36.5k
    return ltf8_put(cp, val);
753
36.5k
}
754
755
129k
static int itf8_size(int64_t v) {
756
129k
    return ((!((v)&~0x7f))?1:(!((v)&~0x3fff))?2:(!((v)&~0x1fffff))?3:(!((v)&~0xfffffff))?4:5);
757
129k
}
758
759
//-----------------------------------------------------------------------------
760
761
/*
762
 * Decodes a 32-bit little endian value from fd and stores in val.
763
 *
764
 * Returns the number of bytes read on success
765
 *         -1 on failure
766
 */
767
11.1k
static int int32_decode(cram_fd *fd, int32_t *val) {
768
11.1k
    int32_t i;
769
11.1k
    if (4 != hread(fd->fp, &i, 4))
770
36
        return -1;
771
772
11.1k
    *val = le_int4(i);
773
11.1k
    return 4;
774
11.1k
}
775
776
/*
777
 * Encodes a 32-bit little endian value 'val' and writes to fd.
778
 *
779
 * Returns the number of bytes written on success
780
 *         -1 on failure
781
 */
782
38.5k
static int int32_encode(cram_fd *fd, int32_t val) {
783
38.5k
    uint32_t v = le_int4(val);
784
38.5k
    if (4 != hwrite(fd->fp, &v, 4))
785
0
        return -1;
786
787
38.5k
    return 4;
788
38.5k
}
789
790
/* As int32_decoded/encode, but from/to blocks instead of cram_fd */
791
331
int int32_get_blk(cram_block *b, int32_t *val) {
792
331
    if (b->uncomp_size - BLOCK_SIZE(b) < 4)
793
27
        return -1;
794
795
304
    uint32_t v =
796
304
         ((uint32_t) b->data[b->byte  ])        |
797
304
        (((uint32_t) b->data[b->byte+1]) <<  8) |
798
304
        (((uint32_t) b->data[b->byte+2]) << 16) |
799
304
        (((uint32_t) b->data[b->byte+3]) << 24);
800
    // Avoid implementation-defined behaviour on negative values
801
304
    *val = v < 0x80000000U ? (int32_t) v : -((int32_t) (0xffffffffU - v)) - 1;
802
304
    BLOCK_SIZE(b) += 4;
803
304
    return 4;
804
331
}
805
806
/* As int32_decoded/encode, but from/to blocks instead of cram_fd */
807
5.44k
int int32_put_blk(cram_block *b, int32_t val) {
808
5.44k
    unsigned char cp[4];
809
5.44k
    uint32_t v = val;
810
5.44k
    cp[0] = ( v      & 0xff);
811
5.44k
    cp[1] = ((v>>8)  & 0xff);
812
5.44k
    cp[2] = ((v>>16) & 0xff);
813
5.44k
    cp[3] = ((v>>24) & 0xff);
814
815
5.44k
    BLOCK_APPEND(b, cp, 4);
816
5.44k
    return 0;
817
818
0
 block_err:
819
0
    return -1;
820
5.44k
}
821
822
#ifdef HAVE_LIBDEFLATE
823
/* ----------------------------------------------------------------------
824
 * libdeflate compression code, with interface to match
825
 * zlib_mem_{in,de}flate for simplicity elsewhere.
826
 */
827
828
// Named the same as the version that uses zlib as we always use libdeflate for
829
// decompression when available.
830
char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) {
831
    struct libdeflate_decompressor *z = libdeflate_alloc_decompressor();
832
    if (!z) {
833
        hts_log_error("Call to libdeflate_alloc_decompressor failed");
834
        return NULL;
835
    }
836
837
    uint8_t *data = NULL, *new_data;
838
    if (!*size)
839
        *size = csize*2;
840
    for(;;) {
841
        new_data = realloc(data, *size);
842
        if (!new_data) {
843
            hts_log_error("Memory allocation failure");
844
            goto fail;
845
        }
846
        data = new_data;
847
848
        int ret = libdeflate_gzip_decompress(z, cdata, csize, data, *size, size);
849
850
        // Auto grow output buffer size if needed and try again.
851
        // Fortunately for all bar one call of this we know the size already.
852
        if (ret == LIBDEFLATE_INSUFFICIENT_SPACE) {
853
            (*size) *= 1.5;
854
            continue;
855
        }
856
857
        if (ret != LIBDEFLATE_SUCCESS) {
858
            hts_log_error("Inflate operation failed: %d", ret);
859
            goto fail;
860
        } else {
861
            break;
862
        }
863
    }
864
865
    libdeflate_free_decompressor(z);
866
    return (char *)data;
867
868
 fail:
869
    libdeflate_free_decompressor(z);
870
    free(data);
871
    return NULL;
872
}
873
874
// Named differently as we use both zlib/libdeflate for compression.
875
static char *libdeflate_deflate(char *data, size_t size, size_t *cdata_size,
876
                                int level, int strat) {
877
    level = level > 0 ? level : 6; // libdeflate doesn't honour -1 as default
878
    level *= 1.23;     // NB levels go up to 12 here; 5 onwards is +1
879
    level += level>=8; // 5,6,7->6,7,8  8->10  9->12
880
    if (level > 12) level = 12;
881
882
    if (strat == Z_RLE) // not supported by libdeflate
883
        level = 1;
884
885
    struct libdeflate_compressor *z = libdeflate_alloc_compressor(level);
886
    if (!z) {
887
        hts_log_error("Call to libdeflate_alloc_compressor failed");
888
        return NULL;
889
    }
890
891
    unsigned char *cdata = NULL; /* Compressed output */
892
    size_t cdata_alloc;
893
    cdata = malloc(cdata_alloc = size*1.05+100);
894
    if (!cdata) {
895
        hts_log_error("Memory allocation failure");
896
        libdeflate_free_compressor(z);
897
        return NULL;
898
    }
899
900
    *cdata_size = libdeflate_gzip_compress(z, data, size, cdata, cdata_alloc);
901
    libdeflate_free_compressor(z);
902
903
    if (*cdata_size == 0) {
904
        hts_log_error("Call to libdeflate_gzip_compress failed");
905
        free(cdata);
906
        return NULL;
907
    }
908
909
    return (char *)cdata;
910
}
911
912
#else
913
914
/* ----------------------------------------------------------------------
915
 * zlib compression code - from Gap5's tg_iface_g.c
916
 * They're static here as they're only used within the cram_compress_block
917
 * and cram_uncompress_block functions, which are the external interface.
918
 */
919
10
char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) {
920
10
    z_stream s;
921
10
    unsigned char *data = NULL; /* Uncompressed output */
922
10
    int data_alloc = 0;
923
10
    int err;
924
925
    /* Starting point at uncompressed size, and scale after that */
926
10
    data = malloc(data_alloc = csize*1.2+100);
927
10
    if (!data)
928
0
        return NULL;
929
930
    /* Initialise zlib stream */
931
10
    s.zalloc = Z_NULL; /* use default allocation functions */
932
10
    s.zfree  = Z_NULL;
933
10
    s.opaque = Z_NULL;
934
10
    s.next_in  = (unsigned char *)cdata;
935
10
    s.avail_in = csize;
936
10
    s.total_in = 0;
937
10
    s.next_out  = data;
938
10
    s.avail_out = data_alloc;
939
10
    s.total_out = 0;
940
941
    //err = inflateInit(&s);
942
10
    err = inflateInit2(&s, 15 + 32);
943
10
    if (err != Z_OK) {
944
0
        hts_log_error("Call to zlib inflateInit failed: %s", s.msg);
945
0
        free(data);
946
0
        return NULL;
947
0
    }
948
949
    /* Decode to 'data' array */
950
14
    for (;s.avail_in;) {
951
8
        unsigned char *data_tmp;
952
8
        int alloc_inc;
953
954
8
        s.next_out = &data[s.total_out];
955
8
        err = inflate(&s, Z_NO_FLUSH);
956
8
        if (err == Z_STREAM_END)
957
0
            break;
958
959
8
        if (err != Z_OK) {
960
4
            hts_log_error("Call to zlib inflate failed: %s", s.msg);
961
4
            free(data);
962
4
            inflateEnd(&s);
963
4
            return NULL;
964
4
        }
965
966
        /* More to come, so realloc based on growth so far */
967
4
        alloc_inc = (double)s.avail_in/s.total_in * s.total_out + 100;
968
4
        data = realloc((data_tmp = data), data_alloc += alloc_inc);
969
4
        if (!data) {
970
0
            free(data_tmp);
971
0
            inflateEnd(&s);
972
0
            return NULL;
973
0
        }
974
4
        s.avail_out += alloc_inc;
975
4
    }
976
6
    inflateEnd(&s);
977
978
6
    *size = s.total_out;
979
6
    return (char *)data;
980
10
}
981
#endif
982
983
#if !defined(HAVE_LIBDEFLATE) || LIBDEFLATE_VERSION_MAJOR < 1 || (LIBDEFLATE_VERSION_MAJOR ==  1 && LIBDEFLATE_VERSION_MINOR <= 8)
984
static char *zlib_mem_deflate(char *data, size_t size, size_t *cdata_size,
985
33.9k
                              int level, int strat) {
986
33.9k
    z_stream s;
987
33.9k
    unsigned char *cdata = NULL; /* Compressed output */
988
33.9k
    int cdata_alloc = 0;
989
33.9k
    int cdata_pos = 0;
990
33.9k
    int err;
991
992
33.9k
    cdata = malloc(cdata_alloc = size*1.05+100);
993
33.9k
    if (!cdata)
994
0
        return NULL;
995
33.9k
    cdata_pos = 0;
996
997
    /* Initialise zlib stream */
998
33.9k
    s.zalloc = Z_NULL; /* use default allocation functions */
999
33.9k
    s.zfree  = Z_NULL;
1000
33.9k
    s.opaque = Z_NULL;
1001
33.9k
    s.next_in  = (unsigned char *)data;
1002
33.9k
    s.avail_in = size;
1003
33.9k
    s.total_in = 0;
1004
33.9k
    s.next_out  = cdata;
1005
33.9k
    s.avail_out = cdata_alloc;
1006
33.9k
    s.total_out = 0;
1007
33.9k
    s.data_type = Z_BINARY;
1008
1009
33.9k
    err = deflateInit2(&s, level, Z_DEFLATED, 15|16, 9, strat);
1010
33.9k
    if (err != Z_OK) {
1011
0
        hts_log_error("Call to zlib deflateInit2 failed: %s", s.msg);
1012
0
        return NULL;
1013
0
    }
1014
1015
    /* Encode to 'cdata' array */
1016
67.8k
    for (;s.avail_in;) {
1017
33.9k
        s.next_out = &cdata[cdata_pos];
1018
33.9k
        s.avail_out = cdata_alloc - cdata_pos;
1019
33.9k
        if (cdata_alloc - cdata_pos <= 0) {
1020
0
            hts_log_error("Deflate produced larger output than expected");
1021
0
            return NULL;
1022
0
        }
1023
33.9k
        err = deflate(&s, Z_NO_FLUSH);
1024
33.9k
        cdata_pos = cdata_alloc - s.avail_out;
1025
33.9k
        if (err != Z_OK) {
1026
0
            hts_log_error("Call to zlib deflate failed: %s", s.msg);
1027
0
            break;
1028
0
        }
1029
33.9k
    }
1030
33.9k
    if (deflate(&s, Z_FINISH) != Z_STREAM_END) {
1031
0
        hts_log_error("Call to zlib deflate failed: %s", s.msg);
1032
0
    }
1033
33.9k
    *cdata_size = s.total_out;
1034
1035
33.9k
    if (deflateEnd(&s) != Z_OK) {
1036
0
        hts_log_error("Call to zlib deflate failed: %s", s.msg);
1037
0
    }
1038
33.9k
    return (char *)cdata;
1039
33.9k
}
1040
#endif
1041
1042
#ifdef HAVE_LIBLZMA
1043
/* ------------------------------------------------------------------------ */
1044
/*
1045
 * Data compression routines using liblzma (xz)
1046
 *
1047
 * On a test set this shrunk the main db from 136157104 bytes to 114796168, but
1048
 * caused tg_index to grow from 2m43.707s to 15m3.961s. Exporting as bfastq
1049
 * went from 18.3s to 36.3s. So decompression suffers too, but not as bad
1050
 * as compression times.
1051
 *
1052
 * For now we disable this functionality. If it's to be reenabled make sure you
1053
 * improve the mem_inflate implementation as it's just a test hack at the
1054
 * moment.
1055
 */
1056
1057
static char *lzma_mem_deflate(char *data, size_t size, size_t *cdata_size,
1058
0
                              int level) {
1059
0
    char *out;
1060
0
    size_t out_size = lzma_stream_buffer_bound(size);
1061
0
    *cdata_size = 0;
1062
1063
0
    out = malloc(out_size);
1064
1065
    /* Single call compression */
1066
0
    if (LZMA_OK != lzma_easy_buffer_encode(level, LZMA_CHECK_CRC32, NULL,
1067
0
                                           (uint8_t *)data, size,
1068
0
                                           (uint8_t *)out, cdata_size,
1069
0
                                           out_size))
1070
0
        return NULL;
1071
1072
0
    return out;
1073
0
}
1074
1075
16
static char *lzma_mem_inflate(char *cdata, size_t csize, size_t *size) {
1076
16
    lzma_stream strm = LZMA_STREAM_INIT;
1077
16
    size_t out_size = 0, out_pos = 0;
1078
16
    char *out = NULL, *new_out;
1079
16
    int r;
1080
1081
    /* Initiate the decoder */
1082
16
    if (LZMA_OK != lzma_stream_decoder(&strm, lzma_easy_decoder_memusage(9), 0))
1083
0
        return NULL;
1084
1085
    /* Decode loop */
1086
16
    strm.avail_in = csize;
1087
16
    strm.next_in = (uint8_t *)cdata;
1088
1089
24
    for (;strm.avail_in;) {
1090
15
        if (strm.avail_in > out_size - out_pos) {
1091
15
            out_size += strm.avail_in * 4 + 32768;
1092
15
            new_out = realloc(out, out_size);
1093
15
            if (!new_out)
1094
0
                goto fail;
1095
15
            out = new_out;
1096
15
        }
1097
15
        strm.avail_out = out_size - out_pos;
1098
15
        strm.next_out = (uint8_t *)&out[out_pos];
1099
1100
15
        r = lzma_code(&strm, LZMA_RUN);
1101
15
        if (LZMA_OK != r && LZMA_STREAM_END != r) {
1102
7
            hts_log_error("LZMA decode failure (error %d)", r);
1103
7
            goto fail;
1104
7
        }
1105
1106
8
        out_pos = strm.total_out;
1107
1108
8
        if (r == LZMA_STREAM_END)
1109
0
            break;
1110
8
    }
1111
1112
    /* finish up any unflushed data; necessary? */
1113
9
    r = lzma_code(&strm, LZMA_FINISH);
1114
9
    if (r != LZMA_OK && r != LZMA_STREAM_END) {
1115
0
        hts_log_error("Call to lzma_code failed with error %d", r);
1116
0
        goto fail;
1117
0
    }
1118
1119
9
    new_out = realloc(out, strm.total_out > 0 ? strm.total_out : 1);
1120
9
    if (new_out)
1121
9
        out = new_out;
1122
9
    *size = strm.total_out;
1123
1124
9
    lzma_end(&strm);
1125
1126
9
    return out;
1127
1128
7
 fail:
1129
7
    lzma_end(&strm);
1130
7
    free(out);
1131
7
    return NULL;
1132
9
}
1133
#endif
1134
1135
/* ----------------------------------------------------------------------
1136
 * CRAM blocks - the dynamically growable data block. We have code to
1137
 * create, update, (un)compress and read/write.
1138
 *
1139
 * These are derived from the deflate_interlaced.c blocks, but with the
1140
 * CRAM extension of content types and IDs.
1141
 */
1142
1143
/*
1144
 * Allocates a new cram_block structure with a specified content_type and
1145
 * id.
1146
 *
1147
 * Returns block pointer on success
1148
 *         NULL on failure
1149
 */
1150
cram_block *cram_new_block(enum cram_content_type content_type,
1151
91.6k
                           int content_id) {
1152
91.6k
    cram_block *b = malloc(sizeof(*b));
1153
91.6k
    if (!b)
1154
0
        return NULL;
1155
91.6k
    b->method = b->orig_method = RAW;
1156
91.6k
    b->content_type = content_type;
1157
91.6k
    b->content_id = content_id;
1158
91.6k
    b->comp_size = 0;
1159
91.6k
    b->uncomp_size = 0;
1160
91.6k
    b->data = NULL;
1161
91.6k
    b->alloc = 0;
1162
91.6k
    b->byte = 0;
1163
91.6k
    b->bit = 7; // MSB
1164
91.6k
    b->crc32 = 0;
1165
91.6k
    b->idx = 0;
1166
91.6k
    b->m = NULL;
1167
1168
91.6k
    return b;
1169
91.6k
}
1170
1171
/*
1172
 * Reads a block from a cram file.
1173
 * Returns cram_block pointer on success.
1174
 *         NULL on failure
1175
 */
1176
7.38k
cram_block *cram_read_block(cram_fd *fd) {
1177
7.38k
    cram_block *b = malloc(sizeof(*b));
1178
7.38k
    unsigned char c;
1179
7.38k
    uint32_t crc = 0;
1180
7.38k
    if (!b)
1181
0
        return NULL;
1182
1183
    //fprintf(stderr, "Block at %d\n", (int)ftell(fd->fp));
1184
1185
7.38k
    if (-1 == (b->method      = hgetc(fd->fp))) { free(b); return NULL; }
1186
7.35k
    if (b->method > TOK3) {
1187
221
        hts_log_error("Unknown block compression method %d", (int) b->method);
1188
221
        free(b); return NULL;
1189
221
    }
1190
7.13k
    c = b->method; crc = crc32(crc, &c, 1);
1191
7.13k
    if (-1 == (b->content_type= hgetc(fd->fp))) { free(b); return NULL; }
1192
7.12k
    c = b->content_type; crc = crc32(crc, &c, 1);
1193
7.12k
    if (-1 == fd->vv.varint_decode32_crc(fd, &b->content_id, &crc))  { free(b); return NULL; }
1194
7.10k
    if (-1 == fd->vv.varint_decode32_crc(fd, &b->comp_size, &crc))   { free(b); return NULL; }
1195
7.10k
    if (-1 == fd->vv.varint_decode32_crc(fd, &b->uncomp_size, &crc)) { free(b); return NULL; }
1196
1197
    //fprintf(stderr, "  method %d, ctype %d, cid %d, csize %d, ucsize %d\n",
1198
    //      b->method, b->content_type, b->content_id, b->comp_size, b->uncomp_size);
1199
1200
7.09k
    if (b->method == RAW) {
1201
2.17k
        if (b->uncomp_size < 0 || b->comp_size != b->uncomp_size) {
1202
46
            free(b);
1203
46
            return NULL;
1204
46
        }
1205
2.12k
        b->alloc = b->uncomp_size;
1206
2.12k
        if (!(b->data = malloc(b->uncomp_size))){ free(b); return NULL; }
1207
2.12k
        if (b->uncomp_size != hread(fd->fp, b->data, b->uncomp_size)) {
1208
6
            free(b->data);
1209
6
            free(b);
1210
6
            return NULL;
1211
6
        }
1212
4.92k
    } else {
1213
4.92k
        if (b->comp_size < 0 || b->uncomp_size < 0) {
1214
17
            free(b);
1215
17
            return NULL;
1216
17
        }
1217
4.91k
        b->alloc = b->comp_size;
1218
4.91k
        if (!(b->data = malloc(b->comp_size)))  { free(b); return NULL; }
1219
4.91k
        if (b->comp_size != hread(fd->fp, b->data, b->comp_size)) {
1220
32
            free(b->data);
1221
32
            free(b);
1222
32
            return NULL;
1223
32
        }
1224
4.91k
    }
1225
1226
6.99k
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
1227
317
        if (-1 == int32_decode(fd, (int32_t *)&b->crc32)) {
1228
8
            free(b->data);
1229
8
            free(b);
1230
8
            return NULL;
1231
8
        }
1232
1233
309
        b->crc32_checked = fd->ignore_md5;
1234
309
        b->crc_part = crc;
1235
6.68k
    } else {
1236
6.68k
        b->crc32_checked = 1; // CRC not present
1237
6.68k
    }
1238
1239
6.98k
    b->orig_method = b->method;
1240
6.98k
    b->idx = 0;
1241
6.98k
    b->byte = 0;
1242
6.98k
    b->bit = 7; // MSB
1243
1244
6.98k
    return b;
1245
6.99k
}
1246
1247
1248
/*
1249
 * Computes the size of a cram block, including the block
1250
 * header itself.
1251
 */
1252
0
uint32_t cram_block_size(cram_block *b) {
1253
0
    unsigned char dat[100], *cp = dat;;
1254
0
    uint32_t sz;
1255
1256
0
    *cp++ = b->method;
1257
0
    *cp++ = b->content_type;
1258
0
    cp += itf8_put((char*)cp, b->content_id);
1259
0
    cp += itf8_put((char*)cp, b->comp_size);
1260
0
    cp += itf8_put((char*)cp, b->uncomp_size);
1261
1262
0
    sz = cp-dat + 4;
1263
0
    sz += b->method == RAW ? b->uncomp_size : b->comp_size;
1264
1265
0
    return sz;
1266
0
}
1267
1268
/*
1269
 * Writes a CRAM block.
1270
 * Returns 0 on success
1271
 *        -1 on failure
1272
 */
1273
38.5k
int cram_write_block(cram_fd *fd, cram_block *b) {
1274
38.5k
    char vardata[100];
1275
38.5k
    int vardata_o = 0;
1276
1277
38.5k
    assert(b->method != RAW || (b->comp_size == b->uncomp_size));
1278
1279
38.5k
    if (hputc(b->method,       fd->fp)  == EOF) return -1;
1280
38.5k
    if (hputc(b->content_type, fd->fp)  == EOF) return -1;
1281
38.5k
    vardata_o += fd->vv.varint_put32(vardata          , vardata+100, b->content_id);
1282
38.5k
    vardata_o += fd->vv.varint_put32(vardata+vardata_o, vardata+100, b->comp_size);
1283
38.5k
    vardata_o += fd->vv.varint_put32(vardata+vardata_o, vardata+100, b->uncomp_size);
1284
38.5k
    if (vardata_o != hwrite(fd->fp, vardata, vardata_o))
1285
0
        return -1;
1286
1287
38.5k
    if (b->data) {
1288
35.6k
        if (b->method == RAW) {
1289
28.7k
            if (b->uncomp_size != hwrite(fd->fp, b->data, b->uncomp_size))
1290
0
                return -1;
1291
28.7k
        } else {
1292
6.92k
            if (b->comp_size != hwrite(fd->fp, b->data, b->comp_size))
1293
0
                return -1;
1294
6.92k
        }
1295
35.6k
    } else {
1296
        // Absent blocks should be size 0
1297
2.85k
        assert(b->method == RAW && b->uncomp_size == 0);
1298
2.85k
    }
1299
1300
38.5k
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
1301
38.5k
        char dat[100], *cp = (char *)dat;
1302
38.5k
        uint32_t crc;
1303
1304
38.5k
        *cp++ = b->method;
1305
38.5k
        *cp++ = b->content_type;
1306
38.5k
        cp += fd->vv.varint_put32(cp, dat+100, b->content_id);
1307
38.5k
        cp += fd->vv.varint_put32(cp, dat+100, b->comp_size);
1308
38.5k
        cp += fd->vv.varint_put32(cp, dat+100, b->uncomp_size);
1309
38.5k
        crc = crc32(0L, (uc *)dat, cp-dat);
1310
1311
38.5k
        if (b->method == RAW) {
1312
31.5k
            b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->uncomp_size);
1313
31.5k
        } else {
1314
6.92k
            b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->comp_size);
1315
6.92k
        }
1316
1317
38.5k
        if (-1 == int32_encode(fd, b->crc32))
1318
0
            return -1;
1319
38.5k
    }
1320
1321
38.5k
    return 0;
1322
38.5k
}
1323
1324
/*
1325
 * Frees a CRAM block, deallocating internal data too.
1326
 */
1327
104k
void cram_free_block(cram_block *b) {
1328
104k
    if (!b)
1329
5.45k
        return;
1330
98.6k
    if (b->data)
1331
56.8k
        free(b->data);
1332
98.6k
    free(b);
1333
98.6k
}
1334
1335
/*
1336
 * Uncompresses a CRAM block, if compressed.
1337
 */
1338
5.13k
int cram_uncompress_block(cram_block *b) {
1339
5.13k
    char *uncomp;
1340
5.13k
    size_t uncomp_size = 0;
1341
1342
5.13k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
1343
    // Pretend the CRC was OK so the fuzzer doesn't have to get it right
1344
5.13k
    b->crc32_checked = 1;
1345
5.13k
#endif
1346
1347
5.13k
    if (b->crc32_checked == 0) {
1348
0
        uint32_t crc = crc32(b->crc_part, b->data ? b->data : (uc *)"", b->alloc);
1349
0
        b->crc32_checked = 1;
1350
0
        if (crc != b->crc32) {
1351
0
            hts_log_error("Block CRC32 failure");
1352
0
            return -1;
1353
0
        }
1354
0
    }
1355
1356
5.13k
    if (b->uncomp_size == 0) {
1357
        // blank block
1358
38
        b->method = RAW;
1359
38
        return 0;
1360
38
    }
1361
5.13k
    assert(b->uncomp_size >= 0); // cram_read_block should ensure this
1362
1363
5.10k
    switch (b->method) {
1364
286
    case RAW:
1365
286
        return 0;
1366
1367
10
    case GZIP:
1368
10
        uncomp_size = b->uncomp_size;
1369
10
        uncomp = zlib_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
1370
1371
10
        if (!uncomp)
1372
4
            return -1;
1373
6
        if (uncomp_size != b->uncomp_size) {
1374
6
            free(uncomp);
1375
6
            return -1;
1376
6
        }
1377
0
        free(b->data);
1378
0
        b->data = (unsigned char *)uncomp;
1379
0
        b->alloc = uncomp_size;
1380
0
        b->method = RAW;
1381
0
        break;
1382
1383
0
#ifdef HAVE_LIBBZ2
1384
4
    case BZIP2: {
1385
4
        unsigned int usize = b->uncomp_size;
1386
4
        if (!(uncomp = malloc(usize)))
1387
0
            return -1;
1388
4
        if (BZ_OK != BZ2_bzBuffToBuffDecompress(uncomp, &usize,
1389
4
                                                (char *)b->data, b->comp_size,
1390
4
                                                0, 0)) {
1391
4
            free(uncomp);
1392
4
            return -1;
1393
4
        }
1394
0
        free(b->data);
1395
0
        b->data = (unsigned char *)uncomp;
1396
0
        b->alloc = usize;
1397
0
        b->method = RAW;
1398
0
        b->uncomp_size = usize; // Just in case it differs
1399
0
        break;
1400
4
    }
1401
#else
1402
    case BZIP2:
1403
        hts_log_error("Bzip2 compression is not compiled into this version. Please rebuild and try again");
1404
        return -1;
1405
#endif
1406
1407
0
#ifdef HAVE_LIBLZMA
1408
16
    case LZMA:
1409
16
        uncomp = lzma_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
1410
16
        if (!uncomp)
1411
7
            return -1;
1412
9
        if (uncomp_size != b->uncomp_size) {
1413
9
            free(uncomp);
1414
9
            return -1;
1415
9
        }
1416
0
        free(b->data);
1417
0
        b->data = (unsigned char *)uncomp;
1418
0
        b->alloc = uncomp_size;
1419
0
        b->method = RAW;
1420
0
        break;
1421
#else
1422
    case LZMA:
1423
        hts_log_error("Lzma compression is not compiled into this version. Please rebuild and try again");
1424
        return -1;
1425
        break;
1426
#endif
1427
1428
426
    case RANS: {
1429
426
        unsigned int usize = b->uncomp_size, usize2 = 0;
1430
426
        uncomp = (char *)rans_uncompress(b->data, b->comp_size, &usize2);
1431
426
        if (!uncomp)
1432
209
            return -1;
1433
217
        if (usize != usize2) {
1434
201
            free(uncomp);
1435
201
            return -1;
1436
201
        }
1437
16
        free(b->data);
1438
16
        b->data = (unsigned char *)uncomp;
1439
16
        b->alloc = usize2;
1440
16
        b->method = RAW;
1441
16
        b->uncomp_size = usize2; // Just in case it differs
1442
        //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1443
16
        break;
1444
217
    }
1445
1446
737
    case FQZ: {
1447
737
        uncomp_size = b->uncomp_size;
1448
737
        uncomp = fqz_decompress((char *)b->data, b->comp_size, &uncomp_size, NULL, 0);
1449
737
        if (!uncomp)
1450
570
            return -1;
1451
167
        free(b->data);
1452
167
        b->data = (unsigned char *)uncomp;
1453
167
        b->alloc = uncomp_size;
1454
167
        b->method = RAW;
1455
167
        b->uncomp_size = uncomp_size;
1456
167
        break;
1457
737
    }
1458
1459
967
    case RANS_PR0: {
1460
967
        unsigned int usize = b->uncomp_size, usize2 = 0;
1461
967
        uncomp = (char *)rans_uncompress_4x16(b->data, b->comp_size, &usize2);
1462
967
        if (!uncomp)
1463
689
            return -1;
1464
278
        if (usize != usize2) {
1465
182
            free(uncomp);
1466
182
            return -1;
1467
182
        }
1468
96
        b->orig_method = RANSPR;
1469
96
        free(b->data);
1470
96
        b->data = (unsigned char *)uncomp;
1471
96
        b->alloc = usize2;
1472
96
        b->method = RAW;
1473
96
        b->uncomp_size = usize2; // Just incase it differs
1474
        //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1475
96
        break;
1476
278
    }
1477
1478
553
    case ARITH_PR0: {
1479
553
        unsigned int usize = b->uncomp_size, usize2 = 0;
1480
553
        uncomp = (char *)arith_uncompress_to(b->data, b->comp_size, NULL, &usize2);
1481
553
        if (!uncomp)
1482
381
            return -1;
1483
172
        if (usize != usize2) {
1484
55
            free(uncomp);
1485
55
            return -1;
1486
55
        }
1487
117
        b->orig_method = ARITH;
1488
117
        free(b->data);
1489
117
        b->data = (unsigned char *)uncomp;
1490
117
        b->alloc = usize2;
1491
117
        b->method = RAW;
1492
117
        b->uncomp_size = usize2; // Just incase it differs
1493
        //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1494
117
        break;
1495
172
    }
1496
1497
2.10k
    case TOK3: {
1498
2.10k
        uint32_t out_len;
1499
2.10k
        uint8_t *cp = tok3_decode_names(b->data, b->comp_size, &out_len);
1500
2.10k
        if (!cp)
1501
1.98k
            return -1;
1502
115
        b->orig_method = TOK3;
1503
115
        b->method = RAW;
1504
115
        free(b->data);
1505
115
        b->data = cp;
1506
115
        b->alloc = out_len;
1507
115
        b->uncomp_size = out_len;
1508
115
        break;
1509
2.10k
    }
1510
1511
0
    default:
1512
0
        return -1;
1513
5.10k
    }
1514
1515
511
    return 0;
1516
5.10k
}
1517
1518
static char *cram_compress_by_method(cram_slice *s, char *in, size_t in_size,
1519
                                     int content_id, size_t *out_size,
1520
                                     enum cram_block_method_int method,
1521
78.4k
                                     int level, int strat) {
1522
78.4k
    switch (method) {
1523
16.5k
    case GZIP:
1524
22.9k
    case GZIP_RLE:
1525
33.9k
    case GZIP_1:
1526
        // Read names bizarrely benefit from zlib over libdeflate for
1527
        // mid-range compression levels.  Focusing purely of ratio or
1528
        // speed, libdeflate still wins.  It also seems to win for
1529
        // other data series too.
1530
        //
1531
        // Eg RN at level 5;  libdeflate=55.9MB  zlib=51.6MB
1532
#ifdef HAVE_LIBDEFLATE
1533
#  if (LIBDEFLATE_VERSION_MAJOR < 1 || (LIBDEFLATE_VERSION_MAJOR == 1 && LIBDEFLATE_VERSION_MINOR <= 8))
1534
        if (content_id == DS_RN && level >= 4 && level <= 7)
1535
            return zlib_mem_deflate(in, in_size, out_size, level, strat);
1536
        else
1537
#  endif
1538
            return libdeflate_deflate(in, in_size, out_size, level, strat);
1539
#else
1540
33.9k
        return zlib_mem_deflate(in, in_size, out_size, level, strat);
1541
0
#endif
1542
1543
0
    case BZIP2: {
1544
0
#ifdef HAVE_LIBBZ2
1545
0
        unsigned int comp_size = in_size*1.01 + 600;
1546
0
        char *comp = malloc(comp_size);
1547
0
        if (!comp)
1548
0
            return NULL;
1549
1550
0
        if (BZ_OK != BZ2_bzBuffToBuffCompress(comp, &comp_size,
1551
0
                                              in, in_size,
1552
0
                                              level, 0, 30)) {
1553
0
            free(comp);
1554
0
            return NULL;
1555
0
        }
1556
0
        *out_size = comp_size;
1557
0
        return comp;
1558
#else
1559
        return NULL;
1560
#endif
1561
0
    }
1562
1563
0
    case FQZ:
1564
0
    case FQZ_b:
1565
0
    case FQZ_c:
1566
0
    case FQZ_d: {
1567
        // Extract the necessary portion of the slice into an fqz_slice struct.
1568
        // These previously were the same thing, but this permits us to detach
1569
        // the codec from the rest of this CRAM implementation.
1570
0
        fqz_slice *f = malloc(2*s->hdr->num_records * sizeof(uint32_t) + sizeof(fqz_slice));
1571
0
        if (!f)
1572
0
            return NULL;
1573
0
        f->num_records = s->hdr->num_records;
1574
0
        f->len = (uint32_t *)(((char *)f) + sizeof(fqz_slice));
1575
0
        f->flags = f->len + s->hdr->num_records;
1576
0
        int i;
1577
0
        for (i = 0; i < s->hdr->num_records; i++) {
1578
0
            f->flags[i] = s->crecs[i].flags;
1579
0
            f->len[i] = (i+1 < s->hdr->num_records
1580
0
                         ? s->crecs[i+1].qual - s->crecs[i].qual
1581
0
                         : s->block[DS_QS]->uncomp_size - s->crecs[i].qual);
1582
0
        }
1583
0
        char *comp = fqz_compress(strat & 0xff /* cram vers */, f,
1584
0
                                  in, in_size, out_size, strat >> 8, NULL);
1585
0
        free(f);
1586
0
        return comp;
1587
0
    }
1588
1589
0
    case LZMA:
1590
0
#ifdef HAVE_LIBLZMA
1591
0
        return lzma_mem_deflate(in, in_size, out_size, level);
1592
#else
1593
        return NULL;
1594
#endif
1595
1596
0
    case RANS0:
1597
0
    case RANS1: {
1598
0
        unsigned int out_size_i;
1599
0
        unsigned char *cp;
1600
0
        cp = rans_compress((unsigned char *)in, in_size, &out_size_i,
1601
0
                           method == RANS0 ? 0 : 1);
1602
0
        *out_size = out_size_i;
1603
0
        return (char *)cp;
1604
0
    }
1605
1606
6.55k
    case RANS_PR0:
1607
13.2k
    case RANS_PR1:
1608
20.1k
    case RANS_PR64:
1609
26.5k
    case RANS_PR9:
1610
33.4k
    case RANS_PR128:
1611
33.4k
    case RANS_PR129:
1612
33.4k
    case RANS_PR192:
1613
39.8k
    case RANS_PR193: {
1614
39.8k
        unsigned int out_size_i;
1615
39.8k
        unsigned char *cp;
1616
1617
        // see enum cram_block. We map RANS_* methods to order bit-fields
1618
39.8k
        static int methmap[] = { 1, 64,9, 128,129, 192,193 };
1619
1620
39.8k
        int m = method == RANS_PR0 ? 0 : methmap[method - RANS_PR1];
1621
39.8k
        cp = rans_compress_4x16((unsigned char *)in, in_size, &out_size_i,
1622
39.8k
                                m | RANS_ORDER_SIMD_AUTO);
1623
39.8k
        *out_size = out_size_i;
1624
39.8k
        return (char *)cp;
1625
33.4k
    }
1626
1627
0
    case ARITH_PR0:
1628
0
    case ARITH_PR1:
1629
0
    case ARITH_PR64:
1630
0
    case ARITH_PR9:
1631
0
    case ARITH_PR128:
1632
0
    case ARITH_PR129:
1633
0
    case ARITH_PR192:
1634
0
    case ARITH_PR193: {
1635
0
        unsigned int out_size_i;
1636
0
        unsigned char *cp;
1637
1638
        // see enum cram_block. We map ARITH_* methods to order bit-fields
1639
0
        static int methmap[] = { 1, 64,9, 128,129, 192,193 };
1640
1641
0
        cp = arith_compress_to((unsigned char *)in, in_size, NULL, &out_size_i,
1642
0
                               method == ARITH_PR0 ? 0 : methmap[method - ARITH_PR1]);
1643
0
        *out_size = out_size_i;
1644
0
        return (char *)cp;
1645
0
    }
1646
1647
4.61k
    case TOK3:
1648
4.61k
    case TOKA: {
1649
4.61k
        int out_len;
1650
4.61k
        int lev = level;
1651
4.61k
        if (method == TOK3 && lev > 3)
1652
4.61k
            lev = 3;
1653
4.61k
        uint8_t *cp = tok3_encode_names(in, in_size, lev, strat, &out_len, NULL);
1654
4.61k
        *out_size = out_len;
1655
4.61k
        return (char *)cp;
1656
4.61k
    }
1657
1658
0
    case RAW:
1659
0
        break;
1660
1661
0
    default:
1662
0
        return NULL;
1663
78.4k
    }
1664
1665
0
    return NULL;
1666
78.4k
}
1667
1668
/*
1669
 * A copy of cram_compress_block2 with added recursion detection.
1670
 * This is only called for error handling where the auto-tuning has failed.
1671
 * The simplest way of doing this is recusion + an additional argument, but
1672
 * we didn't want to complicate the existing code hence this is static.
1673
 */
1674
static int cram_compress_block3(cram_fd *fd, cram_slice *s,
1675
                                cram_block *b, cram_metrics *metrics,
1676
                                int method, int level,
1677
39.5k
                                int recurse) {
1678
1679
39.5k
    if (!b)
1680
1.55k
        return 0;
1681
1682
37.9k
    int orig_method = method;
1683
37.9k
    char *comp = NULL;
1684
37.9k
    size_t comp_size = 0;
1685
37.9k
    int strat;
1686
1687
    // Internally we have parameterised methods that externally map
1688
    // to the same CRAM method value.
1689
    // See enum_cram_block_method_int in cram_structs.h.
1690
37.9k
    int methmap[] = {
1691
        // Externally defined values
1692
37.9k
        RAW, GZIP, BZIP2, LZMA, RANS, RANSPR, ARITH, FQZ, TOK3,
1693
1694
        // Reserved for possible expansion
1695
37.9k
        0, 0,
1696
1697
        // Internally parameterised versions matching back to above
1698
        // external values
1699
37.9k
        GZIP, GZIP,
1700
37.9k
        FQZ, FQZ, FQZ,
1701
37.9k
        RANS,
1702
37.9k
        RANSPR, RANSPR, RANSPR, RANSPR, RANSPR, RANSPR, RANSPR,
1703
37.9k
        TOK3,
1704
37.9k
        ARITH,  ARITH,  ARITH,  ARITH,  ARITH,  ARITH,  ARITH,
1705
37.9k
    };
1706
1707
37.9k
    if (b->method != RAW) {
1708
        // Maybe already compressed if s->block[0] was compressed and
1709
        // we have e.g. s->block[DS_BA] set to s->block[0] due to only
1710
        // one base type present and hence using E_HUFFMAN on block 0.
1711
        // A second explicit attempt to compress the same block then
1712
        // occurs.
1713
0
        return 0;
1714
0
    }
1715
1716
37.9k
    if (method == -1) {
1717
5.44k
        method = 1<<GZIP;
1718
5.44k
        if (fd->use_bz2)
1719
0
            method |= 1<<BZIP2;
1720
5.44k
        if (fd->use_lzma)
1721
0
            method |= 1<<LZMA;
1722
5.44k
    }
1723
1724
37.9k
    if (level == -1)
1725
5.44k
        level = fd->level;
1726
1727
    //fprintf(stderr, "IN: block %d, sz %d\n", b->content_id, b->uncomp_size);
1728
1729
37.9k
    if (method == RAW || level == 0 || b->uncomp_size == 0) {
1730
19.7k
        b->method = RAW;
1731
19.7k
        b->comp_size = b->uncomp_size;
1732
        //fprintf(stderr, "Skip block id %d\n", b->content_id);
1733
19.7k
        return 0;
1734
19.7k
    }
1735
1736
18.1k
#ifndef ABS
1737
18.1k
#    define ABS(a) ((a)>=0?(a):-(a))
1738
18.1k
#endif
1739
1740
18.1k
    if (metrics) {
1741
12.6k
        pthread_mutex_lock(&fd->metrics_lock);
1742
        // Sudden changes in size trigger a retrial.  These are mainly
1743
        // triggered when switching to sorted / unsorted, where the number
1744
        // of elements in a slice radically changes.
1745
        //
1746
        // We also get large fluctuations based on genome coordinate for
1747
        // e.g. SA:Z and SC series, but we consider the typical scale of
1748
        // delta between blocks and use this to look for abnormality.
1749
1750
        // Equivalent to (but minus possible integer overflow)
1751
        //   (b->uncomp_size + 1000)/4 > metrics->input_avg_sz+1000 ||
1752
        //    b->uncomp_size + 1000    < (metrics->input_avg_sz+1000)/4)
1753
12.6k
        if (metrics->input_avg_sz &&
1754
3.28k
            (b->uncomp_size/4 - 750 > metrics->input_avg_sz ||
1755
2.97k
             b->uncomp_size         < metrics->input_avg_sz/4 - 750) &&
1756
378
            ABS(b->uncomp_size-metrics->input_avg_sz)/10
1757
378
                > metrics->input_avg_delta) {
1758
19
            metrics->next_trial = 0;
1759
19
        }
1760
1761
12.6k
        if (metrics->trial > 0 || --metrics->next_trial <= 0) {
1762
10.8k
            int m, unpackable = metrics->unpackable;
1763
10.8k
            size_t sz_best = b->uncomp_size;
1764
10.8k
            size_t sz[CRAM_MAX_METHOD] = {0};
1765
10.8k
            int method_best = 0; // RAW
1766
10.8k
            char *c_best = NULL, *c = NULL;
1767
1768
10.8k
            metrics->input_avg_delta =
1769
10.8k
                0.9 * (metrics->input_avg_delta +
1770
10.8k
                       ABS(b->uncomp_size - metrics->input_avg_sz));
1771
1772
10.8k
            metrics->input_avg_sz += b->uncomp_size*.2;
1773
10.8k
            metrics->input_avg_sz *= 0.8;
1774
1775
10.8k
            if (metrics->revised_method)
1776
3.04k
                method = metrics->revised_method;
1777
7.81k
            else
1778
7.81k
                metrics->revised_method = method;
1779
1780
10.8k
            if (metrics->next_trial <= 0) {
1781
23
                metrics->next_trial = TRIAL_SPAN;
1782
23
                metrics->trial = NTRIALS;
1783
759
                for (m = 0; m < CRAM_MAX_METHOD; m++)
1784
736
                    metrics->sz[m] /= 2;
1785
23
                metrics->unpackable = 0;
1786
23
            }
1787
1788
            // Compress this block using the best method
1789
10.8k
            if (unpackable && CRAM_MAJOR_VERS(fd->version) > 3) {
1790
                // No point trying bit-pack if 17+ symbols.
1791
0
                if (method & (1<<RANS_PR128))
1792
0
                    method = (method|(1<<RANS_PR0))&~(1<<RANS_PR128);
1793
0
                if (method & (1<<RANS_PR129))
1794
0
                    method = (method|(1<<RANS_PR1))&~(1<<RANS_PR129);
1795
0
                if (method & (1<<RANS_PR192))
1796
0
                    method = (method|(1<<RANS_PR64))&~(1<<RANS_PR192);
1797
0
                if (method & (1<<RANS_PR193))
1798
0
                    method = (method|(1<<RANS_PR64)|(1<<RANS_PR1))&~(1<<RANS_PR193);
1799
1800
0
                if (method & (1<<ARITH_PR128))
1801
0
                    method = (method|(1<<ARITH_PR0))&~(1<<ARITH_PR128);
1802
0
                if (method & (1<<ARITH_PR129))
1803
0
                    method = (method|(1<<ARITH_PR1))&~(1<<ARITH_PR129);
1804
0
                if (method & (1<<ARITH_PR192))
1805
0
                    method = (method|(1<<ARITH_PR64))&~(1<<ARITH_PR192);
1806
0
                if (method & (1u<<ARITH_PR193))
1807
0
                    method = (method|(1<<ARITH_PR64)|(1<<ARITH_PR1))&~(1u<<ARITH_PR193);
1808
0
            }
1809
1810
            // Libdeflate doesn't have a Z_RLE strategy.
1811
            // We treat it as level 1, but iff we haven't also
1812
            // explicitly listed that in the method list.
1813
#ifdef HAVE_LIBDEFLATE
1814
            if ((method & (1<<GZIP_RLE)) && (method & (1<<GZIP_1)))
1815
                method &= ~(1<<GZIP_RLE);
1816
#endif
1817
1818
10.8k
            pthread_mutex_unlock(&fd->metrics_lock);
1819
1820
358k
            for (m = 0; m < CRAM_MAX_METHOD; m++) {
1821
347k
                if (method & (1u<<m)) {
1822
71.0k
                    int lvl = level;
1823
71.0k
                    switch (m) {
1824
10.8k
                    case GZIP:     strat = Z_FILTERED; break;
1825
10.8k
                    case GZIP_1:   strat = Z_DEFAULT_STRATEGY; lvl = 1; break;
1826
6.41k
                    case GZIP_RLE: strat = Z_RLE; break;
1827
0
                    case FQZ:      strat = CRAM_MAJOR_VERS(fd->version); break;
1828
0
                    case FQZ_b:    strat = CRAM_MAJOR_VERS(fd->version)+256; break;
1829
0
                    case FQZ_c:    strat = CRAM_MAJOR_VERS(fd->version)+2*256; break;
1830
0
                    case FQZ_d:    strat = CRAM_MAJOR_VERS(fd->version)+3*256; break;
1831
4.44k
                    case TOK3:     strat = 0; break;
1832
0
                    case TOKA:     strat = 1; break;
1833
38.5k
                    default:       strat = 0;
1834
71.0k
                    }
1835
1836
71.0k
                    c = cram_compress_by_method(s, (char *)b->data, b->uncomp_size,
1837
71.0k
                                                b->content_id, &sz[m], m, lvl, strat);
1838
1839
71.0k
                    if (c && sz_best > sz[m]) {
1840
10.5k
                        sz_best = sz[m];
1841
10.5k
                        method_best = m;
1842
10.5k
                        if (c_best)
1843
5.32k
                            free(c_best);
1844
10.5k
                        c_best = c;
1845
60.5k
                    } else if (c) {
1846
59.2k
                        free(c);
1847
59.2k
                    } else {
1848
1.27k
                        sz[m] = UINT_MAX; // arbitrarily worse than raw
1849
1.27k
                    }
1850
276k
                } else {
1851
276k
                    sz[m] = UINT_MAX; // arbitrarily worse than raw
1852
276k
                }
1853
347k
            }
1854
1855
10.8k
            if (c_best) {
1856
5.25k
                free(b->data);
1857
5.25k
                b->data = (unsigned char *)c_best;
1858
5.25k
                b->method = method_best; // adjusted to methmap[method_best] later
1859
5.25k
                b->comp_size = sz_best;
1860
5.25k
            }
1861
1862
            // Accumulate stats for all methods tried
1863
10.8k
            pthread_mutex_lock(&fd->metrics_lock);
1864
358k
            for (m = 0; m < CRAM_MAX_METHOD; m++)
1865
                // don't be overly sure on small blocks.
1866
                // +2000 means eg bzip2 vs gzip (1.07 to 1.04) or gz vs rans1
1867
                // needs to be at least 60 bytes smaller to overcome the
1868
                // fixed size addition.
1869
347k
                metrics->sz[m] += sz[m]+2000;
1870
1871
            // When enough trials performed, find the best on average
1872
10.8k
            if (--metrics->trial == 0) {
1873
2.68k
                int best_method = RAW;
1874
2.68k
                int best_sz = INT_MAX;
1875
1876
                // Relative costs of methods. See enum_cram_block_method_int
1877
                // and methmap
1878
2.68k
                double meth_cost[32] = {
1879
                    // Externally defined methods
1880
2.68k
                    1,    // 0  raw
1881
2.68k
                    1.04, // 1  gzip (Z_FILTERED)
1882
2.68k
                    1.07, // 2  bzip2
1883
2.68k
                    1.08, // 3  lzma
1884
2.68k
                    1.00, // 4  rans    (O0)
1885
2.68k
                    1.00, // 5  ranspr  (O0)
1886
2.68k
                    1.04, // 6  arithpr (O0)
1887
2.68k
                    1.05, // 7  fqz
1888
2.68k
                    1.05, // 8  tok3 (rans)
1889
2.68k
                    1.00, 1.00, // 9,10 reserved
1890
1891
                    // Paramterised versions of above
1892
2.68k
                    1.01, // gzip rle
1893
2.68k
                    1.01, // gzip -1
1894
1895
2.68k
                    1.05, 1.05, 1.05, // FQZ_b,c,d
1896
1897
2.68k
                    1.01, // rans O1
1898
1899
2.68k
                    1.01, // rans_pr1
1900
2.68k
                    1.00, // rans_pr64; if smaller, usually fast
1901
2.68k
                    1.03, // rans_pr65/9
1902
2.68k
                    1.00, // rans_pr128
1903
2.68k
                    1.01, // rans_pr129
1904
2.68k
                    1.00, // rans_pr192
1905
2.68k
                    1.01, // rans_pr193
1906
1907
2.68k
                    1.07, // tok3 arith
1908
1909
2.68k
                    1.04, // arith_pr1
1910
2.68k
                    1.04, // arith_pr64
1911
2.68k
                    1.04, // arith_pr9
1912
2.68k
                    1.03, // arith_pr128
1913
2.68k
                    1.04, // arith_pr129
1914
2.68k
                    1.04, // arith_pr192
1915
2.68k
                    1.04, // arith_pr193
1916
2.68k
                };
1917
1918
                // Scale methods by cost based on compression level
1919
2.68k
                if (fd->level <= 1) {
1920
0
                    for (m = 0; m < CRAM_MAX_METHOD; m++)
1921
0
                        metrics->sz[m] *= 1+(meth_cost[m]-1)*4;
1922
2.68k
                } else if (fd->level <= 3) {
1923
0
                    for (m = 0; m < CRAM_MAX_METHOD; m++)
1924
0
                        metrics->sz[m] *= 1+(meth_cost[m]-1);
1925
2.68k
                } else if (fd->level <= 6) {
1926
88.4k
                    for (m = 0; m < CRAM_MAX_METHOD; m++)
1927
85.7k
                        metrics->sz[m] *= 1+(meth_cost[m]-1)/2;
1928
2.68k
                } else if (fd->level <= 7) {
1929
0
                    for (m = 0; m < CRAM_MAX_METHOD; m++)
1930
0
                        metrics->sz[m] *= 1+(meth_cost[m]-1)/3;
1931
0
                } // else cost is ignored
1932
1933
                // Ensure these are never used; BSC and ZSTD
1934
2.68k
                metrics->sz[9] = metrics->sz[10] = INT_MAX;
1935
1936
88.4k
                for (m = 0; m < CRAM_MAX_METHOD; m++) {
1937
85.7k
                    if ((!metrics->sz[m]) || (!(method & (1u<<m))))
1938
72.3k
                        continue;
1939
1940
13.4k
                    if (best_sz > metrics->sz[m])
1941
5.99k
                        best_sz = metrics->sz[m], best_method = m;
1942
13.4k
                }
1943
1944
2.68k
                if (best_method != metrics->method) {
1945
                    //metrics->trial = (NTRIALS+1)/2; // be sure
1946
                    //metrics->next_trial /= 1.5;
1947
2.54k
                    metrics->consistency = 0;
1948
2.54k
                } else {
1949
131
                    metrics->next_trial *= MIN(2, 1+metrics->consistency/4.0);
1950
131
                    metrics->consistency++;
1951
131
                }
1952
1953
2.68k
                metrics->method = best_method;
1954
2.68k
                switch (best_method) {
1955
116
                case GZIP:     strat = Z_FILTERED; break;
1956
1.53k
                case GZIP_1:   strat = Z_DEFAULT_STRATEGY; break;
1957
0
                case GZIP_RLE: strat = Z_RLE; break;
1958
0
                case FQZ:      strat = CRAM_MAJOR_VERS(fd->version); break;
1959
0
                case FQZ_b:    strat = CRAM_MAJOR_VERS(fd->version)+256; break;
1960
0
                case FQZ_c:    strat = CRAM_MAJOR_VERS(fd->version)+2*256; break;
1961
0
                case FQZ_d:    strat = CRAM_MAJOR_VERS(fd->version)+3*256; break;
1962
255
                case TOK3:     strat = 0; break;
1963
0
                case TOKA:     strat = 1; break;
1964
776
                default:       strat = 0;
1965
2.68k
                }
1966
2.68k
                metrics->strat  = strat;
1967
1968
                // If we see at least MAXFAIL trials in a row for a specific
1969
                // compression method with more than MAXDELTA aggregate
1970
                // size then we drop this from the list of methods used
1971
                // for this block type.
1972
2.68k
#define MAXDELTA 0.20
1973
45.9k
#define MAXFAILS 4
1974
88.4k
                for (m = 0; m < CRAM_MAX_METHOD; m++) {
1975
85.7k
                    if (best_method == m) {
1976
2.68k
                        metrics->cnt[m] = 0;
1977
2.68k
                        metrics->extra[m] = 0;
1978
83.0k
                    } else if (best_sz < metrics->sz[m]) {
1979
45.9k
                        double r = (double)metrics->sz[m] / best_sz - 1;
1980
45.9k
                        int mul = 1+(fd->level>=7);
1981
45.9k
                        if (++metrics->cnt[m] >= MAXFAILS*mul &&
1982
757
                            (metrics->extra[m] += r) >= MAXDELTA*mul)
1983
247
                            method &= ~(1u<<m);
1984
1985
                        // Special case for fqzcomp as it rarely changes
1986
45.9k
                        if (m == FQZ || m == FQZ_b || m == FQZ_c || m == FQZ_d) {
1987
8.14k
                            if (metrics->sz[m] > best_sz)
1988
8.14k
                                method &= ~(1u<<m);
1989
8.14k
                        }
1990
45.9k
                    }
1991
85.7k
                }
1992
1993
                //if (fd->verbose > 1 && method != metrics->revised_method)
1994
                //    fprintf(stderr, "%d: revising method from %x to %x\n",
1995
                //          b->content_id, metrics->revised_method, method);
1996
2.68k
                metrics->revised_method = method;
1997
2.68k
            }
1998
10.8k
            pthread_mutex_unlock(&fd->metrics_lock);
1999
10.8k
        } else {
2000
1.76k
            metrics->input_avg_delta =
2001
1.76k
                0.9 * (metrics->input_avg_delta +
2002
1.76k
                       ABS(b->uncomp_size - metrics->input_avg_sz));
2003
2004
1.76k
            metrics->input_avg_sz += b->uncomp_size*.2;
2005
1.76k
            metrics->input_avg_sz *= 0.8;
2006
2007
1.76k
            strat = metrics->strat;
2008
1.76k
            method = metrics->method;
2009
2010
1.76k
            pthread_mutex_unlock(&fd->metrics_lock);
2011
1.76k
            comp = cram_compress_by_method(s, (char *)b->data, b->uncomp_size,
2012
1.76k
                                           b->content_id, &comp_size, method,
2013
1.76k
                                           method == GZIP_1 ? 1 : level,
2014
1.76k
                                           strat);
2015
1.76k
            if (!comp) {
2016
                // Our cached best method failed, but maybe another works?
2017
                // Rerun with trial mode engaged again.
2018
168
                if (!recurse) {
2019
168
                    hts_log_warning("Compressed block ID %d method %s failed, "
2020
168
                                    "redoing trial", b->content_id,
2021
168
                                    cram_block_method2str(method));
2022
168
                    pthread_mutex_lock(&fd->metrics_lock);
2023
168
                    metrics->trial = NTRIALS;
2024
168
                    metrics->next_trial = TRIAL_SPAN;
2025
168
                    metrics->revised_method = orig_method;
2026
168
                    pthread_mutex_unlock(&fd->metrics_lock);
2027
168
                    return cram_compress_block3(fd, s, b, metrics, method,
2028
168
                                                level, 1);
2029
168
                }
2030
0
                return -1;
2031
168
            }
2032
2033
1.59k
            if (comp_size < b->uncomp_size) {
2034
1.48k
                free(b->data);
2035
1.48k
                b->data = (unsigned char *)comp;
2036
1.48k
                b->comp_size = comp_size;
2037
1.48k
                b->method = method;
2038
1.48k
            } else {
2039
111
                free(comp);
2040
111
            }
2041
1.59k
        }
2042
2043
12.6k
    } else {
2044
        // no cached metrics, so just do zlib?
2045
5.56k
        comp = cram_compress_by_method(s, (char *)b->data, b->uncomp_size,
2046
5.56k
                                       b->content_id, &comp_size, GZIP, level, Z_FILTERED);
2047
5.56k
        if (!comp) {
2048
0
            hts_log_error("Compression failed!");
2049
0
            return -1;
2050
0
        }
2051
2052
5.56k
        if (comp_size < b->uncomp_size) {
2053
194
            free(b->data);
2054
194
            b->data = (unsigned char *)comp;
2055
194
            b->comp_size = comp_size;
2056
194
            b->method = GZIP;
2057
5.36k
        } else {
2058
5.36k
            free(comp);
2059
5.36k
        }
2060
5.56k
        strat = Z_FILTERED;
2061
5.56k
    }
2062
2063
18.0k
    hts_log_info("Compressed block ID %d from %d to %d by method %s",
2064
18.0k
                 b->content_id, b->uncomp_size, b->comp_size,
2065
18.0k
                 cram_block_method2str(b->method));
2066
2067
18.0k
    b->method = methmap[b->method];
2068
2069
18.0k
    return 0;
2070
18.1k
}
2071
2072
/*
2073
 * Compresses a block using a selection of compression codecs and options.
2074
 * The best is learnt and used for subsequent slices, periodically resampling.
2075
 *
2076
 * Method and level -1 implies defaults, as specified in cram_fd.
2077
 */
2078
int cram_compress_block2(cram_fd *fd, cram_slice *s,
2079
                         cram_block *b, cram_metrics *metrics,
2080
39.3k
                         int method, int level) {
2081
39.3k
    return cram_compress_block3(fd, s, b, metrics, method, level, 0);
2082
39.3k
}
2083
2084
int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics,
2085
5.44k
                        int method, int level) {
2086
5.44k
    return cram_compress_block2(fd, NULL, b, metrics, method, level);
2087
5.44k
}
2088
2089
755k
cram_metrics *cram_new_metrics(void) {
2090
755k
    cram_metrics *m = calloc(1, sizeof(*m));
2091
755k
    if (!m)
2092
0
        return NULL;
2093
755k
    m->trial = NTRIALS-1;
2094
755k
    m->next_trial = TRIAL_SPAN/2; // learn quicker at start
2095
755k
    m->method = RAW;
2096
755k
    m->strat = 0;
2097
755k
    m->revised_method = 0;
2098
755k
    m->unpackable = 0;
2099
2100
755k
    return m;
2101
755k
}
2102
2103
18.1k
char *cram_block_method2str(enum cram_block_method_int m) {
2104
18.1k
    switch(m) {
2105
11.0k
    case RAW:         return "RAW";
2106
2.23k
    case GZIP:        return "GZIP";
2107
0
    case BZIP2:       return "BZIP2";
2108
0
    case LZMA:        return "LZMA";
2109
0
    case RANS0:       return "RANS0";
2110
0
    case RANS1:       return "RANS1";
2111
4
    case GZIP_RLE:    return "GZIP_RLE";
2112
342
    case GZIP_1:      return "GZIP_1";
2113
0
    case FQZ:         return "FQZ";
2114
0
    case FQZ_b:       return "FQZ_b";
2115
0
    case FQZ_c:       return "FQZ_c";
2116
0
    case FQZ_d:       return "FQZ_d";
2117
317
    case RANS_PR0:    return "RANS_PR0";
2118
441
    case RANS_PR1:    return "RANS_PR1";
2119
897
    case RANS_PR64:   return "RANS_PR64";
2120
5
    case RANS_PR9:    return "RANS_PR9";
2121
2.58k
    case RANS_PR128:  return "RANS_PR128";
2122
0
    case RANS_PR129:  return "RANS_PR129";
2123
0
    case RANS_PR192:  return "RANS_PR192";
2124
103
    case RANS_PR193:  return "RANS_PR193";
2125
168
    case TOK3:        return "TOK3_R";
2126
0
    case TOKA:        return "TOK3_A";
2127
0
    case ARITH_PR0:   return "ARITH_PR0";
2128
0
    case ARITH_PR1:   return "ARITH_PR1";
2129
0
    case ARITH_PR64:  return "ARITH_PR64";
2130
0
    case ARITH_PR9:   return "ARITH_PR9";
2131
0
    case ARITH_PR128: return "ARITH_PR128";
2132
0
    case ARITH_PR129: return "ARITH_PR129";
2133
0
    case ARITH_PR192: return "ARITH_PR192";
2134
0
    case ARITH_PR193: return "ARITH_PR193";
2135
0
    case BM_ERROR: break;
2136
18.1k
    }
2137
0
    return "?";
2138
18.1k
}
2139
2140
3
char *cram_content_type2str(enum cram_content_type t) {
2141
3
    switch (t) {
2142
0
    case FILE_HEADER:         return "FILE_HEADER";
2143
0
    case COMPRESSION_HEADER:  return "COMPRESSION_HEADER";
2144
0
    case MAPPED_SLICE:        return "MAPPED_SLICE";
2145
0
    case UNMAPPED_SLICE:      return "UNMAPPED_SLICE";
2146
0
    case EXTERNAL:            return "EXTERNAL";
2147
3
    case CORE:                return "CORE";
2148
0
    case CT_ERROR:            break;
2149
3
    }
2150
0
    return "?";
2151
3
}
2152
2153
/* ----------------------------------------------------------------------
2154
 * Reference sequence handling
2155
 *
2156
 * These revolve around the refs_t structure, which may potentially be
2157
 * shared between multiple cram_fd.
2158
 *
2159
 * We start with refs_create() to allocate an empty refs_t and then
2160
 * populate it with @SQ line data using refs_from_header(). This is done on
2161
 * cram_open().  Also at start up we can call cram_load_reference() which
2162
 * is used with "scramble -r foo.fa". This replaces the fd->refs with the
2163
 * new one specified. In either case refs2id() is then called which
2164
 * maps ref_entry names to @SQ ids (refs_t->ref_id[]).
2165
 *
2166
 * Later, possibly within a thread, we will want to know the actual ref
2167
 * seq itself, obtained by calling cram_get_ref().  This may use the
2168
 * UR: or M5: fields or the filename specified in the original
2169
 * cram_load_reference() call.
2170
 *
2171
 * Given the potential for multi-threaded reference usage, we have
2172
 * reference counting (sorry for the confusing double use of "ref") to
2173
 * track the number of callers interested in any specific reference.
2174
 */
2175
2176
/*
2177
 * Frees/unmaps a reference sequence and associated file handles.
2178
 */
2179
694
static void ref_entry_free_seq(ref_entry *e) {
2180
694
    if (e->mf)
2181
0
        mfclose(e->mf);
2182
694
    if (e->seq && !e->mf)
2183
0
        free(e->seq);
2184
2185
694
    e->seq = NULL;
2186
694
    e->mf = NULL;
2187
694
}
2188
2189
16.0k
void refs_free(refs_t *r) {
2190
16.0k
    RP("refs_free()\n");
2191
2192
16.0k
    if (!r)
2193
0
        return;
2194
2195
16.0k
    if (--r->count > 0)
2196
0
        return;
2197
2198
16.0k
    if (r->pool)
2199
16.0k
        string_pool_destroy(r->pool);
2200
2201
16.0k
    if (r->h_meta) {
2202
16.0k
        khint_t k;
2203
2204
17.7k
        for (k = kh_begin(r->h_meta); k != kh_end(r->h_meta); k++) {
2205
1.71k
            ref_entry *e;
2206
2207
1.71k
            if (!kh_exist(r->h_meta, k))
2208
1.02k
                continue;
2209
694
            if (!(e = kh_val(r->h_meta, k)))
2210
0
                continue;
2211
694
            ref_entry_free_seq(e);
2212
694
            free(e);
2213
694
        }
2214
2215
16.0k
        kh_destroy(refs, r->h_meta);
2216
16.0k
    }
2217
2218
16.0k
    if (r->ref_id)
2219
5.49k
        free(r->ref_id);
2220
2221
16.0k
    if (r->fp)
2222
0
        bgzf_close(r->fp);
2223
2224
16.0k
    pthread_mutex_destroy(&r->lock);
2225
2226
16.0k
    free(r);
2227
16.0k
}
2228
2229
16.0k
static refs_t *refs_create(void) {
2230
16.0k
    refs_t *r = calloc(1, sizeof(*r));
2231
2232
16.0k
    RP("refs_create()\n");
2233
2234
16.0k
    if (!r)
2235
0
        return NULL;
2236
2237
16.0k
    if (!(r->pool = string_pool_create(8192)))
2238
0
        goto err;
2239
2240
16.0k
    r->ref_id = NULL; // see refs2id() to populate.
2241
16.0k
    r->count = 1;
2242
16.0k
    r->last = NULL;
2243
16.0k
    r->last_id = -1;
2244
2245
16.0k
    if (!(r->h_meta = kh_init(refs)))
2246
0
        goto err;
2247
2248
16.0k
    pthread_mutex_init(&r->lock, NULL);
2249
2250
16.0k
    return r;
2251
2252
0
 err:
2253
0
    refs_free(r);
2254
0
    return NULL;
2255
16.0k
}
2256
2257
/*
2258
 * Opens a reference fasta file as a BGZF stream, allowing for
2259
 * compressed files.  It automatically builds a .fai file if
2260
 * required and if compressed a .gzi bgzf index too.
2261
 *
2262
 * Returns a BGZF handle on success;
2263
 *         NULL on failure.
2264
 */
2265
2
static BGZF *bgzf_open_ref(char *fn, char *mode, int is_md5) {
2266
2
    BGZF *fp;
2267
2268
2
    if (strncmp(fn, "file://", 7) == 0)
2269
0
        fn += 7;
2270
2271
2
    if (!is_md5 && !hisremote(fn)) {
2272
2
        char fai_file[PATH_MAX];
2273
2274
2
        snprintf(fai_file, PATH_MAX, "%s.fai", fn);
2275
2
        if (access(fai_file, R_OK) != 0)
2276
2
            if (fai_build(fn) != 0)
2277
2
                return NULL;
2278
2
    }
2279
2280
0
    if (!(fp = bgzf_open(fn, mode))) {
2281
0
        perror(fn);
2282
0
        return NULL;
2283
0
    }
2284
2285
0
    if (fp->is_compressed == 1 && bgzf_index_load(fp, fn, ".gzi") < 0) {
2286
0
        hts_log_error("Unable to load .gzi index '%s.gzi'", fn);
2287
0
        bgzf_close(fp);
2288
0
        return NULL;
2289
0
    }
2290
2291
0
    return fp;
2292
0
}
2293
2294
/*
2295
 * Loads a FAI file for a reference.fasta.
2296
 * "is_err" indicates whether failure to load is worthy of emitting an
2297
 * error message. In some cases (eg with embedded references) we
2298
 * speculatively load, just in case, and silently ignore errors.
2299
 *
2300
 * Returns the refs_t struct on success (maybe newly allocated);
2301
 *         NULL on failure
2302
 */
2303
2
static refs_t *refs_load_fai(refs_t *r_orig, const char *fn, int is_err) {
2304
2
    hFILE *fp = NULL;
2305
2
    char fai_fn[PATH_MAX];
2306
2
    char line[8192];
2307
2
    refs_t *r = r_orig;
2308
2
    size_t fn_l = strlen(fn);
2309
2
    int id = 0, id_alloc = 0;
2310
2311
2
    RP("refs_load_fai %s\n", fn);
2312
2313
2
    if (!r)
2314
0
        if (!(r = refs_create()))
2315
0
            goto err;
2316
2317
2
    if (r->fp)
2318
0
        if (bgzf_close(r->fp) != 0)
2319
0
            goto err;
2320
2
    r->fp = NULL;
2321
2322
    /* Look for a FASTA##idx##FAI format */
2323
2
    char *fn_delim = strstr(fn, HTS_IDX_DELIM);
2324
2
    if (fn_delim) {
2325
0
        if (!(r->fn = string_ndup(r->pool, fn, fn_delim - fn)))
2326
0
            goto err;
2327
0
        fn_delim += strlen(HTS_IDX_DELIM);
2328
0
        snprintf(fai_fn, PATH_MAX, "%s", fn_delim);
2329
2
    } else {
2330
        /* An index file was provided, instead of the actual reference file */
2331
2
        if (fn_l > 4 && strcmp(&fn[fn_l-4], ".fai") == 0) {
2332
0
            if (!r->fn) {
2333
0
                if (!(r->fn = string_ndup(r->pool, fn, fn_l-4)))
2334
0
                    goto err;
2335
0
            }
2336
0
            snprintf(fai_fn, PATH_MAX, "%s", fn);
2337
2
        } else {
2338
        /* Only the reference file provided. Get the index file name from it */
2339
2
            if (!(r->fn = string_dup(r->pool, fn)))
2340
0
                goto err;
2341
2
            snprintf(fai_fn, PATH_MAX, "%.*s.fai", PATH_MAX-5, fn);
2342
2
        }
2343
2
    }
2344
2345
2
    if (!(r->fp = bgzf_open_ref(r->fn, "r", 0))) {
2346
2
        hts_log_error("Failed to open reference file '%s'", r->fn);
2347
2
        goto err;
2348
2
    }
2349
2350
0
    if (!(fp = hopen(fai_fn, "r"))) {
2351
0
        hts_log_error("Failed to open index file '%s'", fai_fn);
2352
0
        if (is_err)
2353
0
            perror(fai_fn);
2354
0
        goto err;
2355
0
    }
2356
0
    while (hgets(line, 8192, fp) != NULL) {
2357
0
        ref_entry *e = malloc(sizeof(*e));
2358
0
        char *cp;
2359
0
        int n;
2360
0
        khint_t k;
2361
2362
0
        if (!e)
2363
0
            return NULL;
2364
2365
        // id
2366
0
        for (cp = line; *cp && !isspace_c(*cp); cp++)
2367
0
            ;
2368
0
        *cp++ = 0;
2369
0
        e->name = string_dup(r->pool, line);
2370
2371
        // length
2372
0
        while (*cp && isspace_c(*cp))
2373
0
            cp++;
2374
0
        e->length = strtoll(cp, &cp, 10);
2375
2376
        // offset
2377
0
        while (*cp && isspace_c(*cp))
2378
0
            cp++;
2379
0
        e->offset = strtoll(cp, &cp, 10);
2380
2381
        // bases per line
2382
0
        while (*cp && isspace_c(*cp))
2383
0
            cp++;
2384
0
        e->bases_per_line = strtol(cp, &cp, 10);
2385
2386
        // line length
2387
0
        while (*cp && isspace_c(*cp))
2388
0
            cp++;
2389
0
        e->line_length = strtol(cp, &cp, 10);
2390
2391
        // filename
2392
0
        e->fn = r->fn;
2393
2394
0
        e->count = 0;
2395
0
        e->seq = NULL;
2396
0
        e->mf = NULL;
2397
0
        e->is_md5 = 0;
2398
0
        e->validated_md5 = 0;
2399
2400
0
        k = kh_put(refs, r->h_meta, e->name, &n);
2401
0
        if (-1 == n)  {
2402
0
            free(e);
2403
0
            return NULL;
2404
0
        }
2405
2406
0
        if (n) {
2407
0
            kh_val(r->h_meta, k) = e;
2408
0
        } else {
2409
0
            ref_entry *re = kh_val(r->h_meta, k);
2410
0
            if (re && (re->count != 0 || re->length != 0)) {
2411
                /* Keep old */
2412
0
                free(e);
2413
0
            } else {
2414
                /* Replace old */
2415
0
                if (re)
2416
0
                    free(re);
2417
0
                kh_val(r->h_meta, k) = e;
2418
0
            }
2419
0
        }
2420
2421
0
        if (id >= id_alloc) {
2422
0
            ref_entry **new_refs;
2423
0
            int x;
2424
2425
0
            id_alloc = id_alloc ?id_alloc*2 : 16;
2426
0
            new_refs = hts_realloc_p(r->ref_id, sizeof(*r->ref_id), id_alloc);
2427
0
            if (!new_refs)
2428
0
                goto err;
2429
0
            r->ref_id = new_refs;
2430
2431
0
            for (x = id; x < id_alloc; x++)
2432
0
                r->ref_id[x] = NULL;
2433
0
        }
2434
0
        r->ref_id[id] = e;
2435
0
        r->nref = ++id;
2436
0
    }
2437
2438
0
    if(hclose(fp) < 0)
2439
0
        goto err;
2440
0
    return r;
2441
2442
2
 err:
2443
2
    if (fp)
2444
0
        hclose_abruptly(fp);
2445
2446
2
    if (!r_orig)
2447
0
        refs_free(r);
2448
2449
2
    return NULL;
2450
0
}
2451
2452
/*
2453
 * Verifies that the CRAM @SQ lines and .fai files match.
2454
 */
2455
0
static void sanitise_SQ_lines(cram_fd *fd) {
2456
0
    int i;
2457
2458
0
    if (!fd->header || !fd->header->hrecs)
2459
0
        return;
2460
2461
0
    if (!fd->refs || !fd->refs->h_meta)
2462
0
        return;
2463
2464
0
    for (i = 0; i < fd->header->hrecs->nref; i++) {
2465
0
        const char *name = fd->header->hrecs->ref[i].name;
2466
0
        khint_t k = kh_get(refs, fd->refs->h_meta, name);
2467
0
        ref_entry *r;
2468
2469
        // We may have @SQ lines which have no known .fai, but do not
2470
        // in themselves pose a problem because they are unused in the file.
2471
0
        if (k == kh_end(fd->refs->h_meta))
2472
0
            continue;
2473
2474
0
        if (!(r = (ref_entry *)kh_val(fd->refs->h_meta, k)))
2475
0
            continue;
2476
2477
0
        if (r->length && r->length != fd->header->hrecs->ref[i].len) {
2478
0
            assert(strcmp(r->name, fd->header->hrecs->ref[i].name) == 0);
2479
2480
            // Should we also check MD5sums here to ensure the correct
2481
            // reference was given?
2482
0
            hts_log_warning("Header @SQ length mismatch for ref %s, %"PRIhts_pos" vs %d",
2483
0
                            r->name, fd->header->hrecs->ref[i].len, (int)r->length);
2484
2485
            // Fixing the parsed @SQ header will make MD:Z: strings work
2486
            // and also stop it producing N for the sequence.
2487
0
            fd->header->hrecs->ref[i].len = r->length;
2488
0
        }
2489
0
    }
2490
0
}
2491
2492
/*
2493
 * Indexes references by the order they appear in a BAM file. This may not
2494
 * necessarily be the same order they appear in the fasta reference file.
2495
 *
2496
 * Returns 0 on success
2497
 *        -1 on failure
2498
 */
2499
5.44k
int refs2id(refs_t *r, sam_hdr_t *hdr) {
2500
5.44k
    int i;
2501
5.44k
    sam_hrecs_t *h = hdr->hrecs;
2502
2503
5.44k
    if (r->ref_id)
2504
292
        free(r->ref_id);
2505
5.44k
    if (r->last)
2506
0
        r->last = NULL;
2507
2508
5.44k
    r->ref_id = calloc(h->nref, sizeof(*r->ref_id));
2509
5.44k
    if (!r->ref_id)
2510
0
        return -1;
2511
2512
5.44k
    r->nref = h->nref;
2513
5.94k
    for (i = 0; i < h->nref; i++) {
2514
498
        khint_t k = kh_get(refs, r->h_meta, h->ref[i].name);
2515
498
        if (k != kh_end(r->h_meta)) {
2516
498
            r->ref_id[i] = kh_val(r->h_meta, k);
2517
498
        } else {
2518
0
            hts_log_warning("Unable to find ref name '%s'", h->ref[i].name);
2519
0
        }
2520
498
    }
2521
2522
5.44k
    return 0;
2523
5.44k
}
2524
2525
/*
2526
 * Generates refs_t entries based on @SQ lines in the header.
2527
 * Returns 0 on success
2528
 *         -1 on failure
2529
 */
2530
27.0k
static int refs_from_header(cram_fd *fd) {
2531
27.0k
    if (!fd)
2532
0
        return -1;
2533
2534
27.0k
    refs_t *r = fd->refs;
2535
27.0k
    if (!r)
2536
0
        return -1;
2537
2538
27.0k
    sam_hdr_t *h = fd->header;
2539
27.0k
    if (!h)
2540
8.77k
        return 0;
2541
2542
18.2k
    if (!h->hrecs) {
2543
12.3k
        if (-1 == sam_hdr_fill_hrecs(h))
2544
77
            return -1;
2545
12.3k
    }
2546
2547
18.1k
    if (h->hrecs->nref == 0)
2548
17.5k
        return 0;
2549
2550
    //fprintf(stderr, "refs_from_header for %p mode %c\n", fd, fd->mode);
2551
2552
    /* Existing refs are fine, as long as they're compatible with the hdr. */
2553
632
    ref_entry **new_ref_id = hts_realloc_ps(r->ref_id, sizeof(*r->ref_id),
2554
632
                                            r->nref, h->hrecs->nref);
2555
632
    if (!new_ref_id)
2556
0
        return -1;
2557
632
    r->ref_id = new_ref_id;
2558
2559
632
    int i, j;
2560
    /* Copy info from h->ref[i] over to r */
2561
1.82k
    for (i = 0, j = r->nref; i < h->hrecs->nref; i++) {
2562
1.19k
        sam_hrec_type_t *ty;
2563
1.19k
        sam_hrec_tag_t *tag;
2564
1.19k
        khint_t k;
2565
1.19k
        int n;
2566
2567
1.19k
        k = kh_get(refs, r->h_meta, h->hrecs->ref[i].name);
2568
1.19k
        if (k != kh_end(r->h_meta))
2569
            // Ref already known about
2570
498
            continue;
2571
2572
694
        if (!(r->ref_id[j] = calloc(1, sizeof(ref_entry))))
2573
0
            return -1;
2574
2575
694
        if (!h->hrecs->ref[i].name)
2576
0
            return -1;
2577
2578
694
        r->ref_id[j]->name = string_dup(r->pool, h->hrecs->ref[i].name);
2579
694
        if (!r->ref_id[j]->name) return -1;
2580
694
        r->ref_id[j]->length = 0; // marker for not yet loaded
2581
2582
        /* Initialise likely filename if known */
2583
694
        if ((ty = sam_hrecs_find_type_id(h->hrecs, "SQ", "SN", h->hrecs->ref[i].name))) {
2584
694
            if ((tag = sam_hrecs_find_key(ty, "M5", NULL)))
2585
113
                r->ref_id[j]->fn = string_dup(r->pool, tag->str+3);
2586
2587
694
            if ((tag = sam_hrecs_find_key(ty, "LN", NULL))) {
2588
                // LN tag used when constructing consensus reference
2589
694
                r->ref_id[j]->LN_length = strtoll(tag->str+3, NULL, 0);
2590
                // See fuzz 382922241
2591
694
                if (r->ref_id[j]->LN_length < 0)
2592
1
                    r->ref_id[j]->LN_length = 0;
2593
694
            }
2594
694
        }
2595
2596
694
        k = kh_put(refs, r->h_meta, r->ref_id[j]->name, &n);
2597
694
        if (n <= 0) // already exists or error
2598
0
            return -1;
2599
694
        kh_val(r->h_meta, k) = r->ref_id[j];
2600
2601
694
        j++;
2602
694
    }
2603
632
    r->nref = j;
2604
2605
632
    return 0;
2606
632
}
2607
2608
/*
2609
 * Attaches a header to a cram_fd.
2610
 *
2611
 * This should be used when creating a new cram_fd for writing where
2612
 * we have a header already constructed (eg from a file we've read
2613
 * in).
2614
 */
2615
5.52k
int cram_set_header2(cram_fd *fd, const sam_hdr_t *hdr) {
2616
5.52k
    if (!fd || !hdr )
2617
0
        return -1;
2618
2619
5.52k
    if (fd->header != hdr) {
2620
5.52k
        if (fd->header)
2621
0
            sam_hdr_destroy(fd->header);
2622
5.52k
        fd->header = sam_hdr_dup(hdr);
2623
5.52k
        if (!fd->header)
2624
0
            return -1;
2625
5.52k
    }
2626
5.52k
    return refs_from_header(fd);
2627
5.52k
}
2628
2629
0
int cram_set_header(cram_fd *fd, sam_hdr_t *hdr) {
2630
0
    return cram_set_header2(fd, hdr);
2631
0
}
2632
2633
/*
2634
 * Returns whether the path refers to a directory.
2635
 */
2636
0
static int is_directory(char *fn) {
2637
0
    struct stat buf;
2638
0
    if ( stat(fn,&buf) ) return 0;
2639
0
    return S_ISDIR(buf.st_mode);
2640
0
}
2641
2642
/*
2643
 * Converts a directory and a filename into an expanded path, replacing %s
2644
 * in directory with the filename and %[0-9]+s with portions of the filename
2645
 * Any remaining parts of filename are added to the end with /%s.
2646
 */
2647
0
static int expand_cache_path(char *path, char *dir, const char *fn) {
2648
0
    char *cp, *start = path;
2649
0
    size_t len;
2650
0
    size_t sz = PATH_MAX;
2651
2652
0
    while ((cp = strchr(dir, '%'))) {
2653
0
        if (cp-dir >= sz) return -1;
2654
0
        strncpy(path, dir, cp-dir);
2655
0
        path += cp-dir;
2656
0
        sz -= cp-dir;
2657
2658
0
        if (*++cp == 's') {
2659
0
            len = strlen(fn);
2660
0
            if (len >= sz) return -1;
2661
0
            strcpy(path, fn);
2662
0
            path += len;
2663
0
            sz -= len;
2664
0
            fn += len;
2665
0
            cp++;
2666
0
        } else if (*cp >= '0' && *cp <= '9') {
2667
0
            char *endp;
2668
0
            long l;
2669
2670
0
            l = strtol(cp, &endp, 10);
2671
0
            l = MIN(l, strlen(fn));
2672
0
            if (*endp == 's') {
2673
0
                if (l >= sz) return -1;
2674
0
                strncpy(path, fn, l);
2675
0
                path += l;
2676
0
                fn += l;
2677
0
                sz -= l;
2678
0
                *path = 0;
2679
0
                cp = endp+1;
2680
0
            } else {
2681
0
                if (sz < 3) return -1;
2682
0
                *path++ = '%';
2683
0
                *path++ = *cp++;
2684
0
            }
2685
0
        } else {
2686
0
            if (sz < 3) return -1;
2687
0
            *path++ = '%';
2688
0
            *path++ = *cp++;
2689
0
        }
2690
0
        dir = cp;
2691
0
    }
2692
2693
0
    len = strlen(dir);
2694
0
    if (len >= sz) return -1;
2695
0
    strcpy(path, dir);
2696
0
    path += len;
2697
0
    sz -= len;
2698
2699
0
    len = strlen(fn) + ((*fn && path > start && path[-1] != '/') ? 1 : 0);
2700
0
    if (len >= sz) return -1;
2701
0
    if (*fn && path > start && path[-1] != '/')
2702
0
        *path++ = '/';
2703
0
    strcpy(path, fn);
2704
0
    return 0;
2705
0
}
2706
2707
/*
2708
 * Make the directory containing path and any prefix directories.
2709
 */
2710
0
static void mkdir_prefix(char *path, int mode) {
2711
0
    char *cp = strrchr(path, '/');
2712
0
    if (!cp)
2713
0
        return;
2714
2715
0
    *cp = 0;
2716
0
    if (is_directory(path)) {
2717
0
        *cp = '/';
2718
0
        return;
2719
0
    }
2720
2721
0
    if (mkdir(path, mode) == 0) {
2722
0
        chmod(path, mode);
2723
0
        *cp = '/';
2724
0
        return;
2725
0
    }
2726
2727
0
    mkdir_prefix(path, mode);
2728
0
    mkdir(path, mode);
2729
0
    chmod(path, mode);
2730
0
    *cp = '/';
2731
0
}
2732
2733
/*
2734
 * Queries the M5 string from the header and attempts to populate the
2735
 * reference from this using the REF_PATH environment.
2736
 *
2737
 * Returns 0 on success
2738
 *        -1 on failure
2739
 */
2740
278
static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
2741
278
    char *ref_path = getenv("REF_PATH");
2742
278
    sam_hrec_type_t *ty;
2743
278
    sam_hrec_tag_t *tag;
2744
278
    char path[PATH_MAX];
2745
278
    kstring_t path_tmp = KS_INITIALIZE;
2746
278
    char *local_cache = getenv("REF_CACHE");
2747
278
    mFILE *mf;
2748
278
    int local_path = 0;
2749
2750
278
    hts_log_info("Running cram_populate_ref on fd %p, id %d", (void *)fd, id);
2751
2752
278
    if (!r->name)
2753
0
        return -1;
2754
2755
278
    if (!(ty = sam_hrecs_find_type_id(fd->header->hrecs, "SQ", "SN", r->name)))
2756
0
        return -1;
2757
2758
278
    if (!(tag = sam_hrecs_find_key(ty, "M5", NULL)))
2759
277
        goto no_M5;
2760
2761
1
    hts_log_info("Querying ref %s", tag->str+3);
2762
2763
    /* Use cache if available */
2764
1
    if (local_cache && *local_cache) {
2765
0
        struct stat sb;
2766
0
        if (expand_cache_path(path, local_cache, tag->str+3) == 0 &&
2767
0
            stat(path, &sb) == 0)
2768
            // Found it in the local cache
2769
0
            local_path = 1;
2770
0
    }
2771
2772
#ifndef HAVE_MMAP
2773
    char *path2;
2774
    /* Search local files in REF_PATH; we can open them and return as above */
2775
    if (!local_path && (path2 = find_path(tag->str+3, ref_path))) {
2776
        int len = snprintf(path, PATH_MAX, "%s", path2);
2777
        free(path2);
2778
        if (len > 0 && len < PATH_MAX) // in case it's too long
2779
            local_path = 1;
2780
    }
2781
#endif
2782
2783
    /* Found via REF_CACHE or local REF_PATH file */
2784
1
    if (local_path) {
2785
0
        struct stat sb;
2786
0
        BGZF *fp;
2787
2788
0
        if (0 == stat(path, &sb)
2789
0
            && S_ISREG(sb.st_mode)
2790
0
            && (fp = bgzf_open(path, "r"))) {
2791
0
            r->length = sb.st_size;
2792
0
            r->offset = r->line_length = r->bases_per_line = 0;
2793
2794
0
            r->fn = string_dup(fd->refs->pool, path);
2795
2796
0
            if (fd->refs->fp)
2797
0
                if (bgzf_close(fd->refs->fp) != 0)
2798
0
                    return -1;
2799
0
            fd->refs->fp = fp;
2800
0
            fd->refs->fn = r->fn;
2801
0
            r->is_md5 = 1;
2802
0
            r->validated_md5 = 1;
2803
2804
            // Fall back to cram_get_ref() where it'll do the actual
2805
            // reading of the file.
2806
0
            return 0;
2807
0
        }
2808
0
    }
2809
2810
2811
    /* Otherwise search full REF_PATH; slower as loads entire file */
2812
1
    int is_local = 0;
2813
1
    if ((mf = open_path_mfile(tag->str+3, ref_path, NULL, &is_local))) {
2814
0
        size_t sz;
2815
0
        r->seq = mfsteal(mf, &sz);
2816
0
        if (r->seq) {
2817
0
            r->mf = NULL;
2818
0
        } else {
2819
            // keep mf around as we couldn't detach
2820
0
            r->seq = mf->data;
2821
0
            r->mf = mf;
2822
0
        }
2823
0
        r->length = sz;
2824
0
        r->is_md5 = 1;
2825
0
        r->validated_md5 = 1;
2826
1
    } else {
2827
1
        refs_t *refs;
2828
1
        const char *fn;
2829
1
        sam_hrec_tag_t *UR_tag;
2830
2831
278
    no_M5:
2832
        /* Failed to find in search path or M5 cache, see if @SQ UR: tag? */
2833
278
        if (!(UR_tag = sam_hrecs_find_key(ty, "UR", NULL)))
2834
276
            return -1;
2835
2836
2
        if (strstr(UR_tag->str+3, "://") &&
2837
0
            strncmp(UR_tag->str+3, "file:", 5) != 0) {
2838
            // Documented as omitted, but accidentally supported until now
2839
0
            hts_log_error("UR tags pointing to remote files are not supported");
2840
0
            return -1;
2841
0
        }
2842
2843
2
        fn = (strncmp(UR_tag->str+3, "file:", 5) == 0)
2844
2
            ? UR_tag->str+8
2845
2
            : UR_tag->str+3;
2846
2847
2
        if (fd->refs->fp) {
2848
0
            if (bgzf_close(fd->refs->fp) != 0)
2849
0
                return -1;
2850
0
            fd->refs->fp = NULL;
2851
0
        }
2852
2
        if (!(refs = refs_load_fai(fd->refs, fn, 0)))
2853
2
            return -1;
2854
0
        sanitise_SQ_lines(fd);
2855
2856
0
        fd->refs = refs;
2857
0
        if (fd->refs->fp) {
2858
0
            if (bgzf_close(fd->refs->fp) != 0)
2859
0
                return -1;
2860
0
            fd->refs->fp = NULL;
2861
0
        }
2862
2863
0
        if (!fd->refs->fn)
2864
0
            return -1;
2865
2866
0
        if (-1 == refs2id(fd->refs, fd->header))
2867
0
            return -1;
2868
0
        if (!fd->refs->ref_id || !fd->refs->ref_id[id])
2869
0
            return -1;
2870
2871
        // Local copy already, so fall back to cram_get_ref().
2872
0
        return 0;
2873
0
    }
2874
2875
    /* Populate the local disk cache if required */
2876
0
    if (!is_local && local_cache && *local_cache) {
2877
0
        hFILE *fp;
2878
2879
0
        if (expand_cache_path(path, local_cache, tag->str+3) < 0) {
2880
0
            return 0; // Not fatal - we have the data already so keep going.
2881
0
        }
2882
0
        hts_log_info("Writing cache file '%s'", path);
2883
0
        mkdir_prefix(path, 01777);
2884
2885
0
        fp = hts_open_tmpfile(path, "wx", &path_tmp);
2886
0
        if (!fp) {
2887
0
            perror(path_tmp.s);
2888
0
            free(path_tmp.s);
2889
2890
            // Not fatal - we have the data already so keep going.
2891
0
            return 0;
2892
0
        }
2893
2894
        // Check md5sum
2895
0
        hts_md5_context *md5;
2896
0
        char unsigned md5_buf1[16];
2897
0
        char md5_buf2[33];
2898
2899
0
        if (!(md5 = hts_md5_init())) {
2900
0
            hclose_abruptly(fp);
2901
0
            unlink(path_tmp.s);
2902
0
            free(path_tmp.s);
2903
0
            return -1;
2904
0
        }
2905
0
        hts_md5_update(md5, r->seq, r->length);
2906
0
        hts_md5_final(md5_buf1, md5);
2907
0
        hts_md5_destroy(md5);
2908
0
        hts_md5_hex(md5_buf2, md5_buf1);
2909
2910
0
        if (strncmp(tag->str+3, md5_buf2, 32) != 0) {
2911
0
            hts_log_error("Mismatching md5sum for downloaded reference");
2912
0
            hclose_abruptly(fp);
2913
0
            unlink(path_tmp.s);
2914
0
            free(path_tmp.s);
2915
0
            return -1;
2916
0
        }
2917
2918
0
        ssize_t length_written = hwrite(fp, r->seq, r->length);
2919
0
        if (hclose(fp) < 0 || length_written != r->length ||
2920
0
            chmod(path_tmp.s, 0444) < 0 ||
2921
0
            rename(path_tmp.s, path) < 0) {
2922
0
            hts_log_error("Creating reference at %s failed: %s",
2923
0
                          path, strerror(errno));
2924
0
            unlink(path_tmp.s);
2925
0
        }
2926
0
    }
2927
2928
0
    free(path_tmp.s);
2929
0
    return 0;
2930
0
}
2931
2932
42
static void cram_ref_incr_locked(refs_t *r, int id) {
2933
42
    RP("%d INC REF %d, %d %p\n", gettid(), id,
2934
42
       (int)(id>=0 && r->ref_id[id]?r->ref_id[id]->count+1:-999),
2935
42
       id>=0 && r->ref_id[id]?r->ref_id[id]->seq:(char *)1);
2936
2937
42
    if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq)
2938
42
        return;
2939
2940
0
    if (r->last_id == id)
2941
0
        r->last_id = -1;
2942
2943
0
    ++r->ref_id[id]->count;
2944
0
}
2945
2946
42
void cram_ref_incr(refs_t *r, int id) {
2947
42
    pthread_mutex_lock(&r->lock);
2948
42
    cram_ref_incr_locked(r, id);
2949
42
    pthread_mutex_unlock(&r->lock);
2950
42
}
2951
2952
0
static void cram_ref_decr_locked(refs_t *r, int id) {
2953
0
    RP("%d DEC REF %d, %d %p\n", gettid(), id,
2954
0
       (int)(id>=0 && r->ref_id[id]?r->ref_id[id]->count-1:-999),
2955
0
       id>=0 && r->ref_id[id]?r->ref_id[id]->seq:(char *)1);
2956
2957
0
    if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq) {
2958
0
        return;
2959
0
    }
2960
2961
0
    if (--r->ref_id[id]->count <= 0) {
2962
0
        assert(r->ref_id[id]->count == 0);
2963
0
        if (r->last_id >= 0) {
2964
0
            if (r->ref_id[r->last_id]->count <= 0 &&
2965
0
                r->ref_id[r->last_id]->seq) {
2966
0
                RP("%d FREE REF %d (%p)\n", gettid(),
2967
0
                   r->last_id, r->ref_id[r->last_id]->seq);
2968
0
                ref_entry_free_seq(r->ref_id[r->last_id]);
2969
0
                if (r->ref_id[r->last_id]->is_md5) r->ref_id[r->last_id]->length = 0;
2970
0
            }
2971
0
        }
2972
0
        r->last_id = id;
2973
0
    }
2974
0
}
2975
2976
0
void cram_ref_decr(refs_t *r, int id) {
2977
0
    pthread_mutex_lock(&r->lock);
2978
0
    cram_ref_decr_locked(r, id);
2979
0
    pthread_mutex_unlock(&r->lock);
2980
0
}
2981
2982
/*
2983
 * Used by cram_ref_load and cram_get_ref. The file handle will have
2984
 * already been opened, so we can catch it. The ref_entry *e informs us
2985
 * of whether this is a multi-line fasta file or a raw MD5 style file.
2986
 * Either way we create a single contiguous sequence.
2987
 *
2988
 * Returns all or part of a reference sequence on success (malloced);
2989
 *         NULL on failure.
2990
 */
2991
static char *load_ref_portion(BGZF *fp, ref_entry *e,
2992
0
                              hts_pos_t start, hts_pos_t end) {
2993
0
    off_t offset, len;
2994
0
    char *seq;
2995
2996
0
    if (end < start)
2997
0
        end = start;
2998
2999
    /*
3000
     * Compute locations in file. This is trivial for the MD5 files, but
3001
     * is still necessary for the fasta variants.
3002
     *
3003
     * Note the offset here, as with faidx, has the assumption that white-
3004
     * space (the diff between line_length and bases_per_line) only occurs
3005
     * at the end of a line of text.
3006
     */
3007
0
    offset = e->line_length
3008
0
        ? e->offset + (start-1)/e->bases_per_line * e->line_length +
3009
0
          (start-1) % e->bases_per_line
3010
0
        : start-1;
3011
3012
0
    len = (e->line_length
3013
0
           ? e->offset + (end-1)/e->bases_per_line * e->line_length +
3014
0
             (end-1) % e->bases_per_line
3015
0
           : end-1) - offset + 1;
3016
3017
0
    if (bgzf_useek(fp, offset, SEEK_SET) < 0) {
3018
0
        perror("bgzf_useek() on reference file");
3019
0
        return NULL;
3020
0
    }
3021
3022
0
    if (len == 0 || !(seq = malloc(len))) {
3023
0
        return NULL;
3024
0
    }
3025
3026
0
    if (len != bgzf_read(fp, seq, len)) {
3027
0
        perror("bgzf_read() on reference file");
3028
0
        free(seq);
3029
0
        return NULL;
3030
0
    }
3031
3032
    /* Strip white-space if required. */
3033
0
    if (len != end-start+1) {
3034
0
        hts_pos_t i, j;
3035
0
        char *cp = seq;
3036
0
        char *cp_to;
3037
3038
        // Copy up to the first white-space, and then repeatedly just copy
3039
        // bases_per_line verbatim, and use the slow method to end again.
3040
        //
3041
        // This may seem excessive, but this code can be a significant
3042
        // portion of total CRAM decode CPU time for shallow data sets.
3043
0
        for (i = j = 0; i < len; i++) {
3044
0
            if (!isspace_c(cp[i]))
3045
0
                cp[j++] = cp[i] & ~0x20;
3046
0
            else
3047
0
                break;
3048
0
        }
3049
0
        while (i < len && isspace_c(cp[i]))
3050
0
            i++;
3051
0
        while (i < len - e->line_length) {
3052
0
            hts_pos_t j_end = j + e->bases_per_line;
3053
0
            while (j < j_end)
3054
0
                cp[j++] = cp[i++] & ~0x20; // toupper equiv
3055
0
            i += e->line_length - e->bases_per_line;
3056
0
        }
3057
0
        for (; i < len; i++) {
3058
0
            if (!isspace_c(cp[i]))
3059
0
                cp[j++] = cp[i] & ~0x20;
3060
0
        }
3061
3062
0
        cp_to = cp+j;
3063
3064
0
        if (cp_to - seq != end-start+1) {
3065
0
            hts_log_error("Malformed reference file");
3066
0
            free(seq);
3067
0
            return NULL;
3068
0
        }
3069
0
    } else {
3070
0
        int i;
3071
0
        for (i = 0; i < len; i++) {
3072
0
            seq[i] = toupper_c(seq[i]);
3073
0
        }
3074
0
    }
3075
3076
0
    return seq;
3077
0
}
3078
3079
/*
3080
 * Load the entire reference 'id'.
3081
 * This also increments the reference count by 1.
3082
 *
3083
 * Returns ref_entry on success;
3084
 *         NULL on failure
3085
 */
3086
0
ref_entry *cram_ref_load(refs_t *r, int id, int is_md5) {
3087
0
    ref_entry *e = r->ref_id[id];
3088
0
    hts_pos_t start = 1, end = e->length;
3089
0
    char *seq;
3090
3091
0
    if (e->seq) {
3092
0
        return e;
3093
0
    }
3094
3095
0
    assert(e->count == 0);
3096
3097
0
    if (r->last) {
3098
#ifdef REF_DEBUG
3099
        int idx = 0;
3100
        for (idx = 0; idx < r->nref; idx++)
3101
            if (r->last == r->ref_id[idx])
3102
                break;
3103
        RP("%d cram_ref_load DECR %d\n", gettid(), idx);
3104
#endif
3105
0
        assert(r->last->count > 0);
3106
0
        if (--r->last->count <= 0) {
3107
0
            RP("%d FREE REF %d (%p)\n", gettid(), id, r->ref_id[id]->seq);
3108
0
            if (r->last->seq)
3109
0
                ref_entry_free_seq(r->last);
3110
0
        }
3111
0
    }
3112
3113
0
    if (!r->fn)
3114
0
        return NULL;
3115
3116
    /* Open file if it's not already the current open reference */
3117
0
    if (strcmp(r->fn, e->fn) || r->fp == NULL) {
3118
0
        if (r->fp)
3119
0
            if (bgzf_close(r->fp) != 0)
3120
0
                return NULL;
3121
0
        r->fn = e->fn;
3122
0
        if (!(r->fp = bgzf_open_ref(r->fn, "r", is_md5)))
3123
0
            return NULL;
3124
0
    }
3125
3126
0
    RP("%d Loading ref %d (%d..%d)\n", gettid(), id, start, end);
3127
3128
0
    if (!(seq = load_ref_portion(r->fp, e, start, end))) {
3129
0
        return NULL;
3130
0
    }
3131
3132
0
    RP("%d Loaded ref %d (%d..%d) = %p\n", gettid(), id, start, end, seq);
3133
3134
0
    RP("%d INC REF %d, %"PRId64"\n", gettid(), id, (e->count+1));
3135
0
    e->seq = seq;
3136
0
    e->mf = NULL;
3137
0
    e->count++;
3138
3139
    /*
3140
     * Also keep track of last used ref so incr/decr loops on the same
3141
     * sequence don't cause load/free loops.
3142
     */
3143
0
    RP("%d cram_ref_load INCR %d => %"PRId64"\n", gettid(), id, e->count+1);
3144
0
    r->last = e;
3145
0
    e->count++;
3146
3147
0
    return e;
3148
0
}
3149
3150
/*
3151
 * Returns a portion of a reference sequence from start to end inclusive.
3152
 * The returned pointer is owned by either the cram_file fd or by the
3153
 * internal refs_t structure and should not be freed  by the caller.
3154
 *
3155
 * The difference is whether or not this refs_t is in use by just the one
3156
 * cram_fd or by multiples, or whether we have multiple threads accessing
3157
 * references. In either case fd->shared will be true and we start using
3158
 * reference counting to track the number of users of a specific reference
3159
 * sequence.
3160
 *
3161
 * Otherwise the ref seq returned is allocated as part of cram_fd itself
3162
 * and will be freed up on the next call to cram_get_ref or cram_close.
3163
 *
3164
 * To return the entire reference sequence, specify start as 1 and end
3165
 * as 0.
3166
 *
3167
 * To cease using a reference, call cram_ref_decr().
3168
 *
3169
 * Returns reference on success,
3170
 *         NULL on failure
3171
 */
3172
3.24k
char *cram_get_ref(cram_fd *fd, int id, hts_pos_t start, hts_pos_t end) {
3173
3.24k
    ref_entry *r;
3174
3.24k
    char *seq;
3175
3.24k
    int ostart = start;
3176
3177
3.24k
    if (id == -1 || start < 1)
3178
2.96k
        return NULL;
3179
3180
    /* FIXME: axiomatic query of r->seq being true?
3181
     * Or shortcut for unsorted data where we load once and never free?
3182
     */
3183
3184
    //fd->shared_ref = 1; // hard code for now to simplify things
3185
3186
284
    pthread_mutex_lock(&fd->ref_lock);
3187
3188
284
    RP("%d cram_get_ref on fd %p, id %d, range %d..%d\n", gettid(), fd, id, start, end);
3189
3190
    /*
3191
     * Unsorted data implies we want to fetch an entire reference at a time.
3192
     * We just deal with this at the moment by claiming we're sharing
3193
     * references instead, which has the same requirement.
3194
     */
3195
284
    if (fd->unsorted)
3196
0
        fd->shared_ref = 1;
3197
3198
3199
    /* Sanity checking: does this ID exist? */
3200
284
    if (!fd->refs || id < 0 || id >= fd->refs->nref || !fd->refs->ref_id[id]) {
3201
6
        hts_log_error("No reference found for id %d", id);
3202
6
        pthread_mutex_unlock(&fd->ref_lock);
3203
6
        return NULL;
3204
6
    }
3205
3206
278
    r = fd->refs->ref_id[id];
3207
3208
    /*
3209
     * It has an entry, but may not have been populated yet.
3210
     * Any manually loaded .fai files have their lengths known.
3211
     * A ref entry computed from @SQ lines (M5 or UR field) will have
3212
     * r->length == 0 unless it's been loaded once and verified that we have
3213
     * an on-disk filename for it.
3214
     *
3215
     * 19 Sep 2013: Moved the lock here as the cram_populate_ref code calls
3216
     * open_path_mfile and libcurl, which isn't multi-thread safe unless I
3217
     * rewrite my code to have one curl handle per thread.
3218
     */
3219
278
    pthread_mutex_lock(&fd->refs->lock);
3220
278
    if (r->length == 0) {
3221
278
        if (fd->ref_fn)
3222
0
            hts_log_warning("Reference file given, but ref '%s' not present",
3223
278
                            r->name);
3224
278
        if (cram_populate_ref(fd, id, r) == -1) {
3225
278
            hts_log_warning("Failed to populate reference \"%s\"",
3226
278
                            r->name);
3227
278
            hts_log_warning("See https://www.htslib.org/doc/reference_seqs.html for further suggestions");
3228
278
            pthread_mutex_unlock(&fd->refs->lock);
3229
278
            pthread_mutex_unlock(&fd->ref_lock);
3230
278
            return NULL;
3231
278
        }
3232
0
        r = fd->refs->ref_id[id];
3233
0
        if (fd->unsorted)
3234
0
            cram_ref_incr_locked(fd->refs, id);
3235
0
    }
3236
3237
3238
    /*
3239
     * We now know that we the filename containing the reference, so check
3240
     * for limits. If it's over half the reference we'll load all of it in
3241
     * memory as this will speed up subsequent calls.
3242
     */
3243
0
    if (end < 1)
3244
0
        end = r->length;
3245
0
    if (end >= r->length)
3246
0
        end  = r->length;
3247
3248
0
    if (end - start >= 0.5*r->length || fd->shared_ref) {
3249
0
        start = 1;
3250
0
        end = r->length;
3251
0
    }
3252
3253
    /*
3254
     * Maybe we have it cached already? If so use it.
3255
     *
3256
     * Alternatively if we don't have the sequence but we're sharing
3257
     * references and/or are asking for the entire length of it, then
3258
     * load the full reference into the refs structure and return
3259
     * a pointer to that one instead.
3260
     */
3261
0
    if (fd->shared_ref || r->seq || (start == 1 && end == r->length)) {
3262
0
        char *cp;
3263
3264
0
        if (id >= 0) {
3265
0
            if (r->seq) {
3266
0
                cram_ref_incr_locked(fd->refs, id);
3267
0
            } else {
3268
0
                ref_entry *e;
3269
0
                if (!(e = cram_ref_load(fd->refs, id, r->is_md5))) {
3270
0
                    pthread_mutex_unlock(&fd->refs->lock);
3271
0
                    pthread_mutex_unlock(&fd->ref_lock);
3272
0
                    return NULL;
3273
0
                }
3274
3275
                /* unsorted data implies cache ref indefinitely, to avoid
3276
                 * continually loading and unloading.
3277
                 */
3278
0
                if (fd->unsorted)
3279
0
                    cram_ref_incr_locked(fd->refs, id);
3280
0
            }
3281
3282
0
            fd->ref = NULL; /* We never access it directly */
3283
0
            fd->ref_start = 1;
3284
0
            fd->ref_end   = r->length;
3285
0
            fd->ref_id    = id;
3286
3287
0
            cp = fd->refs->ref_id[id]->seq + ostart-1;
3288
0
        } else {
3289
0
            fd->ref = NULL;
3290
0
            cp = NULL;
3291
0
        }
3292
3293
0
        RP("%d cram_get_ref returning for id %d, count %d\n", gettid(), id, (int)r->count);
3294
3295
0
        pthread_mutex_unlock(&fd->refs->lock);
3296
0
        pthread_mutex_unlock(&fd->ref_lock);
3297
0
        return cp;
3298
0
    }
3299
3300
    /*
3301
     * Otherwise we're not sharing, we don't have a copy of it already and
3302
     * we're only asking for a small portion of it.
3303
     *
3304
     * In this case load up just that segment ourselves, freeing any old
3305
     * small segments in the process.
3306
     */
3307
3308
    /* Unmapped ref ID */
3309
0
    if (id < 0 || !fd->refs->fn) {
3310
0
        if (fd->ref_free) {
3311
0
            free(fd->ref_free);
3312
0
            fd->ref_free = NULL;
3313
0
        }
3314
0
        fd->ref = NULL;
3315
0
        fd->ref_id = id;
3316
0
        pthread_mutex_unlock(&fd->refs->lock);
3317
0
        pthread_mutex_unlock(&fd->ref_lock);
3318
0
        return NULL;
3319
0
    }
3320
3321
    /* Open file if it's not already the current open reference */
3322
0
    if (strcmp(fd->refs->fn, r->fn) || fd->refs->fp == NULL) {
3323
0
        if (fd->refs->fp)
3324
0
            if (bgzf_close(fd->refs->fp) != 0)
3325
0
                return NULL;
3326
0
        fd->refs->fn = r->fn;
3327
0
        if (!(fd->refs->fp = bgzf_open_ref(fd->refs->fn, "r", r->is_md5))) {
3328
0
            pthread_mutex_unlock(&fd->refs->lock);
3329
0
            pthread_mutex_unlock(&fd->ref_lock);
3330
0
            return NULL;
3331
0
        }
3332
0
    }
3333
3334
0
    if (!(fd->ref = load_ref_portion(fd->refs->fp, r, start, end))) {
3335
0
        pthread_mutex_unlock(&fd->refs->lock);
3336
0
        pthread_mutex_unlock(&fd->ref_lock);
3337
0
        return NULL;
3338
0
    }
3339
3340
0
    if (fd->ref_free)
3341
0
        free(fd->ref_free);
3342
3343
0
    fd->ref_id    = id;
3344
0
    fd->ref_start = start;
3345
0
    fd->ref_end   = end;
3346
0
    fd->ref_free = fd->ref;
3347
0
    seq = fd->ref;
3348
3349
0
    pthread_mutex_unlock(&fd->refs->lock);
3350
0
    pthread_mutex_unlock(&fd->ref_lock);
3351
3352
0
    return seq ? seq + ostart - start : NULL;
3353
0
}
3354
3355
/*
3356
 * If fd has been opened for reading, it may be permitted to specify 'fn'
3357
 * as NULL and let the code auto-detect the reference by parsing the
3358
 * SAM header @SQ lines.
3359
 */
3360
0
int cram_load_reference(cram_fd *fd, char *fn) {
3361
0
    int ret = 0;
3362
3363
0
    if (fn) {
3364
0
        fd->refs = refs_load_fai(fd->refs, fn,
3365
0
                                 !(fd->embed_ref>0 && fd->mode == 'r'));
3366
0
        fn = fd->refs ? fd->refs->fn : NULL;
3367
0
        if (!fn)
3368
0
            ret = -1;
3369
0
        sanitise_SQ_lines(fd);
3370
0
    }
3371
0
    fd->ref_fn = fn;
3372
3373
0
    if ((!fd->refs || (fd->refs->nref == 0 && !fn)) && fd->header) {
3374
0
        if (fd->refs)
3375
0
            refs_free(fd->refs);
3376
0
        if (!(fd->refs = refs_create()))
3377
0
            return -1;
3378
0
        if (-1 == refs_from_header(fd))
3379
0
            return -1;
3380
0
    }
3381
3382
0
    if (fd->header)
3383
0
        if (-1 == refs2id(fd->refs, fd->header))
3384
0
            return -1;
3385
3386
0
    return ret;
3387
0
}
3388
3389
/* ----------------------------------------------------------------------
3390
 * Containers
3391
 */
3392
3393
/*
3394
 * Creates a new container, specifying the maximum number of slices
3395
 * and records permitted.
3396
 *
3397
 * Returns cram_container ptr on success
3398
 *         NULL on failure
3399
 */
3400
8.64k
cram_container *cram_new_container(int nrec, int nslice) {
3401
8.64k
    cram_container *c = calloc(1, sizeof(*c));
3402
8.64k
    enum cram_DS_ID id;
3403
3404
8.64k
    if (!c)
3405
0
        return NULL;
3406
3407
8.64k
    c->curr_ref = -2;
3408
3409
8.64k
    c->max_c_rec = nrec * nslice;
3410
8.64k
    c->curr_c_rec = 0;
3411
3412
8.64k
    c->max_rec = nrec;
3413
8.64k
    c->record_counter = 0;
3414
8.64k
    c->num_bases = 0;
3415
8.64k
    c->s_num_bases = 0;
3416
3417
8.64k
    c->max_slice = nslice;
3418
8.64k
    c->curr_slice = 0;
3419
3420
8.64k
    c->pos_sorted = 1;
3421
8.64k
    c->max_apos   = 0;
3422
8.64k
    c->multi_seq  = 0;
3423
8.64k
    c->qs_seq_orient = 1;
3424
8.64k
    c->no_ref = 0;
3425
8.64k
    c->embed_ref = -1; // automatic selection
3426
3427
8.64k
    c->bams = NULL;
3428
3429
8.64k
    if (!(c->slices = calloc(nslice != 0 ? nslice : 1, sizeof(cram_slice *))))
3430
0
        goto err;
3431
8.64k
    c->slice = NULL;
3432
3433
8.64k
    if (!(c->comp_hdr = cram_new_compression_header()))
3434
0
        goto err;
3435
8.64k
    c->comp_hdr_block = NULL;
3436
3437
250k
    for (id = DS_RN; id < DS_TN; id++)
3438
242k
        if (!(c->stats[id] = cram_stats_create())) goto err;
3439
3440
    //c->aux_B_stats = cram_stats_create();
3441
3442
8.64k
    if (!(c->tags_used = kh_init(m_tagmap)))
3443
0
        goto err;
3444
8.64k
    c->refs_used = 0;
3445
8.64k
    c->ref_free = 0;
3446
3447
8.64k
    return c;
3448
3449
0
 err:
3450
0
    if (c) {
3451
0
        if (c->slices)
3452
0
            free(c->slices);
3453
0
        free(c);
3454
0
    }
3455
0
    return NULL;
3456
8.64k
}
3457
3458
2.62k
static void free_bam_list(bam_seq_t *bams, int max_rec) {
3459
2.62k
    int i;
3460
26.2M
    for (i = 0; i < max_rec; i++)
3461
26.2M
        bam_free(&bams[i]);
3462
3463
2.62k
    free(bams);
3464
2.62k
}
3465
3466
19.6k
void cram_free_container(cram_container *c) {
3467
19.6k
    enum cram_DS_ID id;
3468
19.6k
    int i;
3469
3470
19.6k
    if (!c)
3471
0
        return;
3472
3473
19.6k
    if (c->refs_used)
3474
9
        free(c->refs_used);
3475
3476
19.6k
    if (c->landmark)
3477
9.48k
        free(c->landmark);
3478
3479
19.6k
    if (c->comp_hdr)
3480
8.98k
        cram_free_compression_header(c->comp_hdr);
3481
3482
19.6k
    if (c->comp_hdr_block)
3483
8.06k
        cram_free_block(c->comp_hdr_block);
3484
3485
    // Free the slices; filled out by encoder only
3486
19.6k
    if (c->slices) {
3487
11.8k
        for (i = 0; i < c->max_slice; i++) {
3488
3.20k
            if (c->slices[i])
3489
2.62k
                cram_free_slice(c->slices[i]);
3490
3.20k
            if (c->slices[i] == c->slice)
3491
3.20k
                c->slice = NULL;
3492
3.20k
        }
3493
8.64k
        free(c->slices);
3494
8.64k
    }
3495
3496
    // Free the current slice; set by both encoder & decoder
3497
19.6k
    if (c->slice) {
3498
0
        cram_free_slice(c->slice);
3499
0
        c->slice = NULL;
3500
0
    }
3501
3502
570k
    for (id = DS_RN; id < DS_TN; id++)
3503
550k
        if (c->stats[id]) cram_stats_free(c->stats[id]);
3504
3505
    //if (c->aux_B_stats) cram_stats_free(c->aux_B_stats);
3506
3507
19.6k
    if (c->tags_used) {
3508
8.64k
        khint_t k;
3509
3510
13.3k
        for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
3511
4.72k
            if (!kh_exist(c->tags_used, k))
3512
2.73k
                continue;
3513
3514
1.98k
            cram_tag_map *tm = (cram_tag_map *)kh_val(c->tags_used, k);
3515
1.98k
            if (tm) {
3516
1.98k
                cram_codec *c = tm->codec;
3517
3518
1.98k
                if (c) c->free(c);
3519
3520
                // If tm->blk or tm->blk2 is set, then we haven't yet got to
3521
                // cram_encode_container which copies the blocks to s->aux_block
3522
                // and NULLifies tm->blk*.  In this case we failed to complete
3523
                // the container construction, so we have to free up our partially
3524
                // converted CRAM.
3525
1.98k
                cram_free_block(tm->blk);
3526
1.98k
                cram_free_block(tm->blk2);
3527
1.98k
                free(tm);
3528
1.98k
            }
3529
1.98k
        }
3530
3531
8.64k
        kh_destroy(m_tagmap, c->tags_used);
3532
8.64k
    }
3533
3534
19.6k
    if (c->ref_free)
3535
152
        free(c->ref);
3536
3537
19.6k
    if (c->bams)
3538
292
        free_bam_list(c->bams, c->max_c_rec);
3539
3540
19.6k
    free(c);
3541
19.6k
}
3542
3543
/*
3544
 * Reads a container header.
3545
 *
3546
 * Returns cram_container on success
3547
 *         NULL on failure or no container left (fd->err == 0).
3548
 */
3549
11.3k
cram_container *cram_read_container(cram_fd *fd) {
3550
11.3k
    cram_container c2, *c;
3551
11.3k
    int i, s;
3552
11.3k
    size_t rd = 0;
3553
11.3k
    uint32_t crc = 0;
3554
3555
11.3k
    fd->err = 0;
3556
11.3k
    fd->eof = 0;
3557
3558
11.3k
    memset(&c2, 0, sizeof(c2));
3559
11.3k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
3560
8.67k
        if ((s = fd->vv.varint_decode32_crc(fd, &c2.length, &crc)) == -1) {
3561
27
            fd->eof = fd->empty_container ? 1 : 2;
3562
27
            return NULL;
3563
8.64k
        } else {
3564
8.64k
            rd+=s;
3565
8.64k
        }
3566
8.67k
    } else if (CRAM_MAJOR_VERS(fd->version) < 4) {
3567
2.33k
        uint32_t len;
3568
2.33k
        if ((s = int32_decode(fd, &c2.length)) == -1) {
3569
21
            if (CRAM_MAJOR_VERS(fd->version) == 2 &&
3570
8
                CRAM_MINOR_VERS(fd->version) == 0)
3571
4
                fd->eof = 1; // EOF blocks arrived in v2.1
3572
17
            else
3573
17
                fd->eof = fd->empty_container ? 1 : 2;
3574
21
            return NULL;
3575
2.31k
        } else {
3576
2.31k
            rd+=s;
3577
2.31k
        }
3578
2.31k
        len = le_int4(c2.length);
3579
2.31k
        crc = crc32(0L, (unsigned char *)&len, 4);
3580
2.31k
    }
3581
11.2k
    if ((s = fd->vv.varint_decode32s_crc(fd, &c2.ref_seq_id, &crc))   == -1) return NULL; else rd+=s;
3582
11.2k
    int32_t i32;
3583
11.2k
    if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc))== -1) return NULL; else rd+=s;
3584
11.1k
    c2.ref_seq_start = i32;
3585
11.1k
    if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc)) == -1) return NULL; else rd+=s;
3586
11.1k
    c2.ref_seq_span = i32;
3587
11.1k
    if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_records, &crc))  == -1) return NULL; else rd+=s;
3588
3589
11.1k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
3590
8.58k
        c2.record_counter = 0;
3591
8.58k
        c2.num_bases = 0;
3592
8.58k
    } else {
3593
2.56k
        if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3594
1.25k
            if ((s = fd->vv.varint_decode64_crc(fd, &c2.record_counter, &crc)) == -1)
3595
21
                return NULL;
3596
1.23k
            else
3597
1.23k
                rd += s;
3598
1.30k
        } else {
3599
1.30k
            int32_t i32;
3600
1.30k
            if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc)) == -1)
3601
4
                return NULL;
3602
1.30k
            else
3603
1.30k
                rd += s;
3604
1.30k
            c2.record_counter = i32;
3605
1.30k
        }
3606
3607
2.53k
        if ((s = fd->vv.varint_decode64_crc(fd, &c2.num_bases, &crc))== -1)
3608
35
            return NULL;
3609
2.50k
        else
3610
2.50k
            rd += s;
3611
2.53k
    }
3612
11.0k
    if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_blocks, &crc))   == -1)
3613
15
        return NULL;
3614
11.0k
    else
3615
11.0k
        rd+=s;
3616
11.0k
    if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_landmarks, &crc))== -1)
3617
35
        return NULL;
3618
11.0k
    else
3619
11.0k
        rd+=s;
3620
3621
11.0k
    if (c2.num_landmarks < 0 || c2.num_landmarks >= SIZE_MAX / sizeof(int32_t))
3622
25
        return NULL;
3623
3624
11.0k
    if (!(c = calloc(1, sizeof(*c))))
3625
0
        return NULL;
3626
3627
11.0k
    *c = c2;
3628
11.0k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
3629
11.0k
    if (c->num_landmarks > FUZZ_ALLOC_LIMIT/sizeof(int32_t)) {
3630
5
        fd->err = errno = ENOMEM;
3631
5
        cram_free_container(c);
3632
5
        return NULL;
3633
5
    }
3634
11.0k
#endif
3635
11.0k
    if (c->num_landmarks && !(c->landmark = hts_malloc_p(sizeof(*c->landmark), c->num_landmarks))) {
3636
0
        fd->err = errno;
3637
0
        cram_free_container(c);
3638
0
        return NULL;
3639
0
    }
3640
45.7k
    for (i = 0; i < c->num_landmarks; i++) {
3641
34.8k
        if ((s = fd->vv.varint_decode32_crc(fd, &c->landmark[i], &crc)) == -1) {
3642
158
            cram_free_container(c);
3643
158
            return NULL;
3644
34.7k
        } else {
3645
34.7k
            rd += s;
3646
34.7k
        }
3647
34.8k
    }
3648
3649
10.8k
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3650
1.11k
        if (-1 == int32_decode(fd, (int32_t *)&c->crc32)) {
3651
5
            cram_free_container(c);
3652
5
            return NULL;
3653
1.11k
        } else {
3654
1.11k
            rd+=4;
3655
1.11k
        }
3656
3657
1.11k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
3658
        // Pretend the CRC was OK so the fuzzer doesn't have to get it right
3659
1.11k
        crc = c->crc32;
3660
1.11k
#endif
3661
3662
1.11k
        if (crc != c->crc32) {
3663
0
            hts_log_error("Container header CRC32 failure");
3664
0
            cram_free_container(c);
3665
0
            return NULL;
3666
0
        }
3667
1.11k
    }
3668
3669
10.8k
    c->offset = rd;
3670
10.8k
    c->slices = NULL;
3671
10.8k
    c->slice = NULL;
3672
10.8k
    c->curr_slice = 0;
3673
10.8k
    c->max_slice = c->num_landmarks;
3674
10.8k
    c->slice_rec = 0;
3675
10.8k
    c->curr_rec = 0;
3676
10.8k
    c->max_rec = 0;
3677
3678
10.8k
    if (c->ref_seq_id == -2) {
3679
12
        c->multi_seq = 1;
3680
12
        fd->multi_seq = 1;
3681
12
    }
3682
3683
10.8k
    fd->empty_container =
3684
10.8k
        (c->num_records == 0 &&
3685
5.37k
         c->ref_seq_id == -1 &&
3686
10.8k
         c->ref_seq_start == 0x454f46 /* EOF */) ? 1 : 0;
3687
3688
10.8k
    return c;
3689
10.8k
}
3690
3691
3692
/* MAXIMUM storage size needed for the container. */
3693
0
int cram_container_size(cram_container *c) {
3694
0
    return 55 + 5*c->num_landmarks;
3695
0
}
3696
3697
3698
/*
3699
 * Stores the container structure in dat and returns *size as the
3700
 * number of bytes written to dat[].  The input size of dat is also
3701
 * held in *size and should be initialised to cram_container_size(c).
3702
 *
3703
 * Returns 0 on success;
3704
 *        -1 on failure
3705
 */
3706
int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size)
3707
0
{
3708
0
    char *cp = (char *)dat;
3709
0
    int i;
3710
3711
    // Check the input buffer is large enough according to our stated
3712
    // requirements. (NOTE: it may actually take less.)
3713
0
    if (cram_container_size(c) > *size)
3714
0
        return -1;
3715
3716
0
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
3717
0
        cp += itf8_put(cp, c->length);
3718
0
    } else {
3719
0
        *(int32_t *)cp = le_int4(c->length);
3720
0
        cp += 4;
3721
0
    }
3722
0
    if (c->multi_seq) {
3723
0
        cp += fd->vv.varint_put32(cp, NULL, -2);
3724
0
        cp += fd->vv.varint_put32(cp, NULL, 0);
3725
0
        cp += fd->vv.varint_put32(cp, NULL, 0);
3726
0
    } else {
3727
0
        cp += fd->vv.varint_put32s(cp, NULL, c->ref_seq_id);
3728
0
        cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_start);
3729
0
        cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_span);
3730
0
    }
3731
0
    cp += fd->vv.varint_put32(cp, NULL, c->num_records);
3732
0
    if (CRAM_MAJOR_VERS(fd->version) == 2) {
3733
0
        cp += fd->vv.varint_put64(cp, NULL, c->record_counter);
3734
0
    } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3735
0
        cp += fd->vv.varint_put32(cp, NULL, c->record_counter);
3736
0
    }
3737
0
    cp += fd->vv.varint_put64(cp, NULL, c->num_bases);
3738
0
    cp += fd->vv.varint_put32(cp, NULL, c->num_blocks);
3739
0
    cp += fd->vv.varint_put32(cp, NULL, c->num_landmarks);
3740
0
    for (i = 0; i < c->num_landmarks; i++)
3741
0
        cp += fd->vv.varint_put32(cp, NULL, c->landmark[i]);
3742
3743
0
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3744
0
        c->crc32 = crc32(0L, (uc *)dat, cp-dat);
3745
0
        cp[0] =  c->crc32        & 0xff;
3746
0
        cp[1] = (c->crc32 >>  8) & 0xff;
3747
0
        cp[2] = (c->crc32 >> 16) & 0xff;
3748
0
        cp[3] = (c->crc32 >> 24) & 0xff;
3749
0
        cp += 4;
3750
0
    }
3751
3752
0
    *size = cp-dat; // actual used size
3753
3754
0
    return 0;
3755
0
}
3756
3757
3758
/*
3759
 * Writes a container structure.
3760
 *
3761
 * Returns 0 on success
3762
 *        -1 on failure
3763
 */
3764
16.8k
int cram_write_container(cram_fd *fd, cram_container *c) {
3765
16.8k
    char buf_a[1024], *buf = buf_a, *cp;
3766
16.8k
    int i;
3767
3768
16.8k
    if (61 + c->num_landmarks * 10 >= 1024) {
3769
0
        buf = malloc(61 + c->num_landmarks * 10);
3770
0
        if (!buf)
3771
0
            return -1;
3772
0
    }
3773
16.8k
    cp = buf;
3774
3775
16.8k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
3776
0
        cp += itf8_put(cp, c->length);
3777
16.8k
    } else if (CRAM_MAJOR_VERS(fd->version) <= 3) {
3778
16.8k
        *(int32_t *)cp = le_int4(c->length);
3779
16.8k
        cp += 4;
3780
16.8k
    } else {
3781
0
        cp += fd->vv.varint_put32(cp, NULL, c->length);
3782
0
    }
3783
16.8k
    if (c->multi_seq) {
3784
8
        cp += fd->vv.varint_put32(cp, NULL, (uint32_t)-2);
3785
8
        cp += fd->vv.varint_put32(cp, NULL, 0);
3786
8
        cp += fd->vv.varint_put32(cp, NULL, 0);
3787
16.7k
    } else {
3788
16.7k
        cp += fd->vv.varint_put32s(cp, NULL, c->ref_seq_id);
3789
16.7k
        cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_start);
3790
16.7k
        cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_span);
3791
16.7k
    }
3792
16.8k
    cp += fd->vv.varint_put32(cp, NULL, c->num_records);
3793
16.8k
    if (CRAM_MAJOR_VERS(fd->version) >= 3)
3794
16.8k
        cp += fd->vv.varint_put64(cp, NULL, c->record_counter);
3795
0
    else
3796
0
        cp += fd->vv.varint_put32(cp, NULL, c->record_counter);
3797
16.8k
    cp += fd->vv.varint_put64(cp, NULL, c->num_bases);
3798
16.8k
    cp += fd->vv.varint_put32(cp, NULL, c->num_blocks);
3799
16.8k
    cp += fd->vv.varint_put32(cp, NULL, c->num_landmarks);
3800
30.5k
    for (i = 0; i < c->num_landmarks; i++)
3801
13.7k
        cp += fd->vv.varint_put32(cp, NULL, c->landmark[i]);
3802
3803
16.8k
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3804
16.8k
        c->crc32 = crc32(0L, (uc *)buf, cp-buf);
3805
16.8k
        cp[0] =  c->crc32        & 0xff;
3806
16.8k
        cp[1] = (c->crc32 >>  8) & 0xff;
3807
16.8k
        cp[2] = (c->crc32 >> 16) & 0xff;
3808
16.8k
        cp[3] = (c->crc32 >> 24) & 0xff;
3809
16.8k
        cp += 4;
3810
16.8k
    }
3811
3812
16.8k
    if (cp-buf != hwrite(fd->fp, buf, cp-buf)) {
3813
0
        if (buf != buf_a)
3814
0
            free(buf);
3815
0
        return -1;
3816
0
    }
3817
3818
16.8k
    if (buf != buf_a)
3819
0
        free(buf);
3820
3821
16.8k
    return 0;
3822
16.8k
}
3823
3824
// common component shared by cram_flush_container{,_mt}
3825
2.89k
static int cram_flush_container2(cram_fd *fd, cram_container *c) {
3826
2.89k
    int i, j;
3827
3828
2.89k
    if (c->curr_slice > 0 && !c->slices)
3829
0
        return -1;
3830
3831
    //fprintf(stderr, "Writing container %d, sum %u\n", c->record_counter, sum);
3832
3833
2.89k
    off_t c_offset = htell(fd->fp); // File offset of container
3834
3835
    /* Write the container struct itself */
3836
2.89k
    if (0 != cram_write_container(fd, c))
3837
0
        return -1;
3838
3839
2.89k
    off_t hdr_size = htell(fd->fp) - c_offset;
3840
3841
    /* And the compression header */
3842
2.89k
    if (0 != cram_write_block(fd, c->comp_hdr_block))
3843
0
        return -1;
3844
3845
    /* Followed by the slice blocks */
3846
2.89k
    off_t file_offset = htell(fd->fp);
3847
5.78k
    for (i = 0; i < c->curr_slice; i++) {
3848
2.89k
        cram_slice *s = c->slices[i];
3849
2.89k
        off_t spos = file_offset - c_offset - hdr_size;
3850
3851
2.89k
        if (0 != cram_write_block(fd, s->hdr_block))
3852
0
            return -1;
3853
3854
16.2k
        for (j = 0; j < s->hdr->num_blocks; j++) {
3855
13.3k
            if (0 != cram_write_block(fd, s->block[j]))
3856
0
                return -1;
3857
13.3k
        }
3858
3859
2.89k
        file_offset = htell(fd->fp);
3860
2.89k
        off_t sz = file_offset - c_offset - hdr_size - spos;
3861
3862
2.89k
        if (fd->idxfp) {
3863
0
            if (cram_index_slice(fd, c, s, fd->idxfp, c_offset, spos, sz) < 0)
3864
0
                return -1;
3865
0
        }
3866
2.89k
    }
3867
3868
2.89k
    return 0;
3869
2.89k
}
3870
3871
/*
3872
 * Flushes a completely or partially full container to disk, writing
3873
 * container structure, header and blocks. This also calls the encoder
3874
 * functions.
3875
 *
3876
 * Returns 0 on success
3877
 *        -1 on failure
3878
 */
3879
3.20k
int cram_flush_container(cram_fd *fd, cram_container *c) {
3880
    /* Encode the container blocks and generate compression header */
3881
3.20k
    if (0 != cram_encode_container(fd, c))
3882
309
        return -1;
3883
3884
2.89k
    return cram_flush_container2(fd, c);
3885
3.20k
}
3886
3887
typedef struct {
3888
    cram_fd *fd;
3889
    cram_container *c;
3890
} cram_job;
3891
3892
0
void *cram_flush_thread(void *arg) {
3893
0
    cram_job *j = (cram_job *)arg;
3894
3895
    /* Encode the container blocks and generate compression header */
3896
0
    if (0 != cram_encode_container(j->fd, j->c)) {
3897
0
        hts_log_error("Call to cram_encode_container failed");
3898
0
        return NULL;
3899
0
    }
3900
3901
0
    return arg;
3902
0
}
3903
3904
0
static int cram_flush_result(cram_fd *fd) {
3905
0
    int i, ret = 0;
3906
0
    hts_tpool_result *r;
3907
0
    cram_container *lc = NULL;
3908
3909
    // NB: we can have one result per slice, not per container,
3910
    // so we need to free the container only after all slices
3911
    // within it have been freed.  (Automatic via reference counting.)
3912
0
    while ((r = hts_tpool_next_result(fd->rqueue))) {
3913
0
        cram_job *j = (cram_job *)hts_tpool_result_data(r);
3914
0
        cram_container *c;
3915
3916
0
        if (!j) {
3917
0
            hts_tpool_delete_result(r, 0);
3918
0
            return -1;
3919
0
        }
3920
3921
0
        fd = j->fd;
3922
0
        c = j->c;
3923
3924
0
        if (fd->mode == 'w')
3925
0
            if (0 != cram_flush_container2(fd, c))
3926
0
                return -1;
3927
3928
        // Free the slices; filled out by encoder only
3929
0
        if (c->slices) {
3930
0
            for (i = 0; i < c->max_slice; i++) {
3931
0
                if (c->slices[i])
3932
0
                    cram_free_slice(c->slices[i]);
3933
0
                if (c->slices[i] == c->slice)
3934
0
                    c->slice = NULL;
3935
0
                c->slices[i] = NULL;
3936
0
            }
3937
0
        }
3938
3939
        // Free the current slice; set by both encoder & decoder
3940
0
        if (c->slice) {
3941
0
            cram_free_slice(c->slice);
3942
0
            c->slice = NULL;
3943
0
        }
3944
0
        c->curr_slice = 0;
3945
3946
        // Our jobs will be in order, so we free the last
3947
        // container when our job has switched to a new one.
3948
0
        if (c != lc) {
3949
0
            if (lc) {
3950
0
                if (fd->ctr == lc)
3951
0
                    fd->ctr = NULL;
3952
0
                if (fd->ctr_mt == lc)
3953
0
                    fd->ctr_mt = NULL;
3954
0
                cram_free_container(lc);
3955
0
            }
3956
0
            lc = c;
3957
0
        }
3958
3959
0
        hts_tpool_delete_result(r, 1);
3960
0
    }
3961
0
    if (lc) {
3962
0
        if (fd->ctr == lc)
3963
0
            fd->ctr = NULL;
3964
0
        if (fd->ctr_mt == lc)
3965
0
            fd->ctr_mt = NULL;
3966
0
        cram_free_container(lc);
3967
0
    }
3968
3969
0
    return ret;
3970
0
}
3971
3972
// Note: called while metrics_lock is held.
3973
// Will be left in this state too, but may temporarily unlock.
3974
18
void reset_metrics(cram_fd *fd) {
3975
18
    int i;
3976
3977
18
    if (fd->pool) {
3978
        // If multi-threaded we have multiple blocks being
3979
        // compressed already and several on the to-do list
3980
        // (fd->rqueue->pending).  It's tricky to reset the
3981
        // metrics exactly the correct point, so instead we
3982
        // just flush the pool, reset, and then continue again.
3983
3984
        // Don't bother starting a new trial before then though.
3985
0
        for (i = 0; i < DS_END; i++) {
3986
0
            cram_metrics *m = fd->m[i];
3987
0
            if (!m)
3988
0
                continue;
3989
0
            m->next_trial = 999;
3990
0
        }
3991
3992
0
        pthread_mutex_unlock(&fd->metrics_lock);
3993
0
        hts_tpool_process_flush(fd->rqueue);
3994
0
        pthread_mutex_lock(&fd->metrics_lock);
3995
0
    }
3996
3997
864
    for (i = 0; i < DS_END; i++) {
3998
846
        cram_metrics *m = fd->m[i];
3999
846
        if (!m)
4000
0
            continue;
4001
4002
846
        m->trial = NTRIALS;
4003
846
        m->next_trial = TRIAL_SPAN;
4004
846
        m->revised_method = 0;
4005
846
        m->unpackable = 0;
4006
4007
846
        memset(m->sz, 0, sizeof(m->sz));
4008
846
    }
4009
18
}
4010
4011
3.20k
int cram_flush_container_mt(cram_fd *fd, cram_container *c) {
4012
3.20k
    cram_job *j;
4013
4014
    // At the junction of mapped to unmapped data the compression
4015
    // methods may need to change due to very different statistical
4016
    // properties; particularly BA if minhash sorted.
4017
    //
4018
    // However with threading we'll have several in-flight blocks
4019
    // arriving out of order.
4020
    //
4021
    // So we do one trial reset of NThreads to last for NThreads
4022
    // duration to get us over this transition period, followed
4023
    // by another retrial of the usual ntrials & trial span.
4024
3.20k
    pthread_mutex_lock(&fd->metrics_lock);
4025
3.20k
    if (c->n_mapped < 0.3*c->curr_rec &&
4026
2.87k
        fd->last_mapped > 0.7*c->max_rec) {
4027
18
        reset_metrics(fd);
4028
18
    }
4029
3.20k
    fd->last_mapped = c->n_mapped * (c->max_rec+1)/(c->curr_rec+1) ;
4030
3.20k
    pthread_mutex_unlock(&fd->metrics_lock);
4031
4032
3.20k
    if (!fd->pool)
4033
3.20k
        return cram_flush_container(fd, c);
4034
4035
0
    if (!(j = malloc(sizeof(*j))))
4036
0
        return -1;
4037
0
    j->fd = fd;
4038
0
    j->c = c;
4039
4040
    // Flush the job.  Note our encoder queue may be full, so we
4041
    // either have to keep trying in non-blocking mode (what we do) or
4042
    // use a dedicated separate thread for draining the queue.
4043
0
    for (;;) {
4044
0
        errno = 0;
4045
0
        hts_tpool_dispatch2(fd->pool, fd->rqueue, cram_flush_thread, j, 1);
4046
0
        int pending = (errno == EAGAIN);
4047
0
        if (cram_flush_result(fd) != 0)
4048
0
            return -1;
4049
0
        if (!pending)
4050
0
            break;
4051
4052
0
        hts_usleep(1000);
4053
0
    }
4054
4055
0
    return 0;
4056
0
}
4057
4058
/* ----------------------------------------------------------------------
4059
 * Compression headers; the first part of the container
4060
 */
4061
4062
/*
4063
 * Creates a new blank container compression header
4064
 *
4065
 * Returns header ptr on success
4066
 *         NULL on failure
4067
 */
4068
8.64k
cram_block_compression_hdr *cram_new_compression_header(void) {
4069
8.64k
    cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr));
4070
8.64k
    if (!hdr)
4071
0
        return NULL;
4072
4073
8.64k
    if (!(hdr->TD_blk = cram_new_block(CORE, 0))) {
4074
0
        free(hdr);
4075
0
        return NULL;
4076
0
    }
4077
4078
8.64k
    if (!(hdr->TD_hash = kh_init(m_s2i))) {
4079
0
        cram_free_block(hdr->TD_blk);
4080
0
        free(hdr);
4081
0
        return NULL;
4082
0
    }
4083
4084
8.64k
    if (!(hdr->TD_keys = string_pool_create(8192))) {
4085
0
        kh_destroy(m_s2i, hdr->TD_hash);
4086
0
        cram_free_block(hdr->TD_blk);
4087
0
        free(hdr);
4088
0
        return NULL;
4089
0
    }
4090
4091
8.64k
    return hdr;
4092
8.64k
}
4093
4094
10.4k
void cram_free_compression_header(cram_block_compression_hdr *hdr) {
4095
10.4k
    int i;
4096
4097
10.4k
    if (hdr->landmark)
4098
1.77k
        free(hdr->landmark);
4099
4100
10.4k
    if (hdr->preservation_map)
4101
4.66k
        kh_destroy(map, hdr->preservation_map);
4102
4103
343k
    for (i = 0; i < CRAM_MAP_HASH; i++) {
4104
333k
        cram_map *m, *m2;
4105
351k
        for (m = hdr->rec_encoding_map[i]; m; m = m2) {
4106
18.3k
            m2 = m->next;
4107
18.3k
            if (m->codec)
4108
0
                m->codec->free(m->codec);
4109
18.3k
            free(m);
4110
18.3k
        }
4111
333k
    }
4112
4113
343k
    for (i = 0; i < CRAM_MAP_HASH; i++) {
4114
333k
        cram_map *m, *m2;
4115
334k
        for (m = hdr->tag_encoding_map[i]; m; m = m2) {
4116
711
            m2 = m->next;
4117
711
            if (m->codec)
4118
711
                m->codec->free(m->codec);
4119
711
            free(m);
4120
711
        }
4121
333k
    }
4122
4123
500k
    for (i = 0; i < DS_END; i++) {
4124
489k
        if (hdr->codecs[i])
4125
51.8k
            hdr->codecs[i]->free(hdr->codecs[i]);
4126
489k
    }
4127
4128
10.4k
    if (hdr->TL)
4129
54
        free(hdr->TL);
4130
10.4k
    if (hdr->TD_blk)
4131
8.70k
        cram_free_block(hdr->TD_blk);
4132
10.4k
    if (hdr->TD_hash)
4133
8.64k
        kh_destroy(m_s2i, hdr->TD_hash);
4134
10.4k
    if (hdr->TD_keys)
4135
8.64k
        string_pool_destroy(hdr->TD_keys);
4136
4137
10.4k
    free(hdr);
4138
10.4k
}
4139
4140
4141
/* ----------------------------------------------------------------------
4142
 * Slices and slice headers
4143
 */
4144
4145
3.27k
void cram_free_slice_header(cram_block_slice_hdr *hdr) {
4146
3.27k
    if (!hdr)
4147
0
        return;
4148
4149
3.27k
    if (hdr->block_content_ids)
4150
2.98k
        free(hdr->block_content_ids);
4151
4152
3.27k
    free(hdr);
4153
4154
3.27k
    return;
4155
3.27k
}
4156
4157
3.37k
void cram_free_slice(cram_slice *s) {
4158
3.37k
    if (!s)
4159
0
        return;
4160
4161
3.37k
    if (s->bl) {
4162
0
        free_bam_list(s->bl->bams, s->bl->nbams);
4163
0
        free(s->bl);
4164
0
    }
4165
4166
3.37k
    if (s->hdr_block)
4167
2.91k
        cram_free_block(s->hdr_block);
4168
4169
3.37k
    if (s->block) {
4170
2.97k
        int i;
4171
4172
2.97k
        if (s->hdr) {
4173
6.14M
            for (i = 0; i < s->hdr->num_blocks; i++) {
4174
6.14M
                if (i > 0 && s->block[i] == s->block[0])
4175
6.12M
                    continue;
4176
14.2k
                cram_free_block(s->block[i]);
4177
14.2k
            }
4178
2.97k
        }
4179
2.97k
        free(s->block);
4180
2.97k
    }
4181
4182
3.37k
    {
4183
        // Normally already copied into s->block[], but potentially still
4184
        // here if we error part way through cram_encode_slice.
4185
3.37k
        int i;
4186
4.77k
        for (i = 0; i < s->naux_block; i++)
4187
1.39k
            cram_free_block(s->aux_block[i]);
4188
3.37k
    }
4189
4190
3.37k
    if (s->block_by_id)
4191
18
        free(s->block_by_id);
4192
4193
3.37k
    if (s->hdr)
4194
3.27k
        cram_free_slice_header(s->hdr);
4195
4196
3.37k
    if (s->seqs_blk)
4197
3.22k
        cram_free_block(s->seqs_blk);
4198
4199
3.37k
    if (s->qual_blk)
4200
316
        cram_free_block(s->qual_blk);
4201
4202
3.37k
    if (s->name_blk)
4203
316
        cram_free_block(s->name_blk);
4204
4205
3.37k
    if (s->aux_blk)
4206
3.22k
        cram_free_block(s->aux_blk);
4207
4208
3.37k
    if (s->base_blk)
4209
316
        cram_free_block(s->base_blk);
4210
4211
3.37k
    if (s->soft_blk)
4212
316
        cram_free_block(s->soft_blk);
4213
4214
3.37k
    if (s->cigar)
4215
3.22k
        free(s->cigar);
4216
4217
3.37k
    if (s->crecs)
4218
3.21k
        free(s->crecs);
4219
4220
3.37k
    if (s->features)
4221
150
        free(s->features);
4222
4223
3.37k
    if (s->TN)
4224
0
        free(s->TN);
4225
4226
3.37k
    if (s->pair_keys)
4227
3.20k
        string_pool_destroy(s->pair_keys);
4228
4229
3.37k
    if (s->pair[0])
4230
3.20k
        kh_destroy(m_s2i, s->pair[0]);
4231
3.37k
    if (s->pair[1])
4232
3.20k
        kh_destroy(m_s2i, s->pair[1]);
4233
4234
3.37k
    if (s->aux_block)
4235
552
        free(s->aux_block);
4236
4237
3.37k
    free(s);
4238
3.37k
}
4239
4240
/*
4241
 * Creates a new empty slice in memory, for subsequent writing to
4242
 * disk.
4243
 *
4244
 * Returns cram_slice ptr on success
4245
 *         NULL on failure
4246
 */
4247
3.20k
cram_slice *cram_new_slice(enum cram_content_type type, int nrecs) {
4248
3.20k
    cram_slice *s = calloc(1, sizeof(*s));
4249
3.20k
    if (!s)
4250
0
        return NULL;
4251
4252
3.20k
    if (!(s->hdr = (cram_block_slice_hdr *)calloc(1, sizeof(*s->hdr))))
4253
0
        goto err;
4254
3.20k
    s->hdr->content_type = type;
4255
4256
3.20k
    s->hdr_block = NULL;
4257
3.20k
    s->block = NULL;
4258
3.20k
    s->block_by_id = NULL;
4259
3.20k
    s->last_apos = 0;
4260
3.20k
    if (!(s->crecs = hts_malloc_p(sizeof(*s->crecs), nrecs)))  goto err;
4261
3.20k
    s->cigar_alloc = 1024;
4262
3.20k
    if (!(s->cigar = hts_malloc_p(sizeof(*s->cigar), s->cigar_alloc))) goto err;
4263
3.20k
    s->ncigar = 0;
4264
4265
3.20k
    if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0)))       goto err;
4266
3.20k
    if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS)))   goto err;
4267
3.20k
    if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN)))   goto err;
4268
3.20k
    if (!(s->aux_blk  = cram_new_block(EXTERNAL, DS_aux)))  goto err;
4269
3.20k
    if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN)))   goto err;
4270
3.20k
    if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC)))   goto err;
4271
4272
3.20k
    s->features = NULL;
4273
3.20k
    s->nfeatures = s->afeatures = 0;
4274
4275
3.20k
#ifndef TN_external
4276
3.20k
    s->TN = NULL;
4277
3.20k
    s->nTN = s->aTN = 0;
4278
3.20k
#endif
4279
4280
    // Volatile keys as we do realloc in dstring
4281
3.20k
    if (!(s->pair_keys = string_pool_create(8192))) goto err;
4282
3.20k
    if (!(s->pair[0] = kh_init(m_s2i)))             goto err;
4283
3.20k
    if (!(s->pair[1] = kh_init(m_s2i)))             goto err;
4284
4285
#ifdef BA_external
4286
    s->BA_len = 0;
4287
#endif
4288
4289
3.20k
    return s;
4290
4291
0
 err:
4292
0
    if (s)
4293
0
        cram_free_slice(s);
4294
4295
0
    return NULL;
4296
3.20k
}
4297
4298
/*
4299
 * Loads an entire slice.
4300
 * FIXME: In 1.0 the native unit of slices within CRAM is broken
4301
 * as slices contain references to objects in other slices.
4302
 * To work around this while keeping the slice oriented outer loop
4303
 * we read all slices and stitch them together into a fake large
4304
 * slice instead.
4305
 *
4306
 * Returns cram_slice ptr on success
4307
 *         NULL on failure
4308
 */
4309
171
cram_slice *cram_read_slice(cram_fd *fd) {
4310
171
    cram_block *b = cram_read_block(fd);
4311
171
    cram_slice *s = calloc(1, sizeof(*s));
4312
171
    int i, n, max_id, min_id;
4313
4314
171
    if (!b || !s)
4315
15
        goto err;
4316
4317
156
    s->hdr_block = b;
4318
156
    switch (b->content_type) {
4319
144
    case MAPPED_SLICE:
4320
153
    case UNMAPPED_SLICE:
4321
153
        if (!(s->hdr = cram_decode_slice_header(fd, b)))
4322
78
            goto err;
4323
75
        break;
4324
4325
75
    default:
4326
3
        hts_log_error("Unexpected block of type %s",
4327
3
                      cram_content_type2str(b->content_type));
4328
3
        goto err;
4329
156
    }
4330
4331
75
    if (s->hdr->num_blocks < 1) {
4332
3
        hts_log_error("Slice does not include any data blocks");
4333
3
        goto err;
4334
3
    }
4335
4336
72
    s->block = calloc(n = s->hdr->num_blocks, sizeof(*s->block));
4337
72
    if (!s->block)
4338
0
        goto err;
4339
4340
306
    for (max_id = i = 0, min_id = INT_MAX; i < n; i++) {
4341
288
        if (!(s->block[i] = cram_read_block(fd)))
4342
54
            goto err;
4343
4344
234
        if (s->block[i]->content_type == EXTERNAL) {
4345
36
            if (max_id < s->block[i]->content_id)
4346
24
                max_id = s->block[i]->content_id;
4347
36
            if (min_id > s->block[i]->content_id)
4348
18
                min_id = s->block[i]->content_id;
4349
36
        }
4350
234
    }
4351
4352
18
    if (!(s->block_by_id = calloc(512, sizeof(s->block[0]))))
4353
0
        goto err;
4354
4355
54
    for (i = 0; i < n; i++) {
4356
36
        if (s->block[i]->content_type != EXTERNAL)
4357
15
            continue;
4358
21
        uint32_t v = s->block[i]->content_id;
4359
21
        if (v >= 256)
4360
9
            v = 256 + v % 251;
4361
21
        s->block_by_id[v] = s->block[i];
4362
21
    }
4363
4364
    /* Initialise encoding/decoding tables */
4365
18
    s->cigar_alloc = 1024;
4366
18
    if (!(s->cigar = hts_malloc_p(sizeof(*s->cigar), s->cigar_alloc))) goto err;
4367
18
    s->ncigar = 0;
4368
4369
18
    if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0)))      goto err;
4370
18
    if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS)))  goto err;
4371
18
    if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN)))  goto err;
4372
18
    if (!(s->aux_blk  = cram_new_block(EXTERNAL, DS_aux))) goto err;
4373
18
    if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN)))  goto err;
4374
18
    if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC)))  goto err;
4375
4376
18
    s->crecs = NULL;
4377
4378
18
    s->last_apos = s->hdr->ref_seq_start;
4379
18
    s->decode_md = fd->decode_md;
4380
4381
18
    return s;
4382
4383
153
 err:
4384
153
    if (b)
4385
138
        cram_free_block(b);
4386
153
    if (s) {
4387
153
        s->hdr_block = NULL;
4388
153
        cram_free_slice(s);
4389
153
    }
4390
153
    return NULL;
4391
18
}
4392
4393
4394
/* ----------------------------------------------------------------------
4395
 * CRAM file definition (header)
4396
 */
4397
4398
/*
4399
 * Reads a CRAM file definition structure.
4400
 * Returns file_def ptr on success
4401
 *         NULL on failure
4402
 */
4403
8.98k
cram_file_def *cram_read_file_def(cram_fd *fd) {
4404
8.98k
    cram_file_def *def = malloc(sizeof(*def));
4405
8.98k
    if (!def)
4406
0
        return NULL;
4407
4408
8.98k
    if (26 != hread(fd->fp, &def->magic[0], 26)) {
4409
9
        free(def);
4410
9
        return NULL;
4411
9
    }
4412
4413
8.97k
    if (memcmp(def->magic, "CRAM", 4) != 0) {
4414
1
        free(def);
4415
1
        return NULL;
4416
1
    }
4417
4418
8.97k
    if (def->major_version > 4) {
4419
1
        hts_log_error("CRAM version number mismatch. Expected 1.x, 2.x, 3.x or 4.x, got %d.%d",
4420
1
                      def->major_version, def->minor_version);
4421
1
        free(def);
4422
1
        return NULL;
4423
1
    }
4424
4425
8.97k
    fd->first_container += 26;
4426
8.97k
    fd->curr_position = fd->first_container;
4427
8.97k
    fd->last_slice = 0;
4428
4429
8.97k
    return def;
4430
8.97k
}
4431
4432
/*
4433
 * Writes a cram_file_def structure to cram_fd.
4434
 * Returns 0 on success
4435
 *        -1 on failure
4436
 */
4437
5.44k
int cram_write_file_def(cram_fd *fd, cram_file_def *def) {
4438
5.44k
    return (hwrite(fd->fp, &def->magic[0], 26) == 26) ? 0 : -1;
4439
5.44k
}
4440
4441
17.7k
void cram_free_file_def(cram_file_def *def) {
4442
17.7k
    if (def) free(def);
4443
17.7k
}
4444
4445
/* ----------------------------------------------------------------------
4446
 * SAM header I/O
4447
 */
4448
4449
4450
/*
4451
 * Reads the SAM header from the first CRAM data block.
4452
 * Also performs minimal parsing to extract read-group
4453
 * and sample information.
4454
4455
 * Returns SAM hdr ptr on success
4456
 *         NULL on failure
4457
 */
4458
8.97k
sam_hdr_t *cram_read_SAM_hdr(cram_fd *fd) {
4459
8.97k
    int32_t header_len;
4460
8.97k
    char *header;
4461
8.97k
    sam_hdr_t *hdr;
4462
4463
    /* 1.1 onwards stores the header in the first block of a container */
4464
8.97k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
4465
        /* Length */
4466
7.38k
        if (-1 == int32_decode(fd, &header_len))
4467
2
            return NULL;
4468
4469
7.37k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
4470
7.37k
        if (header_len > FUZZ_ALLOC_LIMIT)
4471
2
            return NULL;
4472
7.37k
#endif
4473
4474
        /* Alloc and read */
4475
7.37k
        if (header_len < 0 || NULL == (header = malloc((size_t) header_len+1)))
4476
0
            return NULL;
4477
4478
7.37k
        if (header_len != hread(fd->fp, header, header_len)) {
4479
13
            free(header);
4480
13
            return NULL;
4481
13
        }
4482
7.36k
        header[header_len] = '\0';
4483
4484
7.36k
        fd->first_container += 4 + header_len;
4485
7.36k
    } else {
4486
1.59k
        cram_container *c = cram_read_container(fd);
4487
1.59k
        cram_block *b;
4488
1.59k
        int i;
4489
1.59k
        int64_t len;
4490
4491
1.59k
        if (!c)
4492
145
            return NULL;
4493
4494
1.44k
        fd->first_container += c->length + c->offset;
4495
1.44k
        fd->curr_position = fd->first_container;
4496
4497
1.44k
        if (c->num_blocks < 1) {
4498
46
            cram_free_container(c);
4499
46
            return NULL;
4500
46
        }
4501
4502
1.40k
        if (!(b = cram_read_block(fd))) {
4503
96
            cram_free_container(c);
4504
96
            return NULL;
4505
96
        }
4506
1.30k
        if (cram_uncompress_block(b) != 0) {
4507
976
            cram_free_container(c);
4508
976
            cram_free_block(b);
4509
976
            return NULL;
4510
976
        }
4511
4512
331
        len = b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
4513
331
            fd->vv.varint_size(b->content_id) +
4514
331
            fd->vv.varint_size(b->uncomp_size) +
4515
331
            fd->vv.varint_size(b->comp_size);
4516
4517
        /* Extract header from 1st block */
4518
331
        if (-1 == int32_get_blk(b, &header_len) ||
4519
304
            header_len < 0 || /* Spec. says signed...  why? */
4520
301
            b->uncomp_size - 4 < header_len) {
4521
47
            cram_free_container(c);
4522
47
            cram_free_block(b);
4523
47
            return NULL;
4524
47
        }
4525
284
        if (NULL == (header = malloc((size_t) header_len+1))) {
4526
0
            cram_free_container(c);
4527
0
            cram_free_block(b);
4528
0
            return NULL;
4529
0
        }
4530
284
        memcpy(header, BLOCK_END(b), header_len);
4531
284
        header[header_len] = '\0';
4532
284
        cram_free_block(b);
4533
4534
        /* Consume any remaining blocks */
4535
410
        for (i = 1; i < c->num_blocks; i++) {
4536
220
            if (!(b = cram_read_block(fd))) {
4537
94
                cram_free_container(c);
4538
94
                free(header);
4539
94
                return NULL;
4540
94
            }
4541
126
            len += b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
4542
126
                fd->vv.varint_size(b->content_id) +
4543
126
                fd->vv.varint_size(b->uncomp_size) +
4544
126
                fd->vv.varint_size(b->comp_size);
4545
126
            cram_free_block(b);
4546
126
        }
4547
4548
190
        if (c->length > 0 && len > 0 && c->length > len) {
4549
            // Consume padding
4550
17
            char *pads = malloc(c->length - len);
4551
17
            if (!pads) {
4552
0
                cram_free_container(c);
4553
0
                free(header);
4554
0
                return NULL;
4555
0
            }
4556
4557
17
            if (c->length - len != hread(fd->fp, pads, c->length - len)) {
4558
16
                cram_free_container(c);
4559
16
                free(header);
4560
16
                free(pads);
4561
16
                return NULL;
4562
16
            }
4563
1
            free(pads);
4564
1
        }
4565
4566
174
        cram_free_container(c);
4567
174
    }
4568
4569
    /* Parse */
4570
7.53k
    hdr = sam_hdr_init();
4571
7.53k
    if (!hdr) {
4572
0
        free(header);
4573
0
        return NULL;
4574
0
    }
4575
4576
7.53k
    if (-1 == sam_hdr_add_lines(hdr, header, header_len)) {
4577
274
        free(header);
4578
274
        sam_hdr_destroy(hdr);
4579
274
        return NULL;
4580
274
    }
4581
4582
7.26k
    hdr->l_text = header_len;
4583
7.26k
    hdr->text = header;
4584
4585
7.26k
    return hdr;
4586
4587
7.53k
}
4588
4589
/*
4590
 * Converts 'in' to a full pathname to store in out.
4591
 * Out must be at least PATH_MAX bytes long.
4592
 */
4593
0
static void full_path(char *out, char *in) {
4594
0
    size_t in_l = strlen(in);
4595
0
    if (hisremote(in)) {
4596
0
        if (in_l > PATH_MAX) {
4597
0
            hts_log_error("Reference path is longer than %d", PATH_MAX);
4598
0
            return;
4599
0
        }
4600
0
        strncpy(out, in, PATH_MAX-1);
4601
0
        out[PATH_MAX-1] = 0;
4602
0
        return;
4603
0
    }
4604
0
    if (*in == '/' ||
4605
        // Windows paths
4606
0
        (in_l > 3 && toupper_c(*in) >= 'A'  && toupper_c(*in) <= 'Z' &&
4607
0
         in[1] == ':' && (in[2] == '/' || in[2] == '\\'))) {
4608
0
        strncpy(out, in, PATH_MAX-1);
4609
0
        out[PATH_MAX-1] = 0;
4610
0
    } else {
4611
0
        size_t len;
4612
4613
        // unable to get dir or out+in is too long
4614
0
        if (!getcwd(out, PATH_MAX) ||
4615
0
            (len = strlen(out))+1+strlen(in) >= PATH_MAX) {
4616
0
            strncpy(out, in, PATH_MAX-1);
4617
0
            out[PATH_MAX-1] = 0;
4618
0
            return;
4619
0
        }
4620
4621
0
        snprintf(out+len, PATH_MAX - len, "/%s", in);
4622
4623
        // FIXME: cope with `pwd`/../../../foo.fa ?
4624
0
    }
4625
0
}
4626
4627
/*
4628
 * Writes a CRAM SAM header.
4629
 * Returns 0 on success
4630
 *        -1 on failure
4631
 */
4632
5.44k
int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) {
4633
5.44k
    size_t header_len;
4634
5.44k
    int blank_block = (CRAM_MAJOR_VERS(fd->version) >= 3);
4635
4636
    /* Write CRAM MAGIC if not yet written. */
4637
5.44k
    if (fd->file_def->major_version == 0) {
4638
5.44k
        fd->file_def->major_version = CRAM_MAJOR_VERS(fd->version);
4639
5.44k
        fd->file_def->minor_version = CRAM_MINOR_VERS(fd->version);
4640
5.44k
        if (0 != cram_write_file_def(fd, fd->file_def))
4641
0
            return -1;
4642
5.44k
    }
4643
4644
    /* 1.0 requires an UNKNOWN read-group */
4645
5.44k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
4646
0
        if (!sam_hrecs_find_rg(hdr->hrecs, "UNKNOWN"))
4647
0
            if (sam_hdr_add_line(hdr, "RG",
4648
0
                            "ID", "UNKNOWN", "SM", "UNKNOWN", NULL))
4649
0
                return -1;
4650
0
    }
4651
4652
5.44k
    if (-1 == refs_from_header(fd))
4653
0
        return -1;
4654
5.44k
    if (-1 == refs2id(fd->refs, fd->header))
4655
0
        return -1;
4656
4657
    /* Fix M5 strings */
4658
5.44k
    if (fd->refs && !fd->no_ref && fd->embed_ref <= 1) {
4659
5.44k
        int i;
4660
5.52k
        for (i = 0; i < hdr->hrecs->nref; i++) {
4661
362
            sam_hrec_type_t *ty;
4662
362
            char *ref;
4663
4664
362
            if (!(ty = sam_hrecs_find_type_id(hdr->hrecs, "SQ", "SN", hdr->hrecs->ref[i].name)))
4665
0
                return -1;
4666
4667
362
            if (!sam_hrecs_find_key(ty, "M5", NULL)) {
4668
277
                char unsigned buf[16];
4669
277
                char buf2[33];
4670
277
                hts_pos_t rlen;
4671
277
                hts_md5_context *md5;
4672
4673
277
                if (!fd->refs ||
4674
277
                    !fd->refs->ref_id ||
4675
277
                    !fd->refs->ref_id[i]) {
4676
0
                    return -1;
4677
0
                }
4678
277
                rlen = fd->refs->ref_id[i]->length;
4679
277
                ref = cram_get_ref(fd, i, 1, rlen);
4680
277
                if (NULL == ref) {
4681
277
                    if (fd->embed_ref == -1) {
4682
                        // auto embed-ref
4683
277
                        hts_log_warning("No M5 tags present and could not "
4684
277
                                        "find reference");
4685
277
                        hts_log_warning("Enabling embed_ref=2 option");
4686
277
                        hts_log_warning("NOTE: the CRAM file will be bigger "
4687
277
                                        "than using an external reference");
4688
277
                        pthread_mutex_lock(&fd->ref_lock);
4689
                        // Best guess.  It may be unmapped data with broken
4690
                        // headers, in which case this will get ignored.
4691
277
                        fd->embed_ref = 2;
4692
277
                        pthread_mutex_unlock(&fd->ref_lock);
4693
277
                        break;
4694
277
                    }
4695
0
                    return -1;
4696
277
                }
4697
0
                rlen = fd->refs->ref_id[i]->length; /* In case it just loaded */
4698
0
                if (!(md5 = hts_md5_init()))
4699
0
                    return -1;
4700
0
                if (HTS_POS_MAX <= ULONG_MAX) {
4701
                    // Platforms with 64-bit unsigned long update in one go
4702
0
                    hts_md5_update(md5, ref, rlen);
4703
0
                } else {
4704
                    // Those with 32-bit ulong (Windows) may have to loop
4705
                    // over epic references
4706
0
                    hts_pos_t pos = 0;
4707
0
                    while (rlen - pos > ULONG_MAX) {
4708
0
                        hts_md5_update(md5, ref + pos, ULONG_MAX);
4709
0
                        pos += ULONG_MAX;
4710
0
                    }
4711
0
                    hts_md5_update(md5, ref + pos, (unsigned long)(rlen - pos));
4712
0
                }
4713
0
                hts_md5_final(buf, md5);
4714
0
                hts_md5_destroy(md5);
4715
0
                cram_ref_decr(fd->refs, i);
4716
4717
0
                hts_md5_hex(buf2, buf);
4718
0
                fd->refs->ref_id[i]->validated_md5 = 1;
4719
0
                if (sam_hdr_update_line(hdr, "SQ", "SN", hdr->hrecs->ref[i].name, "M5", buf2, NULL))
4720
0
                    return -1;
4721
0
            }
4722
4723
85
            if (fd->ref_fn) {
4724
0
                char ref_fn[PATH_MAX];
4725
0
                full_path(ref_fn, fd->ref_fn);
4726
0
                if (sam_hdr_update_line(hdr, "SQ", "SN", hdr->hrecs->ref[i].name, "UR", ref_fn, NULL))
4727
0
                    return -1;
4728
0
            }
4729
85
        }
4730
5.44k
    }
4731
4732
    /* Length */
4733
5.44k
    header_len = sam_hdr_length(hdr);
4734
5.44k
    if (header_len > INT32_MAX) {
4735
0
        hts_log_error("Header is too long for CRAM format");
4736
0
        return -1;
4737
0
    }
4738
5.44k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
4739
0
        if (-1 == int32_encode(fd, header_len))
4740
0
            return -1;
4741
4742
        /* Text data */
4743
0
        if (header_len != hwrite(fd->fp, sam_hdr_str(hdr), header_len))
4744
0
            return -1;
4745
5.44k
    } else {
4746
        /* Create block(s) inside a container */
4747
5.44k
        cram_block *b = cram_new_block(FILE_HEADER, 0);
4748
5.44k
        cram_container *c = cram_new_container(0, 0);
4749
5.44k
        int padded_length;
4750
5.44k
        char *pads;
4751
5.44k
        int is_cram_3 = (CRAM_MAJOR_VERS(fd->version) >= 3);
4752
4753
5.44k
        if (!b || !c) {
4754
0
            if (b) cram_free_block(b);
4755
0
            if (c) cram_free_container(c);
4756
0
            return -1;
4757
0
        }
4758
4759
5.44k
        if (int32_put_blk(b, header_len) < 0)
4760
0
            return -1;
4761
5.44k
        if (header_len)
4762
1.70k
            BLOCK_APPEND(b, sam_hdr_str(hdr), header_len);
4763
5.44k
        BLOCK_UPLEN(b);
4764
4765
        // Compress header block if V3.0 and above
4766
5.44k
        if (CRAM_MAJOR_VERS(fd->version) >= 3)
4767
5.44k
            if (cram_compress_block(fd, b, NULL, -1, -1) < 0)
4768
0
                return -1;
4769
4770
5.44k
        if (blank_block) {
4771
5.44k
            c->length = b->comp_size + 2 + 4*is_cram_3 +
4772
5.44k
                fd->vv.varint_size(b->content_id)   +
4773
5.44k
                fd->vv.varint_size(b->uncomp_size)  +
4774
5.44k
                fd->vv.varint_size(b->comp_size);
4775
4776
5.44k
            c->num_blocks = 2;
4777
5.44k
            c->num_landmarks = 2;
4778
5.44k
            if (!(c->landmark = malloc(2*sizeof(*c->landmark)))) {
4779
0
                cram_free_block(b);
4780
0
                cram_free_container(c);
4781
0
                return -1;
4782
0
            }
4783
5.44k
            c->landmark[0] = 0;
4784
5.44k
            c->landmark[1] = c->length;
4785
4786
            // Plus extra storage for uncompressed secondary blank block
4787
5.44k
            padded_length = MIN(c->length*.5, 10000);
4788
5.44k
            c->length += padded_length + 2 + 4*is_cram_3 +
4789
5.44k
                fd->vv.varint_size(b->content_id) +
4790
5.44k
                fd->vv.varint_size(padded_length)*2;
4791
5.44k
        } else {
4792
            // Pad the block instead.
4793
0
            c->num_blocks = 1;
4794
0
            c->num_landmarks = 1;
4795
0
            if (!(c->landmark = malloc(sizeof(*c->landmark))))
4796
0
                return -1;
4797
0
            c->landmark[0] = 0;
4798
4799
0
            padded_length = MAX(c->length*1.5, 10000) - c->length;
4800
4801
0
            c->length = b->comp_size + padded_length +
4802
0
                2 + 4*is_cram_3 +
4803
0
                fd->vv.varint_size(b->content_id)   +
4804
0
                fd->vv.varint_size(b->uncomp_size)  +
4805
0
                fd->vv.varint_size(b->comp_size);
4806
4807
0
            if (NULL == (pads = calloc(1, padded_length))) {
4808
0
                cram_free_block(b);
4809
0
                cram_free_container(c);
4810
0
                return -1;
4811
0
            }
4812
0
            BLOCK_APPEND(b, pads, padded_length);
4813
0
            BLOCK_UPLEN(b);
4814
0
            free(pads);
4815
0
        }
4816
4817
5.44k
        if (-1 == cram_write_container(fd, c)) {
4818
0
            cram_free_block(b);
4819
0
            cram_free_container(c);
4820
0
            return -1;
4821
0
        }
4822
4823
5.44k
        if (-1 == cram_write_block(fd, b)) {
4824
0
            cram_free_block(b);
4825
0
            cram_free_container(c);
4826
0
            return -1;
4827
0
        }
4828
4829
5.44k
        if (blank_block) {
4830
5.44k
            BLOCK_RESIZE(b, padded_length);
4831
5.44k
            memset(BLOCK_DATA(b), 0, padded_length);
4832
5.44k
            BLOCK_SIZE(b) = padded_length;
4833
5.44k
            BLOCK_UPLEN(b);
4834
5.44k
            b->method = RAW;
4835
5.44k
            if (-1 == cram_write_block(fd, b)) {
4836
0
                cram_free_block(b);
4837
0
                cram_free_container(c);
4838
0
                return -1;
4839
0
            }
4840
5.44k
        }
4841
4842
5.44k
        cram_free_block(b);
4843
5.44k
        cram_free_container(c);
4844
5.44k
    }
4845
4846
5.44k
    if (0 != hflush(fd->fp))
4847
0
        return -1;
4848
4849
5.44k
    RP("=== Finishing saving header ===\n");
4850
4851
5.44k
    return 0;
4852
4853
0
 block_err:
4854
0
    return -1;
4855
5.44k
}
4856
4857
/* ----------------------------------------------------------------------
4858
 * The top-level cram opening, closing and option handling
4859
 */
4860
4861
/*
4862
 * Sets CRAM variable sized integer decode function tables.
4863
 * CRAM 1, 2, and 3.x all used ITF8 for uint32 and UTF8 for uint64.
4864
 * CRAM 4.x uses the same encoding mechanism for 32-bit and 64-bit
4865
 * (or anything inbetween), but also now supports signed values.
4866
 *
4867
 * Version is the CRAM major version number.
4868
 * vv is the vector table (probably &cram_fd->vv)
4869
 */
4870
17.7k
static void cram_init_varint(varint_vec *vv, int version) {
4871
17.7k
    vv->varint_get32 = safe_itf8_get;
4872
17.7k
    vv->varint_get32s = safe_itf8_get;
4873
17.7k
    vv->varint_get64 = safe_ltf8_get;
4874
17.7k
    vv->varint_get64s = safe_ltf8_get;
4875
17.7k
    vv->varint_put32 = safe_itf8_put;
4876
17.7k
    vv->varint_put32s = safe_itf8_put;
4877
17.7k
    vv->varint_put64 = safe_ltf8_put;
4878
17.7k
    vv->varint_put64s = safe_ltf8_put;
4879
17.7k
    vv->varint_put32_blk = itf8_put_blk;
4880
17.7k
    vv->varint_put32s_blk = itf8_put_blk;
4881
17.7k
    vv->varint_put64_blk = ltf8_put_blk;
4882
17.7k
    vv->varint_put64s_blk = ltf8_put_blk;
4883
17.7k
    vv->varint_size = itf8_size;
4884
17.7k
    vv->varint_decode32_crc = itf8_decode_crc;
4885
17.7k
    vv->varint_decode32s_crc = itf8_decode_crc;
4886
17.7k
    vv->varint_decode64_crc = ltf8_decode_crc;
4887
17.7k
}
4888
4889
/*
4890
 * Initialises the lookup tables. These could be global statics, but they're
4891
 * clumsy to setup in a multi-threaded environment unless we generate
4892
 * verbatim code and include that.
4893
 */
4894
17.7k
static void cram_init_tables(cram_fd *fd) {
4895
17.7k
    int i;
4896
4897
17.7k
    memset(fd->L1, 4, 256);
4898
17.7k
    fd->L1['A'] = 0; fd->L1['a'] = 0;
4899
17.7k
    fd->L1['C'] = 1; fd->L1['c'] = 1;
4900
17.7k
    fd->L1['G'] = 2; fd->L1['g'] = 2;
4901
17.7k
    fd->L1['T'] = 3; fd->L1['t'] = 3;
4902
4903
17.7k
    memset(fd->L2, 5, 256);
4904
17.7k
    fd->L2['A'] = 0; fd->L2['a'] = 0;
4905
17.7k
    fd->L2['C'] = 1; fd->L2['c'] = 1;
4906
17.7k
    fd->L2['G'] = 2; fd->L2['g'] = 2;
4907
17.7k
    fd->L2['T'] = 3; fd->L2['t'] = 3;
4908
17.7k
    fd->L2['N'] = 4; fd->L2['n'] = 4;
4909
4910
17.7k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
4911
3.78M
        for (i = 0; i < 0x200; i++) {
4912
3.77M
            int f = 0;
4913
4914
3.77M
            if (i & CRAM_FPAIRED)      f |= BAM_FPAIRED;
4915
3.77M
            if (i & CRAM_FPROPER_PAIR) f |= BAM_FPROPER_PAIR;
4916
3.77M
            if (i & CRAM_FUNMAP)       f |= BAM_FUNMAP;
4917
3.77M
            if (i & CRAM_FREVERSE)     f |= BAM_FREVERSE;
4918
3.77M
            if (i & CRAM_FREAD1)       f |= BAM_FREAD1;
4919
3.77M
            if (i & CRAM_FREAD2)       f |= BAM_FREAD2;
4920
3.77M
            if (i & CRAM_FSECONDARY)   f |= BAM_FSECONDARY;
4921
3.77M
            if (i & CRAM_FQCFAIL)      f |= BAM_FQCFAIL;
4922
3.77M
            if (i & CRAM_FDUP)         f |= BAM_FDUP;
4923
4924
3.77M
            fd->bam_flag_swap[i]  = f;
4925
3.77M
        }
4926
4927
30.2M
        for (i = 0; i < 0x1000; i++) {
4928
30.2M
            int g = 0;
4929
4930
30.2M
            if (i & BAM_FPAIRED)           g |= CRAM_FPAIRED;
4931
30.2M
            if (i & BAM_FPROPER_PAIR)  g |= CRAM_FPROPER_PAIR;
4932
30.2M
            if (i & BAM_FUNMAP)        g |= CRAM_FUNMAP;
4933
30.2M
            if (i & BAM_FREVERSE)      g |= CRAM_FREVERSE;
4934
30.2M
            if (i & BAM_FREAD1)        g |= CRAM_FREAD1;
4935
30.2M
            if (i & BAM_FREAD2)        g |= CRAM_FREAD2;
4936
30.2M
            if (i & BAM_FSECONDARY)    g |= CRAM_FSECONDARY;
4937
30.2M
            if (i & BAM_FQCFAIL)       g |= CRAM_FQCFAIL;
4938
30.2M
            if (i & BAM_FDUP)          g |= CRAM_FDUP;
4939
4940
30.2M
            fd->cram_flag_swap[i] = g;
4941
30.2M
        }
4942
10.3k
    } else {
4943
        /* NOP */
4944
42.4M
        for (i = 0; i < 0x1000; i++)
4945
42.4M
            fd->bam_flag_swap[i] = i;
4946
42.4M
        for (i = 0; i < 0x1000; i++)
4947
42.4M
            fd->cram_flag_swap[i] = i;
4948
10.3k
    }
4949
4950
17.7k
    memset(fd->cram_sub_matrix, 4, 32*32);
4951
585k
    for (i = 0; i < 32; i++) {
4952
567k
        fd->cram_sub_matrix[i]['A'&0x1f]=0;
4953
567k
        fd->cram_sub_matrix[i]['C'&0x1f]=1;
4954
567k
        fd->cram_sub_matrix[i]['G'&0x1f]=2;
4955
567k
        fd->cram_sub_matrix[i]['T'&0x1f]=3;
4956
567k
        fd->cram_sub_matrix[i]['N'&0x1f]=4;
4957
567k
    }
4958
106k
    for (i = 0; i < 20; i+=4) {
4959
88.7k
        int j;
4960
1.86M
        for (j = 0; j < 20; j++) {
4961
1.77M
            fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
4962
1.77M
            fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
4963
1.77M
            fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
4964
1.77M
            fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
4965
1.77M
        }
4966
88.7k
        fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+0]&0x1f]=0;
4967
88.7k
        fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+1]&0x1f]=1;
4968
88.7k
        fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+2]&0x1f]=2;
4969
88.7k
        fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+3]&0x1f]=3;
4970
88.7k
    }
4971
4972
17.7k
    cram_init_varint(&fd->vv, CRAM_MAJOR_VERS(fd->version));
4973
17.7k
}
4974
4975
// Default version numbers for CRAM
4976
static int major_version = 3;
4977
static int minor_version = 1;
4978
4979
/*
4980
 * Opens a CRAM file for read (mode "rb") or write ("wb").
4981
 * The filename may be "-" to indicate stdin or stdout.
4982
 *
4983
 * Returns file handle on success
4984
 *         NULL on failure.
4985
 */
4986
0
cram_fd *cram_open(const char *filename, const char *mode) {
4987
0
    hFILE *fp;
4988
0
    cram_fd *fd;
4989
0
    char fmode[3]= { mode[0], '\0', '\0' };
4990
4991
0
    if (strlen(mode) > 1 && (mode[1] == 'b' || mode[1] == 'c')) {
4992
0
        fmode[1] = 'b';
4993
0
    }
4994
4995
0
    fp = hopen(filename, fmode);
4996
0
    if (!fp)
4997
0
        return NULL;
4998
4999
0
    fd = cram_dopen(fp, filename, mode);
5000
0
    if (!fd)
5001
0
        hclose_abruptly(fp);
5002
5003
0
    return fd;
5004
0
}
5005
5006
/* Opens an existing stream for reading or writing.
5007
 *
5008
 * Returns file handle on success;
5009
 *         NULL on failure.
5010
 */
5011
17.7k
cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) {
5012
17.7k
    int i;
5013
17.7k
    char *cp;
5014
17.7k
    cram_fd *fd = calloc(1, sizeof(*fd));
5015
17.7k
    if (!fd)
5016
0
        return NULL;
5017
5018
17.7k
    fd->level = CRAM_DEFAULT_LEVEL;
5019
53.2k
    for (i = 0; mode[i]; i++) {
5020
35.5k
        if (mode[i] >= '0' && mode[i] <= '9') {
5021
0
            fd->level = mode[i] - '0';
5022
0
            break;
5023
0
        }
5024
35.5k
    }
5025
5026
17.7k
    fd->fp = fp;
5027
17.7k
    fd->mode = *mode;
5028
17.7k
    fd->first_container = 0;
5029
17.7k
    fd->curr_position = 0;
5030
5031
17.7k
    if (fd->mode == 'r') {
5032
        /* Reader */
5033
5034
8.98k
        if (!(fd->file_def = cram_read_file_def(fd)))
5035
11
            goto err;
5036
5037
8.97k
        fd->version = fd->file_def->major_version * 256 +
5038
8.97k
            fd->file_def->minor_version;
5039
5040
8.97k
        cram_init_tables(fd);
5041
5042
8.97k
        if (!(fd->header = cram_read_SAM_hdr(fd))) {
5043
1.71k
            cram_free_file_def(fd->file_def);
5044
1.71k
            goto err;
5045
1.71k
        }
5046
5047
8.97k
    } else {
5048
        /* Writer */
5049
8.77k
        cram_file_def *def = calloc(1, sizeof(*def));
5050
8.77k
        if (!def)
5051
0
            return NULL;
5052
5053
8.77k
        fd->file_def = def;
5054
5055
8.77k
        def->magic[0] = 'C';
5056
8.77k
        def->magic[1] = 'R';
5057
8.77k
        def->magic[2] = 'A';
5058
8.77k
        def->magic[3] = 'M';
5059
8.77k
        def->major_version = 0; // Indicator to write file def later.
5060
8.77k
        def->minor_version = 0;
5061
8.77k
        memset(def->file_id, 0, 20);
5062
8.77k
        strncpy(def->file_id, filename, 20);
5063
5064
8.77k
        fd->version = major_version * 256 + minor_version;
5065
8.77k
        cram_init_tables(fd);
5066
5067
        /* SAM header written later along with this file_def */
5068
8.77k
    }
5069
5070
16.0k
    fd->prefix = strdup((cp = strrchr(filename, '/')) ? cp+1 : filename);
5071
16.0k
    if (!fd->prefix)
5072
0
        goto err;
5073
16.0k
    fd->first_base = fd->last_base = -1;
5074
16.0k
    fd->record_counter = 0;
5075
5076
16.0k
    fd->ctr = NULL;
5077
16.0k
    fd->ctr_mt = NULL;
5078
16.0k
    fd->refs  = refs_create();
5079
16.0k
    if (!fd->refs)
5080
0
        goto err;
5081
16.0k
    fd->ref_id = -2;
5082
16.0k
    fd->ref = NULL;
5083
5084
16.0k
    fd->decode_md = 0;
5085
16.0k
    fd->seqs_per_slice = SEQS_PER_SLICE;
5086
16.0k
    fd->bases_per_slice = BASES_PER_SLICE;
5087
16.0k
    fd->slices_per_container = SLICE_PER_CNT;
5088
16.0k
    fd->embed_ref = -1; // automatic selection
5089
16.0k
    fd->no_ref = 0;
5090
16.0k
    fd->no_ref_counter = 0;
5091
16.0k
    fd->ap_delta = 0;
5092
16.0k
    fd->ignore_md5 = 0;
5093
16.0k
    fd->lossy_read_names = 0;
5094
16.0k
    fd->use_bz2 = 0;
5095
16.0k
    fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3);
5096
16.0k
    fd->use_tok = (CRAM_MAJOR_VERS(fd->version) >= 3) && (CRAM_MINOR_VERS(fd->version) >= 1);
5097
16.0k
    fd->use_lzma = 0;
5098
16.0k
    fd->multi_seq = -1;
5099
16.0k
    fd->multi_seq_user = -1;
5100
16.0k
    fd->unsorted   = 0;
5101
16.0k
    fd->shared_ref = 0;
5102
16.0k
    fd->store_md = 0;
5103
16.0k
    fd->store_nm = 0;
5104
16.0k
    fd->last_RI_count = 0;
5105
5106
16.0k
    fd->index       = NULL;
5107
16.0k
    fd->own_pool    = 0;
5108
16.0k
    fd->pool        = NULL;
5109
16.0k
    fd->rqueue      = NULL;
5110
16.0k
    fd->job_pending = NULL;
5111
16.0k
    fd->ooc         = 0;
5112
16.0k
    fd->required_fields = INT_MAX;
5113
5114
16.0k
    pthread_mutex_init(&fd->metrics_lock, NULL);
5115
16.0k
    pthread_mutex_init(&fd->ref_lock, NULL);
5116
16.0k
    pthread_mutex_init(&fd->range_lock, NULL);
5117
16.0k
    pthread_mutex_init(&fd->bam_list_lock, NULL);
5118
5119
769k
    for (i = 0; i < DS_END; i++) {
5120
753k
        fd->m[i] = cram_new_metrics();
5121
753k
        if (!fd->m[i])
5122
0
            goto err;
5123
753k
    }
5124
5125
16.0k
    if (!(fd->tags_used = kh_init(m_metrics)))
5126
0
        goto err;
5127
5128
16.0k
    fd->range.refid = -2; // no ref.
5129
16.0k
    fd->eof = 1;          // See samtools issue #150
5130
16.0k
    fd->ref_fn = NULL;
5131
5132
16.0k
    fd->bl = NULL;
5133
5134
    /* Initialise dummy refs from the @SQ headers */
5135
16.0k
    if (-1 == refs_from_header(fd))
5136
0
        goto err;
5137
5138
16.0k
    return fd;
5139
5140
1.72k
 err:
5141
1.72k
    if (fd)
5142
1.72k
        free(fd);
5143
5144
1.72k
    return NULL;
5145
16.0k
}
5146
5147
/*
5148
 * Seek within a CRAM file.
5149
 *
5150
 * Returns 0 on success
5151
 *        -1 on failure
5152
 */
5153
0
int cram_seek(cram_fd *fd, off_t offset, int whence) {
5154
0
    fd->ooc = 0;
5155
5156
0
    cram_drain_rqueue(fd);
5157
5158
0
    return hseek(fd->fp, offset, whence) >= 0 ? 0 : -1;
5159
0
}
5160
5161
/*
5162
 * Flushes a CRAM file.
5163
 * Useful for when writing to stdout without wishing to close the stream.
5164
 *
5165
 * Returns 0 on success
5166
 *        -1 on failure
5167
 */
5168
0
int cram_flush(cram_fd *fd) {
5169
0
    if (!fd)
5170
0
        return -1;
5171
5172
0
    int ret = 0;
5173
5174
0
    if (fd->mode == 'w' && fd->ctr) {
5175
0
        if(fd->ctr->slice)
5176
0
            cram_update_curr_slice(fd->ctr, fd->version);
5177
5178
0
        if (-1 == cram_flush_container_mt(fd, fd->ctr))
5179
0
            ret = -1;
5180
5181
0
        cram_free_container(fd->ctr);
5182
0
        if (fd->ctr_mt == fd->ctr)
5183
0
            fd->ctr_mt = NULL;
5184
0
        fd->ctr = NULL;
5185
0
    }
5186
5187
0
    return ret;
5188
0
}
5189
5190
/*
5191
 * Writes an EOF block to a CRAM file.
5192
 *
5193
 * Returns 0 on success
5194
 *        -1 on failure
5195
 */
5196
8.46k
int cram_write_eof_block(cram_fd *fd) {
5197
    // EOF block is a container with special values to aid detection
5198
8.46k
    if (CRAM_MAJOR_VERS(fd->version) >= 2) {
5199
        // Empty container with
5200
        //   ref_seq_id -1
5201
        //   start pos 0x454f46 ("EOF")
5202
        //   span 0
5203
        //   nrec 0
5204
        //   counter 0
5205
        //   nbases 0
5206
        //   1 block (landmark 0)
5207
        //   (CRC32)
5208
8.46k
        cram_container c;
5209
8.46k
        memset(&c, 0, sizeof(c));
5210
8.46k
        c.ref_seq_id = -1;
5211
8.46k
        c.ref_seq_start = 0x454f46; // "EOF"
5212
8.46k
        c.ref_seq_span = 0;
5213
8.46k
        c.record_counter = 0;
5214
8.46k
        c.num_bases = 0;
5215
8.46k
        c.num_blocks = 1;
5216
8.46k
        int32_t land[1] = {0};
5217
8.46k
        c.landmark = land;
5218
5219
        // An empty compression header block with
5220
        //   method raw (0)
5221
        //   type comp header (1)
5222
        //   content id 0
5223
        //   block contents size 6
5224
        //   raw size 6
5225
        //     empty preservation map (01 00)
5226
        //     empty data series map (01 00)
5227
        //     empty tag map (01 00)
5228
        //   block CRC
5229
8.46k
        cram_block_compression_hdr ch;
5230
8.46k
        memset(&ch, 0, sizeof(ch));
5231
8.46k
        c.comp_hdr_block = cram_encode_compression_header(fd, &c, &ch, 0);
5232
5233
8.46k
        c.length = c.comp_hdr_block->byte            // Landmark[0]
5234
8.46k
            + 5                                      // block struct
5235
8.46k
            + 4*(CRAM_MAJOR_VERS(fd->version) >= 3); // CRC
5236
8.46k
        if (cram_write_container(fd, &c) < 0 ||
5237
8.46k
            cram_write_block(fd, c.comp_hdr_block) < 0) {
5238
0
            cram_close(fd);
5239
0
            cram_free_block(c.comp_hdr_block);
5240
0
            return -1;
5241
0
        }
5242
8.46k
        if (ch.preservation_map)
5243
0
            kh_destroy(map, ch.preservation_map);
5244
8.46k
        cram_free_block(c.comp_hdr_block);
5245
5246
        // V2.1 bytes
5247
        // 0b 00 00 00 ff ff ff ff 0f // Cont HDR: size, ref seq id
5248
        // e0 45 4f 46 00 00 00       // Cont HDR: pos, span, nrec, counter
5249
        // 00 01 00                   // Cont HDR: nbase, nblk, landmark
5250
        // 00 01 00 06 06             // Comp.HDR blk
5251
        // 01 00 01 00 01 00          // Comp.HDR blk
5252
5253
        // V3.0 bytes:
5254
        // 0f 00 00 00 ff ff ff ff 0f // Cont HDR: size, ref seq id
5255
        // e0 45 4f 46 00 00 00       // Cont HDR: pos, span, nrec, counter
5256
        // 00 01 00                   // Cont HDR: nbase, nblk, landmark
5257
        // 05 bd d9 4f                // CRC32
5258
        // 00 01 00 06 06             // Comp.HDR blk
5259
        // 01 00 01 00 01 00          // Comp.HDR blk
5260
        // ee 63 01 4b                // CRC32
5261
8.46k
    }
5262
5263
8.46k
    return 0;
5264
8.46k
}
5265
5266
/*
5267
 * Closes a CRAM file.
5268
 * Returns 0 on success
5269
 *        -1 on failure
5270
 */
5271
16.0k
int cram_close(cram_fd *fd) {
5272
16.0k
    bam_list *bl, *next;
5273
16.0k
    int i, ret = 0;
5274
5275
16.0k
    if (!fd)
5276
0
        return -1;
5277
5278
16.0k
    if (fd->mode == 'w' && fd->ctr) {
5279
2.62k
        if(fd->ctr->slice)
5280
2.62k
            cram_update_curr_slice(fd->ctr, fd->version);
5281
5282
2.62k
        if (-1 == cram_flush_container_mt(fd, fd->ctr))
5283
304
            ret = -1;
5284
2.62k
    }
5285
5286
16.0k
    if (fd->mode != 'w')
5287
7.26k
        cram_drain_rqueue(fd);
5288
5289
16.0k
    if (fd->pool && fd->eof >= 0 && fd->rqueue) {
5290
0
        hts_tpool_process_flush(fd->rqueue);
5291
5292
0
        if (0 != cram_flush_result(fd))
5293
0
            ret = -1;
5294
5295
0
        if (fd->mode == 'w')
5296
0
            fd->ctr = NULL; // prevent double freeing
5297
5298
        //fprintf(stderr, "CRAM: destroy queue %p\n", fd->rqueue);
5299
5300
0
        hts_tpool_process_destroy(fd->rqueue);
5301
0
    }
5302
5303
16.0k
    pthread_mutex_destroy(&fd->metrics_lock);
5304
16.0k
    pthread_mutex_destroy(&fd->ref_lock);
5305
16.0k
    pthread_mutex_destroy(&fd->range_lock);
5306
16.0k
    pthread_mutex_destroy(&fd->bam_list_lock);
5307
5308
16.0k
    if (ret == 0 && fd->mode == 'w') {
5309
        /* Write EOF block */
5310
8.46k
        if (0 != cram_write_eof_block(fd))
5311
0
            ret = -1;
5312
8.46k
    }
5313
5314
18.3k
    for (bl = fd->bl; bl; bl = next) {
5315
2.33k
        next = bl->next;
5316
2.33k
        free_bam_list(bl->bams, bl->nbams);
5317
2.33k
        free(bl);
5318
2.33k
    }
5319
5320
16.0k
    if (hclose(fd->fp) != 0)
5321
0
        ret = -1;
5322
5323
16.0k
    if (fd->file_def)
5324
16.0k
        cram_free_file_def(fd->file_def);
5325
5326
16.0k
    if (fd->header)
5327
12.7k
        sam_hdr_destroy(fd->header);
5328
5329
16.0k
    free(fd->prefix);
5330
5331
16.0k
    if (fd->ctr)
5332
7.72k
        cram_free_container(fd->ctr);
5333
5334
16.0k
    if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
5335
38
        cram_free_container(fd->ctr_mt);
5336
5337
16.0k
    if (fd->refs)
5338
16.0k
        refs_free(fd->refs);
5339
16.0k
    if (fd->ref_free)
5340
0
        free(fd->ref_free);
5341
5342
769k
    for (i = 0; i < DS_END; i++)
5343
753k
        if (fd->m[i])
5344
753k
            free(fd->m[i]);
5345
5346
16.0k
    if (fd->tags_used) {
5347
16.0k
        khint_t k;
5348
5349
20.4k
        for (k = kh_begin(fd->tags_used); k != kh_end(fd->tags_used); k++) {
5350
4.36k
            if (kh_exist(fd->tags_used, k))
5351
1.94k
                free(kh_val(fd->tags_used, k));
5352
4.36k
        }
5353
5354
16.0k
        kh_destroy(m_metrics, fd->tags_used);
5355
16.0k
    }
5356
5357
16.0k
    if (fd->index)
5358
0
        cram_index_free(fd);
5359
5360
16.0k
    if (fd->own_pool && fd->pool)
5361
0
        hts_tpool_destroy(fd->pool);
5362
5363
16.0k
    if (fd->idxfp)
5364
0
        if (bgzf_close(fd->idxfp) < 0)
5365
0
            ret = -1;
5366
5367
16.0k
    free(fd);
5368
5369
16.0k
    return ret;
5370
16.0k
}
5371
5372
/*
5373
 * Returns 1 if we hit an EOF while reading.
5374
 */
5375
12.7k
int cram_eof(cram_fd *fd) {
5376
12.7k
    return fd->eof;
5377
12.7k
}
5378
5379
5380
/*
5381
 * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
5382
 * Use this immediately after opening.
5383
 *
5384
 * Returns 0 on success
5385
 *        -1 on failure
5386
 */
5387
7.26k
int cram_set_option(cram_fd *fd, enum hts_fmt_option opt, ...) {
5388
7.26k
    int r;
5389
7.26k
    va_list args;
5390
5391
7.26k
    va_start(args, opt);
5392
7.26k
    r = cram_set_voption(fd, opt, args);
5393
7.26k
    va_end(args);
5394
5395
7.26k
    return r;
5396
7.26k
}
5397
5398
/*
5399
 * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
5400
 * Use this immediately after opening.
5401
 *
5402
 * Returns 0 on success
5403
 *        -1 on failure
5404
 */
5405
7.26k
int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) {
5406
7.26k
    refs_t *refs;
5407
5408
7.26k
    if (!fd) {
5409
0
        errno = EBADF;
5410
0
        return -1;
5411
0
    }
5412
5413
7.26k
    switch (opt) {
5414
7.26k
    case CRAM_OPT_DECODE_MD:
5415
7.26k
        fd->decode_md = va_arg(args, int);
5416
7.26k
        break;
5417
5418
0
    case CRAM_OPT_PREFIX:
5419
0
        if (fd->prefix)
5420
0
            free(fd->prefix);
5421
0
        if (!(fd->prefix = strdup(va_arg(args, char *))))
5422
0
            return -1;
5423
0
        break;
5424
5425
0
    case CRAM_OPT_VERBOSITY:
5426
0
        break;
5427
5428
0
    case CRAM_OPT_SEQS_PER_SLICE:
5429
0
        fd->seqs_per_slice = va_arg(args, int);
5430
0
        if (fd->bases_per_slice == BASES_PER_SLICE)
5431
0
            fd->bases_per_slice = fd->seqs_per_slice * 500;
5432
0
        break;
5433
5434
0
    case CRAM_OPT_BASES_PER_SLICE:
5435
0
        fd->bases_per_slice = va_arg(args, int);
5436
0
        break;
5437
5438
0
    case CRAM_OPT_SLICES_PER_CONTAINER:
5439
0
        fd->slices_per_container = va_arg(args, int);
5440
0
        break;
5441
5442
0
    case CRAM_OPT_EMBED_REF:
5443
0
        fd->embed_ref = va_arg(args, int);
5444
0
        break;
5445
5446
0
    case CRAM_OPT_NO_REF:
5447
0
        fd->no_ref = va_arg(args, int);
5448
0
        break;
5449
5450
0
    case CRAM_OPT_POS_DELTA:
5451
0
        fd->ap_delta = va_arg(args, int);
5452
0
        break;
5453
5454
0
    case CRAM_OPT_IGNORE_MD5:
5455
0
        fd->ignore_md5 = va_arg(args, int);
5456
0
        break;
5457
5458
0
    case CRAM_OPT_LOSSY_NAMES:
5459
0
        fd->lossy_read_names = va_arg(args, int);
5460
        // Currently lossy read names required paired (attached) reads.
5461
        // TLEN 0 or being 1 out causes read pairs to be detached, breaking
5462
        // the lossy read name compression, so we have extra options to
5463
        // slacken the exact TLEN round-trip checks.
5464
0
        fd->tlen_approx = fd->lossy_read_names;
5465
0
        fd->tlen_zero = fd->lossy_read_names;
5466
0
        break;
5467
5468
0
    case CRAM_OPT_USE_BZIP2:
5469
0
        fd->use_bz2 = va_arg(args, int);
5470
0
        break;
5471
5472
0
    case CRAM_OPT_USE_RANS:
5473
0
        fd->use_rans = va_arg(args, int);
5474
0
        break;
5475
5476
0
    case CRAM_OPT_USE_TOK:
5477
0
        fd->use_tok = va_arg(args, int);
5478
0
        break;
5479
5480
0
    case CRAM_OPT_USE_FQZ:
5481
0
        fd->use_fqz = va_arg(args, int);
5482
0
        break;
5483
5484
0
    case CRAM_OPT_USE_ARITH:
5485
0
        fd->use_arith = va_arg(args, int);
5486
0
        break;
5487
5488
0
    case CRAM_OPT_USE_LZMA:
5489
0
        fd->use_lzma = va_arg(args, int);
5490
0
        break;
5491
5492
0
    case CRAM_OPT_SHARED_REF:
5493
0
        fd->shared_ref = 1;
5494
0
        refs = va_arg(args, refs_t *);
5495
0
        if (refs != fd->refs) {
5496
0
            if (fd->refs)
5497
0
                refs_free(fd->refs);
5498
0
            fd->refs = refs;
5499
0
            fd->refs->count++;
5500
0
        }
5501
0
        break;
5502
5503
0
    case CRAM_OPT_RANGE: {
5504
0
        int r = cram_seek_to_refpos(fd, va_arg(args, cram_range *));
5505
0
        pthread_mutex_lock(&fd->range_lock);
5506
//        printf("opt range noseek to %p %d:%ld-%ld\n",
5507
//               fd, fd->range.refid, fd->range.start, fd->range.end);
5508
0
        if (fd->range.refid != -2)
5509
0
            fd->required_fields |= SAM_POS;
5510
0
        pthread_mutex_unlock(&fd->range_lock);
5511
0
        return r;
5512
0
    }
5513
5514
0
    case CRAM_OPT_RANGE_NOSEEK: {
5515
        // As per CRAM_OPT_RANGE, but no seeking
5516
0
        pthread_mutex_lock(&fd->range_lock);
5517
0
        cram_range *r = va_arg(args, cram_range *);
5518
0
        fd->range = *r;
5519
0
        if (r->refid == HTS_IDX_NOCOOR) {
5520
0
            fd->range.refid = -1;
5521
0
            fd->range.start = 0;
5522
0
        } else if (r->refid == HTS_IDX_START || r->refid == HTS_IDX_REST) {
5523
0
            fd->range.refid = -2; // special case in cram_next_slice
5524
0
        }
5525
0
        if (fd->range.refid != -2)
5526
0
            fd->required_fields |= SAM_POS;
5527
0
        fd->ooc = 0;
5528
0
        fd->eof = 0;
5529
0
        pthread_mutex_unlock(&fd->range_lock);
5530
0
        return 0;
5531
0
    }
5532
5533
0
    case CRAM_OPT_REFERENCE:
5534
0
        return cram_load_reference(fd, va_arg(args, char *));
5535
5536
0
    case CRAM_OPT_VERSION: {
5537
0
        int major, minor;
5538
0
        char *s = va_arg(args, char *);
5539
0
        if (2 != sscanf(s, "%d.%d", &major, &minor)) {
5540
0
            hts_log_error("Malformed version string %s", s);
5541
0
            return -1;
5542
0
        }
5543
0
        if (!((major == 1 &&  minor == 0) ||
5544
0
              (major == 2 && (minor == 0 || minor == 1)) ||
5545
0
              (major == 3 && (minor == 0 || minor == 1)))) {
5546
0
            hts_log_error("Unknown version string; use 1.0, 2.0, 2.1, "
5547
0
                          "3.0 or 3.1");
5548
0
            errno = EINVAL;
5549
0
            return -1;
5550
0
        }
5551
5552
0
        fd->version = major*256 + minor;
5553
5554
0
        fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3) ? 1 : 0;
5555
5556
0
        fd->use_tok = (CRAM_MAJOR_VERS(fd->version) == 3 &&
5557
0
                       CRAM_MINOR_VERS(fd->version) >= 1);
5558
0
        cram_init_tables(fd);
5559
5560
0
        break;
5561
0
    }
5562
5563
0
    case CRAM_OPT_MULTI_SEQ_PER_SLICE:
5564
0
        fd->multi_seq_user = fd->multi_seq = va_arg(args, int);
5565
0
        break;
5566
5567
0
    case CRAM_OPT_NTHREADS: {
5568
0
        int nthreads =  va_arg(args, int);
5569
0
        if (fd->pool)
5570
0
            return -2;  //already exists!
5571
0
        if (nthreads >= 1) {
5572
0
            if (!(fd->pool = hts_tpool_init(nthreads)))
5573
0
                return -1;
5574
5575
0
            fd->rqueue = hts_tpool_process_init(fd->pool, nthreads*2, 0);
5576
0
            fd->shared_ref = 1;
5577
0
            fd->own_pool = 1;
5578
0
        }
5579
0
        break;
5580
0
    }
5581
5582
0
    case CRAM_OPT_THREAD_POOL: {
5583
0
        htsThreadPool *p = va_arg(args, htsThreadPool *);
5584
0
        if (fd->pool)
5585
0
            return -2;  //already exists!
5586
0
        fd->pool = p ? p->pool : NULL;
5587
0
        if (fd->pool) {
5588
0
            fd->rqueue = hts_tpool_process_init(fd->pool,
5589
0
                                                p->qsize ? p->qsize : hts_tpool_size(fd->pool)*2,
5590
0
                                                0);
5591
0
        }
5592
0
        fd->shared_ref = 1; // Needed to avoid clobbering ref between threads
5593
0
        fd->own_pool = 0;
5594
5595
        //fd->qsize = 1;
5596
        //fd->decoded = calloc(fd->qsize, sizeof(cram_container *));
5597
        //hts_tpool_dispatch(fd->pool, cram_decoder_thread, fd);
5598
0
        break;
5599
0
    }
5600
5601
0
    case CRAM_OPT_REQUIRED_FIELDS:
5602
0
        fd->required_fields = va_arg(args, int);
5603
0
        if (fd->range.refid != -2)
5604
0
            fd->required_fields |= SAM_POS;
5605
0
        break;
5606
5607
0
    case CRAM_OPT_STORE_MD:
5608
0
        fd->store_md = va_arg(args, int);
5609
0
        break;
5610
5611
0
    case CRAM_OPT_STORE_NM:
5612
0
        fd->store_nm = va_arg(args, int);
5613
0
        break;
5614
5615
0
    case HTS_OPT_COMPRESSION_LEVEL:
5616
0
        fd->level = va_arg(args, int);
5617
0
        break;
5618
5619
0
    case HTS_OPT_PROFILE: {
5620
0
        enum hts_profile_option prof = va_arg(args, int);
5621
0
        switch (prof) {
5622
0
        case HTS_PROFILE_FAST:
5623
0
            if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 1;
5624
0
            fd->use_tok = 0;
5625
0
            fd->seqs_per_slice = 10000;
5626
0
            break;
5627
5628
0
        case HTS_PROFILE_NORMAL:
5629
0
            break;
5630
5631
0
        case HTS_PROFILE_SMALL:
5632
0
            if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 6;
5633
0
            fd->use_bz2 = 1;
5634
0
            fd->use_fqz = 1;
5635
0
            fd->seqs_per_slice = 25000;
5636
0
            break;
5637
5638
0
        case HTS_PROFILE_ARCHIVE:
5639
0
            if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 7;
5640
0
            fd->use_bz2 = 1;
5641
0
            fd->use_fqz = 1;
5642
0
            fd->use_arith = 1;
5643
0
            if (fd->level > 7)
5644
0
                fd->use_lzma = 1;
5645
0
            fd->seqs_per_slice = 100000;
5646
0
            break;
5647
0
        }
5648
5649
0
        if (fd->bases_per_slice == BASES_PER_SLICE)
5650
0
            fd->bases_per_slice = fd->seqs_per_slice * 500;
5651
0
        break;
5652
0
    }
5653
5654
0
    default:
5655
0
        hts_log_error("Unknown CRAM option code %d", opt);
5656
0
        errno = EINVAL;
5657
0
        return -1;
5658
7.26k
    }
5659
5660
7.26k
    return 0;
5661
7.26k
}
5662
5663
int cram_check_EOF(cram_fd *fd)
5664
0
{
5665
    // Byte 9 in these templates is & with 0x0f to resolve differences
5666
    // between ITF-8 interpretations between early Java and C
5667
    // implementations of CRAM
5668
0
    static const unsigned char TEMPLATE_2_1[30] = {
5669
0
        0x0b, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0f, 0xe0,
5670
0
        0x45, 0x4f, 0x46, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
5671
0
        0x01, 0x00, 0x06, 0x06, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00
5672
0
    };
5673
0
    static const unsigned char TEMPLATE_3[38] = {
5674
0
        0x0f, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0f, 0xe0,
5675
0
        0x45, 0x4f, 0x46, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x05,
5676
0
        0xbd, 0xd9, 0x4f, 0x00, 0x01, 0x00, 0x06, 0x06, 0x01, 0x00,
5677
0
        0x01, 0x00, 0x01, 0x00, 0xee, 0x63, 0x01, 0x4b
5678
0
    };
5679
5680
0
    unsigned char buf[38]; // max(sizeof TEMPLATE_*)
5681
5682
0
    uint8_t major = CRAM_MAJOR_VERS(fd->version);
5683
0
    uint8_t minor = CRAM_MINOR_VERS(fd->version);
5684
5685
0
    const unsigned char *template;
5686
0
    ssize_t template_len;
5687
0
    if ((major < 2) ||
5688
0
        (major == 2 && minor == 0)) {
5689
0
        return 3; // No EOF support in cram versions less than 2.1
5690
0
    } else if (major == 2 && minor == 1) {
5691
0
        template = TEMPLATE_2_1;
5692
0
        template_len = sizeof TEMPLATE_2_1;
5693
0
    } else {
5694
0
        template = TEMPLATE_3;
5695
0
        template_len = sizeof TEMPLATE_3;
5696
0
    }
5697
5698
0
    off_t offset = htell(fd->fp);
5699
0
    if (hseek(fd->fp, -template_len, SEEK_END) < 0) {
5700
0
        if (errno == ESPIPE) {
5701
0
            hclearerr(fd->fp);
5702
0
            return 2;
5703
0
        }
5704
0
        else {
5705
0
            return -1;
5706
0
        }
5707
0
    }
5708
0
    if (hread(fd->fp, buf, template_len) != template_len) return -1;
5709
0
    if (hseek(fd->fp, offset, SEEK_SET) < 0) return -1;
5710
0
    buf[8] &= 0x0f;
5711
0
    return (memcmp(template, buf, template_len) == 0)? 1 : 0;
5712
0
}