Coverage Report

Created: 2025-07-11 06:53

/src/htslib/cram/cram_io.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
Copyright (c) 2012-2025 Genome Research Ltd.
3
Author: James Bonfield <jkb@sanger.ac.uk>
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
   1. Redistributions of source code must retain the above copyright notice,
9
this list of conditions and the following disclaimer.
10
11
   2. Redistributions in binary form must reproduce the above copyright notice,
12
this list of conditions and the following disclaimer in the documentation
13
and/or other materials provided with the distribution.
14
15
   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16
Institute nor the names of its contributors may be used to endorse or promote
17
products derived from this software without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
/*
32
 * CRAM I/O primitives.
33
 *
34
 * - ITF8 encoding and decoding.
35
 * - Block based I/O
36
 * - Zlib inflating and deflating (memory)
37
 * - CRAM basic data structure reading and writing
38
 * - File opening / closing
39
 * - Reference sequence handling
40
 */
41
42
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
43
#include <config.h>
44
45
#include <stdio.h>
46
#include <errno.h>
47
#include <assert.h>
48
#include <stdlib.h>
49
#include <string.h>
50
#include <unistd.h>
51
#include <zlib.h>
52
#ifdef HAVE_LIBBZ2
53
#include <bzlib.h>
54
#endif
55
#ifdef HAVE_LIBLZMA
56
#ifdef HAVE_LZMA_H
57
#include <lzma.h>
58
#else
59
#include "../os/lzma_stub.h"
60
#endif
61
#endif
62
#include <sys/types.h>
63
#include <sys/stat.h>
64
#include <math.h>
65
#include <stdint.h>
66
67
#ifdef HAVE_LIBDEFLATE
68
#include <libdeflate.h>
69
#define crc32(a,b,c) libdeflate_crc32((a),(b),(c))
70
#endif
71
72
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
73
#include "../fuzz_settings.h"
74
#endif
75
76
#include "cram.h"
77
#include "os.h"
78
#include "../htslib/hts.h"
79
#include "../hts_internal.h"
80
#include "open_trace_file.h"
81
82
#if defined(HAVE_EXTERNAL_LIBHTSCODECS)
83
#include <htscodecs/rANS_static.h>
84
#include <htscodecs/rANS_static4x16.h>
85
#include <htscodecs/arith_dynamic.h>
86
#include <htscodecs/tokenise_name3.h>
87
#include <htscodecs/fqzcomp_qual.h>
88
#include <htscodecs/varint.h> // CRAM v4.0 variable-size integers
89
#else
90
#include "../htscodecs/htscodecs/rANS_static.h"
91
#include "../htscodecs/htscodecs/rANS_static4x16.h"
92
#include "../htscodecs/htscodecs/arith_dynamic.h"
93
#include "../htscodecs/htscodecs/tokenise_name3.h"
94
#include "../htscodecs/htscodecs/fqzcomp_qual.h"
95
#include "../htscodecs/htscodecs/varint.h"
96
#endif
97
98
//#define REF_DEBUG
99
100
#ifdef REF_DEBUG
101
#include <sys/syscall.h>
102
#define gettid() (int)syscall(SYS_gettid)
103
104
#define RP(...) fprintf (stderr, __VA_ARGS__)
105
#else
106
#define RP(...)
107
#endif
108
109
#include "../htslib/hfile.h"
110
#include "../htslib/bgzf.h"
111
#include "../htslib/faidx.h"
112
#include "../hts_internal.h"
113
114
#ifndef PATH_MAX
115
#define PATH_MAX FILENAME_MAX
116
#endif
117
118
908k
#define TRIAL_SPAN 70
119
908k
#define NTRIALS 3
120
121
18.6k
#define CRAM_DEFAULT_LEVEL 5
122
123
/* ----------------------------------------------------------------------
124
 * ITF8 encoding and decoding.
125
 *
126
 * Also see the itf8_get and itf8_put macros in cram_io.h
127
 */
128
129
/*
130
 * LEGACY: consider using itf8_decode_crc.
131
 *
132
 * Reads an integer in ITF-8 encoding from 'cp' and stores it in
133
 * *val.
134
 *
135
 * Returns the number of bytes read on success
136
 *        -1 on failure
137
 */
138
0
int itf8_decode(cram_fd *fd, int32_t *val_p) {
139
0
    static int nbytes[16] = {
140
0
        0,0,0,0, 0,0,0,0,                               // 0000xxxx - 0111xxxx
141
0
        1,1,1,1,                                        // 1000xxxx - 1011xxxx
142
0
        2,2,                                            // 1100xxxx - 1101xxxx
143
0
        3,                                              // 1110xxxx
144
0
        4,                                              // 1111xxxx
145
0
    };
146
147
0
    static int nbits[16] = {
148
0
        0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
149
0
        0x3f, 0x3f, 0x3f, 0x3f,                         // 1000xxxx - 1011xxxx
150
0
        0x1f, 0x1f,                                     // 1100xxxx - 1101xxxx
151
0
        0x0f,                                           // 1110xxxx
152
0
        0x0f,                                           // 1111xxxx
153
0
    };
154
155
0
    int32_t val = hgetc(fd->fp);
156
0
    if (val == -1)
157
0
        return -1;
158
159
0
    int i = nbytes[val>>4];
160
0
    val &= nbits[val>>4];
161
162
0
    switch(i) {
163
0
    case 0:
164
0
        *val_p = val;
165
0
        return 1;
166
167
0
    case 1:
168
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
169
0
        *val_p = val;
170
0
        return 2;
171
172
0
    case 2:
173
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
174
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
175
0
        *val_p = val;
176
0
        return 3;
177
178
0
    case 3:
179
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
180
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
181
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
182
0
        *val_p = val;
183
0
        return 4;
184
185
0
    case 4: // really 3.5 more, why make it different?
186
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
187
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
188
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
189
0
        val = (val<<4) | (((unsigned char)hgetc(fd->fp)) & 0x0f);
190
0
        *val_p = val;
191
0
    }
192
193
0
    return 5;
194
0
}
195
196
242k
int itf8_decode_crc(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
197
242k
    static int nbytes[16] = {
198
242k
        0,0,0,0, 0,0,0,0,                               // 0000xxxx - 0111xxxx
199
242k
        1,1,1,1,                                        // 1000xxxx - 1011xxxx
200
242k
        2,2,                                            // 1100xxxx - 1101xxxx
201
242k
        3,                                              // 1110xxxx
202
242k
        4,                                              // 1111xxxx
203
242k
    };
204
205
242k
    static int nbits[16] = {
206
242k
        0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
207
242k
        0x3f, 0x3f, 0x3f, 0x3f,                         // 1000xxxx - 1011xxxx
208
242k
        0x1f, 0x1f,                                     // 1100xxxx - 1101xxxx
209
242k
        0x0f,                                           // 1110xxxx
210
242k
        0x0f,                                           // 1111xxxx
211
242k
    };
212
242k
    unsigned char c[5];
213
214
242k
    int32_t val = hgetc(fd->fp);
215
242k
    if (val == -1)
216
349
        return -1;
217
218
242k
    c[0]=val;
219
220
242k
    int i = nbytes[val>>4];
221
242k
    val &= nbits[val>>4];
222
223
242k
    if (i > 0) {
224
16.6k
        if (hread(fd->fp, &c[1], i) < i)
225
111
            return -1;
226
16.6k
    }
227
228
242k
    switch(i) {
229
225k
    case 0:
230
225k
        *val_p = val;
231
225k
        *crc = crc32(*crc, c, 1);
232
225k
        return 1;
233
234
7.35k
    case 1:
235
7.35k
        val = (val<<8) | c[1];
236
7.35k
        *val_p = val;
237
7.35k
        *crc = crc32(*crc, c, 2);
238
7.35k
        return 2;
239
240
3.72k
    case 2:
241
3.72k
        val = (val<<8) | c[1];
242
3.72k
        val = (val<<8) | c[2];
243
3.72k
        *val_p = val;
244
3.72k
        *crc = crc32(*crc, c, 3);
245
3.72k
        return 3;
246
247
1.15k
    case 3:
248
1.15k
        val = (val<<8) | c[1];
249
1.15k
        val = (val<<8) | c[2];
250
1.15k
        val = (val<<8) | c[3];
251
1.15k
        *val_p = val;
252
1.15k
        *crc = crc32(*crc, c, 4);
253
1.15k
        return 4;
254
255
4.27k
    case 4: // really 3.5 more, why make it different?
256
4.27k
        {
257
4.27k
            uint32_t uv = val;
258
4.27k
            uv = (uv<<8) |  c[1];
259
4.27k
            uv = (uv<<8) |  c[2];
260
4.27k
            uv = (uv<<8) |  c[3];
261
4.27k
            uv = (uv<<4) | (c[4] & 0x0f);
262
            // Avoid implementation-defined behaviour on negative values
263
4.27k
            *val_p = uv < 0x80000000UL ? (int32_t) uv : -((int32_t) (0xffffffffUL - uv)) - 1;
264
4.27k
            *crc = crc32(*crc, c, 5);
265
4.27k
        }
266
242k
    }
267
268
4.27k
    return 5;
269
242k
}
270
271
/*
272
 * Stores a value to memory in ITF-8 format.
273
 *
274
 * Returns the number of bytes required to store the number.
275
 * This is a maximum of 5 bytes.
276
 */
277
11.0M
static inline int itf8_put(char *cp, int32_t val) {
278
11.0M
    unsigned char *up = (unsigned char *)cp;
279
11.0M
    if        (!(val & ~0x00000007f)) { // 1 byte
280
10.1M
        *up = val;
281
10.1M
        return 1;
282
10.1M
    } else if (!(val & ~0x00003fff)) { // 2 byte
283
314k
        *up++ = (val >> 8 ) | 0x80;
284
314k
        *up   = val & 0xff;
285
314k
        return 2;
286
583k
    } else if (!(val & ~0x01fffff)) { // 3 byte
287
7.67k
        *up++ = (val >> 16) | 0xc0;
288
7.67k
        *up++ = (val >> 8 ) & 0xff;
289
7.67k
        *up   = val & 0xff;
290
7.67k
        return 3;
291
575k
    } else if (!(val & ~0x0fffffff)) { // 4 byte
292
401k
        *up++ = (val >> 24) | 0xe0;
293
401k
        *up++ = (val >> 16) & 0xff;
294
401k
        *up++ = (val >> 8 ) & 0xff;
295
401k
        *up   = val & 0xff;
296
401k
        return 4;
297
401k
    } else {                           // 5 byte
298
174k
        *up++ = 0xf0 | ((val>>28) & 0xff);
299
174k
        *up++ = (val >> 20) & 0xff;
300
174k
        *up++ = (val >> 12) & 0xff;
301
174k
        *up++ = (val >> 4 ) & 0xff;
302
174k
        *up = val & 0x0f;
303
174k
        return 5;
304
174k
    }
305
11.0M
}
306
307
308
/* 64-bit itf8 variant */
309
141k
static inline int ltf8_put(char *cp, int64_t val) {
310
141k
    unsigned char *up = (unsigned char *)cp;
311
141k
    if        (!(val & ~((1LL<<7)-1))) {
312
95.2k
        *up = val;
313
95.2k
        return 1;
314
95.2k
    } else if (!(val & ~((1LL<<(6+8))-1))) {
315
45.8k
        *up++ = (val >> 8 ) | 0x80;
316
45.8k
        *up   = val & 0xff;
317
45.8k
        return 2;
318
45.8k
    } else if (!(val & ~((1LL<<(5+2*8))-1))) {
319
361
        *up++ = (val >> 16) | 0xc0;
320
361
        *up++ = (val >> 8 ) & 0xff;
321
361
        *up   = val & 0xff;
322
361
        return 3;
323
361
    } else if (!(val & ~((1LL<<(4+3*8))-1))) {
324
16
        *up++ = (val >> 24) | 0xe0;
325
16
        *up++ = (val >> 16) & 0xff;
326
16
        *up++ = (val >> 8 ) & 0xff;
327
16
        *up   = val & 0xff;
328
16
        return 4;
329
16
    } else if (!(val & ~((1LL<<(3+4*8))-1))) {
330
0
        *up++ = (val >> 32) | 0xf0;
331
0
        *up++ = (val >> 24) & 0xff;
332
0
        *up++ = (val >> 16) & 0xff;
333
0
        *up++ = (val >> 8 ) & 0xff;
334
0
        *up   = val & 0xff;
335
0
        return 5;
336
0
    } else if (!(val & ~((1LL<<(2+5*8))-1))) {
337
0
        *up++ = (val >> 40) | 0xf8;
338
0
        *up++ = (val >> 32) & 0xff;
339
0
        *up++ = (val >> 24) & 0xff;
340
0
        *up++ = (val >> 16) & 0xff;
341
0
        *up++ = (val >> 8 ) & 0xff;
342
0
        *up   = val & 0xff;
343
0
        return 6;
344
0
    } else if (!(val & ~((1LL<<(1+6*8))-1))) {
345
0
        *up++ = (val >> 48) | 0xfc;
346
0
        *up++ = (val >> 40) & 0xff;
347
0
        *up++ = (val >> 32) & 0xff;
348
0
        *up++ = (val >> 24) & 0xff;
349
0
        *up++ = (val >> 16) & 0xff;
350
0
        *up++ = (val >> 8 ) & 0xff;
351
0
        *up   = val & 0xff;
352
0
        return 7;
353
0
    } else if (!(val & ~((1LL<<(7*8))-1))) {
354
0
        *up++ = (val >> 56) | 0xfe;
355
0
        *up++ = (val >> 48) & 0xff;
356
0
        *up++ = (val >> 40) & 0xff;
357
0
        *up++ = (val >> 32) & 0xff;
358
0
        *up++ = (val >> 24) & 0xff;
359
0
        *up++ = (val >> 16) & 0xff;
360
0
        *up++ = (val >> 8 ) & 0xff;
361
0
        *up   = val & 0xff;
362
0
        return 8;
363
0
    } else {
364
0
        *up++ = 0xff;
365
0
        *up++ = (val >> 56) & 0xff;
366
0
        *up++ = (val >> 48) & 0xff;
367
0
        *up++ = (val >> 40) & 0xff;
368
0
        *up++ = (val >> 32) & 0xff;
369
0
        *up++ = (val >> 24) & 0xff;
370
0
        *up++ = (val >> 16) & 0xff;
371
0
        *up++ = (val >> 8 ) & 0xff;
372
0
        *up   = val & 0xff;
373
0
        return 9;
374
0
    }
375
141k
}
376
377
/*
378
 * Encodes and writes a single integer in ITF-8 format.
379
 * Returns 0 on success
380
 *        -1 on failure
381
 */
382
0
int itf8_encode(cram_fd *fd, int32_t val) {
383
0
    char buf[5];
384
0
    int len = itf8_put(buf, val);
385
0
    return hwrite(fd->fp, buf, len) == len ? 0 : -1;
386
0
}
387
388
const int itf8_bytes[16] = {
389
    1, 1, 1, 1,  1, 1, 1, 1,
390
    2, 2, 2, 2,  3, 3, 4, 5
391
};
392
393
const int ltf8_bytes[256] = {
394
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
395
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
396
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
397
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
398
399
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
400
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
401
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
402
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
403
404
    2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
405
    2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
406
    2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
407
    2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
408
409
    3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,
410
    3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,
411
412
    4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
413
414
    5, 5, 5, 5,  5, 5, 5, 5,  6, 6, 6, 6,  7, 7, 8, 9
415
};
416
417
/*
418
 * LEGACY: consider using ltf8_decode_crc.
419
 */
420
0
int ltf8_decode(cram_fd *fd, int64_t *val_p) {
421
0
    int c = hgetc(fd->fp);
422
0
    int64_t val = (unsigned char)c;
423
0
    if (c == -1)
424
0
        return -1;
425
426
0
    if (val < 0x80) {
427
0
        *val_p =   val;
428
0
        return 1;
429
430
0
    } else if (val < 0xc0) {
431
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
432
0
        *val_p = val & (((1LL<<(6+8)))-1);
433
0
        return 2;
434
435
0
    } else if (val < 0xe0) {
436
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
437
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
438
0
        *val_p = val & ((1LL<<(5+2*8))-1);
439
0
        return 3;
440
441
0
    } else if (val < 0xf0) {
442
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
443
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
444
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
445
0
        *val_p = val & ((1LL<<(4+3*8))-1);
446
0
        return 4;
447
448
0
    } else if (val < 0xf8) {
449
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
450
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
451
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
452
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
453
0
        *val_p = val & ((1LL<<(3+4*8))-1);
454
0
        return 5;
455
456
0
    } else if (val < 0xfc) {
457
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
458
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
459
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
460
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
461
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
462
0
        *val_p = val & ((1LL<<(2+5*8))-1);
463
0
        return 6;
464
465
0
    } else if (val < 0xfe) {
466
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
467
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
468
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
469
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
470
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
471
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
472
0
        *val_p = val & ((1LL<<(1+6*8))-1);
473
0
        return 7;
474
475
0
    } else if (val < 0xff) {
476
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
477
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
478
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
479
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
480
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
481
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
482
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
483
0
        *val_p = val & ((1LL<<(7*8))-1);
484
0
        return 8;
485
486
0
    } else {
487
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
488
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
489
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
490
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
491
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
492
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
493
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
494
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
495
0
        *val_p = val;
496
0
    }
497
498
0
    return 9;
499
0
}
500
501
3.55k
int ltf8_decode_crc(cram_fd *fd, int64_t *val_p, uint32_t *crc) {
502
3.55k
    unsigned char c[9];
503
3.55k
    int64_t val = hgetc(fd->fp);
504
3.55k
    if (val < 0)
505
19
        return -1;
506
507
3.54k
    c[0] = val;
508
509
3.54k
    if (val < 0x80) {
510
2.60k
        *val_p =   val;
511
2.60k
        *crc = crc32(*crc, c, 1);
512
2.60k
        return 1;
513
514
2.60k
    } else if (val < 0xc0) {
515
128
        int v = hgetc(fd->fp);
516
128
        if (v < 0)
517
5
            return -1;
518
123
        val = (val<<8) | (c[1]=v);
519
123
        *val_p = val & (((1LL<<(6+8)))-1);
520
123
        *crc = crc32(*crc, c, 2);
521
123
        return 2;
522
523
805
    } else if (val < 0xe0) {
524
38
        if (hread(fd->fp, &c[1], 2) < 2)
525
6
            return -1;
526
32
        val = (val<<8) | c[1];
527
32
        val = (val<<8) | c[2];
528
32
        *val_p = val & ((1LL<<(5+2*8))-1);
529
32
        *crc = crc32(*crc, c, 3);
530
32
        return 3;
531
532
767
    } else if (val < 0xf0) {
533
45
        if (hread(fd->fp, &c[1], 3) < 3)
534
5
            return -1;
535
40
        val = (val<<8) | c[1];
536
40
        val = (val<<8) | c[2];
537
40
        val = (val<<8) | c[3];
538
40
        *val_p = val & ((1LL<<(4+3*8))-1);
539
40
        *crc = crc32(*crc, c, 4);
540
40
        return 4;
541
542
722
    } else if (val < 0xf8) {
543
145
        if (hread(fd->fp, &c[1], 4) < 4)
544
2
            return -1;
545
143
        val = (val<<8) | c[1];
546
143
        val = (val<<8) | c[2];
547
143
        val = (val<<8) | c[3];
548
143
        val = (val<<8) | c[4];
549
143
        *val_p = val & ((1LL<<(3+4*8))-1);
550
143
        *crc = crc32(*crc, c, 5);
551
143
        return 5;
552
553
577
    } else if (val < 0xfc) {
554
356
        if (hread(fd->fp, &c[1], 5) < 5)
555
8
            return -1;
556
348
        val = (val<<8) | c[1];
557
348
        val = (val<<8) | c[2];
558
348
        val = (val<<8) | c[3];
559
348
        val = (val<<8) | c[4];
560
348
        val = (val<<8) | c[5];
561
348
        *val_p = val & ((1LL<<(2+5*8))-1);
562
348
        *crc = crc32(*crc, c, 6);
563
348
        return 6;
564
565
356
    } else if (val < 0xfe) {
566
75
        if (hread(fd->fp, &c[1], 6) < 6)
567
5
            return -1;
568
70
        val = (val<<8) | c[1];
569
70
        val = (val<<8) | c[2];
570
70
        val = (val<<8) | c[3];
571
70
        val = (val<<8) | c[4];
572
70
        val = (val<<8) | c[5];
573
70
        val = (val<<8) | c[6];
574
70
        *val_p = val & ((1LL<<(1+6*8))-1);
575
70
        *crc = crc32(*crc, c, 7);
576
70
        return 7;
577
578
146
    } else if (val < 0xff) {
579
90
        uint64_t uval = val;
580
90
        if (hread(fd->fp, &c[1], 7) < 7)
581
1
            return -1;
582
89
        uval = (uval<<8) | c[1];
583
89
        uval = (uval<<8) | c[2];
584
89
        uval = (uval<<8) | c[3];
585
89
        uval = (uval<<8) | c[4];
586
89
        uval = (uval<<8) | c[5];
587
89
        uval = (uval<<8) | c[6];
588
89
        uval = (uval<<8) | c[7];
589
89
        *val_p = uval & ((1ULL<<(7*8))-1);
590
89
        *crc = crc32(*crc, c, 8);
591
89
        return 8;
592
593
90
    } else {
594
56
        uint64_t uval;
595
56
        if (hread(fd->fp, &c[1], 8) < 8)
596
4
            return -1;
597
52
        uval =             c[1];
598
52
        uval = (uval<<8) | c[2];
599
52
        uval = (uval<<8) | c[3];
600
52
        uval = (uval<<8) | c[4];
601
52
        uval = (uval<<8) | c[5];
602
52
        uval = (uval<<8) | c[6];
603
52
        uval = (uval<<8) | c[7];
604
52
        uval = (uval<<8) | c[8];
605
52
        *crc = crc32(*crc, c, 9);
606
        // Avoid implementation-defined behaviour on negative values
607
52
        *val_p = c[1] < 0x80 ? (int64_t) uval : -((int64_t) (0xffffffffffffffffULL - uval)) - 1;
608
52
    }
609
610
52
    return 9;
611
3.54k
}
612
613
/*
614
 * Pushes a value in ITF8 format onto the end of a block.
615
 * This shouldn't be used for high-volume data as it is not the fastest
616
 * method.
617
 *
618
 * Returns the number of bytes written
619
 */
620
5.95M
int itf8_put_blk(cram_block *blk, int32_t val) {
621
5.95M
    char buf[5];
622
5.95M
    int sz;
623
624
5.95M
    sz = itf8_put(buf, val);
625
5.95M
    BLOCK_APPEND(blk, buf, sz);
626
5.95M
    return sz;
627
628
0
 block_err:
629
0
    return -1;
630
5.95M
}
631
632
0
int ltf8_put_blk(cram_block *blk, int64_t val) {
633
0
    char buf[9];
634
0
    int sz;
635
636
0
    sz = ltf8_put(buf, val);
637
0
    BLOCK_APPEND(blk, buf, sz);
638
0
    return sz;
639
640
0
 block_err:
641
0
    return -1;
642
0
}
643
644
193k
static int64_t safe_itf8_get(char **cp, const char *endp, int *err) {
645
193k
    const unsigned char *up = (unsigned char *)*cp;
646
647
193k
    if (endp && endp - *cp < 5 &&
648
193k
        (*cp >= endp || endp - *cp < itf8_bytes[up[0]>>4])) {
649
14.8k
        if (err) *err = 1;
650
14.8k
        return 0;
651
14.8k
    }
652
653
178k
    if (up[0] < 0x80) {
654
159k
        (*cp)++;
655
159k
        return up[0];
656
159k
    } else if (up[0] < 0xc0) {
657
8.81k
        (*cp)+=2;
658
8.81k
        return ((up[0] <<8) |  up[1])                           & 0x3fff;
659
9.90k
    } else if (up[0] < 0xe0) {
660
2.00k
        (*cp)+=3;
661
2.00k
        return ((up[0]<<16) | (up[1]<< 8) |  up[2])             & 0x1fffff;
662
7.90k
    } else if (up[0] < 0xf0) {
663
2.58k
        (*cp)+=4;
664
2.58k
        uint32_t uv = (((uint32_t)up[0]<<24) | (up[1]<<16) | (up[2]<<8) | up[3]) & 0x0fffffff;
665
2.58k
        return (int32_t)uv;
666
5.31k
    } else {
667
5.31k
        (*cp)+=5;
668
5.31k
        uint32_t uv = (((uint32_t)up[0] & 0x0f)<<28) | (up[1]<<20) | (up[2]<<12) | (up[3]<<4) | (up[4] & 0x0f);
669
5.31k
        return (int32_t)uv;
670
5.31k
    }
671
178k
}
672
673
807
static int64_t safe_ltf8_get(char **cp, const char *endp, int *err) {
674
807
    unsigned char *up = (unsigned char *)*cp;
675
676
807
    if (endp && endp - *cp < 9 &&
677
807
        (*cp >= endp || endp - *cp < ltf8_bytes[up[0]])) {
678
327
        if (err) *err = 1;
679
327
        return 0;
680
327
    }
681
682
480
    if (up[0] < 0x80) {
683
237
        (*cp)++;
684
237
        return up[0];
685
243
    } else if (up[0] < 0xc0) {
686
15
        (*cp)+=2;
687
15
        return (((uint64_t)up[0]<< 8) |
688
15
                 (uint64_t)up[1]) & (((1LL<<(6+8)))-1);
689
228
    } else if (up[0] < 0xe0) {
690
42
        (*cp)+=3;
691
42
        return (((uint64_t)up[0]<<16) |
692
42
                ((uint64_t)up[1]<< 8) |
693
42
                 (uint64_t)up[2]) & ((1LL<<(5+2*8))-1);
694
186
    } else if (up[0] < 0xf0) {
695
36
        (*cp)+=4;
696
36
        return (((uint64_t)up[0]<<24) |
697
36
                ((uint64_t)up[1]<<16) |
698
36
                ((uint64_t)up[2]<< 8) |
699
36
                 (uint64_t)up[3]) & ((1LL<<(4+3*8))-1);
700
150
    } else if (up[0] < 0xf8) {
701
75
        (*cp)+=5;
702
75
        return (((uint64_t)up[0]<<32) |
703
75
                ((uint64_t)up[1]<<24) |
704
75
                ((uint64_t)up[2]<<16) |
705
75
                ((uint64_t)up[3]<< 8) |
706
75
                 (uint64_t)up[4]) & ((1LL<<(3+4*8))-1);
707
75
    } else if (up[0] < 0xfc) {
708
3
        (*cp)+=6;
709
3
        return (((uint64_t)up[0]<<40) |
710
3
                ((uint64_t)up[1]<<32) |
711
3
                ((uint64_t)up[2]<<24) |
712
3
                ((uint64_t)up[3]<<16) |
713
3
                ((uint64_t)up[4]<< 8) |
714
3
                 (uint64_t)up[5]) & ((1LL<<(2+5*8))-1);
715
72
    } else if (up[0] < 0xfe) {
716
6
        (*cp)+=7;
717
6
        return (((uint64_t)up[0]<<48) |
718
6
                ((uint64_t)up[1]<<40) |
719
6
                ((uint64_t)up[2]<<32) |
720
6
                ((uint64_t)up[3]<<24) |
721
6
                ((uint64_t)up[4]<<16) |
722
6
                ((uint64_t)up[5]<< 8) |
723
6
                 (uint64_t)up[6]) & ((1LL<<(1+6*8))-1);
724
66
    } else if (up[0] < 0xff) {
725
3
        (*cp)+=8;
726
3
        return (((uint64_t)up[1]<<48) |
727
3
                ((uint64_t)up[2]<<40) |
728
3
                ((uint64_t)up[3]<<32) |
729
3
                ((uint64_t)up[4]<<24) |
730
3
                ((uint64_t)up[5]<<16) |
731
3
                ((uint64_t)up[6]<< 8) |
732
3
                 (uint64_t)up[7]) & ((1LL<<(7*8))-1);
733
63
    } else {
734
63
        (*cp)+=9;
735
63
        return (((uint64_t)up[1]<<56) |
736
63
                ((uint64_t)up[2]<<48) |
737
63
                ((uint64_t)up[3]<<40) |
738
63
                ((uint64_t)up[4]<<32) |
739
63
                ((uint64_t)up[5]<<24) |
740
63
                ((uint64_t)up[6]<<16) |
741
63
                ((uint64_t)up[7]<< 8) |
742
63
                 (uint64_t)up[8]);
743
63
    }
744
480
}
745
746
// Wrapper for now
747
5.13M
static int safe_itf8_put(char *cp, char *cp_end, int32_t val) {
748
5.13M
    return itf8_put(cp, val);
749
5.13M
}
750
751
141k
static int safe_ltf8_put(char *cp, char *cp_end, int64_t val) {
752
141k
    return ltf8_put(cp, val);
753
141k
}
754
755
1.09M
static int itf8_size(int64_t v) {
756
1.09M
    return ((!((v)&~0x7f))?1:(!((v)&~0x3fff))?2:(!((v)&~0x1fffff))?3:(!((v)&~0xfffffff))?4:5);
757
1.09M
}
758
759
//-----------------------------------------------------------------------------
760
761
// CRAM v4.0 onwards uses a different variable sized integer encoding
762
// that is size agnostic.
763
764
// Local interface to varint.h inline version, so we can use in func ptr.
765
// Note a lot of these use the unsigned interface but take signed int64_t.
766
// This is because the old CRAM ITF8 inteface had signed -1 as unsigned
767
// 0xffffffff.
768
6.18k
static int uint7_size(int64_t v) {
769
6.18k
    return var_size_u64(v);
770
6.18k
}
771
772
2.16k
static int64_t uint7_get_32(char **cp, const char *endp, int *err) {
773
2.16k
    uint32_t val;
774
2.16k
    int nb = var_get_u32((uint8_t *)(*cp), (const uint8_t *)endp, &val);
775
2.16k
    (*cp) += nb;
776
2.16k
    if (!nb && err) *err = 1;
777
2.16k
    return val;
778
2.16k
}
779
780
0
static int64_t sint7_get_32(char **cp, const char *endp, int *err) {
781
0
    int32_t val;
782
0
    int nb = var_get_s32((uint8_t *)(*cp), (const uint8_t *)endp, &val);
783
0
    (*cp) += nb;
784
0
    if (!nb && err) *err = 1;
785
0
    return val;
786
0
}
787
788
12.6M
static int64_t uint7_get_64(char **cp, const char *endp, int *err) {
789
12.6M
    uint64_t val;
790
12.6M
    int nb = var_get_u64((uint8_t *)(*cp), (const uint8_t *)endp, &val);
791
12.6M
    (*cp) += nb;
792
12.6M
    if (!nb && err) *err = 1;
793
12.6M
    return val;
794
12.6M
}
795
796
27
static int64_t sint7_get_64(char **cp, const char *endp, int *err) {
797
27
    int64_t val;
798
27
    int nb = var_get_s64((uint8_t *)(*cp), (const uint8_t *)endp, &val);
799
27
    (*cp) += nb;
800
27
    if (!nb && err) *err = 1;
801
27
    return val;
802
27
}
803
804
0
static int uint7_put_32(char *cp, char *endp, int32_t val) {
805
0
    return var_put_u32((uint8_t *)cp, (uint8_t *)endp, val);
806
0
}
807
808
0
static int sint7_put_32(char *cp, char *endp, int32_t val) {
809
0
    return var_put_s32((uint8_t *)cp, (uint8_t *)endp, val);
810
0
}
811
812
0
static int uint7_put_64(char *cp, char *endp, int64_t val) {
813
0
    return var_put_u64((uint8_t *)cp, (uint8_t *)endp, val);
814
0
}
815
816
0
static int sint7_put_64(char *cp, char *endp, int64_t val) {
817
0
    return var_put_s64((uint8_t *)cp, (uint8_t *)endp, val);
818
0
}
819
820
// Put direct to to cram_block
821
0
static int uint7_put_blk_32(cram_block *blk, int32_t v) {
822
0
    uint8_t buf[10];
823
0
    int sz = var_put_u32(buf, buf+10, v);
824
0
    BLOCK_APPEND(blk, buf, sz);
825
0
    return sz;
826
827
0
 block_err:
828
0
    return -1;
829
0
}
830
831
0
static int sint7_put_blk_32(cram_block *blk, int32_t v) {
832
0
    uint8_t buf[10];
833
0
    int sz = var_put_s32(buf, buf+10, v);
834
0
    BLOCK_APPEND(blk, buf, sz);
835
0
    return sz;
836
837
0
 block_err:
838
0
    return -1;
839
0
}
840
841
0
static int uint7_put_blk_64(cram_block *blk, int64_t v) {
842
0
    uint8_t buf[10];
843
0
    int sz = var_put_u64(buf, buf+10, v);
844
0
    BLOCK_APPEND(blk, buf, sz);
845
0
    return sz;
846
847
0
 block_err:
848
0
    return -1;
849
0
}
850
851
0
static int sint7_put_blk_64(cram_block *blk, int64_t v) {
852
0
    uint8_t buf[10];
853
0
    int sz = var_put_s64(buf, buf+10, v);
854
0
    BLOCK_APPEND(blk, buf, sz);
855
0
    return sz;
856
857
0
 block_err:
858
0
    return -1;
859
0
}
860
861
// Decode 32-bits with CRC update from cram_fd
862
23.9k
static int uint7_decode_crc32(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
863
23.9k
    uint8_t b[5], i = 0;
864
23.9k
    int c;
865
23.9k
    uint32_t v = 0;
866
867
#ifdef VARINT2
868
    b[0] = hgetc(fd->fp);
869
    if (b[0] < 177) {
870
    } else if (b[0] < 241) {
871
        b[1] = hgetc(fd->fp);
872
    } else if (b[0] < 249) {
873
        b[1] = hgetc(fd->fp);
874
        b[2] = hgetc(fd->fp);
875
    } else {
876
        int n = b[0]+2, z = 1;
877
        while (n-- >= 249)
878
            b[z++] = hgetc(fd->fp);
879
    }
880
    i = var_get_u32(b, NULL, &v);
881
#else
882
//    // Little endian
883
//    int s = 0;
884
//    do {
885
//        b[i++] = c = hgetc(fd->fp);
886
//        if (c < 0)
887
//            return -1;
888
//        v |= (c & 0x7f) << s;
889
//      s += 7;
890
//    } while (i < 5 && (c & 0x80));
891
892
    // Big endian, see also htscodecs/varint.h
893
30.7k
    do {
894
30.7k
        b[i++] = c = hgetc(fd->fp);
895
30.7k
        if (c < 0)
896
92
            return -1;
897
30.6k
        v = (v<<7) | (c & 0x7f);
898
30.6k
    } while (i < 5 && (c & 0x80));
899
23.8k
#endif
900
23.8k
    *crc = crc32(*crc, b, i);
901
902
23.8k
    *val_p = v;
903
23.8k
    return i;
904
23.9k
}
905
906
// Decode 32-bits with CRC update from cram_fd
907
2.70k
static int sint7_decode_crc32(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
908
2.70k
    uint8_t b[5], i = 0;
909
2.70k
    int c;
910
2.70k
    uint32_t v = 0;
911
912
#ifdef VARINT2
913
    b[0] = hgetc(fd->fp);
914
    if (b[0] < 177) {
915
    } else if (b[0] < 241) {
916
        b[1] = hgetc(fd->fp);
917
    } else if (b[0] < 249) {
918
        b[1] = hgetc(fd->fp);
919
        b[2] = hgetc(fd->fp);
920
    } else {
921
        int n = b[0]+2, z = 1;
922
        while (n-- >= 249)
923
            b[z++] = hgetc(fd->fp);
924
    }
925
    i = var_get_u32(b, NULL, &v);
926
#else
927
//    // Little endian
928
//    int s = 0;
929
//    do {
930
//        b[i++] = c = hgetc(fd->fp);
931
//        if (c < 0)
932
//            return -1;
933
//        v |= (c & 0x7f) << s;
934
//      s += 7;
935
//    } while (i < 5 && (c & 0x80));
936
937
    // Big endian, see also htscodecs/varint.h
938
4.09k
    do {
939
4.09k
        b[i++] = c = hgetc(fd->fp);
940
4.09k
        if (c < 0)
941
12
            return -1;
942
4.07k
        v = (v<<7) | (c & 0x7f);
943
4.07k
    } while (i < 5 && (c & 0x80));
944
2.68k
#endif
945
2.68k
    *crc = crc32(*crc, b, i);
946
947
2.68k
    *val_p = (v>>1) ^ -(v&1);
948
2.68k
    return i;
949
2.70k
}
950
951
952
// Decode 64-bits with CRC update from cram_fd
953
10.6k
static int uint7_decode_crc64(cram_fd *fd, int64_t *val_p, uint32_t *crc) {
954
10.6k
    uint8_t b[10], i = 0;
955
10.6k
    int c;
956
10.6k
    uint64_t v = 0;
957
958
#ifdef VARINT2
959
    b[0] = hgetc(fd->fp);
960
    if (b[0] < 177) {
961
    } else if (b[0] < 241) {
962
        b[1] = hgetc(fd->fp);
963
    } else if (b[0] < 249) {
964
        b[1] = hgetc(fd->fp);
965
        b[2] = hgetc(fd->fp);
966
    } else {
967
        int n = b[0]+2, z = 1;
968
        while (n-- >= 249)
969
            b[z++] = hgetc(fd->fp);
970
    }
971
    i = var_get_u64(b, NULL, &v);
972
#else
973
//    // Little endian
974
//    int s = 0;
975
//    do {
976
//        b[i++] = c = hgetc(fd->fp);
977
//        if (c < 0)
978
//            return -1;
979
//        v |= (c & 0x7f) << s;
980
//      s += 7;
981
//    } while (i < 10 && (c & 0x80));
982
983
    // Big endian, see also htscodecs/varint.h
984
14.4k
    do {
985
14.4k
        b[i++] = c = hgetc(fd->fp);
986
14.4k
        if (c < 0)
987
39
            return -1;
988
14.4k
        v = (v<<7) | (c & 0x7f);
989
14.4k
    } while (i < 5 && (c & 0x80));
990
10.6k
#endif
991
10.6k
    *crc = crc32(*crc, b, i);
992
993
10.6k
    *val_p = v;
994
10.6k
    return i;
995
10.6k
}
996
997
//-----------------------------------------------------------------------------
998
999
/*
1000
 * Decodes a 32-bit little endian value from fd and stores in val.
1001
 *
1002
 * Returns the number of bytes read on success
1003
 *         -1 on failure
1004
 */
1005
16.0k
static int int32_decode(cram_fd *fd, int32_t *val) {
1006
16.0k
    int32_t i;
1007
16.0k
    if (4 != hread(fd->fp, &i, 4))
1008
42
        return -1;
1009
1010
16.0k
    *val = le_int4(i);
1011
16.0k
    return 4;
1012
16.0k
}
1013
1014
/*
1015
 * Encodes a 32-bit little endian value 'val' and writes to fd.
1016
 *
1017
 * Returns the number of bytes written on success
1018
 *         -1 on failure
1019
 */
1020
286k
static int int32_encode(cram_fd *fd, int32_t val) {
1021
286k
    uint32_t v = le_int4(val);
1022
286k
    if (4 != hwrite(fd->fp, &v, 4))
1023
0
        return -1;
1024
1025
286k
    return 4;
1026
286k
}
1027
1028
/* As int32_decoded/encode, but from/to blocks instead of cram_fd */
1029
655
int int32_get_blk(cram_block *b, int32_t *val) {
1030
655
    if (b->uncomp_size - BLOCK_SIZE(b) < 4)
1031
75
        return -1;
1032
1033
580
    uint32_t v =
1034
580
         ((uint32_t) b->data[b->byte  ])        |
1035
580
        (((uint32_t) b->data[b->byte+1]) <<  8) |
1036
580
        (((uint32_t) b->data[b->byte+2]) << 16) |
1037
580
        (((uint32_t) b->data[b->byte+3]) << 24);
1038
    // Avoid implementation-defined behaviour on negative values
1039
580
    *val = v < 0x80000000U ? (int32_t) v : -((int32_t) (0xffffffffU - v)) - 1;
1040
580
    BLOCK_SIZE(b) += 4;
1041
580
    return 4;
1042
655
}
1043
1044
/* As int32_decoded/encode, but from/to blocks instead of cram_fd */
1045
7.93k
int int32_put_blk(cram_block *b, int32_t val) {
1046
7.93k
    unsigned char cp[4];
1047
7.93k
    uint32_t v = val;
1048
7.93k
    cp[0] = ( v      & 0xff);
1049
7.93k
    cp[1] = ((v>>8)  & 0xff);
1050
7.93k
    cp[2] = ((v>>16) & 0xff);
1051
7.93k
    cp[3] = ((v>>24) & 0xff);
1052
1053
7.93k
    BLOCK_APPEND(b, cp, 4);
1054
7.93k
    return 0;
1055
1056
0
 block_err:
1057
0
    return -1;
1058
7.93k
}
1059
1060
#ifdef HAVE_LIBDEFLATE
1061
/* ----------------------------------------------------------------------
1062
 * libdeflate compression code, with interface to match
1063
 * zlib_mem_{in,de}flate for simplicity elsewhere.
1064
 */
1065
1066
// Named the same as the version that uses zlib as we always use libdeflate for
1067
// decompression when available.
1068
char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) {
1069
    struct libdeflate_decompressor *z = libdeflate_alloc_decompressor();
1070
    if (!z) {
1071
        hts_log_error("Call to libdeflate_alloc_decompressor failed");
1072
        return NULL;
1073
    }
1074
1075
    uint8_t *data = NULL, *new_data;
1076
    if (!*size)
1077
        *size = csize*2;
1078
    for(;;) {
1079
        new_data = realloc(data, *size);
1080
        if (!new_data) {
1081
            hts_log_error("Memory allocation failure");
1082
            goto fail;
1083
        }
1084
        data = new_data;
1085
1086
        int ret = libdeflate_gzip_decompress(z, cdata, csize, data, *size, size);
1087
1088
        // Auto grow output buffer size if needed and try again.
1089
        // Fortunately for all bar one call of this we know the size already.
1090
        if (ret == LIBDEFLATE_INSUFFICIENT_SPACE) {
1091
            (*size) *= 1.5;
1092
            continue;
1093
        }
1094
1095
        if (ret != LIBDEFLATE_SUCCESS) {
1096
            hts_log_error("Inflate operation failed: %d", ret);
1097
            goto fail;
1098
        } else {
1099
            break;
1100
        }
1101
    }
1102
1103
    libdeflate_free_decompressor(z);
1104
    return (char *)data;
1105
1106
 fail:
1107
    libdeflate_free_decompressor(z);
1108
    free(data);
1109
    return NULL;
1110
}
1111
1112
// Named differently as we use both zlib/libdeflate for compression.
1113
static char *libdeflate_deflate(char *data, size_t size, size_t *cdata_size,
1114
                                int level, int strat) {
1115
    level = level > 0 ? level : 6; // libdeflate doesn't honour -1 as default
1116
    level *= 1.23;     // NB levels go up to 12 here; 5 onwards is +1
1117
    level += level>=8; // 5,6,7->6,7,8  8->10  9->12
1118
    if (level > 12) level = 12;
1119
1120
    if (strat == Z_RLE) // not supported by libdeflate
1121
        level = 1;
1122
1123
    struct libdeflate_compressor *z = libdeflate_alloc_compressor(level);
1124
    if (!z) {
1125
        hts_log_error("Call to libdeflate_alloc_compressor failed");
1126
        return NULL;
1127
    }
1128
1129
    unsigned char *cdata = NULL; /* Compressed output */
1130
    size_t cdata_alloc;
1131
    cdata = malloc(cdata_alloc = size*1.05+100);
1132
    if (!cdata) {
1133
        hts_log_error("Memory allocation failure");
1134
        libdeflate_free_compressor(z);
1135
        return NULL;
1136
    }
1137
1138
    *cdata_size = libdeflate_gzip_compress(z, data, size, cdata, cdata_alloc);
1139
    libdeflate_free_compressor(z);
1140
1141
    if (*cdata_size == 0) {
1142
        hts_log_error("Call to libdeflate_gzip_compress failed");
1143
        free(cdata);
1144
        return NULL;
1145
    }
1146
1147
    return (char *)cdata;
1148
}
1149
1150
#else
1151
1152
/* ----------------------------------------------------------------------
1153
 * zlib compression code - from Gap5's tg_iface_g.c
1154
 * They're static here as they're only used within the cram_compress_block
1155
 * and cram_uncompress_block functions, which are the external interface.
1156
 */
1157
14
char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) {
1158
14
    z_stream s;
1159
14
    unsigned char *data = NULL; /* Uncompressed output */
1160
14
    int data_alloc = 0;
1161
14
    int err;
1162
1163
    /* Starting point at uncompressed size, and scale after that */
1164
14
    data = malloc(data_alloc = csize*1.2+100);
1165
14
    if (!data)
1166
0
        return NULL;
1167
1168
    /* Initialise zlib stream */
1169
14
    s.zalloc = Z_NULL; /* use default allocation functions */
1170
14
    s.zfree  = Z_NULL;
1171
14
    s.opaque = Z_NULL;
1172
14
    s.next_in  = (unsigned char *)cdata;
1173
14
    s.avail_in = csize;
1174
14
    s.total_in = 0;
1175
14
    s.next_out  = data;
1176
14
    s.avail_out = data_alloc;
1177
14
    s.total_out = 0;
1178
1179
    //err = inflateInit(&s);
1180
14
    err = inflateInit2(&s, 15 + 32);
1181
14
    if (err != Z_OK) {
1182
0
        hts_log_error("Call to zlib inflateInit failed: %s", s.msg);
1183
0
        free(data);
1184
0
        return NULL;
1185
0
    }
1186
1187
    /* Decode to 'data' array */
1188
21
    for (;s.avail_in;) {
1189
12
        unsigned char *data_tmp;
1190
12
        int alloc_inc;
1191
1192
12
        s.next_out = &data[s.total_out];
1193
12
        err = inflate(&s, Z_NO_FLUSH);
1194
12
        if (err == Z_STREAM_END)
1195
0
            break;
1196
1197
12
        if (err != Z_OK) {
1198
5
            hts_log_error("Call to zlib inflate failed: %s", s.msg);
1199
5
            free(data);
1200
5
            inflateEnd(&s);
1201
5
            return NULL;
1202
5
        }
1203
1204
        /* More to come, so realloc based on growth so far */
1205
7
        alloc_inc = (double)s.avail_in/s.total_in * s.total_out + 100;
1206
7
        data = realloc((data_tmp = data), data_alloc += alloc_inc);
1207
7
        if (!data) {
1208
0
            free(data_tmp);
1209
0
            inflateEnd(&s);
1210
0
            return NULL;
1211
0
        }
1212
7
        s.avail_out += alloc_inc;
1213
7
    }
1214
9
    inflateEnd(&s);
1215
1216
9
    *size = s.total_out;
1217
9
    return (char *)data;
1218
14
}
1219
#endif
1220
1221
#if !defined(HAVE_LIBDEFLATE) || LIBDEFLATE_VERSION_MAJOR < 1 || (LIBDEFLATE_VERSION_MAJOR ==  1 && LIBDEFLATE_VERSION_MINOR <= 8)
1222
static char *zlib_mem_deflate(char *data, size_t size, size_t *cdata_size,
1223
198k
                              int level, int strat) {
1224
198k
    z_stream s;
1225
198k
    unsigned char *cdata = NULL; /* Compressed output */
1226
198k
    int cdata_alloc = 0;
1227
198k
    int cdata_pos = 0;
1228
198k
    int err;
1229
1230
198k
    cdata = malloc(cdata_alloc = size*1.05+100);
1231
198k
    if (!cdata)
1232
0
        return NULL;
1233
198k
    cdata_pos = 0;
1234
1235
    /* Initialise zlib stream */
1236
198k
    s.zalloc = Z_NULL; /* use default allocation functions */
1237
198k
    s.zfree  = Z_NULL;
1238
198k
    s.opaque = Z_NULL;
1239
198k
    s.next_in  = (unsigned char *)data;
1240
198k
    s.avail_in = size;
1241
198k
    s.total_in = 0;
1242
198k
    s.next_out  = cdata;
1243
198k
    s.avail_out = cdata_alloc;
1244
198k
    s.total_out = 0;
1245
198k
    s.data_type = Z_BINARY;
1246
1247
198k
    err = deflateInit2(&s, level, Z_DEFLATED, 15|16, 9, strat);
1248
198k
    if (err != Z_OK) {
1249
0
        hts_log_error("Call to zlib deflateInit2 failed: %s", s.msg);
1250
0
        return NULL;
1251
0
    }
1252
1253
    /* Encode to 'cdata' array */
1254
396k
    for (;s.avail_in;) {
1255
198k
        s.next_out = &cdata[cdata_pos];
1256
198k
        s.avail_out = cdata_alloc - cdata_pos;
1257
198k
        if (cdata_alloc - cdata_pos <= 0) {
1258
0
            hts_log_error("Deflate produced larger output than expected");
1259
0
            return NULL;
1260
0
        }
1261
198k
        err = deflate(&s, Z_NO_FLUSH);
1262
198k
        cdata_pos = cdata_alloc - s.avail_out;
1263
198k
        if (err != Z_OK) {
1264
0
            hts_log_error("Call to zlib deflate failed: %s", s.msg);
1265
0
            break;
1266
0
        }
1267
198k
    }
1268
198k
    if (deflate(&s, Z_FINISH) != Z_STREAM_END) {
1269
0
        hts_log_error("Call to zlib deflate failed: %s", s.msg);
1270
0
    }
1271
198k
    *cdata_size = s.total_out;
1272
1273
198k
    if (deflateEnd(&s) != Z_OK) {
1274
0
        hts_log_error("Call to zlib deflate failed: %s", s.msg);
1275
0
    }
1276
198k
    return (char *)cdata;
1277
198k
}
1278
#endif
1279
1280
#ifdef HAVE_LIBLZMA
1281
/* ------------------------------------------------------------------------ */
1282
/*
1283
 * Data compression routines using liblzma (xz)
1284
 *
1285
 * On a test set this shrunk the main db from 136157104 bytes to 114796168, but
1286
 * caused tg_index to grow from 2m43.707s to 15m3.961s. Exporting as bfastq
1287
 * went from 18.3s to 36.3s. So decompression suffers too, but not as bad
1288
 * as compression times.
1289
 *
1290
 * For now we disable this functionality. If it's to be reenabled make sure you
1291
 * improve the mem_inflate implementation as it's just a test hack at the
1292
 * moment.
1293
 */
1294
1295
static char *lzma_mem_deflate(char *data, size_t size, size_t *cdata_size,
1296
0
                              int level) {
1297
0
    char *out;
1298
0
    size_t out_size = lzma_stream_buffer_bound(size);
1299
0
    *cdata_size = 0;
1300
1301
0
    out = malloc(out_size);
1302
1303
    /* Single call compression */
1304
0
    if (LZMA_OK != lzma_easy_buffer_encode(level, LZMA_CHECK_CRC32, NULL,
1305
0
                                           (uint8_t *)data, size,
1306
0
                                           (uint8_t *)out, cdata_size,
1307
0
                                           out_size))
1308
0
        return NULL;
1309
1310
0
    return out;
1311
0
}
1312
1313
16
static char *lzma_mem_inflate(char *cdata, size_t csize, size_t *size) {
1314
16
    lzma_stream strm = LZMA_STREAM_INIT;
1315
16
    size_t out_size = 0, out_pos = 0;
1316
16
    char *out = NULL, *new_out;
1317
16
    int r;
1318
1319
    /* Initiate the decoder */
1320
16
    if (LZMA_OK != lzma_stream_decoder(&strm, lzma_easy_decoder_memusage(9), 0))
1321
0
        return NULL;
1322
1323
    /* Decode loop */
1324
16
    strm.avail_in = csize;
1325
16
    strm.next_in = (uint8_t *)cdata;
1326
1327
22
    for (;strm.avail_in;) {
1328
14
        if (strm.avail_in > out_size - out_pos) {
1329
14
            out_size += strm.avail_in * 4 + 32768;
1330
14
            new_out = realloc(out, out_size);
1331
14
            if (!new_out)
1332
0
                goto fail;
1333
14
            out = new_out;
1334
14
        }
1335
14
        strm.avail_out = out_size - out_pos;
1336
14
        strm.next_out = (uint8_t *)&out[out_pos];
1337
1338
14
        r = lzma_code(&strm, LZMA_RUN);
1339
14
        if (LZMA_OK != r && LZMA_STREAM_END != r) {
1340
8
            hts_log_error("LZMA decode failure (error %d)", r);
1341
8
            goto fail;
1342
8
        }
1343
1344
6
        out_pos = strm.total_out;
1345
1346
6
        if (r == LZMA_STREAM_END)
1347
0
            break;
1348
6
    }
1349
1350
    /* finish up any unflushed data; necessary? */
1351
8
    r = lzma_code(&strm, LZMA_FINISH);
1352
8
    if (r != LZMA_OK && r != LZMA_STREAM_END) {
1353
0
        hts_log_error("Call to lzma_code failed with error %d", r);
1354
0
        goto fail;
1355
0
    }
1356
1357
8
    new_out = realloc(out, strm.total_out > 0 ? strm.total_out : 1);
1358
8
    if (new_out)
1359
8
        out = new_out;
1360
8
    *size = strm.total_out;
1361
1362
8
    lzma_end(&strm);
1363
1364
8
    return out;
1365
1366
8
 fail:
1367
8
    lzma_end(&strm);
1368
8
    free(out);
1369
8
    return NULL;
1370
8
}
1371
#endif
1372
1373
/* ----------------------------------------------------------------------
1374
 * CRAM blocks - the dynamically growable data block. We have code to
1375
 * create, update, (un)compress and read/write.
1376
 *
1377
 * These are derived from the deflate_interlaced.c blocks, but with the
1378
 * CRAM extension of content types and IDs.
1379
 */
1380
1381
/*
1382
 * Allocates a new cram_block structure with a specified content_type and
1383
 * id.
1384
 *
1385
 * Returns block pointer on success
1386
 *         NULL on failure
1387
 */
1388
cram_block *cram_new_block(enum cram_content_type content_type,
1389
904k
                           int content_id) {
1390
904k
    cram_block *b = malloc(sizeof(*b));
1391
904k
    if (!b)
1392
0
        return NULL;
1393
904k
    b->method = b->orig_method = RAW;
1394
904k
    b->content_type = content_type;
1395
904k
    b->content_id = content_id;
1396
904k
    b->comp_size = 0;
1397
904k
    b->uncomp_size = 0;
1398
904k
    b->data = NULL;
1399
904k
    b->alloc = 0;
1400
904k
    b->byte = 0;
1401
904k
    b->bit = 7; // MSB
1402
904k
    b->crc32 = 0;
1403
904k
    b->idx = 0;
1404
904k
    b->m = NULL;
1405
1406
904k
    return b;
1407
904k
}
1408
1409
/*
1410
 * Reads a block from a cram file.
1411
 * Returns cram_block pointer on success.
1412
 *         NULL on failure
1413
 */
1414
13.4k
cram_block *cram_read_block(cram_fd *fd) {
1415
13.4k
    cram_block *b = malloc(sizeof(*b));
1416
13.4k
    unsigned char c;
1417
13.4k
    uint32_t crc = 0;
1418
13.4k
    if (!b)
1419
0
        return NULL;
1420
1421
    //fprintf(stderr, "Block at %d\n", (int)ftell(fd->fp));
1422
1423
13.4k
    if (-1 == (b->method      = hgetc(fd->fp))) { free(b); return NULL; }
1424
13.3k
    c = b->method; crc = crc32(crc, &c, 1);
1425
13.3k
    if (-1 == (b->content_type= hgetc(fd->fp))) { free(b); return NULL; }
1426
13.3k
    c = b->content_type; crc = crc32(crc, &c, 1);
1427
13.3k
    if (-1 == fd->vv.varint_decode32_crc(fd, &b->content_id, &crc))  { free(b); return NULL; }
1428
13.3k
    if (-1 == fd->vv.varint_decode32_crc(fd, &b->comp_size, &crc))   { free(b); return NULL; }
1429
13.3k
    if (-1 == fd->vv.varint_decode32_crc(fd, &b->uncomp_size, &crc)) { free(b); return NULL; }
1430
1431
    //fprintf(stderr, "  method %d, ctype %d, cid %d, csize %d, ucsize %d\n",
1432
    //      b->method, b->content_type, b->content_id, b->comp_size, b->uncomp_size);
1433
1434
13.2k
    if (b->method == RAW) {
1435
5.60k
        if (b->uncomp_size < 0 || b->comp_size != b->uncomp_size) {
1436
75
            free(b);
1437
75
            return NULL;
1438
75
        }
1439
5.53k
        b->alloc = b->uncomp_size;
1440
5.53k
        if (!(b->data = malloc(b->uncomp_size))){ free(b); return NULL; }
1441
5.53k
        if (b->uncomp_size != hread(fd->fp, b->data, b->uncomp_size)) {
1442
27
            free(b->data);
1443
27
            free(b);
1444
27
            return NULL;
1445
27
        }
1446
7.69k
    } else {
1447
7.69k
        if (b->comp_size < 0 || b->uncomp_size < 0) {
1448
64
            free(b);
1449
64
            return NULL;
1450
64
        }
1451
7.62k
        b->alloc = b->comp_size;
1452
7.62k
        if (!(b->data = malloc(b->comp_size)))  { free(b); return NULL; }
1453
7.62k
        if (b->comp_size != hread(fd->fp, b->data, b->comp_size)) {
1454
55
            free(b->data);
1455
55
            free(b);
1456
55
            return NULL;
1457
55
        }
1458
7.62k
    }
1459
1460
13.0k
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
1461
3.08k
        if (-1 == int32_decode(fd, (int32_t *)&b->crc32)) {
1462
8
            free(b->data);
1463
8
            free(b);
1464
8
            return NULL;
1465
8
        }
1466
1467
3.07k
        b->crc32_checked = fd->ignore_md5;
1468
3.07k
        b->crc_part = crc;
1469
9.99k
    } else {
1470
9.99k
        b->crc32_checked = 1; // CRC not present
1471
9.99k
    }
1472
1473
13.0k
    b->orig_method = b->method;
1474
13.0k
    b->idx = 0;
1475
13.0k
    b->byte = 0;
1476
13.0k
    b->bit = 7; // MSB
1477
1478
13.0k
    return b;
1479
13.0k
}
1480
1481
1482
/*
1483
 * Computes the size of a cram block, including the block
1484
 * header itself.
1485
 */
1486
0
uint32_t cram_block_size(cram_block *b) {
1487
0
    unsigned char dat[100], *cp = dat;;
1488
0
    uint32_t sz;
1489
1490
0
    *cp++ = b->method;
1491
0
    *cp++ = b->content_type;
1492
0
    cp += itf8_put((char*)cp, b->content_id);
1493
0
    cp += itf8_put((char*)cp, b->comp_size);
1494
0
    cp += itf8_put((char*)cp, b->uncomp_size);
1495
1496
0
    sz = cp-dat + 4;
1497
0
    sz += b->method == RAW ? b->uncomp_size : b->comp_size;
1498
1499
0
    return sz;
1500
0
}
1501
1502
/*
1503
 * Writes a CRAM block.
1504
 * Returns 0 on success
1505
 *        -1 on failure
1506
 */
1507
286k
int cram_write_block(cram_fd *fd, cram_block *b) {
1508
286k
    char vardata[100];
1509
286k
    int vardata_o = 0;
1510
1511
286k
    assert(b->method != RAW || (b->comp_size == b->uncomp_size));
1512
1513
286k
    if (hputc(b->method,       fd->fp)  == EOF) return -1;
1514
286k
    if (hputc(b->content_type, fd->fp)  == EOF) return -1;
1515
286k
    vardata_o += fd->vv.varint_put32(vardata          , vardata+100, b->content_id);
1516
286k
    vardata_o += fd->vv.varint_put32(vardata+vardata_o, vardata+100, b->comp_size);
1517
286k
    vardata_o += fd->vv.varint_put32(vardata+vardata_o, vardata+100, b->uncomp_size);
1518
286k
    if (vardata_o != hwrite(fd->fp, vardata, vardata_o))
1519
0
        return -1;
1520
1521
286k
    if (b->data) {
1522
251k
        if (b->method == RAW) {
1523
219k
            if (b->uncomp_size != hwrite(fd->fp, b->data, b->uncomp_size))
1524
0
                return -1;
1525
219k
        } else {
1526
31.2k
            if (b->comp_size != hwrite(fd->fp, b->data, b->comp_size))
1527
0
                return -1;
1528
31.2k
        }
1529
251k
    } else {
1530
        // Absent blocks should be size 0
1531
35.2k
        assert(b->method == RAW && b->uncomp_size == 0);
1532
35.2k
    }
1533
1534
286k
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
1535
286k
        char dat[100], *cp = (char *)dat;
1536
286k
        uint32_t crc;
1537
1538
286k
        *cp++ = b->method;
1539
286k
        *cp++ = b->content_type;
1540
286k
        cp += fd->vv.varint_put32(cp, dat+100, b->content_id);
1541
286k
        cp += fd->vv.varint_put32(cp, dat+100, b->comp_size);
1542
286k
        cp += fd->vv.varint_put32(cp, dat+100, b->uncomp_size);
1543
286k
        crc = crc32(0L, (uc *)dat, cp-dat);
1544
1545
286k
        if (b->method == RAW) {
1546
255k
            b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->uncomp_size);
1547
255k
        } else {
1548
31.2k
            b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->comp_size);
1549
31.2k
        }
1550
1551
286k
        if (-1 == int32_encode(fd, b->crc32))
1552
0
            return -1;
1553
286k
    }
1554
1555
286k
    return 0;
1556
286k
}
1557
1558
/*
1559
 * Frees a CRAM block, deallocating internal data too.
1560
 */
1561
1.19M
void cram_free_block(cram_block *b) {
1562
1.19M
    if (!b)
1563
282k
        return;
1564
917k
    if (b->data)
1565
583k
        free(b->data);
1566
917k
    free(b);
1567
917k
}
1568
1569
/*
1570
 * Uncompresses a CRAM block, if compressed.
1571
 */
1572
8.31k
int cram_uncompress_block(cram_block *b) {
1573
8.31k
    char *uncomp;
1574
8.31k
    size_t uncomp_size = 0;
1575
1576
8.31k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
1577
    // Pretend the CRC was OK so the fuzzer doesn't have to get it right
1578
8.31k
    b->crc32_checked = 1;
1579
8.31k
#endif
1580
1581
8.31k
    if (b->crc32_checked == 0) {
1582
0
        uint32_t crc = crc32(b->crc_part, b->data ? b->data : (uc *)"", b->alloc);
1583
0
        b->crc32_checked = 1;
1584
0
        if (crc != b->crc32) {
1585
0
            hts_log_error("Block CRC32 failure");
1586
0
            return -1;
1587
0
        }
1588
0
    }
1589
1590
8.31k
    if (b->uncomp_size == 0) {
1591
        // blank block
1592
3.08k
        b->method = RAW;
1593
3.08k
        return 0;
1594
3.08k
    }
1595
5.23k
    assert(b->uncomp_size >= 0); // cram_read_block should ensure this
1596
1597
5.23k
    switch (b->method) {
1598
457
    case RAW:
1599
457
        return 0;
1600
1601
14
    case GZIP:
1602
14
        uncomp_size = b->uncomp_size;
1603
14
        uncomp = zlib_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
1604
1605
14
        if (!uncomp)
1606
5
            return -1;
1607
9
        if (uncomp_size != b->uncomp_size) {
1608
9
            free(uncomp);
1609
9
            return -1;
1610
9
        }
1611
0
        free(b->data);
1612
0
        b->data = (unsigned char *)uncomp;
1613
0
        b->alloc = uncomp_size;
1614
0
        b->method = RAW;
1615
0
        break;
1616
1617
0
#ifdef HAVE_LIBBZ2
1618
8
    case BZIP2: {
1619
8
        unsigned int usize = b->uncomp_size;
1620
8
        if (!(uncomp = malloc(usize)))
1621
0
            return -1;
1622
8
        if (BZ_OK != BZ2_bzBuffToBuffDecompress(uncomp, &usize,
1623
8
                                                (char *)b->data, b->comp_size,
1624
8
                                                0, 0)) {
1625
8
            free(uncomp);
1626
8
            return -1;
1627
8
        }
1628
0
        free(b->data);
1629
0
        b->data = (unsigned char *)uncomp;
1630
0
        b->alloc = usize;
1631
0
        b->method = RAW;
1632
0
        b->uncomp_size = usize; // Just in case it differs
1633
0
        break;
1634
8
    }
1635
#else
1636
    case BZIP2:
1637
        hts_log_error("Bzip2 compression is not compiled into this version. Please rebuild and try again");
1638
        return -1;
1639
#endif
1640
1641
0
#ifdef HAVE_LIBLZMA
1642
16
    case LZMA:
1643
16
        uncomp = lzma_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
1644
16
        if (!uncomp)
1645
8
            return -1;
1646
8
        if (uncomp_size != b->uncomp_size) {
1647
8
            free(uncomp);
1648
8
            return -1;
1649
8
        }
1650
0
        free(b->data);
1651
0
        b->data = (unsigned char *)uncomp;
1652
0
        b->alloc = uncomp_size;
1653
0
        b->method = RAW;
1654
0
        break;
1655
#else
1656
    case LZMA:
1657
        hts_log_error("Lzma compression is not compiled into this version. Please rebuild and try again");
1658
        return -1;
1659
        break;
1660
#endif
1661
1662
427
    case RANS: {
1663
427
        unsigned int usize = b->uncomp_size, usize2;
1664
427
        uncomp = (char *)rans_uncompress(b->data, b->comp_size, &usize2);
1665
427
        if (!uncomp)
1666
182
            return -1;
1667
245
        if (usize != usize2) {
1668
244
            free(uncomp);
1669
244
            return -1;
1670
244
        }
1671
1
        free(b->data);
1672
1
        b->data = (unsigned char *)uncomp;
1673
1
        b->alloc = usize2;
1674
1
        b->method = RAW;
1675
1
        b->uncomp_size = usize2; // Just in case it differs
1676
        //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1677
1
        break;
1678
245
    }
1679
1680
668
    case FQZ: {
1681
668
        uncomp_size = b->uncomp_size;
1682
668
        uncomp = fqz_decompress((char *)b->data, b->comp_size, &uncomp_size, NULL, 0);
1683
668
        if (!uncomp)
1684
488
            return -1;
1685
180
        free(b->data);
1686
180
        b->data = (unsigned char *)uncomp;
1687
180
        b->alloc = uncomp_size;
1688
180
        b->method = RAW;
1689
180
        b->uncomp_size = uncomp_size;
1690
180
        break;
1691
668
    }
1692
1693
1.40k
    case RANS_PR0: {
1694
1.40k
        unsigned int usize = b->uncomp_size, usize2;
1695
1.40k
        uncomp = (char *)rans_uncompress_4x16(b->data, b->comp_size, &usize2);
1696
1.40k
        if (!uncomp)
1697
1.08k
            return -1;
1698
314
        if (usize != usize2) {
1699
253
            free(uncomp);
1700
253
            return -1;
1701
253
        }
1702
61
        b->orig_method = RANSPR;
1703
61
        free(b->data);
1704
61
        b->data = (unsigned char *)uncomp;
1705
61
        b->alloc = usize2;
1706
61
        b->method = RAW;
1707
61
        b->uncomp_size = usize2; // Just incase it differs
1708
        //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1709
61
        break;
1710
314
    }
1711
1712
720
    case ARITH_PR0: {
1713
720
        unsigned int usize = b->uncomp_size, usize2;
1714
720
        uncomp = (char *)arith_uncompress_to(b->data, b->comp_size, NULL, &usize2);
1715
720
        if (!uncomp)
1716
480
            return -1;
1717
240
        if (usize != usize2) {
1718
91
            free(uncomp);
1719
91
            return -1;
1720
91
        }
1721
149
        b->orig_method = ARITH;
1722
149
        free(b->data);
1723
149
        b->data = (unsigned char *)uncomp;
1724
149
        b->alloc = usize2;
1725
149
        b->method = RAW;
1726
149
        b->uncomp_size = usize2; // Just incase it differs
1727
        //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1728
149
        break;
1729
240
    }
1730
1731
1.49k
    case TOK3: {
1732
1.49k
        uint32_t out_len;
1733
1.49k
        uint8_t *cp = tok3_decode_names(b->data, b->comp_size, &out_len);
1734
1.49k
        if (!cp)
1735
1.41k
            return -1;
1736
81
        b->orig_method = TOK3;
1737
81
        b->method = RAW;
1738
81
        free(b->data);
1739
81
        b->data = cp;
1740
81
        b->alloc = out_len;
1741
81
        b->uncomp_size = out_len;
1742
81
        break;
1743
1.49k
    }
1744
1745
18
    default:
1746
18
        return -1;
1747
5.23k
    }
1748
1749
472
    return 0;
1750
5.23k
}
1751
1752
static char *cram_compress_by_method(cram_slice *s, char *in, size_t in_size,
1753
                                     int content_id, size_t *out_size,
1754
                                     enum cram_block_method_int method,
1755
479k
                                     int level, int strat) {
1756
479k
    switch (method) {
1757
62.5k
    case GZIP:
1758
93.9k
    case GZIP_RLE:
1759
198k
    case GZIP_1:
1760
        // Read names bizarrely benefit from zlib over libdeflate for
1761
        // mid-range compression levels.  Focusing purely of ratio or
1762
        // speed, libdeflate still wins.  It also seems to win for
1763
        // other data series too.
1764
        //
1765
        // Eg RN at level 5;  libdeflate=55.9MB  zlib=51.6MB
1766
#ifdef HAVE_LIBDEFLATE
1767
#  if (LIBDEFLATE_VERSION_MAJOR < 1 || (LIBDEFLATE_VERSION_MAJOR == 1 && LIBDEFLATE_VERSION_MINOR <= 8))
1768
        if (content_id == DS_RN && level >= 4 && level <= 7)
1769
            return zlib_mem_deflate(in, in_size, out_size, level, strat);
1770
        else
1771
#  endif
1772
            return libdeflate_deflate(in, in_size, out_size, level, strat);
1773
#else
1774
198k
        return zlib_mem_deflate(in, in_size, out_size, level, strat);
1775
0
#endif
1776
1777
0
    case BZIP2: {
1778
0
#ifdef HAVE_LIBBZ2
1779
0
        unsigned int comp_size = in_size*1.01 + 600;
1780
0
        char *comp = malloc(comp_size);
1781
0
        if (!comp)
1782
0
            return NULL;
1783
1784
0
        if (BZ_OK != BZ2_bzBuffToBuffCompress(comp, &comp_size,
1785
0
                                              in, in_size,
1786
0
                                              level, 0, 30)) {
1787
0
            free(comp);
1788
0
            return NULL;
1789
0
        }
1790
0
        *out_size = comp_size;
1791
0
        return comp;
1792
#else
1793
        return NULL;
1794
#endif
1795
0
    }
1796
1797
0
    case FQZ:
1798
0
    case FQZ_b:
1799
0
    case FQZ_c:
1800
0
    case FQZ_d: {
1801
        // Extract the necessary portion of the slice into an fqz_slice struct.
1802
        // These previously were the same thing, but this permits us to detach
1803
        // the codec from the rest of this CRAM implementation.
1804
0
        fqz_slice *f = malloc(2*s->hdr->num_records * sizeof(uint32_t) + sizeof(fqz_slice));
1805
0
        if (!f)
1806
0
            return NULL;
1807
0
        f->num_records = s->hdr->num_records;
1808
0
        f->len = (uint32_t *)(((char *)f) + sizeof(fqz_slice));
1809
0
        f->flags = f->len + s->hdr->num_records;
1810
0
        int i;
1811
0
        for (i = 0; i < s->hdr->num_records; i++) {
1812
0
            f->flags[i] = s->crecs[i].flags;
1813
0
            f->len[i] = (i+1 < s->hdr->num_records
1814
0
                         ? s->crecs[i+1].qual - s->crecs[i].qual
1815
0
                         : s->block[DS_QS]->uncomp_size - s->crecs[i].qual);
1816
0
        }
1817
0
        char *comp = fqz_compress(strat & 0xff /* cram vers */, f,
1818
0
                                  in, in_size, out_size, strat >> 8, NULL);
1819
0
        free(f);
1820
0
        return comp;
1821
0
    }
1822
1823
0
    case LZMA:
1824
0
#ifdef HAVE_LIBLZMA
1825
0
        return lzma_mem_deflate(in, in_size, out_size, level);
1826
#else
1827
        return NULL;
1828
#endif
1829
1830
0
    case RANS0:
1831
0
    case RANS1: {
1832
0
        unsigned int out_size_i;
1833
0
        unsigned char *cp;
1834
0
        cp = rans_compress((unsigned char *)in, in_size, &out_size_i,
1835
0
                           method == RANS0 ? 0 : 1);
1836
0
        *out_size = out_size_i;
1837
0
        return (char *)cp;
1838
0
    }
1839
1840
86.0k
    case RANS_PR0:
1841
117k
    case RANS_PR1:
1842
161k
    case RANS_PR64:
1843
192k
    case RANS_PR9:
1844
234k
    case RANS_PR128:
1845
234k
    case RANS_PR129:
1846
234k
    case RANS_PR192:
1847
265k
    case RANS_PR193: {
1848
265k
        unsigned int out_size_i;
1849
265k
        unsigned char *cp;
1850
1851
        // see enum cram_block. We map RANS_* methods to order bit-fields
1852
265k
        static int methmap[] = { 1, 64,9, 128,129, 192,193 };
1853
1854
265k
        int m = method == RANS_PR0 ? 0 : methmap[method - RANS_PR1];
1855
265k
        cp = rans_compress_4x16((unsigned char *)in, in_size, &out_size_i,
1856
265k
                                m | RANS_ORDER_SIMD_AUTO);
1857
265k
        *out_size = out_size_i;
1858
265k
        return (char *)cp;
1859
234k
    }
1860
1861
0
    case ARITH_PR0:
1862
0
    case ARITH_PR1:
1863
0
    case ARITH_PR64:
1864
0
    case ARITH_PR9:
1865
0
    case ARITH_PR128:
1866
0
    case ARITH_PR129:
1867
0
    case ARITH_PR192:
1868
0
    case ARITH_PR193: {
1869
0
        unsigned int out_size_i;
1870
0
        unsigned char *cp;
1871
1872
        // see enum cram_block. We map ARITH_* methods to order bit-fields
1873
0
        static int methmap[] = { 1, 64,9, 128,129, 192,193 };
1874
1875
0
        cp = arith_compress_to((unsigned char *)in, in_size, NULL, &out_size_i,
1876
0
                               method == ARITH_PR0 ? 0 : methmap[method - ARITH_PR1]);
1877
0
        *out_size = out_size_i;
1878
0
        return (char *)cp;
1879
0
    }
1880
1881
15.4k
    case TOK3:
1882
15.4k
    case TOKA: {
1883
15.4k
        int out_len;
1884
15.4k
        int lev = level;
1885
15.4k
        if (method == TOK3 && lev > 3)
1886
15.4k
            lev = 3;
1887
15.4k
        uint8_t *cp = tok3_encode_names(in, in_size, lev, strat, &out_len, NULL);
1888
15.4k
        *out_size = out_len;
1889
15.4k
        return (char *)cp;
1890
15.4k
    }
1891
1892
0
    case RAW:
1893
0
        break;
1894
1895
0
    default:
1896
0
        return NULL;
1897
479k
    }
1898
1899
0
    return NULL;
1900
479k
}
1901
1902
/*
1903
 * A copy of cram_compress_block2 with added recursion detection.
1904
 * This is only called for error handling where the auto-tuning has failed.
1905
 * The simplest way of doing this is recusion + an additional argument, but
1906
 * we didn't want to complicate the existing code hence this is static.
1907
 */
1908
static int cram_compress_block3(cram_fd *fd, cram_slice *s,
1909
                                cram_block *b, cram_metrics *metrics,
1910
                                int method, int level,
1911
513k
                                int recurse) {
1912
1913
513k
    if (!b)
1914
34.6k
        return 0;
1915
1916
478k
    int orig_method = method;
1917
478k
    char *comp = NULL;
1918
478k
    size_t comp_size = 0;
1919
478k
    int strat;
1920
1921
    // Internally we have parameterised methods that externally map
1922
    // to the same CRAM method value.
1923
    // See enum_cram_block_method_int in cram_structs.h.
1924
478k
    int methmap[] = {
1925
        // Externally defined values
1926
478k
        RAW, GZIP, BZIP2, LZMA, RANS, RANSPR, ARITH, FQZ, TOK3,
1927
1928
        // Reserved for possible expansion
1929
478k
        0, 0,
1930
1931
        // Internally parameterised versions matching back to above
1932
        // external values
1933
478k
        GZIP, GZIP,
1934
478k
        FQZ, FQZ, FQZ,
1935
478k
        RANS,
1936
478k
        RANSPR, RANSPR, RANSPR, RANSPR, RANSPR, RANSPR, RANSPR,
1937
478k
        TOK3,
1938
478k
        ARITH,  ARITH,  ARITH,  ARITH,  ARITH,  ARITH,  ARITH,
1939
478k
    };
1940
1941
478k
    if (b->method != RAW) {
1942
        // Maybe already compressed if s->block[0] was compressed and
1943
        // we have e.g. s->block[DS_BA] set to s->block[0] due to only
1944
        // one base type present and hence using E_HUFFMAN on block 0.
1945
        // A second explicit attempt to compress the same block then
1946
        // occurs.
1947
0
        return 0;
1948
0
    }
1949
1950
478k
    if (method == -1) {
1951
7.93k
        method = 1<<GZIP;
1952
7.93k
        if (fd->use_bz2)
1953
0
            method |= 1<<BZIP2;
1954
7.93k
        if (fd->use_lzma)
1955
0
            method |= 1<<LZMA;
1956
7.93k
    }
1957
1958
478k
    if (level == -1)
1959
7.93k
        level = fd->level;
1960
1961
    //fprintf(stderr, "IN: block %d, sz %d\n", b->content_id, b->uncomp_size);
1962
1963
478k
    if (method == RAW || level == 0 || b->uncomp_size == 0) {
1964
279k
        b->method = RAW;
1965
279k
        b->comp_size = b->uncomp_size;
1966
        //fprintf(stderr, "Skip block id %d\n", b->content_id);
1967
279k
        return 0;
1968
279k
    }
1969
1970
198k
#ifndef ABS
1971
198k
#    define ABS(a) ((a)>=0?(a):-(a))
1972
198k
#endif
1973
1974
198k
    if (metrics) {
1975
183k
        pthread_mutex_lock(&fd->metrics_lock);
1976
        // Sudden changes in size trigger a retrial.  These are mainly
1977
        // triggered when switching to sorted / unsorted, where the number
1978
        // of elements in a slice radically changes.
1979
        //
1980
        // We also get large fluctuations based on genome coordinate for
1981
        // e.g. SA:Z and SC series, but we consider the typical scale of
1982
        // delta between blocks and use this to look for abnormality.
1983
1984
        // Equivalent to (but minus possible integer overflow)
1985
        //   (b->uncomp_size + 1000)/4 > metrics->input_avg_sz+1000 ||
1986
        //    b->uncomp_size + 1000    < (metrics->input_avg_sz+1000)/4)
1987
183k
        if (metrics->input_avg_sz &&
1988
183k
            (b->uncomp_size/4 - 750 > metrics->input_avg_sz ||
1989
38.1k
             b->uncomp_size         < metrics->input_avg_sz/4 - 750) &&
1990
183k
            ABS(b->uncomp_size-metrics->input_avg_sz)/10
1991
898
                > metrics->input_avg_delta) {
1992
28
            metrics->next_trial = 0;
1993
28
        }
1994
1995
183k
        if (metrics->trial > 0 || --metrics->next_trial <= 0) {
1996
47.0k
            int m, unpackable = metrics->unpackable;
1997
47.0k
            size_t sz_best = b->uncomp_size;
1998
47.0k
            size_t sz[CRAM_MAX_METHOD] = {0};
1999
47.0k
            int method_best = 0; // RAW
2000
47.0k
            char *c_best = NULL, *c = NULL;
2001
2002
47.0k
            metrics->input_avg_delta =
2003
47.0k
                0.9 * (metrics->input_avg_delta +
2004
47.0k
                       ABS(b->uncomp_size - metrics->input_avg_sz));
2005
2006
47.0k
            metrics->input_avg_sz += b->uncomp_size*.2;
2007
47.0k
            metrics->input_avg_sz *= 0.8;
2008
2009
47.0k
            if (metrics->revised_method)
2010
20.3k
                method = metrics->revised_method;
2011
26.6k
            else
2012
26.6k
                metrics->revised_method = method;
2013
2014
47.0k
            if (metrics->next_trial <= 0) {
2015
1.06k
                metrics->next_trial = TRIAL_SPAN;
2016
1.06k
                metrics->trial = NTRIALS;
2017
35.1k
                for (m = 0; m < CRAM_MAX_METHOD; m++)
2018
34.1k
                    metrics->sz[m] /= 2;
2019
1.06k
                metrics->unpackable = 0;
2020
1.06k
            }
2021
2022
            // Compress this block using the best method
2023
47.0k
            if (unpackable && CRAM_MAJOR_VERS(fd->version) > 3) {
2024
                // No point trying bit-pack if 17+ symbols.
2025
0
                if (method & (1<<RANS_PR128))
2026
0
                    method = (method|(1<<RANS_PR0))&~(1<<RANS_PR128);
2027
0
                if (method & (1<<RANS_PR129))
2028
0
                    method = (method|(1<<RANS_PR1))&~(1<<RANS_PR129);
2029
0
                if (method & (1<<RANS_PR192))
2030
0
                    method = (method|(1<<RANS_PR64))&~(1<<RANS_PR192);
2031
0
                if (method & (1<<RANS_PR193))
2032
0
                    method = (method|(1<<RANS_PR64)|(1<<RANS_PR1))&~(1<<RANS_PR193);
2033
2034
0
                if (method & (1<<ARITH_PR128))
2035
0
                    method = (method|(1<<ARITH_PR0))&~(1<<ARITH_PR128);
2036
0
                if (method & (1<<ARITH_PR129))
2037
0
                    method = (method|(1<<ARITH_PR1))&~(1<<ARITH_PR129);
2038
0
                if (method & (1<<ARITH_PR192))
2039
0
                    method = (method|(1<<ARITH_PR64))&~(1<<ARITH_PR192);
2040
0
                if (method & (1u<<ARITH_PR193))
2041
0
                    method = (method|(1<<ARITH_PR64)|(1<<ARITH_PR1))&~(1u<<ARITH_PR193);
2042
0
            }
2043
2044
            // Libdeflate doesn't have a Z_RLE strategy.
2045
            // We treat it as level 1, but iff we haven't also
2046
            // explicitly listed that in the method list.
2047
#ifdef HAVE_LIBDEFLATE
2048
            if ((method & (1<<GZIP_RLE)) && (method & (1<<GZIP_1)))
2049
                method &= ~(1<<GZIP_RLE);
2050
#endif
2051
2052
47.0k
            pthread_mutex_unlock(&fd->metrics_lock);
2053
2054
1.55M
            for (m = 0; m < CRAM_MAX_METHOD; m++) {
2055
1.50M
                if (method & (1u<<m)) {
2056
327k
                    int lvl = level;
2057
327k
                    switch (m) {
2058
46.5k
                    case GZIP:     strat = Z_FILTERED; break;
2059
46.9k
                    case GZIP_1:   strat = Z_DEFAULT_STRATEGY; lvl = 1; break;
2060
31.2k
                    case GZIP_RLE: strat = Z_RLE; break;
2061
0
                    case FQZ:      strat = CRAM_MAJOR_VERS(fd->version); break;
2062
0
                    case FQZ_b:    strat = CRAM_MAJOR_VERS(fd->version)+256; break;
2063
0
                    case FQZ_c:    strat = CRAM_MAJOR_VERS(fd->version)+2*256; break;
2064
0
                    case FQZ_d:    strat = CRAM_MAJOR_VERS(fd->version)+3*256; break;
2065
15.4k
                    case TOK3:     strat = 0; break;
2066
0
                    case TOKA:     strat = 1; break;
2067
187k
                    default:       strat = 0;
2068
327k
                    }
2069
2070
327k
                    c = cram_compress_by_method(s, (char *)b->data, b->uncomp_size,
2071
327k
                                                b->content_id, &sz[m], m, lvl, strat);
2072
2073
327k
                    if (c && sz_best > sz[m]) {
2074
21.3k
                        sz_best = sz[m];
2075
21.3k
                        method_best = m;
2076
21.3k
                        if (c_best)
2077
11.1k
                            free(c_best);
2078
21.3k
                        c_best = c;
2079
306k
                    } else if (c) {
2080
305k
                        free(c);
2081
305k
                    } else {
2082
1.27k
                        sz[m] = UINT_MAX; // arbitrarily worse than raw
2083
1.27k
                    }
2084
1.17M
                } else {
2085
1.17M
                    sz[m] = UINT_MAX; // arbitrarily worse than raw
2086
1.17M
                }
2087
1.50M
            }
2088
2089
47.0k
            if (c_best) {
2090
10.1k
                free(b->data);
2091
10.1k
                b->data = (unsigned char *)c_best;
2092
10.1k
                b->method = method_best; // adjusted to methmap[method_best] later
2093
10.1k
                b->comp_size = sz_best;
2094
10.1k
            }
2095
2096
            // Accumulate stats for all methods tried
2097
47.0k
            pthread_mutex_lock(&fd->metrics_lock);
2098
1.55M
            for (m = 0; m < CRAM_MAX_METHOD; m++)
2099
                // don't be overly sure on small blocks.
2100
                // +2000 means eg bzip2 vs gzip (1.07 to 1.04) or gz vs rans1
2101
                // needs to be at least 60 bytes smaller to overcome the
2102
                // fixed size addition.
2103
1.50M
                metrics->sz[m] += sz[m]+2000;
2104
2105
            // When enough trials performed, find the best on average
2106
47.0k
            if (--metrics->trial == 0) {
2107
12.9k
                int best_method = RAW;
2108
12.9k
                int best_sz = INT_MAX;
2109
2110
                // Relative costs of methods. See enum_cram_block_method_int
2111
                // and methmap
2112
12.9k
                double meth_cost[32] = {
2113
                    // Externally defined methods
2114
12.9k
                    1,    // 0  raw
2115
12.9k
                    1.04, // 1  gzip (Z_FILTERED)
2116
12.9k
                    1.07, // 2  bzip2
2117
12.9k
                    1.08, // 3  lzma
2118
12.9k
                    1.00, // 4  rans    (O0)
2119
12.9k
                    1.00, // 5  ranspr  (O0)
2120
12.9k
                    1.04, // 6  arithpr (O0)
2121
12.9k
                    1.05, // 7  fqz
2122
12.9k
                    1.05, // 8  tok3 (rans)
2123
12.9k
                    1.00, 1.00, // 9,10 reserved
2124
2125
                    // Paramterised versions of above
2126
12.9k
                    1.01, // gzip rle
2127
12.9k
                    1.01, // gzip -1
2128
2129
12.9k
                    1.05, 1.05, 1.05, // FQZ_b,c,d
2130
2131
12.9k
                    1.01, // rans O1
2132
2133
12.9k
                    1.01, // rans_pr1
2134
12.9k
                    1.00, // rans_pr64; if smaller, usually fast
2135
12.9k
                    1.03, // rans_pr65/9
2136
12.9k
                    1.00, // rans_pr128
2137
12.9k
                    1.01, // rans_pr129
2138
12.9k
                    1.00, // rans_pr192
2139
12.9k
                    1.01, // rans_pr193
2140
2141
12.9k
                    1.07, // tok3 arith
2142
2143
12.9k
                    1.04, // arith_pr1
2144
12.9k
                    1.04, // arith_pr64
2145
12.9k
                    1.04, // arith_pr9
2146
12.9k
                    1.03, // arith_pr128
2147
12.9k
                    1.04, // arith_pr129
2148
12.9k
                    1.04, // arith_pr192
2149
12.9k
                    1.04, // arith_pr193
2150
12.9k
                };
2151
2152
                // Scale methods by cost based on compression level
2153
12.9k
                if (fd->level <= 1) {
2154
0
                    for (m = 0; m < CRAM_MAX_METHOD; m++)
2155
0
                        metrics->sz[m] *= 1+(meth_cost[m]-1)*4;
2156
12.9k
                } else if (fd->level <= 3) {
2157
0
                    for (m = 0; m < CRAM_MAX_METHOD; m++)
2158
0
                        metrics->sz[m] *= 1+(meth_cost[m]-1);
2159
12.9k
                } else if (fd->level <= 6) {
2160
425k
                    for (m = 0; m < CRAM_MAX_METHOD; m++)
2161
412k
                        metrics->sz[m] *= 1+(meth_cost[m]-1)/2;
2162
12.9k
                } else if (fd->level <= 7) {
2163
0
                    for (m = 0; m < CRAM_MAX_METHOD; m++)
2164
0
                        metrics->sz[m] *= 1+(meth_cost[m]-1)/3;
2165
0
                } // else cost is ignored
2166
2167
                // Ensure these are never used; BSC and ZSTD
2168
12.9k
                metrics->sz[9] = metrics->sz[10] = INT_MAX;
2169
2170
425k
                for (m = 0; m < CRAM_MAX_METHOD; m++) {
2171
412k
                    if ((!metrics->sz[m]) || (!(method & (1u<<m))))
2172
333k
                        continue;
2173
2174
79.5k
                    if (best_sz > metrics->sz[m])
2175
28.5k
                        best_sz = metrics->sz[m], best_method = m;
2176
79.5k
                }
2177
2178
12.9k
                if (best_method != metrics->method) {
2179
                    //metrics->trial = (NTRIALS+1)/2; // be sure
2180
                    //metrics->next_trial /= 1.5;
2181
7.72k
                    metrics->consistency = 0;
2182
7.72k
                } else {
2183
5.18k
                    metrics->next_trial *= MIN(2, 1+metrics->consistency/4.0);
2184
5.18k
                    metrics->consistency++;
2185
5.18k
                }
2186
2187
12.9k
                metrics->method = best_method;
2188
12.9k
                switch (best_method) {
2189
111
                case GZIP:     strat = Z_FILTERED; break;
2190
6.02k
                case GZIP_1:   strat = Z_DEFAULT_STRATEGY; break;
2191
4
                case GZIP_RLE: strat = Z_RLE; break;
2192
0
                case FQZ:      strat = CRAM_MAJOR_VERS(fd->version); break;
2193
0
                case FQZ_b:    strat = CRAM_MAJOR_VERS(fd->version)+256; break;
2194
0
                case FQZ_c:    strat = CRAM_MAJOR_VERS(fd->version)+2*256; break;
2195
0
                case FQZ_d:    strat = CRAM_MAJOR_VERS(fd->version)+3*256; break;
2196
98
                case TOK3:     strat = 0; break;
2197
0
                case TOKA:     strat = 1; break;
2198
6.66k
                default:       strat = 0;
2199
12.9k
                }
2200
12.9k
                metrics->strat  = strat;
2201
2202
                // If we see at least MAXFAIL trials in a row for a specific
2203
                // compression method with more than MAXDELTA aggregate
2204
                // size then we drop this from the list of methods used
2205
                // for this block type.
2206
86.9k
#define MAXDELTA 0.20
2207
279k
#define MAXFAILS 4
2208
425k
                for (m = 0; m < CRAM_MAX_METHOD; m++) {
2209
412k
                    if (best_method == m) {
2210
12.9k
                        metrics->cnt[m] = 0;
2211
12.9k
                        metrics->extra[m] = 0;
2212
399k
                    } else if (best_sz < metrics->sz[m]) {
2213
279k
                        double r = (double)metrics->sz[m] / best_sz - 1;
2214
279k
                        int mul = 1+(fd->level>=7);
2215
279k
                        if (++metrics->cnt[m] >= MAXFAILS*mul &&
2216
279k
                            (metrics->extra[m] += r) >= MAXDELTA*mul)
2217
42.7k
                            method &= ~(1u<<m);
2218
2219
                        // Special case for fqzcomp as it rarely changes
2220
279k
                        if (m == FQZ || m == FQZ_b || m == FQZ_c || m == FQZ_d) {
2221
48.6k
                            if (metrics->sz[m] > best_sz)
2222
48.6k
                                method &= ~(1u<<m);
2223
48.6k
                        }
2224
279k
                    }
2225
412k
                }
2226
2227
                //if (fd->verbose > 1 && method != metrics->revised_method)
2228
                //    fprintf(stderr, "%d: revising method from %x to %x\n",
2229
                //          b->content_id, metrics->revised_method, method);
2230
12.9k
                metrics->revised_method = method;
2231
12.9k
            }
2232
47.0k
            pthread_mutex_unlock(&fd->metrics_lock);
2233
135k
        } else {
2234
135k
            metrics->input_avg_delta =
2235
135k
                0.9 * (metrics->input_avg_delta +
2236
135k
                       ABS(b->uncomp_size - metrics->input_avg_sz));
2237
2238
135k
            metrics->input_avg_sz += b->uncomp_size*.2;
2239
135k
            metrics->input_avg_sz *= 0.8;
2240
2241
135k
            strat = metrics->strat;
2242
135k
            method = metrics->method;
2243
2244
135k
            pthread_mutex_unlock(&fd->metrics_lock);
2245
135k
            comp = cram_compress_by_method(s, (char *)b->data, b->uncomp_size,
2246
135k
                                           b->content_id, &comp_size, method,
2247
135k
                                           method == GZIP_1 ? 1 : level,
2248
135k
                                           strat);
2249
135k
            if (!comp) {
2250
                // Our cached best method failed, but maybe another works?
2251
                // Rerun with trial mode engaged again.
2252
23
                if (!recurse) {
2253
23
                    hts_log_warning("Compressed block ID %d method %s failed, "
2254
23
                                    "redoing trial", b->content_id,
2255
23
                                    cram_block_method2str(method));
2256
23
                    pthread_mutex_lock(&fd->metrics_lock);
2257
23
                    metrics->trial = NTRIALS;
2258
23
                    metrics->next_trial = TRIAL_SPAN;
2259
23
                    metrics->revised_method = orig_method;
2260
23
                    pthread_mutex_unlock(&fd->metrics_lock);
2261
23
                    return cram_compress_block3(fd, s, b, metrics, method,
2262
23
                                                level, 1);
2263
23
                }
2264
0
                return -1;
2265
23
            }
2266
2267
135k
            if (comp_size < b->uncomp_size) {
2268
20.1k
                free(b->data);
2269
20.1k
                b->data = (unsigned char *)comp;
2270
20.1k
                b->comp_size = comp_size;
2271
20.1k
                b->method = method;
2272
115k
            } else {
2273
115k
                free(comp);
2274
115k
            }
2275
135k
        }
2276
2277
183k
    } else {
2278
        // no cached metrics, so just do zlib?
2279
15.9k
        comp = cram_compress_by_method(s, (char *)b->data, b->uncomp_size,
2280
15.9k
                                       b->content_id, &comp_size, GZIP, level, Z_FILTERED);
2281
15.9k
        if (!comp) {
2282
0
            hts_log_error("Compression failed!");
2283
0
            return -1;
2284
0
        }
2285
2286
15.9k
        if (comp_size < b->uncomp_size) {
2287
963
            free(b->data);
2288
963
            b->data = (unsigned char *)comp;
2289
963
            b->comp_size = comp_size;
2290
963
            b->method = GZIP;
2291
14.9k
        } else {
2292
14.9k
            free(comp);
2293
14.9k
        }
2294
15.9k
        strat = Z_FILTERED;
2295
15.9k
    }
2296
2297
198k
    hts_log_info("Compressed block ID %d from %d to %d by method %s",
2298
198k
                 b->content_id, b->uncomp_size, b->comp_size,
2299
198k
                 cram_block_method2str(b->method));
2300
2301
198k
    b->method = methmap[b->method];
2302
2303
198k
    return 0;
2304
198k
}
2305
2306
/*
2307
 * Compresses a block using a selection of compression codecs and options.
2308
 * The best is learnt and used for subsequent slices, periodically resampling.
2309
 *
2310
 * Method and level -1 implies defaults, as specified in cram_fd.
2311
 */
2312
int cram_compress_block2(cram_fd *fd, cram_slice *s,
2313
                         cram_block *b, cram_metrics *metrics,
2314
513k
                         int method, int level) {
2315
513k
    return cram_compress_block3(fd, s, b, metrics, method, level, 0);
2316
513k
}
2317
2318
int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics,
2319
7.93k
                        int method, int level) {
2320
7.93k
    return cram_compress_block2(fd, NULL, b, metrics, method, level);
2321
7.93k
}
2322
2323
786k
cram_metrics *cram_new_metrics(void) {
2324
786k
    cram_metrics *m = calloc(1, sizeof(*m));
2325
786k
    if (!m)
2326
0
        return NULL;
2327
786k
    m->trial = NTRIALS-1;
2328
786k
    m->next_trial = TRIAL_SPAN/2; // learn quicker at start
2329
786k
    m->method = RAW;
2330
786k
    m->strat = 0;
2331
786k
    m->revised_method = 0;
2332
786k
    m->unpackable = 0;
2333
2334
786k
    return m;
2335
786k
}
2336
2337
198k
char *cram_block_method2str(enum cram_block_method_int m) {
2338
198k
    switch(m) {
2339
167k
    case RAW:         return "RAW";
2340
2.96k
    case GZIP:        return "GZIP";
2341
0
    case BZIP2:       return "BZIP2";
2342
0
    case LZMA:        return "LZMA";
2343
0
    case RANS0:       return "RANS0";
2344
0
    case RANS1:       return "RANS1";
2345
150
    case GZIP_RLE:    return "GZIP_RLE";
2346
2.69k
    case GZIP_1:      return "GZIP_1";
2347
0
    case FQZ:         return "FQZ";
2348
0
    case FQZ_b:       return "FQZ_b";
2349
0
    case FQZ_c:       return "FQZ_c";
2350
0
    case FQZ_d:       return "FQZ_d";
2351
743
    case RANS_PR0:    return "RANS_PR0";
2352
61
    case RANS_PR1:    return "RANS_PR1";
2353
12.7k
    case RANS_PR64:   return "RANS_PR64";
2354
3
    case RANS_PR9:    return "RANS_PR9";
2355
11.6k
    case RANS_PR128:  return "RANS_PR128";
2356
0
    case RANS_PR129:  return "RANS_PR129";
2357
0
    case RANS_PR192:  return "RANS_PR192";
2358
195
    case RANS_PR193:  return "RANS_PR193";
2359
27
    case TOK3:        return "TOK3_R";
2360
0
    case TOKA:        return "TOK3_A";
2361
0
    case ARITH_PR0:   return "ARITH_PR0";
2362
0
    case ARITH_PR1:   return "ARITH_PR1";
2363
0
    case ARITH_PR64:  return "ARITH_PR64";
2364
0
    case ARITH_PR9:   return "ARITH_PR9";
2365
0
    case ARITH_PR128: return "ARITH_PR128";
2366
0
    case ARITH_PR129: return "ARITH_PR129";
2367
0
    case ARITH_PR192: return "ARITH_PR192";
2368
0
    case ARITH_PR193: return "ARITH_PR193";
2369
0
    case BM_ERROR: break;
2370
198k
    }
2371
0
    return "?";
2372
198k
}
2373
2374
7
char *cram_content_type2str(enum cram_content_type t) {
2375
7
    switch (t) {
2376
3
    case FILE_HEADER:         return "FILE_HEADER";
2377
2
    case COMPRESSION_HEADER:  return "COMPRESSION_HEADER";
2378
0
    case MAPPED_SLICE:        return "MAPPED_SLICE";
2379
0
    case UNMAPPED_SLICE:      return "UNMAPPED_SLICE";
2380
0
    case EXTERNAL:            return "EXTERNAL";
2381
0
    case CORE:                return "CORE";
2382
0
    case CT_ERROR:            break;
2383
7
    }
2384
2
    return "?";
2385
7
}
2386
2387
/* ----------------------------------------------------------------------
2388
 * Reference sequence handling
2389
 *
2390
 * These revolve around the refs_t structure, which may potentially be
2391
 * shared between multiple cram_fd.
2392
 *
2393
 * We start with refs_create() to allocate an empty refs_t and then
2394
 * populate it with @SQ line data using refs_from_header(). This is done on
2395
 * cram_open().  Also at start up we can call cram_load_reference() which
2396
 * is used with "scramble -r foo.fa". This replaces the fd->refs with the
2397
 * new one specified. In either case refs2id() is then called which
2398
 * maps ref_entry names to @SQ ids (refs_t->ref_id[]).
2399
 *
2400
 * Later, possibly within a thread, we will want to know the actual ref
2401
 * seq itself, obtained by calling cram_get_ref().  This may use the
2402
 * UR: or M5: fields or the filename specified in the original
2403
 * cram_load_reference() call.
2404
 *
2405
 * Given the potential for multi-threaded reference usage, we have
2406
 * reference counting (sorry for the confusing double use of "ref") to
2407
 * track the number of callers interested in any specific reference.
2408
 */
2409
2410
/*
2411
 * Frees/unmaps a reference sequence and associated file handles.
2412
 */
2413
6.20k
static void ref_entry_free_seq(ref_entry *e) {
2414
6.20k
    if (e->mf)
2415
0
        mfclose(e->mf);
2416
6.20k
    if (e->seq && !e->mf)
2417
0
        free(e->seq);
2418
2419
6.20k
    e->seq = NULL;
2420
6.20k
    e->mf = NULL;
2421
6.20k
}
2422
2423
16.6k
void refs_free(refs_t *r) {
2424
16.6k
    RP("refs_free()\n");
2425
2426
16.6k
    if (--r->count > 0)
2427
0
        return;
2428
2429
16.6k
    if (!r)
2430
0
        return;
2431
2432
16.6k
    if (r->pool)
2433
16.6k
        string_pool_destroy(r->pool);
2434
2435
16.6k
    if (r->h_meta) {
2436
16.6k
        khint_t k;
2437
2438
32.4k
        for (k = kh_begin(r->h_meta); k != kh_end(r->h_meta); k++) {
2439
15.8k
            ref_entry *e;
2440
2441
15.8k
            if (!kh_exist(r->h_meta, k))
2442
9.61k
                continue;
2443
6.20k
            if (!(e = kh_val(r->h_meta, k)))
2444
0
                continue;
2445
6.20k
            ref_entry_free_seq(e);
2446
6.20k
            free(e);
2447
6.20k
        }
2448
2449
16.6k
        kh_destroy(refs, r->h_meta);
2450
16.6k
    }
2451
2452
16.6k
    if (r->ref_id)
2453
8.44k
        free(r->ref_id);
2454
2455
16.6k
    if (r->fp)
2456
0
        bgzf_close(r->fp);
2457
2458
16.6k
    pthread_mutex_destroy(&r->lock);
2459
2460
16.6k
    free(r);
2461
16.6k
}
2462
2463
16.6k
static refs_t *refs_create(void) {
2464
16.6k
    refs_t *r = calloc(1, sizeof(*r));
2465
2466
16.6k
    RP("refs_create()\n");
2467
2468
16.6k
    if (!r)
2469
0
        return NULL;
2470
2471
16.6k
    if (!(r->pool = string_pool_create(8192)))
2472
0
        goto err;
2473
2474
16.6k
    r->ref_id = NULL; // see refs2id() to populate.
2475
16.6k
    r->count = 1;
2476
16.6k
    r->last = NULL;
2477
16.6k
    r->last_id = -1;
2478
2479
16.6k
    if (!(r->h_meta = kh_init(refs)))
2480
0
        goto err;
2481
2482
16.6k
    pthread_mutex_init(&r->lock, NULL);
2483
2484
16.6k
    return r;
2485
2486
0
 err:
2487
0
    refs_free(r);
2488
0
    return NULL;
2489
16.6k
}
2490
2491
/*
2492
 * Opens a reference fasta file as a BGZF stream, allowing for
2493
 * compressed files.  It automatically builds a .fai file if
2494
 * required and if compressed a .gzi bgzf index too.
2495
 *
2496
 * Returns a BGZF handle on success;
2497
 *         NULL on failure.
2498
 */
2499
1.01k
static BGZF *bgzf_open_ref(char *fn, char *mode, int is_md5) {
2500
1.01k
    BGZF *fp;
2501
2502
1.01k
    if (strncmp(fn, "file://", 7) == 0)
2503
2
        fn += 7;
2504
2505
1.01k
    if (!is_md5 && !hisremote(fn)) {
2506
610
        char fai_file[PATH_MAX];
2507
2508
610
        snprintf(fai_file, PATH_MAX, "%s.fai", fn);
2509
610
        if (access(fai_file, R_OK) != 0)
2510
610
            if (fai_build(fn) != 0)
2511
610
                return NULL;
2512
610
    }
2513
2514
401
    if (!(fp = bgzf_open(fn, mode))) {
2515
401
        perror(fn);
2516
401
        return NULL;
2517
401
    }
2518
2519
0
    if (fp->is_compressed == 1 && bgzf_index_load(fp, fn, ".gzi") < 0) {
2520
0
        hts_log_error("Unable to load .gzi index '%s.gzi'", fn);
2521
0
        bgzf_close(fp);
2522
0
        return NULL;
2523
0
    }
2524
2525
0
    return fp;
2526
0
}
2527
2528
/*
2529
 * Loads a FAI file for a reference.fasta.
2530
 * "is_err" indicates whether failure to load is worthy of emitting an
2531
 * error message. In some cases (eg with embedded references) we
2532
 * speculatively load, just in case, and silently ignore errors.
2533
 *
2534
 * Returns the refs_t struct on success (maybe newly allocated);
2535
 *         NULL on failure
2536
 */
2537
1.01k
static refs_t *refs_load_fai(refs_t *r_orig, const char *fn, int is_err) {
2538
1.01k
    hFILE *fp = NULL;
2539
1.01k
    char fai_fn[PATH_MAX];
2540
1.01k
    char line[8192];
2541
1.01k
    refs_t *r = r_orig;
2542
1.01k
    size_t fn_l = strlen(fn);
2543
1.01k
    int id = 0, id_alloc = 0;
2544
2545
1.01k
    RP("refs_load_fai %s\n", fn);
2546
2547
1.01k
    if (!r)
2548
0
        if (!(r = refs_create()))
2549
0
            goto err;
2550
2551
1.01k
    if (r->fp)
2552
0
        if (bgzf_close(r->fp) != 0)
2553
0
            goto err;
2554
1.01k
    r->fp = NULL;
2555
2556
    /* Look for a FASTA##idx##FAI format */
2557
1.01k
    char *fn_delim = strstr(fn, HTS_IDX_DELIM);
2558
1.01k
    if (fn_delim) {
2559
7
        if (!(r->fn = string_ndup(r->pool, fn, fn_delim - fn)))
2560
0
            goto err;
2561
7
        fn_delim += strlen(HTS_IDX_DELIM);
2562
7
        snprintf(fai_fn, PATH_MAX, "%s", fn_delim);
2563
1.00k
    } else {
2564
        /* An index file was provided, instead of the actual reference file */
2565
1.00k
        if (fn_l > 4 && strcmp(&fn[fn_l-4], ".fai") == 0) {
2566
6
            if (!r->fn) {
2567
5
                if (!(r->fn = string_ndup(r->pool, fn, fn_l-4)))
2568
0
                    goto err;
2569
5
            }
2570
6
            snprintf(fai_fn, PATH_MAX, "%s", fn);
2571
998
        } else {
2572
        /* Only the reference file provided. Get the index file name from it */
2573
998
            if (!(r->fn = string_dup(r->pool, fn)))
2574
0
                goto err;
2575
998
            snprintf(fai_fn, PATH_MAX, "%.*s.fai", PATH_MAX-5, fn);
2576
998
        }
2577
1.00k
    }
2578
2579
1.01k
    if (!(r->fp = bgzf_open_ref(r->fn, "r", 0))) {
2580
1.01k
        hts_log_error("Failed to open reference file '%s'", r->fn);
2581
1.01k
        goto err;
2582
1.01k
    }
2583
2584
0
    if (!(fp = hopen(fai_fn, "r"))) {
2585
0
        hts_log_error("Failed to open index file '%s'", fai_fn);
2586
0
        if (is_err)
2587
0
            perror(fai_fn);
2588
0
        goto err;
2589
0
    }
2590
0
    while (hgets(line, 8192, fp) != NULL) {
2591
0
        ref_entry *e = malloc(sizeof(*e));
2592
0
        char *cp;
2593
0
        int n;
2594
0
        khint_t k;
2595
2596
0
        if (!e)
2597
0
            return NULL;
2598
2599
        // id
2600
0
        for (cp = line; *cp && !isspace_c(*cp); cp++)
2601
0
            ;
2602
0
        *cp++ = 0;
2603
0
        e->name = string_dup(r->pool, line);
2604
2605
        // length
2606
0
        while (*cp && isspace_c(*cp))
2607
0
            cp++;
2608
0
        e->length = strtoll(cp, &cp, 10);
2609
2610
        // offset
2611
0
        while (*cp && isspace_c(*cp))
2612
0
            cp++;
2613
0
        e->offset = strtoll(cp, &cp, 10);
2614
2615
        // bases per line
2616
0
        while (*cp && isspace_c(*cp))
2617
0
            cp++;
2618
0
        e->bases_per_line = strtol(cp, &cp, 10);
2619
2620
        // line length
2621
0
        while (*cp && isspace_c(*cp))
2622
0
            cp++;
2623
0
        e->line_length = strtol(cp, &cp, 10);
2624
2625
        // filename
2626
0
        e->fn = r->fn;
2627
2628
0
        e->count = 0;
2629
0
        e->seq = NULL;
2630
0
        e->mf = NULL;
2631
0
        e->is_md5 = 0;
2632
0
        e->validated_md5 = 0;
2633
2634
0
        k = kh_put(refs, r->h_meta, e->name, &n);
2635
0
        if (-1 == n)  {
2636
0
            free(e);
2637
0
            return NULL;
2638
0
        }
2639
2640
0
        if (n) {
2641
0
            kh_val(r->h_meta, k) = e;
2642
0
        } else {
2643
0
            ref_entry *re = kh_val(r->h_meta, k);
2644
0
            if (re && (re->count != 0 || re->length != 0)) {
2645
                /* Keep old */
2646
0
                free(e);
2647
0
            } else {
2648
                /* Replace old */
2649
0
                if (re)
2650
0
                    free(re);
2651
0
                kh_val(r->h_meta, k) = e;
2652
0
            }
2653
0
        }
2654
2655
0
        if (id >= id_alloc) {
2656
0
            ref_entry **new_refs;
2657
0
            int x;
2658
2659
0
            id_alloc = id_alloc ?id_alloc*2 : 16;
2660
0
            new_refs = realloc(r->ref_id, id_alloc * sizeof(*r->ref_id));
2661
0
            if (!new_refs)
2662
0
                goto err;
2663
0
            r->ref_id = new_refs;
2664
2665
0
            for (x = id; x < id_alloc; x++)
2666
0
                r->ref_id[x] = NULL;
2667
0
        }
2668
0
        r->ref_id[id] = e;
2669
0
        r->nref = ++id;
2670
0
    }
2671
2672
0
    if(hclose(fp) < 0)
2673
0
        goto err;
2674
0
    return r;
2675
2676
1.01k
 err:
2677
1.01k
    if (fp)
2678
0
        hclose_abruptly(fp);
2679
2680
1.01k
    if (!r_orig)
2681
0
        refs_free(r);
2682
2683
1.01k
    return NULL;
2684
0
}
2685
2686
/*
2687
 * Verifies that the CRAM @SQ lines and .fai files match.
2688
 */
2689
0
static void sanitise_SQ_lines(cram_fd *fd) {
2690
0
    int i;
2691
2692
0
    if (!fd->header || !fd->header->hrecs)
2693
0
        return;
2694
2695
0
    if (!fd->refs || !fd->refs->h_meta)
2696
0
        return;
2697
2698
0
    for (i = 0; i < fd->header->hrecs->nref; i++) {
2699
0
        const char *name = fd->header->hrecs->ref[i].name;
2700
0
        khint_t k = kh_get(refs, fd->refs->h_meta, name);
2701
0
        ref_entry *r;
2702
2703
        // We may have @SQ lines which have no known .fai, but do not
2704
        // in themselves pose a problem because they are unused in the file.
2705
0
        if (k == kh_end(fd->refs->h_meta))
2706
0
            continue;
2707
2708
0
        if (!(r = (ref_entry *)kh_val(fd->refs->h_meta, k)))
2709
0
            continue;
2710
2711
0
        if (r->length && r->length != fd->header->hrecs->ref[i].len) {
2712
0
            assert(strcmp(r->name, fd->header->hrecs->ref[i].name) == 0);
2713
2714
            // Should we also check MD5sums here to ensure the correct
2715
            // reference was given?
2716
0
            hts_log_warning("Header @SQ length mismatch for ref %s, %"PRIhts_pos" vs %d",
2717
0
                            r->name, fd->header->hrecs->ref[i].len, (int)r->length);
2718
2719
            // Fixing the parsed @SQ header will make MD:Z: strings work
2720
            // and also stop it producing N for the sequence.
2721
0
            fd->header->hrecs->ref[i].len = r->length;
2722
0
        }
2723
0
    }
2724
0
}
2725
2726
/*
2727
 * Indexes references by the order they appear in a BAM file. This may not
2728
 * necessarily be the same order they appear in the fasta reference file.
2729
 *
2730
 * Returns 0 on success
2731
 *        -1 on failure
2732
 */
2733
7.93k
int refs2id(refs_t *r, sam_hdr_t *hdr) {
2734
7.93k
    int i;
2735
7.93k
    sam_hrecs_t *h = hdr->hrecs;
2736
2737
7.93k
    if (r->ref_id)
2738
2.21k
        free(r->ref_id);
2739
7.93k
    if (r->last)
2740
0
        r->last = NULL;
2741
2742
7.93k
    r->ref_id = calloc(h->nref, sizeof(*r->ref_id));
2743
7.93k
    if (!r->ref_id)
2744
0
        return -1;
2745
2746
7.93k
    r->nref = h->nref;
2747
12.8k
    for (i = 0; i < h->nref; i++) {
2748
4.96k
        khint_t k = kh_get(refs, r->h_meta, h->ref[i].name);
2749
4.96k
        if (k != kh_end(r->h_meta)) {
2750
4.96k
            r->ref_id[i] = kh_val(r->h_meta, k);
2751
4.96k
        } else {
2752
0
            hts_log_warning("Unable to find ref name '%s'", h->ref[i].name);
2753
0
        }
2754
4.96k
    }
2755
2756
7.93k
    return 0;
2757
7.93k
}
2758
2759
/*
2760
 * Generates refs_t entries based on @SQ lines in the header.
2761
 * Returns 0 on success
2762
 *         -1 on failure
2763
 */
2764
33.5k
static int refs_from_header(cram_fd *fd) {
2765
33.5k
    if (!fd)
2766
0
        return -1;
2767
2768
33.5k
    refs_t *r = fd->refs;
2769
33.5k
    if (!r)
2770
0
        return -1;
2771
2772
33.5k
    sam_hdr_t *h = fd->header;
2773
33.5k
    if (!h)
2774
9.37k
        return 0;
2775
2776
24.1k
    if (!h->hrecs) {
2777
15.0k
        if (-1 == sam_hdr_fill_hrecs(h))
2778
1.05k
            return -1;
2779
15.0k
    }
2780
2781
23.0k
    if (h->hrecs->nref == 0)
2782
18.1k
        return 0;
2783
2784
    //fprintf(stderr, "refs_from_header for %p mode %c\n", fd, fd->mode);
2785
2786
    /* Existing refs are fine, as long as they're compatible with the hdr. */
2787
4.94k
    ref_entry **new_ref_id = realloc(r->ref_id, (r->nref + h->hrecs->nref) * sizeof(*r->ref_id));
2788
4.94k
    if (!new_ref_id)
2789
0
        return -1;
2790
4.94k
    r->ref_id = new_ref_id;
2791
2792
4.94k
    int i, j;
2793
    /* Copy info from h->ref[i] over to r */
2794
16.1k
    for (i = 0, j = r->nref; i < h->hrecs->nref; i++) {
2795
11.1k
        sam_hrec_type_t *ty;
2796
11.1k
        sam_hrec_tag_t *tag;
2797
11.1k
        khint_t k;
2798
11.1k
        int n;
2799
2800
11.1k
        k = kh_get(refs, r->h_meta, h->hrecs->ref[i].name);
2801
11.1k
        if (k != kh_end(r->h_meta))
2802
            // Ref already known about
2803
4.96k
            continue;
2804
2805
6.20k
        if (!(r->ref_id[j] = calloc(1, sizeof(ref_entry))))
2806
0
            return -1;
2807
2808
6.20k
        if (!h->hrecs->ref[i].name)
2809
0
            return -1;
2810
2811
6.20k
        r->ref_id[j]->name = string_dup(r->pool, h->hrecs->ref[i].name);
2812
6.20k
        if (!r->ref_id[j]->name) return -1;
2813
6.20k
        r->ref_id[j]->length = 0; // marker for not yet loaded
2814
2815
        /* Initialise likely filename if known */
2816
6.20k
        if ((ty = sam_hrecs_find_type_id(h->hrecs, "SQ", "SN", h->hrecs->ref[i].name))) {
2817
6.20k
            if ((tag = sam_hrecs_find_key(ty, "M5", NULL)))
2818
41
                r->ref_id[j]->fn = string_dup(r->pool, tag->str+3);
2819
2820
6.20k
            if ((tag = sam_hrecs_find_key(ty, "LN", NULL))) {
2821
                // LN tag used when constructing consensus reference
2822
6.20k
                r->ref_id[j]->LN_length = strtoll(tag->str+3, NULL, 0);
2823
                // See fuzz 382922241
2824
6.20k
                if (r->ref_id[j]->LN_length < 0)
2825
446
                    r->ref_id[j]->LN_length = 0;
2826
6.20k
            }
2827
6.20k
        }
2828
2829
6.20k
        k = kh_put(refs, r->h_meta, r->ref_id[j]->name, &n);
2830
6.20k
        if (n <= 0) // already exists or error
2831
0
            return -1;
2832
6.20k
        kh_val(r->h_meta, k) = r->ref_id[j];
2833
2834
6.20k
        j++;
2835
6.20k
    }
2836
4.94k
    r->nref = j;
2837
2838
4.94k
    return 0;
2839
4.94k
}
2840
2841
/*
2842
 * Attaches a header to a cram_fd.
2843
 *
2844
 * This should be used when creating a new cram_fd for writing where
2845
 * we have a header already constructed (eg from a file we've read
2846
 * in).
2847
 */
2848
8.98k
int cram_set_header2(cram_fd *fd, const sam_hdr_t *hdr) {
2849
8.98k
    if (!fd || !hdr )
2850
0
        return -1;
2851
2852
8.98k
    if (fd->header != hdr) {
2853
8.98k
        if (fd->header)
2854
0
            sam_hdr_destroy(fd->header);
2855
8.98k
        fd->header = sam_hdr_dup(hdr);
2856
8.98k
        if (!fd->header)
2857
0
            return -1;
2858
8.98k
    }
2859
8.98k
    return refs_from_header(fd);
2860
8.98k
}
2861
2862
0
int cram_set_header(cram_fd *fd, sam_hdr_t *hdr) {
2863
0
    return cram_set_header2(fd, hdr);
2864
0
}
2865
2866
/*
2867
 * Returns whether the path refers to a directory.
2868
 */
2869
0
static int is_directory(char *fn) {
2870
0
    struct stat buf;
2871
0
    if ( stat(fn,&buf) ) return 0;
2872
0
    return S_ISDIR(buf.st_mode);
2873
0
}
2874
2875
/*
2876
 * Converts a directory and a filename into an expanded path, replacing %s
2877
 * in directory with the filename and %[0-9]+s with portions of the filename
2878
 * Any remaining parts of filename are added to the end with /%s.
2879
 */
2880
0
static int expand_cache_path(char *path, char *dir, const char *fn) {
2881
0
    char *cp, *start = path;
2882
0
    size_t len;
2883
0
    size_t sz = PATH_MAX;
2884
2885
0
    while ((cp = strchr(dir, '%'))) {
2886
0
        if (cp-dir >= sz) return -1;
2887
0
        strncpy(path, dir, cp-dir);
2888
0
        path += cp-dir;
2889
0
        sz -= cp-dir;
2890
2891
0
        if (*++cp == 's') {
2892
0
            len = strlen(fn);
2893
0
            if (len >= sz) return -1;
2894
0
            strcpy(path, fn);
2895
0
            path += len;
2896
0
            sz -= len;
2897
0
            fn += len;
2898
0
            cp++;
2899
0
        } else if (*cp >= '0' && *cp <= '9') {
2900
0
            char *endp;
2901
0
            long l;
2902
2903
0
            l = strtol(cp, &endp, 10);
2904
0
            l = MIN(l, strlen(fn));
2905
0
            if (*endp == 's') {
2906
0
                if (l >= sz) return -1;
2907
0
                strncpy(path, fn, l);
2908
0
                path += l;
2909
0
                fn += l;
2910
0
                sz -= l;
2911
0
                *path = 0;
2912
0
                cp = endp+1;
2913
0
            } else {
2914
0
                if (sz < 3) return -1;
2915
0
                *path++ = '%';
2916
0
                *path++ = *cp++;
2917
0
            }
2918
0
        } else {
2919
0
            if (sz < 3) return -1;
2920
0
            *path++ = '%';
2921
0
            *path++ = *cp++;
2922
0
        }
2923
0
        dir = cp;
2924
0
    }
2925
2926
0
    len = strlen(dir);
2927
0
    if (len >= sz) return -1;
2928
0
    strcpy(path, dir);
2929
0
    path += len;
2930
0
    sz -= len;
2931
2932
0
    len = strlen(fn) + ((*fn && path > start && path[-1] != '/') ? 1 : 0);
2933
0
    if (len >= sz) return -1;
2934
0
    if (*fn && path > start && path[-1] != '/')
2935
0
        *path++ = '/';
2936
0
    strcpy(path, fn);
2937
0
    return 0;
2938
0
}
2939
2940
/*
2941
 * Make the directory containing path and any prefix directories.
2942
 */
2943
0
static void mkdir_prefix(char *path, int mode) {
2944
0
    char *cp = strrchr(path, '/');
2945
0
    if (!cp)
2946
0
        return;
2947
2948
0
    *cp = 0;
2949
0
    if (is_directory(path)) {
2950
0
        *cp = '/';
2951
0
        return;
2952
0
    }
2953
2954
0
    if (mkdir(path, mode) == 0) {
2955
0
        chmod(path, mode);
2956
0
        *cp = '/';
2957
0
        return;
2958
0
    }
2959
2960
0
    mkdir_prefix(path, mode);
2961
0
    mkdir(path, mode);
2962
0
    chmod(path, mode);
2963
0
    *cp = '/';
2964
0
}
2965
2966
/*
2967
 * Queries the M5 string from the header and attempts to populate the
2968
 * reference from this using the REF_PATH environment.
2969
 *
2970
 * Returns 0 on success
2971
 *        -1 on failure
2972
 */
2973
2.46k
static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
2974
2.46k
    char *ref_path = getenv("REF_PATH");
2975
2.46k
    sam_hrec_type_t *ty;
2976
2.46k
    sam_hrec_tag_t *tag;
2977
2.46k
    char path[PATH_MAX];
2978
2.46k
    kstring_t path_tmp = KS_INITIALIZE;
2979
2.46k
    char *local_cache = getenv("REF_CACHE");
2980
2.46k
    mFILE *mf;
2981
2.46k
    int local_path = 0;
2982
2983
2.46k
    hts_log_info("Running cram_populate_ref on fd %p, id %d", (void *)fd, id);
2984
2985
2.46k
    if (!r->name)
2986
0
        return -1;
2987
2988
2.46k
    if (!(ty = sam_hrecs_find_type_id(fd->header->hrecs, "SQ", "SN", r->name)))
2989
0
        return -1;
2990
2991
2.46k
    if (!(tag = sam_hrecs_find_key(ty, "M5", NULL)))
2992
2.45k
        goto no_M5;
2993
2994
12
    hts_log_info("Querying ref %s", tag->str+3);
2995
2996
    /* Use cache if available */
2997
12
    if (local_cache && *local_cache) {
2998
0
        struct stat sb;
2999
0
        if (expand_cache_path(path, local_cache, tag->str+3) == 0 &&
3000
0
            stat(path, &sb) == 0)
3001
            // Found it in the local cache
3002
0
            local_path = 1;
3003
0
    }
3004
3005
#ifndef HAVE_MMAP
3006
    char *path2;
3007
    /* Search local files in REF_PATH; we can open them and return as above */
3008
    if (!local_path && (path2 = find_path(tag->str+3, ref_path))) {
3009
        int len = snprintf(path, PATH_MAX, "%s", path2);
3010
        free(path2);
3011
        if (len > 0 && len < PATH_MAX) // in case it's too long
3012
            local_path = 1;
3013
    }
3014
#endif
3015
3016
    /* Found via REF_CACHE or local REF_PATH file */
3017
12
    if (local_path) {
3018
0
        struct stat sb;
3019
0
        BGZF *fp;
3020
3021
0
        if (0 == stat(path, &sb)
3022
0
            && S_ISREG(sb.st_mode)
3023
0
            && (fp = bgzf_open(path, "r"))) {
3024
0
            r->length = sb.st_size;
3025
0
            r->offset = r->line_length = r->bases_per_line = 0;
3026
3027
0
            r->fn = string_dup(fd->refs->pool, path);
3028
3029
0
            if (fd->refs->fp)
3030
0
                if (bgzf_close(fd->refs->fp) != 0)
3031
0
                    return -1;
3032
0
            fd->refs->fp = fp;
3033
0
            fd->refs->fn = r->fn;
3034
0
            r->is_md5 = 1;
3035
0
            r->validated_md5 = 1;
3036
3037
            // Fall back to cram_get_ref() where it'll do the actual
3038
            // reading of the file.
3039
0
            return 0;
3040
0
        }
3041
0
    }
3042
3043
3044
    /* Otherwise search full REF_PATH; slower as loads entire file */
3045
12
    int is_local = 0;
3046
12
    if ((mf = open_path_mfile(tag->str+3, ref_path, NULL, &is_local))) {
3047
0
        size_t sz;
3048
0
        r->seq = mfsteal(mf, &sz);
3049
0
        if (r->seq) {
3050
0
            r->mf = NULL;
3051
0
        } else {
3052
            // keep mf around as we couldn't detach
3053
0
            r->seq = mf->data;
3054
0
            r->mf = mf;
3055
0
        }
3056
0
        r->length = sz;
3057
0
        r->is_md5 = 1;
3058
0
        r->validated_md5 = 1;
3059
12
    } else {
3060
12
        refs_t *refs;
3061
12
        const char *fn;
3062
12
        sam_hrec_tag_t *UR_tag;
3063
3064
2.46k
    no_M5:
3065
        /* Failed to find in search path or M5 cache, see if @SQ UR: tag? */
3066
2.46k
        if (!(UR_tag = sam_hrecs_find_key(ty, "UR", NULL)))
3067
1.44k
            return -1;
3068
3069
1.01k
        if (strstr(UR_tag->str+3, "://") &&
3070
1.01k
            strncmp(UR_tag->str+3, "file:", 5) != 0) {
3071
            // Documented as omitted, but accidentally supported until now
3072
7
            hts_log_error("UR tags pointing to remote files are not supported");
3073
7
            return -1;
3074
7
        }
3075
3076
1.01k
        fn = (strncmp(UR_tag->str+3, "file:", 5) == 0)
3077
1.01k
            ? UR_tag->str+8
3078
1.01k
            : UR_tag->str+3;
3079
3080
1.01k
        if (fd->refs->fp) {
3081
0
            if (bgzf_close(fd->refs->fp) != 0)
3082
0
                return -1;
3083
0
            fd->refs->fp = NULL;
3084
0
        }
3085
1.01k
        if (!(refs = refs_load_fai(fd->refs, fn, 0)))
3086
1.01k
            return -1;
3087
0
        sanitise_SQ_lines(fd);
3088
3089
0
        fd->refs = refs;
3090
0
        if (fd->refs->fp) {
3091
0
            if (bgzf_close(fd->refs->fp) != 0)
3092
0
                return -1;
3093
0
            fd->refs->fp = NULL;
3094
0
        }
3095
3096
0
        if (!fd->refs->fn)
3097
0
            return -1;
3098
3099
0
        if (-1 == refs2id(fd->refs, fd->header))
3100
0
            return -1;
3101
0
        if (!fd->refs->ref_id || !fd->refs->ref_id[id])
3102
0
            return -1;
3103
3104
        // Local copy already, so fall back to cram_get_ref().
3105
0
        return 0;
3106
0
    }
3107
3108
    /* Populate the local disk cache if required */
3109
0
    if (!is_local && local_cache && *local_cache) {
3110
0
        hFILE *fp;
3111
3112
0
        if (expand_cache_path(path, local_cache, tag->str+3) < 0) {
3113
0
            return 0; // Not fatal - we have the data already so keep going.
3114
0
        }
3115
0
        hts_log_info("Writing cache file '%s'", path);
3116
0
        mkdir_prefix(path, 01777);
3117
3118
0
        fp = hts_open_tmpfile(path, "wx", &path_tmp);
3119
0
        if (!fp) {
3120
0
            perror(path_tmp.s);
3121
0
            free(path_tmp.s);
3122
3123
            // Not fatal - we have the data already so keep going.
3124
0
            return 0;
3125
0
        }
3126
3127
        // Check md5sum
3128
0
        hts_md5_context *md5;
3129
0
        char unsigned md5_buf1[16];
3130
0
        char md5_buf2[33];
3131
3132
0
        if (!(md5 = hts_md5_init())) {
3133
0
            hclose_abruptly(fp);
3134
0
            unlink(path_tmp.s);
3135
0
            free(path_tmp.s);
3136
0
            return -1;
3137
0
        }
3138
0
        hts_md5_update(md5, r->seq, r->length);
3139
0
        hts_md5_final(md5_buf1, md5);
3140
0
        hts_md5_destroy(md5);
3141
0
        hts_md5_hex(md5_buf2, md5_buf1);
3142
3143
0
        if (strncmp(tag->str+3, md5_buf2, 32) != 0) {
3144
0
            hts_log_error("Mismatching md5sum for downloaded reference");
3145
0
            hclose_abruptly(fp);
3146
0
            unlink(path_tmp.s);
3147
0
            free(path_tmp.s);
3148
0
            return -1;
3149
0
        }
3150
3151
0
        ssize_t length_written = hwrite(fp, r->seq, r->length);
3152
0
        if (hclose(fp) < 0 || length_written != r->length ||
3153
0
            chmod(path_tmp.s, 0444) < 0 ||
3154
0
            rename(path_tmp.s, path) < 0) {
3155
0
            hts_log_error("Creating reference at %s failed: %s",
3156
0
                          path, strerror(errno));
3157
0
            unlink(path_tmp.s);
3158
0
        }
3159
0
    }
3160
3161
0
    free(path_tmp.s);
3162
0
    return 0;
3163
0
}
3164
3165
1.04k
static void cram_ref_incr_locked(refs_t *r, int id) {
3166
1.04k
    RP("%d INC REF %d, %d %p\n", gettid(), id,
3167
1.04k
       (int)(id>=0 && r->ref_id[id]?r->ref_id[id]->count+1:-999),
3168
1.04k
       id>=0 && r->ref_id[id]?r->ref_id[id]->seq:(char *)1);
3169
3170
1.04k
    if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq)
3171
1.04k
        return;
3172
3173
0
    if (r->last_id == id)
3174
0
        r->last_id = -1;
3175
3176
0
    ++r->ref_id[id]->count;
3177
0
}
3178
3179
1.04k
void cram_ref_incr(refs_t *r, int id) {
3180
1.04k
    pthread_mutex_lock(&r->lock);
3181
1.04k
    cram_ref_incr_locked(r, id);
3182
1.04k
    pthread_mutex_unlock(&r->lock);
3183
1.04k
}
3184
3185
66
static void cram_ref_decr_locked(refs_t *r, int id) {
3186
66
    RP("%d DEC REF %d, %d %p\n", gettid(), id,
3187
66
       (int)(id>=0 && r->ref_id[id]?r->ref_id[id]->count-1:-999),
3188
66
       id>=0 && r->ref_id[id]?r->ref_id[id]->seq:(char *)1);
3189
3190
66
    if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq) {
3191
66
        return;
3192
66
    }
3193
3194
0
    if (--r->ref_id[id]->count <= 0) {
3195
0
        assert(r->ref_id[id]->count == 0);
3196
0
        if (r->last_id >= 0) {
3197
0
            if (r->ref_id[r->last_id]->count <= 0 &&
3198
0
                r->ref_id[r->last_id]->seq) {
3199
0
                RP("%d FREE REF %d (%p)\n", gettid(),
3200
0
                   r->last_id, r->ref_id[r->last_id]->seq);
3201
0
                ref_entry_free_seq(r->ref_id[r->last_id]);
3202
0
                if (r->ref_id[r->last_id]->is_md5) r->ref_id[r->last_id]->length = 0;
3203
0
            }
3204
0
        }
3205
0
        r->last_id = id;
3206
0
    }
3207
0
}
3208
3209
66
void cram_ref_decr(refs_t *r, int id) {
3210
66
    pthread_mutex_lock(&r->lock);
3211
66
    cram_ref_decr_locked(r, id);
3212
66
    pthread_mutex_unlock(&r->lock);
3213
66
}
3214
3215
/*
3216
 * Used by cram_ref_load and cram_get_ref. The file handle will have
3217
 * already been opened, so we can catch it. The ref_entry *e informs us
3218
 * of whether this is a multi-line fasta file or a raw MD5 style file.
3219
 * Either way we create a single contiguous sequence.
3220
 *
3221
 * Returns all or part of a reference sequence on success (malloced);
3222
 *         NULL on failure.
3223
 */
3224
static char *load_ref_portion(BGZF *fp, ref_entry *e,
3225
0
                              hts_pos_t start, hts_pos_t end) {
3226
0
    off_t offset, len;
3227
0
    char *seq;
3228
3229
0
    if (end < start)
3230
0
        end = start;
3231
3232
    /*
3233
     * Compute locations in file. This is trivial for the MD5 files, but
3234
     * is still necessary for the fasta variants.
3235
     *
3236
     * Note the offset here, as with faidx, has the assumption that white-
3237
     * space (the diff between line_length and bases_per_line) only occurs
3238
     * at the end of a line of text.
3239
     */
3240
0
    offset = e->line_length
3241
0
        ? e->offset + (start-1)/e->bases_per_line * e->line_length +
3242
0
          (start-1) % e->bases_per_line
3243
0
        : start-1;
3244
3245
0
    len = (e->line_length
3246
0
           ? e->offset + (end-1)/e->bases_per_line * e->line_length +
3247
0
             (end-1) % e->bases_per_line
3248
0
           : end-1) - offset + 1;
3249
3250
0
    if (bgzf_useek(fp, offset, SEEK_SET) < 0) {
3251
0
        perror("bgzf_useek() on reference file");
3252
0
        return NULL;
3253
0
    }
3254
3255
0
    if (len == 0 || !(seq = malloc(len))) {
3256
0
        return NULL;
3257
0
    }
3258
3259
0
    if (len != bgzf_read(fp, seq, len)) {
3260
0
        perror("bgzf_read() on reference file");
3261
0
        free(seq);
3262
0
        return NULL;
3263
0
    }
3264
3265
    /* Strip white-space if required. */
3266
0
    if (len != end-start+1) {
3267
0
        hts_pos_t i, j;
3268
0
        char *cp = seq;
3269
0
        char *cp_to;
3270
3271
        // Copy up to the first white-space, and then repeatedly just copy
3272
        // bases_per_line verbatim, and use the slow method to end again.
3273
        //
3274
        // This may seem excessive, but this code can be a significant
3275
        // portion of total CRAM decode CPU time for shallow data sets.
3276
0
        for (i = j = 0; i < len; i++) {
3277
0
            if (!isspace_c(cp[i]))
3278
0
                cp[j++] = cp[i] & ~0x20;
3279
0
            else
3280
0
                break;
3281
0
        }
3282
0
        while (i < len && isspace_c(cp[i]))
3283
0
            i++;
3284
0
        while (i < len - e->line_length) {
3285
0
            hts_pos_t j_end = j + e->bases_per_line;
3286
0
            while (j < j_end)
3287
0
                cp[j++] = cp[i++] & ~0x20; // toupper equiv
3288
0
            i += e->line_length - e->bases_per_line;
3289
0
        }
3290
0
        for (; i < len; i++) {
3291
0
            if (!isspace_c(cp[i]))
3292
0
                cp[j++] = cp[i] & ~0x20;
3293
0
        }
3294
3295
0
        cp_to = cp+j;
3296
3297
0
        if (cp_to - seq != end-start+1) {
3298
0
            hts_log_error("Malformed reference file");
3299
0
            free(seq);
3300
0
            return NULL;
3301
0
        }
3302
0
    } else {
3303
0
        int i;
3304
0
        for (i = 0; i < len; i++) {
3305
0
            seq[i] = toupper_c(seq[i]);
3306
0
        }
3307
0
    }
3308
3309
0
    return seq;
3310
0
}
3311
3312
/*
3313
 * Load the entire reference 'id'.
3314
 * This also increments the reference count by 1.
3315
 *
3316
 * Returns ref_entry on success;
3317
 *         NULL on failure
3318
 */
3319
0
ref_entry *cram_ref_load(refs_t *r, int id, int is_md5) {
3320
0
    ref_entry *e = r->ref_id[id];
3321
0
    hts_pos_t start = 1, end = e->length;
3322
0
    char *seq;
3323
3324
0
    if (e->seq) {
3325
0
        return e;
3326
0
    }
3327
3328
0
    assert(e->count == 0);
3329
3330
0
    if (r->last) {
3331
#ifdef REF_DEBUG
3332
        int idx = 0;
3333
        for (idx = 0; idx < r->nref; idx++)
3334
            if (r->last == r->ref_id[idx])
3335
                break;
3336
        RP("%d cram_ref_load DECR %d\n", gettid(), idx);
3337
#endif
3338
0
        assert(r->last->count > 0);
3339
0
        if (--r->last->count <= 0) {
3340
0
            RP("%d FREE REF %d (%p)\n", gettid(), id, r->ref_id[id]->seq);
3341
0
            if (r->last->seq)
3342
0
                ref_entry_free_seq(r->last);
3343
0
        }
3344
0
    }
3345
3346
0
    if (!r->fn)
3347
0
        return NULL;
3348
3349
    /* Open file if it's not already the current open reference */
3350
0
    if (strcmp(r->fn, e->fn) || r->fp == NULL) {
3351
0
        if (r->fp)
3352
0
            if (bgzf_close(r->fp) != 0)
3353
0
                return NULL;
3354
0
        r->fn = e->fn;
3355
0
        if (!(r->fp = bgzf_open_ref(r->fn, "r", is_md5)))
3356
0
            return NULL;
3357
0
    }
3358
3359
0
    RP("%d Loading ref %d (%d..%d)\n", gettid(), id, start, end);
3360
3361
0
    if (!(seq = load_ref_portion(r->fp, e, start, end))) {
3362
0
        return NULL;
3363
0
    }
3364
3365
0
    RP("%d Loaded ref %d (%d..%d) = %p\n", gettid(), id, start, end, seq);
3366
3367
0
    RP("%d INC REF %d, %"PRId64"\n", gettid(), id, (e->count+1));
3368
0
    e->seq = seq;
3369
0
    e->mf = NULL;
3370
0
    e->count++;
3371
3372
    /*
3373
     * Also keep track of last used ref so incr/decr loops on the same
3374
     * sequence don't cause load/free loops.
3375
     */
3376
0
    RP("%d cram_ref_load INCR %d => %"PRId64"\n", gettid(), id, e->count+1);
3377
0
    r->last = e;
3378
0
    e->count++;
3379
3380
0
    return e;
3381
0
}
3382
3383
/*
3384
 * Returns a portion of a reference sequence from start to end inclusive.
3385
 * The returned pointer is owned by either the cram_file fd or by the
3386
 * internal refs_t structure and should not be freed  by the caller.
3387
 *
3388
 * The difference is whether or not this refs_t is in use by just the one
3389
 * cram_fd or by multiples, or whether we have multiple threads accessing
3390
 * references. In either case fd->shared will be true and we start using
3391
 * reference counting to track the number of users of a specific reference
3392
 * sequence.
3393
 *
3394
 * Otherwise the ref seq returned is allocated as part of cram_fd itself
3395
 * and will be freed up on the next call to cram_get_ref or cram_close.
3396
 *
3397
 * To return the entire reference sequence, specify start as 1 and end
3398
 * as 0.
3399
 *
3400
 * To cease using a reference, call cram_ref_decr().
3401
 *
3402
 * Returns reference on success,
3403
 *         NULL on failure
3404
 */
3405
5.73k
char *cram_get_ref(cram_fd *fd, int id, hts_pos_t start, hts_pos_t end) {
3406
5.73k
    ref_entry *r;
3407
5.73k
    char *seq;
3408
5.73k
    int ostart = start;
3409
3410
5.73k
    if (id == -1 || start < 1)
3411
3.25k
        return NULL;
3412
3413
    /* FIXME: axiomatic query of r->seq being true?
3414
     * Or shortcut for unsorted data where we load once and never free?
3415
     */
3416
3417
    //fd->shared_ref = 1; // hard code for now to simplify things
3418
3419
2.47k
    pthread_mutex_lock(&fd->ref_lock);
3420
3421
2.47k
    RP("%d cram_get_ref on fd %p, id %d, range %d..%d\n", gettid(), fd, id, start, end);
3422
3423
    /*
3424
     * Unsorted data implies we want to fetch an entire reference at a time.
3425
     * We just deal with this at the moment by claiming we're sharing
3426
     * references instead, which has the same requirement.
3427
     */
3428
2.47k
    if (fd->unsorted)
3429
0
        fd->shared_ref = 1;
3430
3431
3432
    /* Sanity checking: does this ID exist? */
3433
2.47k
    if (id >= fd->refs->nref) {
3434
12
        hts_log_error("No reference found for id %d", id);
3435
12
        pthread_mutex_unlock(&fd->ref_lock);
3436
12
        return NULL;
3437
12
    }
3438
3439
2.46k
    if (!fd->refs || !fd->refs->ref_id[id]) {
3440
0
        hts_log_error("No reference found for id %d", id);
3441
0
        pthread_mutex_unlock(&fd->ref_lock);
3442
0
        return NULL;
3443
0
    }
3444
3445
2.46k
    if (!(r = fd->refs->ref_id[id])) {
3446
0
        hts_log_error("No reference found for id %d", id);
3447
0
        pthread_mutex_unlock(&fd->ref_lock);
3448
0
        return NULL;
3449
0
    }
3450
3451
3452
    /*
3453
     * It has an entry, but may not have been populated yet.
3454
     * Any manually loaded .fai files have their lengths known.
3455
     * A ref entry computed from @SQ lines (M5 or UR field) will have
3456
     * r->length == 0 unless it's been loaded once and verified that we have
3457
     * an on-disk filename for it.
3458
     *
3459
     * 19 Sep 2013: Moved the lock here as the cram_populate_ref code calls
3460
     * open_path_mfile and libcurl, which isn't multi-thread safe unless I
3461
     * rewrite my code to have one curl handle per thread.
3462
     */
3463
2.46k
    pthread_mutex_lock(&fd->refs->lock);
3464
2.46k
    if (r->length == 0) {
3465
2.46k
        if (fd->ref_fn)
3466
0
            hts_log_warning("Reference file given, but ref '%s' not present",
3467
2.46k
                            r->name);
3468
2.46k
        if (cram_populate_ref(fd, id, r) == -1) {
3469
2.46k
            hts_log_warning("Failed to populate reference \"%s\"",
3470
2.46k
                            r->name);
3471
2.46k
            hts_log_warning("See https://www.htslib.org/doc/reference_seqs.html for further suggestions");
3472
2.46k
            pthread_mutex_unlock(&fd->refs->lock);
3473
2.46k
            pthread_mutex_unlock(&fd->ref_lock);
3474
2.46k
            return NULL;
3475
2.46k
        }
3476
0
        r = fd->refs->ref_id[id];
3477
0
        if (fd->unsorted)
3478
0
            cram_ref_incr_locked(fd->refs, id);
3479
0
    }
3480
3481
3482
    /*
3483
     * We now know that we the filename containing the reference, so check
3484
     * for limits. If it's over half the reference we'll load all of it in
3485
     * memory as this will speed up subsequent calls.
3486
     */
3487
0
    if (end < 1)
3488
0
        end = r->length;
3489
0
    if (end >= r->length)
3490
0
        end  = r->length;
3491
3492
0
    if (end - start >= 0.5*r->length || fd->shared_ref) {
3493
0
        start = 1;
3494
0
        end = r->length;
3495
0
    }
3496
3497
    /*
3498
     * Maybe we have it cached already? If so use it.
3499
     *
3500
     * Alternatively if we don't have the sequence but we're sharing
3501
     * references and/or are asking for the entire length of it, then
3502
     * load the full reference into the refs structure and return
3503
     * a pointer to that one instead.
3504
     */
3505
0
    if (fd->shared_ref || r->seq || (start == 1 && end == r->length)) {
3506
0
        char *cp;
3507
3508
0
        if (id >= 0) {
3509
0
            if (r->seq) {
3510
0
                cram_ref_incr_locked(fd->refs, id);
3511
0
            } else {
3512
0
                ref_entry *e;
3513
0
                if (!(e = cram_ref_load(fd->refs, id, r->is_md5))) {
3514
0
                    pthread_mutex_unlock(&fd->refs->lock);
3515
0
                    pthread_mutex_unlock(&fd->ref_lock);
3516
0
                    return NULL;
3517
0
                }
3518
3519
                /* unsorted data implies cache ref indefinitely, to avoid
3520
                 * continually loading and unloading.
3521
                 */
3522
0
                if (fd->unsorted)
3523
0
                    cram_ref_incr_locked(fd->refs, id);
3524
0
            }
3525
3526
0
            fd->ref = NULL; /* We never access it directly */
3527
0
            fd->ref_start = 1;
3528
0
            fd->ref_end   = r->length;
3529
0
            fd->ref_id    = id;
3530
3531
0
            cp = fd->refs->ref_id[id]->seq + ostart-1;
3532
0
        } else {
3533
0
            fd->ref = NULL;
3534
0
            cp = NULL;
3535
0
        }
3536
3537
0
        RP("%d cram_get_ref returning for id %d, count %d\n", gettid(), id, (int)r->count);
3538
3539
0
        pthread_mutex_unlock(&fd->refs->lock);
3540
0
        pthread_mutex_unlock(&fd->ref_lock);
3541
0
        return cp;
3542
0
    }
3543
3544
    /*
3545
     * Otherwise we're not sharing, we don't have a copy of it already and
3546
     * we're only asking for a small portion of it.
3547
     *
3548
     * In this case load up just that segment ourselves, freeing any old
3549
     * small segments in the process.
3550
     */
3551
3552
    /* Unmapped ref ID */
3553
0
    if (id < 0 || !fd->refs->fn) {
3554
0
        if (fd->ref_free) {
3555
0
            free(fd->ref_free);
3556
0
            fd->ref_free = NULL;
3557
0
        }
3558
0
        fd->ref = NULL;
3559
0
        fd->ref_id = id;
3560
0
        pthread_mutex_unlock(&fd->refs->lock);
3561
0
        pthread_mutex_unlock(&fd->ref_lock);
3562
0
        return NULL;
3563
0
    }
3564
3565
    /* Open file if it's not already the current open reference */
3566
0
    if (strcmp(fd->refs->fn, r->fn) || fd->refs->fp == NULL) {
3567
0
        if (fd->refs->fp)
3568
0
            if (bgzf_close(fd->refs->fp) != 0)
3569
0
                return NULL;
3570
0
        fd->refs->fn = r->fn;
3571
0
        if (!(fd->refs->fp = bgzf_open_ref(fd->refs->fn, "r", r->is_md5))) {
3572
0
            pthread_mutex_unlock(&fd->refs->lock);
3573
0
            pthread_mutex_unlock(&fd->ref_lock);
3574
0
            return NULL;
3575
0
        }
3576
0
    }
3577
3578
0
    if (!(fd->ref = load_ref_portion(fd->refs->fp, r, start, end))) {
3579
0
        pthread_mutex_unlock(&fd->refs->lock);
3580
0
        pthread_mutex_unlock(&fd->ref_lock);
3581
0
        return NULL;
3582
0
    }
3583
3584
0
    if (fd->ref_free)
3585
0
        free(fd->ref_free);
3586
3587
0
    fd->ref_id    = id;
3588
0
    fd->ref_start = start;
3589
0
    fd->ref_end   = end;
3590
0
    fd->ref_free = fd->ref;
3591
0
    seq = fd->ref;
3592
3593
0
    pthread_mutex_unlock(&fd->refs->lock);
3594
0
    pthread_mutex_unlock(&fd->ref_lock);
3595
3596
0
    return seq ? seq + ostart - start : NULL;
3597
0
}
3598
3599
/*
3600
 * If fd has been opened for reading, it may be permitted to specify 'fn'
3601
 * as NULL and let the code auto-detect the reference by parsing the
3602
 * SAM header @SQ lines.
3603
 */
3604
0
int cram_load_reference(cram_fd *fd, char *fn) {
3605
0
    int ret = 0;
3606
3607
0
    if (fn) {
3608
0
        fd->refs = refs_load_fai(fd->refs, fn,
3609
0
                                 !(fd->embed_ref>0 && fd->mode == 'r'));
3610
0
        fn = fd->refs ? fd->refs->fn : NULL;
3611
0
        if (!fn)
3612
0
            ret = -1;
3613
0
        sanitise_SQ_lines(fd);
3614
0
    }
3615
0
    fd->ref_fn = fn;
3616
3617
0
    if ((!fd->refs || (fd->refs->nref == 0 && !fn)) && fd->header) {
3618
0
        if (fd->refs)
3619
0
            refs_free(fd->refs);
3620
0
        if (!(fd->refs = refs_create()))
3621
0
            return -1;
3622
0
        if (-1 == refs_from_header(fd))
3623
0
            return -1;
3624
0
    }
3625
3626
0
    if (fd->header)
3627
0
        if (-1 == refs2id(fd->refs, fd->header))
3628
0
            return -1;
3629
3630
0
    return ret;
3631
0
}
3632
3633
/* ----------------------------------------------------------------------
3634
 * Containers
3635
 */
3636
3637
/*
3638
 * Creates a new container, specifying the maximum number of slices
3639
 * and records permitted.
3640
 *
3641
 * Returns cram_container ptr on success
3642
 *         NULL on failure
3643
 */
3644
44.3k
cram_container *cram_new_container(int nrec, int nslice) {
3645
44.3k
    cram_container *c = calloc(1, sizeof(*c));
3646
44.3k
    enum cram_DS_ID id;
3647
3648
44.3k
    if (!c)
3649
0
        return NULL;
3650
3651
44.3k
    c->curr_ref = -2;
3652
3653
44.3k
    c->max_c_rec = nrec * nslice;
3654
44.3k
    c->curr_c_rec = 0;
3655
3656
44.3k
    c->max_rec = nrec;
3657
44.3k
    c->record_counter = 0;
3658
44.3k
    c->num_bases = 0;
3659
44.3k
    c->s_num_bases = 0;
3660
3661
44.3k
    c->max_slice = nslice;
3662
44.3k
    c->curr_slice = 0;
3663
3664
44.3k
    c->pos_sorted = 1;
3665
44.3k
    c->max_apos   = 0;
3666
44.3k
    c->multi_seq  = 0;
3667
44.3k
    c->qs_seq_orient = 1;
3668
44.3k
    c->no_ref = 0;
3669
44.3k
    c->embed_ref = -1; // automatic selection
3670
3671
44.3k
    c->bams = NULL;
3672
3673
44.3k
    if (!(c->slices = calloc(nslice != 0 ? nslice : 1, sizeof(cram_slice *))))
3674
0
        goto err;
3675
44.3k
    c->slice = NULL;
3676
3677
44.3k
    if (!(c->comp_hdr = cram_new_compression_header()))
3678
0
        goto err;
3679
44.3k
    c->comp_hdr_block = NULL;
3680
3681
1.28M
    for (id = DS_RN; id < DS_TN; id++)
3682
1.24M
        if (!(c->stats[id] = cram_stats_create())) goto err;
3683
3684
    //c->aux_B_stats = cram_stats_create();
3685
3686
44.3k
    if (!(c->tags_used = kh_init(m_tagmap)))
3687
0
        goto err;
3688
44.3k
    c->refs_used = 0;
3689
44.3k
    c->ref_free = 0;
3690
3691
44.3k
    return c;
3692
3693
0
 err:
3694
0
    if (c) {
3695
0
        if (c->slices)
3696
0
            free(c->slices);
3697
0
        free(c);
3698
0
    }
3699
0
    return NULL;
3700
44.3k
}
3701
3702
4.39k
static void free_bam_list(bam_seq_t **bams, int max_rec) {
3703
4.39k
    int i;
3704
43.9M
    for (i = 0; i < max_rec; i++)
3705
43.9M
        bam_free(bams[i]);
3706
3707
4.39k
    free(bams);
3708
4.39k
}
3709
3710
72.3k
void cram_free_container(cram_container *c) {
3711
72.3k
    enum cram_DS_ID id;
3712
72.3k
    int i;
3713
3714
72.3k
    if (!c)
3715
0
        return;
3716
3717
72.3k
    if (c->refs_used)
3718
489
        free(c->refs_used);
3719
3720
72.3k
    if (c->landmark)
3721
46.1k
        free(c->landmark);
3722
3723
72.3k
    if (c->comp_hdr)
3724
44.7k
        cram_free_compression_header(c->comp_hdr);
3725
3726
72.3k
    if (c->comp_hdr_block)
3727
40.7k
        cram_free_block(c->comp_hdr_block);
3728
3729
    // Free the slices; filled out by encoder only
3730
72.3k
    if (c->slices) {
3731
80.7k
        for (i = 0; i < c->max_slice; i++) {
3732
36.4k
            if (c->slices[i])
3733
4.39k
                cram_free_slice(c->slices[i]);
3734
36.4k
            if (c->slices[i] == c->slice)
3735
36.4k
                c->slice = NULL;
3736
36.4k
        }
3737
44.3k
        free(c->slices);
3738
44.3k
    }
3739
3740
    // Free the current slice; set by both encoder & decoder
3741
72.3k
    if (c->slice) {
3742
0
        cram_free_slice(c->slice);
3743
0
        c->slice = NULL;
3744
0
    }
3745
3746
2.09M
    for (id = DS_RN; id < DS_TN; id++)
3747
2.02M
        if (c->stats[id]) cram_stats_free(c->stats[id]);
3748
3749
    //if (c->aux_B_stats) cram_stats_free(c->aux_B_stats);
3750
3751
72.3k
    if (c->tags_used) {
3752
44.3k
        khint_t k;
3753
3754
225k
        for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
3755
180k
            if (!kh_exist(c->tags_used, k))
3756
103k
                continue;
3757
3758
77.6k
            cram_tag_map *tm = (cram_tag_map *)kh_val(c->tags_used, k);
3759
77.6k
            if (tm) {
3760
77.6k
                cram_codec *c = tm->codec;
3761
3762
77.6k
                if (c) c->free(c);
3763
3764
                // If tm->blk or tm->blk2 is set, then we haven't yet got to
3765
                // cram_encode_container which copies the blocks to s->aux_block
3766
                // and NULLifies tm->blk*.  In this case we failed to complete
3767
                // the container construction, so we have to free up our partially
3768
                // converted CRAM.
3769
77.6k
                cram_free_block(tm->blk);
3770
77.6k
                cram_free_block(tm->blk2);
3771
77.6k
                free(tm);
3772
77.6k
            }
3773
77.6k
        }
3774
3775
44.3k
        kh_destroy(m_tagmap, c->tags_used);
3776
44.3k
    }
3777
3778
72.3k
    if (c->ref_free)
3779
16.8k
        free(c->ref);
3780
3781
72.3k
    if (c->bams)
3782
366
        free_bam_list(c->bams, c->max_c_rec);
3783
3784
72.3k
    free(c);
3785
72.3k
}
3786
3787
/*
3788
 * Reads a container header.
3789
 *
3790
 * Returns cram_container on success
3791
 *         NULL on failure or no container left (fd->err == 0).
3792
 */
3793
28.5k
cram_container *cram_read_container(cram_fd *fd) {
3794
28.5k
    cram_container c2, *c;
3795
28.5k
    int i, s;
3796
28.5k
    size_t rd = 0;
3797
28.5k
    uint32_t crc = 0;
3798
3799
28.5k
    fd->err = 0;
3800
28.5k
    fd->eof = 0;
3801
3802
28.5k
    memset(&c2, 0, sizeof(c2));
3803
28.5k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
3804
22.5k
        if ((s = fd->vv.varint_decode32_crc(fd, &c2.length, &crc)) == -1) {
3805
38
            fd->eof = fd->empty_container ? 1 : 2;
3806
38
            return NULL;
3807
22.5k
        } else {
3808
22.5k
            rd+=s;
3809
22.5k
        }
3810
22.5k
    } else if (CRAM_MAJOR_VERS(fd->version) < 4) {
3811
3.25k
        uint32_t len;
3812
3.25k
        if ((s = int32_decode(fd, &c2.length)) == -1) {
3813
19
            if (CRAM_MAJOR_VERS(fd->version) == 2 &&
3814
19
                CRAM_MINOR_VERS(fd->version) == 0)
3815
8
                fd->eof = 1; // EOF blocks arrived in v2.1
3816
11
            else
3817
11
                fd->eof = fd->empty_container ? 1 : 2;
3818
19
            return NULL;
3819
3.23k
        } else {
3820
3.23k
            rd+=s;
3821
3.23k
        }
3822
3.23k
        len = le_int4(c2.length);
3823
3.23k
        crc = crc32(0L, (unsigned char *)&len, 4);
3824
3.23k
    } else {
3825
2.72k
        if ((s = fd->vv.varint_decode32_crc(fd, &c2.length, &crc))   == -1) {
3826
24
            fd->eof = fd->empty_container ? 1 : 2;
3827
24
            return NULL;
3828
2.70k
        } else {
3829
2.70k
            rd+=s;
3830
2.70k
        }
3831
2.72k
    }
3832
28.4k
    if ((s = fd->vv.varint_decode32s_crc(fd, &c2.ref_seq_id, &crc))   == -1) return NULL; else rd+=s;
3833
28.3k
    if (CRAM_MAJOR_VERS(fd->version) >= 4) {
3834
2.68k
        int64_t i64;
3835
2.68k
        if ((s = fd->vv.varint_decode64_crc(fd, &i64, &crc))== -1) return NULL; else rd+=s;
3836
2.67k
        c2.ref_seq_start = i64;
3837
2.67k
        if ((s = fd->vv.varint_decode64_crc(fd, &i64, &crc)) == -1) return NULL; else rd+=s;
3838
2.66k
        c2.ref_seq_span = i64;
3839
25.6k
    } else {
3840
25.6k
        int32_t i32;
3841
25.6k
        if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc))== -1) return NULL; else rd+=s;
3842
25.6k
        c2.ref_seq_start = i32;
3843
25.6k
        if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc)) == -1) return NULL; else rd+=s;
3844
25.5k
        c2.ref_seq_span = i32;
3845
25.5k
    }
3846
28.2k
    if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_records, &crc))  == -1) return NULL; else rd+=s;
3847
3848
28.2k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
3849
22.3k
        c2.record_counter = 0;
3850
22.3k
        c2.num_bases = 0;
3851
22.3k
    } else {
3852
5.85k
        if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3853
3.02k
            if ((s = fd->vv.varint_decode64_crc(fd, &c2.record_counter, &crc)) == -1)
3854
14
                return NULL;
3855
3.01k
            else
3856
3.01k
                rd += s;
3857
3.02k
        } else {
3858
2.82k
            int32_t i32;
3859
2.82k
            if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc)) == -1)
3860
14
                return NULL;
3861
2.80k
            else
3862
2.80k
                rd += s;
3863
2.80k
            c2.record_counter = i32;
3864
2.80k
        }
3865
3866
5.82k
        if ((s = fd->vv.varint_decode64_crc(fd, &c2.num_bases, &crc))== -1)
3867
53
            return NULL;
3868
5.76k
        else
3869
5.76k
            rd += s;
3870
5.82k
    }
3871
28.1k
    if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_blocks, &crc))   == -1)
3872
38
        return NULL;
3873
28.0k
    else
3874
28.0k
        rd+=s;
3875
28.0k
    if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_landmarks, &crc))== -1)
3876
22
        return NULL;
3877
28.0k
    else
3878
28.0k
        rd+=s;
3879
3880
28.0k
    if (c2.num_landmarks < 0 || c2.num_landmarks >= SIZE_MAX / sizeof(int32_t))
3881
52
        return NULL;
3882
3883
28.0k
    if (!(c = calloc(1, sizeof(*c))))
3884
0
        return NULL;
3885
3886
28.0k
    *c = c2;
3887
28.0k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
3888
28.0k
    if (c->num_landmarks > FUZZ_ALLOC_LIMIT/sizeof(int32_t)) {
3889
9
        fd->err = errno = ENOMEM;
3890
9
        cram_free_container(c);
3891
9
        return NULL;
3892
9
    }
3893
28.0k
#endif
3894
28.0k
    if (c->num_landmarks && !(c->landmark = malloc(c->num_landmarks * sizeof(int32_t)))) {
3895
0
        fd->err = errno;
3896
0
        cram_free_container(c);
3897
0
        return NULL;
3898
0
    }
3899
65.0k
    for (i = 0; i < c->num_landmarks; i++) {
3900
37.2k
        if ((s = fd->vv.varint_decode32_crc(fd, &c->landmark[i], &crc)) == -1) {
3901
161
            cram_free_container(c);
3902
161
            return NULL;
3903
37.0k
        } else {
3904
37.0k
            rd += s;
3905
37.0k
        }
3906
37.2k
    }
3907
3908
27.8k
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3909
2.90k
        if (-1 == int32_decode(fd, (int32_t *)&c->crc32)) {
3910
12
            cram_free_container(c);
3911
12
            return NULL;
3912
2.89k
        } else {
3913
2.89k
            rd+=4;
3914
2.89k
        }
3915
3916
2.89k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
3917
        // Pretend the CRC was OK so the fuzzer doesn't have to get it right
3918
2.89k
        crc = c->crc32;
3919
2.89k
#endif
3920
3921
2.89k
        if (crc != c->crc32) {
3922
0
            hts_log_error("Container header CRC32 failure");
3923
0
            cram_free_container(c);
3924
0
            return NULL;
3925
0
        }
3926
2.89k
    }
3927
3928
27.8k
    c->offset = rd;
3929
27.8k
    c->slices = NULL;
3930
27.8k
    c->slice = NULL;
3931
27.8k
    c->curr_slice = 0;
3932
27.8k
    c->max_slice = c->num_landmarks;
3933
27.8k
    c->slice_rec = 0;
3934
27.8k
    c->curr_rec = 0;
3935
27.8k
    c->max_rec = 0;
3936
3937
27.8k
    if (c->ref_seq_id == -2) {
3938
27
        c->multi_seq = 1;
3939
27
        fd->multi_seq = 1;
3940
27
    }
3941
3942
27.8k
    fd->empty_container =
3943
27.8k
        (c->num_records == 0 &&
3944
27.8k
         c->ref_seq_id == -1 &&
3945
27.8k
         c->ref_seq_start == 0x454f46 /* EOF */) ? 1 : 0;
3946
3947
27.8k
    return c;
3948
27.8k
}
3949
3950
3951
/* MAXIMUM storage size needed for the container. */
3952
0
int cram_container_size(cram_container *c) {
3953
0
    return 55 + 5*c->num_landmarks;
3954
0
}
3955
3956
3957
/*
3958
 * Stores the container structure in dat and returns *size as the
3959
 * number of bytes written to dat[].  The input size of dat is also
3960
 * held in *size and should be initialised to cram_container_size(c).
3961
 *
3962
 * Returns 0 on success;
3963
 *        -1 on failure
3964
 */
3965
int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size)
3966
0
{
3967
0
    char *cp = (char *)dat;
3968
0
    int i;
3969
3970
    // Check the input buffer is large enough according to our stated
3971
    // requirements. (NOTE: it may actually take less.)
3972
0
    if (cram_container_size(c) > *size)
3973
0
        return -1;
3974
3975
0
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
3976
0
        cp += itf8_put(cp, c->length);
3977
0
    } else {
3978
0
        *(int32_t *)cp = le_int4(c->length);
3979
0
        cp += 4;
3980
0
    }
3981
0
    if (c->multi_seq) {
3982
0
        cp += fd->vv.varint_put32(cp, NULL, -2);
3983
0
        cp += fd->vv.varint_put32(cp, NULL, 0);
3984
0
        cp += fd->vv.varint_put32(cp, NULL, 0);
3985
0
    } else {
3986
0
        cp += fd->vv.varint_put32s(cp, NULL, c->ref_seq_id);
3987
0
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
3988
0
            cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_start);
3989
0
            cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_span);
3990
0
        } else {
3991
0
            cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_start);
3992
0
            cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_span);
3993
0
        }
3994
0
    }
3995
0
    cp += fd->vv.varint_put32(cp, NULL, c->num_records);
3996
0
    if (CRAM_MAJOR_VERS(fd->version) == 2) {
3997
0
        cp += fd->vv.varint_put64(cp, NULL, c->record_counter);
3998
0
    } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3999
0
        cp += fd->vv.varint_put32(cp, NULL, c->record_counter);
4000
0
    }
4001
0
    cp += fd->vv.varint_put64(cp, NULL, c->num_bases);
4002
0
    cp += fd->vv.varint_put32(cp, NULL, c->num_blocks);
4003
0
    cp += fd->vv.varint_put32(cp, NULL, c->num_landmarks);
4004
0
    for (i = 0; i < c->num_landmarks; i++)
4005
0
        cp += fd->vv.varint_put32(cp, NULL, c->landmark[i]);
4006
4007
0
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
4008
0
        c->crc32 = crc32(0L, (uc *)dat, cp-dat);
4009
0
        cp[0] =  c->crc32        & 0xff;
4010
0
        cp[1] = (c->crc32 >>  8) & 0xff;
4011
0
        cp[2] = (c->crc32 >> 16) & 0xff;
4012
0
        cp[3] = (c->crc32 >> 24) & 0xff;
4013
0
        cp += 4;
4014
0
    }
4015
4016
0
    *size = cp-dat; // actual used size
4017
4018
0
    return 0;
4019
0
}
4020
4021
4022
/*
4023
 * Writes a container structure.
4024
 *
4025
 * Returns 0 on success
4026
 *        -1 on failure
4027
 */
4028
52.7k
int cram_write_container(cram_fd *fd, cram_container *c) {
4029
52.7k
    char buf_a[1024], *buf = buf_a, *cp;
4030
52.7k
    int i;
4031
4032
52.7k
    if (61 + c->num_landmarks * 10 >= 1024) {
4033
0
        buf = malloc(61 + c->num_landmarks * 10);
4034
0
        if (!buf)
4035
0
            return -1;
4036
0
    }
4037
52.7k
    cp = buf;
4038
4039
52.7k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
4040
0
        cp += itf8_put(cp, c->length);
4041
52.7k
    } else if (CRAM_MAJOR_VERS(fd->version) <= 3) {
4042
52.7k
        *(int32_t *)cp = le_int4(c->length);
4043
52.7k
        cp += 4;
4044
52.7k
    } else {
4045
0
        cp += fd->vv.varint_put32(cp, NULL, c->length);
4046
0
    }
4047
52.7k
    if (c->multi_seq) {
4048
485
        cp += fd->vv.varint_put32(cp, NULL, (uint32_t)-2);
4049
485
        cp += fd->vv.varint_put32(cp, NULL, 0);
4050
485
        cp += fd->vv.varint_put32(cp, NULL, 0);
4051
52.2k
    } else {
4052
52.2k
        cp += fd->vv.varint_put32s(cp, NULL, c->ref_seq_id);
4053
52.2k
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
4054
0
            cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_start);
4055
0
            cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_span);
4056
52.2k
        } else {
4057
52.2k
            cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_start);
4058
52.2k
            cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_span);
4059
52.2k
        }
4060
52.2k
    }
4061
52.7k
    cp += fd->vv.varint_put32(cp, NULL, c->num_records);
4062
52.7k
    if (CRAM_MAJOR_VERS(fd->version) >= 3)
4063
52.7k
        cp += fd->vv.varint_put64(cp, NULL, c->record_counter);
4064
0
    else
4065
0
        cp += fd->vv.varint_put32(cp, NULL, c->record_counter);
4066
52.7k
    cp += fd->vv.varint_put64(cp, NULL, c->num_bases);
4067
52.7k
    cp += fd->vv.varint_put32(cp, NULL, c->num_blocks);
4068
52.7k
    cp += fd->vv.varint_put32(cp, NULL, c->num_landmarks);
4069
104k
    for (i = 0; i < c->num_landmarks; i++)
4070
51.7k
        cp += fd->vv.varint_put32(cp, NULL, c->landmark[i]);
4071
4072
52.7k
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
4073
52.7k
        c->crc32 = crc32(0L, (uc *)buf, cp-buf);
4074
52.7k
        cp[0] =  c->crc32        & 0xff;
4075
52.7k
        cp[1] = (c->crc32 >>  8) & 0xff;
4076
52.7k
        cp[2] = (c->crc32 >> 16) & 0xff;
4077
52.7k
        cp[3] = (c->crc32 >> 24) & 0xff;
4078
52.7k
        cp += 4;
4079
52.7k
    }
4080
4081
52.7k
    if (cp-buf != hwrite(fd->fp, buf, cp-buf)) {
4082
0
        if (buf != buf_a)
4083
0
            free(buf);
4084
0
        return -1;
4085
0
    }
4086
4087
52.7k
    if (buf != buf_a)
4088
0
        free(buf);
4089
4090
52.7k
    return 0;
4091
52.7k
}
4092
4093
// common component shared by cram_flush_container{,_mt}
4094
35.9k
static int cram_flush_container2(cram_fd *fd, cram_container *c) {
4095
35.9k
    int i, j;
4096
4097
35.9k
    if (c->curr_slice > 0 && !c->slices)
4098
0
        return -1;
4099
4100
    //fprintf(stderr, "Writing container %d, sum %u\n", c->record_counter, sum);
4101
4102
35.9k
    off_t c_offset = htell(fd->fp); // File offset of container
4103
4104
    /* Write the container struct itself */
4105
35.9k
    if (0 != cram_write_container(fd, c))
4106
0
        return -1;
4107
4108
35.9k
    off_t hdr_size = htell(fd->fp) - c_offset;
4109
4110
    /* And the compression header */
4111
35.9k
    if (0 != cram_write_block(fd, c->comp_hdr_block))
4112
0
        return -1;
4113
4114
    /* Followed by the slice blocks */
4115
35.9k
    off_t file_offset = htell(fd->fp);
4116
71.8k
    for (i = 0; i < c->curr_slice; i++) {
4117
35.9k
        cram_slice *s = c->slices[i];
4118
35.9k
        off_t spos = file_offset - c_offset - hdr_size;
4119
4120
35.9k
        if (0 != cram_write_block(fd, s->hdr_block))
4121
0
            return -1;
4122
4123
225k
        for (j = 0; j < s->hdr->num_blocks; j++) {
4124
189k
            if (0 != cram_write_block(fd, s->block[j]))
4125
0
                return -1;
4126
189k
        }
4127
4128
35.9k
        file_offset = htell(fd->fp);
4129
35.9k
        off_t sz = file_offset - c_offset - hdr_size - spos;
4130
4131
35.9k
        if (fd->idxfp) {
4132
0
            if (cram_index_slice(fd, c, s, fd->idxfp, c_offset, spos, sz) < 0)
4133
0
                return -1;
4134
0
        }
4135
35.9k
    }
4136
4137
35.9k
    return 0;
4138
35.9k
}
4139
4140
/*
4141
 * Flushes a completely or partially full container to disk, writing
4142
 * container structure, header and blocks. This also calls the encoder
4143
 * functions.
4144
 *
4145
 * Returns 0 on success
4146
 *        -1 on failure
4147
 */
4148
36.4k
int cram_flush_container(cram_fd *fd, cram_container *c) {
4149
    /* Encode the container blocks and generate compression header */
4150
36.4k
    if (0 != cram_encode_container(fd, c))
4151
482
        return -1;
4152
4153
35.9k
    return cram_flush_container2(fd, c);
4154
36.4k
}
4155
4156
typedef struct {
4157
    cram_fd *fd;
4158
    cram_container *c;
4159
} cram_job;
4160
4161
0
void *cram_flush_thread(void *arg) {
4162
0
    cram_job *j = (cram_job *)arg;
4163
4164
    /* Encode the container blocks and generate compression header */
4165
0
    if (0 != cram_encode_container(j->fd, j->c)) {
4166
0
        hts_log_error("Call to cram_encode_container failed");
4167
0
        return NULL;
4168
0
    }
4169
4170
0
    return arg;
4171
0
}
4172
4173
0
static int cram_flush_result(cram_fd *fd) {
4174
0
    int i, ret = 0;
4175
0
    hts_tpool_result *r;
4176
0
    cram_container *lc = NULL;
4177
4178
    // NB: we can have one result per slice, not per container,
4179
    // so we need to free the container only after all slices
4180
    // within it have been freed.  (Automatic via reference counting.)
4181
0
    while ((r = hts_tpool_next_result(fd->rqueue))) {
4182
0
        cram_job *j = (cram_job *)hts_tpool_result_data(r);
4183
0
        cram_container *c;
4184
4185
0
        if (!j) {
4186
0
            hts_tpool_delete_result(r, 0);
4187
0
            return -1;
4188
0
        }
4189
4190
0
        fd = j->fd;
4191
0
        c = j->c;
4192
4193
0
        if (fd->mode == 'w')
4194
0
            if (0 != cram_flush_container2(fd, c))
4195
0
                return -1;
4196
4197
        // Free the slices; filled out by encoder only
4198
0
        if (c->slices) {
4199
0
            for (i = 0; i < c->max_slice; i++) {
4200
0
                if (c->slices[i])
4201
0
                    cram_free_slice(c->slices[i]);
4202
0
                if (c->slices[i] == c->slice)
4203
0
                    c->slice = NULL;
4204
0
                c->slices[i] = NULL;
4205
0
            }
4206
0
        }
4207
4208
        // Free the current slice; set by both encoder & decoder
4209
0
        if (c->slice) {
4210
0
            cram_free_slice(c->slice);
4211
0
            c->slice = NULL;
4212
0
        }
4213
0
        c->curr_slice = 0;
4214
4215
        // Our jobs will be in order, so we free the last
4216
        // container when our job has switched to a new one.
4217
0
        if (c != lc) {
4218
0
            if (lc) {
4219
0
                if (fd->ctr == lc)
4220
0
                    fd->ctr = NULL;
4221
0
                if (fd->ctr_mt == lc)
4222
0
                    fd->ctr_mt = NULL;
4223
0
                cram_free_container(lc);
4224
0
            }
4225
0
            lc = c;
4226
0
        }
4227
4228
0
        hts_tpool_delete_result(r, 1);
4229
0
    }
4230
0
    if (lc) {
4231
0
        if (fd->ctr == lc)
4232
0
            fd->ctr = NULL;
4233
0
        if (fd->ctr_mt == lc)
4234
0
            fd->ctr_mt = NULL;
4235
0
        cram_free_container(lc);
4236
0
    }
4237
4238
0
    return ret;
4239
0
}
4240
4241
// Note: called while metrics_lock is held.
4242
// Will be left in this state too, but may temporarily unlock.
4243
2.56k
void reset_metrics(cram_fd *fd) {
4244
2.56k
    int i;
4245
4246
2.56k
    if (fd->pool) {
4247
        // If multi-threaded we have multiple blocks being
4248
        // compressed already and several on the to-do list
4249
        // (fd->rqueue->pending).  It's tricky to reset the
4250
        // metrics exactly the correct point, so instead we
4251
        // just flush the pool, reset, and then continue again.
4252
4253
        // Don't bother starting a new trial before then though.
4254
0
        for (i = 0; i < DS_END; i++) {
4255
0
            cram_metrics *m = fd->m[i];
4256
0
            if (!m)
4257
0
                continue;
4258
0
            m->next_trial = 999;
4259
0
        }
4260
4261
0
        pthread_mutex_unlock(&fd->metrics_lock);
4262
0
        hts_tpool_process_flush(fd->rqueue);
4263
0
        pthread_mutex_lock(&fd->metrics_lock);
4264
0
    }
4265
4266
123k
    for (i = 0; i < DS_END; i++) {
4267
120k
        cram_metrics *m = fd->m[i];
4268
120k
        if (!m)
4269
0
            continue;
4270
4271
120k
        m->trial = NTRIALS;
4272
120k
        m->next_trial = TRIAL_SPAN;
4273
120k
        m->revised_method = 0;
4274
120k
        m->unpackable = 0;
4275
4276
120k
        memset(m->sz, 0, sizeof(m->sz));
4277
120k
    }
4278
2.56k
}
4279
4280
36.4k
int cram_flush_container_mt(cram_fd *fd, cram_container *c) {
4281
36.4k
    cram_job *j;
4282
4283
    // At the junction of mapped to unmapped data the compression
4284
    // methods may need to change due to very different statistical
4285
    // properties; particularly BA if minhash sorted.
4286
    //
4287
    // However with threading we'll have several in-flight blocks
4288
    // arriving out of order.
4289
    //
4290
    // So we do one trial reset of NThreads to last for NThreads
4291
    // duration to get us over this transition period, followed
4292
    // by another retrial of the usual ntrials & trial span.
4293
36.4k
    pthread_mutex_lock(&fd->metrics_lock);
4294
36.4k
    if (c->n_mapped < 0.3*c->curr_rec &&
4295
36.4k
        fd->last_mapped > 0.7*c->max_rec) {
4296
2.56k
        reset_metrics(fd);
4297
2.56k
    }
4298
36.4k
    fd->last_mapped = c->n_mapped * (c->max_rec+1)/(c->curr_rec+1) ;
4299
36.4k
    pthread_mutex_unlock(&fd->metrics_lock);
4300
4301
36.4k
    if (!fd->pool)
4302
36.4k
        return cram_flush_container(fd, c);
4303
4304
0
    if (!(j = malloc(sizeof(*j))))
4305
0
        return -1;
4306
0
    j->fd = fd;
4307
0
    j->c = c;
4308
4309
    // Flush the job.  Note our encoder queue may be full, so we
4310
    // either have to keep trying in non-blocking mode (what we do) or
4311
    // use a dedicated separate thread for draining the queue.
4312
0
    for (;;) {
4313
0
        errno = 0;
4314
0
        hts_tpool_dispatch2(fd->pool, fd->rqueue, cram_flush_thread, j, 1);
4315
0
        int pending = (errno == EAGAIN);
4316
0
        if (cram_flush_result(fd) != 0)
4317
0
            return -1;
4318
0
        if (!pending)
4319
0
            break;
4320
4321
0
        hts_usleep(1000);
4322
0
    }
4323
4324
0
    return 0;
4325
0
}
4326
4327
/* ----------------------------------------------------------------------
4328
 * Compression headers; the first part of the container
4329
 */
4330
4331
/*
4332
 * Creates a new blank container compression header
4333
 *
4334
 * Returns header ptr on success
4335
 *         NULL on failure
4336
 */
4337
44.3k
cram_block_compression_hdr *cram_new_compression_header(void) {
4338
44.3k
    cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr));
4339
44.3k
    if (!hdr)
4340
0
        return NULL;
4341
4342
44.3k
    if (!(hdr->TD_blk = cram_new_block(CORE, 0))) {
4343
0
        free(hdr);
4344
0
        return NULL;
4345
0
    }
4346
4347
44.3k
    if (!(hdr->TD_hash = kh_init(m_s2i))) {
4348
0
        cram_free_block(hdr->TD_blk);
4349
0
        free(hdr);
4350
0
        return NULL;
4351
0
    }
4352
4353
44.3k
    if (!(hdr->TD_keys = string_pool_create(8192))) {
4354
0
        kh_destroy(m_s2i, hdr->TD_hash);
4355
0
        cram_free_block(hdr->TD_blk);
4356
0
        free(hdr);
4357
0
        return NULL;
4358
0
    }
4359
4360
44.3k
    return hdr;
4361
44.3k
}
4362
4363
46.1k
void cram_free_compression_header(cram_block_compression_hdr *hdr) {
4364
46.1k
    int i;
4365
4366
46.1k
    if (hdr->landmark)
4367
1.71k
        free(hdr->landmark);
4368
4369
46.1k
    if (hdr->preservation_map)
4370
46.1k
        kh_destroy(map, hdr->preservation_map);
4371
4372
1.52M
    for (i = 0; i < CRAM_MAP_HASH; i++) {
4373
1.47M
        cram_map *m, *m2;
4374
1.50M
        for (m = hdr->rec_encoding_map[i]; m; m = m2) {
4375
26.2k
            m2 = m->next;
4376
26.2k
            if (m->codec)
4377
0
                m->codec->free(m->codec);
4378
26.2k
            free(m);
4379
26.2k
        }
4380
1.47M
    }
4381
4382
1.52M
    for (i = 0; i < CRAM_MAP_HASH; i++) {
4383
1.47M
        cram_map *m, *m2;
4384
1.47M
        for (m = hdr->tag_encoding_map[i]; m; m = m2) {
4385
639
            m2 = m->next;
4386
639
            if (m->codec)
4387
639
                m->codec->free(m->codec);
4388
639
            free(m);
4389
639
        }
4390
1.47M
    }
4391
4392
2.21M
    for (i = 0; i < DS_END; i++) {
4393
2.17M
        if (hdr->codecs[i])
4394
669k
            hdr->codecs[i]->free(hdr->codecs[i]);
4395
2.17M
    }
4396
4397
46.1k
    if (hdr->TL)
4398
54
        free(hdr->TL);
4399
46.1k
    if (hdr->TD_blk)
4400
44.3k
        cram_free_block(hdr->TD_blk);
4401
46.1k
    if (hdr->TD_hash)
4402
46.1k
        kh_destroy(m_s2i, hdr->TD_hash);
4403
46.1k
    if (hdr->TD_keys)
4404
44.3k
        string_pool_destroy(hdr->TD_keys);
4405
4406
46.1k
    free(hdr);
4407
46.1k
}
4408
4409
4410
/* ----------------------------------------------------------------------
4411
 * Slices and slice headers
4412
 */
4413
4414
37.0k
void cram_free_slice_header(cram_block_slice_hdr *hdr) {
4415
37.0k
    if (!hdr)
4416
0
        return;
4417
4418
37.0k
    if (hdr->block_content_ids)
4419
36.6k
        free(hdr->block_content_ids);
4420
4421
37.0k
    free(hdr);
4422
4423
37.0k
    return;
4424
37.0k
}
4425
4426
37.1k
void cram_free_slice(cram_slice *s) {
4427
37.1k
    if (!s)
4428
0
        return;
4429
4430
37.1k
    if (s->hdr_block)
4431
36.5k
        cram_free_block(s->hdr_block);
4432
4433
37.1k
    if (s->block) {
4434
36.6k
        int i;
4435
4436
36.6k
        if (s->hdr) {
4437
6.41M
            for (i = 0; i < s->hdr->num_blocks; i++) {
4438
6.38M
                if (i > 0 && s->block[i] == s->block[0])
4439
6.14M
                    continue;
4440
240k
                cram_free_block(s->block[i]);
4441
240k
            }
4442
36.6k
        }
4443
36.6k
        free(s->block);
4444
36.6k
    }
4445
4446
37.1k
    {
4447
        // Normally already copied into s->block[], but potentially still
4448
        // here if we error part way through cram_encode_slice.
4449
37.1k
        int i;
4450
114k
        for (i = 0; i < s->naux_block; i++)
4451
77.2k
            cram_free_block(s->aux_block[i]);
4452
37.1k
    }
4453
4454
37.1k
    if (s->block_by_id)
4455
649
        free(s->block_by_id);
4456
4457
37.1k
    if (s->hdr)
4458
37.0k
        cram_free_slice_header(s->hdr);
4459
4460
37.1k
    if (s->seqs_blk)
4461
37.0k
        cram_free_block(s->seqs_blk);
4462
4463
37.1k
    if (s->qual_blk)
4464
1.05k
        cram_free_block(s->qual_blk);
4465
4466
37.1k
    if (s->name_blk)
4467
1.05k
        cram_free_block(s->name_blk);
4468
4469
37.1k
    if (s->aux_blk)
4470
37.0k
        cram_free_block(s->aux_blk);
4471
4472
37.1k
    if (s->base_blk)
4473
1.05k
        cram_free_block(s->base_blk);
4474
4475
37.1k
    if (s->soft_blk)
4476
1.05k
        cram_free_block(s->soft_blk);
4477
4478
37.1k
    if (s->cigar)
4479
37.0k
        free(s->cigar);
4480
4481
37.1k
    if (s->crecs)
4482
37.0k
        free(s->crecs);
4483
4484
37.1k
    if (s->features)
4485
15.8k
        free(s->features);
4486
4487
37.1k
    if (s->TN)
4488
0
        free(s->TN);
4489
4490
37.1k
    if (s->pair_keys)
4491
36.4k
        string_pool_destroy(s->pair_keys);
4492
4493
37.1k
    if (s->pair[0])
4494
37.1k
        kh_destroy(m_s2i, s->pair[0]);
4495
37.1k
    if (s->pair[1])
4496
37.1k
        kh_destroy(m_s2i, s->pair[1]);
4497
4498
37.1k
    if (s->aux_block)
4499
31.0k
        free(s->aux_block);
4500
4501
37.1k
    free(s);
4502
37.1k
}
4503
4504
/*
4505
 * Creates a new empty slice in memory, for subsequent writing to
4506
 * disk.
4507
 *
4508
 * Returns cram_slice ptr on success
4509
 *         NULL on failure
4510
 */
4511
36.4k
cram_slice *cram_new_slice(enum cram_content_type type, int nrecs) {
4512
36.4k
    cram_slice *s = calloc(1, sizeof(*s));
4513
36.4k
    if (!s)
4514
0
        return NULL;
4515
4516
36.4k
    if (!(s->hdr = (cram_block_slice_hdr *)calloc(1, sizeof(*s->hdr))))
4517
0
        goto err;
4518
36.4k
    s->hdr->content_type = type;
4519
4520
36.4k
    s->hdr_block = NULL;
4521
36.4k
    s->block = NULL;
4522
36.4k
    s->block_by_id = NULL;
4523
36.4k
    s->last_apos = 0;
4524
36.4k
    if (!(s->crecs = malloc(nrecs * sizeof(cram_record))))  goto err;
4525
36.4k
    s->cigar_alloc = 1024;
4526
36.4k
    if (!(s->cigar = malloc(s->cigar_alloc * sizeof(*s->cigar)))) goto err;
4527
36.4k
    s->ncigar = 0;
4528
4529
36.4k
    if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0)))       goto err;
4530
36.4k
    if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS)))   goto err;
4531
36.4k
    if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN)))   goto err;
4532
36.4k
    if (!(s->aux_blk  = cram_new_block(EXTERNAL, DS_aux)))  goto err;
4533
36.4k
    if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN)))   goto err;
4534
36.4k
    if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC)))   goto err;
4535
4536
36.4k
    s->features = NULL;
4537
36.4k
    s->nfeatures = s->afeatures = 0;
4538
4539
36.4k
#ifndef TN_external
4540
36.4k
    s->TN = NULL;
4541
36.4k
    s->nTN = s->aTN = 0;
4542
36.4k
#endif
4543
4544
    // Volatile keys as we do realloc in dstring
4545
36.4k
    if (!(s->pair_keys = string_pool_create(8192))) goto err;
4546
36.4k
    if (!(s->pair[0] = kh_init(m_s2i)))             goto err;
4547
36.4k
    if (!(s->pair[1] = kh_init(m_s2i)))             goto err;
4548
4549
#ifdef BA_external
4550
    s->BA_len = 0;
4551
#endif
4552
4553
36.4k
    return s;
4554
4555
0
 err:
4556
0
    if (s)
4557
0
        cram_free_slice(s);
4558
4559
0
    return NULL;
4560
36.4k
}
4561
4562
/*
4563
 * Loads an entire slice.
4564
 * FIXME: In 1.0 the native unit of slices within CRAM is broken
4565
 * as slices contain references to objects in other slices.
4566
 * To work around this while keeping the slice oriented outer loop
4567
 * we read all slices and stitch them together into a fake large
4568
 * slice instead.
4569
 *
4570
 * Returns cram_slice ptr on success
4571
 *         NULL on failure
4572
 */
4573
771
cram_slice *cram_read_slice(cram_fd *fd) {
4574
771
    cram_block *b = cram_read_block(fd);
4575
771
    cram_slice *s = calloc(1, sizeof(*s));
4576
771
    int i, n, max_id, min_id;
4577
4578
771
    if (!b || !s)
4579
20
        goto err;
4580
4581
751
    s->hdr_block = b;
4582
751
    switch (b->content_type) {
4583
714
    case MAPPED_SLICE:
4584
744
    case UNMAPPED_SLICE:
4585
744
        if (!(s->hdr = cram_decode_slice_header(fd, b)))
4586
55
            goto err;
4587
689
        break;
4588
4589
689
    default:
4590
7
        hts_log_error("Unexpected block of type %s",
4591
7
                      cram_content_type2str(b->content_type));
4592
7
        goto err;
4593
751
    }
4594
4595
689
    if (s->hdr->num_blocks < 1) {
4596
8
        hts_log_error("Slice does not include any data blocks");
4597
8
        goto err;
4598
8
    }
4599
4600
681
    s->block = calloc(n = s->hdr->num_blocks, sizeof(*s->block));
4601
681
    if (!s->block)
4602
0
        goto err;
4603
4604
3.22k
    for (max_id = i = 0, min_id = INT_MAX; i < n; i++) {
4605
2.57k
        if (!(s->block[i] = cram_read_block(fd)))
4606
32
            goto err;
4607
4608
2.54k
        if (s->block[i]->content_type == EXTERNAL) {
4609
603
            if (max_id < s->block[i]->content_id)
4610
14
                max_id = s->block[i]->content_id;
4611
603
            if (min_id > s->block[i]->content_id)
4612
591
                min_id = s->block[i]->content_id;
4613
603
        }
4614
2.54k
    }
4615
4616
649
    if (!(s->block_by_id = calloc(512, sizeof(s->block[0]))))
4617
0
        goto err;
4618
4619
3.12k
    for (i = 0; i < n; i++) {
4620
2.47k
        if (s->block[i]->content_type != EXTERNAL)
4621
1.87k
            continue;
4622
597
        uint32_t v = s->block[i]->content_id;
4623
597
        if (v >= 256)
4624
15
            v = 256 + v % 251;
4625
597
        s->block_by_id[v] = s->block[i];
4626
597
    }
4627
4628
    /* Initialise encoding/decoding tables */
4629
649
    s->cigar_alloc = 1024;
4630
649
    if (!(s->cigar = malloc(s->cigar_alloc * sizeof(*s->cigar)))) goto err;
4631
649
    s->ncigar = 0;
4632
4633
649
    if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0)))      goto err;
4634
649
    if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS)))  goto err;
4635
649
    if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN)))  goto err;
4636
649
    if (!(s->aux_blk  = cram_new_block(EXTERNAL, DS_aux))) goto err;
4637
649
    if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN)))  goto err;
4638
649
    if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC)))  goto err;
4639
4640
649
    s->crecs = NULL;
4641
4642
649
    s->last_apos = s->hdr->ref_seq_start;
4643
649
    s->decode_md = fd->decode_md;
4644
4645
649
    return s;
4646
4647
122
 err:
4648
122
    if (b)
4649
102
        cram_free_block(b);
4650
122
    if (s) {
4651
122
        s->hdr_block = NULL;
4652
122
        cram_free_slice(s);
4653
122
    }
4654
122
    return NULL;
4655
649
}
4656
4657
4658
/* ----------------------------------------------------------------------
4659
 * CRAM file definition (header)
4660
 */
4661
4662
/*
4663
 * Reads a CRAM file definition structure.
4664
 * Returns file_def ptr on success
4665
 *         NULL on failure
4666
 */
4667
9.29k
cram_file_def *cram_read_file_def(cram_fd *fd) {
4668
9.29k
    cram_file_def *def = malloc(sizeof(*def));
4669
9.29k
    if (!def)
4670
0
        return NULL;
4671
4672
9.29k
    if (26 != hread(fd->fp, &def->magic[0], 26)) {
4673
2
        free(def);
4674
2
        return NULL;
4675
2
    }
4676
4677
9.29k
    if (memcmp(def->magic, "CRAM", 4) != 0) {
4678
1
        free(def);
4679
1
        return NULL;
4680
1
    }
4681
4682
9.29k
    if (def->major_version > 4) {
4683
0
        hts_log_error("CRAM version number mismatch. Expected 1.x, 2.x, 3.x or 4.x, got %d.%d",
4684
0
                      def->major_version, def->minor_version);
4685
0
        free(def);
4686
0
        return NULL;
4687
0
    }
4688
4689
9.29k
    fd->first_container += 26;
4690
9.29k
    fd->curr_position = fd->first_container;
4691
9.29k
    fd->last_slice = 0;
4692
4693
9.29k
    return def;
4694
9.29k
}
4695
4696
/*
4697
 * Writes a cram_file_def structure to cram_fd.
4698
 * Returns 0 on success
4699
 *        -1 on failure
4700
 */
4701
7.93k
int cram_write_file_def(cram_fd *fd, cram_file_def *def) {
4702
7.93k
    return (hwrite(fd->fp, &def->magic[0], 26) == 26) ? 0 : -1;
4703
7.93k
}
4704
4705
18.6k
void cram_free_file_def(cram_file_def *def) {
4706
18.6k
    if (def) free(def);
4707
18.6k
}
4708
4709
/* ----------------------------------------------------------------------
4710
 * SAM header I/O
4711
 */
4712
4713
4714
/*
4715
 * Reads the SAM header from the first CRAM data block.
4716
 * Also performs minimal parsing to extract read-group
4717
 * and sample information.
4718
4719
 * Returns SAM hdr ptr on success
4720
 *         NULL on failure
4721
 */
4722
9.29k
sam_hdr_t *cram_read_SAM_hdr(cram_fd *fd) {
4723
9.29k
    int32_t header_len;
4724
9.29k
    char *header;
4725
9.29k
    sam_hdr_t *hdr;
4726
4727
    /* 1.1 onwards stores the header in the first block of a container */
4728
9.29k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
4729
        /* Length */
4730
6.84k
        if (-1 == int32_decode(fd, &header_len))
4731
3
            return NULL;
4732
4733
6.84k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
4734
6.84k
        if (header_len > FUZZ_ALLOC_LIMIT)
4735
2
            return NULL;
4736
6.84k
#endif
4737
4738
        /* Alloc and read */
4739
6.84k
        if (header_len < 0 || NULL == (header = malloc((size_t) header_len+1)))
4740
0
            return NULL;
4741
4742
6.84k
        if (header_len != hread(fd->fp, header, header_len)) {
4743
27
            free(header);
4744
27
            return NULL;
4745
27
        }
4746
6.81k
        header[header_len] = '\0';
4747
4748
6.81k
        fd->first_container += 4 + header_len;
4749
6.81k
    } else {
4750
2.44k
        cram_container *c = cram_read_container(fd);
4751
2.44k
        cram_block *b;
4752
2.44k
        int i;
4753
2.44k
        int64_t len;
4754
4755
2.44k
        if (!c)
4756
138
            return NULL;
4757
4758
2.30k
        fd->first_container += c->length + c->offset;
4759
2.30k
        fd->curr_position = fd->first_container;
4760
4761
2.30k
        if (c->num_blocks < 1) {
4762
74
            cram_free_container(c);
4763
74
            return NULL;
4764
74
        }
4765
4766
2.23k
        if (!(b = cram_read_block(fd))) {
4767
111
            cram_free_container(c);
4768
111
            return NULL;
4769
111
        }
4770
2.12k
        if (cram_uncompress_block(b) != 0) {
4771
1.46k
            cram_free_container(c);
4772
1.46k
            cram_free_block(b);
4773
1.46k
            return NULL;
4774
1.46k
        }
4775
4776
655
        len = b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
4777
655
            fd->vv.varint_size(b->content_id) +
4778
655
            fd->vv.varint_size(b->uncomp_size) +
4779
655
            fd->vv.varint_size(b->comp_size);
4780
4781
        /* Extract header from 1st block */
4782
655
        if (-1 == int32_get_blk(b, &header_len) ||
4783
655
            header_len < 0 || /* Spec. says signed...  why? */
4784
655
            b->uncomp_size - 4 < header_len) {
4785
127
            cram_free_container(c);
4786
127
            cram_free_block(b);
4787
127
            return NULL;
4788
127
        }
4789
528
        if (NULL == (header = malloc((size_t) header_len+1))) {
4790
0
            cram_free_container(c);
4791
0
            cram_free_block(b);
4792
0
            return NULL;
4793
0
        }
4794
528
        memcpy(header, BLOCK_END(b), header_len);
4795
528
        header[header_len] = '\0';
4796
528
        cram_free_block(b);
4797
4798
        /* Consume any remaining blocks */
4799
3.37k
        for (i = 1; i < c->num_blocks; i++) {
4800
2.85k
            if (!(b = cram_read_block(fd))) {
4801
12
                cram_free_container(c);
4802
12
                free(header);
4803
12
                return NULL;
4804
12
            }
4805
2.84k
            len += b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
4806
2.84k
                fd->vv.varint_size(b->content_id) +
4807
2.84k
                fd->vv.varint_size(b->uncomp_size) +
4808
2.84k
                fd->vv.varint_size(b->comp_size);
4809
2.84k
            cram_free_block(b);
4810
2.84k
        }
4811
4812
516
        if (c->length > 0 && len > 0 && c->length > len) {
4813
            // Consume padding
4814
34
            char *pads = malloc(c->length - len);
4815
34
            if (!pads) {
4816
0
                cram_free_container(c);
4817
0
                free(header);
4818
0
                return NULL;
4819
0
            }
4820
4821
34
            if (c->length - len != hread(fd->fp, pads, c->length - len)) {
4822
21
                cram_free_container(c);
4823
21
                free(header);
4824
21
                free(pads);
4825
21
                return NULL;
4826
21
            }
4827
13
            free(pads);
4828
13
        }
4829
4830
495
        cram_free_container(c);
4831
495
    }
4832
4833
    /* Parse */
4834
7.30k
    hdr = sam_hdr_init();
4835
7.30k
    if (!hdr) {
4836
0
        free(header);
4837
0
        return NULL;
4838
0
    }
4839
4840
7.30k
    if (-1 == sam_hdr_add_lines(hdr, header, header_len)) {
4841
73
        free(header);
4842
73
        sam_hdr_destroy(hdr);
4843
73
        return NULL;
4844
73
    }
4845
4846
7.23k
    hdr->l_text = header_len;
4847
7.23k
    hdr->text = header;
4848
4849
7.23k
    return hdr;
4850
4851
7.30k
}
4852
4853
/*
4854
 * Converts 'in' to a full pathname to store in out.
4855
 * Out must be at least PATH_MAX bytes long.
4856
 */
4857
0
static void full_path(char *out, char *in) {
4858
0
    size_t in_l = strlen(in);
4859
0
    if (hisremote(in)) {
4860
0
        if (in_l > PATH_MAX) {
4861
0
            hts_log_error("Reference path is longer than %d", PATH_MAX);
4862
0
            return;
4863
0
        }
4864
0
        strncpy(out, in, PATH_MAX-1);
4865
0
        out[PATH_MAX-1] = 0;
4866
0
        return;
4867
0
    }
4868
0
    if (*in == '/' ||
4869
        // Windows paths
4870
0
        (in_l > 3 && toupper_c(*in) >= 'A'  && toupper_c(*in) <= 'Z' &&
4871
0
         in[1] == ':' && (in[2] == '/' || in[2] == '\\'))) {
4872
0
        strncpy(out, in, PATH_MAX-1);
4873
0
        out[PATH_MAX-1] = 0;
4874
0
    } else {
4875
0
        size_t len;
4876
4877
        // unable to get dir or out+in is too long
4878
0
        if (!getcwd(out, PATH_MAX) ||
4879
0
            (len = strlen(out))+1+strlen(in) >= PATH_MAX) {
4880
0
            strncpy(out, in, PATH_MAX-1);
4881
0
            out[PATH_MAX-1] = 0;
4882
0
            return;
4883
0
        }
4884
4885
0
        snprintf(out+len, PATH_MAX - len, "/%s", in);
4886
4887
        // FIXME: cope with `pwd`/../../../foo.fa ?
4888
0
    }
4889
0
}
4890
4891
/*
4892
 * Writes a CRAM SAM header.
4893
 * Returns 0 on success
4894
 *        -1 on failure
4895
 */
4896
7.93k
int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) {
4897
7.93k
    size_t header_len;
4898
7.93k
    int blank_block = (CRAM_MAJOR_VERS(fd->version) >= 3);
4899
4900
    /* Write CRAM MAGIC if not yet written. */
4901
7.93k
    if (fd->file_def->major_version == 0) {
4902
7.93k
        fd->file_def->major_version = CRAM_MAJOR_VERS(fd->version);
4903
7.93k
        fd->file_def->minor_version = CRAM_MINOR_VERS(fd->version);
4904
7.93k
        if (0 != cram_write_file_def(fd, fd->file_def))
4905
0
            return -1;
4906
7.93k
    }
4907
4908
    /* 1.0 requires an UNKNOWN read-group */
4909
7.93k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
4910
0
        if (!sam_hrecs_find_rg(hdr->hrecs, "UNKNOWN"))
4911
0
            if (sam_hdr_add_line(hdr, "RG",
4912
0
                            "ID", "UNKNOWN", "SM", "UNKNOWN", NULL))
4913
0
                return -1;
4914
0
    }
4915
4916
7.93k
    if (-1 == refs_from_header(fd))
4917
0
        return -1;
4918
7.93k
    if (-1 == refs2id(fd->refs, fd->header))
4919
0
        return -1;
4920
4921
    /* Fix M5 strings */
4922
7.93k
    if (fd->refs && !fd->no_ref && fd->embed_ref <= 1) {
4923
7.93k
        int i;
4924
7.96k
        for (i = 0; i < hdr->hrecs->nref; i++) {
4925
2.23k
            sam_hrec_type_t *ty;
4926
2.23k
            char *ref;
4927
4928
2.23k
            if (!(ty = sam_hrecs_find_type_id(hdr->hrecs, "SQ", "SN", hdr->hrecs->ref[i].name)))
4929
0
                return -1;
4930
4931
2.23k
            if (!sam_hrecs_find_key(ty, "M5", NULL)) {
4932
2.20k
                char unsigned buf[16];
4933
2.20k
                char buf2[33];
4934
2.20k
                hts_pos_t rlen;
4935
2.20k
                hts_md5_context *md5;
4936
4937
2.20k
                if (!fd->refs ||
4938
2.20k
                    !fd->refs->ref_id ||
4939
2.20k
                    !fd->refs->ref_id[i]) {
4940
0
                    return -1;
4941
0
                }
4942
2.20k
                rlen = fd->refs->ref_id[i]->length;
4943
2.20k
                ref = cram_get_ref(fd, i, 1, rlen);
4944
2.20k
                if (NULL == ref) {
4945
2.20k
                    if (fd->embed_ref == -1) {
4946
                        // auto embed-ref
4947
2.20k
                        hts_log_warning("No M5 tags present and could not "
4948
2.20k
                                        "find reference");
4949
2.20k
                        hts_log_warning("Enabling embed_ref=2 option");
4950
2.20k
                        hts_log_warning("NOTE: the CRAM file will be bigger "
4951
2.20k
                                        "than using an external reference");
4952
2.20k
                        pthread_mutex_lock(&fd->ref_lock);
4953
                        // Best guess.  It may be unmapped data with broken
4954
                        // headers, in which case this will get ignored.
4955
2.20k
                        fd->embed_ref = 2;
4956
2.20k
                        pthread_mutex_unlock(&fd->ref_lock);
4957
2.20k
                        break;
4958
2.20k
                    }
4959
0
                    return -1;
4960
2.20k
                }
4961
0
                rlen = fd->refs->ref_id[i]->length; /* In case it just loaded */
4962
0
                if (!(md5 = hts_md5_init()))
4963
0
                    return -1;
4964
0
                if (HTS_POS_MAX <= ULONG_MAX) {
4965
                    // Platforms with 64-bit unsigned long update in one go
4966
0
                    hts_md5_update(md5, ref, rlen);
4967
0
                } else {
4968
                    // Those with 32-bit ulong (Windows) may have to loop
4969
                    // over epic references
4970
0
                    hts_pos_t pos = 0;
4971
0
                    while (rlen - pos > ULONG_MAX) {
4972
0
                        hts_md5_update(md5, ref + pos, ULONG_MAX);
4973
0
                        pos += ULONG_MAX;
4974
0
                    }
4975
0
                    hts_md5_update(md5, ref + pos, (unsigned long)(rlen - pos));
4976
0
                }
4977
0
                hts_md5_final(buf, md5);
4978
0
                hts_md5_destroy(md5);
4979
0
                cram_ref_decr(fd->refs, i);
4980
4981
0
                hts_md5_hex(buf2, buf);
4982
0
                fd->refs->ref_id[i]->validated_md5 = 1;
4983
0
                if (sam_hdr_update_line(hdr, "SQ", "SN", hdr->hrecs->ref[i].name, "M5", buf2, NULL))
4984
0
                    return -1;
4985
0
            }
4986
4987
30
            if (fd->ref_fn) {
4988
0
                char ref_fn[PATH_MAX];
4989
0
                full_path(ref_fn, fd->ref_fn);
4990
0
                if (sam_hdr_update_line(hdr, "SQ", "SN", hdr->hrecs->ref[i].name, "UR", ref_fn, NULL))
4991
0
                    return -1;
4992
0
            }
4993
30
        }
4994
7.93k
    }
4995
4996
    /* Length */
4997
7.93k
    header_len = sam_hdr_length(hdr);
4998
7.93k
    if (header_len > INT32_MAX) {
4999
0
        hts_log_error("Header is too long for CRAM format");
5000
0
        return -1;
5001
0
    }
5002
7.93k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
5003
0
        if (-1 == int32_encode(fd, header_len))
5004
0
            return -1;
5005
5006
        /* Text data */
5007
0
        if (header_len != hwrite(fd->fp, sam_hdr_str(hdr), header_len))
5008
0
            return -1;
5009
7.93k
    } else {
5010
        /* Create block(s) inside a container */
5011
7.93k
        cram_block *b = cram_new_block(FILE_HEADER, 0);
5012
7.93k
        cram_container *c = cram_new_container(0, 0);
5013
7.93k
        int padded_length;
5014
7.93k
        char *pads;
5015
7.93k
        int is_cram_3 = (CRAM_MAJOR_VERS(fd->version) >= 3);
5016
5017
7.93k
        if (!b || !c) {
5018
0
            if (b) cram_free_block(b);
5019
0
            if (c) cram_free_container(c);
5020
0
            return -1;
5021
0
        }
5022
5023
7.93k
        if (int32_put_blk(b, header_len) < 0)
5024
0
            return -1;
5025
7.93k
        if (header_len)
5026
3.29k
            BLOCK_APPEND(b, sam_hdr_str(hdr), header_len);
5027
7.93k
        BLOCK_UPLEN(b);
5028
5029
        // Compress header block if V3.0 and above
5030
7.93k
        if (CRAM_MAJOR_VERS(fd->version) >= 3)
5031
7.93k
            if (cram_compress_block(fd, b, NULL, -1, -1) < 0)
5032
0
                return -1;
5033
5034
7.93k
        if (blank_block) {
5035
7.93k
            c->length = b->comp_size + 2 + 4*is_cram_3 +
5036
7.93k
                fd->vv.varint_size(b->content_id)   +
5037
7.93k
                fd->vv.varint_size(b->uncomp_size)  +
5038
7.93k
                fd->vv.varint_size(b->comp_size);
5039
5040
7.93k
            c->num_blocks = 2;
5041
7.93k
            c->num_landmarks = 2;
5042
7.93k
            if (!(c->landmark = malloc(2*sizeof(*c->landmark)))) {
5043
0
                cram_free_block(b);
5044
0
                cram_free_container(c);
5045
0
                return -1;
5046
0
            }
5047
7.93k
            c->landmark[0] = 0;
5048
7.93k
            c->landmark[1] = c->length;
5049
5050
            // Plus extra storage for uncompressed secondary blank block
5051
7.93k
            padded_length = MIN(c->length*.5, 10000);
5052
7.93k
            c->length += padded_length + 2 + 4*is_cram_3 +
5053
7.93k
                fd->vv.varint_size(b->content_id) +
5054
7.93k
                fd->vv.varint_size(padded_length)*2;
5055
7.93k
        } else {
5056
            // Pad the block instead.
5057
0
            c->num_blocks = 1;
5058
0
            c->num_landmarks = 1;
5059
0
            if (!(c->landmark = malloc(sizeof(*c->landmark))))
5060
0
                return -1;
5061
0
            c->landmark[0] = 0;
5062
5063
0
            padded_length = MAX(c->length*1.5, 10000) - c->length;
5064
5065
0
            c->length = b->comp_size + padded_length +
5066
0
                2 + 4*is_cram_3 +
5067
0
                fd->vv.varint_size(b->content_id)   +
5068
0
                fd->vv.varint_size(b->uncomp_size)  +
5069
0
                fd->vv.varint_size(b->comp_size);
5070
5071
0
            if (NULL == (pads = calloc(1, padded_length))) {
5072
0
                cram_free_block(b);
5073
0
                cram_free_container(c);
5074
0
                return -1;
5075
0
            }
5076
0
            BLOCK_APPEND(b, pads, padded_length);
5077
0
            BLOCK_UPLEN(b);
5078
0
            free(pads);
5079
0
        }
5080
5081
7.93k
        if (-1 == cram_write_container(fd, c)) {
5082
0
            cram_free_block(b);
5083
0
            cram_free_container(c);
5084
0
            return -1;
5085
0
        }
5086
5087
7.93k
        if (-1 == cram_write_block(fd, b)) {
5088
0
            cram_free_block(b);
5089
0
            cram_free_container(c);
5090
0
            return -1;
5091
0
        }
5092
5093
7.93k
        if (blank_block) {
5094
7.93k
            BLOCK_RESIZE(b, padded_length);
5095
7.93k
            memset(BLOCK_DATA(b), 0, padded_length);
5096
7.93k
            BLOCK_SIZE(b) = padded_length;
5097
7.93k
            BLOCK_UPLEN(b);
5098
7.93k
            b->method = RAW;
5099
7.93k
            if (-1 == cram_write_block(fd, b)) {
5100
0
                cram_free_block(b);
5101
0
                cram_free_container(c);
5102
0
                return -1;
5103
0
            }
5104
7.93k
        }
5105
5106
7.93k
        cram_free_block(b);
5107
7.93k
        cram_free_container(c);
5108
7.93k
    }
5109
5110
7.93k
    if (0 != hflush(fd->fp))
5111
0
        return -1;
5112
5113
7.93k
    RP("=== Finishing saving header ===\n");
5114
5115
7.93k
    return 0;
5116
5117
0
 block_err:
5118
0
    return -1;
5119
7.93k
}
5120
5121
/* ----------------------------------------------------------------------
5122
 * The top-level cram opening, closing and option handling
5123
 */
5124
5125
/*
5126
 * Sets CRAM variable sized integer decode function tables.
5127
 * CRAM 1, 2, and 3.x all used ITF8 for uint32 and UTF8 for uint64.
5128
 * CRAM 4.x uses the same encoding mechanism for 32-bit and 64-bit
5129
 * (or anything inbetween), but also now supports signed values.
5130
 *
5131
 * Version is the CRAM major version number.
5132
 * vv is the vector table (probably &cram_fd->vv)
5133
 */
5134
18.6k
static void cram_init_varint(varint_vec *vv, int version) {
5135
18.6k
    if (version >= 4) {
5136
838
        vv->varint_get32 = uint7_get_32; // FIXME: varint.h API should be size agnostic
5137
838
        vv->varint_get32s = sint7_get_32;
5138
838
        vv->varint_get64 = uint7_get_64;
5139
838
        vv->varint_get64s = sint7_get_64;
5140
838
        vv->varint_put32 = uint7_put_32;
5141
838
        vv->varint_put32s = sint7_put_32;
5142
838
        vv->varint_put64 = uint7_put_64;
5143
838
        vv->varint_put64s = sint7_put_64;
5144
838
        vv->varint_put32_blk = uint7_put_blk_32;
5145
838
        vv->varint_put32s_blk = sint7_put_blk_32;
5146
838
        vv->varint_put64_blk = uint7_put_blk_64;
5147
838
        vv->varint_put64s_blk = sint7_put_blk_64;
5148
838
        vv->varint_size = uint7_size;
5149
838
        vv->varint_decode32_crc = uint7_decode_crc32;
5150
838
        vv->varint_decode32s_crc = sint7_decode_crc32;
5151
838
        vv->varint_decode64_crc = uint7_decode_crc64;
5152
17.8k
    } else {
5153
17.8k
        vv->varint_get32 = safe_itf8_get;
5154
17.8k
        vv->varint_get32s = safe_itf8_get;
5155
17.8k
        vv->varint_get64 = safe_ltf8_get;
5156
17.8k
        vv->varint_get64s = safe_ltf8_get;
5157
17.8k
        vv->varint_put32 = safe_itf8_put;
5158
17.8k
        vv->varint_put32s = safe_itf8_put;
5159
17.8k
        vv->varint_put64 = safe_ltf8_put;
5160
17.8k
        vv->varint_put64s = safe_ltf8_put;
5161
17.8k
        vv->varint_put32_blk = itf8_put_blk;
5162
17.8k
        vv->varint_put32s_blk = itf8_put_blk;
5163
17.8k
        vv->varint_put64_blk = ltf8_put_blk;
5164
17.8k
        vv->varint_put64s_blk = ltf8_put_blk;
5165
17.8k
        vv->varint_size = itf8_size;
5166
17.8k
        vv->varint_decode32_crc = itf8_decode_crc;
5167
17.8k
        vv->varint_decode32s_crc = itf8_decode_crc;
5168
17.8k
        vv->varint_decode64_crc = ltf8_decode_crc;
5169
17.8k
    }
5170
18.6k
}
5171
5172
/*
5173
 * Initialises the lookup tables. These could be global statics, but they're
5174
 * clumsy to setup in a multi-threaded environment unless we generate
5175
 * verbatim code and include that.
5176
 */
5177
18.6k
static void cram_init_tables(cram_fd *fd) {
5178
18.6k
    int i;
5179
5180
18.6k
    memset(fd->L1, 4, 256);
5181
18.6k
    fd->L1['A'] = 0; fd->L1['a'] = 0;
5182
18.6k
    fd->L1['C'] = 1; fd->L1['c'] = 1;
5183
18.6k
    fd->L1['G'] = 2; fd->L1['g'] = 2;
5184
18.6k
    fd->L1['T'] = 3; fd->L1['t'] = 3;
5185
5186
18.6k
    memset(fd->L2, 5, 256);
5187
18.6k
    fd->L2['A'] = 0; fd->L2['a'] = 0;
5188
18.6k
    fd->L2['C'] = 1; fd->L2['c'] = 1;
5189
18.6k
    fd->L2['G'] = 2; fd->L2['g'] = 2;
5190
18.6k
    fd->L2['T'] = 3; fd->L2['t'] = 3;
5191
18.6k
    fd->L2['N'] = 4; fd->L2['n'] = 4;
5192
5193
18.6k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
5194
3.51M
        for (i = 0; i < 0x200; i++) {
5195
3.50M
            int f = 0;
5196
5197
3.50M
            if (i & CRAM_FPAIRED)      f |= BAM_FPAIRED;
5198
3.50M
            if (i & CRAM_FPROPER_PAIR) f |= BAM_FPROPER_PAIR;
5199
3.50M
            if (i & CRAM_FUNMAP)       f |= BAM_FUNMAP;
5200
3.50M
            if (i & CRAM_FREVERSE)     f |= BAM_FREVERSE;
5201
3.50M
            if (i & CRAM_FREAD1)       f |= BAM_FREAD1;
5202
3.50M
            if (i & CRAM_FREAD2)       f |= BAM_FREAD2;
5203
3.50M
            if (i & CRAM_FSECONDARY)   f |= BAM_FSECONDARY;
5204
3.50M
            if (i & CRAM_FQCFAIL)      f |= BAM_FQCFAIL;
5205
3.50M
            if (i & CRAM_FDUP)         f |= BAM_FDUP;
5206
5207
3.50M
            fd->bam_flag_swap[i]  = f;
5208
3.50M
        }
5209
5210
28.0M
        for (i = 0; i < 0x1000; i++) {
5211
28.0M
            int g = 0;
5212
5213
28.0M
            if (i & BAM_FPAIRED)           g |= CRAM_FPAIRED;
5214
28.0M
            if (i & BAM_FPROPER_PAIR)  g |= CRAM_FPROPER_PAIR;
5215
28.0M
            if (i & BAM_FUNMAP)        g |= CRAM_FUNMAP;
5216
28.0M
            if (i & BAM_FREVERSE)      g |= CRAM_FREVERSE;
5217
28.0M
            if (i & BAM_FREAD1)        g |= CRAM_FREAD1;
5218
28.0M
            if (i & BAM_FREAD2)        g |= CRAM_FREAD2;
5219
28.0M
            if (i & BAM_FSECONDARY)    g |= CRAM_FSECONDARY;
5220
28.0M
            if (i & BAM_FQCFAIL)       g |= CRAM_FQCFAIL;
5221
28.0M
            if (i & BAM_FDUP)          g |= CRAM_FDUP;
5222
5223
28.0M
            fd->cram_flag_swap[i] = g;
5224
28.0M
        }
5225
11.8k
    } else {
5226
        /* NOP */
5227
48.4M
        for (i = 0; i < 0x1000; i++)
5228
48.4M
            fd->bam_flag_swap[i] = i;
5229
48.4M
        for (i = 0; i < 0x1000; i++)
5230
48.4M
            fd->cram_flag_swap[i] = i;
5231
11.8k
    }
5232
5233
18.6k
    memset(fd->cram_sub_matrix, 4, 32*32);
5234
616k
    for (i = 0; i < 32; i++) {
5235
597k
        fd->cram_sub_matrix[i]['A'&0x1f]=0;
5236
597k
        fd->cram_sub_matrix[i]['C'&0x1f]=1;
5237
597k
        fd->cram_sub_matrix[i]['G'&0x1f]=2;
5238
597k
        fd->cram_sub_matrix[i]['T'&0x1f]=3;
5239
597k
        fd->cram_sub_matrix[i]['N'&0x1f]=4;
5240
597k
    }
5241
112k
    for (i = 0; i < 20; i+=4) {
5242
93.3k
        int j;
5243
1.96M
        for (j = 0; j < 20; j++) {
5244
1.86M
            fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
5245
1.86M
            fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
5246
1.86M
            fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
5247
1.86M
            fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
5248
1.86M
        }
5249
93.3k
        fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+0]&0x1f]=0;
5250
93.3k
        fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+1]&0x1f]=1;
5251
93.3k
        fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+2]&0x1f]=2;
5252
93.3k
        fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+3]&0x1f]=3;
5253
93.3k
    }
5254
5255
18.6k
    cram_init_varint(&fd->vv, CRAM_MAJOR_VERS(fd->version));
5256
18.6k
}
5257
5258
// Default version numbers for CRAM
5259
static int major_version = 3;
5260
static int minor_version = 1;
5261
5262
/*
5263
 * Opens a CRAM file for read (mode "rb") or write ("wb").
5264
 * The filename may be "-" to indicate stdin or stdout.
5265
 *
5266
 * Returns file handle on success
5267
 *         NULL on failure.
5268
 */
5269
0
cram_fd *cram_open(const char *filename, const char *mode) {
5270
0
    hFILE *fp;
5271
0
    cram_fd *fd;
5272
0
    char fmode[3]= { mode[0], '\0', '\0' };
5273
5274
0
    if (strlen(mode) > 1 && (mode[1] == 'b' || mode[1] == 'c')) {
5275
0
        fmode[1] = 'b';
5276
0
    }
5277
5278
0
    fp = hopen(filename, fmode);
5279
0
    if (!fp)
5280
0
        return NULL;
5281
5282
0
    fd = cram_dopen(fp, filename, mode);
5283
0
    if (!fd)
5284
0
        hclose_abruptly(fp);
5285
5286
0
    return fd;
5287
0
}
5288
5289
/* Opens an existing stream for reading or writing.
5290
 *
5291
 * Returns file handle on success;
5292
 *         NULL on failure.
5293
 */
5294
18.6k
cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) {
5295
18.6k
    int i;
5296
18.6k
    char *cp;
5297
18.6k
    cram_fd *fd = calloc(1, sizeof(*fd));
5298
18.6k
    if (!fd)
5299
0
        return NULL;
5300
5301
18.6k
    fd->level = CRAM_DEFAULT_LEVEL;
5302
56.0k
    for (i = 0; mode[i]; i++) {
5303
37.3k
        if (mode[i] >= '0' && mode[i] <= '9') {
5304
0
            fd->level = mode[i] - '0';
5305
0
            break;
5306
0
        }
5307
37.3k
    }
5308
5309
18.6k
    fd->fp = fp;
5310
18.6k
    fd->mode = *mode;
5311
18.6k
    fd->first_container = 0;
5312
18.6k
    fd->curr_position = 0;
5313
5314
18.6k
    if (fd->mode == 'r') {
5315
        /* Reader */
5316
5317
9.29k
        if (!(fd->file_def = cram_read_file_def(fd)))
5318
3
            goto err;
5319
5320
9.29k
        fd->version = fd->file_def->major_version * 256 +
5321
9.29k
            fd->file_def->minor_version;
5322
5323
9.29k
        cram_init_tables(fd);
5324
5325
9.29k
        if (!(fd->header = cram_read_SAM_hdr(fd))) {
5326
2.05k
            cram_free_file_def(fd->file_def);
5327
2.05k
            goto err;
5328
2.05k
        }
5329
5330
9.37k
    } else {
5331
        /* Writer */
5332
9.37k
        cram_file_def *def = calloc(1, sizeof(*def));
5333
9.37k
        if (!def)
5334
0
            return NULL;
5335
5336
9.37k
        fd->file_def = def;
5337
5338
9.37k
        def->magic[0] = 'C';
5339
9.37k
        def->magic[1] = 'R';
5340
9.37k
        def->magic[2] = 'A';
5341
9.37k
        def->magic[3] = 'M';
5342
9.37k
        def->major_version = 0; // Indicator to write file def later.
5343
9.37k
        def->minor_version = 0;
5344
9.37k
        memset(def->file_id, 0, 20);
5345
9.37k
        strncpy(def->file_id, filename, 20);
5346
5347
9.37k
        fd->version = major_version * 256 + minor_version;
5348
9.37k
        cram_init_tables(fd);
5349
5350
        /* SAM header written later along with this file_def */
5351
9.37k
    }
5352
5353
16.6k
    fd->prefix = strdup((cp = strrchr(filename, '/')) ? cp+1 : filename);
5354
16.6k
    if (!fd->prefix)
5355
0
        goto err;
5356
16.6k
    fd->first_base = fd->last_base = -1;
5357
16.6k
    fd->record_counter = 0;
5358
5359
16.6k
    fd->ctr = NULL;
5360
16.6k
    fd->ctr_mt = NULL;
5361
16.6k
    fd->refs  = refs_create();
5362
16.6k
    if (!fd->refs)
5363
0
        goto err;
5364
16.6k
    fd->ref_id = -2;
5365
16.6k
    fd->ref = NULL;
5366
5367
16.6k
    fd->decode_md = 0;
5368
16.6k
    fd->seqs_per_slice = SEQS_PER_SLICE;
5369
16.6k
    fd->bases_per_slice = BASES_PER_SLICE;
5370
16.6k
    fd->slices_per_container = SLICE_PER_CNT;
5371
16.6k
    fd->embed_ref = -1; // automatic selection
5372
16.6k
    fd->no_ref = 0;
5373
16.6k
    fd->no_ref_counter = 0;
5374
16.6k
    fd->ap_delta = 0;
5375
16.6k
    fd->ignore_md5 = 0;
5376
16.6k
    fd->lossy_read_names = 0;
5377
16.6k
    fd->use_bz2 = 0;
5378
16.6k
    fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3);
5379
16.6k
    fd->use_tok = (CRAM_MAJOR_VERS(fd->version) >= 3) && (CRAM_MINOR_VERS(fd->version) >= 1);
5380
16.6k
    fd->use_lzma = 0;
5381
16.6k
    fd->multi_seq = -1;
5382
16.6k
    fd->multi_seq_user = -1;
5383
16.6k
    fd->unsorted   = 0;
5384
16.6k
    fd->shared_ref = 0;
5385
16.6k
    fd->store_md = 0;
5386
16.6k
    fd->store_nm = 0;
5387
16.6k
    fd->last_RI_count = 0;
5388
5389
16.6k
    fd->index       = NULL;
5390
16.6k
    fd->own_pool    = 0;
5391
16.6k
    fd->pool        = NULL;
5392
16.6k
    fd->rqueue      = NULL;
5393
16.6k
    fd->job_pending = NULL;
5394
16.6k
    fd->ooc         = 0;
5395
16.6k
    fd->required_fields = INT_MAX;
5396
5397
16.6k
    pthread_mutex_init(&fd->metrics_lock, NULL);
5398
16.6k
    pthread_mutex_init(&fd->ref_lock, NULL);
5399
16.6k
    pthread_mutex_init(&fd->range_lock, NULL);
5400
16.6k
    pthread_mutex_init(&fd->bam_list_lock, NULL);
5401
5402
797k
    for (i = 0; i < DS_END; i++) {
5403
780k
        fd->m[i] = cram_new_metrics();
5404
780k
        if (!fd->m[i])
5405
0
            goto err;
5406
780k
    }
5407
5408
16.6k
    if (!(fd->tags_used = kh_init(m_metrics)))
5409
0
        goto err;
5410
5411
16.6k
    fd->range.refid = -2; // no ref.
5412
16.6k
    fd->eof = 1;          // See samtools issue #150
5413
16.6k
    fd->ref_fn = NULL;
5414
5415
16.6k
    fd->bl = NULL;
5416
5417
    /* Initialise dummy refs from the @SQ headers */
5418
16.6k
    if (-1 == refs_from_header(fd))
5419
0
        goto err;
5420
5421
16.6k
    return fd;
5422
5423
2.06k
 err:
5424
2.06k
    if (fd)
5425
2.06k
        free(fd);
5426
5427
2.06k
    return NULL;
5428
16.6k
}
5429
5430
/*
5431
 * Seek within a CRAM file.
5432
 *
5433
 * Returns 0 on success
5434
 *        -1 on failure
5435
 */
5436
0
int cram_seek(cram_fd *fd, off_t offset, int whence) {
5437
0
    fd->ooc = 0;
5438
5439
0
    cram_drain_rqueue(fd);
5440
5441
0
    return hseek(fd->fp, offset, whence) >= 0 ? 0 : -1;
5442
0
}
5443
5444
/*
5445
 * Flushes a CRAM file.
5446
 * Useful for when writing to stdout without wishing to close the stream.
5447
 *
5448
 * Returns 0 on success
5449
 *        -1 on failure
5450
 */
5451
0
int cram_flush(cram_fd *fd) {
5452
0
    if (!fd)
5453
0
        return -1;
5454
5455
0
    int ret = 0;
5456
5457
0
    if (fd->mode == 'w' && fd->ctr) {
5458
0
        if(fd->ctr->slice)
5459
0
            cram_update_curr_slice(fd->ctr, fd->version);
5460
5461
0
        if (-1 == cram_flush_container_mt(fd, fd->ctr))
5462
0
            ret = -1;
5463
5464
0
        cram_free_container(fd->ctr);
5465
0
        if (fd->ctr_mt == fd->ctr)
5466
0
            fd->ctr_mt = NULL;
5467
0
        fd->ctr = NULL;
5468
0
    }
5469
5470
0
    return ret;
5471
0
}
5472
5473
/*
5474
 * Writes an EOF block to a CRAM file.
5475
 *
5476
 * Returns 0 on success
5477
 *        -1 on failure
5478
 */
5479
8.90k
int cram_write_eof_block(cram_fd *fd) {
5480
    // EOF block is a container with special values to aid detection
5481
8.90k
    if (CRAM_MAJOR_VERS(fd->version) >= 2) {
5482
        // Empty container with
5483
        //   ref_seq_id -1
5484
        //   start pos 0x454f46 ("EOF")
5485
        //   span 0
5486
        //   nrec 0
5487
        //   counter 0
5488
        //   nbases 0
5489
        //   1 block (landmark 0)
5490
        //   (CRC32)
5491
8.90k
        cram_container c;
5492
8.90k
        memset(&c, 0, sizeof(c));
5493
8.90k
        c.ref_seq_id = -1;
5494
8.90k
        c.ref_seq_start = 0x454f46; // "EOF"
5495
8.90k
        c.ref_seq_span = 0;
5496
8.90k
        c.record_counter = 0;
5497
8.90k
        c.num_bases = 0;
5498
8.90k
        c.num_blocks = 1;
5499
8.90k
        int32_t land[1] = {0};
5500
8.90k
        c.landmark = land;
5501
5502
        // An empty compression header block with
5503
        //   method raw (0)
5504
        //   type comp header (1)
5505
        //   content id 0
5506
        //   block contents size 6
5507
        //   raw size 6
5508
        //     empty preservation map (01 00)
5509
        //     empty data series map (01 00)
5510
        //     empty tag map (01 00)
5511
        //   block CRC
5512
8.90k
        cram_block_compression_hdr ch;
5513
8.90k
        memset(&ch, 0, sizeof(ch));
5514
8.90k
        c.comp_hdr_block = cram_encode_compression_header(fd, &c, &ch, 0);
5515
5516
8.90k
        c.length = c.comp_hdr_block->byte            // Landmark[0]
5517
8.90k
            + 5                                      // block struct
5518
8.90k
            + 4*(CRAM_MAJOR_VERS(fd->version) >= 3); // CRC
5519
8.90k
        if (cram_write_container(fd, &c) < 0 ||
5520
8.90k
            cram_write_block(fd, c.comp_hdr_block) < 0) {
5521
0
            cram_close(fd);
5522
0
            cram_free_block(c.comp_hdr_block);
5523
0
            return -1;
5524
0
        }
5525
8.90k
        if (ch.preservation_map)
5526
8.90k
            kh_destroy(map, ch.preservation_map);
5527
8.90k
        cram_free_block(c.comp_hdr_block);
5528
5529
        // V2.1 bytes
5530
        // 0b 00 00 00 ff ff ff ff 0f // Cont HDR: size, ref seq id
5531
        // e0 45 4f 46 00 00 00       // Cont HDR: pos, span, nrec, counter
5532
        // 00 01 00                   // Cont HDR: nbase, nblk, landmark
5533
        // 00 01 00 06 06             // Comp.HDR blk
5534
        // 01 00 01 00 01 00          // Comp.HDR blk
5535
5536
        // V3.0 bytes:
5537
        // 0f 00 00 00 ff ff ff ff 0f // Cont HDR: size, ref seq id
5538
        // e0 45 4f 46 00 00 00       // Cont HDR: pos, span, nrec, counter
5539
        // 00 01 00                   // Cont HDR: nbase, nblk, landmark
5540
        // 05 bd d9 4f                // CRC32
5541
        // 00 01 00 06 06             // Comp.HDR blk
5542
        // 01 00 01 00 01 00          // Comp.HDR blk
5543
        // ee 63 01 4b                // CRC32
5544
5545
        // V4.0 bytes:
5546
        // 0f 00 00 00 8f ff ff ff    // Cont HDR: size, ref seq id
5547
        // 82 95 9e 46 00 00 00       // Cont HDR: pos, span, nrec, counter
5548
        // 00 01 00                   // Cont HDR: nbase, nblk, landmark
5549
        // ac d6 05 bc                // CRC32
5550
        // 00 01 00 06 06             // Comp.HDR blk
5551
        // 01 00 01 00 01 00          // Comp.HDR blk
5552
        // ee 63 01 4b                // CRC32
5553
8.90k
    }
5554
5555
8.90k
    return 0;
5556
8.90k
}
5557
5558
/*
5559
 * Closes a CRAM file.
5560
 * Returns 0 on success
5561
 *        -1 on failure
5562
 */
5563
16.6k
int cram_close(cram_fd *fd) {
5564
16.6k
    spare_bams *bl, *next;
5565
16.6k
    int i, ret = 0;
5566
5567
16.6k
    if (!fd)
5568
0
        return -1;
5569
5570
16.6k
    if (fd->mode == 'w' && fd->ctr) {
5571
4.38k
        if(fd->ctr->slice)
5572
4.38k
            cram_update_curr_slice(fd->ctr, fd->version);
5573
5574
4.38k
        if (-1 == cram_flush_container_mt(fd, fd->ctr))
5575
472
            ret = -1;
5576
4.38k
    }
5577
5578
16.6k
    if (fd->mode != 'w')
5579
7.23k
        cram_drain_rqueue(fd);
5580
5581
16.6k
    if (fd->pool && fd->eof >= 0 && fd->rqueue) {
5582
0
        hts_tpool_process_flush(fd->rqueue);
5583
5584
0
        if (0 != cram_flush_result(fd))
5585
0
            ret = -1;
5586
5587
0
        if (fd->mode == 'w')
5588
0
            fd->ctr = NULL; // prevent double freeing
5589
5590
        //fprintf(stderr, "CRAM: destroy queue %p\n", fd->rqueue);
5591
5592
0
        hts_tpool_process_destroy(fd->rqueue);
5593
0
    }
5594
5595
16.6k
    pthread_mutex_destroy(&fd->metrics_lock);
5596
16.6k
    pthread_mutex_destroy(&fd->ref_lock);
5597
16.6k
    pthread_mutex_destroy(&fd->range_lock);
5598
16.6k
    pthread_mutex_destroy(&fd->bam_list_lock);
5599
5600
16.6k
    if (ret == 0 && fd->mode == 'w') {
5601
        /* Write EOF block */
5602
8.90k
        if (0 != cram_write_eof_block(fd))
5603
0
            ret = -1;
5604
8.90k
    }
5605
5606
20.6k
    for (bl = fd->bl; bl; bl = next) {
5607
4.03k
        int max_rec = fd->seqs_per_slice * fd->slices_per_container;
5608
5609
4.03k
        next = bl->next;
5610
4.03k
        free_bam_list(bl->bams, max_rec);
5611
4.03k
        free(bl);
5612
4.03k
    }
5613
5614
16.6k
    if (hclose(fd->fp) != 0)
5615
0
        ret = -1;
5616
5617
16.6k
    if (fd->file_def)
5618
16.6k
        cram_free_file_def(fd->file_def);
5619
5620
16.6k
    if (fd->header)
5621
16.2k
        sam_hdr_destroy(fd->header);
5622
5623
16.6k
    free(fd->prefix);
5624
5625
16.6k
    if (fd->ctr)
5626
9.12k
        cram_free_container(fd->ctr);
5627
5628
16.6k
    if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
5629
69
        cram_free_container(fd->ctr_mt);
5630
5631
16.6k
    if (fd->refs)
5632
16.6k
        refs_free(fd->refs);
5633
16.6k
    if (fd->ref_free)
5634
0
        free(fd->ref_free);
5635
5636
797k
    for (i = 0; i < DS_END; i++)
5637
780k
        if (fd->m[i])
5638
780k
            free(fd->m[i]);
5639
5640
16.6k
    if (fd->tags_used) {
5641
16.6k
        khint_t k;
5642
5643
28.5k
        for (k = kh_begin(fd->tags_used); k != kh_end(fd->tags_used); k++) {
5644
11.9k
            if (kh_exist(fd->tags_used, k))
5645
5.50k
                free(kh_val(fd->tags_used, k));
5646
11.9k
        }
5647
5648
16.6k
        kh_destroy(m_metrics, fd->tags_used);
5649
16.6k
    }
5650
5651
16.6k
    if (fd->index)
5652
0
        cram_index_free(fd);
5653
5654
16.6k
    if (fd->own_pool && fd->pool)
5655
0
        hts_tpool_destroy(fd->pool);
5656
5657
16.6k
    if (fd->idxfp)
5658
0
        if (bgzf_close(fd->idxfp) < 0)
5659
0
            ret = -1;
5660
5661
16.6k
    free(fd);
5662
5663
16.6k
    return ret;
5664
16.6k
}
5665
5666
/*
5667
 * Returns 1 if we hit an EOF while reading.
5668
 */
5669
12.5k
int cram_eof(cram_fd *fd) {
5670
12.5k
    return fd->eof;
5671
12.5k
}
5672
5673
5674
/*
5675
 * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
5676
 * Use this immediately after opening.
5677
 *
5678
 * Returns 0 on success
5679
 *        -1 on failure
5680
 */
5681
7.23k
int cram_set_option(cram_fd *fd, enum hts_fmt_option opt, ...) {
5682
7.23k
    int r;
5683
7.23k
    va_list args;
5684
5685
7.23k
    va_start(args, opt);
5686
7.23k
    r = cram_set_voption(fd, opt, args);
5687
7.23k
    va_end(args);
5688
5689
7.23k
    return r;
5690
7.23k
}
5691
5692
/*
5693
 * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
5694
 * Use this immediately after opening.
5695
 *
5696
 * Returns 0 on success
5697
 *        -1 on failure
5698
 */
5699
7.23k
int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) {
5700
7.23k
    refs_t *refs;
5701
5702
7.23k
    if (!fd) {
5703
0
        errno = EBADF;
5704
0
        return -1;
5705
0
    }
5706
5707
7.23k
    switch (opt) {
5708
7.23k
    case CRAM_OPT_DECODE_MD:
5709
7.23k
        fd->decode_md = va_arg(args, int);
5710
7.23k
        break;
5711
5712
0
    case CRAM_OPT_PREFIX:
5713
0
        if (fd->prefix)
5714
0
            free(fd->prefix);
5715
0
        if (!(fd->prefix = strdup(va_arg(args, char *))))
5716
0
            return -1;
5717
0
        break;
5718
5719
0
    case CRAM_OPT_VERBOSITY:
5720
0
        break;
5721
5722
0
    case CRAM_OPT_SEQS_PER_SLICE:
5723
0
        fd->seqs_per_slice = va_arg(args, int);
5724
0
        if (fd->bases_per_slice == BASES_PER_SLICE)
5725
0
            fd->bases_per_slice = fd->seqs_per_slice * 500;
5726
0
        break;
5727
5728
0
    case CRAM_OPT_BASES_PER_SLICE:
5729
0
        fd->bases_per_slice = va_arg(args, int);
5730
0
        break;
5731
5732
0
    case CRAM_OPT_SLICES_PER_CONTAINER:
5733
0
        fd->slices_per_container = va_arg(args, int);
5734
0
        break;
5735
5736
0
    case CRAM_OPT_EMBED_REF:
5737
0
        fd->embed_ref = va_arg(args, int);
5738
0
        break;
5739
5740
0
    case CRAM_OPT_NO_REF:
5741
0
        fd->no_ref = va_arg(args, int);
5742
0
        break;
5743
5744
0
    case CRAM_OPT_POS_DELTA:
5745
0
        fd->ap_delta = va_arg(args, int);
5746
0
        break;
5747
5748
0
    case CRAM_OPT_IGNORE_MD5:
5749
0
        fd->ignore_md5 = va_arg(args, int);
5750
0
        break;
5751
5752
0
    case CRAM_OPT_LOSSY_NAMES:
5753
0
        fd->lossy_read_names = va_arg(args, int);
5754
        // Currently lossy read names required paired (attached) reads.
5755
        // TLEN 0 or being 1 out causes read pairs to be detached, breaking
5756
        // the lossy read name compression, so we have extra options to
5757
        // slacken the exact TLEN round-trip checks.
5758
0
        fd->tlen_approx = fd->lossy_read_names;
5759
0
        fd->tlen_zero = fd->lossy_read_names;
5760
0
        break;
5761
5762
0
    case CRAM_OPT_USE_BZIP2:
5763
0
        fd->use_bz2 = va_arg(args, int);
5764
0
        break;
5765
5766
0
    case CRAM_OPT_USE_RANS:
5767
0
        fd->use_rans = va_arg(args, int);
5768
0
        break;
5769
5770
0
    case CRAM_OPT_USE_TOK:
5771
0
        fd->use_tok = va_arg(args, int);
5772
0
        break;
5773
5774
0
    case CRAM_OPT_USE_FQZ:
5775
0
        fd->use_fqz = va_arg(args, int);
5776
0
        break;
5777
5778
0
    case CRAM_OPT_USE_ARITH:
5779
0
        fd->use_arith = va_arg(args, int);
5780
0
        break;
5781
5782
0
    case CRAM_OPT_USE_LZMA:
5783
0
        fd->use_lzma = va_arg(args, int);
5784
0
        break;
5785
5786
0
    case CRAM_OPT_SHARED_REF:
5787
0
        fd->shared_ref = 1;
5788
0
        refs = va_arg(args, refs_t *);
5789
0
        if (refs != fd->refs) {
5790
0
            if (fd->refs)
5791
0
                refs_free(fd->refs);
5792
0
            fd->refs = refs;
5793
0
            fd->refs->count++;
5794
0
        }
5795
0
        break;
5796
5797
0
    case CRAM_OPT_RANGE: {
5798
0
        int r = cram_seek_to_refpos(fd, va_arg(args, cram_range *));
5799
0
        pthread_mutex_lock(&fd->range_lock);
5800
//        printf("opt range noseek to %p %d:%ld-%ld\n",
5801
//               fd, fd->range.refid, fd->range.start, fd->range.end);
5802
0
        if (fd->range.refid != -2)
5803
0
            fd->required_fields |= SAM_POS;
5804
0
        pthread_mutex_unlock(&fd->range_lock);
5805
0
        return r;
5806
0
    }
5807
5808
0
    case CRAM_OPT_RANGE_NOSEEK: {
5809
        // As per CRAM_OPT_RANGE, but no seeking
5810
0
        pthread_mutex_lock(&fd->range_lock);
5811
0
        cram_range *r = va_arg(args, cram_range *);
5812
0
        fd->range = *r;
5813
0
        if (r->refid == HTS_IDX_NOCOOR) {
5814
0
            fd->range.refid = -1;
5815
0
            fd->range.start = 0;
5816
0
        } else if (r->refid == HTS_IDX_START || r->refid == HTS_IDX_REST) {
5817
0
            fd->range.refid = -2; // special case in cram_next_slice
5818
0
        }
5819
0
        if (fd->range.refid != -2)
5820
0
            fd->required_fields |= SAM_POS;
5821
0
        fd->ooc = 0;
5822
0
        fd->eof = 0;
5823
0
        pthread_mutex_unlock(&fd->range_lock);
5824
0
        return 0;
5825
0
    }
5826
5827
0
    case CRAM_OPT_REFERENCE:
5828
0
        return cram_load_reference(fd, va_arg(args, char *));
5829
5830
0
    case CRAM_OPT_VERSION: {
5831
0
        int major, minor;
5832
0
        char *s = va_arg(args, char *);
5833
0
        if (2 != sscanf(s, "%d.%d", &major, &minor)) {
5834
0
            hts_log_error("Malformed version string %s", s);
5835
0
            return -1;
5836
0
        }
5837
0
        if (!((major == 1 &&  minor == 0) ||
5838
0
              (major == 2 && (minor == 0 || minor == 1)) ||
5839
0
              (major == 3 && (minor == 0 || minor == 1)) ||
5840
0
              (major == 4 &&  minor == 0))) {
5841
0
            hts_log_error("Unknown version string; use 1.0, 2.0, 2.1, 3.0, 3.1 or 4.0");
5842
0
            errno = EINVAL;
5843
0
            return -1;
5844
0
        }
5845
5846
0
        if (major > 3 || (major == 3 && minor > 1)) {
5847
0
            hts_log_warning(
5848
0
                "CRAM version %s is still a draft and subject to change.\n"
5849
0
                "This is a technology demonstration that should not be "
5850
0
                "used for archival data.", s);
5851
0
        }
5852
5853
0
        fd->version = major*256 + minor;
5854
5855
0
        fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3) ? 1 : 0;
5856
5857
0
        fd->use_tok = ((CRAM_MAJOR_VERS(fd->version) == 3 &&
5858
0
                        CRAM_MINOR_VERS(fd->version) >= 1) ||
5859
0
                        CRAM_MAJOR_VERS(fd->version) >= 4) ? 1 : 0;
5860
0
        cram_init_tables(fd);
5861
5862
0
        break;
5863
0
    }
5864
5865
0
    case CRAM_OPT_MULTI_SEQ_PER_SLICE:
5866
0
        fd->multi_seq_user = fd->multi_seq = va_arg(args, int);
5867
0
        break;
5868
5869
0
    case CRAM_OPT_NTHREADS: {
5870
0
        int nthreads =  va_arg(args, int);
5871
0
        if (nthreads >= 1) {
5872
0
            if (!(fd->pool = hts_tpool_init(nthreads)))
5873
0
                return -1;
5874
5875
0
            fd->rqueue = hts_tpool_process_init(fd->pool, nthreads*2, 0);
5876
0
            fd->shared_ref = 1;
5877
0
            fd->own_pool = 1;
5878
0
        }
5879
0
        break;
5880
0
    }
5881
5882
0
    case CRAM_OPT_THREAD_POOL: {
5883
0
        htsThreadPool *p = va_arg(args, htsThreadPool *);
5884
0
        fd->pool = p ? p->pool : NULL;
5885
0
        if (fd->pool) {
5886
0
            fd->rqueue = hts_tpool_process_init(fd->pool,
5887
0
                                                p->qsize ? p->qsize : hts_tpool_size(fd->pool)*2,
5888
0
                                                0);
5889
0
        }
5890
0
        fd->shared_ref = 1; // Needed to avoid clobbering ref between threads
5891
0
        fd->own_pool = 0;
5892
5893
        //fd->qsize = 1;
5894
        //fd->decoded = calloc(fd->qsize, sizeof(cram_container *));
5895
        //hts_tpool_dispatch(fd->pool, cram_decoder_thread, fd);
5896
0
        break;
5897
0
    }
5898
5899
0
    case CRAM_OPT_REQUIRED_FIELDS:
5900
0
        fd->required_fields = va_arg(args, int);
5901
0
        if (fd->range.refid != -2)
5902
0
            fd->required_fields |= SAM_POS;
5903
0
        break;
5904
5905
0
    case CRAM_OPT_STORE_MD:
5906
0
        fd->store_md = va_arg(args, int);
5907
0
        break;
5908
5909
0
    case CRAM_OPT_STORE_NM:
5910
0
        fd->store_nm = va_arg(args, int);
5911
0
        break;
5912
5913
0
    case HTS_OPT_COMPRESSION_LEVEL:
5914
0
        fd->level = va_arg(args, int);
5915
0
        break;
5916
5917
0
    case HTS_OPT_PROFILE: {
5918
0
        enum hts_profile_option prof = va_arg(args, int);
5919
0
        switch (prof) {
5920
0
        case HTS_PROFILE_FAST:
5921
0
            if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 1;
5922
0
            fd->use_tok = 0;
5923
0
            fd->seqs_per_slice = 10000;
5924
0
            break;
5925
5926
0
        case HTS_PROFILE_NORMAL:
5927
0
            break;
5928
5929
0
        case HTS_PROFILE_SMALL:
5930
0
            if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 6;
5931
0
            fd->use_bz2 = 1;
5932
0
            fd->use_fqz = 1;
5933
0
            fd->seqs_per_slice = 25000;
5934
0
            break;
5935
5936
0
        case HTS_PROFILE_ARCHIVE:
5937
0
            if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 7;
5938
0
            fd->use_bz2 = 1;
5939
0
            fd->use_fqz = 1;
5940
0
            fd->use_arith = 1;
5941
0
            if (fd->level > 7)
5942
0
                fd->use_lzma = 1;
5943
0
            fd->seqs_per_slice = 100000;
5944
0
            break;
5945
0
        }
5946
5947
0
        if (fd->bases_per_slice == BASES_PER_SLICE)
5948
0
            fd->bases_per_slice = fd->seqs_per_slice * 500;
5949
0
        break;
5950
0
    }
5951
5952
0
    default:
5953
0
        hts_log_error("Unknown CRAM option code %d", opt);
5954
0
        errno = EINVAL;
5955
0
        return -1;
5956
7.23k
    }
5957
5958
7.23k
    return 0;
5959
7.23k
}
5960
5961
int cram_check_EOF(cram_fd *fd)
5962
0
{
5963
    // Byte 9 in these templates is & with 0x0f to resolve differences
5964
    // between ITF-8 interpretations between early Java and C
5965
    // implementations of CRAM
5966
0
    static const unsigned char TEMPLATE_2_1[30] = {
5967
0
        0x0b, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0f, 0xe0,
5968
0
        0x45, 0x4f, 0x46, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
5969
0
        0x01, 0x00, 0x06, 0x06, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00
5970
0
    };
5971
0
    static const unsigned char TEMPLATE_3[38] = {
5972
0
        0x0f, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0f, 0xe0,
5973
0
        0x45, 0x4f, 0x46, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x05,
5974
0
        0xbd, 0xd9, 0x4f, 0x00, 0x01, 0x00, 0x06, 0x06, 0x01, 0x00,
5975
0
        0x01, 0x00, 0x01, 0x00, 0xee, 0x63, 0x01, 0x4b
5976
0
    };
5977
5978
0
    unsigned char buf[38]; // max(sizeof TEMPLATE_*)
5979
5980
0
    uint8_t major = CRAM_MAJOR_VERS(fd->version);
5981
0
    uint8_t minor = CRAM_MINOR_VERS(fd->version);
5982
5983
0
    const unsigned char *template;
5984
0
    ssize_t template_len;
5985
0
    if ((major < 2) ||
5986
0
        (major == 2 && minor == 0)) {
5987
0
        return 3; // No EOF support in cram versions less than 2.1
5988
0
    } else if (major == 2 && minor == 1) {
5989
0
        template = TEMPLATE_2_1;
5990
0
        template_len = sizeof TEMPLATE_2_1;
5991
0
    } else {
5992
0
        template = TEMPLATE_3;
5993
0
        template_len = sizeof TEMPLATE_3;
5994
0
    }
5995
5996
0
    off_t offset = htell(fd->fp);
5997
0
    if (hseek(fd->fp, -template_len, SEEK_END) < 0) {
5998
0
        if (errno == ESPIPE) {
5999
0
            hclearerr(fd->fp);
6000
0
            return 2;
6001
0
        }
6002
0
        else {
6003
0
            return -1;
6004
0
        }
6005
0
    }
6006
0
    if (hread(fd->fp, buf, template_len) != template_len) return -1;
6007
0
    if (hseek(fd->fp, offset, SEEK_SET) < 0) return -1;
6008
0
    buf[8] &= 0x0f;
6009
0
    return (memcmp(template, buf, template_len) == 0)? 1 : 0;
6010
0
}