Coverage Report

Created: 2025-11-11 06:39

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/cram/cram_io.c
Line
Count
Source
1
/*
2
Copyright (c) 2012-2025 Genome Research Ltd.
3
Author: James Bonfield <jkb@sanger.ac.uk>
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
   1. Redistributions of source code must retain the above copyright notice,
9
this list of conditions and the following disclaimer.
10
11
   2. Redistributions in binary form must reproduce the above copyright notice,
12
this list of conditions and the following disclaimer in the documentation
13
and/or other materials provided with the distribution.
14
15
   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16
Institute nor the names of its contributors may be used to endorse or promote
17
products derived from this software without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
/*
32
 * CRAM I/O primitives.
33
 *
34
 * - ITF8 encoding and decoding.
35
 * - Block based I/O
36
 * - Zlib inflating and deflating (memory)
37
 * - CRAM basic data structure reading and writing
38
 * - File opening / closing
39
 * - Reference sequence handling
40
 */
41
42
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
43
#include <config.h>
44
45
#include <stdio.h>
46
#include <errno.h>
47
#include <assert.h>
48
#include <stdlib.h>
49
#include <string.h>
50
#include <unistd.h>
51
#include <zlib.h>
52
#ifdef HAVE_LIBBZ2
53
#include <bzlib.h>
54
#endif
55
#ifdef HAVE_LIBLZMA
56
#ifdef HAVE_LZMA_H
57
#include <lzma.h>
58
#else
59
#include "../os/lzma_stub.h"
60
#endif
61
#endif
62
#include <sys/types.h>
63
#include <sys/stat.h>
64
#include <math.h>
65
#include <stdint.h>
66
67
#ifdef HAVE_LIBDEFLATE
68
#include <libdeflate.h>
69
#define crc32(a,b,c) libdeflate_crc32((a),(b),(c))
70
#endif
71
72
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
73
#include "../fuzz_settings.h"
74
#endif
75
76
#include "cram.h"
77
#include "os.h"
78
#include "../htslib/hts.h"
79
#include "../hts_internal.h"
80
#include "open_trace_file.h"
81
82
#if defined(HAVE_EXTERNAL_LIBHTSCODECS)
83
#include <htscodecs/rANS_static.h>
84
#include <htscodecs/rANS_static4x16.h>
85
#include <htscodecs/arith_dynamic.h>
86
#include <htscodecs/tokenise_name3.h>
87
#include <htscodecs/fqzcomp_qual.h>
88
#include <htscodecs/varint.h> // CRAM v4.0 variable-size integers
89
#else
90
#include "../htscodecs/htscodecs/rANS_static.h"
91
#include "../htscodecs/htscodecs/rANS_static4x16.h"
92
#include "../htscodecs/htscodecs/arith_dynamic.h"
93
#include "../htscodecs/htscodecs/tokenise_name3.h"
94
#include "../htscodecs/htscodecs/fqzcomp_qual.h"
95
#include "../htscodecs/htscodecs/varint.h"
96
#endif
97
98
//#define REF_DEBUG
99
100
#ifdef REF_DEBUG
101
#include <sys/syscall.h>
102
#define gettid() (int)syscall(SYS_gettid)
103
104
#define RP(...) fprintf (stderr, __VA_ARGS__)
105
#else
106
#define RP(...)
107
#endif
108
109
#include "../htslib/hfile.h"
110
#include "../htslib/bgzf.h"
111
#include "../htslib/faidx.h"
112
#include "../hts_internal.h"
113
114
#ifndef PATH_MAX
115
#define PATH_MAX FILENAME_MAX
116
#endif
117
118
1.10M
#define TRIAL_SPAN 70
119
1.10M
#define NTRIALS 3
120
121
22.2k
#define CRAM_DEFAULT_LEVEL 5
122
123
/* ----------------------------------------------------------------------
124
 * ITF8 encoding and decoding.
125
 *
126
 * Also see the itf8_get and itf8_put macros in cram_io.h
127
 */
128
129
/*
130
 * LEGACY: consider using itf8_decode_crc.
131
 *
132
 * Reads an integer in ITF-8 encoding from 'cp' and stores it in
133
 * *val.
134
 *
135
 * Returns the number of bytes read on success
136
 *        -1 on failure
137
 */
138
0
int itf8_decode(cram_fd *fd, int32_t *val_p) {
139
0
    static int nbytes[16] = {
140
0
        0,0,0,0, 0,0,0,0,                               // 0000xxxx - 0111xxxx
141
0
        1,1,1,1,                                        // 1000xxxx - 1011xxxx
142
0
        2,2,                                            // 1100xxxx - 1101xxxx
143
0
        3,                                              // 1110xxxx
144
0
        4,                                              // 1111xxxx
145
0
    };
146
147
0
    static int nbits[16] = {
148
0
        0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
149
0
        0x3f, 0x3f, 0x3f, 0x3f,                         // 1000xxxx - 1011xxxx
150
0
        0x1f, 0x1f,                                     // 1100xxxx - 1101xxxx
151
0
        0x0f,                                           // 1110xxxx
152
0
        0x0f,                                           // 1111xxxx
153
0
    };
154
155
0
    int32_t val = hgetc(fd->fp);
156
0
    if (val == -1)
157
0
        return -1;
158
159
0
    int i = nbytes[val>>4];
160
0
    val &= nbits[val>>4];
161
162
0
    switch(i) {
163
0
    case 0:
164
0
        *val_p = val;
165
0
        return 1;
166
167
0
    case 1:
168
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
169
0
        *val_p = val;
170
0
        return 2;
171
172
0
    case 2:
173
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
174
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
175
0
        *val_p = val;
176
0
        return 3;
177
178
0
    case 3:
179
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
180
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
181
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
182
0
        *val_p = val;
183
0
        return 4;
184
185
0
    case 4: // really 3.5 more, why make it different?
186
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
187
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
188
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
189
0
        val = (val<<4) | (((unsigned char)hgetc(fd->fp)) & 0x0f);
190
0
        *val_p = val;
191
0
    }
192
193
0
    return 5;
194
0
}
195
196
280k
int itf8_decode_crc(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
197
280k
    static int nbytes[16] = {
198
280k
        0,0,0,0, 0,0,0,0,                               // 0000xxxx - 0111xxxx
199
280k
        1,1,1,1,                                        // 1000xxxx - 1011xxxx
200
280k
        2,2,                                            // 1100xxxx - 1101xxxx
201
280k
        3,                                              // 1110xxxx
202
280k
        4,                                              // 1111xxxx
203
280k
    };
204
205
280k
    static int nbits[16] = {
206
280k
        0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
207
280k
        0x3f, 0x3f, 0x3f, 0x3f,                         // 1000xxxx - 1011xxxx
208
280k
        0x1f, 0x1f,                                     // 1100xxxx - 1101xxxx
209
280k
        0x0f,                                           // 1110xxxx
210
280k
        0x0f,                                           // 1111xxxx
211
280k
    };
212
280k
    unsigned char c[5];
213
214
280k
    int32_t val = hgetc(fd->fp);
215
280k
    if (val == -1)
216
424
        return -1;
217
218
279k
    c[0]=val;
219
220
279k
    int i = nbytes[val>>4];
221
279k
    val &= nbits[val>>4];
222
223
279k
    if (i > 0) {
224
22.3k
        if (hread(fd->fp, &c[1], i) < i)
225
100
            return -1;
226
22.3k
    }
227
228
279k
    switch(i) {
229
257k
    case 0:
230
257k
        *val_p = val;
231
257k
        *crc = crc32(*crc, c, 1);
232
257k
        return 1;
233
234
9.96k
    case 1:
235
9.96k
        val = (val<<8) | c[1];
236
9.96k
        *val_p = val;
237
9.96k
        *crc = crc32(*crc, c, 2);
238
9.96k
        return 2;
239
240
5.08k
    case 2:
241
5.08k
        val = (val<<8) | c[1];
242
5.08k
        val = (val<<8) | c[2];
243
5.08k
        *val_p = val;
244
5.08k
        *crc = crc32(*crc, c, 3);
245
5.08k
        return 3;
246
247
1.73k
    case 3:
248
1.73k
        val = (val<<8) | c[1];
249
1.73k
        val = (val<<8) | c[2];
250
1.73k
        val = (val<<8) | c[3];
251
1.73k
        *val_p = val;
252
1.73k
        *crc = crc32(*crc, c, 4);
253
1.73k
        return 4;
254
255
5.49k
    case 4: // really 3.5 more, why make it different?
256
5.49k
        {
257
5.49k
            uint32_t uv = val;
258
5.49k
            uv = (uv<<8) |  c[1];
259
5.49k
            uv = (uv<<8) |  c[2];
260
5.49k
            uv = (uv<<8) |  c[3];
261
5.49k
            uv = (uv<<4) | (c[4] & 0x0f);
262
            // Avoid implementation-defined behaviour on negative values
263
5.49k
            *val_p = uv < 0x80000000UL ? (int32_t) uv : -((int32_t) (0xffffffffUL - uv)) - 1;
264
5.49k
            *crc = crc32(*crc, c, 5);
265
5.49k
        }
266
279k
    }
267
268
5.49k
    return 5;
269
279k
}
270
271
/*
272
 * Stores a value to memory in ITF-8 format.
273
 *
274
 * Returns the number of bytes required to store the number.
275
 * This is a maximum of 5 bytes.
276
 */
277
23.1M
static inline int itf8_put(char *cp, int32_t val) {
278
23.1M
    unsigned char *up = (unsigned char *)cp;
279
23.1M
    if        (!(val & ~0x00000007f)) { // 1 byte
280
22.0M
        *up = val;
281
22.0M
        return 1;
282
22.0M
    } else if (!(val & ~0x00003fff)) { // 2 byte
283
331k
        *up++ = (val >> 8 ) | 0x80;
284
331k
        *up   = val & 0xff;
285
331k
        return 2;
286
771k
    } else if (!(val & ~0x01fffff)) { // 3 byte
287
20.6k
        *up++ = (val >> 16) | 0xc0;
288
20.6k
        *up++ = (val >> 8 ) & 0xff;
289
20.6k
        *up   = val & 0xff;
290
20.6k
        return 3;
291
751k
    } else if (!(val & ~0x0fffffff)) { // 4 byte
292
562k
        *up++ = (val >> 24) | 0xe0;
293
562k
        *up++ = (val >> 16) & 0xff;
294
562k
        *up++ = (val >> 8 ) & 0xff;
295
562k
        *up   = val & 0xff;
296
562k
        return 4;
297
562k
    } else {                           // 5 byte
298
188k
        *up++ = 0xf0 | ((val>>28) & 0xff);
299
188k
        *up++ = (val >> 20) & 0xff;
300
188k
        *up++ = (val >> 12) & 0xff;
301
188k
        *up++ = (val >> 4 ) & 0xff;
302
188k
        *up = val & 0x0f;
303
188k
        return 5;
304
188k
    }
305
23.1M
}
306
307
308
/* 64-bit itf8 variant */
309
151k
static inline int ltf8_put(char *cp, int64_t val) {
310
151k
    unsigned char *up = (unsigned char *)cp;
311
151k
    if        (!(val & ~((1LL<<7)-1))) {
312
111k
        *up = val;
313
111k
        return 1;
314
111k
    } else if (!(val & ~((1LL<<(6+8))-1))) {
315
38.8k
        *up++ = (val >> 8 ) | 0x80;
316
38.8k
        *up   = val & 0xff;
317
38.8k
        return 2;
318
38.8k
    } else if (!(val & ~((1LL<<(5+2*8))-1))) {
319
1.48k
        *up++ = (val >> 16) | 0xc0;
320
1.48k
        *up++ = (val >> 8 ) & 0xff;
321
1.48k
        *up   = val & 0xff;
322
1.48k
        return 3;
323
1.48k
    } else if (!(val & ~((1LL<<(4+3*8))-1))) {
324
45
        *up++ = (val >> 24) | 0xe0;
325
45
        *up++ = (val >> 16) & 0xff;
326
45
        *up++ = (val >> 8 ) & 0xff;
327
45
        *up   = val & 0xff;
328
45
        return 4;
329
45
    } else if (!(val & ~((1LL<<(3+4*8))-1))) {
330
0
        *up++ = (val >> 32) | 0xf0;
331
0
        *up++ = (val >> 24) & 0xff;
332
0
        *up++ = (val >> 16) & 0xff;
333
0
        *up++ = (val >> 8 ) & 0xff;
334
0
        *up   = val & 0xff;
335
0
        return 5;
336
0
    } else if (!(val & ~((1LL<<(2+5*8))-1))) {
337
0
        *up++ = (val >> 40) | 0xf8;
338
0
        *up++ = (val >> 32) & 0xff;
339
0
        *up++ = (val >> 24) & 0xff;
340
0
        *up++ = (val >> 16) & 0xff;
341
0
        *up++ = (val >> 8 ) & 0xff;
342
0
        *up   = val & 0xff;
343
0
        return 6;
344
0
    } else if (!(val & ~((1LL<<(1+6*8))-1))) {
345
0
        *up++ = (val >> 48) | 0xfc;
346
0
        *up++ = (val >> 40) & 0xff;
347
0
        *up++ = (val >> 32) & 0xff;
348
0
        *up++ = (val >> 24) & 0xff;
349
0
        *up++ = (val >> 16) & 0xff;
350
0
        *up++ = (val >> 8 ) & 0xff;
351
0
        *up   = val & 0xff;
352
0
        return 7;
353
0
    } else if (!(val & ~((1LL<<(7*8))-1))) {
354
0
        *up++ = (val >> 56) | 0xfe;
355
0
        *up++ = (val >> 48) & 0xff;
356
0
        *up++ = (val >> 40) & 0xff;
357
0
        *up++ = (val >> 32) & 0xff;
358
0
        *up++ = (val >> 24) & 0xff;
359
0
        *up++ = (val >> 16) & 0xff;
360
0
        *up++ = (val >> 8 ) & 0xff;
361
0
        *up   = val & 0xff;
362
0
        return 8;
363
0
    } else {
364
0
        *up++ = 0xff;
365
0
        *up++ = (val >> 56) & 0xff;
366
0
        *up++ = (val >> 48) & 0xff;
367
0
        *up++ = (val >> 40) & 0xff;
368
0
        *up++ = (val >> 32) & 0xff;
369
0
        *up++ = (val >> 24) & 0xff;
370
0
        *up++ = (val >> 16) & 0xff;
371
0
        *up++ = (val >> 8 ) & 0xff;
372
0
        *up   = val & 0xff;
373
0
        return 9;
374
0
    }
375
151k
}
376
377
/*
378
 * Encodes and writes a single integer in ITF-8 format.
379
 * Returns 0 on success
380
 *        -1 on failure
381
 */
382
0
int itf8_encode(cram_fd *fd, int32_t val) {
383
0
    char buf[5];
384
0
    int len = itf8_put(buf, val);
385
0
    return hwrite(fd->fp, buf, len) == len ? 0 : -1;
386
0
}
387
388
const int itf8_bytes[16] = {
389
    1, 1, 1, 1,  1, 1, 1, 1,
390
    2, 2, 2, 2,  3, 3, 4, 5
391
};
392
393
const int ltf8_bytes[256] = {
394
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
395
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
396
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
397
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
398
399
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
400
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
401
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
402
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
403
404
    2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
405
    2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
406
    2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
407
    2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
408
409
    3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,
410
    3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,
411
412
    4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
413
414
    5, 5, 5, 5,  5, 5, 5, 5,  6, 6, 6, 6,  7, 7, 8, 9
415
};
416
417
/*
418
 * LEGACY: consider using ltf8_decode_crc.
419
 */
420
0
int ltf8_decode(cram_fd *fd, int64_t *val_p) {
421
0
    int c = hgetc(fd->fp);
422
0
    int64_t val = (unsigned char)c;
423
0
    if (c == -1)
424
0
        return -1;
425
426
0
    if (val < 0x80) {
427
0
        *val_p =   val;
428
0
        return 1;
429
430
0
    } else if (val < 0xc0) {
431
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
432
0
        *val_p = val & (((1LL<<(6+8)))-1);
433
0
        return 2;
434
435
0
    } else if (val < 0xe0) {
436
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
437
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
438
0
        *val_p = val & ((1LL<<(5+2*8))-1);
439
0
        return 3;
440
441
0
    } else if (val < 0xf0) {
442
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
443
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
444
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
445
0
        *val_p = val & ((1LL<<(4+3*8))-1);
446
0
        return 4;
447
448
0
    } else if (val < 0xf8) {
449
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
450
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
451
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
452
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
453
0
        *val_p = val & ((1LL<<(3+4*8))-1);
454
0
        return 5;
455
456
0
    } else if (val < 0xfc) {
457
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
458
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
459
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
460
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
461
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
462
0
        *val_p = val & ((1LL<<(2+5*8))-1);
463
0
        return 6;
464
465
0
    } else if (val < 0xfe) {
466
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
467
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
468
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
469
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
470
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
471
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
472
0
        *val_p = val & ((1LL<<(1+6*8))-1);
473
0
        return 7;
474
475
0
    } else if (val < 0xff) {
476
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
477
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
478
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
479
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
480
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
481
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
482
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
483
0
        *val_p = val & ((1LL<<(7*8))-1);
484
0
        return 8;
485
486
0
    } else {
487
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
488
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
489
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
490
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
491
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
492
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
493
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
494
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
495
0
        *val_p = val;
496
0
    }
497
498
0
    return 9;
499
0
}
500
501
2.16k
int ltf8_decode_crc(cram_fd *fd, int64_t *val_p, uint32_t *crc) {
502
2.16k
    unsigned char c[9];
503
2.16k
    int64_t val = hgetc(fd->fp);
504
2.16k
    if (val < 0)
505
12
        return -1;
506
507
2.14k
    c[0] = val;
508
509
2.14k
    if (val < 0x80) {
510
1.86k
        *val_p =   val;
511
1.86k
        *crc = crc32(*crc, c, 1);
512
1.86k
        return 1;
513
514
1.86k
    } else if (val < 0xc0) {
515
35
        int v = hgetc(fd->fp);
516
35
        if (v < 0)
517
2
            return -1;
518
33
        val = (val<<8) | (c[1]=v);
519
33
        *val_p = val & (((1LL<<(6+8)))-1);
520
33
        *crc = crc32(*crc, c, 2);
521
33
        return 2;
522
523
252
    } else if (val < 0xe0) {
524
57
        if (hread(fd->fp, &c[1], 2) < 2)
525
7
            return -1;
526
50
        val = (val<<8) | c[1];
527
50
        val = (val<<8) | c[2];
528
50
        *val_p = val & ((1LL<<(5+2*8))-1);
529
50
        *crc = crc32(*crc, c, 3);
530
50
        return 3;
531
532
195
    } else if (val < 0xf0) {
533
43
        if (hread(fd->fp, &c[1], 3) < 3)
534
4
            return -1;
535
39
        val = (val<<8) | c[1];
536
39
        val = (val<<8) | c[2];
537
39
        val = (val<<8) | c[3];
538
39
        *val_p = val & ((1LL<<(4+3*8))-1);
539
39
        *crc = crc32(*crc, c, 4);
540
39
        return 4;
541
542
152
    } else if (val < 0xf8) {
543
20
        if (hread(fd->fp, &c[1], 4) < 4)
544
1
            return -1;
545
19
        val = (val<<8) | c[1];
546
19
        val = (val<<8) | c[2];
547
19
        val = (val<<8) | c[3];
548
19
        val = (val<<8) | c[4];
549
19
        *val_p = val & ((1LL<<(3+4*8))-1);
550
19
        *crc = crc32(*crc, c, 5);
551
19
        return 5;
552
553
132
    } else if (val < 0xfc) {
554
19
        if (hread(fd->fp, &c[1], 5) < 5)
555
1
            return -1;
556
18
        val = (val<<8) | c[1];
557
18
        val = (val<<8) | c[2];
558
18
        val = (val<<8) | c[3];
559
18
        val = (val<<8) | c[4];
560
18
        val = (val<<8) | c[5];
561
18
        *val_p = val & ((1LL<<(2+5*8))-1);
562
18
        *crc = crc32(*crc, c, 6);
563
18
        return 6;
564
565
113
    } else if (val < 0xfe) {
566
19
        if (hread(fd->fp, &c[1], 6) < 6)
567
4
            return -1;
568
15
        val = (val<<8) | c[1];
569
15
        val = (val<<8) | c[2];
570
15
        val = (val<<8) | c[3];
571
15
        val = (val<<8) | c[4];
572
15
        val = (val<<8) | c[5];
573
15
        val = (val<<8) | c[6];
574
15
        *val_p = val & ((1LL<<(1+6*8))-1);
575
15
        *crc = crc32(*crc, c, 7);
576
15
        return 7;
577
578
94
    } else if (val < 0xff) {
579
12
        uint64_t uval = val;
580
12
        if (hread(fd->fp, &c[1], 7) < 7)
581
1
            return -1;
582
11
        uval = (uval<<8) | c[1];
583
11
        uval = (uval<<8) | c[2];
584
11
        uval = (uval<<8) | c[3];
585
11
        uval = (uval<<8) | c[4];
586
11
        uval = (uval<<8) | c[5];
587
11
        uval = (uval<<8) | c[6];
588
11
        uval = (uval<<8) | c[7];
589
11
        *val_p = uval & ((1ULL<<(7*8))-1);
590
11
        *crc = crc32(*crc, c, 8);
591
11
        return 8;
592
593
82
    } else {
594
82
        uint64_t uval;
595
82
        if (hread(fd->fp, &c[1], 8) < 8)
596
1
            return -1;
597
81
        uval =             c[1];
598
81
        uval = (uval<<8) | c[2];
599
81
        uval = (uval<<8) | c[3];
600
81
        uval = (uval<<8) | c[4];
601
81
        uval = (uval<<8) | c[5];
602
81
        uval = (uval<<8) | c[6];
603
81
        uval = (uval<<8) | c[7];
604
81
        uval = (uval<<8) | c[8];
605
81
        *crc = crc32(*crc, c, 9);
606
        // Avoid implementation-defined behaviour on negative values
607
81
        *val_p = c[1] < 0x80 ? (int64_t) uval : -((int64_t) (0xffffffffffffffffULL - uval)) - 1;
608
81
    }
609
610
81
    return 9;
611
2.14k
}
612
613
/*
614
 * Pushes a value in ITF8 format onto the end of a block.
615
 * This shouldn't be used for high-volume data as it is not the fastest
616
 * method.
617
 *
618
 * Returns the number of bytes written
619
 */
620
17.3M
int itf8_put_blk(cram_block *blk, int32_t val) {
621
17.3M
    char buf[5];
622
17.3M
    int sz;
623
624
17.3M
    sz = itf8_put(buf, val);
625
17.3M
    BLOCK_APPEND(blk, buf, sz);
626
17.3M
    return sz;
627
628
0
 block_err:
629
0
    return -1;
630
17.3M
}
631
632
0
int ltf8_put_blk(cram_block *blk, int64_t val) {
633
0
    char buf[9];
634
0
    int sz;
635
636
0
    sz = ltf8_put(buf, val);
637
0
    BLOCK_APPEND(blk, buf, sz);
638
0
    return sz;
639
640
0
 block_err:
641
0
    return -1;
642
0
}
643
644
304k
static int64_t safe_itf8_get(char **cp, const char *endp, int *err) {
645
304k
    const unsigned char *up = (unsigned char *)*cp;
646
647
304k
    if (endp && endp - *cp < 5 &&
648
99.8k
        (*cp >= endp || endp - *cp < itf8_bytes[up[0]>>4])) {
649
84.8k
        if (err) *err = 1;
650
84.8k
        return 0;
651
84.8k
    }
652
653
219k
    if (up[0] < 0x80) {
654
198k
        (*cp)++;
655
198k
        return up[0];
656
198k
    } else if (up[0] < 0xc0) {
657
10.9k
        (*cp)+=2;
658
10.9k
        return ((up[0] <<8) |  up[1])                           & 0x3fff;
659
10.9k
    } else if (up[0] < 0xe0) {
660
2.26k
        (*cp)+=3;
661
2.26k
        return ((up[0]<<16) | (up[1]<< 8) |  up[2])             & 0x1fffff;
662
8.23k
    } else if (up[0] < 0xf0) {
663
3.17k
        (*cp)+=4;
664
3.17k
        uint32_t uv = (((uint32_t)up[0]<<24) | (up[1]<<16) | (up[2]<<8) | up[3]) & 0x0fffffff;
665
3.17k
        return (int32_t)uv;
666
5.05k
    } else {
667
5.05k
        (*cp)+=5;
668
5.05k
        uint32_t uv = (((uint32_t)up[0] & 0x0f)<<28) | (up[1]<<20) | (up[2]<<12) | (up[3]<<4) | (up[4] & 0x0f);
669
5.05k
        return (int32_t)uv;
670
5.05k
    }
671
219k
}
672
673
797
static int64_t safe_ltf8_get(char **cp, const char *endp, int *err) {
674
797
    unsigned char *up = (unsigned char *)*cp;
675
676
797
    if (endp && endp - *cp < 9 &&
677
746
        (*cp >= endp || endp - *cp < ltf8_bytes[up[0]])) {
678
321
        if (err) *err = 1;
679
321
        return 0;
680
321
    }
681
682
476
    if (up[0] < 0x80) {
683
144
        (*cp)++;
684
144
        return up[0];
685
332
    } else if (up[0] < 0xc0) {
686
60
        (*cp)+=2;
687
60
        return (((uint64_t)up[0]<< 8) |
688
60
                 (uint64_t)up[1]) & (((1LL<<(6+8)))-1);
689
272
    } else if (up[0] < 0xe0) {
690
32
        (*cp)+=3;
691
32
        return (((uint64_t)up[0]<<16) |
692
32
                ((uint64_t)up[1]<< 8) |
693
32
                 (uint64_t)up[2]) & ((1LL<<(5+2*8))-1);
694
240
    } else if (up[0] < 0xf0) {
695
51
        (*cp)+=4;
696
51
        return (((uint64_t)up[0]<<24) |
697
51
                ((uint64_t)up[1]<<16) |
698
51
                ((uint64_t)up[2]<< 8) |
699
51
                 (uint64_t)up[3]) & ((1LL<<(4+3*8))-1);
700
189
    } else if (up[0] < 0xf8) {
701
138
        (*cp)+=5;
702
138
        return (((uint64_t)up[0]<<32) |
703
138
                ((uint64_t)up[1]<<24) |
704
138
                ((uint64_t)up[2]<<16) |
705
138
                ((uint64_t)up[3]<< 8) |
706
138
                 (uint64_t)up[4]) & ((1LL<<(3+4*8))-1);
707
138
    } else if (up[0] < 0xfc) {
708
9
        (*cp)+=6;
709
9
        return (((uint64_t)up[0]<<40) |
710
9
                ((uint64_t)up[1]<<32) |
711
9
                ((uint64_t)up[2]<<24) |
712
9
                ((uint64_t)up[3]<<16) |
713
9
                ((uint64_t)up[4]<< 8) |
714
9
                 (uint64_t)up[5]) & ((1LL<<(2+5*8))-1);
715
42
    } else if (up[0] < 0xfe) {
716
6
        (*cp)+=7;
717
6
        return (((uint64_t)up[0]<<48) |
718
6
                ((uint64_t)up[1]<<40) |
719
6
                ((uint64_t)up[2]<<32) |
720
6
                ((uint64_t)up[3]<<24) |
721
6
                ((uint64_t)up[4]<<16) |
722
6
                ((uint64_t)up[5]<< 8) |
723
6
                 (uint64_t)up[6]) & ((1LL<<(1+6*8))-1);
724
36
    } else if (up[0] < 0xff) {
725
3
        (*cp)+=8;
726
3
        return (((uint64_t)up[1]<<48) |
727
3
                ((uint64_t)up[2]<<40) |
728
3
                ((uint64_t)up[3]<<32) |
729
3
                ((uint64_t)up[4]<<24) |
730
3
                ((uint64_t)up[5]<<16) |
731
3
                ((uint64_t)up[6]<< 8) |
732
3
                 (uint64_t)up[7]) & ((1LL<<(7*8))-1);
733
33
    } else {
734
33
        (*cp)+=9;
735
33
        return (((uint64_t)up[1]<<56) |
736
33
                ((uint64_t)up[2]<<48) |
737
33
                ((uint64_t)up[3]<<40) |
738
33
                ((uint64_t)up[4]<<32) |
739
33
                ((uint64_t)up[5]<<24) |
740
33
                ((uint64_t)up[6]<<16) |
741
33
                ((uint64_t)up[7]<< 8) |
742
33
                 (uint64_t)up[8]);
743
33
    }
744
476
}
745
746
// Wrapper for now
747
5.78M
static int safe_itf8_put(char *cp, char *cp_end, int32_t val) {
748
5.78M
    return itf8_put(cp, val);
749
5.78M
}
750
751
151k
static int safe_ltf8_put(char *cp, char *cp_end, int64_t val) {
752
151k
    return ltf8_put(cp, val);
753
151k
}
754
755
1.28M
static int itf8_size(int64_t v) {
756
1.28M
    return ((!((v)&~0x7f))?1:(!((v)&~0x3fff))?2:(!((v)&~0x1fffff))?3:(!((v)&~0xfffffff))?4:5);
757
1.28M
}
758
759
//-----------------------------------------------------------------------------
760
761
// CRAM v4.0 onwards uses a different variable sized integer encoding
762
// that is size agnostic.
763
764
// Local interface to varint.h inline version, so we can use in func ptr.
765
// Note a lot of these use the unsigned interface but take signed int64_t.
766
// This is because the old CRAM ITF8 inteface had signed -1 as unsigned
767
// 0xffffffff.
768
5.23k
static int uint7_size(int64_t v) {
769
5.23k
    return var_size_u64(v);
770
5.23k
}
771
772
3.77k
static int64_t uint7_get_32(char **cp, const char *endp, int *err) {
773
3.77k
    uint32_t val;
774
3.77k
    int nb = var_get_u32((uint8_t *)(*cp), (const uint8_t *)endp, &val);
775
3.77k
    (*cp) += nb;
776
3.77k
    if (!nb && err) *err = 1;
777
3.77k
    return val;
778
3.77k
}
779
780
0
static int64_t sint7_get_32(char **cp, const char *endp, int *err) {
781
0
    int32_t val;
782
0
    int nb = var_get_s32((uint8_t *)(*cp), (const uint8_t *)endp, &val);
783
0
    (*cp) += nb;
784
0
    if (!nb && err) *err = 1;
785
0
    return val;
786
0
}
787
788
12.7M
static int64_t uint7_get_64(char **cp, const char *endp, int *err) {
789
12.7M
    uint64_t val;
790
12.7M
    int nb = var_get_u64((uint8_t *)(*cp), (const uint8_t *)endp, &val);
791
12.7M
    (*cp) += nb;
792
12.7M
    if (!nb && err) *err = 1;
793
12.7M
    return val;
794
12.7M
}
795
796
63
static int64_t sint7_get_64(char **cp, const char *endp, int *err) {
797
63
    int64_t val;
798
63
    int nb = var_get_s64((uint8_t *)(*cp), (const uint8_t *)endp, &val);
799
63
    (*cp) += nb;
800
63
    if (!nb && err) *err = 1;
801
63
    return val;
802
63
}
803
804
0
static int uint7_put_32(char *cp, char *endp, int32_t val) {
805
0
    return var_put_u32((uint8_t *)cp, (uint8_t *)endp, val);
806
0
}
807
808
0
static int sint7_put_32(char *cp, char *endp, int32_t val) {
809
0
    return var_put_s32((uint8_t *)cp, (uint8_t *)endp, val);
810
0
}
811
812
0
static int uint7_put_64(char *cp, char *endp, int64_t val) {
813
0
    return var_put_u64((uint8_t *)cp, (uint8_t *)endp, val);
814
0
}
815
816
0
static int sint7_put_64(char *cp, char *endp, int64_t val) {
817
0
    return var_put_s64((uint8_t *)cp, (uint8_t *)endp, val);
818
0
}
819
820
// Put direct to to cram_block
821
0
static int uint7_put_blk_32(cram_block *blk, int32_t v) {
822
0
    uint8_t buf[10];
823
0
    int sz = var_put_u32(buf, buf+10, v);
824
0
    BLOCK_APPEND(blk, buf, sz);
825
0
    return sz;
826
827
0
 block_err:
828
0
    return -1;
829
0
}
830
831
0
static int sint7_put_blk_32(cram_block *blk, int32_t v) {
832
0
    uint8_t buf[10];
833
0
    int sz = var_put_s32(buf, buf+10, v);
834
0
    BLOCK_APPEND(blk, buf, sz);
835
0
    return sz;
836
837
0
 block_err:
838
0
    return -1;
839
0
}
840
841
0
static int uint7_put_blk_64(cram_block *blk, int64_t v) {
842
0
    uint8_t buf[10];
843
0
    int sz = var_put_u64(buf, buf+10, v);
844
0
    BLOCK_APPEND(blk, buf, sz);
845
0
    return sz;
846
847
0
 block_err:
848
0
    return -1;
849
0
}
850
851
0
static int sint7_put_blk_64(cram_block *blk, int64_t v) {
852
0
    uint8_t buf[10];
853
0
    int sz = var_put_s64(buf, buf+10, v);
854
0
    BLOCK_APPEND(blk, buf, sz);
855
0
    return sz;
856
857
0
 block_err:
858
0
    return -1;
859
0
}
860
861
// Decode 32-bits with CRC update from cram_fd
862
27.4k
static int uint7_decode_crc32(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
863
27.4k
    uint8_t b[5], i = 0;
864
27.4k
    int c;
865
27.4k
    uint32_t v = 0;
866
867
#ifdef VARINT2
868
    b[0] = hgetc(fd->fp);
869
    if (b[0] < 177) {
870
    } else if (b[0] < 241) {
871
        b[1] = hgetc(fd->fp);
872
    } else if (b[0] < 249) {
873
        b[1] = hgetc(fd->fp);
874
        b[2] = hgetc(fd->fp);
875
    } else {
876
        int n = b[0]+2, z = 1;
877
        while (n-- >= 249)
878
            b[z++] = hgetc(fd->fp);
879
    }
880
    i = var_get_u32(b, NULL, &v);
881
#else
882
//    // Little endian
883
//    int s = 0;
884
//    do {
885
//        b[i++] = c = hgetc(fd->fp);
886
//        if (c < 0)
887
//            return -1;
888
//        v |= (c & 0x7f) << s;
889
//      s += 7;
890
//    } while (i < 5 && (c & 0x80));
891
892
    // Big endian, see also htscodecs/varint.h
893
31.0k
    do {
894
31.0k
        b[i++] = c = hgetc(fd->fp);
895
31.0k
        if (c < 0)
896
86
            return -1;
897
30.9k
        v = (v<<7) | (c & 0x7f);
898
30.9k
    } while (i < 5 && (c & 0x80));
899
27.4k
#endif
900
27.4k
    *crc = crc32(*crc, b, i);
901
902
27.4k
    *val_p = v;
903
27.4k
    return i;
904
27.4k
}
905
906
// Decode 32-bits with CRC update from cram_fd
907
3.40k
static int sint7_decode_crc32(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
908
3.40k
    uint8_t b[5], i = 0;
909
3.40k
    int c;
910
3.40k
    uint32_t v = 0;
911
912
#ifdef VARINT2
913
    b[0] = hgetc(fd->fp);
914
    if (b[0] < 177) {
915
    } else if (b[0] < 241) {
916
        b[1] = hgetc(fd->fp);
917
    } else if (b[0] < 249) {
918
        b[1] = hgetc(fd->fp);
919
        b[2] = hgetc(fd->fp);
920
    } else {
921
        int n = b[0]+2, z = 1;
922
        while (n-- >= 249)
923
            b[z++] = hgetc(fd->fp);
924
    }
925
    i = var_get_u32(b, NULL, &v);
926
#else
927
//    // Little endian
928
//    int s = 0;
929
//    do {
930
//        b[i++] = c = hgetc(fd->fp);
931
//        if (c < 0)
932
//            return -1;
933
//        v |= (c & 0x7f) << s;
934
//      s += 7;
935
//    } while (i < 5 && (c & 0x80));
936
937
    // Big endian, see also htscodecs/varint.h
938
4.31k
    do {
939
4.31k
        b[i++] = c = hgetc(fd->fp);
940
4.31k
        if (c < 0)
941
12
            return -1;
942
4.30k
        v = (v<<7) | (c & 0x7f);
943
4.30k
    } while (i < 5 && (c & 0x80));
944
3.39k
#endif
945
3.39k
    *crc = crc32(*crc, b, i);
946
947
3.39k
    *val_p = (v>>1) ^ -(v&1);
948
3.39k
    return i;
949
3.40k
}
950
951
952
// Decode 64-bits with CRC update from cram_fd
953
13.5k
static int uint7_decode_crc64(cram_fd *fd, int64_t *val_p, uint32_t *crc) {
954
13.5k
    uint8_t b[10], i = 0;
955
13.5k
    int c;
956
13.5k
    uint64_t v = 0;
957
958
#ifdef VARINT2
959
    b[0] = hgetc(fd->fp);
960
    if (b[0] < 177) {
961
    } else if (b[0] < 241) {
962
        b[1] = hgetc(fd->fp);
963
    } else if (b[0] < 249) {
964
        b[1] = hgetc(fd->fp);
965
        b[2] = hgetc(fd->fp);
966
    } else {
967
        int n = b[0]+2, z = 1;
968
        while (n-- >= 249)
969
            b[z++] = hgetc(fd->fp);
970
    }
971
    i = var_get_u64(b, NULL, &v);
972
#else
973
//    // Little endian
974
//    int s = 0;
975
//    do {
976
//        b[i++] = c = hgetc(fd->fp);
977
//        if (c < 0)
978
//            return -1;
979
//        v |= (c & 0x7f) << s;
980
//      s += 7;
981
//    } while (i < 10 && (c & 0x80));
982
983
    // Big endian, see also htscodecs/varint.h
984
14.4k
    do {
985
14.4k
        b[i++] = c = hgetc(fd->fp);
986
14.4k
        if (c < 0)
987
33
            return -1;
988
14.4k
        v = (v<<7) | (c & 0x7f);
989
14.4k
    } while (i < 5 && (c & 0x80));
990
13.4k
#endif
991
13.4k
    *crc = crc32(*crc, b, i);
992
993
13.4k
    *val_p = v;
994
13.4k
    return i;
995
13.5k
}
996
997
//-----------------------------------------------------------------------------
998
999
/*
1000
 * Decodes a 32-bit little endian value from fd and stores in val.
1001
 *
1002
 * Returns the number of bytes read on success
1003
 *         -1 on failure
1004
 */
1005
17.0k
static int int32_decode(cram_fd *fd, int32_t *val) {
1006
17.0k
    int32_t i;
1007
17.0k
    if (4 != hread(fd->fp, &i, 4))
1008
60
        return -1;
1009
1010
16.9k
    *val = le_int4(i);
1011
16.9k
    return 4;
1012
17.0k
}
1013
1014
/*
1015
 * Encodes a 32-bit little endian value 'val' and writes to fd.
1016
 *
1017
 * Returns the number of bytes written on success
1018
 *         -1 on failure
1019
 */
1020
342k
static int int32_encode(cram_fd *fd, int32_t val) {
1021
342k
    uint32_t v = le_int4(val);
1022
342k
    if (4 != hwrite(fd->fp, &v, 4))
1023
0
        return -1;
1024
1025
342k
    return 4;
1026
342k
}
1027
1028
/* As int32_decoded/encode, but from/to blocks instead of cram_fd */
1029
788
int int32_get_blk(cram_block *b, int32_t *val) {
1030
788
    if (b->uncomp_size - BLOCK_SIZE(b) < 4)
1031
80
        return -1;
1032
1033
708
    uint32_t v =
1034
708
         ((uint32_t) b->data[b->byte  ])        |
1035
708
        (((uint32_t) b->data[b->byte+1]) <<  8) |
1036
708
        (((uint32_t) b->data[b->byte+2]) << 16) |
1037
708
        (((uint32_t) b->data[b->byte+3]) << 24);
1038
    // Avoid implementation-defined behaviour on negative values
1039
708
    *val = v < 0x80000000U ? (int32_t) v : -((int32_t) (0xffffffffU - v)) - 1;
1040
708
    BLOCK_SIZE(b) += 4;
1041
708
    return 4;
1042
788
}
1043
1044
/* As int32_decoded/encode, but from/to blocks instead of cram_fd */
1045
9.67k
int int32_put_blk(cram_block *b, int32_t val) {
1046
9.67k
    unsigned char cp[4];
1047
9.67k
    uint32_t v = val;
1048
9.67k
    cp[0] = ( v      & 0xff);
1049
9.67k
    cp[1] = ((v>>8)  & 0xff);
1050
9.67k
    cp[2] = ((v>>16) & 0xff);
1051
9.67k
    cp[3] = ((v>>24) & 0xff);
1052
1053
9.67k
    BLOCK_APPEND(b, cp, 4);
1054
9.67k
    return 0;
1055
1056
0
 block_err:
1057
0
    return -1;
1058
9.67k
}
1059
1060
#ifdef HAVE_LIBDEFLATE
1061
/* ----------------------------------------------------------------------
1062
 * libdeflate compression code, with interface to match
1063
 * zlib_mem_{in,de}flate for simplicity elsewhere.
1064
 */
1065
1066
// Named the same as the version that uses zlib as we always use libdeflate for
1067
// decompression when available.
1068
char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) {
1069
    struct libdeflate_decompressor *z = libdeflate_alloc_decompressor();
1070
    if (!z) {
1071
        hts_log_error("Call to libdeflate_alloc_decompressor failed");
1072
        return NULL;
1073
    }
1074
1075
    uint8_t *data = NULL, *new_data;
1076
    if (!*size)
1077
        *size = csize*2;
1078
    for(;;) {
1079
        new_data = realloc(data, *size);
1080
        if (!new_data) {
1081
            hts_log_error("Memory allocation failure");
1082
            goto fail;
1083
        }
1084
        data = new_data;
1085
1086
        int ret = libdeflate_gzip_decompress(z, cdata, csize, data, *size, size);
1087
1088
        // Auto grow output buffer size if needed and try again.
1089
        // Fortunately for all bar one call of this we know the size already.
1090
        if (ret == LIBDEFLATE_INSUFFICIENT_SPACE) {
1091
            (*size) *= 1.5;
1092
            continue;
1093
        }
1094
1095
        if (ret != LIBDEFLATE_SUCCESS) {
1096
            hts_log_error("Inflate operation failed: %d", ret);
1097
            goto fail;
1098
        } else {
1099
            break;
1100
        }
1101
    }
1102
1103
    libdeflate_free_decompressor(z);
1104
    return (char *)data;
1105
1106
 fail:
1107
    libdeflate_free_decompressor(z);
1108
    free(data);
1109
    return NULL;
1110
}
1111
1112
// Named differently as we use both zlib/libdeflate for compression.
1113
static char *libdeflate_deflate(char *data, size_t size, size_t *cdata_size,
1114
                                int level, int strat) {
1115
    level = level > 0 ? level : 6; // libdeflate doesn't honour -1 as default
1116
    level *= 1.23;     // NB levels go up to 12 here; 5 onwards is +1
1117
    level += level>=8; // 5,6,7->6,7,8  8->10  9->12
1118
    if (level > 12) level = 12;
1119
1120
    if (strat == Z_RLE) // not supported by libdeflate
1121
        level = 1;
1122
1123
    struct libdeflate_compressor *z = libdeflate_alloc_compressor(level);
1124
    if (!z) {
1125
        hts_log_error("Call to libdeflate_alloc_compressor failed");
1126
        return NULL;
1127
    }
1128
1129
    unsigned char *cdata = NULL; /* Compressed output */
1130
    size_t cdata_alloc;
1131
    cdata = malloc(cdata_alloc = size*1.05+100);
1132
    if (!cdata) {
1133
        hts_log_error("Memory allocation failure");
1134
        libdeflate_free_compressor(z);
1135
        return NULL;
1136
    }
1137
1138
    *cdata_size = libdeflate_gzip_compress(z, data, size, cdata, cdata_alloc);
1139
    libdeflate_free_compressor(z);
1140
1141
    if (*cdata_size == 0) {
1142
        hts_log_error("Call to libdeflate_gzip_compress failed");
1143
        free(cdata);
1144
        return NULL;
1145
    }
1146
1147
    return (char *)cdata;
1148
}
1149
1150
#else
1151
1152
/* ----------------------------------------------------------------------
1153
 * zlib compression code - from Gap5's tg_iface_g.c
1154
 * They're static here as they're only used within the cram_compress_block
1155
 * and cram_uncompress_block functions, which are the external interface.
1156
 */
1157
21
char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) {
1158
21
    z_stream s;
1159
21
    unsigned char *data = NULL; /* Uncompressed output */
1160
21
    int data_alloc = 0;
1161
21
    int err;
1162
1163
    /* Starting point at uncompressed size, and scale after that */
1164
21
    data = malloc(data_alloc = csize*1.2+100);
1165
21
    if (!data)
1166
0
        return NULL;
1167
1168
    /* Initialise zlib stream */
1169
21
    s.zalloc = Z_NULL; /* use default allocation functions */
1170
21
    s.zfree  = Z_NULL;
1171
21
    s.opaque = Z_NULL;
1172
21
    s.next_in  = (unsigned char *)cdata;
1173
21
    s.avail_in = csize;
1174
21
    s.total_in = 0;
1175
21
    s.next_out  = data;
1176
21
    s.avail_out = data_alloc;
1177
21
    s.total_out = 0;
1178
1179
    //err = inflateInit(&s);
1180
21
    err = inflateInit2(&s, 15 + 32);
1181
21
    if (err != Z_OK) {
1182
0
        hts_log_error("Call to zlib inflateInit failed: %s", s.msg);
1183
0
        free(data);
1184
0
        return NULL;
1185
0
    }
1186
1187
    /* Decode to 'data' array */
1188
33
    for (;s.avail_in;) {
1189
20
        unsigned char *data_tmp;
1190
20
        int alloc_inc;
1191
1192
20
        s.next_out = &data[s.total_out];
1193
20
        err = inflate(&s, Z_NO_FLUSH);
1194
20
        if (err == Z_STREAM_END)
1195
0
            break;
1196
1197
20
        if (err != Z_OK) {
1198
8
            hts_log_error("Call to zlib inflate failed: %s", s.msg);
1199
8
            free(data);
1200
8
            inflateEnd(&s);
1201
8
            return NULL;
1202
8
        }
1203
1204
        /* More to come, so realloc based on growth so far */
1205
12
        alloc_inc = (double)s.avail_in/s.total_in * s.total_out + 100;
1206
12
        data = realloc((data_tmp = data), data_alloc += alloc_inc);
1207
12
        if (!data) {
1208
0
            free(data_tmp);
1209
0
            inflateEnd(&s);
1210
0
            return NULL;
1211
0
        }
1212
12
        s.avail_out += alloc_inc;
1213
12
    }
1214
13
    inflateEnd(&s);
1215
1216
13
    *size = s.total_out;
1217
13
    return (char *)data;
1218
21
}
1219
#endif
1220
1221
#if !defined(HAVE_LIBDEFLATE) || LIBDEFLATE_VERSION_MAJOR < 1 || (LIBDEFLATE_VERSION_MAJOR ==  1 && LIBDEFLATE_VERSION_MINOR <= 8)
1222
static char *zlib_mem_deflate(char *data, size_t size, size_t *cdata_size,
1223
249k
                              int level, int strat) {
1224
249k
    z_stream s;
1225
249k
    unsigned char *cdata = NULL; /* Compressed output */
1226
249k
    int cdata_alloc = 0;
1227
249k
    int cdata_pos = 0;
1228
249k
    int err;
1229
1230
249k
    cdata = malloc(cdata_alloc = size*1.05+100);
1231
249k
    if (!cdata)
1232
0
        return NULL;
1233
249k
    cdata_pos = 0;
1234
1235
    /* Initialise zlib stream */
1236
249k
    s.zalloc = Z_NULL; /* use default allocation functions */
1237
249k
    s.zfree  = Z_NULL;
1238
249k
    s.opaque = Z_NULL;
1239
249k
    s.next_in  = (unsigned char *)data;
1240
249k
    s.avail_in = size;
1241
249k
    s.total_in = 0;
1242
249k
    s.next_out  = cdata;
1243
249k
    s.avail_out = cdata_alloc;
1244
249k
    s.total_out = 0;
1245
249k
    s.data_type = Z_BINARY;
1246
1247
249k
    err = deflateInit2(&s, level, Z_DEFLATED, 15|16, 9, strat);
1248
249k
    if (err != Z_OK) {
1249
0
        hts_log_error("Call to zlib deflateInit2 failed: %s", s.msg);
1250
0
        return NULL;
1251
0
    }
1252
1253
    /* Encode to 'cdata' array */
1254
499k
    for (;s.avail_in;) {
1255
249k
        s.next_out = &cdata[cdata_pos];
1256
249k
        s.avail_out = cdata_alloc - cdata_pos;
1257
249k
        if (cdata_alloc - cdata_pos <= 0) {
1258
0
            hts_log_error("Deflate produced larger output than expected");
1259
0
            return NULL;
1260
0
        }
1261
249k
        err = deflate(&s, Z_NO_FLUSH);
1262
249k
        cdata_pos = cdata_alloc - s.avail_out;
1263
249k
        if (err != Z_OK) {
1264
0
            hts_log_error("Call to zlib deflate failed: %s", s.msg);
1265
0
            break;
1266
0
        }
1267
249k
    }
1268
249k
    if (deflate(&s, Z_FINISH) != Z_STREAM_END) {
1269
0
        hts_log_error("Call to zlib deflate failed: %s", s.msg);
1270
0
    }
1271
249k
    *cdata_size = s.total_out;
1272
1273
249k
    if (deflateEnd(&s) != Z_OK) {
1274
0
        hts_log_error("Call to zlib deflate failed: %s", s.msg);
1275
0
    }
1276
249k
    return (char *)cdata;
1277
249k
}
1278
#endif
1279
1280
#ifdef HAVE_LIBLZMA
1281
/* ------------------------------------------------------------------------ */
1282
/*
1283
 * Data compression routines using liblzma (xz)
1284
 *
1285
 * On a test set this shrunk the main db from 136157104 bytes to 114796168, but
1286
 * caused tg_index to grow from 2m43.707s to 15m3.961s. Exporting as bfastq
1287
 * went from 18.3s to 36.3s. So decompression suffers too, but not as bad
1288
 * as compression times.
1289
 *
1290
 * For now we disable this functionality. If it's to be reenabled make sure you
1291
 * improve the mem_inflate implementation as it's just a test hack at the
1292
 * moment.
1293
 */
1294
1295
static char *lzma_mem_deflate(char *data, size_t size, size_t *cdata_size,
1296
0
                              int level) {
1297
0
    char *out;
1298
0
    size_t out_size = lzma_stream_buffer_bound(size);
1299
0
    *cdata_size = 0;
1300
1301
0
    out = malloc(out_size);
1302
1303
    /* Single call compression */
1304
0
    if (LZMA_OK != lzma_easy_buffer_encode(level, LZMA_CHECK_CRC32, NULL,
1305
0
                                           (uint8_t *)data, size,
1306
0
                                           (uint8_t *)out, cdata_size,
1307
0
                                           out_size))
1308
0
        return NULL;
1309
1310
0
    return out;
1311
0
}
1312
1313
19
static char *lzma_mem_inflate(char *cdata, size_t csize, size_t *size) {
1314
19
    lzma_stream strm = LZMA_STREAM_INIT;
1315
19
    size_t out_size = 0, out_pos = 0;
1316
19
    char *out = NULL, *new_out;
1317
19
    int r;
1318
1319
    /* Initiate the decoder */
1320
19
    if (LZMA_OK != lzma_stream_decoder(&strm, lzma_easy_decoder_memusage(9), 0))
1321
0
        return NULL;
1322
1323
    /* Decode loop */
1324
19
    strm.avail_in = csize;
1325
19
    strm.next_in = (uint8_t *)cdata;
1326
1327
26
    for (;strm.avail_in;) {
1328
17
        if (strm.avail_in > out_size - out_pos) {
1329
17
            out_size += strm.avail_in * 4 + 32768;
1330
17
            new_out = realloc(out, out_size);
1331
17
            if (!new_out)
1332
0
                goto fail;
1333
17
            out = new_out;
1334
17
        }
1335
17
        strm.avail_out = out_size - out_pos;
1336
17
        strm.next_out = (uint8_t *)&out[out_pos];
1337
1338
17
        r = lzma_code(&strm, LZMA_RUN);
1339
17
        if (LZMA_OK != r && LZMA_STREAM_END != r) {
1340
10
            hts_log_error("LZMA decode failure (error %d)", r);
1341
10
            goto fail;
1342
10
        }
1343
1344
7
        out_pos = strm.total_out;
1345
1346
7
        if (r == LZMA_STREAM_END)
1347
0
            break;
1348
7
    }
1349
1350
    /* finish up any unflushed data; necessary? */
1351
9
    r = lzma_code(&strm, LZMA_FINISH);
1352
9
    if (r != LZMA_OK && r != LZMA_STREAM_END) {
1353
0
        hts_log_error("Call to lzma_code failed with error %d", r);
1354
0
        goto fail;
1355
0
    }
1356
1357
9
    new_out = realloc(out, strm.total_out > 0 ? strm.total_out : 1);
1358
9
    if (new_out)
1359
9
        out = new_out;
1360
9
    *size = strm.total_out;
1361
1362
9
    lzma_end(&strm);
1363
1364
9
    return out;
1365
1366
10
 fail:
1367
10
    lzma_end(&strm);
1368
10
    free(out);
1369
10
    return NULL;
1370
9
}
1371
#endif
1372
1373
/* ----------------------------------------------------------------------
1374
 * CRAM blocks - the dynamically growable data block. We have code to
1375
 * create, update, (un)compress and read/write.
1376
 *
1377
 * These are derived from the deflate_interlaced.c blocks, but with the
1378
 * CRAM extension of content types and IDs.
1379
 */
1380
1381
/*
1382
 * Allocates a new cram_block structure with a specified content_type and
1383
 * id.
1384
 *
1385
 * Returns block pointer on success
1386
 *         NULL on failure
1387
 */
1388
cram_block *cram_new_block(enum cram_content_type content_type,
1389
1.02M
                           int content_id) {
1390
1.02M
    cram_block *b = malloc(sizeof(*b));
1391
1.02M
    if (!b)
1392
0
        return NULL;
1393
1.02M
    b->method = b->orig_method = RAW;
1394
1.02M
    b->content_type = content_type;
1395
1.02M
    b->content_id = content_id;
1396
1.02M
    b->comp_size = 0;
1397
1.02M
    b->uncomp_size = 0;
1398
1.02M
    b->data = NULL;
1399
1.02M
    b->alloc = 0;
1400
1.02M
    b->byte = 0;
1401
1.02M
    b->bit = 7; // MSB
1402
1.02M
    b->crc32 = 0;
1403
1.02M
    b->idx = 0;
1404
1.02M
    b->m = NULL;
1405
1406
1.02M
    return b;
1407
1.02M
}
1408
1409
/*
1410
 * Reads a block from a cram file.
1411
 * Returns cram_block pointer on success.
1412
 *         NULL on failure
1413
 */
1414
14.7k
cram_block *cram_read_block(cram_fd *fd) {
1415
14.7k
    cram_block *b = malloc(sizeof(*b));
1416
14.7k
    unsigned char c;
1417
14.7k
    uint32_t crc = 0;
1418
14.7k
    if (!b)
1419
0
        return NULL;
1420
1421
    //fprintf(stderr, "Block at %d\n", (int)ftell(fd->fp));
1422
1423
14.7k
    if (-1 == (b->method      = hgetc(fd->fp))) { free(b); return NULL; }
1424
14.7k
    c = b->method; crc = crc32(crc, &c, 1);
1425
14.7k
    if (-1 == (b->content_type= hgetc(fd->fp))) { free(b); return NULL; }
1426
14.7k
    c = b->content_type; crc = crc32(crc, &c, 1);
1427
14.7k
    if (-1 == fd->vv.varint_decode32_crc(fd, &b->content_id, &crc))  { free(b); return NULL; }
1428
14.6k
    if (-1 == fd->vv.varint_decode32_crc(fd, &b->comp_size, &crc))   { free(b); return NULL; }
1429
14.6k
    if (-1 == fd->vv.varint_decode32_crc(fd, &b->uncomp_size, &crc)) { free(b); return NULL; }
1430
1431
    //fprintf(stderr, "  method %d, ctype %d, cid %d, csize %d, ucsize %d\n",
1432
    //      b->method, b->content_type, b->content_id, b->comp_size, b->uncomp_size);
1433
1434
14.6k
    if (b->method == RAW) {
1435
6.30k
        if (b->uncomp_size < 0 || b->comp_size != b->uncomp_size) {
1436
175
            free(b);
1437
175
            return NULL;
1438
175
        }
1439
6.12k
        b->alloc = b->uncomp_size;
1440
6.12k
        if (!(b->data = malloc(b->uncomp_size))){ free(b); return NULL; }
1441
6.12k
        if (b->uncomp_size != hread(fd->fp, b->data, b->uncomp_size)) {
1442
27
            free(b->data);
1443
27
            free(b);
1444
27
            return NULL;
1445
27
        }
1446
8.30k
    } else {
1447
8.30k
        if (b->comp_size < 0 || b->uncomp_size < 0) {
1448
125
            free(b);
1449
125
            return NULL;
1450
125
        }
1451
8.18k
        b->alloc = b->comp_size;
1452
8.18k
        if (!(b->data = malloc(b->comp_size)))  { free(b); return NULL; }
1453
8.18k
        if (b->comp_size != hread(fd->fp, b->data, b->comp_size)) {
1454
64
            free(b->data);
1455
64
            free(b);
1456
64
            return NULL;
1457
64
        }
1458
8.18k
    }
1459
1460
14.2k
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
1461
2.27k
        if (-1 == int32_decode(fd, (int32_t *)&b->crc32)) {
1462
8
            free(b->data);
1463
8
            free(b);
1464
8
            return NULL;
1465
8
        }
1466
1467
2.26k
        b->crc32_checked = fd->ignore_md5;
1468
2.26k
        b->crc_part = crc;
1469
11.9k
    } else {
1470
11.9k
        b->crc32_checked = 1; // CRC not present
1471
11.9k
    }
1472
1473
14.2k
    b->orig_method = b->method;
1474
14.2k
    b->idx = 0;
1475
14.2k
    b->byte = 0;
1476
14.2k
    b->bit = 7; // MSB
1477
1478
14.2k
    return b;
1479
14.2k
}
1480
1481
1482
/*
1483
 * Computes the size of a cram block, including the block
1484
 * header itself.
1485
 */
1486
0
uint32_t cram_block_size(cram_block *b) {
1487
0
    unsigned char dat[100], *cp = dat;;
1488
0
    uint32_t sz;
1489
1490
0
    *cp++ = b->method;
1491
0
    *cp++ = b->content_type;
1492
0
    cp += itf8_put((char*)cp, b->content_id);
1493
0
    cp += itf8_put((char*)cp, b->comp_size);
1494
0
    cp += itf8_put((char*)cp, b->uncomp_size);
1495
1496
0
    sz = cp-dat + 4;
1497
0
    sz += b->method == RAW ? b->uncomp_size : b->comp_size;
1498
1499
0
    return sz;
1500
0
}
1501
1502
/*
1503
 * Writes a CRAM block.
1504
 * Returns 0 on success
1505
 *        -1 on failure
1506
 */
1507
342k
int cram_write_block(cram_fd *fd, cram_block *b) {
1508
342k
    char vardata[100];
1509
342k
    int vardata_o = 0;
1510
1511
342k
    assert(b->method != RAW || (b->comp_size == b->uncomp_size));
1512
1513
342k
    if (hputc(b->method,       fd->fp)  == EOF) return -1;
1514
342k
    if (hputc(b->content_type, fd->fp)  == EOF) return -1;
1515
342k
    vardata_o += fd->vv.varint_put32(vardata          , vardata+100, b->content_id);
1516
342k
    vardata_o += fd->vv.varint_put32(vardata+vardata_o, vardata+100, b->comp_size);
1517
342k
    vardata_o += fd->vv.varint_put32(vardata+vardata_o, vardata+100, b->uncomp_size);
1518
342k
    if (vardata_o != hwrite(fd->fp, vardata, vardata_o))
1519
0
        return -1;
1520
1521
342k
    if (b->data) {
1522
305k
        if (b->method == RAW) {
1523
260k
            if (b->uncomp_size != hwrite(fd->fp, b->data, b->uncomp_size))
1524
0
                return -1;
1525
260k
        } else {
1526
44.4k
            if (b->comp_size != hwrite(fd->fp, b->data, b->comp_size))
1527
0
                return -1;
1528
44.4k
        }
1529
305k
    } else {
1530
        // Absent blocks should be size 0
1531
36.7k
        assert(b->method == RAW && b->uncomp_size == 0);
1532
36.7k
    }
1533
1534
342k
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
1535
342k
        char dat[100], *cp = (char *)dat;
1536
342k
        uint32_t crc;
1537
1538
342k
        *cp++ = b->method;
1539
342k
        *cp++ = b->content_type;
1540
342k
        cp += fd->vv.varint_put32(cp, dat+100, b->content_id);
1541
342k
        cp += fd->vv.varint_put32(cp, dat+100, b->comp_size);
1542
342k
        cp += fd->vv.varint_put32(cp, dat+100, b->uncomp_size);
1543
342k
        crc = crc32(0L, (uc *)dat, cp-dat);
1544
1545
342k
        if (b->method == RAW) {
1546
297k
            b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->uncomp_size);
1547
297k
        } else {
1548
44.4k
            b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->comp_size);
1549
44.4k
        }
1550
1551
342k
        if (-1 == int32_encode(fd, b->crc32))
1552
0
            return -1;
1553
342k
    }
1554
1555
342k
    return 0;
1556
342k
}
1557
1558
/*
1559
 * Frees a CRAM block, deallocating internal data too.
1560
 */
1561
1.41M
void cram_free_block(cram_block *b) {
1562
1.41M
    if (!b)
1563
375k
        return;
1564
1.03M
    if (b->data)
1565
683k
        free(b->data);
1566
1.03M
    free(b);
1567
1.03M
}
1568
1569
/*
1570
 * Uncompresses a CRAM block, if compressed.
1571
 */
1572
10.4k
int cram_uncompress_block(cram_block *b) {
1573
10.4k
    char *uncomp;
1574
10.4k
    size_t uncomp_size = 0;
1575
1576
10.4k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
1577
    // Pretend the CRC was OK so the fuzzer doesn't have to get it right
1578
10.4k
    b->crc32_checked = 1;
1579
10.4k
#endif
1580
1581
10.4k
    if (b->crc32_checked == 0) {
1582
0
        uint32_t crc = crc32(b->crc_part, b->data ? b->data : (uc *)"", b->alloc);
1583
0
        b->crc32_checked = 1;
1584
0
        if (crc != b->crc32) {
1585
0
            hts_log_error("Block CRC32 failure");
1586
0
            return -1;
1587
0
        }
1588
0
    }
1589
1590
10.4k
    if (b->uncomp_size == 0) {
1591
        // blank block
1592
3.70k
        b->method = RAW;
1593
3.70k
        return 0;
1594
3.70k
    }
1595
10.4k
    assert(b->uncomp_size >= 0); // cram_read_block should ensure this
1596
1597
6.78k
    switch (b->method) {
1598
662
    case RAW:
1599
662
        return 0;
1600
1601
21
    case GZIP:
1602
21
        uncomp_size = b->uncomp_size;
1603
21
        uncomp = zlib_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
1604
1605
21
        if (!uncomp)
1606
8
            return -1;
1607
13
        if (uncomp_size != b->uncomp_size) {
1608
13
            free(uncomp);
1609
13
            return -1;
1610
13
        }
1611
0
        free(b->data);
1612
0
        b->data = (unsigned char *)uncomp;
1613
0
        b->alloc = uncomp_size;
1614
0
        b->method = RAW;
1615
0
        break;
1616
1617
0
#ifdef HAVE_LIBBZ2
1618
6
    case BZIP2: {
1619
6
        unsigned int usize = b->uncomp_size;
1620
6
        if (!(uncomp = malloc(usize)))
1621
0
            return -1;
1622
6
        if (BZ_OK != BZ2_bzBuffToBuffDecompress(uncomp, &usize,
1623
6
                                                (char *)b->data, b->comp_size,
1624
6
                                                0, 0)) {
1625
6
            free(uncomp);
1626
6
            return -1;
1627
6
        }
1628
0
        free(b->data);
1629
0
        b->data = (unsigned char *)uncomp;
1630
0
        b->alloc = usize;
1631
0
        b->method = RAW;
1632
0
        b->uncomp_size = usize; // Just in case it differs
1633
0
        break;
1634
6
    }
1635
#else
1636
    case BZIP2:
1637
        hts_log_error("Bzip2 compression is not compiled into this version. Please rebuild and try again");
1638
        return -1;
1639
#endif
1640
1641
0
#ifdef HAVE_LIBLZMA
1642
19
    case LZMA:
1643
19
        uncomp = lzma_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
1644
19
        if (!uncomp)
1645
10
            return -1;
1646
9
        if (uncomp_size != b->uncomp_size) {
1647
9
            free(uncomp);
1648
9
            return -1;
1649
9
        }
1650
0
        free(b->data);
1651
0
        b->data = (unsigned char *)uncomp;
1652
0
        b->alloc = uncomp_size;
1653
0
        b->method = RAW;
1654
0
        break;
1655
#else
1656
    case LZMA:
1657
        hts_log_error("Lzma compression is not compiled into this version. Please rebuild and try again");
1658
        return -1;
1659
        break;
1660
#endif
1661
1662
509
    case RANS: {
1663
509
        unsigned int usize = b->uncomp_size, usize2;
1664
509
        uncomp = (char *)rans_uncompress(b->data, b->comp_size, &usize2);
1665
509
        if (!uncomp)
1666
261
            return -1;
1667
248
        if (usize != usize2) {
1668
244
            free(uncomp);
1669
244
            return -1;
1670
244
        }
1671
4
        free(b->data);
1672
4
        b->data = (unsigned char *)uncomp;
1673
4
        b->alloc = usize2;
1674
4
        b->method = RAW;
1675
4
        b->uncomp_size = usize2; // Just in case it differs
1676
        //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1677
4
        break;
1678
248
    }
1679
1680
1.03k
    case FQZ: {
1681
1.03k
        uncomp_size = b->uncomp_size;
1682
1.03k
        uncomp = fqz_decompress((char *)b->data, b->comp_size, &uncomp_size, NULL, 0);
1683
1.03k
        if (!uncomp)
1684
777
            return -1;
1685
262
        free(b->data);
1686
262
        b->data = (unsigned char *)uncomp;
1687
262
        b->alloc = uncomp_size;
1688
262
        b->method = RAW;
1689
262
        b->uncomp_size = uncomp_size;
1690
262
        break;
1691
1.03k
    }
1692
1693
1.06k
    case RANS_PR0: {
1694
1.06k
        unsigned int usize = b->uncomp_size, usize2;
1695
1.06k
        uncomp = (char *)rans_uncompress_4x16(b->data, b->comp_size, &usize2);
1696
1.06k
        if (!uncomp)
1697
811
            return -1;
1698
252
        if (usize != usize2) {
1699
168
            free(uncomp);
1700
168
            return -1;
1701
168
        }
1702
84
        b->orig_method = RANSPR;
1703
84
        free(b->data);
1704
84
        b->data = (unsigned char *)uncomp;
1705
84
        b->alloc = usize2;
1706
84
        b->method = RAW;
1707
84
        b->uncomp_size = usize2; // Just incase it differs
1708
        //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1709
84
        break;
1710
252
    }
1711
1712
725
    case ARITH_PR0: {
1713
725
        unsigned int usize = b->uncomp_size, usize2;
1714
725
        uncomp = (char *)arith_uncompress_to(b->data, b->comp_size, NULL, &usize2);
1715
725
        if (!uncomp)
1716
499
            return -1;
1717
226
        if (usize != usize2) {
1718
64
            free(uncomp);
1719
64
            return -1;
1720
64
        }
1721
162
        b->orig_method = ARITH;
1722
162
        free(b->data);
1723
162
        b->data = (unsigned char *)uncomp;
1724
162
        b->alloc = usize2;
1725
162
        b->method = RAW;
1726
162
        b->uncomp_size = usize2; // Just incase it differs
1727
        //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1728
162
        break;
1729
226
    }
1730
1731
2.73k
    case TOK3: {
1732
2.73k
        uint32_t out_len;
1733
2.73k
        uint8_t *cp = tok3_decode_names(b->data, b->comp_size, &out_len);
1734
2.73k
        if (!cp)
1735
2.57k
            return -1;
1736
160
        b->orig_method = TOK3;
1737
160
        b->method = RAW;
1738
160
        free(b->data);
1739
160
        b->data = cp;
1740
160
        b->alloc = out_len;
1741
160
        b->uncomp_size = out_len;
1742
160
        break;
1743
2.73k
    }
1744
1745
10
    default:
1746
10
        return -1;
1747
6.78k
    }
1748
1749
672
    return 0;
1750
6.78k
}
1751
1752
static char *cram_compress_by_method(cram_slice *s, char *in, size_t in_size,
1753
                                     int content_id, size_t *out_size,
1754
                                     enum cram_block_method_int method,
1755
664k
                                     int level, int strat) {
1756
664k
    switch (method) {
1757
78.2k
    case GZIP:
1758
125k
    case GZIP_RLE:
1759
249k
    case GZIP_1:
1760
        // Read names bizarrely benefit from zlib over libdeflate for
1761
        // mid-range compression levels.  Focusing purely of ratio or
1762
        // speed, libdeflate still wins.  It also seems to win for
1763
        // other data series too.
1764
        //
1765
        // Eg RN at level 5;  libdeflate=55.9MB  zlib=51.6MB
1766
#ifdef HAVE_LIBDEFLATE
1767
#  if (LIBDEFLATE_VERSION_MAJOR < 1 || (LIBDEFLATE_VERSION_MAJOR == 1 && LIBDEFLATE_VERSION_MINOR <= 8))
1768
        if (content_id == DS_RN && level >= 4 && level <= 7)
1769
            return zlib_mem_deflate(in, in_size, out_size, level, strat);
1770
        else
1771
#  endif
1772
            return libdeflate_deflate(in, in_size, out_size, level, strat);
1773
#else
1774
249k
        return zlib_mem_deflate(in, in_size, out_size, level, strat);
1775
0
#endif
1776
1777
0
    case BZIP2: {
1778
0
#ifdef HAVE_LIBBZ2
1779
0
        unsigned int comp_size = in_size*1.01 + 600;
1780
0
        char *comp = malloc(comp_size);
1781
0
        if (!comp)
1782
0
            return NULL;
1783
1784
0
        if (BZ_OK != BZ2_bzBuffToBuffCompress(comp, &comp_size,
1785
0
                                              in, in_size,
1786
0
                                              level, 0, 30)) {
1787
0
            free(comp);
1788
0
            return NULL;
1789
0
        }
1790
0
        *out_size = comp_size;
1791
0
        return comp;
1792
#else
1793
        return NULL;
1794
#endif
1795
0
    }
1796
1797
0
    case FQZ:
1798
0
    case FQZ_b:
1799
0
    case FQZ_c:
1800
0
    case FQZ_d: {
1801
        // Extract the necessary portion of the slice into an fqz_slice struct.
1802
        // These previously were the same thing, but this permits us to detach
1803
        // the codec from the rest of this CRAM implementation.
1804
0
        fqz_slice *f = malloc(2*s->hdr->num_records * sizeof(uint32_t) + sizeof(fqz_slice));
1805
0
        if (!f)
1806
0
            return NULL;
1807
0
        f->num_records = s->hdr->num_records;
1808
0
        f->len = (uint32_t *)(((char *)f) + sizeof(fqz_slice));
1809
0
        f->flags = f->len + s->hdr->num_records;
1810
0
        int i;
1811
0
        for (i = 0; i < s->hdr->num_records; i++) {
1812
0
            f->flags[i] = s->crecs[i].flags;
1813
0
            f->len[i] = (i+1 < s->hdr->num_records
1814
0
                         ? s->crecs[i+1].qual - s->crecs[i].qual
1815
0
                         : s->block[DS_QS]->uncomp_size - s->crecs[i].qual);
1816
0
        }
1817
0
        char *comp = fqz_compress(strat & 0xff /* cram vers */, f,
1818
0
                                  in, in_size, out_size, strat >> 8, NULL);
1819
0
        free(f);
1820
0
        return comp;
1821
0
    }
1822
1823
0
    case LZMA:
1824
0
#ifdef HAVE_LIBLZMA
1825
0
        return lzma_mem_deflate(in, in_size, out_size, level);
1826
#else
1827
        return NULL;
1828
#endif
1829
1830
0
    case RANS0:
1831
0
    case RANS1: {
1832
0
        unsigned int out_size_i;
1833
0
        unsigned char *cp;
1834
0
        cp = rans_compress((unsigned char *)in, in_size, &out_size_i,
1835
0
                           method == RANS0 ? 0 : 1);
1836
0
        *out_size = out_size_i;
1837
0
        return (char *)cp;
1838
0
    }
1839
1840
128k
    case RANS_PR0:
1841
175k
    case RANS_PR1:
1842
238k
    case RANS_PR64:
1843
285k
    case RANS_PR9:
1844
348k
    case RANS_PR128:
1845
348k
    case RANS_PR129:
1846
348k
    case RANS_PR192:
1847
395k
    case RANS_PR193: {
1848
395k
        unsigned int out_size_i;
1849
395k
        unsigned char *cp;
1850
1851
        // see enum cram_block. We map RANS_* methods to order bit-fields
1852
395k
        static int methmap[] = { 1, 64,9, 128,129, 192,193 };
1853
1854
395k
        int m = method == RANS_PR0 ? 0 : methmap[method - RANS_PR1];
1855
395k
        cp = rans_compress_4x16((unsigned char *)in, in_size, &out_size_i,
1856
395k
                                m | RANS_ORDER_SIMD_AUTO);
1857
395k
        *out_size = out_size_i;
1858
395k
        return (char *)cp;
1859
348k
    }
1860
1861
0
    case ARITH_PR0:
1862
0
    case ARITH_PR1:
1863
0
    case ARITH_PR64:
1864
0
    case ARITH_PR9:
1865
0
    case ARITH_PR128:
1866
0
    case ARITH_PR129:
1867
0
    case ARITH_PR192:
1868
0
    case ARITH_PR193: {
1869
0
        unsigned int out_size_i;
1870
0
        unsigned char *cp;
1871
1872
        // see enum cram_block. We map ARITH_* methods to order bit-fields
1873
0
        static int methmap[] = { 1, 64,9, 128,129, 192,193 };
1874
1875
0
        cp = arith_compress_to((unsigned char *)in, in_size, NULL, &out_size_i,
1876
0
                               method == ARITH_PR0 ? 0 : methmap[method - ARITH_PR1]);
1877
0
        *out_size = out_size_i;
1878
0
        return (char *)cp;
1879
0
    }
1880
1881
18.7k
    case TOK3:
1882
18.7k
    case TOKA: {
1883
18.7k
        int out_len;
1884
18.7k
        int lev = level;
1885
18.7k
        if (method == TOK3 && lev > 3)
1886
18.7k
            lev = 3;
1887
18.7k
        uint8_t *cp = tok3_encode_names(in, in_size, lev, strat, &out_len, NULL);
1888
18.7k
        *out_size = out_len;
1889
18.7k
        return (char *)cp;
1890
18.7k
    }
1891
1892
0
    case RAW:
1893
0
        break;
1894
1895
0
    default:
1896
0
        return NULL;
1897
664k
    }
1898
1899
0
    return NULL;
1900
664k
}
1901
1902
/*
1903
 * A copy of cram_compress_block2 with added recursion detection.
1904
 * This is only called for error handling where the auto-tuning has failed.
1905
 * The simplest way of doing this is recusion + an additional argument, but
1906
 * we didn't want to complicate the existing code hence this is static.
1907
 */
1908
static int cram_compress_block3(cram_fd *fd, cram_slice *s,
1909
                                cram_block *b, cram_metrics *metrics,
1910
                                int method, int level,
1911
575k
                                int recurse) {
1912
1913
575k
    if (!b)
1914
35.9k
        return 0;
1915
1916
539k
    int orig_method = method;
1917
539k
    char *comp = NULL;
1918
539k
    size_t comp_size = 0;
1919
539k
    int strat;
1920
1921
    // Internally we have parameterised methods that externally map
1922
    // to the same CRAM method value.
1923
    // See enum_cram_block_method_int in cram_structs.h.
1924
539k
    int methmap[] = {
1925
        // Externally defined values
1926
539k
        RAW, GZIP, BZIP2, LZMA, RANS, RANSPR, ARITH, FQZ, TOK3,
1927
1928
        // Reserved for possible expansion
1929
539k
        0, 0,
1930
1931
        // Internally parameterised versions matching back to above
1932
        // external values
1933
539k
        GZIP, GZIP,
1934
539k
        FQZ, FQZ, FQZ,
1935
539k
        RANS,
1936
539k
        RANSPR, RANSPR, RANSPR, RANSPR, RANSPR, RANSPR, RANSPR,
1937
539k
        TOK3,
1938
539k
        ARITH,  ARITH,  ARITH,  ARITH,  ARITH,  ARITH,  ARITH,
1939
539k
    };
1940
1941
539k
    if (b->method != RAW) {
1942
        // Maybe already compressed if s->block[0] was compressed and
1943
        // we have e.g. s->block[DS_BA] set to s->block[0] due to only
1944
        // one base type present and hence using E_HUFFMAN on block 0.
1945
        // A second explicit attempt to compress the same block then
1946
        // occurs.
1947
0
        return 0;
1948
0
    }
1949
1950
539k
    if (method == -1) {
1951
9.67k
        method = 1<<GZIP;
1952
9.67k
        if (fd->use_bz2)
1953
0
            method |= 1<<BZIP2;
1954
9.67k
        if (fd->use_lzma)
1955
0
            method |= 1<<LZMA;
1956
9.67k
    }
1957
1958
539k
    if (level == -1)
1959
9.67k
        level = fd->level;
1960
1961
    //fprintf(stderr, "IN: block %d, sz %d\n", b->content_id, b->uncomp_size);
1962
1963
539k
    if (method == RAW || level == 0 || b->uncomp_size == 0) {
1964
288k
        b->method = RAW;
1965
288k
        b->comp_size = b->uncomp_size;
1966
        //fprintf(stderr, "Skip block id %d\n", b->content_id);
1967
288k
        return 0;
1968
288k
    }
1969
1970
251k
#ifndef ABS
1971
251k
#    define ABS(a) ((a)>=0?(a):-(a))
1972
251k
#endif
1973
1974
251k
    if (metrics) {
1975
238k
        pthread_mutex_lock(&fd->metrics_lock);
1976
        // Sudden changes in size trigger a retrial.  These are mainly
1977
        // triggered when switching to sorted / unsorted, where the number
1978
        // of elements in a slice radically changes.
1979
        //
1980
        // We also get large fluctuations based on genome coordinate for
1981
        // e.g. SA:Z and SC series, but we consider the typical scale of
1982
        // delta between blocks and use this to look for abnormality.
1983
1984
        // Equivalent to (but minus possible integer overflow)
1985
        //   (b->uncomp_size + 1000)/4 > metrics->input_avg_sz+1000 ||
1986
        //    b->uncomp_size + 1000    < (metrics->input_avg_sz+1000)/4)
1987
238k
        if (metrics->input_avg_sz &&
1988
64.0k
            (b->uncomp_size/4 - 750 > metrics->input_avg_sz ||
1989
63.1k
             b->uncomp_size         < metrics->input_avg_sz/4 - 750) &&
1990
1.72k
            ABS(b->uncomp_size-metrics->input_avg_sz)/10
1991
1.72k
                > metrics->input_avg_delta) {
1992
95
            metrics->next_trial = 0;
1993
95
        }
1994
1995
238k
        if (metrics->trial > 0 || --metrics->next_trial <= 0) {
1996
65.8k
            int m, unpackable = metrics->unpackable;
1997
65.8k
            size_t sz_best = b->uncomp_size;
1998
65.8k
            size_t sz[CRAM_MAX_METHOD] = {0};
1999
65.8k
            int method_best = 0; // RAW
2000
65.8k
            char *c_best = NULL, *c = NULL;
2001
2002
65.8k
            metrics->input_avg_delta =
2003
65.8k
                0.9 * (metrics->input_avg_delta +
2004
65.8k
                       ABS(b->uncomp_size - metrics->input_avg_sz));
2005
2006
65.8k
            metrics->input_avg_sz += b->uncomp_size*.2;
2007
65.8k
            metrics->input_avg_sz *= 0.8;
2008
2009
65.8k
            if (metrics->revised_method)
2010
27.5k
                method = metrics->revised_method;
2011
38.3k
            else
2012
38.3k
                metrics->revised_method = method;
2013
2014
65.8k
            if (metrics->next_trial <= 0) {
2015
1.55k
                metrics->next_trial = TRIAL_SPAN;
2016
1.55k
                metrics->trial = NTRIALS;
2017
51.4k
                for (m = 0; m < CRAM_MAX_METHOD; m++)
2018
49.8k
                    metrics->sz[m] /= 2;
2019
1.55k
                metrics->unpackable = 0;
2020
1.55k
            }
2021
2022
            // Compress this block using the best method
2023
65.8k
            if (unpackable && CRAM_MAJOR_VERS(fd->version) > 3) {
2024
                // No point trying bit-pack if 17+ symbols.
2025
0
                if (method & (1<<RANS_PR128))
2026
0
                    method = (method|(1<<RANS_PR0))&~(1<<RANS_PR128);
2027
0
                if (method & (1<<RANS_PR129))
2028
0
                    method = (method|(1<<RANS_PR1))&~(1<<RANS_PR129);
2029
0
                if (method & (1<<RANS_PR192))
2030
0
                    method = (method|(1<<RANS_PR64))&~(1<<RANS_PR192);
2031
0
                if (method & (1<<RANS_PR193))
2032
0
                    method = (method|(1<<RANS_PR64)|(1<<RANS_PR1))&~(1<<RANS_PR193);
2033
2034
0
                if (method & (1<<ARITH_PR128))
2035
0
                    method = (method|(1<<ARITH_PR0))&~(1<<ARITH_PR128);
2036
0
                if (method & (1<<ARITH_PR129))
2037
0
                    method = (method|(1<<ARITH_PR1))&~(1<<ARITH_PR129);
2038
0
                if (method & (1<<ARITH_PR192))
2039
0
                    method = (method|(1<<ARITH_PR64))&~(1<<ARITH_PR192);
2040
0
                if (method & (1u<<ARITH_PR193))
2041
0
                    method = (method|(1<<ARITH_PR64)|(1<<ARITH_PR1))&~(1u<<ARITH_PR193);
2042
0
            }
2043
2044
            // Libdeflate doesn't have a Z_RLE strategy.
2045
            // We treat it as level 1, but iff we haven't also
2046
            // explicitly listed that in the method list.
2047
#ifdef HAVE_LIBDEFLATE
2048
            if ((method & (1<<GZIP_RLE)) && (method & (1<<GZIP_1)))
2049
                method &= ~(1<<GZIP_RLE);
2050
#endif
2051
2052
65.8k
            pthread_mutex_unlock(&fd->metrics_lock);
2053
2054
2.17M
            for (m = 0; m < CRAM_MAX_METHOD; m++) {
2055
2.10M
                if (method & (1u<<m)) {
2056
478k
                    int lvl = level;
2057
478k
                    switch (m) {
2058
65.3k
                    case GZIP:     strat = Z_FILTERED; break;
2059
65.8k
                    case GZIP_1:   strat = Z_DEFAULT_STRATEGY; lvl = 1; break;
2060
47.0k
                    case GZIP_RLE: strat = Z_RLE; break;
2061
0
                    case FQZ:      strat = CRAM_MAJOR_VERS(fd->version); break;
2062
0
                    case FQZ_b:    strat = CRAM_MAJOR_VERS(fd->version)+256; break;
2063
0
                    case FQZ_c:    strat = CRAM_MAJOR_VERS(fd->version)+2*256; break;
2064
0
                    case FQZ_d:    strat = CRAM_MAJOR_VERS(fd->version)+3*256; break;
2065
18.4k
                    case TOK3:     strat = 0; break;
2066
0
                    case TOKA:     strat = 1; break;
2067
282k
                    default:       strat = 0;
2068
478k
                    }
2069
2070
478k
                    c = cram_compress_by_method(s, (char *)b->data, b->uncomp_size,
2071
478k
                                                b->content_id, &sz[m], m, lvl, strat);
2072
2073
478k
                    if (c && sz_best > sz[m]) {
2074
37.2k
                        sz_best = sz[m];
2075
37.2k
                        method_best = m;
2076
37.2k
                        if (c_best)
2077
18.7k
                            free(c_best);
2078
37.2k
                        c_best = c;
2079
441k
                    } else if (c) {
2080
439k
                        free(c);
2081
439k
                    } else {
2082
2.05k
                        sz[m] = UINT_MAX; // arbitrarily worse than raw
2083
2.05k
                    }
2084
1.62M
                } else {
2085
1.62M
                    sz[m] = UINT_MAX; // arbitrarily worse than raw
2086
1.62M
                }
2087
2.10M
            }
2088
2089
65.8k
            if (c_best) {
2090
18.5k
                free(b->data);
2091
18.5k
                b->data = (unsigned char *)c_best;
2092
18.5k
                b->method = method_best; // adjusted to methmap[method_best] later
2093
18.5k
                b->comp_size = sz_best;
2094
18.5k
            }
2095
2096
            // Accumulate stats for all methods tried
2097
65.8k
            pthread_mutex_lock(&fd->metrics_lock);
2098
2.17M
            for (m = 0; m < CRAM_MAX_METHOD; m++)
2099
                // don't be overly sure on small blocks.
2100
                // +2000 means eg bzip2 vs gzip (1.07 to 1.04) or gz vs rans1
2101
                // needs to be at least 60 bytes smaller to overcome the
2102
                // fixed size addition.
2103
2.10M
                metrics->sz[m] += sz[m]+2000;
2104
2105
            // When enough trials performed, find the best on average
2106
65.8k
            if (--metrics->trial == 0) {
2107
17.2k
                int best_method = RAW;
2108
17.2k
                int best_sz = INT_MAX;
2109
2110
                // Relative costs of methods. See enum_cram_block_method_int
2111
                // and methmap
2112
17.2k
                double meth_cost[32] = {
2113
                    // Externally defined methods
2114
17.2k
                    1,    // 0  raw
2115
17.2k
                    1.04, // 1  gzip (Z_FILTERED)
2116
17.2k
                    1.07, // 2  bzip2
2117
17.2k
                    1.08, // 3  lzma
2118
17.2k
                    1.00, // 4  rans    (O0)
2119
17.2k
                    1.00, // 5  ranspr  (O0)
2120
17.2k
                    1.04, // 6  arithpr (O0)
2121
17.2k
                    1.05, // 7  fqz
2122
17.2k
                    1.05, // 8  tok3 (rans)
2123
17.2k
                    1.00, 1.00, // 9,10 reserved
2124
2125
                    // Paramterised versions of above
2126
17.2k
                    1.01, // gzip rle
2127
17.2k
                    1.01, // gzip -1
2128
2129
17.2k
                    1.05, 1.05, 1.05, // FQZ_b,c,d
2130
2131
17.2k
                    1.01, // rans O1
2132
2133
17.2k
                    1.01, // rans_pr1
2134
17.2k
                    1.00, // rans_pr64; if smaller, usually fast
2135
17.2k
                    1.03, // rans_pr65/9
2136
17.2k
                    1.00, // rans_pr128
2137
17.2k
                    1.01, // rans_pr129
2138
17.2k
                    1.00, // rans_pr192
2139
17.2k
                    1.01, // rans_pr193
2140
2141
17.2k
                    1.07, // tok3 arith
2142
2143
17.2k
                    1.04, // arith_pr1
2144
17.2k
                    1.04, // arith_pr64
2145
17.2k
                    1.04, // arith_pr9
2146
17.2k
                    1.03, // arith_pr128
2147
17.2k
                    1.04, // arith_pr129
2148
17.2k
                    1.04, // arith_pr192
2149
17.2k
                    1.04, // arith_pr193
2150
17.2k
                };
2151
2152
                // Scale methods by cost based on compression level
2153
17.2k
                if (fd->level <= 1) {
2154
0
                    for (m = 0; m < CRAM_MAX_METHOD; m++)
2155
0
                        metrics->sz[m] *= 1+(meth_cost[m]-1)*4;
2156
17.2k
                } else if (fd->level <= 3) {
2157
0
                    for (m = 0; m < CRAM_MAX_METHOD; m++)
2158
0
                        metrics->sz[m] *= 1+(meth_cost[m]-1);
2159
17.2k
                } else if (fd->level <= 6) {
2160
569k
                    for (m = 0; m < CRAM_MAX_METHOD; m++)
2161
552k
                        metrics->sz[m] *= 1+(meth_cost[m]-1)/2;
2162
17.2k
                } else if (fd->level <= 7) {
2163
0
                    for (m = 0; m < CRAM_MAX_METHOD; m++)
2164
0
                        metrics->sz[m] *= 1+(meth_cost[m]-1)/3;
2165
0
                } // else cost is ignored
2166
2167
                // Ensure these are never used; BSC and ZSTD
2168
17.2k
                metrics->sz[9] = metrics->sz[10] = INT_MAX;
2169
2170
569k
                for (m = 0; m < CRAM_MAX_METHOD; m++) {
2171
552k
                    if ((!metrics->sz[m]) || (!(method & (1u<<m))))
2172
439k
                        continue;
2173
2174
112k
                    if (best_sz > metrics->sz[m])
2175
39.3k
                        best_sz = metrics->sz[m], best_method = m;
2176
112k
                }
2177
2178
17.2k
                if (best_method != metrics->method) {
2179
                    //metrics->trial = (NTRIALS+1)/2; // be sure
2180
                    //metrics->next_trial /= 1.5;
2181
11.2k
                    metrics->consistency = 0;
2182
11.2k
                } else {
2183
5.99k
                    metrics->next_trial *= MIN(2, 1+metrics->consistency/4.0);
2184
5.99k
                    metrics->consistency++;
2185
5.99k
                }
2186
2187
17.2k
                metrics->method = best_method;
2188
17.2k
                switch (best_method) {
2189
108
                case GZIP:     strat = Z_FILTERED; break;
2190
6.91k
                case GZIP_1:   strat = Z_DEFAULT_STRATEGY; break;
2191
4
                case GZIP_RLE: strat = Z_RLE; break;
2192
0
                case FQZ:      strat = CRAM_MAJOR_VERS(fd->version); break;
2193
0
                case FQZ_b:    strat = CRAM_MAJOR_VERS(fd->version)+256; break;
2194
0
                case FQZ_c:    strat = CRAM_MAJOR_VERS(fd->version)+2*256; break;
2195
0
                case FQZ_d:    strat = CRAM_MAJOR_VERS(fd->version)+3*256; break;
2196
333
                case TOK3:     strat = 0; break;
2197
0
                case TOKA:     strat = 1; break;
2198
9.90k
                default:       strat = 0;
2199
17.2k
                }
2200
17.2k
                metrics->strat  = strat;
2201
2202
                // If we see at least MAXFAIL trials in a row for a specific
2203
                // compression method with more than MAXDELTA aggregate
2204
                // size then we drop this from the list of methods used
2205
                // for this block type.
2206
97.1k
#define MAXDELTA 0.20
2207
381k
#define MAXFAILS 4
2208
569k
                for (m = 0; m < CRAM_MAX_METHOD; m++) {
2209
552k
                    if (best_method == m) {
2210
17.2k
                        metrics->cnt[m] = 0;
2211
17.2k
                        metrics->extra[m] = 0;
2212
535k
                    } else if (best_sz < metrics->sz[m]) {
2213
381k
                        double r = (double)metrics->sz[m] / best_sz - 1;
2214
381k
                        int mul = 1+(fd->level>=7);
2215
381k
                        if (++metrics->cnt[m] >= MAXFAILS*mul &&
2216
97.1k
                            (metrics->extra[m] += r) >= MAXDELTA*mul)
2217
37.7k
                            method &= ~(1u<<m);
2218
2219
                        // Special case for fqzcomp as it rarely changes
2220
381k
                        if (m == FQZ || m == FQZ_b || m == FQZ_c || m == FQZ_d) {
2221
64.2k
                            if (metrics->sz[m] > best_sz)
2222
64.2k
                                method &= ~(1u<<m);
2223
64.2k
                        }
2224
381k
                    }
2225
552k
                }
2226
2227
                //if (fd->verbose > 1 && method != metrics->revised_method)
2228
                //    fprintf(stderr, "%d: revising method from %x to %x\n",
2229
                //          b->content_id, metrics->revised_method, method);
2230
17.2k
                metrics->revised_method = method;
2231
17.2k
            }
2232
65.8k
            pthread_mutex_unlock(&fd->metrics_lock);
2233
172k
        } else {
2234
172k
            metrics->input_avg_delta =
2235
172k
                0.9 * (metrics->input_avg_delta +
2236
172k
                       ABS(b->uncomp_size - metrics->input_avg_sz));
2237
2238
172k
            metrics->input_avg_sz += b->uncomp_size*.2;
2239
172k
            metrics->input_avg_sz *= 0.8;
2240
2241
172k
            strat = metrics->strat;
2242
172k
            method = metrics->method;
2243
2244
172k
            pthread_mutex_unlock(&fd->metrics_lock);
2245
172k
            comp = cram_compress_by_method(s, (char *)b->data, b->uncomp_size,
2246
172k
                                           b->content_id, &comp_size, method,
2247
172k
                                           method == GZIP_1 ? 1 : level,
2248
172k
                                           strat);
2249
172k
            if (!comp) {
2250
                // Our cached best method failed, but maybe another works?
2251
                // Rerun with trial mode engaged again.
2252
230
                if (!recurse) {
2253
230
                    hts_log_warning("Compressed block ID %d method %s failed, "
2254
230
                                    "redoing trial", b->content_id,
2255
230
                                    cram_block_method2str(method));
2256
230
                    pthread_mutex_lock(&fd->metrics_lock);
2257
230
                    metrics->trial = NTRIALS;
2258
230
                    metrics->next_trial = TRIAL_SPAN;
2259
230
                    metrics->revised_method = orig_method;
2260
230
                    pthread_mutex_unlock(&fd->metrics_lock);
2261
230
                    return cram_compress_block3(fd, s, b, metrics, method,
2262
230
                                                level, 1);
2263
230
                }
2264
0
                return -1;
2265
230
            }
2266
2267
172k
            if (comp_size < b->uncomp_size) {
2268
24.4k
                free(b->data);
2269
24.4k
                b->data = (unsigned char *)comp;
2270
24.4k
                b->comp_size = comp_size;
2271
24.4k
                b->method = method;
2272
147k
            } else {
2273
147k
                free(comp);
2274
147k
            }
2275
172k
        }
2276
2277
238k
    } else {
2278
        // no cached metrics, so just do zlib?
2279
12.7k
        comp = cram_compress_by_method(s, (char *)b->data, b->uncomp_size,
2280
12.7k
                                       b->content_id, &comp_size, GZIP, level, Z_FILTERED);
2281
12.7k
        if (!comp) {
2282
0
            hts_log_error("Compression failed!");
2283
0
            return -1;
2284
0
        }
2285
2286
12.7k
        if (comp_size < b->uncomp_size) {
2287
1.51k
            free(b->data);
2288
1.51k
            b->data = (unsigned char *)comp;
2289
1.51k
            b->comp_size = comp_size;
2290
1.51k
            b->method = GZIP;
2291
11.2k
        } else {
2292
11.2k
            free(comp);
2293
11.2k
        }
2294
12.7k
        strat = Z_FILTERED;
2295
12.7k
    }
2296
2297
250k
    hts_log_info("Compressed block ID %d from %d to %d by method %s",
2298
250k
                 b->content_id, b->uncomp_size, b->comp_size,
2299
250k
                 cram_block_method2str(b->method));
2300
2301
250k
    b->method = methmap[b->method];
2302
2303
250k
    return 0;
2304
251k
}
2305
2306
/*
2307
 * Compresses a block using a selection of compression codecs and options.
2308
 * The best is learnt and used for subsequent slices, periodically resampling.
2309
 *
2310
 * Method and level -1 implies defaults, as specified in cram_fd.
2311
 */
2312
int cram_compress_block2(cram_fd *fd, cram_slice *s,
2313
                         cram_block *b, cram_metrics *metrics,
2314
575k
                         int method, int level) {
2315
575k
    return cram_compress_block3(fd, s, b, metrics, method, level, 0);
2316
575k
}
2317
2318
int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics,
2319
9.67k
                        int method, int level) {
2320
9.67k
    return cram_compress_block2(fd, NULL, b, metrics, method, level);
2321
9.67k
}
2322
2323
961k
cram_metrics *cram_new_metrics(void) {
2324
961k
    cram_metrics *m = calloc(1, sizeof(*m));
2325
961k
    if (!m)
2326
0
        return NULL;
2327
961k
    m->trial = NTRIALS-1;
2328
961k
    m->next_trial = TRIAL_SPAN/2; // learn quicker at start
2329
961k
    m->method = RAW;
2330
961k
    m->strat = 0;
2331
961k
    m->revised_method = 0;
2332
961k
    m->unpackable = 0;
2333
2334
961k
    return m;
2335
961k
}
2336
2337
251k
char *cram_block_method2str(enum cram_block_method_int m) {
2338
251k
    switch(m) {
2339
206k
    case RAW:         return "RAW";
2340
5.11k
    case GZIP:        return "GZIP";
2341
0
    case BZIP2:       return "BZIP2";
2342
0
    case LZMA:        return "LZMA";
2343
0
    case RANS0:       return "RANS0";
2344
0
    case RANS1:       return "RANS1";
2345
32
    case GZIP_RLE:    return "GZIP_RLE";
2346
4.32k
    case GZIP_1:      return "GZIP_1";
2347
0
    case FQZ:         return "FQZ";
2348
0
    case FQZ_b:       return "FQZ_b";
2349
0
    case FQZ_c:       return "FQZ_c";
2350
0
    case FQZ_d:       return "FQZ_d";
2351
1.36k
    case RANS_PR0:    return "RANS_PR0";
2352
484
    case RANS_PR1:    return "RANS_PR1";
2353
14.3k
    case RANS_PR64:   return "RANS_PR64";
2354
1
    case RANS_PR9:    return "RANS_PR9";
2355
18.4k
    case RANS_PR128:  return "RANS_PR128";
2356
0
    case RANS_PR129:  return "RANS_PR129";
2357
0
    case RANS_PR192:  return "RANS_PR192";
2358
346
    case RANS_PR193:  return "RANS_PR193";
2359
234
    case TOK3:        return "TOK3_R";
2360
0
    case TOKA:        return "TOK3_A";
2361
0
    case ARITH_PR0:   return "ARITH_PR0";
2362
0
    case ARITH_PR1:   return "ARITH_PR1";
2363
0
    case ARITH_PR64:  return "ARITH_PR64";
2364
0
    case ARITH_PR9:   return "ARITH_PR9";
2365
0
    case ARITH_PR128: return "ARITH_PR128";
2366
0
    case ARITH_PR129: return "ARITH_PR129";
2367
0
    case ARITH_PR192: return "ARITH_PR192";
2368
0
    case ARITH_PR193: return "ARITH_PR193";
2369
0
    case BM_ERROR: break;
2370
251k
    }
2371
0
    return "?";
2372
251k
}
2373
2374
19
char *cram_content_type2str(enum cram_content_type t) {
2375
19
    switch (t) {
2376
8
    case FILE_HEADER:         return "FILE_HEADER";
2377
3
    case COMPRESSION_HEADER:  return "COMPRESSION_HEADER";
2378
0
    case MAPPED_SLICE:        return "MAPPED_SLICE";
2379
0
    case UNMAPPED_SLICE:      return "UNMAPPED_SLICE";
2380
0
    case EXTERNAL:            return "EXTERNAL";
2381
0
    case CORE:                return "CORE";
2382
0
    case CT_ERROR:            break;
2383
19
    }
2384
8
    return "?";
2385
19
}
2386
2387
/* ----------------------------------------------------------------------
2388
 * Reference sequence handling
2389
 *
2390
 * These revolve around the refs_t structure, which may potentially be
2391
 * shared between multiple cram_fd.
2392
 *
2393
 * We start with refs_create() to allocate an empty refs_t and then
2394
 * populate it with @SQ line data using refs_from_header(). This is done on
2395
 * cram_open().  Also at start up we can call cram_load_reference() which
2396
 * is used with "scramble -r foo.fa". This replaces the fd->refs with the
2397
 * new one specified. In either case refs2id() is then called which
2398
 * maps ref_entry names to @SQ ids (refs_t->ref_id[]).
2399
 *
2400
 * Later, possibly within a thread, we will want to know the actual ref
2401
 * seq itself, obtained by calling cram_get_ref().  This may use the
2402
 * UR: or M5: fields or the filename specified in the original
2403
 * cram_load_reference() call.
2404
 *
2405
 * Given the potential for multi-threaded reference usage, we have
2406
 * reference counting (sorry for the confusing double use of "ref") to
2407
 * track the number of callers interested in any specific reference.
2408
 */
2409
2410
/*
2411
 * Frees/unmaps a reference sequence and associated file handles.
2412
 */
2413
6.99k
static void ref_entry_free_seq(ref_entry *e) {
2414
6.99k
    if (e->mf)
2415
0
        mfclose(e->mf);
2416
6.99k
    if (e->seq && !e->mf)
2417
0
        free(e->seq);
2418
2419
6.99k
    e->seq = NULL;
2420
6.99k
    e->mf = NULL;
2421
6.99k
}
2422
2423
20.2k
void refs_free(refs_t *r) {
2424
20.2k
    RP("refs_free()\n");
2425
2426
20.2k
    if (--r->count > 0)
2427
0
        return;
2428
2429
20.2k
    if (!r)
2430
0
        return;
2431
2432
20.2k
    if (r->pool)
2433
20.2k
        string_pool_destroy(r->pool);
2434
2435
20.2k
    if (r->h_meta) {
2436
20.2k
        khint_t k;
2437
2438
40.8k
        for (k = kh_begin(r->h_meta); k != kh_end(r->h_meta); k++) {
2439
20.6k
            ref_entry *e;
2440
2441
20.6k
            if (!kh_exist(r->h_meta, k))
2442
13.6k
                continue;
2443
6.99k
            if (!(e = kh_val(r->h_meta, k)))
2444
0
                continue;
2445
6.99k
            ref_entry_free_seq(e);
2446
6.99k
            free(e);
2447
6.99k
        }
2448
2449
20.2k
        kh_destroy(refs, r->h_meta);
2450
20.2k
    }
2451
2452
20.2k
    if (r->ref_id)
2453
10.5k
        free(r->ref_id);
2454
2455
20.2k
    if (r->fp)
2456
0
        bgzf_close(r->fp);
2457
2458
20.2k
    pthread_mutex_destroy(&r->lock);
2459
2460
20.2k
    free(r);
2461
20.2k
}
2462
2463
20.2k
static refs_t *refs_create(void) {
2464
20.2k
    refs_t *r = calloc(1, sizeof(*r));
2465
2466
20.2k
    RP("refs_create()\n");
2467
2468
20.2k
    if (!r)
2469
0
        return NULL;
2470
2471
20.2k
    if (!(r->pool = string_pool_create(8192)))
2472
0
        goto err;
2473
2474
20.2k
    r->ref_id = NULL; // see refs2id() to populate.
2475
20.2k
    r->count = 1;
2476
20.2k
    r->last = NULL;
2477
20.2k
    r->last_id = -1;
2478
2479
20.2k
    if (!(r->h_meta = kh_init(refs)))
2480
0
        goto err;
2481
2482
20.2k
    pthread_mutex_init(&r->lock, NULL);
2483
2484
20.2k
    return r;
2485
2486
0
 err:
2487
0
    refs_free(r);
2488
0
    return NULL;
2489
20.2k
}
2490
2491
/*
2492
 * Opens a reference fasta file as a BGZF stream, allowing for
2493
 * compressed files.  It automatically builds a .fai file if
2494
 * required and if compressed a .gzi bgzf index too.
2495
 *
2496
 * Returns a BGZF handle on success;
2497
 *         NULL on failure.
2498
 */
2499
1.89k
static BGZF *bgzf_open_ref(char *fn, char *mode, int is_md5) {
2500
1.89k
    BGZF *fp;
2501
2502
1.89k
    if (strncmp(fn, "file://", 7) == 0)
2503
1
        fn += 7;
2504
2505
1.89k
    if (!is_md5 && !hisremote(fn)) {
2506
1.06k
        char fai_file[PATH_MAX];
2507
2508
1.06k
        snprintf(fai_file, PATH_MAX, "%s.fai", fn);
2509
1.06k
        if (access(fai_file, R_OK) != 0)
2510
1.06k
            if (fai_build(fn) != 0)
2511
1.06k
                return NULL;
2512
1.06k
    }
2513
2514
835
    if (!(fp = bgzf_open(fn, mode))) {
2515
835
        perror(fn);
2516
835
        return NULL;
2517
835
    }
2518
2519
0
    if (fp->is_compressed == 1 && bgzf_index_load(fp, fn, ".gzi") < 0) {
2520
0
        hts_log_error("Unable to load .gzi index '%s.gzi'", fn);
2521
0
        bgzf_close(fp);
2522
0
        return NULL;
2523
0
    }
2524
2525
0
    return fp;
2526
0
}
2527
2528
/*
2529
 * Loads a FAI file for a reference.fasta.
2530
 * "is_err" indicates whether failure to load is worthy of emitting an
2531
 * error message. In some cases (eg with embedded references) we
2532
 * speculatively load, just in case, and silently ignore errors.
2533
 *
2534
 * Returns the refs_t struct on success (maybe newly allocated);
2535
 *         NULL on failure
2536
 */
2537
1.89k
static refs_t *refs_load_fai(refs_t *r_orig, const char *fn, int is_err) {
2538
1.89k
    hFILE *fp = NULL;
2539
1.89k
    char fai_fn[PATH_MAX];
2540
1.89k
    char line[8192];
2541
1.89k
    refs_t *r = r_orig;
2542
1.89k
    size_t fn_l = strlen(fn);
2543
1.89k
    int id = 0, id_alloc = 0;
2544
2545
1.89k
    RP("refs_load_fai %s\n", fn);
2546
2547
1.89k
    if (!r)
2548
0
        if (!(r = refs_create()))
2549
0
            goto err;
2550
2551
1.89k
    if (r->fp)
2552
0
        if (bgzf_close(r->fp) != 0)
2553
0
            goto err;
2554
1.89k
    r->fp = NULL;
2555
2556
    /* Look for a FASTA##idx##FAI format */
2557
1.89k
    char *fn_delim = strstr(fn, HTS_IDX_DELIM);
2558
1.89k
    if (fn_delim) {
2559
37
        if (!(r->fn = string_ndup(r->pool, fn, fn_delim - fn)))
2560
0
            goto err;
2561
37
        fn_delim += strlen(HTS_IDX_DELIM);
2562
37
        snprintf(fai_fn, PATH_MAX, "%s", fn_delim);
2563
1.85k
    } else {
2564
        /* An index file was provided, instead of the actual reference file */
2565
1.85k
        if (fn_l > 4 && strcmp(&fn[fn_l-4], ".fai") == 0) {
2566
3
            if (!r->fn) {
2567
3
                if (!(r->fn = string_ndup(r->pool, fn, fn_l-4)))
2568
0
                    goto err;
2569
3
            }
2570
3
            snprintf(fai_fn, PATH_MAX, "%s", fn);
2571
1.85k
        } else {
2572
        /* Only the reference file provided. Get the index file name from it */
2573
1.85k
            if (!(r->fn = string_dup(r->pool, fn)))
2574
0
                goto err;
2575
1.85k
            snprintf(fai_fn, PATH_MAX, "%.*s.fai", PATH_MAX-5, fn);
2576
1.85k
        }
2577
1.85k
    }
2578
2579
1.89k
    if (!(r->fp = bgzf_open_ref(r->fn, "r", 0))) {
2580
1.89k
        hts_log_error("Failed to open reference file '%s'", r->fn);
2581
1.89k
        goto err;
2582
1.89k
    }
2583
2584
0
    if (!(fp = hopen(fai_fn, "r"))) {
2585
0
        hts_log_error("Failed to open index file '%s'", fai_fn);
2586
0
        if (is_err)
2587
0
            perror(fai_fn);
2588
0
        goto err;
2589
0
    }
2590
0
    while (hgets(line, 8192, fp) != NULL) {
2591
0
        ref_entry *e = malloc(sizeof(*e));
2592
0
        char *cp;
2593
0
        int n;
2594
0
        khint_t k;
2595
2596
0
        if (!e)
2597
0
            return NULL;
2598
2599
        // id
2600
0
        for (cp = line; *cp && !isspace_c(*cp); cp++)
2601
0
            ;
2602
0
        *cp++ = 0;
2603
0
        e->name = string_dup(r->pool, line);
2604
2605
        // length
2606
0
        while (*cp && isspace_c(*cp))
2607
0
            cp++;
2608
0
        e->length = strtoll(cp, &cp, 10);
2609
2610
        // offset
2611
0
        while (*cp && isspace_c(*cp))
2612
0
            cp++;
2613
0
        e->offset = strtoll(cp, &cp, 10);
2614
2615
        // bases per line
2616
0
        while (*cp && isspace_c(*cp))
2617
0
            cp++;
2618
0
        e->bases_per_line = strtol(cp, &cp, 10);
2619
2620
        // line length
2621
0
        while (*cp && isspace_c(*cp))
2622
0
            cp++;
2623
0
        e->line_length = strtol(cp, &cp, 10);
2624
2625
        // filename
2626
0
        e->fn = r->fn;
2627
2628
0
        e->count = 0;
2629
0
        e->seq = NULL;
2630
0
        e->mf = NULL;
2631
0
        e->is_md5 = 0;
2632
0
        e->validated_md5 = 0;
2633
2634
0
        k = kh_put(refs, r->h_meta, e->name, &n);
2635
0
        if (-1 == n)  {
2636
0
            free(e);
2637
0
            return NULL;
2638
0
        }
2639
2640
0
        if (n) {
2641
0
            kh_val(r->h_meta, k) = e;
2642
0
        } else {
2643
0
            ref_entry *re = kh_val(r->h_meta, k);
2644
0
            if (re && (re->count != 0 || re->length != 0)) {
2645
                /* Keep old */
2646
0
                free(e);
2647
0
            } else {
2648
                /* Replace old */
2649
0
                if (re)
2650
0
                    free(re);
2651
0
                kh_val(r->h_meta, k) = e;
2652
0
            }
2653
0
        }
2654
2655
0
        if (id >= id_alloc) {
2656
0
            ref_entry **new_refs;
2657
0
            int x;
2658
2659
0
            id_alloc = id_alloc ?id_alloc*2 : 16;
2660
0
            new_refs = realloc(r->ref_id, id_alloc * sizeof(*r->ref_id));
2661
0
            if (!new_refs)
2662
0
                goto err;
2663
0
            r->ref_id = new_refs;
2664
2665
0
            for (x = id; x < id_alloc; x++)
2666
0
                r->ref_id[x] = NULL;
2667
0
        }
2668
0
        r->ref_id[id] = e;
2669
0
        r->nref = ++id;
2670
0
    }
2671
2672
0
    if(hclose(fp) < 0)
2673
0
        goto err;
2674
0
    return r;
2675
2676
1.89k
 err:
2677
1.89k
    if (fp)
2678
0
        hclose_abruptly(fp);
2679
2680
1.89k
    if (!r_orig)
2681
0
        refs_free(r);
2682
2683
1.89k
    return NULL;
2684
0
}
2685
2686
/*
2687
 * Verifies that the CRAM @SQ lines and .fai files match.
2688
 */
2689
0
static void sanitise_SQ_lines(cram_fd *fd) {
2690
0
    int i;
2691
2692
0
    if (!fd->header || !fd->header->hrecs)
2693
0
        return;
2694
2695
0
    if (!fd->refs || !fd->refs->h_meta)
2696
0
        return;
2697
2698
0
    for (i = 0; i < fd->header->hrecs->nref; i++) {
2699
0
        const char *name = fd->header->hrecs->ref[i].name;
2700
0
        khint_t k = kh_get(refs, fd->refs->h_meta, name);
2701
0
        ref_entry *r;
2702
2703
        // We may have @SQ lines which have no known .fai, but do not
2704
        // in themselves pose a problem because they are unused in the file.
2705
0
        if (k == kh_end(fd->refs->h_meta))
2706
0
            continue;
2707
2708
0
        if (!(r = (ref_entry *)kh_val(fd->refs->h_meta, k)))
2709
0
            continue;
2710
2711
0
        if (r->length && r->length != fd->header->hrecs->ref[i].len) {
2712
0
            assert(strcmp(r->name, fd->header->hrecs->ref[i].name) == 0);
2713
2714
            // Should we also check MD5sums here to ensure the correct
2715
            // reference was given?
2716
0
            hts_log_warning("Header @SQ length mismatch for ref %s, %"PRIhts_pos" vs %d",
2717
0
                            r->name, fd->header->hrecs->ref[i].len, (int)r->length);
2718
2719
            // Fixing the parsed @SQ header will make MD:Z: strings work
2720
            // and also stop it producing N for the sequence.
2721
0
            fd->header->hrecs->ref[i].len = r->length;
2722
0
        }
2723
0
    }
2724
0
}
2725
2726
/*
2727
 * Indexes references by the order they appear in a BAM file. This may not
2728
 * necessarily be the same order they appear in the fasta reference file.
2729
 *
2730
 * Returns 0 on success
2731
 *        -1 on failure
2732
 */
2733
9.67k
int refs2id(refs_t *r, sam_hdr_t *hdr) {
2734
9.67k
    int i;
2735
9.67k
    sam_hrecs_t *h = hdr->hrecs;
2736
2737
9.67k
    if (r->ref_id)
2738
3.78k
        free(r->ref_id);
2739
9.67k
    if (r->last)
2740
0
        r->last = NULL;
2741
2742
9.67k
    r->ref_id = calloc(h->nref, sizeof(*r->ref_id));
2743
9.67k
    if (!r->ref_id)
2744
0
        return -1;
2745
2746
9.67k
    r->nref = h->nref;
2747
14.3k
    for (i = 0; i < h->nref; i++) {
2748
4.70k
        khint_t k = kh_get(refs, r->h_meta, h->ref[i].name);
2749
4.70k
        if (k != kh_end(r->h_meta)) {
2750
4.70k
            r->ref_id[i] = kh_val(r->h_meta, k);
2751
4.70k
        } else {
2752
0
            hts_log_warning("Unable to find ref name '%s'", h->ref[i].name);
2753
0
        }
2754
4.70k
    }
2755
2756
9.67k
    return 0;
2757
9.67k
}
2758
2759
/*
2760
 * Generates refs_t entries based on @SQ lines in the header.
2761
 * Returns 0 on success
2762
 *         -1 on failure
2763
 */
2764
39.8k
static int refs_from_header(cram_fd *fd) {
2765
39.8k
    if (!fd)
2766
0
        return -1;
2767
2768
39.8k
    refs_t *r = fd->refs;
2769
39.8k
    if (!r)
2770
0
        return -1;
2771
2772
39.8k
    sam_hdr_t *h = fd->header;
2773
39.8k
    if (!h)
2774
10.4k
        return 0;
2775
2776
29.3k
    if (!h->hrecs) {
2777
18.4k
        if (-1 == sam_hdr_fill_hrecs(h))
2778
231
            return -1;
2779
18.4k
    }
2780
2781
29.1k
    if (h->hrecs->nref == 0)
2782
20.6k
        return 0;
2783
2784
    //fprintf(stderr, "refs_from_header for %p mode %c\n", fd, fd->mode);
2785
2786
    /* Existing refs are fine, as long as they're compatible with the hdr. */
2787
8.44k
    ref_entry **new_ref_id = realloc(r->ref_id, (r->nref + h->hrecs->nref) * sizeof(*r->ref_id));
2788
8.44k
    if (!new_ref_id)
2789
0
        return -1;
2790
8.44k
    r->ref_id = new_ref_id;
2791
2792
8.44k
    int i, j;
2793
    /* Copy info from h->ref[i] over to r */
2794
20.1k
    for (i = 0, j = r->nref; i < h->hrecs->nref; i++) {
2795
11.6k
        sam_hrec_type_t *ty;
2796
11.6k
        sam_hrec_tag_t *tag;
2797
11.6k
        khint_t k;
2798
11.6k
        int n;
2799
2800
11.6k
        k = kh_get(refs, r->h_meta, h->hrecs->ref[i].name);
2801
11.6k
        if (k != kh_end(r->h_meta))
2802
            // Ref already known about
2803
4.70k
            continue;
2804
2805
6.99k
        if (!(r->ref_id[j] = calloc(1, sizeof(ref_entry))))
2806
0
            return -1;
2807
2808
6.99k
        if (!h->hrecs->ref[i].name)
2809
0
            return -1;
2810
2811
6.99k
        r->ref_id[j]->name = string_dup(r->pool, h->hrecs->ref[i].name);
2812
6.99k
        if (!r->ref_id[j]->name) return -1;
2813
6.99k
        r->ref_id[j]->length = 0; // marker for not yet loaded
2814
2815
        /* Initialise likely filename if known */
2816
6.99k
        if ((ty = sam_hrecs_find_type_id(h->hrecs, "SQ", "SN", h->hrecs->ref[i].name))) {
2817
6.99k
            if ((tag = sam_hrecs_find_key(ty, "M5", NULL)))
2818
46
                r->ref_id[j]->fn = string_dup(r->pool, tag->str+3);
2819
2820
6.99k
            if ((tag = sam_hrecs_find_key(ty, "LN", NULL))) {
2821
                // LN tag used when constructing consensus reference
2822
6.99k
                r->ref_id[j]->LN_length = strtoll(tag->str+3, NULL, 0);
2823
                // See fuzz 382922241
2824
6.99k
                if (r->ref_id[j]->LN_length < 0)
2825
693
                    r->ref_id[j]->LN_length = 0;
2826
6.99k
            }
2827
6.99k
        }
2828
2829
6.99k
        k = kh_put(refs, r->h_meta, r->ref_id[j]->name, &n);
2830
6.99k
        if (n <= 0) // already exists or error
2831
0
            return -1;
2832
6.99k
        kh_val(r->h_meta, k) = r->ref_id[j];
2833
2834
6.99k
        j++;
2835
6.99k
    }
2836
8.44k
    r->nref = j;
2837
2838
8.44k
    return 0;
2839
8.44k
}
2840
2841
/*
2842
 * Attaches a header to a cram_fd.
2843
 *
2844
 * This should be used when creating a new cram_fd for writing where
2845
 * we have a header already constructed (eg from a file we've read
2846
 * in).
2847
 */
2848
9.90k
int cram_set_header2(cram_fd *fd, const sam_hdr_t *hdr) {
2849
9.90k
    if (!fd || !hdr )
2850
0
        return -1;
2851
2852
9.90k
    if (fd->header != hdr) {
2853
9.90k
        if (fd->header)
2854
0
            sam_hdr_destroy(fd->header);
2855
9.90k
        fd->header = sam_hdr_dup(hdr);
2856
9.90k
        if (!fd->header)
2857
0
            return -1;
2858
9.90k
    }
2859
9.90k
    return refs_from_header(fd);
2860
9.90k
}
2861
2862
0
int cram_set_header(cram_fd *fd, sam_hdr_t *hdr) {
2863
0
    return cram_set_header2(fd, hdr);
2864
0
}
2865
2866
/*
2867
 * Returns whether the path refers to a directory.
2868
 */
2869
0
static int is_directory(char *fn) {
2870
0
    struct stat buf;
2871
0
    if ( stat(fn,&buf) ) return 0;
2872
0
    return S_ISDIR(buf.st_mode);
2873
0
}
2874
2875
/*
2876
 * Converts a directory and a filename into an expanded path, replacing %s
2877
 * in directory with the filename and %[0-9]+s with portions of the filename
2878
 * Any remaining parts of filename are added to the end with /%s.
2879
 */
2880
0
static int expand_cache_path(char *path, char *dir, const char *fn) {
2881
0
    char *cp, *start = path;
2882
0
    size_t len;
2883
0
    size_t sz = PATH_MAX;
2884
2885
0
    while ((cp = strchr(dir, '%'))) {
2886
0
        if (cp-dir >= sz) return -1;
2887
0
        strncpy(path, dir, cp-dir);
2888
0
        path += cp-dir;
2889
0
        sz -= cp-dir;
2890
2891
0
        if (*++cp == 's') {
2892
0
            len = strlen(fn);
2893
0
            if (len >= sz) return -1;
2894
0
            strcpy(path, fn);
2895
0
            path += len;
2896
0
            sz -= len;
2897
0
            fn += len;
2898
0
            cp++;
2899
0
        } else if (*cp >= '0' && *cp <= '9') {
2900
0
            char *endp;
2901
0
            long l;
2902
2903
0
            l = strtol(cp, &endp, 10);
2904
0
            l = MIN(l, strlen(fn));
2905
0
            if (*endp == 's') {
2906
0
                if (l >= sz) return -1;
2907
0
                strncpy(path, fn, l);
2908
0
                path += l;
2909
0
                fn += l;
2910
0
                sz -= l;
2911
0
                *path = 0;
2912
0
                cp = endp+1;
2913
0
            } else {
2914
0
                if (sz < 3) return -1;
2915
0
                *path++ = '%';
2916
0
                *path++ = *cp++;
2917
0
            }
2918
0
        } else {
2919
0
            if (sz < 3) return -1;
2920
0
            *path++ = '%';
2921
0
            *path++ = *cp++;
2922
0
        }
2923
0
        dir = cp;
2924
0
    }
2925
2926
0
    len = strlen(dir);
2927
0
    if (len >= sz) return -1;
2928
0
    strcpy(path, dir);
2929
0
    path += len;
2930
0
    sz -= len;
2931
2932
0
    len = strlen(fn) + ((*fn && path > start && path[-1] != '/') ? 1 : 0);
2933
0
    if (len >= sz) return -1;
2934
0
    if (*fn && path > start && path[-1] != '/')
2935
0
        *path++ = '/';
2936
0
    strcpy(path, fn);
2937
0
    return 0;
2938
0
}
2939
2940
/*
2941
 * Make the directory containing path and any prefix directories.
2942
 */
2943
0
static void mkdir_prefix(char *path, int mode) {
2944
0
    char *cp = strrchr(path, '/');
2945
0
    if (!cp)
2946
0
        return;
2947
2948
0
    *cp = 0;
2949
0
    if (is_directory(path)) {
2950
0
        *cp = '/';
2951
0
        return;
2952
0
    }
2953
2954
0
    if (mkdir(path, mode) == 0) {
2955
0
        chmod(path, mode);
2956
0
        *cp = '/';
2957
0
        return;
2958
0
    }
2959
2960
0
    mkdir_prefix(path, mode);
2961
0
    mkdir(path, mode);
2962
0
    chmod(path, mode);
2963
0
    *cp = '/';
2964
0
}
2965
2966
/*
2967
 * Queries the M5 string from the header and attempts to populate the
2968
 * reference from this using the REF_PATH environment.
2969
 *
2970
 * Returns 0 on success
2971
 *        -1 on failure
2972
 */
2973
4.06k
static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
2974
4.06k
    char *ref_path = getenv("REF_PATH");
2975
4.06k
    sam_hrec_type_t *ty;
2976
4.06k
    sam_hrec_tag_t *tag;
2977
4.06k
    char path[PATH_MAX];
2978
4.06k
    kstring_t path_tmp = KS_INITIALIZE;
2979
4.06k
    char *local_cache = getenv("REF_CACHE");
2980
4.06k
    mFILE *mf;
2981
4.06k
    int local_path = 0;
2982
2983
4.06k
    hts_log_info("Running cram_populate_ref on fd %p, id %d", (void *)fd, id);
2984
2985
4.06k
    if (!r->name)
2986
0
        return -1;
2987
2988
4.06k
    if (!(ty = sam_hrecs_find_type_id(fd->header->hrecs, "SQ", "SN", r->name)))
2989
0
        return -1;
2990
2991
4.06k
    if (!(tag = sam_hrecs_find_key(ty, "M5", NULL)))
2992
4.04k
        goto no_M5;
2993
2994
14
    hts_log_info("Querying ref %s", tag->str+3);
2995
2996
    /* Use cache if available */
2997
14
    if (local_cache && *local_cache) {
2998
0
        struct stat sb;
2999
0
        if (expand_cache_path(path, local_cache, tag->str+3) == 0 &&
3000
0
            stat(path, &sb) == 0)
3001
            // Found it in the local cache
3002
0
            local_path = 1;
3003
0
    }
3004
3005
#ifndef HAVE_MMAP
3006
    char *path2;
3007
    /* Search local files in REF_PATH; we can open them and return as above */
3008
    if (!local_path && (path2 = find_path(tag->str+3, ref_path))) {
3009
        int len = snprintf(path, PATH_MAX, "%s", path2);
3010
        free(path2);
3011
        if (len > 0 && len < PATH_MAX) // in case it's too long
3012
            local_path = 1;
3013
    }
3014
#endif
3015
3016
    /* Found via REF_CACHE or local REF_PATH file */
3017
14
    if (local_path) {
3018
0
        struct stat sb;
3019
0
        BGZF *fp;
3020
3021
0
        if (0 == stat(path, &sb)
3022
0
            && S_ISREG(sb.st_mode)
3023
0
            && (fp = bgzf_open(path, "r"))) {
3024
0
            r->length = sb.st_size;
3025
0
            r->offset = r->line_length = r->bases_per_line = 0;
3026
3027
0
            r->fn = string_dup(fd->refs->pool, path);
3028
3029
0
            if (fd->refs->fp)
3030
0
                if (bgzf_close(fd->refs->fp) != 0)
3031
0
                    return -1;
3032
0
            fd->refs->fp = fp;
3033
0
            fd->refs->fn = r->fn;
3034
0
            r->is_md5 = 1;
3035
0
            r->validated_md5 = 1;
3036
3037
            // Fall back to cram_get_ref() where it'll do the actual
3038
            // reading of the file.
3039
0
            return 0;
3040
0
        }
3041
0
    }
3042
3043
3044
    /* Otherwise search full REF_PATH; slower as loads entire file */
3045
14
    int is_local = 0;
3046
14
    if ((mf = open_path_mfile(tag->str+3, ref_path, NULL, &is_local))) {
3047
0
        size_t sz;
3048
0
        r->seq = mfsteal(mf, &sz);
3049
0
        if (r->seq) {
3050
0
            r->mf = NULL;
3051
0
        } else {
3052
            // keep mf around as we couldn't detach
3053
0
            r->seq = mf->data;
3054
0
            r->mf = mf;
3055
0
        }
3056
0
        r->length = sz;
3057
0
        r->is_md5 = 1;
3058
0
        r->validated_md5 = 1;
3059
14
    } else {
3060
14
        refs_t *refs;
3061
14
        const char *fn;
3062
14
        sam_hrec_tag_t *UR_tag;
3063
3064
4.06k
    no_M5:
3065
        /* Failed to find in search path or M5 cache, see if @SQ UR: tag? */
3066
4.06k
        if (!(UR_tag = sam_hrecs_find_key(ty, "UR", NULL)))
3067
2.14k
            return -1;
3068
3069
1.91k
        if (strstr(UR_tag->str+3, "://") &&
3070
77
            strncmp(UR_tag->str+3, "file:", 5) != 0) {
3071
            // Documented as omitted, but accidentally supported until now
3072
21
            hts_log_error("UR tags pointing to remote files are not supported");
3073
21
            return -1;
3074
21
        }
3075
3076
1.89k
        fn = (strncmp(UR_tag->str+3, "file:", 5) == 0)
3077
1.89k
            ? UR_tag->str+8
3078
1.89k
            : UR_tag->str+3;
3079
3080
1.89k
        if (fd->refs->fp) {
3081
0
            if (bgzf_close(fd->refs->fp) != 0)
3082
0
                return -1;
3083
0
            fd->refs->fp = NULL;
3084
0
        }
3085
1.89k
        if (!(refs = refs_load_fai(fd->refs, fn, 0)))
3086
1.89k
            return -1;
3087
0
        sanitise_SQ_lines(fd);
3088
3089
0
        fd->refs = refs;
3090
0
        if (fd->refs->fp) {
3091
0
            if (bgzf_close(fd->refs->fp) != 0)
3092
0
                return -1;
3093
0
            fd->refs->fp = NULL;
3094
0
        }
3095
3096
0
        if (!fd->refs->fn)
3097
0
            return -1;
3098
3099
0
        if (-1 == refs2id(fd->refs, fd->header))
3100
0
            return -1;
3101
0
        if (!fd->refs->ref_id || !fd->refs->ref_id[id])
3102
0
            return -1;
3103
3104
        // Local copy already, so fall back to cram_get_ref().
3105
0
        return 0;
3106
0
    }
3107
3108
    /* Populate the local disk cache if required */
3109
0
    if (!is_local && local_cache && *local_cache) {
3110
0
        hFILE *fp;
3111
3112
0
        if (expand_cache_path(path, local_cache, tag->str+3) < 0) {
3113
0
            return 0; // Not fatal - we have the data already so keep going.
3114
0
        }
3115
0
        hts_log_info("Writing cache file '%s'", path);
3116
0
        mkdir_prefix(path, 01777);
3117
3118
0
        fp = hts_open_tmpfile(path, "wx", &path_tmp);
3119
0
        if (!fp) {
3120
0
            perror(path_tmp.s);
3121
0
            free(path_tmp.s);
3122
3123
            // Not fatal - we have the data already so keep going.
3124
0
            return 0;
3125
0
        }
3126
3127
        // Check md5sum
3128
0
        hts_md5_context *md5;
3129
0
        char unsigned md5_buf1[16];
3130
0
        char md5_buf2[33];
3131
3132
0
        if (!(md5 = hts_md5_init())) {
3133
0
            hclose_abruptly(fp);
3134
0
            unlink(path_tmp.s);
3135
0
            free(path_tmp.s);
3136
0
            return -1;
3137
0
        }
3138
0
        hts_md5_update(md5, r->seq, r->length);
3139
0
        hts_md5_final(md5_buf1, md5);
3140
0
        hts_md5_destroy(md5);
3141
0
        hts_md5_hex(md5_buf2, md5_buf1);
3142
3143
0
        if (strncmp(tag->str+3, md5_buf2, 32) != 0) {
3144
0
            hts_log_error("Mismatching md5sum for downloaded reference");
3145
0
            hclose_abruptly(fp);
3146
0
            unlink(path_tmp.s);
3147
0
            free(path_tmp.s);
3148
0
            return -1;
3149
0
        }
3150
3151
0
        ssize_t length_written = hwrite(fp, r->seq, r->length);
3152
0
        if (hclose(fp) < 0 || length_written != r->length ||
3153
0
            chmod(path_tmp.s, 0444) < 0 ||
3154
0
            rename(path_tmp.s, path) < 0) {
3155
0
            hts_log_error("Creating reference at %s failed: %s",
3156
0
                          path, strerror(errno));
3157
0
            unlink(path_tmp.s);
3158
0
        }
3159
0
    }
3160
3161
0
    free(path_tmp.s);
3162
0
    return 0;
3163
0
}
3164
3165
1.51k
static void cram_ref_incr_locked(refs_t *r, int id) {
3166
1.51k
    RP("%d INC REF %d, %d %p\n", gettid(), id,
3167
1.51k
       (int)(id>=0 && r->ref_id[id]?r->ref_id[id]->count+1:-999),
3168
1.51k
       id>=0 && r->ref_id[id]?r->ref_id[id]->seq:(char *)1);
3169
3170
1.51k
    if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq)
3171
1.51k
        return;
3172
3173
0
    if (r->last_id == id)
3174
0
        r->last_id = -1;
3175
3176
0
    ++r->ref_id[id]->count;
3177
0
}
3178
3179
1.51k
void cram_ref_incr(refs_t *r, int id) {
3180
1.51k
    pthread_mutex_lock(&r->lock);
3181
1.51k
    cram_ref_incr_locked(r, id);
3182
1.51k
    pthread_mutex_unlock(&r->lock);
3183
1.51k
}
3184
3185
81
static void cram_ref_decr_locked(refs_t *r, int id) {
3186
81
    RP("%d DEC REF %d, %d %p\n", gettid(), id,
3187
81
       (int)(id>=0 && r->ref_id[id]?r->ref_id[id]->count-1:-999),
3188
81
       id>=0 && r->ref_id[id]?r->ref_id[id]->seq:(char *)1);
3189
3190
81
    if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq) {
3191
81
        return;
3192
81
    }
3193
3194
0
    if (--r->ref_id[id]->count <= 0) {
3195
0
        assert(r->ref_id[id]->count == 0);
3196
0
        if (r->last_id >= 0) {
3197
0
            if (r->ref_id[r->last_id]->count <= 0 &&
3198
0
                r->ref_id[r->last_id]->seq) {
3199
0
                RP("%d FREE REF %d (%p)\n", gettid(),
3200
0
                   r->last_id, r->ref_id[r->last_id]->seq);
3201
0
                ref_entry_free_seq(r->ref_id[r->last_id]);
3202
0
                if (r->ref_id[r->last_id]->is_md5) r->ref_id[r->last_id]->length = 0;
3203
0
            }
3204
0
        }
3205
0
        r->last_id = id;
3206
0
    }
3207
0
}
3208
3209
81
void cram_ref_decr(refs_t *r, int id) {
3210
81
    pthread_mutex_lock(&r->lock);
3211
81
    cram_ref_decr_locked(r, id);
3212
81
    pthread_mutex_unlock(&r->lock);
3213
81
}
3214
3215
/*
3216
 * Used by cram_ref_load and cram_get_ref. The file handle will have
3217
 * already been opened, so we can catch it. The ref_entry *e informs us
3218
 * of whether this is a multi-line fasta file or a raw MD5 style file.
3219
 * Either way we create a single contiguous sequence.
3220
 *
3221
 * Returns all or part of a reference sequence on success (malloced);
3222
 *         NULL on failure.
3223
 */
3224
static char *load_ref_portion(BGZF *fp, ref_entry *e,
3225
0
                              hts_pos_t start, hts_pos_t end) {
3226
0
    off_t offset, len;
3227
0
    char *seq;
3228
3229
0
    if (end < start)
3230
0
        end = start;
3231
3232
    /*
3233
     * Compute locations in file. This is trivial for the MD5 files, but
3234
     * is still necessary for the fasta variants.
3235
     *
3236
     * Note the offset here, as with faidx, has the assumption that white-
3237
     * space (the diff between line_length and bases_per_line) only occurs
3238
     * at the end of a line of text.
3239
     */
3240
0
    offset = e->line_length
3241
0
        ? e->offset + (start-1)/e->bases_per_line * e->line_length +
3242
0
          (start-1) % e->bases_per_line
3243
0
        : start-1;
3244
3245
0
    len = (e->line_length
3246
0
           ? e->offset + (end-1)/e->bases_per_line * e->line_length +
3247
0
             (end-1) % e->bases_per_line
3248
0
           : end-1) - offset + 1;
3249
3250
0
    if (bgzf_useek(fp, offset, SEEK_SET) < 0) {
3251
0
        perror("bgzf_useek() on reference file");
3252
0
        return NULL;
3253
0
    }
3254
3255
0
    if (len == 0 || !(seq = malloc(len))) {
3256
0
        return NULL;
3257
0
    }
3258
3259
0
    if (len != bgzf_read(fp, seq, len)) {
3260
0
        perror("bgzf_read() on reference file");
3261
0
        free(seq);
3262
0
        return NULL;
3263
0
    }
3264
3265
    /* Strip white-space if required. */
3266
0
    if (len != end-start+1) {
3267
0
        hts_pos_t i, j;
3268
0
        char *cp = seq;
3269
0
        char *cp_to;
3270
3271
        // Copy up to the first white-space, and then repeatedly just copy
3272
        // bases_per_line verbatim, and use the slow method to end again.
3273
        //
3274
        // This may seem excessive, but this code can be a significant
3275
        // portion of total CRAM decode CPU time for shallow data sets.
3276
0
        for (i = j = 0; i < len; i++) {
3277
0
            if (!isspace_c(cp[i]))
3278
0
                cp[j++] = cp[i] & ~0x20;
3279
0
            else
3280
0
                break;
3281
0
        }
3282
0
        while (i < len && isspace_c(cp[i]))
3283
0
            i++;
3284
0
        while (i < len - e->line_length) {
3285
0
            hts_pos_t j_end = j + e->bases_per_line;
3286
0
            while (j < j_end)
3287
0
                cp[j++] = cp[i++] & ~0x20; // toupper equiv
3288
0
            i += e->line_length - e->bases_per_line;
3289
0
        }
3290
0
        for (; i < len; i++) {
3291
0
            if (!isspace_c(cp[i]))
3292
0
                cp[j++] = cp[i] & ~0x20;
3293
0
        }
3294
3295
0
        cp_to = cp+j;
3296
3297
0
        if (cp_to - seq != end-start+1) {
3298
0
            hts_log_error("Malformed reference file");
3299
0
            free(seq);
3300
0
            return NULL;
3301
0
        }
3302
0
    } else {
3303
0
        int i;
3304
0
        for (i = 0; i < len; i++) {
3305
0
            seq[i] = toupper_c(seq[i]);
3306
0
        }
3307
0
    }
3308
3309
0
    return seq;
3310
0
}
3311
3312
/*
3313
 * Load the entire reference 'id'.
3314
 * This also increments the reference count by 1.
3315
 *
3316
 * Returns ref_entry on success;
3317
 *         NULL on failure
3318
 */
3319
0
ref_entry *cram_ref_load(refs_t *r, int id, int is_md5) {
3320
0
    ref_entry *e = r->ref_id[id];
3321
0
    hts_pos_t start = 1, end = e->length;
3322
0
    char *seq;
3323
3324
0
    if (e->seq) {
3325
0
        return e;
3326
0
    }
3327
3328
0
    assert(e->count == 0);
3329
3330
0
    if (r->last) {
3331
#ifdef REF_DEBUG
3332
        int idx = 0;
3333
        for (idx = 0; idx < r->nref; idx++)
3334
            if (r->last == r->ref_id[idx])
3335
                break;
3336
        RP("%d cram_ref_load DECR %d\n", gettid(), idx);
3337
#endif
3338
0
        assert(r->last->count > 0);
3339
0
        if (--r->last->count <= 0) {
3340
0
            RP("%d FREE REF %d (%p)\n", gettid(), id, r->ref_id[id]->seq);
3341
0
            if (r->last->seq)
3342
0
                ref_entry_free_seq(r->last);
3343
0
        }
3344
0
    }
3345
3346
0
    if (!r->fn)
3347
0
        return NULL;
3348
3349
    /* Open file if it's not already the current open reference */
3350
0
    if (strcmp(r->fn, e->fn) || r->fp == NULL) {
3351
0
        if (r->fp)
3352
0
            if (bgzf_close(r->fp) != 0)
3353
0
                return NULL;
3354
0
        r->fn = e->fn;
3355
0
        if (!(r->fp = bgzf_open_ref(r->fn, "r", is_md5)))
3356
0
            return NULL;
3357
0
    }
3358
3359
0
    RP("%d Loading ref %d (%d..%d)\n", gettid(), id, start, end);
3360
3361
0
    if (!(seq = load_ref_portion(r->fp, e, start, end))) {
3362
0
        return NULL;
3363
0
    }
3364
3365
0
    RP("%d Loaded ref %d (%d..%d) = %p\n", gettid(), id, start, end, seq);
3366
3367
0
    RP("%d INC REF %d, %"PRId64"\n", gettid(), id, (e->count+1));
3368
0
    e->seq = seq;
3369
0
    e->mf = NULL;
3370
0
    e->count++;
3371
3372
    /*
3373
     * Also keep track of last used ref so incr/decr loops on the same
3374
     * sequence don't cause load/free loops.
3375
     */
3376
0
    RP("%d cram_ref_load INCR %d => %"PRId64"\n", gettid(), id, e->count+1);
3377
0
    r->last = e;
3378
0
    e->count++;
3379
3380
0
    return e;
3381
0
}
3382
3383
/*
3384
 * Returns a portion of a reference sequence from start to end inclusive.
3385
 * The returned pointer is owned by either the cram_file fd or by the
3386
 * internal refs_t structure and should not be freed  by the caller.
3387
 *
3388
 * The difference is whether or not this refs_t is in use by just the one
3389
 * cram_fd or by multiples, or whether we have multiple threads accessing
3390
 * references. In either case fd->shared will be true and we start using
3391
 * reference counting to track the number of users of a specific reference
3392
 * sequence.
3393
 *
3394
 * Otherwise the ref seq returned is allocated as part of cram_fd itself
3395
 * and will be freed up on the next call to cram_get_ref or cram_close.
3396
 *
3397
 * To return the entire reference sequence, specify start as 1 and end
3398
 * as 0.
3399
 *
3400
 * To cease using a reference, call cram_ref_decr().
3401
 *
3402
 * Returns reference on success,
3403
 *         NULL on failure
3404
 */
3405
7.53k
char *cram_get_ref(cram_fd *fd, int id, hts_pos_t start, hts_pos_t end) {
3406
7.53k
    ref_entry *r;
3407
7.53k
    char *seq;
3408
7.53k
    int ostart = start;
3409
3410
7.53k
    if (id == -1 || start < 1)
3411
3.45k
        return NULL;
3412
3413
    /* FIXME: axiomatic query of r->seq being true?
3414
     * Or shortcut for unsorted data where we load once and never free?
3415
     */
3416
3417
    //fd->shared_ref = 1; // hard code for now to simplify things
3418
3419
4.08k
    pthread_mutex_lock(&fd->ref_lock);
3420
3421
4.08k
    RP("%d cram_get_ref on fd %p, id %d, range %d..%d\n", gettid(), fd, id, start, end);
3422
3423
    /*
3424
     * Unsorted data implies we want to fetch an entire reference at a time.
3425
     * We just deal with this at the moment by claiming we're sharing
3426
     * references instead, which has the same requirement.
3427
     */
3428
4.08k
    if (fd->unsorted)
3429
0
        fd->shared_ref = 1;
3430
3431
3432
    /* Sanity checking: does this ID exist? */
3433
4.08k
    if (id >= fd->refs->nref) {
3434
18
        hts_log_error("No reference found for id %d", id);
3435
18
        pthread_mutex_unlock(&fd->ref_lock);
3436
18
        return NULL;
3437
18
    }
3438
3439
4.06k
    if (!fd->refs || !fd->refs->ref_id[id]) {
3440
0
        hts_log_error("No reference found for id %d", id);
3441
0
        pthread_mutex_unlock(&fd->ref_lock);
3442
0
        return NULL;
3443
0
    }
3444
3445
4.06k
    if (!(r = fd->refs->ref_id[id])) {
3446
0
        hts_log_error("No reference found for id %d", id);
3447
0
        pthread_mutex_unlock(&fd->ref_lock);
3448
0
        return NULL;
3449
0
    }
3450
3451
3452
    /*
3453
     * It has an entry, but may not have been populated yet.
3454
     * Any manually loaded .fai files have their lengths known.
3455
     * A ref entry computed from @SQ lines (M5 or UR field) will have
3456
     * r->length == 0 unless it's been loaded once and verified that we have
3457
     * an on-disk filename for it.
3458
     *
3459
     * 19 Sep 2013: Moved the lock here as the cram_populate_ref code calls
3460
     * open_path_mfile and libcurl, which isn't multi-thread safe unless I
3461
     * rewrite my code to have one curl handle per thread.
3462
     */
3463
4.06k
    pthread_mutex_lock(&fd->refs->lock);
3464
4.06k
    if (r->length == 0) {
3465
4.06k
        if (fd->ref_fn)
3466
0
            hts_log_warning("Reference file given, but ref '%s' not present",
3467
4.06k
                            r->name);
3468
4.06k
        if (cram_populate_ref(fd, id, r) == -1) {
3469
4.06k
            hts_log_warning("Failed to populate reference \"%s\"",
3470
4.06k
                            r->name);
3471
4.06k
            hts_log_warning("See https://www.htslib.org/doc/reference_seqs.html for further suggestions");
3472
4.06k
            pthread_mutex_unlock(&fd->refs->lock);
3473
4.06k
            pthread_mutex_unlock(&fd->ref_lock);
3474
4.06k
            return NULL;
3475
4.06k
        }
3476
0
        r = fd->refs->ref_id[id];
3477
0
        if (fd->unsorted)
3478
0
            cram_ref_incr_locked(fd->refs, id);
3479
0
    }
3480
3481
3482
    /*
3483
     * We now know that we the filename containing the reference, so check
3484
     * for limits. If it's over half the reference we'll load all of it in
3485
     * memory as this will speed up subsequent calls.
3486
     */
3487
0
    if (end < 1)
3488
0
        end = r->length;
3489
0
    if (end >= r->length)
3490
0
        end  = r->length;
3491
3492
0
    if (end - start >= 0.5*r->length || fd->shared_ref) {
3493
0
        start = 1;
3494
0
        end = r->length;
3495
0
    }
3496
3497
    /*
3498
     * Maybe we have it cached already? If so use it.
3499
     *
3500
     * Alternatively if we don't have the sequence but we're sharing
3501
     * references and/or are asking for the entire length of it, then
3502
     * load the full reference into the refs structure and return
3503
     * a pointer to that one instead.
3504
     */
3505
0
    if (fd->shared_ref || r->seq || (start == 1 && end == r->length)) {
3506
0
        char *cp;
3507
3508
0
        if (id >= 0) {
3509
0
            if (r->seq) {
3510
0
                cram_ref_incr_locked(fd->refs, id);
3511
0
            } else {
3512
0
                ref_entry *e;
3513
0
                if (!(e = cram_ref_load(fd->refs, id, r->is_md5))) {
3514
0
                    pthread_mutex_unlock(&fd->refs->lock);
3515
0
                    pthread_mutex_unlock(&fd->ref_lock);
3516
0
                    return NULL;
3517
0
                }
3518
3519
                /* unsorted data implies cache ref indefinitely, to avoid
3520
                 * continually loading and unloading.
3521
                 */
3522
0
                if (fd->unsorted)
3523
0
                    cram_ref_incr_locked(fd->refs, id);
3524
0
            }
3525
3526
0
            fd->ref = NULL; /* We never access it directly */
3527
0
            fd->ref_start = 1;
3528
0
            fd->ref_end   = r->length;
3529
0
            fd->ref_id    = id;
3530
3531
0
            cp = fd->refs->ref_id[id]->seq + ostart-1;
3532
0
        } else {
3533
0
            fd->ref = NULL;
3534
0
            cp = NULL;
3535
0
        }
3536
3537
0
        RP("%d cram_get_ref returning for id %d, count %d\n", gettid(), id, (int)r->count);
3538
3539
0
        pthread_mutex_unlock(&fd->refs->lock);
3540
0
        pthread_mutex_unlock(&fd->ref_lock);
3541
0
        return cp;
3542
0
    }
3543
3544
    /*
3545
     * Otherwise we're not sharing, we don't have a copy of it already and
3546
     * we're only asking for a small portion of it.
3547
     *
3548
     * In this case load up just that segment ourselves, freeing any old
3549
     * small segments in the process.
3550
     */
3551
3552
    /* Unmapped ref ID */
3553
0
    if (id < 0 || !fd->refs->fn) {
3554
0
        if (fd->ref_free) {
3555
0
            free(fd->ref_free);
3556
0
            fd->ref_free = NULL;
3557
0
        }
3558
0
        fd->ref = NULL;
3559
0
        fd->ref_id = id;
3560
0
        pthread_mutex_unlock(&fd->refs->lock);
3561
0
        pthread_mutex_unlock(&fd->ref_lock);
3562
0
        return NULL;
3563
0
    }
3564
3565
    /* Open file if it's not already the current open reference */
3566
0
    if (strcmp(fd->refs->fn, r->fn) || fd->refs->fp == NULL) {
3567
0
        if (fd->refs->fp)
3568
0
            if (bgzf_close(fd->refs->fp) != 0)
3569
0
                return NULL;
3570
0
        fd->refs->fn = r->fn;
3571
0
        if (!(fd->refs->fp = bgzf_open_ref(fd->refs->fn, "r", r->is_md5))) {
3572
0
            pthread_mutex_unlock(&fd->refs->lock);
3573
0
            pthread_mutex_unlock(&fd->ref_lock);
3574
0
            return NULL;
3575
0
        }
3576
0
    }
3577
3578
0
    if (!(fd->ref = load_ref_portion(fd->refs->fp, r, start, end))) {
3579
0
        pthread_mutex_unlock(&fd->refs->lock);
3580
0
        pthread_mutex_unlock(&fd->ref_lock);
3581
0
        return NULL;
3582
0
    }
3583
3584
0
    if (fd->ref_free)
3585
0
        free(fd->ref_free);
3586
3587
0
    fd->ref_id    = id;
3588
0
    fd->ref_start = start;
3589
0
    fd->ref_end   = end;
3590
0
    fd->ref_free = fd->ref;
3591
0
    seq = fd->ref;
3592
3593
0
    pthread_mutex_unlock(&fd->refs->lock);
3594
0
    pthread_mutex_unlock(&fd->ref_lock);
3595
3596
0
    return seq ? seq + ostart - start : NULL;
3597
0
}
3598
3599
/*
3600
 * If fd has been opened for reading, it may be permitted to specify 'fn'
3601
 * as NULL and let the code auto-detect the reference by parsing the
3602
 * SAM header @SQ lines.
3603
 */
3604
0
int cram_load_reference(cram_fd *fd, char *fn) {
3605
0
    int ret = 0;
3606
3607
0
    if (fn) {
3608
0
        fd->refs = refs_load_fai(fd->refs, fn,
3609
0
                                 !(fd->embed_ref>0 && fd->mode == 'r'));
3610
0
        fn = fd->refs ? fd->refs->fn : NULL;
3611
0
        if (!fn)
3612
0
            ret = -1;
3613
0
        sanitise_SQ_lines(fd);
3614
0
    }
3615
0
    fd->ref_fn = fn;
3616
3617
0
    if ((!fd->refs || (fd->refs->nref == 0 && !fn)) && fd->header) {
3618
0
        if (fd->refs)
3619
0
            refs_free(fd->refs);
3620
0
        if (!(fd->refs = refs_create()))
3621
0
            return -1;
3622
0
        if (-1 == refs_from_header(fd))
3623
0
            return -1;
3624
0
    }
3625
3626
0
    if (fd->header)
3627
0
        if (-1 == refs2id(fd->refs, fd->header))
3628
0
            return -1;
3629
3630
0
    return ret;
3631
0
}
3632
3633
/* ----------------------------------------------------------------------
3634
 * Containers
3635
 */
3636
3637
/*
3638
 * Creates a new container, specifying the maximum number of slices
3639
 * and records permitted.
3640
 *
3641
 * Returns cram_container ptr on success
3642
 *         NULL on failure
3643
 */
3644
48.0k
cram_container *cram_new_container(int nrec, int nslice) {
3645
48.0k
    cram_container *c = calloc(1, sizeof(*c));
3646
48.0k
    enum cram_DS_ID id;
3647
3648
48.0k
    if (!c)
3649
0
        return NULL;
3650
3651
48.0k
    c->curr_ref = -2;
3652
3653
48.0k
    c->max_c_rec = nrec * nslice;
3654
48.0k
    c->curr_c_rec = 0;
3655
3656
48.0k
    c->max_rec = nrec;
3657
48.0k
    c->record_counter = 0;
3658
48.0k
    c->num_bases = 0;
3659
48.0k
    c->s_num_bases = 0;
3660
3661
48.0k
    c->max_slice = nslice;
3662
48.0k
    c->curr_slice = 0;
3663
3664
48.0k
    c->pos_sorted = 1;
3665
48.0k
    c->max_apos   = 0;
3666
48.0k
    c->multi_seq  = 0;
3667
48.0k
    c->qs_seq_orient = 1;
3668
48.0k
    c->no_ref = 0;
3669
48.0k
    c->embed_ref = -1; // automatic selection
3670
3671
48.0k
    c->bams = NULL;
3672
3673
48.0k
    if (!(c->slices = calloc(nslice != 0 ? nslice : 1, sizeof(cram_slice *))))
3674
0
        goto err;
3675
48.0k
    c->slice = NULL;
3676
3677
48.0k
    if (!(c->comp_hdr = cram_new_compression_header()))
3678
0
        goto err;
3679
48.0k
    c->comp_hdr_block = NULL;
3680
3681
1.39M
    for (id = DS_RN; id < DS_TN; id++)
3682
1.34M
        if (!(c->stats[id] = cram_stats_create())) goto err;
3683
3684
    //c->aux_B_stats = cram_stats_create();
3685
3686
48.0k
    if (!(c->tags_used = kh_init(m_tagmap)))
3687
0
        goto err;
3688
48.0k
    c->refs_used = 0;
3689
48.0k
    c->ref_free = 0;
3690
3691
48.0k
    return c;
3692
3693
0
 err:
3694
0
    if (c) {
3695
0
        if (c->slices)
3696
0
            free(c->slices);
3697
0
        free(c);
3698
0
    }
3699
0
    return NULL;
3700
48.0k
}
3701
3702
4.98k
static void free_bam_list(bam_seq_t **bams, int max_rec) {
3703
4.98k
    int i;
3704
49.8M
    for (i = 0; i < max_rec; i++)
3705
49.8M
        bam_free(bams[i]);
3706
3707
4.98k
    free(bams);
3708
4.98k
}
3709
3710
76.0k
void cram_free_container(cram_container *c) {
3711
76.0k
    enum cram_DS_ID id;
3712
76.0k
    int i;
3713
3714
76.0k
    if (!c)
3715
0
        return;
3716
3717
76.0k
    if (c->refs_used)
3718
641
        free(c->refs_used);
3719
3720
76.0k
    if (c->landmark)
3721
49.4k
        free(c->landmark);
3722
3723
76.0k
    if (c->comp_hdr)
3724
48.4k
        cram_free_compression_header(c->comp_hdr);
3725
3726
76.0k
    if (c->comp_hdr_block)
3727
44.0k
        cram_free_block(c->comp_hdr_block);
3728
3729
    // Free the slices; filled out by encoder only
3730
76.0k
    if (c->slices) {
3731
86.3k
        for (i = 0; i < c->max_slice; i++) {
3732
38.3k
            if (c->slices[i])
3733
4.98k
                cram_free_slice(c->slices[i]);
3734
38.3k
            if (c->slices[i] == c->slice)
3735
38.3k
                c->slice = NULL;
3736
38.3k
        }
3737
48.0k
        free(c->slices);
3738
48.0k
    }
3739
3740
    // Free the current slice; set by both encoder & decoder
3741
76.0k
    if (c->slice) {
3742
0
        cram_free_slice(c->slice);
3743
0
        c->slice = NULL;
3744
0
    }
3745
3746
2.20M
    for (id = DS_RN; id < DS_TN; id++)
3747
2.12M
        if (c->stats[id]) cram_stats_free(c->stats[id]);
3748
3749
    //if (c->aux_B_stats) cram_stats_free(c->aux_B_stats);
3750
3751
76.0k
    if (c->tags_used) {
3752
48.0k
        khint_t k;
3753
3754
290k
        for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
3755
242k
            if (!kh_exist(c->tags_used, k))
3756
132k
                continue;
3757
3758
109k
            cram_tag_map *tm = (cram_tag_map *)kh_val(c->tags_used, k);
3759
109k
            if (tm) {
3760
109k
                cram_codec *c = tm->codec;
3761
3762
109k
                if (c) c->free(c);
3763
3764
                // If tm->blk or tm->blk2 is set, then we haven't yet got to
3765
                // cram_encode_container which copies the blocks to s->aux_block
3766
                // and NULLifies tm->blk*.  In this case we failed to complete
3767
                // the container construction, so we have to free up our partially
3768
                // converted CRAM.
3769
109k
                cram_free_block(tm->blk);
3770
109k
                cram_free_block(tm->blk2);
3771
109k
                free(tm);
3772
109k
            }
3773
109k
        }
3774
3775
48.0k
        kh_destroy(m_tagmap, c->tags_used);
3776
48.0k
    }
3777
3778
76.0k
    if (c->ref_free)
3779
17.9k
        free(c->ref);
3780
3781
76.0k
    if (c->bams)
3782
381
        free_bam_list(c->bams, c->max_c_rec);
3783
3784
76.0k
    free(c);
3785
76.0k
}
3786
3787
/*
3788
 * Reads a container header.
3789
 *
3790
 * Returns cram_container on success
3791
 *         NULL on failure or no container left (fd->err == 0).
3792
 */
3793
28.5k
cram_container *cram_read_container(cram_fd *fd) {
3794
28.5k
    cram_container c2, *c;
3795
28.5k
    int i, s;
3796
28.5k
    size_t rd = 0;
3797
28.5k
    uint32_t crc = 0;
3798
3799
28.5k
    fd->err = 0;
3800
28.5k
    fd->eof = 0;
3801
3802
28.5k
    memset(&c2, 0, sizeof(c2));
3803
28.5k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
3804
23.1k
        if ((s = fd->vv.varint_decode32_crc(fd, &c2.length, &crc)) == -1) {
3805
57
            fd->eof = fd->empty_container ? 1 : 2;
3806
57
            return NULL;
3807
23.0k
        } else {
3808
23.0k
            rd+=s;
3809
23.0k
        }
3810
23.1k
    } else if (CRAM_MAJOR_VERS(fd->version) < 4) {
3811
2.00k
        uint32_t len;
3812
2.00k
        if ((s = int32_decode(fd, &c2.length)) == -1) {
3813
30
            if (CRAM_MAJOR_VERS(fd->version) == 2 &&
3814
22
                CRAM_MINOR_VERS(fd->version) == 0)
3815
13
                fd->eof = 1; // EOF blocks arrived in v2.1
3816
17
            else
3817
17
                fd->eof = fd->empty_container ? 1 : 2;
3818
30
            return NULL;
3819
1.97k
        } else {
3820
1.97k
            rd+=s;
3821
1.97k
        }
3822
1.97k
        len = le_int4(c2.length);
3823
1.97k
        crc = crc32(0L, (unsigned char *)&len, 4);
3824
3.42k
    } else {
3825
3.42k
        if ((s = fd->vv.varint_decode32_crc(fd, &c2.length, &crc))   == -1) {
3826
14
            fd->eof = fd->empty_container ? 1 : 2;
3827
14
            return NULL;
3828
3.40k
        } else {
3829
3.40k
            rd+=s;
3830
3.40k
        }
3831
3.42k
    }
3832
28.4k
    if ((s = fd->vv.varint_decode32s_crc(fd, &c2.ref_seq_id, &crc))   == -1) return NULL; else rd+=s;
3833
28.3k
    if (CRAM_MAJOR_VERS(fd->version) >= 4) {
3834
3.39k
        int64_t i64;
3835
3.39k
        if ((s = fd->vv.varint_decode64_crc(fd, &i64, &crc))== -1) return NULL; else rd+=s;
3836
3.38k
        c2.ref_seq_start = i64;
3837
3.38k
        if ((s = fd->vv.varint_decode64_crc(fd, &i64, &crc)) == -1) return NULL; else rd+=s;
3838
3.37k
        c2.ref_seq_span = i64;
3839
24.9k
    } else {
3840
24.9k
        int32_t i32;
3841
24.9k
        if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc))== -1) return NULL; else rd+=s;
3842
24.9k
        c2.ref_seq_start = i32;
3843
24.9k
        if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc)) == -1) return NULL; else rd+=s;
3844
24.8k
        c2.ref_seq_span = i32;
3845
24.8k
    }
3846
28.2k
    if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_records, &crc))  == -1) return NULL; else rd+=s;
3847
3848
28.2k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
3849
22.8k
        c2.record_counter = 0;
3850
22.8k
        c2.num_bases = 0;
3851
22.8k
    } else {
3852
5.31k
        if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3853
3.58k
            if ((s = fd->vv.varint_decode64_crc(fd, &c2.record_counter, &crc)) == -1)
3854
10
                return NULL;
3855
3.57k
            else
3856
3.57k
                rd += s;
3857
3.58k
        } else {
3858
1.72k
            int32_t i32;
3859
1.72k
            if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc)) == -1)
3860
5
                return NULL;
3861
1.72k
            else
3862
1.72k
                rd += s;
3863
1.72k
            c2.record_counter = i32;
3864
1.72k
        }
3865
3866
5.29k
        if ((s = fd->vv.varint_decode64_crc(fd, &c2.num_bases, &crc))== -1)
3867
31
            return NULL;
3868
5.26k
        else
3869
5.26k
            rd += s;
3870
5.29k
    }
3871
28.1k
    if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_blocks, &crc))   == -1)
3872
31
        return NULL;
3873
28.1k
    else
3874
28.1k
        rd+=s;
3875
28.1k
    if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_landmarks, &crc))== -1)
3876
26
        return NULL;
3877
28.1k
    else
3878
28.1k
        rd+=s;
3879
3880
28.1k
    if (c2.num_landmarks < 0 || c2.num_landmarks >= SIZE_MAX / sizeof(int32_t))
3881
59
        return NULL;
3882
3883
28.0k
    if (!(c = calloc(1, sizeof(*c))))
3884
0
        return NULL;
3885
3886
28.0k
    *c = c2;
3887
28.0k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
3888
28.0k
    if (c->num_landmarks > FUZZ_ALLOC_LIMIT/sizeof(int32_t)) {
3889
10
        fd->err = errno = ENOMEM;
3890
10
        cram_free_container(c);
3891
10
        return NULL;
3892
10
    }
3893
28.0k
#endif
3894
28.0k
    if (c->num_landmarks && !(c->landmark = malloc(c->num_landmarks * sizeof(int32_t)))) {
3895
0
        fd->err = errno;
3896
0
        cram_free_container(c);
3897
0
        return NULL;
3898
0
    }
3899
103k
    for (i = 0; i < c->num_landmarks; i++) {
3900
75.9k
        if ((s = fd->vv.varint_decode32_crc(fd, &c->landmark[i], &crc)) == -1) {
3901
197
            cram_free_container(c);
3902
197
            return NULL;
3903
75.7k
        } else {
3904
75.7k
            rd += s;
3905
75.7k
        }
3906
75.9k
    }
3907
3908
27.8k
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3909
3.46k
        if (-1 == int32_decode(fd, (int32_t *)&c->crc32)) {
3910
20
            cram_free_container(c);
3911
20
            return NULL;
3912
3.44k
        } else {
3913
3.44k
            rd+=4;
3914
3.44k
        }
3915
3916
3.44k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
3917
        // Pretend the CRC was OK so the fuzzer doesn't have to get it right
3918
3.44k
        crc = c->crc32;
3919
3.44k
#endif
3920
3921
3.44k
        if (crc != c->crc32) {
3922
0
            hts_log_error("Container header CRC32 failure");
3923
0
            cram_free_container(c);
3924
0
            return NULL;
3925
0
        }
3926
3.44k
    }
3927
3928
27.8k
    c->offset = rd;
3929
27.8k
    c->slices = NULL;
3930
27.8k
    c->slice = NULL;
3931
27.8k
    c->curr_slice = 0;
3932
27.8k
    c->max_slice = c->num_landmarks;
3933
27.8k
    c->slice_rec = 0;
3934
27.8k
    c->curr_rec = 0;
3935
27.8k
    c->max_rec = 0;
3936
3937
27.8k
    if (c->ref_seq_id == -2) {
3938
32
        c->multi_seq = 1;
3939
32
        fd->multi_seq = 1;
3940
32
    }
3941
3942
27.8k
    fd->empty_container =
3943
27.8k
        (c->num_records == 0 &&
3944
19.9k
         c->ref_seq_id == -1 &&
3945
27.8k
         c->ref_seq_start == 0x454f46 /* EOF */) ? 1 : 0;
3946
3947
27.8k
    return c;
3948
27.8k
}
3949
3950
3951
/* MAXIMUM storage size needed for the container. */
3952
0
int cram_container_size(cram_container *c) {
3953
0
    return 55 + 5*c->num_landmarks;
3954
0
}
3955
3956
3957
/*
3958
 * Stores the container structure in dat and returns *size as the
3959
 * number of bytes written to dat[].  The input size of dat is also
3960
 * held in *size and should be initialised to cram_container_size(c).
3961
 *
3962
 * Returns 0 on success;
3963
 *        -1 on failure
3964
 */
3965
int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size)
3966
0
{
3967
0
    char *cp = (char *)dat;
3968
0
    int i;
3969
3970
    // Check the input buffer is large enough according to our stated
3971
    // requirements. (NOTE: it may actually take less.)
3972
0
    if (cram_container_size(c) > *size)
3973
0
        return -1;
3974
3975
0
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
3976
0
        cp += itf8_put(cp, c->length);
3977
0
    } else {
3978
0
        *(int32_t *)cp = le_int4(c->length);
3979
0
        cp += 4;
3980
0
    }
3981
0
    if (c->multi_seq) {
3982
0
        cp += fd->vv.varint_put32(cp, NULL, -2);
3983
0
        cp += fd->vv.varint_put32(cp, NULL, 0);
3984
0
        cp += fd->vv.varint_put32(cp, NULL, 0);
3985
0
    } else {
3986
0
        cp += fd->vv.varint_put32s(cp, NULL, c->ref_seq_id);
3987
0
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
3988
0
            cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_start);
3989
0
            cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_span);
3990
0
        } else {
3991
0
            cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_start);
3992
0
            cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_span);
3993
0
        }
3994
0
    }
3995
0
    cp += fd->vv.varint_put32(cp, NULL, c->num_records);
3996
0
    if (CRAM_MAJOR_VERS(fd->version) == 2) {
3997
0
        cp += fd->vv.varint_put64(cp, NULL, c->record_counter);
3998
0
    } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3999
0
        cp += fd->vv.varint_put32(cp, NULL, c->record_counter);
4000
0
    }
4001
0
    cp += fd->vv.varint_put64(cp, NULL, c->num_bases);
4002
0
    cp += fd->vv.varint_put32(cp, NULL, c->num_blocks);
4003
0
    cp += fd->vv.varint_put32(cp, NULL, c->num_landmarks);
4004
0
    for (i = 0; i < c->num_landmarks; i++)
4005
0
        cp += fd->vv.varint_put32(cp, NULL, c->landmark[i]);
4006
4007
0
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
4008
0
        c->crc32 = crc32(0L, (uc *)dat, cp-dat);
4009
0
        cp[0] =  c->crc32        & 0xff;
4010
0
        cp[1] = (c->crc32 >>  8) & 0xff;
4011
0
        cp[2] = (c->crc32 >> 16) & 0xff;
4012
0
        cp[3] = (c->crc32 >> 24) & 0xff;
4013
0
        cp += 4;
4014
0
    }
4015
4016
0
    *size = cp-dat; // actual used size
4017
4018
0
    return 0;
4019
0
}
4020
4021
4022
/*
4023
 * Writes a container structure.
4024
 *
4025
 * Returns 0 on success
4026
 *        -1 on failure
4027
 */
4028
57.1k
int cram_write_container(cram_fd *fd, cram_container *c) {
4029
57.1k
    char buf_a[1024], *buf = buf_a, *cp;
4030
57.1k
    int i;
4031
4032
57.1k
    if (61 + c->num_landmarks * 10 >= 1024) {
4033
0
        buf = malloc(61 + c->num_landmarks * 10);
4034
0
        if (!buf)
4035
0
            return -1;
4036
0
    }
4037
57.1k
    cp = buf;
4038
4039
57.1k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
4040
0
        cp += itf8_put(cp, c->length);
4041
57.1k
    } else if (CRAM_MAJOR_VERS(fd->version) <= 3) {
4042
57.1k
        *(int32_t *)cp = le_int4(c->length);
4043
57.1k
        cp += 4;
4044
57.1k
    } else {
4045
0
        cp += fd->vv.varint_put32(cp, NULL, c->length);
4046
0
    }
4047
57.1k
    if (c->multi_seq) {
4048
631
        cp += fd->vv.varint_put32(cp, NULL, (uint32_t)-2);
4049
631
        cp += fd->vv.varint_put32(cp, NULL, 0);
4050
631
        cp += fd->vv.varint_put32(cp, NULL, 0);
4051
56.4k
    } else {
4052
56.4k
        cp += fd->vv.varint_put32s(cp, NULL, c->ref_seq_id);
4053
56.4k
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
4054
0
            cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_start);
4055
0
            cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_span);
4056
56.4k
        } else {
4057
56.4k
            cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_start);
4058
56.4k
            cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_span);
4059
56.4k
        }
4060
56.4k
    }
4061
57.1k
    cp += fd->vv.varint_put32(cp, NULL, c->num_records);
4062
57.1k
    if (CRAM_MAJOR_VERS(fd->version) >= 3)
4063
57.1k
        cp += fd->vv.varint_put64(cp, NULL, c->record_counter);
4064
0
    else
4065
0
        cp += fd->vv.varint_put32(cp, NULL, c->record_counter);
4066
57.1k
    cp += fd->vv.varint_put64(cp, NULL, c->num_bases);
4067
57.1k
    cp += fd->vv.varint_put32(cp, NULL, c->num_blocks);
4068
57.1k
    cp += fd->vv.varint_put32(cp, NULL, c->num_landmarks);
4069
114k
    for (i = 0; i < c->num_landmarks; i++)
4070
56.9k
        cp += fd->vv.varint_put32(cp, NULL, c->landmark[i]);
4071
4072
57.1k
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
4073
57.1k
        c->crc32 = crc32(0L, (uc *)buf, cp-buf);
4074
57.1k
        cp[0] =  c->crc32        & 0xff;
4075
57.1k
        cp[1] = (c->crc32 >>  8) & 0xff;
4076
57.1k
        cp[2] = (c->crc32 >> 16) & 0xff;
4077
57.1k
        cp[3] = (c->crc32 >> 24) & 0xff;
4078
57.1k
        cp += 4;
4079
57.1k
    }
4080
4081
57.1k
    if (cp-buf != hwrite(fd->fp, buf, cp-buf)) {
4082
0
        if (buf != buf_a)
4083
0
            free(buf);
4084
0
        return -1;
4085
0
    }
4086
4087
57.1k
    if (buf != buf_a)
4088
0
        free(buf);
4089
4090
57.1k
    return 0;
4091
57.1k
}
4092
4093
// common component shared by cram_flush_container{,_mt}
4094
37.5k
static int cram_flush_container2(cram_fd *fd, cram_container *c) {
4095
37.5k
    int i, j;
4096
4097
37.5k
    if (c->curr_slice > 0 && !c->slices)
4098
0
        return -1;
4099
4100
    //fprintf(stderr, "Writing container %d, sum %u\n", c->record_counter, sum);
4101
4102
37.5k
    off_t c_offset = htell(fd->fp); // File offset of container
4103
4104
    /* Write the container struct itself */
4105
37.5k
    if (0 != cram_write_container(fd, c))
4106
0
        return -1;
4107
4108
37.5k
    off_t hdr_size = htell(fd->fp) - c_offset;
4109
4110
    /* And the compression header */
4111
37.5k
    if (0 != cram_write_block(fd, c->comp_hdr_block))
4112
0
        return -1;
4113
4114
    /* Followed by the slice blocks */
4115
37.5k
    off_t file_offset = htell(fd->fp);
4116
75.1k
    for (i = 0; i < c->curr_slice; i++) {
4117
37.5k
        cram_slice *s = c->slices[i];
4118
37.5k
        off_t spos = file_offset - c_offset - hdr_size;
4119
4120
37.5k
        if (0 != cram_write_block(fd, s->hdr_block))
4121
0
            return -1;
4122
4123
275k
        for (j = 0; j < s->hdr->num_blocks; j++) {
4124
237k
            if (0 != cram_write_block(fd, s->block[j]))
4125
0
                return -1;
4126
237k
        }
4127
4128
37.5k
        file_offset = htell(fd->fp);
4129
37.5k
        off_t sz = file_offset - c_offset - hdr_size - spos;
4130
4131
37.5k
        if (fd->idxfp) {
4132
0
            if (cram_index_slice(fd, c, s, fd->idxfp, c_offset, spos, sz) < 0)
4133
0
                return -1;
4134
0
        }
4135
37.5k
    }
4136
4137
37.5k
    return 0;
4138
37.5k
}
4139
4140
/*
4141
 * Flushes a completely or partially full container to disk, writing
4142
 * container structure, header and blocks. This also calls the encoder
4143
 * functions.
4144
 *
4145
 * Returns 0 on success
4146
 *        -1 on failure
4147
 */
4148
38.3k
int cram_flush_container(cram_fd *fd, cram_container *c) {
4149
    /* Encode the container blocks and generate compression header */
4150
38.3k
    if (0 != cram_encode_container(fd, c))
4151
747
        return -1;
4152
4153
37.5k
    return cram_flush_container2(fd, c);
4154
38.3k
}
4155
4156
typedef struct {
4157
    cram_fd *fd;
4158
    cram_container *c;
4159
} cram_job;
4160
4161
0
void *cram_flush_thread(void *arg) {
4162
0
    cram_job *j = (cram_job *)arg;
4163
4164
    /* Encode the container blocks and generate compression header */
4165
0
    if (0 != cram_encode_container(j->fd, j->c)) {
4166
0
        hts_log_error("Call to cram_encode_container failed");
4167
0
        return NULL;
4168
0
    }
4169
4170
0
    return arg;
4171
0
}
4172
4173
0
static int cram_flush_result(cram_fd *fd) {
4174
0
    int i, ret = 0;
4175
0
    hts_tpool_result *r;
4176
0
    cram_container *lc = NULL;
4177
4178
    // NB: we can have one result per slice, not per container,
4179
    // so we need to free the container only after all slices
4180
    // within it have been freed.  (Automatic via reference counting.)
4181
0
    while ((r = hts_tpool_next_result(fd->rqueue))) {
4182
0
        cram_job *j = (cram_job *)hts_tpool_result_data(r);
4183
0
        cram_container *c;
4184
4185
0
        if (!j) {
4186
0
            hts_tpool_delete_result(r, 0);
4187
0
            return -1;
4188
0
        }
4189
4190
0
        fd = j->fd;
4191
0
        c = j->c;
4192
4193
0
        if (fd->mode == 'w')
4194
0
            if (0 != cram_flush_container2(fd, c))
4195
0
                return -1;
4196
4197
        // Free the slices; filled out by encoder only
4198
0
        if (c->slices) {
4199
0
            for (i = 0; i < c->max_slice; i++) {
4200
0
                if (c->slices[i])
4201
0
                    cram_free_slice(c->slices[i]);
4202
0
                if (c->slices[i] == c->slice)
4203
0
                    c->slice = NULL;
4204
0
                c->slices[i] = NULL;
4205
0
            }
4206
0
        }
4207
4208
        // Free the current slice; set by both encoder & decoder
4209
0
        if (c->slice) {
4210
0
            cram_free_slice(c->slice);
4211
0
            c->slice = NULL;
4212
0
        }
4213
0
        c->curr_slice = 0;
4214
4215
        // Our jobs will be in order, so we free the last
4216
        // container when our job has switched to a new one.
4217
0
        if (c != lc) {
4218
0
            if (lc) {
4219
0
                if (fd->ctr == lc)
4220
0
                    fd->ctr = NULL;
4221
0
                if (fd->ctr_mt == lc)
4222
0
                    fd->ctr_mt = NULL;
4223
0
                cram_free_container(lc);
4224
0
            }
4225
0
            lc = c;
4226
0
        }
4227
4228
0
        hts_tpool_delete_result(r, 1);
4229
0
    }
4230
0
    if (lc) {
4231
0
        if (fd->ctr == lc)
4232
0
            fd->ctr = NULL;
4233
0
        if (fd->ctr_mt == lc)
4234
0
            fd->ctr_mt = NULL;
4235
0
        cram_free_container(lc);
4236
0
    }
4237
4238
0
    return ret;
4239
0
}
4240
4241
// Note: called while metrics_lock is held.
4242
// Will be left in this state too, but may temporarily unlock.
4243
3.08k
void reset_metrics(cram_fd *fd) {
4244
3.08k
    int i;
4245
4246
3.08k
    if (fd->pool) {
4247
        // If multi-threaded we have multiple blocks being
4248
        // compressed already and several on the to-do list
4249
        // (fd->rqueue->pending).  It's tricky to reset the
4250
        // metrics exactly the correct point, so instead we
4251
        // just flush the pool, reset, and then continue again.
4252
4253
        // Don't bother starting a new trial before then though.
4254
0
        for (i = 0; i < DS_END; i++) {
4255
0
            cram_metrics *m = fd->m[i];
4256
0
            if (!m)
4257
0
                continue;
4258
0
            m->next_trial = 999;
4259
0
        }
4260
4261
0
        pthread_mutex_unlock(&fd->metrics_lock);
4262
0
        hts_tpool_process_flush(fd->rqueue);
4263
0
        pthread_mutex_lock(&fd->metrics_lock);
4264
0
    }
4265
4266
148k
    for (i = 0; i < DS_END; i++) {
4267
145k
        cram_metrics *m = fd->m[i];
4268
145k
        if (!m)
4269
0
            continue;
4270
4271
145k
        m->trial = NTRIALS;
4272
145k
        m->next_trial = TRIAL_SPAN;
4273
145k
        m->revised_method = 0;
4274
145k
        m->unpackable = 0;
4275
4276
145k
        memset(m->sz, 0, sizeof(m->sz));
4277
145k
    }
4278
3.08k
}
4279
4280
38.3k
int cram_flush_container_mt(cram_fd *fd, cram_container *c) {
4281
38.3k
    cram_job *j;
4282
4283
    // At the junction of mapped to unmapped data the compression
4284
    // methods may need to change due to very different statistical
4285
    // properties; particularly BA if minhash sorted.
4286
    //
4287
    // However with threading we'll have several in-flight blocks
4288
    // arriving out of order.
4289
    //
4290
    // So we do one trial reset of NThreads to last for NThreads
4291
    // duration to get us over this transition period, followed
4292
    // by another retrial of the usual ntrials & trial span.
4293
38.3k
    pthread_mutex_lock(&fd->metrics_lock);
4294
38.3k
    if (c->n_mapped < 0.3*c->curr_rec &&
4295
20.7k
        fd->last_mapped > 0.7*c->max_rec) {
4296
3.08k
        reset_metrics(fd);
4297
3.08k
    }
4298
38.3k
    fd->last_mapped = c->n_mapped * (c->max_rec+1)/(c->curr_rec+1) ;
4299
38.3k
    pthread_mutex_unlock(&fd->metrics_lock);
4300
4301
38.3k
    if (!fd->pool)
4302
38.3k
        return cram_flush_container(fd, c);
4303
4304
0
    if (!(j = malloc(sizeof(*j))))
4305
0
        return -1;
4306
0
    j->fd = fd;
4307
0
    j->c = c;
4308
4309
    // Flush the job.  Note our encoder queue may be full, so we
4310
    // either have to keep trying in non-blocking mode (what we do) or
4311
    // use a dedicated separate thread for draining the queue.
4312
0
    for (;;) {
4313
0
        errno = 0;
4314
0
        hts_tpool_dispatch2(fd->pool, fd->rqueue, cram_flush_thread, j, 1);
4315
0
        int pending = (errno == EAGAIN);
4316
0
        if (cram_flush_result(fd) != 0)
4317
0
            return -1;
4318
0
        if (!pending)
4319
0
            break;
4320
4321
0
        hts_usleep(1000);
4322
0
    }
4323
4324
0
    return 0;
4325
0
}
4326
4327
/* ----------------------------------------------------------------------
4328
 * Compression headers; the first part of the container
4329
 */
4330
4331
/*
4332
 * Creates a new blank container compression header
4333
 *
4334
 * Returns header ptr on success
4335
 *         NULL on failure
4336
 */
4337
48.0k
cram_block_compression_hdr *cram_new_compression_header(void) {
4338
48.0k
    cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr));
4339
48.0k
    if (!hdr)
4340
0
        return NULL;
4341
4342
48.0k
    if (!(hdr->TD_blk = cram_new_block(CORE, 0))) {
4343
0
        free(hdr);
4344
0
        return NULL;
4345
0
    }
4346
4347
48.0k
    if (!(hdr->TD_hash = kh_init(m_s2i))) {
4348
0
        cram_free_block(hdr->TD_blk);
4349
0
        free(hdr);
4350
0
        return NULL;
4351
0
    }
4352
4353
48.0k
    if (!(hdr->TD_keys = string_pool_create(8192))) {
4354
0
        kh_destroy(m_s2i, hdr->TD_hash);
4355
0
        cram_free_block(hdr->TD_blk);
4356
0
        free(hdr);
4357
0
        return NULL;
4358
0
    }
4359
4360
48.0k
    return hdr;
4361
48.0k
}
4362
4363
50.2k
void cram_free_compression_header(cram_block_compression_hdr *hdr) {
4364
50.2k
    int i;
4365
4366
50.2k
    if (hdr->landmark)
4367
1.99k
        free(hdr->landmark);
4368
4369
50.2k
    if (hdr->preservation_map)
4370
39.8k
        kh_destroy(map, hdr->preservation_map);
4371
4372
1.65M
    for (i = 0; i < CRAM_MAP_HASH; i++) {
4373
1.60M
        cram_map *m, *m2;
4374
1.64M
        for (m = hdr->rec_encoding_map[i]; m; m = m2) {
4375
36.8k
            m2 = m->next;
4376
36.8k
            if (m->codec)
4377
0
                m->codec->free(m->codec);
4378
36.8k
            free(m);
4379
36.8k
        }
4380
1.60M
    }
4381
4382
1.65M
    for (i = 0; i < CRAM_MAP_HASH; i++) {
4383
1.60M
        cram_map *m, *m2;
4384
1.60M
        for (m = hdr->tag_encoding_map[i]; m; m = m2) {
4385
1.21k
            m2 = m->next;
4386
1.21k
            if (m->codec)
4387
1.21k
                m->codec->free(m->codec);
4388
1.21k
            free(m);
4389
1.21k
        }
4390
1.60M
    }
4391
4392
2.41M
    for (i = 0; i < DS_END; i++) {
4393
2.36M
        if (hdr->codecs[i])
4394
708k
            hdr->codecs[i]->free(hdr->codecs[i]);
4395
2.36M
    }
4396
4397
50.2k
    if (hdr->TL)
4398
90
        free(hdr->TL);
4399
50.2k
    if (hdr->TD_blk)
4400
48.1k
        cram_free_block(hdr->TD_blk);
4401
50.2k
    if (hdr->TD_hash)
4402
48.0k
        kh_destroy(m_s2i, hdr->TD_hash);
4403
50.2k
    if (hdr->TD_keys)
4404
48.0k
        string_pool_destroy(hdr->TD_keys);
4405
4406
50.2k
    free(hdr);
4407
50.2k
}
4408
4409
4410
/* ----------------------------------------------------------------------
4411
 * Slices and slice headers
4412
 */
4413
4414
39.1k
void cram_free_slice_header(cram_block_slice_hdr *hdr) {
4415
39.1k
    if (!hdr)
4416
0
        return;
4417
4418
39.1k
    if (hdr->block_content_ids)
4419
38.6k
        free(hdr->block_content_ids);
4420
4421
39.1k
    free(hdr);
4422
4423
39.1k
    return;
4424
39.1k
}
4425
4426
39.3k
void cram_free_slice(cram_slice *s) {
4427
39.3k
    if (!s)
4428
0
        return;
4429
4430
39.3k
    if (s->hdr_block)
4431
38.3k
        cram_free_block(s->hdr_block);
4432
4433
39.3k
    if (s->block) {
4434
38.6k
        int i;
4435
4436
38.6k
        if (s->hdr) {
4437
6.45M
            for (i = 0; i < s->hdr->num_blocks; i++) {
4438
6.41M
                if (i > 0 && s->block[i] == s->block[0])
4439
6.12M
                    continue;
4440
286k
                cram_free_block(s->block[i]);
4441
286k
            }
4442
38.6k
        }
4443
38.6k
        free(s->block);
4444
38.6k
    }
4445
4446
39.3k
    {
4447
        // Normally already copied into s->block[], but potentially still
4448
        // here if we error part way through cram_encode_slice.
4449
39.3k
        int i;
4450
148k
        for (i = 0; i < s->naux_block; i++)
4451
109k
            cram_free_block(s->aux_block[i]);
4452
39.3k
    }
4453
4454
39.3k
    if (s->block_by_id)
4455
770
        free(s->block_by_id);
4456
4457
39.3k
    if (s->hdr)
4458
39.1k
        cram_free_slice_header(s->hdr);
4459
4460
39.3k
    if (s->seqs_blk)
4461
39.1k
        cram_free_block(s->seqs_blk);
4462
4463
39.3k
    if (s->qual_blk)
4464
1.29k
        cram_free_block(s->qual_blk);
4465
4466
39.3k
    if (s->name_blk)
4467
1.29k
        cram_free_block(s->name_blk);
4468
4469
39.3k
    if (s->aux_blk)
4470
39.1k
        cram_free_block(s->aux_blk);
4471
4472
39.3k
    if (s->base_blk)
4473
1.29k
        cram_free_block(s->base_blk);
4474
4475
39.3k
    if (s->soft_blk)
4476
1.29k
        cram_free_block(s->soft_blk);
4477
4478
39.3k
    if (s->cigar)
4479
39.1k
        free(s->cigar);
4480
4481
39.3k
    if (s->crecs)
4482
39.0k
        free(s->crecs);
4483
4484
39.3k
    if (s->features)
4485
17.3k
        free(s->features);
4486
4487
39.3k
    if (s->TN)
4488
0
        free(s->TN);
4489
4490
39.3k
    if (s->pair_keys)
4491
38.3k
        string_pool_destroy(s->pair_keys);
4492
4493
39.3k
    if (s->pair[0])
4494
38.3k
        kh_destroy(m_s2i, s->pair[0]);
4495
39.3k
    if (s->pair[1])
4496
38.3k
        kh_destroy(m_s2i, s->pair[1]);
4497
4498
39.3k
    if (s->aux_block)
4499
31.2k
        free(s->aux_block);
4500
4501
39.3k
    free(s);
4502
39.3k
}
4503
4504
/*
4505
 * Creates a new empty slice in memory, for subsequent writing to
4506
 * disk.
4507
 *
4508
 * Returns cram_slice ptr on success
4509
 *         NULL on failure
4510
 */
4511
38.3k
cram_slice *cram_new_slice(enum cram_content_type type, int nrecs) {
4512
38.3k
    cram_slice *s = calloc(1, sizeof(*s));
4513
38.3k
    if (!s)
4514
0
        return NULL;
4515
4516
38.3k
    if (!(s->hdr = (cram_block_slice_hdr *)calloc(1, sizeof(*s->hdr))))
4517
0
        goto err;
4518
38.3k
    s->hdr->content_type = type;
4519
4520
38.3k
    s->hdr_block = NULL;
4521
38.3k
    s->block = NULL;
4522
38.3k
    s->block_by_id = NULL;
4523
38.3k
    s->last_apos = 0;
4524
38.3k
    if (!(s->crecs = malloc(nrecs * sizeof(cram_record))))  goto err;
4525
38.3k
    s->cigar_alloc = 1024;
4526
38.3k
    if (!(s->cigar = malloc(s->cigar_alloc * sizeof(*s->cigar)))) goto err;
4527
38.3k
    s->ncigar = 0;
4528
4529
38.3k
    if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0)))       goto err;
4530
38.3k
    if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS)))   goto err;
4531
38.3k
    if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN)))   goto err;
4532
38.3k
    if (!(s->aux_blk  = cram_new_block(EXTERNAL, DS_aux)))  goto err;
4533
38.3k
    if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN)))   goto err;
4534
38.3k
    if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC)))   goto err;
4535
4536
38.3k
    s->features = NULL;
4537
38.3k
    s->nfeatures = s->afeatures = 0;
4538
4539
38.3k
#ifndef TN_external
4540
38.3k
    s->TN = NULL;
4541
38.3k
    s->nTN = s->aTN = 0;
4542
38.3k
#endif
4543
4544
    // Volatile keys as we do realloc in dstring
4545
38.3k
    if (!(s->pair_keys = string_pool_create(8192))) goto err;
4546
38.3k
    if (!(s->pair[0] = kh_init(m_s2i)))             goto err;
4547
38.3k
    if (!(s->pair[1] = kh_init(m_s2i)))             goto err;
4548
4549
#ifdef BA_external
4550
    s->BA_len = 0;
4551
#endif
4552
4553
38.3k
    return s;
4554
4555
0
 err:
4556
0
    if (s)
4557
0
        cram_free_slice(s);
4558
4559
0
    return NULL;
4560
38.3k
}
4561
4562
/*
4563
 * Loads an entire slice.
4564
 * FIXME: In 1.0 the native unit of slices within CRAM is broken
4565
 * as slices contain references to objects in other slices.
4566
 * To work around this while keeping the slice oriented outer loop
4567
 * we read all slices and stitch them together into a fake large
4568
 * slice instead.
4569
 *
4570
 * Returns cram_slice ptr on success
4571
 *         NULL on failure
4572
 */
4573
967
cram_slice *cram_read_slice(cram_fd *fd) {
4574
967
    cram_block *b = cram_read_block(fd);
4575
967
    cram_slice *s = calloc(1, sizeof(*s));
4576
967
    int i, n, max_id, min_id;
4577
4578
967
    if (!b || !s)
4579
26
        goto err;
4580
4581
941
    s->hdr_block = b;
4582
941
    switch (b->content_type) {
4583
919
    case MAPPED_SLICE:
4584
922
    case UNMAPPED_SLICE:
4585
922
        if (!(s->hdr = cram_decode_slice_header(fd, b)))
4586
82
            goto err;
4587
840
        break;
4588
4589
840
    default:
4590
19
        hts_log_error("Unexpected block of type %s",
4591
19
                      cram_content_type2str(b->content_type));
4592
19
        goto err;
4593
941
    }
4594
4595
840
    if (s->hdr->num_blocks < 1) {
4596
6
        hts_log_error("Slice does not include any data blocks");
4597
6
        goto err;
4598
6
    }
4599
4600
834
    s->block = calloc(n = s->hdr->num_blocks, sizeof(*s->block));
4601
834
    if (!s->block)
4602
0
        goto err;
4603
4604
3.91k
    for (max_id = i = 0, min_id = INT_MAX; i < n; i++) {
4605
3.14k
        if (!(s->block[i] = cram_read_block(fd)))
4606
64
            goto err;
4607
4608
3.07k
        if (s->block[i]->content_type == EXTERNAL) {
4609
720
            if (max_id < s->block[i]->content_id)
4610
12
                max_id = s->block[i]->content_id;
4611
720
            if (min_id > s->block[i]->content_id)
4612
690
                min_id = s->block[i]->content_id;
4613
720
        }
4614
3.07k
    }
4615
4616
770
    if (!(s->block_by_id = calloc(512, sizeof(s->block[0]))))
4617
0
        goto err;
4618
4619
3.77k
    for (i = 0; i < n; i++) {
4620
3.00k
        if (s->block[i]->content_type != EXTERNAL)
4621
2.28k
            continue;
4622
720
        uint32_t v = s->block[i]->content_id;
4623
720
        if (v >= 256)
4624
15
            v = 256 + v % 251;
4625
720
        s->block_by_id[v] = s->block[i];
4626
720
    }
4627
4628
    /* Initialise encoding/decoding tables */
4629
770
    s->cigar_alloc = 1024;
4630
770
    if (!(s->cigar = malloc(s->cigar_alloc * sizeof(*s->cigar)))) goto err;
4631
770
    s->ncigar = 0;
4632
4633
770
    if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0)))      goto err;
4634
770
    if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS)))  goto err;
4635
770
    if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN)))  goto err;
4636
770
    if (!(s->aux_blk  = cram_new_block(EXTERNAL, DS_aux))) goto err;
4637
770
    if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN)))  goto err;
4638
770
    if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC)))  goto err;
4639
4640
770
    s->crecs = NULL;
4641
4642
770
    s->last_apos = s->hdr->ref_seq_start;
4643
770
    s->decode_md = fd->decode_md;
4644
4645
770
    return s;
4646
4647
197
 err:
4648
197
    if (b)
4649
171
        cram_free_block(b);
4650
197
    if (s) {
4651
197
        s->hdr_block = NULL;
4652
197
        cram_free_slice(s);
4653
197
    }
4654
197
    return NULL;
4655
770
}
4656
4657
4658
/* ----------------------------------------------------------------------
4659
 * CRAM file definition (header)
4660
 */
4661
4662
/*
4663
 * Reads a CRAM file definition structure.
4664
 * Returns file_def ptr on success
4665
 *         NULL on failure
4666
 */
4667
11.7k
cram_file_def *cram_read_file_def(cram_fd *fd) {
4668
11.7k
    cram_file_def *def = malloc(sizeof(*def));
4669
11.7k
    if (!def)
4670
0
        return NULL;
4671
4672
11.7k
    if (26 != hread(fd->fp, &def->magic[0], 26)) {
4673
6
        free(def);
4674
6
        return NULL;
4675
6
    }
4676
4677
11.7k
    if (memcmp(def->magic, "CRAM", 4) != 0) {
4678
0
        free(def);
4679
0
        return NULL;
4680
0
    }
4681
4682
11.7k
    if (def->major_version > 4) {
4683
1
        hts_log_error("CRAM version number mismatch. Expected 1.x, 2.x, 3.x or 4.x, got %d.%d",
4684
1
                      def->major_version, def->minor_version);
4685
1
        free(def);
4686
1
        return NULL;
4687
1
    }
4688
4689
11.7k
    fd->first_container += 26;
4690
11.7k
    fd->curr_position = fd->first_container;
4691
11.7k
    fd->last_slice = 0;
4692
4693
11.7k
    return def;
4694
11.7k
}
4695
4696
/*
4697
 * Writes a cram_file_def structure to cram_fd.
4698
 * Returns 0 on success
4699
 *        -1 on failure
4700
 */
4701
9.67k
int cram_write_file_def(cram_fd *fd, cram_file_def *def) {
4702
9.67k
    return (hwrite(fd->fp, &def->magic[0], 26) == 26) ? 0 : -1;
4703
9.67k
}
4704
4705
22.1k
void cram_free_file_def(cram_file_def *def) {
4706
22.1k
    if (def) free(def);
4707
22.1k
}
4708
4709
/* ----------------------------------------------------------------------
4710
 * SAM header I/O
4711
 */
4712
4713
4714
/*
4715
 * Reads the SAM header from the first CRAM data block.
4716
 * Also performs minimal parsing to extract read-group
4717
 * and sample information.
4718
4719
 * Returns SAM hdr ptr on success
4720
 *         NULL on failure
4721
 */
4722
11.7k
sam_hdr_t *cram_read_SAM_hdr(cram_fd *fd) {
4723
11.7k
    int32_t header_len;
4724
11.7k
    char *header;
4725
11.7k
    sam_hdr_t *hdr;
4726
4727
    /* 1.1 onwards stores the header in the first block of a container */
4728
11.7k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
4729
        /* Length */
4730
9.26k
        if (-1 == int32_decode(fd, &header_len))
4731
2
            return NULL;
4732
4733
9.26k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
4734
9.26k
        if (header_len > FUZZ_ALLOC_LIMIT)
4735
6
            return NULL;
4736
9.26k
#endif
4737
4738
        /* Alloc and read */
4739
9.26k
        if (header_len < 0 || NULL == (header = malloc((size_t) header_len+1)))
4740
0
            return NULL;
4741
4742
9.26k
        if (header_len != hread(fd->fp, header, header_len)) {
4743
32
            free(header);
4744
32
            return NULL;
4745
32
        }
4746
9.22k
        header[header_len] = '\0';
4747
4748
9.22k
        fd->first_container += 4 + header_len;
4749
9.22k
    } else {
4750
2.44k
        cram_container *c = cram_read_container(fd);
4751
2.44k
        cram_block *b;
4752
2.44k
        int i;
4753
2.44k
        int64_t len;
4754
4755
2.44k
        if (!c)
4756
147
            return NULL;
4757
4758
2.30k
        fd->first_container += c->length + c->offset;
4759
2.30k
        fd->curr_position = fd->first_container;
4760
4761
2.30k
        if (c->num_blocks < 1) {
4762
53
            cram_free_container(c);
4763
53
            return NULL;
4764
53
        }
4765
4766
2.24k
        if (!(b = cram_read_block(fd))) {
4767
124
            cram_free_container(c);
4768
124
            return NULL;
4769
124
        }
4770
2.12k
        if (cram_uncompress_block(b) != 0) {
4771
1.33k
            cram_free_container(c);
4772
1.33k
            cram_free_block(b);
4773
1.33k
            return NULL;
4774
1.33k
        }
4775
4776
788
        len = b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
4777
788
            fd->vv.varint_size(b->content_id) +
4778
788
            fd->vv.varint_size(b->uncomp_size) +
4779
788
            fd->vv.varint_size(b->comp_size);
4780
4781
        /* Extract header from 1st block */
4782
788
        if (-1 == int32_get_blk(b, &header_len) ||
4783
708
            header_len < 0 || /* Spec. says signed...  why? */
4784
694
            b->uncomp_size - 4 < header_len) {
4785
119
            cram_free_container(c);
4786
119
            cram_free_block(b);
4787
119
            return NULL;
4788
119
        }
4789
669
        if (NULL == (header = malloc((size_t) header_len+1))) {
4790
0
            cram_free_container(c);
4791
0
            cram_free_block(b);
4792
0
            return NULL;
4793
0
        }
4794
669
        memcpy(header, BLOCK_END(b), header_len);
4795
669
        header[header_len] = '\0';
4796
669
        cram_free_block(b);
4797
4798
        /* Consume any remaining blocks */
4799
2.22k
        for (i = 1; i < c->num_blocks; i++) {
4800
1.57k
            if (!(b = cram_read_block(fd))) {
4801
16
                cram_free_container(c);
4802
16
                free(header);
4803
16
                return NULL;
4804
16
            }
4805
1.56k
            len += b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
4806
1.56k
                fd->vv.varint_size(b->content_id) +
4807
1.56k
                fd->vv.varint_size(b->uncomp_size) +
4808
1.56k
                fd->vv.varint_size(b->comp_size);
4809
1.56k
            cram_free_block(b);
4810
1.56k
        }
4811
4812
653
        if (c->length > 0 && len > 0 && c->length > len) {
4813
            // Consume padding
4814
27
            char *pads = malloc(c->length - len);
4815
27
            if (!pads) {
4816
0
                cram_free_container(c);
4817
0
                free(header);
4818
0
                return NULL;
4819
0
            }
4820
4821
27
            if (c->length - len != hread(fd->fp, pads, c->length - len)) {
4822
15
                cram_free_container(c);
4823
15
                free(header);
4824
15
                free(pads);
4825
15
                return NULL;
4826
15
            }
4827
12
            free(pads);
4828
12
        }
4829
4830
638
        cram_free_container(c);
4831
638
    }
4832
4833
    /* Parse */
4834
9.86k
    hdr = sam_hdr_init();
4835
9.86k
    if (!hdr) {
4836
0
        free(header);
4837
0
        return NULL;
4838
0
    }
4839
4840
9.86k
    if (-1 == sam_hdr_add_lines(hdr, header, header_len)) {
4841
111
        free(header);
4842
111
        sam_hdr_destroy(hdr);
4843
111
        return NULL;
4844
111
    }
4845
4846
9.75k
    hdr->l_text = header_len;
4847
9.75k
    hdr->text = header;
4848
4849
9.75k
    return hdr;
4850
4851
9.86k
}
4852
4853
/*
4854
 * Converts 'in' to a full pathname to store in out.
4855
 * Out must be at least PATH_MAX bytes long.
4856
 */
4857
0
static void full_path(char *out, char *in) {
4858
0
    size_t in_l = strlen(in);
4859
0
    if (hisremote(in)) {
4860
0
        if (in_l > PATH_MAX) {
4861
0
            hts_log_error("Reference path is longer than %d", PATH_MAX);
4862
0
            return;
4863
0
        }
4864
0
        strncpy(out, in, PATH_MAX-1);
4865
0
        out[PATH_MAX-1] = 0;
4866
0
        return;
4867
0
    }
4868
0
    if (*in == '/' ||
4869
        // Windows paths
4870
0
        (in_l > 3 && toupper_c(*in) >= 'A'  && toupper_c(*in) <= 'Z' &&
4871
0
         in[1] == ':' && (in[2] == '/' || in[2] == '\\'))) {
4872
0
        strncpy(out, in, PATH_MAX-1);
4873
0
        out[PATH_MAX-1] = 0;
4874
0
    } else {
4875
0
        size_t len;
4876
4877
        // unable to get dir or out+in is too long
4878
0
        if (!getcwd(out, PATH_MAX) ||
4879
0
            (len = strlen(out))+1+strlen(in) >= PATH_MAX) {
4880
0
            strncpy(out, in, PATH_MAX-1);
4881
0
            out[PATH_MAX-1] = 0;
4882
0
            return;
4883
0
        }
4884
4885
0
        snprintf(out+len, PATH_MAX - len, "/%s", in);
4886
4887
        // FIXME: cope with `pwd`/../../../foo.fa ?
4888
0
    }
4889
0
}
4890
4891
/*
4892
 * Writes a CRAM SAM header.
4893
 * Returns 0 on success
4894
 *        -1 on failure
4895
 */
4896
9.67k
int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) {
4897
9.67k
    size_t header_len;
4898
9.67k
    int blank_block = (CRAM_MAJOR_VERS(fd->version) >= 3);
4899
4900
    /* Write CRAM MAGIC if not yet written. */
4901
9.67k
    if (fd->file_def->major_version == 0) {
4902
9.67k
        fd->file_def->major_version = CRAM_MAJOR_VERS(fd->version);
4903
9.67k
        fd->file_def->minor_version = CRAM_MINOR_VERS(fd->version);
4904
9.67k
        if (0 != cram_write_file_def(fd, fd->file_def))
4905
0
            return -1;
4906
9.67k
    }
4907
4908
    /* 1.0 requires an UNKNOWN read-group */
4909
9.67k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
4910
0
        if (!sam_hrecs_find_rg(hdr->hrecs, "UNKNOWN"))
4911
0
            if (sam_hdr_add_line(hdr, "RG",
4912
0
                            "ID", "UNKNOWN", "SM", "UNKNOWN", NULL))
4913
0
                return -1;
4914
0
    }
4915
4916
9.67k
    if (-1 == refs_from_header(fd))
4917
0
        return -1;
4918
9.67k
    if (-1 == refs2id(fd->refs, fd->header))
4919
0
        return -1;
4920
4921
    /* Fix M5 strings */
4922
9.67k
    if (fd->refs && !fd->no_ref && fd->embed_ref <= 1) {
4923
9.67k
        int i;
4924
9.72k
        for (i = 0; i < hdr->hrecs->nref; i++) {
4925
3.80k
            sam_hrec_type_t *ty;
4926
3.80k
            char *ref;
4927
4928
3.80k
            if (!(ty = sam_hrecs_find_type_id(hdr->hrecs, "SQ", "SN", hdr->hrecs->ref[i].name)))
4929
0
                return -1;
4930
4931
3.80k
            if (!sam_hrecs_find_key(ty, "M5", NULL)) {
4932
3.76k
                char unsigned buf[16];
4933
3.76k
                char buf2[33];
4934
3.76k
                hts_pos_t rlen;
4935
3.76k
                hts_md5_context *md5;
4936
4937
3.76k
                if (!fd->refs ||
4938
3.76k
                    !fd->refs->ref_id ||
4939
3.76k
                    !fd->refs->ref_id[i]) {
4940
0
                    return -1;
4941
0
                }
4942
3.76k
                rlen = fd->refs->ref_id[i]->length;
4943
3.76k
                ref = cram_get_ref(fd, i, 1, rlen);
4944
3.76k
                if (NULL == ref) {
4945
3.76k
                    if (fd->embed_ref == -1) {
4946
                        // auto embed-ref
4947
3.76k
                        hts_log_warning("No M5 tags present and could not "
4948
3.76k
                                        "find reference");
4949
3.76k
                        hts_log_warning("Enabling embed_ref=2 option");
4950
3.76k
                        hts_log_warning("NOTE: the CRAM file will be bigger "
4951
3.76k
                                        "than using an external reference");
4952
3.76k
                        pthread_mutex_lock(&fd->ref_lock);
4953
                        // Best guess.  It may be unmapped data with broken
4954
                        // headers, in which case this will get ignored.
4955
3.76k
                        fd->embed_ref = 2;
4956
3.76k
                        pthread_mutex_unlock(&fd->ref_lock);
4957
3.76k
                        break;
4958
3.76k
                    }
4959
0
                    return -1;
4960
3.76k
                }
4961
0
                rlen = fd->refs->ref_id[i]->length; /* In case it just loaded */
4962
0
                if (!(md5 = hts_md5_init()))
4963
0
                    return -1;
4964
0
                if (HTS_POS_MAX <= ULONG_MAX) {
4965
                    // Platforms with 64-bit unsigned long update in one go
4966
0
                    hts_md5_update(md5, ref, rlen);
4967
0
                } else {
4968
                    // Those with 32-bit ulong (Windows) may have to loop
4969
                    // over epic references
4970
0
                    hts_pos_t pos = 0;
4971
0
                    while (rlen - pos > ULONG_MAX) {
4972
0
                        hts_md5_update(md5, ref + pos, ULONG_MAX);
4973
0
                        pos += ULONG_MAX;
4974
0
                    }
4975
0
                    hts_md5_update(md5, ref + pos, (unsigned long)(rlen - pos));
4976
0
                }
4977
0
                hts_md5_final(buf, md5);
4978
0
                hts_md5_destroy(md5);
4979
0
                cram_ref_decr(fd->refs, i);
4980
4981
0
                hts_md5_hex(buf2, buf);
4982
0
                fd->refs->ref_id[i]->validated_md5 = 1;
4983
0
                if (sam_hdr_update_line(hdr, "SQ", "SN", hdr->hrecs->ref[i].name, "M5", buf2, NULL))
4984
0
                    return -1;
4985
0
            }
4986
4987
42
            if (fd->ref_fn) {
4988
0
                char ref_fn[PATH_MAX];
4989
0
                full_path(ref_fn, fd->ref_fn);
4990
0
                if (sam_hdr_update_line(hdr, "SQ", "SN", hdr->hrecs->ref[i].name, "UR", ref_fn, NULL))
4991
0
                    return -1;
4992
0
            }
4993
42
        }
4994
9.67k
    }
4995
4996
    /* Length */
4997
9.67k
    header_len = sam_hdr_length(hdr);
4998
9.67k
    if (header_len > INT32_MAX) {
4999
0
        hts_log_error("Header is too long for CRAM format");
5000
0
        return -1;
5001
0
    }
5002
9.67k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
5003
0
        if (-1 == int32_encode(fd, header_len))
5004
0
            return -1;
5005
5006
        /* Text data */
5007
0
        if (header_len != hwrite(fd->fp, sam_hdr_str(hdr), header_len))
5008
0
            return -1;
5009
9.67k
    } else {
5010
        /* Create block(s) inside a container */
5011
9.67k
        cram_block *b = cram_new_block(FILE_HEADER, 0);
5012
9.67k
        cram_container *c = cram_new_container(0, 0);
5013
9.67k
        int padded_length;
5014
9.67k
        char *pads;
5015
9.67k
        int is_cram_3 = (CRAM_MAJOR_VERS(fd->version) >= 3);
5016
5017
9.67k
        if (!b || !c) {
5018
0
            if (b) cram_free_block(b);
5019
0
            if (c) cram_free_container(c);
5020
0
            return -1;
5021
0
        }
5022
5023
9.67k
        if (int32_put_blk(b, header_len) < 0)
5024
0
            return -1;
5025
9.67k
        if (header_len)
5026
5.14k
            BLOCK_APPEND(b, sam_hdr_str(hdr), header_len);
5027
9.67k
        BLOCK_UPLEN(b);
5028
5029
        // Compress header block if V3.0 and above
5030
9.67k
        if (CRAM_MAJOR_VERS(fd->version) >= 3)
5031
9.67k
            if (cram_compress_block(fd, b, NULL, -1, -1) < 0)
5032
0
                return -1;
5033
5034
9.67k
        if (blank_block) {
5035
9.67k
            c->length = b->comp_size + 2 + 4*is_cram_3 +
5036
9.67k
                fd->vv.varint_size(b->content_id)   +
5037
9.67k
                fd->vv.varint_size(b->uncomp_size)  +
5038
9.67k
                fd->vv.varint_size(b->comp_size);
5039
5040
9.67k
            c->num_blocks = 2;
5041
9.67k
            c->num_landmarks = 2;
5042
9.67k
            if (!(c->landmark = malloc(2*sizeof(*c->landmark)))) {
5043
0
                cram_free_block(b);
5044
0
                cram_free_container(c);
5045
0
                return -1;
5046
0
            }
5047
9.67k
            c->landmark[0] = 0;
5048
9.67k
            c->landmark[1] = c->length;
5049
5050
            // Plus extra storage for uncompressed secondary blank block
5051
9.67k
            padded_length = MIN(c->length*.5, 10000);
5052
9.67k
            c->length += padded_length + 2 + 4*is_cram_3 +
5053
9.67k
                fd->vv.varint_size(b->content_id) +
5054
9.67k
                fd->vv.varint_size(padded_length)*2;
5055
9.67k
        } else {
5056
            // Pad the block instead.
5057
0
            c->num_blocks = 1;
5058
0
            c->num_landmarks = 1;
5059
0
            if (!(c->landmark = malloc(sizeof(*c->landmark))))
5060
0
                return -1;
5061
0
            c->landmark[0] = 0;
5062
5063
0
            padded_length = MAX(c->length*1.5, 10000) - c->length;
5064
5065
0
            c->length = b->comp_size + padded_length +
5066
0
                2 + 4*is_cram_3 +
5067
0
                fd->vv.varint_size(b->content_id)   +
5068
0
                fd->vv.varint_size(b->uncomp_size)  +
5069
0
                fd->vv.varint_size(b->comp_size);
5070
5071
0
            if (NULL == (pads = calloc(1, padded_length))) {
5072
0
                cram_free_block(b);
5073
0
                cram_free_container(c);
5074
0
                return -1;
5075
0
            }
5076
0
            BLOCK_APPEND(b, pads, padded_length);
5077
0
            BLOCK_UPLEN(b);
5078
0
            free(pads);
5079
0
        }
5080
5081
9.67k
        if (-1 == cram_write_container(fd, c)) {
5082
0
            cram_free_block(b);
5083
0
            cram_free_container(c);
5084
0
            return -1;
5085
0
        }
5086
5087
9.67k
        if (-1 == cram_write_block(fd, b)) {
5088
0
            cram_free_block(b);
5089
0
            cram_free_container(c);
5090
0
            return -1;
5091
0
        }
5092
5093
9.67k
        if (blank_block) {
5094
9.67k
            BLOCK_RESIZE(b, padded_length);
5095
9.67k
            memset(BLOCK_DATA(b), 0, padded_length);
5096
9.67k
            BLOCK_SIZE(b) = padded_length;
5097
9.67k
            BLOCK_UPLEN(b);
5098
9.67k
            b->method = RAW;
5099
9.67k
            if (-1 == cram_write_block(fd, b)) {
5100
0
                cram_free_block(b);
5101
0
                cram_free_container(c);
5102
0
                return -1;
5103
0
            }
5104
9.67k
        }
5105
5106
9.67k
        cram_free_block(b);
5107
9.67k
        cram_free_container(c);
5108
9.67k
    }
5109
5110
9.67k
    if (0 != hflush(fd->fp))
5111
0
        return -1;
5112
5113
9.67k
    RP("=== Finishing saving header ===\n");
5114
5115
9.67k
    return 0;
5116
5117
0
 block_err:
5118
0
    return -1;
5119
9.67k
}
5120
5121
/* ----------------------------------------------------------------------
5122
 * The top-level cram opening, closing and option handling
5123
 */
5124
5125
/*
5126
 * Sets CRAM variable sized integer decode function tables.
5127
 * CRAM 1, 2, and 3.x all used ITF8 for uint32 and UTF8 for uint64.
5128
 * CRAM 4.x uses the same encoding mechanism for 32-bit and 64-bit
5129
 * (or anything inbetween), but also now supports signed values.
5130
 *
5131
 * Version is the CRAM major version number.
5132
 * vv is the vector table (probably &cram_fd->vv)
5133
 */
5134
22.1k
static void cram_init_varint(varint_vec *vv, int version) {
5135
22.1k
    if (version >= 4) {
5136
731
        vv->varint_get32 = uint7_get_32; // FIXME: varint.h API should be size agnostic
5137
731
        vv->varint_get32s = sint7_get_32;
5138
731
        vv->varint_get64 = uint7_get_64;
5139
731
        vv->varint_get64s = sint7_get_64;
5140
731
        vv->varint_put32 = uint7_put_32;
5141
731
        vv->varint_put32s = sint7_put_32;
5142
731
        vv->varint_put64 = uint7_put_64;
5143
731
        vv->varint_put64s = sint7_put_64;
5144
731
        vv->varint_put32_blk = uint7_put_blk_32;
5145
731
        vv->varint_put32s_blk = sint7_put_blk_32;
5146
731
        vv->varint_put64_blk = uint7_put_blk_64;
5147
731
        vv->varint_put64s_blk = sint7_put_blk_64;
5148
731
        vv->varint_size = uint7_size;
5149
731
        vv->varint_decode32_crc = uint7_decode_crc32;
5150
731
        vv->varint_decode32s_crc = sint7_decode_crc32;
5151
731
        vv->varint_decode64_crc = uint7_decode_crc64;
5152
21.4k
    } else {
5153
21.4k
        vv->varint_get32 = safe_itf8_get;
5154
21.4k
        vv->varint_get32s = safe_itf8_get;
5155
21.4k
        vv->varint_get64 = safe_ltf8_get;
5156
21.4k
        vv->varint_get64s = safe_ltf8_get;
5157
21.4k
        vv->varint_put32 = safe_itf8_put;
5158
21.4k
        vv->varint_put32s = safe_itf8_put;
5159
21.4k
        vv->varint_put64 = safe_ltf8_put;
5160
21.4k
        vv->varint_put64s = safe_ltf8_put;
5161
21.4k
        vv->varint_put32_blk = itf8_put_blk;
5162
21.4k
        vv->varint_put32s_blk = itf8_put_blk;
5163
21.4k
        vv->varint_put64_blk = ltf8_put_blk;
5164
21.4k
        vv->varint_put64s_blk = ltf8_put_blk;
5165
21.4k
        vv->varint_size = itf8_size;
5166
21.4k
        vv->varint_decode32_crc = itf8_decode_crc;
5167
21.4k
        vv->varint_decode32s_crc = itf8_decode_crc;
5168
21.4k
        vv->varint_decode64_crc = ltf8_decode_crc;
5169
21.4k
    }
5170
22.1k
}
5171
5172
/*
5173
 * Initialises the lookup tables. These could be global statics, but they're
5174
 * clumsy to setup in a multi-threaded environment unless we generate
5175
 * verbatim code and include that.
5176
 */
5177
22.1k
static void cram_init_tables(cram_fd *fd) {
5178
22.1k
    int i;
5179
5180
22.1k
    memset(fd->L1, 4, 256);
5181
22.1k
    fd->L1['A'] = 0; fd->L1['a'] = 0;
5182
22.1k
    fd->L1['C'] = 1; fd->L1['c'] = 1;
5183
22.1k
    fd->L1['G'] = 2; fd->L1['g'] = 2;
5184
22.1k
    fd->L1['T'] = 3; fd->L1['t'] = 3;
5185
5186
22.1k
    memset(fd->L2, 5, 256);
5187
22.1k
    fd->L2['A'] = 0; fd->L2['a'] = 0;
5188
22.1k
    fd->L2['C'] = 1; fd->L2['c'] = 1;
5189
22.1k
    fd->L2['G'] = 2; fd->L2['g'] = 2;
5190
22.1k
    fd->L2['T'] = 3; fd->L2['t'] = 3;
5191
22.1k
    fd->L2['N'] = 4; fd->L2['n'] = 4;
5192
5193
22.1k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
5194
4.75M
        for (i = 0; i < 0x200; i++) {
5195
4.74M
            int f = 0;
5196
5197
4.74M
            if (i & CRAM_FPAIRED)      f |= BAM_FPAIRED;
5198
4.74M
            if (i & CRAM_FPROPER_PAIR) f |= BAM_FPROPER_PAIR;
5199
4.74M
            if (i & CRAM_FUNMAP)       f |= BAM_FUNMAP;
5200
4.74M
            if (i & CRAM_FREVERSE)     f |= BAM_FREVERSE;
5201
4.74M
            if (i & CRAM_FREAD1)       f |= BAM_FREAD1;
5202
4.74M
            if (i & CRAM_FREAD2)       f |= BAM_FREAD2;
5203
4.74M
            if (i & CRAM_FSECONDARY)   f |= BAM_FSECONDARY;
5204
4.74M
            if (i & CRAM_FQCFAIL)      f |= BAM_FQCFAIL;
5205
4.74M
            if (i & CRAM_FDUP)         f |= BAM_FDUP;
5206
5207
4.74M
            fd->bam_flag_swap[i]  = f;
5208
4.74M
        }
5209
5210
37.9M
        for (i = 0; i < 0x1000; i++) {
5211
37.9M
            int g = 0;
5212
5213
37.9M
            if (i & BAM_FPAIRED)           g |= CRAM_FPAIRED;
5214
37.9M
            if (i & BAM_FPROPER_PAIR)  g |= CRAM_FPROPER_PAIR;
5215
37.9M
            if (i & BAM_FUNMAP)        g |= CRAM_FUNMAP;
5216
37.9M
            if (i & BAM_FREVERSE)      g |= CRAM_FREVERSE;
5217
37.9M
            if (i & BAM_FREAD1)        g |= CRAM_FREAD1;
5218
37.9M
            if (i & BAM_FREAD2)        g |= CRAM_FREAD2;
5219
37.9M
            if (i & BAM_FSECONDARY)    g |= CRAM_FSECONDARY;
5220
37.9M
            if (i & BAM_FQCFAIL)       g |= CRAM_FQCFAIL;
5221
37.9M
            if (i & BAM_FDUP)          g |= CRAM_FDUP;
5222
5223
37.9M
            fd->cram_flag_swap[i] = g;
5224
37.9M
        }
5225
12.9k
    } else {
5226
        /* NOP */
5227
52.9M
        for (i = 0; i < 0x1000; i++)
5228
52.9M
            fd->bam_flag_swap[i] = i;
5229
52.9M
        for (i = 0; i < 0x1000; i++)
5230
52.9M
            fd->cram_flag_swap[i] = i;
5231
12.9k
    }
5232
5233
22.1k
    memset(fd->cram_sub_matrix, 4, 32*32);
5234
732k
    for (i = 0; i < 32; i++) {
5235
710k
        fd->cram_sub_matrix[i]['A'&0x1f]=0;
5236
710k
        fd->cram_sub_matrix[i]['C'&0x1f]=1;
5237
710k
        fd->cram_sub_matrix[i]['G'&0x1f]=2;
5238
710k
        fd->cram_sub_matrix[i]['T'&0x1f]=3;
5239
710k
        fd->cram_sub_matrix[i]['N'&0x1f]=4;
5240
710k
    }
5241
133k
    for (i = 0; i < 20; i+=4) {
5242
110k
        int j;
5243
2.33M
        for (j = 0; j < 20; j++) {
5244
2.21M
            fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
5245
2.21M
            fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
5246
2.21M
            fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
5247
2.21M
            fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
5248
2.21M
        }
5249
110k
        fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+0]&0x1f]=0;
5250
110k
        fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+1]&0x1f]=1;
5251
110k
        fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+2]&0x1f]=2;
5252
110k
        fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+3]&0x1f]=3;
5253
110k
    }
5254
5255
22.1k
    cram_init_varint(&fd->vv, CRAM_MAJOR_VERS(fd->version));
5256
22.1k
}
5257
5258
// Default version numbers for CRAM
5259
static int major_version = 3;
5260
static int minor_version = 1;
5261
5262
/*
5263
 * Opens a CRAM file for read (mode "rb") or write ("wb").
5264
 * The filename may be "-" to indicate stdin or stdout.
5265
 *
5266
 * Returns file handle on success
5267
 *         NULL on failure.
5268
 */
5269
0
cram_fd *cram_open(const char *filename, const char *mode) {
5270
0
    hFILE *fp;
5271
0
    cram_fd *fd;
5272
0
    char fmode[3]= { mode[0], '\0', '\0' };
5273
5274
0
    if (strlen(mode) > 1 && (mode[1] == 'b' || mode[1] == 'c')) {
5275
0
        fmode[1] = 'b';
5276
0
    }
5277
5278
0
    fp = hopen(filename, fmode);
5279
0
    if (!fp)
5280
0
        return NULL;
5281
5282
0
    fd = cram_dopen(fp, filename, mode);
5283
0
    if (!fd)
5284
0
        hclose_abruptly(fp);
5285
5286
0
    return fd;
5287
0
}
5288
5289
/* Opens an existing stream for reading or writing.
5290
 *
5291
 * Returns file handle on success;
5292
 *         NULL on failure.
5293
 */
5294
22.2k
cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) {
5295
22.2k
    int i;
5296
22.2k
    char *cp;
5297
22.2k
    cram_fd *fd = calloc(1, sizeof(*fd));
5298
22.2k
    if (!fd)
5299
0
        return NULL;
5300
5301
22.2k
    fd->level = CRAM_DEFAULT_LEVEL;
5302
66.6k
    for (i = 0; mode[i]; i++) {
5303
44.4k
        if (mode[i] >= '0' && mode[i] <= '9') {
5304
0
            fd->level = mode[i] - '0';
5305
0
            break;
5306
0
        }
5307
44.4k
    }
5308
5309
22.2k
    fd->fp = fp;
5310
22.2k
    fd->mode = *mode;
5311
22.2k
    fd->first_container = 0;
5312
22.2k
    fd->curr_position = 0;
5313
5314
22.2k
    if (fd->mode == 'r') {
5315
        /* Reader */
5316
5317
11.7k
        if (!(fd->file_def = cram_read_file_def(fd)))
5318
7
            goto err;
5319
5320
11.7k
        fd->version = fd->file_def->major_version * 256 +
5321
11.7k
            fd->file_def->minor_version;
5322
5323
11.7k
        cram_init_tables(fd);
5324
5325
11.7k
        if (!(fd->header = cram_read_SAM_hdr(fd))) {
5326
1.96k
            cram_free_file_def(fd->file_def);
5327
1.96k
            goto err;
5328
1.96k
        }
5329
5330
11.7k
    } else {
5331
        /* Writer */
5332
10.4k
        cram_file_def *def = calloc(1, sizeof(*def));
5333
10.4k
        if (!def)
5334
0
            return NULL;
5335
5336
10.4k
        fd->file_def = def;
5337
5338
10.4k
        def->magic[0] = 'C';
5339
10.4k
        def->magic[1] = 'R';
5340
10.4k
        def->magic[2] = 'A';
5341
10.4k
        def->magic[3] = 'M';
5342
10.4k
        def->major_version = 0; // Indicator to write file def later.
5343
10.4k
        def->minor_version = 0;
5344
10.4k
        memset(def->file_id, 0, 20);
5345
10.4k
        strncpy(def->file_id, filename, 20);
5346
5347
10.4k
        fd->version = major_version * 256 + minor_version;
5348
10.4k
        cram_init_tables(fd);
5349
5350
        /* SAM header written later along with this file_def */
5351
10.4k
    }
5352
5353
20.2k
    fd->prefix = strdup((cp = strrchr(filename, '/')) ? cp+1 : filename);
5354
20.2k
    if (!fd->prefix)
5355
0
        goto err;
5356
20.2k
    fd->first_base = fd->last_base = -1;
5357
20.2k
    fd->record_counter = 0;
5358
5359
20.2k
    fd->ctr = NULL;
5360
20.2k
    fd->ctr_mt = NULL;
5361
20.2k
    fd->refs  = refs_create();
5362
20.2k
    if (!fd->refs)
5363
0
        goto err;
5364
20.2k
    fd->ref_id = -2;
5365
20.2k
    fd->ref = NULL;
5366
5367
20.2k
    fd->decode_md = 0;
5368
20.2k
    fd->seqs_per_slice = SEQS_PER_SLICE;
5369
20.2k
    fd->bases_per_slice = BASES_PER_SLICE;
5370
20.2k
    fd->slices_per_container = SLICE_PER_CNT;
5371
20.2k
    fd->embed_ref = -1; // automatic selection
5372
20.2k
    fd->no_ref = 0;
5373
20.2k
    fd->no_ref_counter = 0;
5374
20.2k
    fd->ap_delta = 0;
5375
20.2k
    fd->ignore_md5 = 0;
5376
20.2k
    fd->lossy_read_names = 0;
5377
20.2k
    fd->use_bz2 = 0;
5378
20.2k
    fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3);
5379
20.2k
    fd->use_tok = (CRAM_MAJOR_VERS(fd->version) >= 3) && (CRAM_MINOR_VERS(fd->version) >= 1);
5380
20.2k
    fd->use_lzma = 0;
5381
20.2k
    fd->multi_seq = -1;
5382
20.2k
    fd->multi_seq_user = -1;
5383
20.2k
    fd->unsorted   = 0;
5384
20.2k
    fd->shared_ref = 0;
5385
20.2k
    fd->store_md = 0;
5386
20.2k
    fd->store_nm = 0;
5387
20.2k
    fd->last_RI_count = 0;
5388
5389
20.2k
    fd->index       = NULL;
5390
20.2k
    fd->own_pool    = 0;
5391
20.2k
    fd->pool        = NULL;
5392
20.2k
    fd->rqueue      = NULL;
5393
20.2k
    fd->job_pending = NULL;
5394
20.2k
    fd->ooc         = 0;
5395
20.2k
    fd->required_fields = INT_MAX;
5396
5397
20.2k
    pthread_mutex_init(&fd->metrics_lock, NULL);
5398
20.2k
    pthread_mutex_init(&fd->ref_lock, NULL);
5399
20.2k
    pthread_mutex_init(&fd->range_lock, NULL);
5400
20.2k
    pthread_mutex_init(&fd->bam_list_lock, NULL);
5401
5402
971k
    for (i = 0; i < DS_END; i++) {
5403
951k
        fd->m[i] = cram_new_metrics();
5404
951k
        if (!fd->m[i])
5405
0
            goto err;
5406
951k
    }
5407
5408
20.2k
    if (!(fd->tags_used = kh_init(m_metrics)))
5409
0
        goto err;
5410
5411
20.2k
    fd->range.refid = -2; // no ref.
5412
20.2k
    fd->eof = 1;          // See samtools issue #150
5413
20.2k
    fd->ref_fn = NULL;
5414
5415
20.2k
    fd->bl = NULL;
5416
5417
    /* Initialise dummy refs from the @SQ headers */
5418
20.2k
    if (-1 == refs_from_header(fd))
5419
0
        goto err;
5420
5421
20.2k
    return fd;
5422
5423
1.96k
 err:
5424
1.96k
    if (fd)
5425
1.96k
        free(fd);
5426
5427
1.96k
    return NULL;
5428
20.2k
}
5429
5430
/*
5431
 * Seek within a CRAM file.
5432
 *
5433
 * Returns 0 on success
5434
 *        -1 on failure
5435
 */
5436
0
int cram_seek(cram_fd *fd, off_t offset, int whence) {
5437
0
    fd->ooc = 0;
5438
5439
0
    cram_drain_rqueue(fd);
5440
5441
0
    return hseek(fd->fp, offset, whence) >= 0 ? 0 : -1;
5442
0
}
5443
5444
/*
5445
 * Flushes a CRAM file.
5446
 * Useful for when writing to stdout without wishing to close the stream.
5447
 *
5448
 * Returns 0 on success
5449
 *        -1 on failure
5450
 */
5451
0
int cram_flush(cram_fd *fd) {
5452
0
    if (!fd)
5453
0
        return -1;
5454
5455
0
    int ret = 0;
5456
5457
0
    if (fd->mode == 'w' && fd->ctr) {
5458
0
        if(fd->ctr->slice)
5459
0
            cram_update_curr_slice(fd->ctr, fd->version);
5460
5461
0
        if (-1 == cram_flush_container_mt(fd, fd->ctr))
5462
0
            ret = -1;
5463
5464
0
        cram_free_container(fd->ctr);
5465
0
        if (fd->ctr_mt == fd->ctr)
5466
0
            fd->ctr_mt = NULL;
5467
0
        fd->ctr = NULL;
5468
0
    }
5469
5470
0
    return ret;
5471
0
}
5472
5473
/*
5474
 * Writes an EOF block to a CRAM file.
5475
 *
5476
 * Returns 0 on success
5477
 *        -1 on failure
5478
 */
5479
9.85k
int cram_write_eof_block(cram_fd *fd) {
5480
    // EOF block is a container with special values to aid detection
5481
9.85k
    if (CRAM_MAJOR_VERS(fd->version) >= 2) {
5482
        // Empty container with
5483
        //   ref_seq_id -1
5484
        //   start pos 0x454f46 ("EOF")
5485
        //   span 0
5486
        //   nrec 0
5487
        //   counter 0
5488
        //   nbases 0
5489
        //   1 block (landmark 0)
5490
        //   (CRC32)
5491
9.85k
        cram_container c;
5492
9.85k
        memset(&c, 0, sizeof(c));
5493
9.85k
        c.ref_seq_id = -1;
5494
9.85k
        c.ref_seq_start = 0x454f46; // "EOF"
5495
9.85k
        c.ref_seq_span = 0;
5496
9.85k
        c.record_counter = 0;
5497
9.85k
        c.num_bases = 0;
5498
9.85k
        c.num_blocks = 1;
5499
9.85k
        int32_t land[1] = {0};
5500
9.85k
        c.landmark = land;
5501
5502
        // An empty compression header block with
5503
        //   method raw (0)
5504
        //   type comp header (1)
5505
        //   content id 0
5506
        //   block contents size 6
5507
        //   raw size 6
5508
        //     empty preservation map (01 00)
5509
        //     empty data series map (01 00)
5510
        //     empty tag map (01 00)
5511
        //   block CRC
5512
9.85k
        cram_block_compression_hdr ch;
5513
9.85k
        memset(&ch, 0, sizeof(ch));
5514
9.85k
        c.comp_hdr_block = cram_encode_compression_header(fd, &c, &ch, 0);
5515
5516
9.85k
        c.length = c.comp_hdr_block->byte            // Landmark[0]
5517
9.85k
            + 5                                      // block struct
5518
9.85k
            + 4*(CRAM_MAJOR_VERS(fd->version) >= 3); // CRC
5519
9.85k
        if (cram_write_container(fd, &c) < 0 ||
5520
9.85k
            cram_write_block(fd, c.comp_hdr_block) < 0) {
5521
0
            cram_close(fd);
5522
0
            cram_free_block(c.comp_hdr_block);
5523
0
            return -1;
5524
0
        }
5525
9.85k
        if (ch.preservation_map)
5526
0
            kh_destroy(map, ch.preservation_map);
5527
9.85k
        cram_free_block(c.comp_hdr_block);
5528
5529
        // V2.1 bytes
5530
        // 0b 00 00 00 ff ff ff ff 0f // Cont HDR: size, ref seq id
5531
        // e0 45 4f 46 00 00 00       // Cont HDR: pos, span, nrec, counter
5532
        // 00 01 00                   // Cont HDR: nbase, nblk, landmark
5533
        // 00 01 00 06 06             // Comp.HDR blk
5534
        // 01 00 01 00 01 00          // Comp.HDR blk
5535
5536
        // V3.0 bytes:
5537
        // 0f 00 00 00 ff ff ff ff 0f // Cont HDR: size, ref seq id
5538
        // e0 45 4f 46 00 00 00       // Cont HDR: pos, span, nrec, counter
5539
        // 00 01 00                   // Cont HDR: nbase, nblk, landmark
5540
        // 05 bd d9 4f                // CRC32
5541
        // 00 01 00 06 06             // Comp.HDR blk
5542
        // 01 00 01 00 01 00          // Comp.HDR blk
5543
        // ee 63 01 4b                // CRC32
5544
5545
        // V4.0 bytes:
5546
        // 0f 00 00 00 8f ff ff ff    // Cont HDR: size, ref seq id
5547
        // 82 95 9e 46 00 00 00       // Cont HDR: pos, span, nrec, counter
5548
        // 00 01 00                   // Cont HDR: nbase, nblk, landmark
5549
        // ac d6 05 bc                // CRC32
5550
        // 00 01 00 06 06             // Comp.HDR blk
5551
        // 01 00 01 00 01 00          // Comp.HDR blk
5552
        // ee 63 01 4b                // CRC32
5553
9.85k
    }
5554
5555
9.85k
    return 0;
5556
9.85k
}
5557
5558
/*
5559
 * Closes a CRAM file.
5560
 * Returns 0 on success
5561
 *        -1 on failure
5562
 */
5563
20.2k
int cram_close(cram_fd *fd) {
5564
20.2k
    spare_bams *bl, *next;
5565
20.2k
    int i, ret = 0;
5566
5567
20.2k
    if (!fd)
5568
0
        return -1;
5569
5570
20.2k
    if (fd->mode == 'w' && fd->ctr) {
5571
4.86k
        if(fd->ctr->slice)
5572
4.86k
            cram_update_curr_slice(fd->ctr, fd->version);
5573
5574
4.86k
        if (-1 == cram_flush_container_mt(fd, fd->ctr))
5575
627
            ret = -1;
5576
4.86k
    }
5577
5578
20.2k
    if (fd->mode != 'w')
5579
9.75k
        cram_drain_rqueue(fd);
5580
5581
20.2k
    if (fd->pool && fd->eof >= 0 && fd->rqueue) {
5582
0
        hts_tpool_process_flush(fd->rqueue);
5583
5584
0
        if (0 != cram_flush_result(fd))
5585
0
            ret = -1;
5586
5587
0
        if (fd->mode == 'w')
5588
0
            fd->ctr = NULL; // prevent double freeing
5589
5590
        //fprintf(stderr, "CRAM: destroy queue %p\n", fd->rqueue);
5591
5592
0
        hts_tpool_process_destroy(fd->rqueue);
5593
0
    }
5594
5595
20.2k
    pthread_mutex_destroy(&fd->metrics_lock);
5596
20.2k
    pthread_mutex_destroy(&fd->ref_lock);
5597
20.2k
    pthread_mutex_destroy(&fd->range_lock);
5598
20.2k
    pthread_mutex_destroy(&fd->bam_list_lock);
5599
5600
20.2k
    if (ret == 0 && fd->mode == 'w') {
5601
        /* Write EOF block */
5602
9.85k
        if (0 != cram_write_eof_block(fd))
5603
0
            ret = -1;
5604
9.85k
    }
5605
5606
24.8k
    for (bl = fd->bl; bl; bl = next) {
5607
4.60k
        int max_rec = fd->seqs_per_slice * fd->slices_per_container;
5608
5609
4.60k
        next = bl->next;
5610
4.60k
        free_bam_list(bl->bams, max_rec);
5611
4.60k
        free(bl);
5612
4.60k
    }
5613
5614
20.2k
    if (hclose(fd->fp) != 0)
5615
0
        ret = -1;
5616
5617
20.2k
    if (fd->file_def)
5618
20.2k
        cram_free_file_def(fd->file_def);
5619
5620
20.2k
    if (fd->header)
5621
19.6k
        sam_hdr_destroy(fd->header);
5622
5623
20.2k
    free(fd->prefix);
5624
5625
20.2k
    if (fd->ctr)
5626
11.4k
        cram_free_container(fd->ctr);
5627
5628
20.2k
    if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
5629
165
        cram_free_container(fd->ctr_mt);
5630
5631
20.2k
    if (fd->refs)
5632
20.2k
        refs_free(fd->refs);
5633
20.2k
    if (fd->ref_free)
5634
0
        free(fd->ref_free);
5635
5636
971k
    for (i = 0; i < DS_END; i++)
5637
951k
        if (fd->m[i])
5638
951k
            free(fd->m[i]);
5639
5640
20.2k
    if (fd->tags_used) {
5641
20.2k
        khint_t k;
5642
5643
41.7k
        for (k = kh_begin(fd->tags_used); k != kh_end(fd->tags_used); k++) {
5644
21.5k
            if (kh_exist(fd->tags_used, k))
5645
10.3k
                free(kh_val(fd->tags_used, k));
5646
21.5k
        }
5647
5648
20.2k
        kh_destroy(m_metrics, fd->tags_used);
5649
20.2k
    }
5650
5651
20.2k
    if (fd->index)
5652
0
        cram_index_free(fd);
5653
5654
20.2k
    if (fd->own_pool && fd->pool)
5655
0
        hts_tpool_destroy(fd->pool);
5656
5657
20.2k
    if (fd->idxfp)
5658
0
        if (bgzf_close(fd->idxfp) < 0)
5659
0
            ret = -1;
5660
5661
20.2k
    free(fd);
5662
5663
20.2k
    return ret;
5664
20.2k
}
5665
5666
/*
5667
 * Returns 1 if we hit an EOF while reading.
5668
 */
5669
16.9k
int cram_eof(cram_fd *fd) {
5670
16.9k
    return fd->eof;
5671
16.9k
}
5672
5673
5674
/*
5675
 * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
5676
 * Use this immediately after opening.
5677
 *
5678
 * Returns 0 on success
5679
 *        -1 on failure
5680
 */
5681
9.75k
int cram_set_option(cram_fd *fd, enum hts_fmt_option opt, ...) {
5682
9.75k
    int r;
5683
9.75k
    va_list args;
5684
5685
9.75k
    va_start(args, opt);
5686
9.75k
    r = cram_set_voption(fd, opt, args);
5687
9.75k
    va_end(args);
5688
5689
9.75k
    return r;
5690
9.75k
}
5691
5692
/*
5693
 * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
5694
 * Use this immediately after opening.
5695
 *
5696
 * Returns 0 on success
5697
 *        -1 on failure
5698
 */
5699
9.75k
int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) {
5700
9.75k
    refs_t *refs;
5701
5702
9.75k
    if (!fd) {
5703
0
        errno = EBADF;
5704
0
        return -1;
5705
0
    }
5706
5707
9.75k
    switch (opt) {
5708
9.75k
    case CRAM_OPT_DECODE_MD:
5709
9.75k
        fd->decode_md = va_arg(args, int);
5710
9.75k
        break;
5711
5712
0
    case CRAM_OPT_PREFIX:
5713
0
        if (fd->prefix)
5714
0
            free(fd->prefix);
5715
0
        if (!(fd->prefix = strdup(va_arg(args, char *))))
5716
0
            return -1;
5717
0
        break;
5718
5719
0
    case CRAM_OPT_VERBOSITY:
5720
0
        break;
5721
5722
0
    case CRAM_OPT_SEQS_PER_SLICE:
5723
0
        fd->seqs_per_slice = va_arg(args, int);
5724
0
        if (fd->bases_per_slice == BASES_PER_SLICE)
5725
0
            fd->bases_per_slice = fd->seqs_per_slice * 500;
5726
0
        break;
5727
5728
0
    case CRAM_OPT_BASES_PER_SLICE:
5729
0
        fd->bases_per_slice = va_arg(args, int);
5730
0
        break;
5731
5732
0
    case CRAM_OPT_SLICES_PER_CONTAINER:
5733
0
        fd->slices_per_container = va_arg(args, int);
5734
0
        break;
5735
5736
0
    case CRAM_OPT_EMBED_REF:
5737
0
        fd->embed_ref = va_arg(args, int);
5738
0
        break;
5739
5740
0
    case CRAM_OPT_NO_REF:
5741
0
        fd->no_ref = va_arg(args, int);
5742
0
        break;
5743
5744
0
    case CRAM_OPT_POS_DELTA:
5745
0
        fd->ap_delta = va_arg(args, int);
5746
0
        break;
5747
5748
0
    case CRAM_OPT_IGNORE_MD5:
5749
0
        fd->ignore_md5 = va_arg(args, int);
5750
0
        break;
5751
5752
0
    case CRAM_OPT_LOSSY_NAMES:
5753
0
        fd->lossy_read_names = va_arg(args, int);
5754
        // Currently lossy read names required paired (attached) reads.
5755
        // TLEN 0 or being 1 out causes read pairs to be detached, breaking
5756
        // the lossy read name compression, so we have extra options to
5757
        // slacken the exact TLEN round-trip checks.
5758
0
        fd->tlen_approx = fd->lossy_read_names;
5759
0
        fd->tlen_zero = fd->lossy_read_names;
5760
0
        break;
5761
5762
0
    case CRAM_OPT_USE_BZIP2:
5763
0
        fd->use_bz2 = va_arg(args, int);
5764
0
        break;
5765
5766
0
    case CRAM_OPT_USE_RANS:
5767
0
        fd->use_rans = va_arg(args, int);
5768
0
        break;
5769
5770
0
    case CRAM_OPT_USE_TOK:
5771
0
        fd->use_tok = va_arg(args, int);
5772
0
        break;
5773
5774
0
    case CRAM_OPT_USE_FQZ:
5775
0
        fd->use_fqz = va_arg(args, int);
5776
0
        break;
5777
5778
0
    case CRAM_OPT_USE_ARITH:
5779
0
        fd->use_arith = va_arg(args, int);
5780
0
        break;
5781
5782
0
    case CRAM_OPT_USE_LZMA:
5783
0
        fd->use_lzma = va_arg(args, int);
5784
0
        break;
5785
5786
0
    case CRAM_OPT_SHARED_REF:
5787
0
        fd->shared_ref = 1;
5788
0
        refs = va_arg(args, refs_t *);
5789
0
        if (refs != fd->refs) {
5790
0
            if (fd->refs)
5791
0
                refs_free(fd->refs);
5792
0
            fd->refs = refs;
5793
0
            fd->refs->count++;
5794
0
        }
5795
0
        break;
5796
5797
0
    case CRAM_OPT_RANGE: {
5798
0
        int r = cram_seek_to_refpos(fd, va_arg(args, cram_range *));
5799
0
        pthread_mutex_lock(&fd->range_lock);
5800
//        printf("opt range noseek to %p %d:%ld-%ld\n",
5801
//               fd, fd->range.refid, fd->range.start, fd->range.end);
5802
0
        if (fd->range.refid != -2)
5803
0
            fd->required_fields |= SAM_POS;
5804
0
        pthread_mutex_unlock(&fd->range_lock);
5805
0
        return r;
5806
0
    }
5807
5808
0
    case CRAM_OPT_RANGE_NOSEEK: {
5809
        // As per CRAM_OPT_RANGE, but no seeking
5810
0
        pthread_mutex_lock(&fd->range_lock);
5811
0
        cram_range *r = va_arg(args, cram_range *);
5812
0
        fd->range = *r;
5813
0
        if (r->refid == HTS_IDX_NOCOOR) {
5814
0
            fd->range.refid = -1;
5815
0
            fd->range.start = 0;
5816
0
        } else if (r->refid == HTS_IDX_START || r->refid == HTS_IDX_REST) {
5817
0
            fd->range.refid = -2; // special case in cram_next_slice
5818
0
        }
5819
0
        if (fd->range.refid != -2)
5820
0
            fd->required_fields |= SAM_POS;
5821
0
        fd->ooc = 0;
5822
0
        fd->eof = 0;
5823
0
        pthread_mutex_unlock(&fd->range_lock);
5824
0
        return 0;
5825
0
    }
5826
5827
0
    case CRAM_OPT_REFERENCE:
5828
0
        return cram_load_reference(fd, va_arg(args, char *));
5829
5830
0
    case CRAM_OPT_VERSION: {
5831
0
        int major, minor;
5832
0
        char *s = va_arg(args, char *);
5833
0
        if (2 != sscanf(s, "%d.%d", &major, &minor)) {
5834
0
            hts_log_error("Malformed version string %s", s);
5835
0
            return -1;
5836
0
        }
5837
0
        if (!((major == 1 &&  minor == 0) ||
5838
0
              (major == 2 && (minor == 0 || minor == 1)) ||
5839
0
              (major == 3 && (minor == 0 || minor == 1)) ||
5840
0
              (major == 4 &&  minor == 0))) {
5841
0
            hts_log_error("Unknown version string; use 1.0, 2.0, 2.1, 3.0, 3.1 or 4.0");
5842
0
            errno = EINVAL;
5843
0
            return -1;
5844
0
        }
5845
5846
0
        if (major > 3 || (major == 3 && minor > 1)) {
5847
0
            hts_log_warning(
5848
0
                "CRAM version %s is still a draft and subject to change.\n"
5849
0
                "This is a technology demonstration that should not be "
5850
0
                "used for archival data.", s);
5851
0
        }
5852
5853
0
        fd->version = major*256 + minor;
5854
5855
0
        fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3) ? 1 : 0;
5856
5857
0
        fd->use_tok = ((CRAM_MAJOR_VERS(fd->version) == 3 &&
5858
0
                        CRAM_MINOR_VERS(fd->version) >= 1) ||
5859
0
                        CRAM_MAJOR_VERS(fd->version) >= 4) ? 1 : 0;
5860
0
        cram_init_tables(fd);
5861
5862
0
        break;
5863
0
    }
5864
5865
0
    case CRAM_OPT_MULTI_SEQ_PER_SLICE:
5866
0
        fd->multi_seq_user = fd->multi_seq = va_arg(args, int);
5867
0
        break;
5868
5869
0
    case CRAM_OPT_NTHREADS: {
5870
0
        int nthreads =  va_arg(args, int);
5871
0
        if (nthreads >= 1) {
5872
0
            if (!(fd->pool = hts_tpool_init(nthreads)))
5873
0
                return -1;
5874
5875
0
            fd->rqueue = hts_tpool_process_init(fd->pool, nthreads*2, 0);
5876
0
            fd->shared_ref = 1;
5877
0
            fd->own_pool = 1;
5878
0
        }
5879
0
        break;
5880
0
    }
5881
5882
0
    case CRAM_OPT_THREAD_POOL: {
5883
0
        htsThreadPool *p = va_arg(args, htsThreadPool *);
5884
0
        fd->pool = p ? p->pool : NULL;
5885
0
        if (fd->pool) {
5886
0
            fd->rqueue = hts_tpool_process_init(fd->pool,
5887
0
                                                p->qsize ? p->qsize : hts_tpool_size(fd->pool)*2,
5888
0
                                                0);
5889
0
        }
5890
0
        fd->shared_ref = 1; // Needed to avoid clobbering ref between threads
5891
0
        fd->own_pool = 0;
5892
5893
        //fd->qsize = 1;
5894
        //fd->decoded = calloc(fd->qsize, sizeof(cram_container *));
5895
        //hts_tpool_dispatch(fd->pool, cram_decoder_thread, fd);
5896
0
        break;
5897
0
    }
5898
5899
0
    case CRAM_OPT_REQUIRED_FIELDS:
5900
0
        fd->required_fields = va_arg(args, int);
5901
0
        if (fd->range.refid != -2)
5902
0
            fd->required_fields |= SAM_POS;
5903
0
        break;
5904
5905
0
    case CRAM_OPT_STORE_MD:
5906
0
        fd->store_md = va_arg(args, int);
5907
0
        break;
5908
5909
0
    case CRAM_OPT_STORE_NM:
5910
0
        fd->store_nm = va_arg(args, int);
5911
0
        break;
5912
5913
0
    case HTS_OPT_COMPRESSION_LEVEL:
5914
0
        fd->level = va_arg(args, int);
5915
0
        break;
5916
5917
0
    case HTS_OPT_PROFILE: {
5918
0
        enum hts_profile_option prof = va_arg(args, int);
5919
0
        switch (prof) {
5920
0
        case HTS_PROFILE_FAST:
5921
0
            if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 1;
5922
0
            fd->use_tok = 0;
5923
0
            fd->seqs_per_slice = 10000;
5924
0
            break;
5925
5926
0
        case HTS_PROFILE_NORMAL:
5927
0
            break;
5928
5929
0
        case HTS_PROFILE_SMALL:
5930
0
            if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 6;
5931
0
            fd->use_bz2 = 1;
5932
0
            fd->use_fqz = 1;
5933
0
            fd->seqs_per_slice = 25000;
5934
0
            break;
5935
5936
0
        case HTS_PROFILE_ARCHIVE:
5937
0
            if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 7;
5938
0
            fd->use_bz2 = 1;
5939
0
            fd->use_fqz = 1;
5940
0
            fd->use_arith = 1;
5941
0
            if (fd->level > 7)
5942
0
                fd->use_lzma = 1;
5943
0
            fd->seqs_per_slice = 100000;
5944
0
            break;
5945
0
        }
5946
5947
0
        if (fd->bases_per_slice == BASES_PER_SLICE)
5948
0
            fd->bases_per_slice = fd->seqs_per_slice * 500;
5949
0
        break;
5950
0
    }
5951
5952
0
    default:
5953
0
        hts_log_error("Unknown CRAM option code %d", opt);
5954
0
        errno = EINVAL;
5955
0
        return -1;
5956
9.75k
    }
5957
5958
9.75k
    return 0;
5959
9.75k
}
5960
5961
int cram_check_EOF(cram_fd *fd)
5962
0
{
5963
    // Byte 9 in these templates is & with 0x0f to resolve differences
5964
    // between ITF-8 interpretations between early Java and C
5965
    // implementations of CRAM
5966
0
    static const unsigned char TEMPLATE_2_1[30] = {
5967
0
        0x0b, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0f, 0xe0,
5968
0
        0x45, 0x4f, 0x46, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
5969
0
        0x01, 0x00, 0x06, 0x06, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00
5970
0
    };
5971
0
    static const unsigned char TEMPLATE_3[38] = {
5972
0
        0x0f, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0f, 0xe0,
5973
0
        0x45, 0x4f, 0x46, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x05,
5974
0
        0xbd, 0xd9, 0x4f, 0x00, 0x01, 0x00, 0x06, 0x06, 0x01, 0x00,
5975
0
        0x01, 0x00, 0x01, 0x00, 0xee, 0x63, 0x01, 0x4b
5976
0
    };
5977
5978
0
    unsigned char buf[38]; // max(sizeof TEMPLATE_*)
5979
5980
0
    uint8_t major = CRAM_MAJOR_VERS(fd->version);
5981
0
    uint8_t minor = CRAM_MINOR_VERS(fd->version);
5982
5983
0
    const unsigned char *template;
5984
0
    ssize_t template_len;
5985
0
    if ((major < 2) ||
5986
0
        (major == 2 && minor == 0)) {
5987
0
        return 3; // No EOF support in cram versions less than 2.1
5988
0
    } else if (major == 2 && minor == 1) {
5989
0
        template = TEMPLATE_2_1;
5990
0
        template_len = sizeof TEMPLATE_2_1;
5991
0
    } else {
5992
0
        template = TEMPLATE_3;
5993
0
        template_len = sizeof TEMPLATE_3;
5994
0
    }
5995
5996
0
    off_t offset = htell(fd->fp);
5997
0
    if (hseek(fd->fp, -template_len, SEEK_END) < 0) {
5998
0
        if (errno == ESPIPE) {
5999
0
            hclearerr(fd->fp);
6000
0
            return 2;
6001
0
        }
6002
0
        else {
6003
0
            return -1;
6004
0
        }
6005
0
    }
6006
0
    if (hread(fd->fp, buf, template_len) != template_len) return -1;
6007
0
    if (hseek(fd->fp, offset, SEEK_SET) < 0) return -1;
6008
0
    buf[8] &= 0x0f;
6009
0
    return (memcmp(template, buf, template_len) == 0)? 1 : 0;
6010
0
}