Coverage Report

Created: 2026-03-31 06:35

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/cram/cram_io.c
Line
Count
Source
1
/*
2
Copyright (c) 2012-2025 Genome Research Ltd.
3
Author: James Bonfield <jkb@sanger.ac.uk>
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
   1. Redistributions of source code must retain the above copyright notice,
9
this list of conditions and the following disclaimer.
10
11
   2. Redistributions in binary form must reproduce the above copyright notice,
12
this list of conditions and the following disclaimer in the documentation
13
and/or other materials provided with the distribution.
14
15
   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16
Institute nor the names of its contributors may be used to endorse or promote
17
products derived from this software without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
/*
32
 * CRAM I/O primitives.
33
 *
34
 * - ITF8 encoding and decoding.
35
 * - Block based I/O
36
 * - Zlib inflating and deflating (memory)
37
 * - CRAM basic data structure reading and writing
38
 * - File opening / closing
39
 * - Reference sequence handling
40
 */
41
42
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
43
#include <config.h>
44
45
#include <stdio.h>
46
#include <errno.h>
47
#include <assert.h>
48
#include <stdlib.h>
49
#include <string.h>
50
#include <unistd.h>
51
#include <zlib.h>
52
#ifdef HAVE_LIBBZ2
53
#include <bzlib.h>
54
#endif
55
#ifdef HAVE_LIBLZMA
56
#ifdef HAVE_LZMA_H
57
#include <lzma.h>
58
#else
59
#include "../os/lzma_stub.h"
60
#endif
61
#endif
62
#include <sys/types.h>
63
#include <sys/stat.h>
64
#include <math.h>
65
#include <stdint.h>
66
67
#ifdef HAVE_LIBDEFLATE
68
#include <libdeflate.h>
69
#define crc32(a,b,c) libdeflate_crc32((a),(b),(c))
70
#endif
71
72
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
73
#include "../fuzz_settings.h"
74
#endif
75
76
#include "cram.h"
77
#include "os.h"
78
#include "../htslib/hts.h"
79
#include "../hts_internal.h"
80
#include "open_trace_file.h"
81
82
#if defined(HAVE_EXTERNAL_LIBHTSCODECS)
83
#include <htscodecs/rANS_static.h>
84
#include <htscodecs/rANS_static4x16.h>
85
#include <htscodecs/arith_dynamic.h>
86
#include <htscodecs/tokenise_name3.h>
87
#include <htscodecs/fqzcomp_qual.h>
88
#include <htscodecs/varint.h> // CRAM v4.0 variable-size integers
89
#else
90
#include "../htscodecs/htscodecs/rANS_static.h"
91
#include "../htscodecs/htscodecs/rANS_static4x16.h"
92
#include "../htscodecs/htscodecs/arith_dynamic.h"
93
#include "../htscodecs/htscodecs/tokenise_name3.h"
94
#include "../htscodecs/htscodecs/fqzcomp_qual.h"
95
#include "../htscodecs/htscodecs/varint.h"
96
#endif
97
98
//#define REF_DEBUG
99
100
#ifdef REF_DEBUG
101
#include <sys/syscall.h>
102
#define gettid() (int)syscall(SYS_gettid)
103
104
#define RP(...) fprintf (stderr, __VA_ARGS__)
105
#else
106
#define RP(...)
107
#endif
108
109
#include "../htslib/hfile.h"
110
#include "../htslib/bgzf.h"
111
#include "../htslib/faidx.h"
112
#include "../hts_internal.h"
113
114
#ifndef PATH_MAX
115
#define PATH_MAX FILENAME_MAX
116
#endif
117
118
267k
#define TRIAL_SPAN 70
119
267k
#define NTRIALS 3
120
121
3.12k
#define CRAM_DEFAULT_LEVEL 5
122
123
/* ----------------------------------------------------------------------
124
 * ITF8 encoding and decoding.
125
 *
126
 * Also see the itf8_get and itf8_put macros in cram_io.h
127
 */
128
129
/*
130
 * LEGACY: consider using itf8_decode_crc.
131
 *
132
 * Reads an integer in ITF-8 encoding from 'cp' and stores it in
133
 * *val.
134
 *
135
 * Returns the number of bytes read on success
136
 *        -1 on failure
137
 */
138
0
int itf8_decode(cram_fd *fd, int32_t *val_p) {
139
0
    static int nbytes[16] = {
140
0
        0,0,0,0, 0,0,0,0,                               // 0000xxxx - 0111xxxx
141
0
        1,1,1,1,                                        // 1000xxxx - 1011xxxx
142
0
        2,2,                                            // 1100xxxx - 1101xxxx
143
0
        3,                                              // 1110xxxx
144
0
        4,                                              // 1111xxxx
145
0
    };
146
147
0
    static int nbits[16] = {
148
0
        0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
149
0
        0x3f, 0x3f, 0x3f, 0x3f,                         // 1000xxxx - 1011xxxx
150
0
        0x1f, 0x1f,                                     // 1100xxxx - 1101xxxx
151
0
        0x0f,                                           // 1110xxxx
152
0
        0x0f,                                           // 1111xxxx
153
0
    };
154
155
0
    int32_t val = hgetc(fd->fp);
156
0
    if (val == -1)
157
0
        return -1;
158
159
0
    int i = nbytes[val>>4];
160
0
    val &= nbits[val>>4];
161
162
0
    switch(i) {
163
0
    case 0:
164
0
        *val_p = val;
165
0
        return 1;
166
167
0
    case 1:
168
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
169
0
        *val_p = val;
170
0
        return 2;
171
172
0
    case 2:
173
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
174
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
175
0
        *val_p = val;
176
0
        return 3;
177
178
0
    case 3:
179
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
180
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
181
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
182
0
        *val_p = val;
183
0
        return 4;
184
185
0
    case 4: // really 3.5 more, why make it different?
186
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
187
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
188
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
189
0
        val = (val<<4) | (((unsigned char)hgetc(fd->fp)) & 0x0f);
190
0
        *val_p = val;
191
0
    }
192
193
0
    return 5;
194
0
}
195
196
69.4k
int itf8_decode_crc(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
197
69.4k
    static int nbytes[16] = {
198
69.4k
        0,0,0,0, 0,0,0,0,                               // 0000xxxx - 0111xxxx
199
69.4k
        1,1,1,1,                                        // 1000xxxx - 1011xxxx
200
69.4k
        2,2,                                            // 1100xxxx - 1101xxxx
201
69.4k
        3,                                              // 1110xxxx
202
69.4k
        4,                                              // 1111xxxx
203
69.4k
    };
204
205
69.4k
    static int nbits[16] = {
206
69.4k
        0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
207
69.4k
        0x3f, 0x3f, 0x3f, 0x3f,                         // 1000xxxx - 1011xxxx
208
69.4k
        0x1f, 0x1f,                                     // 1100xxxx - 1101xxxx
209
69.4k
        0x0f,                                           // 1110xxxx
210
69.4k
        0x0f,                                           // 1111xxxx
211
69.4k
    };
212
69.4k
    unsigned char c[5];
213
214
69.4k
    int32_t val = hgetc(fd->fp);
215
69.4k
    if (val == -1)
216
9
        return -1;
217
218
69.4k
    c[0]=val;
219
220
69.4k
    int i = nbytes[val>>4];
221
69.4k
    val &= nbits[val>>4];
222
223
69.4k
    if (i > 0) {
224
18.5k
        if (hread(fd->fp, &c[1], i) < i)
225
0
            return -1;
226
18.5k
    }
227
228
69.4k
    switch(i) {
229
50.9k
    case 0:
230
50.9k
        *val_p = val;
231
50.9k
        *crc = crc32(*crc, c, 1);
232
50.9k
        return 1;
233
234
16.5k
    case 1:
235
16.5k
        val = (val<<8) | c[1];
236
16.5k
        *val_p = val;
237
16.5k
        *crc = crc32(*crc, c, 2);
238
16.5k
        return 2;
239
240
390
    case 2:
241
390
        val = (val<<8) | c[1];
242
390
        val = (val<<8) | c[2];
243
390
        *val_p = val;
244
390
        *crc = crc32(*crc, c, 3);
245
390
        return 3;
246
247
452
    case 3:
248
452
        val = (val<<8) | c[1];
249
452
        val = (val<<8) | c[2];
250
452
        val = (val<<8) | c[3];
251
452
        *val_p = val;
252
452
        *crc = crc32(*crc, c, 4);
253
452
        return 4;
254
255
1.08k
    case 4: // really 3.5 more, why make it different?
256
1.08k
        {
257
1.08k
            uint32_t uv = val;
258
1.08k
            uv = (uv<<8) |  c[1];
259
1.08k
            uv = (uv<<8) |  c[2];
260
1.08k
            uv = (uv<<8) |  c[3];
261
1.08k
            uv = (uv<<4) | (c[4] & 0x0f);
262
            // Avoid implementation-defined behaviour on negative values
263
1.08k
            *val_p = uv < 0x80000000UL ? (int32_t) uv : -((int32_t) (0xffffffffUL - uv)) - 1;
264
1.08k
            *crc = crc32(*crc, c, 5);
265
1.08k
        }
266
69.4k
    }
267
268
1.08k
    return 5;
269
69.4k
}
270
271
/*
272
 * Stores a value to memory in ITF-8 format.
273
 *
274
 * Returns the number of bytes required to store the number.
275
 * This is a maximum of 5 bytes.
276
 */
277
19.6M
static inline int itf8_put(char *cp, int32_t val) {
278
19.6M
    unsigned char *up = (unsigned char *)cp;
279
19.6M
    if        (!(val & ~0x00000007f)) { // 1 byte
280
18.4M
        *up = val;
281
18.4M
        return 1;
282
18.4M
    } else if (!(val & ~0x00003fff)) { // 2 byte
283
359k
        *up++ = (val >> 8 ) | 0x80;
284
359k
        *up   = val & 0xff;
285
359k
        return 2;
286
837k
    } else if (!(val & ~0x01fffff)) { // 3 byte
287
18.4k
        *up++ = (val >> 16) | 0xc0;
288
18.4k
        *up++ = (val >> 8 ) & 0xff;
289
18.4k
        *up   = val & 0xff;
290
18.4k
        return 3;
291
819k
    } else if (!(val & ~0x0fffffff)) { // 4 byte
292
623k
        *up++ = (val >> 24) | 0xe0;
293
623k
        *up++ = (val >> 16) & 0xff;
294
623k
        *up++ = (val >> 8 ) & 0xff;
295
623k
        *up   = val & 0xff;
296
623k
        return 4;
297
623k
    } else {                           // 5 byte
298
195k
        *up++ = 0xf0 | ((val>>28) & 0xff);
299
195k
        *up++ = (val >> 20) & 0xff;
300
195k
        *up++ = (val >> 12) & 0xff;
301
195k
        *up++ = (val >> 4 ) & 0xff;
302
195k
        *up = val & 0x0f;
303
195k
        return 5;
304
195k
    }
305
19.6M
}
306
307
308
/* 64-bit itf8 variant */
309
129k
static inline int ltf8_put(char *cp, int64_t val) {
310
129k
    unsigned char *up = (unsigned char *)cp;
311
129k
    if        (!(val & ~((1LL<<7)-1))) {
312
73.3k
        *up = val;
313
73.3k
        return 1;
314
73.3k
    } else if (!(val & ~((1LL<<(6+8))-1))) {
315
54.9k
        *up++ = (val >> 8 ) | 0x80;
316
54.9k
        *up   = val & 0xff;
317
54.9k
        return 2;
318
54.9k
    } else if (!(val & ~((1LL<<(5+2*8))-1))) {
319
999
        *up++ = (val >> 16) | 0xc0;
320
999
        *up++ = (val >> 8 ) & 0xff;
321
999
        *up   = val & 0xff;
322
999
        return 3;
323
999
    } else if (!(val & ~((1LL<<(4+3*8))-1))) {
324
39
        *up++ = (val >> 24) | 0xe0;
325
39
        *up++ = (val >> 16) & 0xff;
326
39
        *up++ = (val >> 8 ) & 0xff;
327
39
        *up   = val & 0xff;
328
39
        return 4;
329
39
    } else if (!(val & ~((1LL<<(3+4*8))-1))) {
330
0
        *up++ = (val >> 32) | 0xf0;
331
0
        *up++ = (val >> 24) & 0xff;
332
0
        *up++ = (val >> 16) & 0xff;
333
0
        *up++ = (val >> 8 ) & 0xff;
334
0
        *up   = val & 0xff;
335
0
        return 5;
336
0
    } else if (!(val & ~((1LL<<(2+5*8))-1))) {
337
0
        *up++ = (val >> 40) | 0xf8;
338
0
        *up++ = (val >> 32) & 0xff;
339
0
        *up++ = (val >> 24) & 0xff;
340
0
        *up++ = (val >> 16) & 0xff;
341
0
        *up++ = (val >> 8 ) & 0xff;
342
0
        *up   = val & 0xff;
343
0
        return 6;
344
0
    } else if (!(val & ~((1LL<<(1+6*8))-1))) {
345
0
        *up++ = (val >> 48) | 0xfc;
346
0
        *up++ = (val >> 40) & 0xff;
347
0
        *up++ = (val >> 32) & 0xff;
348
0
        *up++ = (val >> 24) & 0xff;
349
0
        *up++ = (val >> 16) & 0xff;
350
0
        *up++ = (val >> 8 ) & 0xff;
351
0
        *up   = val & 0xff;
352
0
        return 7;
353
0
    } else if (!(val & ~((1LL<<(7*8))-1))) {
354
0
        *up++ = (val >> 56) | 0xfe;
355
0
        *up++ = (val >> 48) & 0xff;
356
0
        *up++ = (val >> 40) & 0xff;
357
0
        *up++ = (val >> 32) & 0xff;
358
0
        *up++ = (val >> 24) & 0xff;
359
0
        *up++ = (val >> 16) & 0xff;
360
0
        *up++ = (val >> 8 ) & 0xff;
361
0
        *up   = val & 0xff;
362
0
        return 8;
363
0
    } else {
364
0
        *up++ = 0xff;
365
0
        *up++ = (val >> 56) & 0xff;
366
0
        *up++ = (val >> 48) & 0xff;
367
0
        *up++ = (val >> 40) & 0xff;
368
0
        *up++ = (val >> 32) & 0xff;
369
0
        *up++ = (val >> 24) & 0xff;
370
0
        *up++ = (val >> 16) & 0xff;
371
0
        *up++ = (val >> 8 ) & 0xff;
372
0
        *up   = val & 0xff;
373
0
        return 9;
374
0
    }
375
129k
}
376
377
/*
378
 * Encodes and writes a single integer in ITF-8 format.
379
 * Returns 0 on success
380
 *        -1 on failure
381
 */
382
0
int itf8_encode(cram_fd *fd, int32_t val) {
383
0
    char buf[5];
384
0
    int len = itf8_put(buf, val);
385
0
    return hwrite(fd->fp, buf, len) == len ? 0 : -1;
386
0
}
387
388
const int itf8_bytes[16] = {
389
    1, 1, 1, 1,  1, 1, 1, 1,
390
    2, 2, 2, 2,  3, 3, 4, 5
391
};
392
393
const int ltf8_bytes[256] = {
394
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
395
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
396
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
397
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
398
399
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
400
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
401
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
402
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
403
404
    2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
405
    2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
406
    2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
407
    2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
408
409
    3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,
410
    3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,
411
412
    4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
413
414
    5, 5, 5, 5,  5, 5, 5, 5,  6, 6, 6, 6,  7, 7, 8, 9
415
};
416
417
/*
418
 * LEGACY: consider using ltf8_decode_crc.
419
 */
420
0
int ltf8_decode(cram_fd *fd, int64_t *val_p) {
421
0
    int c = hgetc(fd->fp);
422
0
    int64_t val = (unsigned char)c;
423
0
    if (c == -1)
424
0
        return -1;
425
426
0
    if (val < 0x80) {
427
0
        *val_p =   val;
428
0
        return 1;
429
430
0
    } else if (val < 0xc0) {
431
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
432
0
        *val_p = val & (((1LL<<(6+8)))-1);
433
0
        return 2;
434
435
0
    } else if (val < 0xe0) {
436
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
437
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
438
0
        *val_p = val & ((1LL<<(5+2*8))-1);
439
0
        return 3;
440
441
0
    } else if (val < 0xf0) {
442
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
443
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
444
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
445
0
        *val_p = val & ((1LL<<(4+3*8))-1);
446
0
        return 4;
447
448
0
    } else if (val < 0xf8) {
449
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
450
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
451
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
452
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
453
0
        *val_p = val & ((1LL<<(3+4*8))-1);
454
0
        return 5;
455
456
0
    } else if (val < 0xfc) {
457
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
458
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
459
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
460
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
461
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
462
0
        *val_p = val & ((1LL<<(2+5*8))-1);
463
0
        return 6;
464
465
0
    } else if (val < 0xfe) {
466
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
467
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
468
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
469
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
470
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
471
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
472
0
        *val_p = val & ((1LL<<(1+6*8))-1);
473
0
        return 7;
474
475
0
    } else if (val < 0xff) {
476
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
477
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
478
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
479
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
480
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
481
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
482
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
483
0
        *val_p = val & ((1LL<<(7*8))-1);
484
0
        return 8;
485
486
0
    } else {
487
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
488
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
489
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
490
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
491
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
492
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
493
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
494
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
495
0
        *val_p = val;
496
0
    }
497
498
0
    return 9;
499
0
}
500
501
1.06k
int ltf8_decode_crc(cram_fd *fd, int64_t *val_p, uint32_t *crc) {
502
1.06k
    unsigned char c[9];
503
1.06k
    int64_t val = hgetc(fd->fp);
504
1.06k
    if (val < 0)
505
0
        return -1;
506
507
1.06k
    c[0] = val;
508
509
1.06k
    if (val < 0x80) {
510
1.02k
        *val_p =   val;
511
1.02k
        *crc = crc32(*crc, c, 1);
512
1.02k
        return 1;
513
514
1.02k
    } else if (val < 0xc0) {
515
24
        int v = hgetc(fd->fp);
516
24
        if (v < 0)
517
0
            return -1;
518
24
        val = (val<<8) | (c[1]=v);
519
24
        *val_p = val & (((1LL<<(6+8)))-1);
520
24
        *crc = crc32(*crc, c, 2);
521
24
        return 2;
522
523
24
    } else if (val < 0xe0) {
524
0
        if (hread(fd->fp, &c[1], 2) < 2)
525
0
            return -1;
526
0
        val = (val<<8) | c[1];
527
0
        val = (val<<8) | c[2];
528
0
        *val_p = val & ((1LL<<(5+2*8))-1);
529
0
        *crc = crc32(*crc, c, 3);
530
0
        return 3;
531
532
21
    } else if (val < 0xf0) {
533
0
        if (hread(fd->fp, &c[1], 3) < 3)
534
0
            return -1;
535
0
        val = (val<<8) | c[1];
536
0
        val = (val<<8) | c[2];
537
0
        val = (val<<8) | c[3];
538
0
        *val_p = val & ((1LL<<(4+3*8))-1);
539
0
        *crc = crc32(*crc, c, 4);
540
0
        return 4;
541
542
21
    } else if (val < 0xf8) {
543
9
        if (hread(fd->fp, &c[1], 4) < 4)
544
0
            return -1;
545
9
        val = (val<<8) | c[1];
546
9
        val = (val<<8) | c[2];
547
9
        val = (val<<8) | c[3];
548
9
        val = (val<<8) | c[4];
549
9
        *val_p = val & ((1LL<<(3+4*8))-1);
550
9
        *crc = crc32(*crc, c, 5);
551
9
        return 5;
552
553
12
    } else if (val < 0xfc) {
554
0
        if (hread(fd->fp, &c[1], 5) < 5)
555
0
            return -1;
556
0
        val = (val<<8) | c[1];
557
0
        val = (val<<8) | c[2];
558
0
        val = (val<<8) | c[3];
559
0
        val = (val<<8) | c[4];
560
0
        val = (val<<8) | c[5];
561
0
        *val_p = val & ((1LL<<(2+5*8))-1);
562
0
        *crc = crc32(*crc, c, 6);
563
0
        return 6;
564
565
12
    } else if (val < 0xfe) {
566
12
        if (hread(fd->fp, &c[1], 6) < 6)
567
0
            return -1;
568
12
        val = (val<<8) | c[1];
569
12
        val = (val<<8) | c[2];
570
12
        val = (val<<8) | c[3];
571
12
        val = (val<<8) | c[4];
572
12
        val = (val<<8) | c[5];
573
12
        val = (val<<8) | c[6];
574
12
        *val_p = val & ((1LL<<(1+6*8))-1);
575
12
        *crc = crc32(*crc, c, 7);
576
12
        return 7;
577
578
12
    } else if (val < 0xff) {
579
0
        uint64_t uval = val;
580
0
        if (hread(fd->fp, &c[1], 7) < 7)
581
0
            return -1;
582
0
        uval = (uval<<8) | c[1];
583
0
        uval = (uval<<8) | c[2];
584
0
        uval = (uval<<8) | c[3];
585
0
        uval = (uval<<8) | c[4];
586
0
        uval = (uval<<8) | c[5];
587
0
        uval = (uval<<8) | c[6];
588
0
        uval = (uval<<8) | c[7];
589
0
        *val_p = uval & ((1ULL<<(7*8))-1);
590
0
        *crc = crc32(*crc, c, 8);
591
0
        return 8;
592
593
0
    } else {
594
0
        uint64_t uval;
595
0
        if (hread(fd->fp, &c[1], 8) < 8)
596
0
            return -1;
597
0
        uval =             c[1];
598
0
        uval = (uval<<8) | c[2];
599
0
        uval = (uval<<8) | c[3];
600
0
        uval = (uval<<8) | c[4];
601
0
        uval = (uval<<8) | c[5];
602
0
        uval = (uval<<8) | c[6];
603
0
        uval = (uval<<8) | c[7];
604
0
        uval = (uval<<8) | c[8];
605
0
        *crc = crc32(*crc, c, 9);
606
        // Avoid implementation-defined behaviour on negative values
607
0
        *val_p = c[1] < 0x80 ? (int64_t) uval : -((int64_t) (0xffffffffffffffffULL - uval)) - 1;
608
0
    }
609
610
0
    return 9;
611
1.06k
}
612
613
/*
614
 * Pushes a value in ITF8 format onto the end of a block.
615
 * This shouldn't be used for high-volume data as it is not the fastest
616
 * method.
617
 *
618
 * Returns the number of bytes written
619
 */
620
13.5M
int itf8_put_blk(cram_block *blk, int32_t val) {
621
13.5M
    char buf[5];
622
13.5M
    int sz;
623
624
13.5M
    sz = itf8_put(buf, val);
625
13.5M
    BLOCK_APPEND(blk, buf, sz);
626
13.5M
    return sz;
627
628
0
 block_err:
629
0
    return -1;
630
13.5M
}
631
632
0
int ltf8_put_blk(cram_block *blk, int64_t val) {
633
0
    char buf[9];
634
0
    int sz;
635
636
0
    sz = ltf8_put(buf, val);
637
0
    BLOCK_APPEND(blk, buf, sz);
638
0
    return sz;
639
640
0
 block_err:
641
0
    return -1;
642
0
}
643
644
110k
static int64_t safe_itf8_get(char **cp, const char *endp, int *err) {
645
110k
    const unsigned char *up = (unsigned char *)*cp;
646
647
110k
    if (endp && endp - *cp < 5 &&
648
12.4k
        (*cp >= endp || endp - *cp < itf8_bytes[up[0]>>4])) {
649
3.82k
        if (err) *err = 1;
650
3.82k
        return 0;
651
3.82k
    }
652
653
106k
    if (up[0] < 0x80) {
654
95.9k
        (*cp)++;
655
95.9k
        return up[0];
656
95.9k
    } else if (up[0] < 0xc0) {
657
4.86k
        (*cp)+=2;
658
4.86k
        return ((up[0] <<8) |  up[1])                           & 0x3fff;
659
6.17k
    } else if (up[0] < 0xe0) {
660
1.16k
        (*cp)+=3;
661
1.16k
        return ((up[0]<<16) | (up[1]<< 8) |  up[2])             & 0x1fffff;
662
5.01k
    } else if (up[0] < 0xf0) {
663
1.24k
        (*cp)+=4;
664
1.24k
        uint32_t uv = (((uint32_t)up[0]<<24) | (up[1]<<16) | (up[2]<<8) | up[3]) & 0x0fffffff;
665
1.24k
        return (int32_t)uv;
666
3.77k
    } else {
667
3.77k
        (*cp)+=5;
668
3.77k
        uint32_t uv = (((uint32_t)up[0] & 0x0f)<<28) | (up[1]<<20) | (up[2]<<12) | (up[3]<<4) | (up[4] & 0x0f);
669
3.77k
        return (int32_t)uv;
670
3.77k
    }
671
106k
}
672
673
393
static int64_t safe_ltf8_get(char **cp, const char *endp, int *err) {
674
393
    unsigned char *up = (unsigned char *)*cp;
675
676
393
    if (endp && endp - *cp < 9 &&
677
389
        (*cp >= endp || endp - *cp < ltf8_bytes[up[0]])) {
678
111
        if (err) *err = 1;
679
111
        return 0;
680
111
    }
681
682
282
    if (up[0] < 0x80) {
683
10
        (*cp)++;
684
10
        return up[0];
685
272
    } else if (up[0] < 0xc0) {
686
42
        (*cp)+=2;
687
42
        return (((uint64_t)up[0]<< 8) |
688
42
                 (uint64_t)up[1]) & (((1LL<<(6+8)))-1);
689
230
    } else if (up[0] < 0xe0) {
690
26
        (*cp)+=3;
691
26
        return (((uint64_t)up[0]<<16) |
692
26
                ((uint64_t)up[1]<< 8) |
693
26
                 (uint64_t)up[2]) & ((1LL<<(5+2*8))-1);
694
204
    } else if (up[0] < 0xf0) {
695
66
        (*cp)+=4;
696
66
        return (((uint64_t)up[0]<<24) |
697
66
                ((uint64_t)up[1]<<16) |
698
66
                ((uint64_t)up[2]<< 8) |
699
66
                 (uint64_t)up[3]) & ((1LL<<(4+3*8))-1);
700
138
    } else if (up[0] < 0xf8) {
701
120
        (*cp)+=5;
702
120
        return (((uint64_t)up[0]<<32) |
703
120
                ((uint64_t)up[1]<<24) |
704
120
                ((uint64_t)up[2]<<16) |
705
120
                ((uint64_t)up[3]<< 8) |
706
120
                 (uint64_t)up[4]) & ((1LL<<(3+4*8))-1);
707
120
    } else if (up[0] < 0xfc) {
708
18
        (*cp)+=6;
709
18
        return (((uint64_t)up[0]<<40) |
710
18
                ((uint64_t)up[1]<<32) |
711
18
                ((uint64_t)up[2]<<24) |
712
18
                ((uint64_t)up[3]<<16) |
713
18
                ((uint64_t)up[4]<< 8) |
714
18
                 (uint64_t)up[5]) & ((1LL<<(2+5*8))-1);
715
18
    } else if (up[0] < 0xfe) {
716
0
        (*cp)+=7;
717
0
        return (((uint64_t)up[0]<<48) |
718
0
                ((uint64_t)up[1]<<40) |
719
0
                ((uint64_t)up[2]<<32) |
720
0
                ((uint64_t)up[3]<<24) |
721
0
                ((uint64_t)up[4]<<16) |
722
0
                ((uint64_t)up[5]<< 8) |
723
0
                 (uint64_t)up[6]) & ((1LL<<(1+6*8))-1);
724
0
    } else if (up[0] < 0xff) {
725
0
        (*cp)+=8;
726
0
        return (((uint64_t)up[1]<<48) |
727
0
                ((uint64_t)up[2]<<40) |
728
0
                ((uint64_t)up[3]<<32) |
729
0
                ((uint64_t)up[4]<<24) |
730
0
                ((uint64_t)up[5]<<16) |
731
0
                ((uint64_t)up[6]<< 8) |
732
0
                 (uint64_t)up[7]) & ((1LL<<(7*8))-1);
733
0
    } else {
734
0
        (*cp)+=9;
735
0
        return (((uint64_t)up[1]<<56) |
736
0
                ((uint64_t)up[2]<<48) |
737
0
                ((uint64_t)up[3]<<40) |
738
0
                ((uint64_t)up[4]<<32) |
739
0
                ((uint64_t)up[5]<<24) |
740
0
                ((uint64_t)up[6]<<16) |
741
0
                ((uint64_t)up[7]<< 8) |
742
0
                 (uint64_t)up[8]);
743
0
    }
744
282
}
745
746
// Wrapper for now
747
6.05M
static int safe_itf8_put(char *cp, char *cp_end, int32_t val) {
748
6.05M
    return itf8_put(cp, val);
749
6.05M
}
750
751
129k
static int safe_ltf8_put(char *cp, char *cp_end, int64_t val) {
752
129k
    return ltf8_put(cp, val);
753
129k
}
754
755
1.40M
static int itf8_size(int64_t v) {
756
1.40M
    return ((!((v)&~0x7f))?1:(!((v)&~0x3fff))?2:(!((v)&~0x1fffff))?3:(!((v)&~0xfffffff))?4:5);
757
1.40M
}
758
759
//-----------------------------------------------------------------------------
760
761
// CRAM v4.0 onwards uses a different variable sized integer encoding
762
// that is size agnostic.
763
764
// Local interface to varint.h inline version, so we can use in func ptr.
765
// Note a lot of these use the unsigned interface but take signed int64_t.
766
// This is because the old CRAM ITF8 inteface had signed -1 as unsigned
767
// 0xffffffff.
768
240
static int uint7_size(int64_t v) {
769
240
    return var_size_u64(v);
770
240
}
771
772
0
static int64_t uint7_get_32(char **cp, const char *endp, int *err) {
773
0
    uint32_t val;
774
0
    int nb = var_get_u32((uint8_t *)(*cp), (const uint8_t *)endp, &val);
775
0
    (*cp) += nb;
776
0
    if (!nb && err) *err = 1;
777
0
    return val;
778
0
}
779
780
0
static int64_t sint7_get_32(char **cp, const char *endp, int *err) {
781
0
    int32_t val;
782
0
    int nb = var_get_s32((uint8_t *)(*cp), (const uint8_t *)endp, &val);
783
0
    (*cp) += nb;
784
0
    if (!nb && err) *err = 1;
785
0
    return val;
786
0
}
787
788
0
static int64_t uint7_get_64(char **cp, const char *endp, int *err) {
789
0
    uint64_t val;
790
0
    int nb = var_get_u64((uint8_t *)(*cp), (const uint8_t *)endp, &val);
791
0
    (*cp) += nb;
792
0
    if (!nb && err) *err = 1;
793
0
    return val;
794
0
}
795
796
0
static int64_t sint7_get_64(char **cp, const char *endp, int *err) {
797
0
    int64_t val;
798
0
    int nb = var_get_s64((uint8_t *)(*cp), (const uint8_t *)endp, &val);
799
0
    (*cp) += nb;
800
0
    if (!nb && err) *err = 1;
801
0
    return val;
802
0
}
803
804
0
static int uint7_put_32(char *cp, char *endp, int32_t val) {
805
0
    return var_put_u32((uint8_t *)cp, (uint8_t *)endp, val);
806
0
}
807
808
0
static int sint7_put_32(char *cp, char *endp, int32_t val) {
809
0
    return var_put_s32((uint8_t *)cp, (uint8_t *)endp, val);
810
0
}
811
812
0
static int uint7_put_64(char *cp, char *endp, int64_t val) {
813
0
    return var_put_u64((uint8_t *)cp, (uint8_t *)endp, val);
814
0
}
815
816
0
static int sint7_put_64(char *cp, char *endp, int64_t val) {
817
0
    return var_put_s64((uint8_t *)cp, (uint8_t *)endp, val);
818
0
}
819
820
// Put direct to to cram_block
821
0
static int uint7_put_blk_32(cram_block *blk, int32_t v) {
822
0
    uint8_t buf[10];
823
0
    int sz = var_put_u32(buf, buf+10, v);
824
0
    BLOCK_APPEND(blk, buf, sz);
825
0
    return sz;
826
827
0
 block_err:
828
0
    return -1;
829
0
}
830
831
0
static int sint7_put_blk_32(cram_block *blk, int32_t v) {
832
0
    uint8_t buf[10];
833
0
    int sz = var_put_s32(buf, buf+10, v);
834
0
    BLOCK_APPEND(blk, buf, sz);
835
0
    return sz;
836
837
0
 block_err:
838
0
    return -1;
839
0
}
840
841
0
static int uint7_put_blk_64(cram_block *blk, int64_t v) {
842
0
    uint8_t buf[10];
843
0
    int sz = var_put_u64(buf, buf+10, v);
844
0
    BLOCK_APPEND(blk, buf, sz);
845
0
    return sz;
846
847
0
 block_err:
848
0
    return -1;
849
0
}
850
851
0
static int sint7_put_blk_64(cram_block *blk, int64_t v) {
852
0
    uint8_t buf[10];
853
0
    int sz = var_put_s64(buf, buf+10, v);
854
0
    BLOCK_APPEND(blk, buf, sz);
855
0
    return sz;
856
857
0
 block_err:
858
0
    return -1;
859
0
}
860
861
// Decode 32-bits with CRC update from cram_fd
862
7.76k
static int uint7_decode_crc32(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
863
7.76k
    uint8_t b[5], i = 0;
864
7.76k
    int c;
865
7.76k
    uint32_t v = 0;
866
867
#ifdef VARINT2
868
    b[0] = hgetc(fd->fp);
869
    if (b[0] < 177) {
870
    } else if (b[0] < 241) {
871
        b[1] = hgetc(fd->fp);
872
    } else if (b[0] < 249) {
873
        b[1] = hgetc(fd->fp);
874
        b[2] = hgetc(fd->fp);
875
    } else {
876
        int n = b[0]+2, z = 1;
877
        while (n-- >= 249)
878
            b[z++] = hgetc(fd->fp);
879
    }
880
    i = var_get_u32(b, NULL, &v);
881
#else
882
//    // Little endian
883
//    int s = 0;
884
//    do {
885
//        b[i++] = c = hgetc(fd->fp);
886
//        if (c < 0)
887
//            return -1;
888
//        v |= (c & 0x7f) << s;
889
//      s += 7;
890
//    } while (i < 5 && (c & 0x80));
891
892
    // Big endian, see also htscodecs/varint.h
893
7.91k
    do {
894
7.91k
        b[i++] = c = hgetc(fd->fp);
895
7.91k
        if (c < 0)
896
0
            return -1;
897
7.91k
        v = (v<<7) | (c & 0x7f);
898
7.91k
    } while (i < 5 && (c & 0x80));
899
7.76k
#endif
900
7.76k
    *crc = crc32(*crc, b, i);
901
902
7.76k
    *val_p = v;
903
7.76k
    return i;
904
7.76k
}
905
906
// Decode 32-bits with CRC update from cram_fd
907
1.55k
static int sint7_decode_crc32(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
908
1.55k
    uint8_t b[5], i = 0;
909
1.55k
    int c;
910
1.55k
    uint32_t v = 0;
911
912
#ifdef VARINT2
913
    b[0] = hgetc(fd->fp);
914
    if (b[0] < 177) {
915
    } else if (b[0] < 241) {
916
        b[1] = hgetc(fd->fp);
917
    } else if (b[0] < 249) {
918
        b[1] = hgetc(fd->fp);
919
        b[2] = hgetc(fd->fp);
920
    } else {
921
        int n = b[0]+2, z = 1;
922
        while (n-- >= 249)
923
            b[z++] = hgetc(fd->fp);
924
    }
925
    i = var_get_u32(b, NULL, &v);
926
#else
927
//    // Little endian
928
//    int s = 0;
929
//    do {
930
//        b[i++] = c = hgetc(fd->fp);
931
//        if (c < 0)
932
//            return -1;
933
//        v |= (c & 0x7f) << s;
934
//      s += 7;
935
//    } while (i < 5 && (c & 0x80));
936
937
    // Big endian, see also htscodecs/varint.h
938
1.56k
    do {
939
1.56k
        b[i++] = c = hgetc(fd->fp);
940
1.56k
        if (c < 0)
941
0
            return -1;
942
1.56k
        v = (v<<7) | (c & 0x7f);
943
1.56k
    } while (i < 5 && (c & 0x80));
944
1.55k
#endif
945
1.55k
    *crc = crc32(*crc, b, i);
946
947
1.55k
    *val_p = (v>>1) ^ -(v&1);
948
1.55k
    return i;
949
1.55k
}
950
951
952
// Decode 64-bits with CRC update from cram_fd
953
6.21k
static int uint7_decode_crc64(cram_fd *fd, int64_t *val_p, uint32_t *crc) {
954
6.21k
    uint8_t b[10], i = 0;
955
6.21k
    int c;
956
6.21k
    uint64_t v = 0;
957
958
#ifdef VARINT2
959
    b[0] = hgetc(fd->fp);
960
    if (b[0] < 177) {
961
    } else if (b[0] < 241) {
962
        b[1] = hgetc(fd->fp);
963
    } else if (b[0] < 249) {
964
        b[1] = hgetc(fd->fp);
965
        b[2] = hgetc(fd->fp);
966
    } else {
967
        int n = b[0]+2, z = 1;
968
        while (n-- >= 249)
969
            b[z++] = hgetc(fd->fp);
970
    }
971
    i = var_get_u64(b, NULL, &v);
972
#else
973
//    // Little endian
974
//    int s = 0;
975
//    do {
976
//        b[i++] = c = hgetc(fd->fp);
977
//        if (c < 0)
978
//            return -1;
979
//        v |= (c & 0x7f) << s;
980
//      s += 7;
981
//    } while (i < 10 && (c & 0x80));
982
983
    // Big endian, see also htscodecs/varint.h
984
6.33k
    do {
985
6.33k
        b[i++] = c = hgetc(fd->fp);
986
6.33k
        if (c < 0)
987
0
            return -1;
988
6.33k
        v = (v<<7) | (c & 0x7f);
989
6.33k
    } while (i < 5 && (c & 0x80));
990
6.21k
#endif
991
6.21k
    *crc = crc32(*crc, b, i);
992
993
6.21k
    *val_p = v;
994
6.21k
    return i;
995
6.21k
}
996
997
//-----------------------------------------------------------------------------
998
999
/*
1000
 * Decodes a 32-bit little endian value from fd and stores in val.
1001
 *
1002
 * Returns the number of bytes read on success
1003
 *         -1 on failure
1004
 */
1005
3.86k
static int int32_decode(cram_fd *fd, int32_t *val) {
1006
3.86k
    int32_t i;
1007
3.86k
    if (4 != hread(fd->fp, &i, 4))
1008
3
        return -1;
1009
1010
3.86k
    *val = le_int4(i);
1011
3.86k
    return 4;
1012
3.86k
}
1013
1014
/*
1015
 * Encodes a 32-bit little endian value 'val' and writes to fd.
1016
 *
1017
 * Returns the number of bytes written on success
1018
 *         -1 on failure
1019
 */
1020
362k
static int int32_encode(cram_fd *fd, int32_t val) {
1021
362k
    uint32_t v = le_int4(val);
1022
362k
    if (4 != hwrite(fd->fp, &v, 4))
1023
0
        return -1;
1024
1025
362k
    return 4;
1026
362k
}
1027
1028
/* As int32_decoded/encode, but from/to blocks instead of cram_fd */
1029
30
int int32_get_blk(cram_block *b, int32_t *val) {
1030
30
    if (b->uncomp_size - BLOCK_SIZE(b) < 4)
1031
0
        return -1;
1032
1033
30
    uint32_t v =
1034
30
         ((uint32_t) b->data[b->byte  ])        |
1035
30
        (((uint32_t) b->data[b->byte+1]) <<  8) |
1036
30
        (((uint32_t) b->data[b->byte+2]) << 16) |
1037
30
        (((uint32_t) b->data[b->byte+3]) << 24);
1038
    // Avoid implementation-defined behaviour on negative values
1039
30
    *val = v < 0x80000000U ? (int32_t) v : -((int32_t) (0xffffffffU - v)) - 1;
1040
30
    BLOCK_SIZE(b) += 4;
1041
30
    return 4;
1042
30
}
1043
1044
/* As int32_decoded/encode, but from/to blocks instead of cram_fd */
1045
1.76k
int int32_put_blk(cram_block *b, int32_t val) {
1046
1.76k
    unsigned char cp[4];
1047
1.76k
    uint32_t v = val;
1048
1.76k
    cp[0] = ( v      & 0xff);
1049
1.76k
    cp[1] = ((v>>8)  & 0xff);
1050
1.76k
    cp[2] = ((v>>16) & 0xff);
1051
1.76k
    cp[3] = ((v>>24) & 0xff);
1052
1053
1.76k
    BLOCK_APPEND(b, cp, 4);
1054
1.76k
    return 0;
1055
1056
0
 block_err:
1057
0
    return -1;
1058
1.76k
}
1059
1060
#ifdef HAVE_LIBDEFLATE
1061
/* ----------------------------------------------------------------------
1062
 * libdeflate compression code, with interface to match
1063
 * zlib_mem_{in,de}flate for simplicity elsewhere.
1064
 */
1065
1066
// Named the same as the version that uses zlib as we always use libdeflate for
1067
// decompression when available.
1068
char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) {
1069
    struct libdeflate_decompressor *z = libdeflate_alloc_decompressor();
1070
    if (!z) {
1071
        hts_log_error("Call to libdeflate_alloc_decompressor failed");
1072
        return NULL;
1073
    }
1074
1075
    uint8_t *data = NULL, *new_data;
1076
    if (!*size)
1077
        *size = csize*2;
1078
    for(;;) {
1079
        new_data = realloc(data, *size);
1080
        if (!new_data) {
1081
            hts_log_error("Memory allocation failure");
1082
            goto fail;
1083
        }
1084
        data = new_data;
1085
1086
        int ret = libdeflate_gzip_decompress(z, cdata, csize, data, *size, size);
1087
1088
        // Auto grow output buffer size if needed and try again.
1089
        // Fortunately for all bar one call of this we know the size already.
1090
        if (ret == LIBDEFLATE_INSUFFICIENT_SPACE) {
1091
            (*size) *= 1.5;
1092
            continue;
1093
        }
1094
1095
        if (ret != LIBDEFLATE_SUCCESS) {
1096
            hts_log_error("Inflate operation failed: %d", ret);
1097
            goto fail;
1098
        } else {
1099
            break;
1100
        }
1101
    }
1102
1103
    libdeflate_free_decompressor(z);
1104
    return (char *)data;
1105
1106
 fail:
1107
    libdeflate_free_decompressor(z);
1108
    free(data);
1109
    return NULL;
1110
}
1111
1112
// Named differently as we use both zlib/libdeflate for compression.
1113
static char *libdeflate_deflate(char *data, size_t size, size_t *cdata_size,
1114
                                int level, int strat) {
1115
    level = level > 0 ? level : 6; // libdeflate doesn't honour -1 as default
1116
    level *= 1.23;     // NB levels go up to 12 here; 5 onwards is +1
1117
    level += level>=8; // 5,6,7->6,7,8  8->10  9->12
1118
    if (level > 12) level = 12;
1119
1120
    if (strat == Z_RLE) // not supported by libdeflate
1121
        level = 1;
1122
1123
    struct libdeflate_compressor *z = libdeflate_alloc_compressor(level);
1124
    if (!z) {
1125
        hts_log_error("Call to libdeflate_alloc_compressor failed");
1126
        return NULL;
1127
    }
1128
1129
    unsigned char *cdata = NULL; /* Compressed output */
1130
    size_t cdata_alloc;
1131
    cdata = malloc(cdata_alloc = size*1.05+100);
1132
    if (!cdata) {
1133
        hts_log_error("Memory allocation failure");
1134
        libdeflate_free_compressor(z);
1135
        return NULL;
1136
    }
1137
1138
    *cdata_size = libdeflate_gzip_compress(z, data, size, cdata, cdata_alloc);
1139
    libdeflate_free_compressor(z);
1140
1141
    if (*cdata_size == 0) {
1142
        hts_log_error("Call to libdeflate_gzip_compress failed");
1143
        free(cdata);
1144
        return NULL;
1145
    }
1146
1147
    return (char *)cdata;
1148
}
1149
1150
#else
1151
1152
/* ----------------------------------------------------------------------
1153
 * zlib compression code - from Gap5's tg_iface_g.c
1154
 * They're static here as they're only used within the cram_compress_block
1155
 * and cram_uncompress_block functions, which are the external interface.
1156
 */
1157
0
char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) {
1158
0
    z_stream s;
1159
0
    unsigned char *data = NULL; /* Uncompressed output */
1160
0
    int data_alloc = 0;
1161
0
    int err;
1162
1163
    /* Starting point at uncompressed size, and scale after that */
1164
0
    data = malloc(data_alloc = csize*1.2+100);
1165
0
    if (!data)
1166
0
        return NULL;
1167
1168
    /* Initialise zlib stream */
1169
0
    s.zalloc = Z_NULL; /* use default allocation functions */
1170
0
    s.zfree  = Z_NULL;
1171
0
    s.opaque = Z_NULL;
1172
0
    s.next_in  = (unsigned char *)cdata;
1173
0
    s.avail_in = csize;
1174
0
    s.total_in = 0;
1175
0
    s.next_out  = data;
1176
0
    s.avail_out = data_alloc;
1177
0
    s.total_out = 0;
1178
1179
    //err = inflateInit(&s);
1180
0
    err = inflateInit2(&s, 15 + 32);
1181
0
    if (err != Z_OK) {
1182
0
        hts_log_error("Call to zlib inflateInit failed: %s", s.msg);
1183
0
        free(data);
1184
0
        return NULL;
1185
0
    }
1186
1187
    /* Decode to 'data' array */
1188
0
    for (;s.avail_in;) {
1189
0
        unsigned char *data_tmp;
1190
0
        int alloc_inc;
1191
1192
0
        s.next_out = &data[s.total_out];
1193
0
        err = inflate(&s, Z_NO_FLUSH);
1194
0
        if (err == Z_STREAM_END)
1195
0
            break;
1196
1197
0
        if (err != Z_OK) {
1198
0
            hts_log_error("Call to zlib inflate failed: %s", s.msg);
1199
0
            free(data);
1200
0
            inflateEnd(&s);
1201
0
            return NULL;
1202
0
        }
1203
1204
        /* More to come, so realloc based on growth so far */
1205
0
        alloc_inc = (double)s.avail_in/s.total_in * s.total_out + 100;
1206
0
        data = realloc((data_tmp = data), data_alloc += alloc_inc);
1207
0
        if (!data) {
1208
0
            free(data_tmp);
1209
0
            inflateEnd(&s);
1210
0
            return NULL;
1211
0
        }
1212
0
        s.avail_out += alloc_inc;
1213
0
    }
1214
0
    inflateEnd(&s);
1215
1216
0
    *size = s.total_out;
1217
0
    return (char *)data;
1218
0
}
1219
#endif
1220
1221
#if !defined(HAVE_LIBDEFLATE) || LIBDEFLATE_VERSION_MAJOR < 1 || (LIBDEFLATE_VERSION_MAJOR ==  1 && LIBDEFLATE_VERSION_MINOR <= 8)
1222
static char *zlib_mem_deflate(char *data, size_t size, size_t *cdata_size,
1223
201k
                              int level, int strat) {
1224
201k
    z_stream s;
1225
201k
    unsigned char *cdata = NULL; /* Compressed output */
1226
201k
    int cdata_alloc = 0;
1227
201k
    int cdata_pos = 0;
1228
201k
    int err;
1229
1230
201k
    cdata = malloc(cdata_alloc = size*1.05+100);
1231
201k
    if (!cdata)
1232
0
        return NULL;
1233
201k
    cdata_pos = 0;
1234
1235
    /* Initialise zlib stream */
1236
201k
    s.zalloc = Z_NULL; /* use default allocation functions */
1237
201k
    s.zfree  = Z_NULL;
1238
201k
    s.opaque = Z_NULL;
1239
201k
    s.next_in  = (unsigned char *)data;
1240
201k
    s.avail_in = size;
1241
201k
    s.total_in = 0;
1242
201k
    s.next_out  = cdata;
1243
201k
    s.avail_out = cdata_alloc;
1244
201k
    s.total_out = 0;
1245
201k
    s.data_type = Z_BINARY;
1246
1247
201k
    err = deflateInit2(&s, level, Z_DEFLATED, 15|16, 9, strat);
1248
201k
    if (err != Z_OK) {
1249
0
        hts_log_error("Call to zlib deflateInit2 failed: %s", s.msg);
1250
0
        return NULL;
1251
0
    }
1252
1253
    /* Encode to 'cdata' array */
1254
403k
    for (;s.avail_in;) {
1255
201k
        s.next_out = &cdata[cdata_pos];
1256
201k
        s.avail_out = cdata_alloc - cdata_pos;
1257
201k
        if (cdata_alloc - cdata_pos <= 0) {
1258
0
            hts_log_error("Deflate produced larger output than expected");
1259
0
            return NULL;
1260
0
        }
1261
201k
        err = deflate(&s, Z_NO_FLUSH);
1262
201k
        cdata_pos = cdata_alloc - s.avail_out;
1263
201k
        if (err != Z_OK) {
1264
0
            hts_log_error("Call to zlib deflate failed: %s", s.msg);
1265
0
            break;
1266
0
        }
1267
201k
    }
1268
201k
    if (deflate(&s, Z_FINISH) != Z_STREAM_END) {
1269
0
        hts_log_error("Call to zlib deflate failed: %s", s.msg);
1270
0
    }
1271
201k
    *cdata_size = s.total_out;
1272
1273
201k
    if (deflateEnd(&s) != Z_OK) {
1274
0
        hts_log_error("Call to zlib deflate failed: %s", s.msg);
1275
0
    }
1276
201k
    return (char *)cdata;
1277
201k
}
1278
#endif
1279
1280
#ifdef HAVE_LIBLZMA
1281
/* ------------------------------------------------------------------------ */
1282
/*
1283
 * Data compression routines using liblzma (xz)
1284
 *
1285
 * On a test set this shrunk the main db from 136157104 bytes to 114796168, but
1286
 * caused tg_index to grow from 2m43.707s to 15m3.961s. Exporting as bfastq
1287
 * went from 18.3s to 36.3s. So decompression suffers too, but not as bad
1288
 * as compression times.
1289
 *
1290
 * For now we disable this functionality. If it's to be reenabled make sure you
1291
 * improve the mem_inflate implementation as it's just a test hack at the
1292
 * moment.
1293
 */
1294
1295
static char *lzma_mem_deflate(char *data, size_t size, size_t *cdata_size,
1296
0
                              int level) {
1297
0
    char *out;
1298
0
    size_t out_size = lzma_stream_buffer_bound(size);
1299
0
    *cdata_size = 0;
1300
1301
0
    out = malloc(out_size);
1302
1303
    /* Single call compression */
1304
0
    if (LZMA_OK != lzma_easy_buffer_encode(level, LZMA_CHECK_CRC32, NULL,
1305
0
                                           (uint8_t *)data, size,
1306
0
                                           (uint8_t *)out, cdata_size,
1307
0
                                           out_size))
1308
0
        return NULL;
1309
1310
0
    return out;
1311
0
}
1312
1313
0
static char *lzma_mem_inflate(char *cdata, size_t csize, size_t *size) {
1314
0
    lzma_stream strm = LZMA_STREAM_INIT;
1315
0
    size_t out_size = 0, out_pos = 0;
1316
0
    char *out = NULL, *new_out;
1317
0
    int r;
1318
1319
    /* Initiate the decoder */
1320
0
    if (LZMA_OK != lzma_stream_decoder(&strm, lzma_easy_decoder_memusage(9), 0))
1321
0
        return NULL;
1322
1323
    /* Decode loop */
1324
0
    strm.avail_in = csize;
1325
0
    strm.next_in = (uint8_t *)cdata;
1326
1327
0
    for (;strm.avail_in;) {
1328
0
        if (strm.avail_in > out_size - out_pos) {
1329
0
            out_size += strm.avail_in * 4 + 32768;
1330
0
            new_out = realloc(out, out_size);
1331
0
            if (!new_out)
1332
0
                goto fail;
1333
0
            out = new_out;
1334
0
        }
1335
0
        strm.avail_out = out_size - out_pos;
1336
0
        strm.next_out = (uint8_t *)&out[out_pos];
1337
1338
0
        r = lzma_code(&strm, LZMA_RUN);
1339
0
        if (LZMA_OK != r && LZMA_STREAM_END != r) {
1340
0
            hts_log_error("LZMA decode failure (error %d)", r);
1341
0
            goto fail;
1342
0
        }
1343
1344
0
        out_pos = strm.total_out;
1345
1346
0
        if (r == LZMA_STREAM_END)
1347
0
            break;
1348
0
    }
1349
1350
    /* finish up any unflushed data; necessary? */
1351
0
    r = lzma_code(&strm, LZMA_FINISH);
1352
0
    if (r != LZMA_OK && r != LZMA_STREAM_END) {
1353
0
        hts_log_error("Call to lzma_code failed with error %d", r);
1354
0
        goto fail;
1355
0
    }
1356
1357
0
    new_out = realloc(out, strm.total_out > 0 ? strm.total_out : 1);
1358
0
    if (new_out)
1359
0
        out = new_out;
1360
0
    *size = strm.total_out;
1361
1362
0
    lzma_end(&strm);
1363
1364
0
    return out;
1365
1366
0
 fail:
1367
0
    lzma_end(&strm);
1368
0
    free(out);
1369
0
    return NULL;
1370
0
}
1371
#endif
1372
1373
/* ----------------------------------------------------------------------
1374
 * CRAM blocks - the dynamically growable data block. We have code to
1375
 * create, update, (un)compress and read/write.
1376
 *
1377
 * These are derived from the deflate_interlaced.c blocks, but with the
1378
 * CRAM extension of content types and IDs.
1379
 */
1380
1381
/*
1382
 * Allocates a new cram_block structure with a specified content_type and
1383
 * id.
1384
 *
1385
 * Returns block pointer on success
1386
 *         NULL on failure
1387
 */
1388
cram_block *cram_new_block(enum cram_content_type content_type,
1389
1.04M
                           int content_id) {
1390
1.04M
    cram_block *b = malloc(sizeof(*b));
1391
1.04M
    if (!b)
1392
0
        return NULL;
1393
1.04M
    b->method = b->orig_method = RAW;
1394
1.04M
    b->content_type = content_type;
1395
1.04M
    b->content_id = content_id;
1396
1.04M
    b->comp_size = 0;
1397
1.04M
    b->uncomp_size = 0;
1398
1.04M
    b->data = NULL;
1399
1.04M
    b->alloc = 0;
1400
1.04M
    b->byte = 0;
1401
1.04M
    b->bit = 7; // MSB
1402
1.04M
    b->crc32 = 0;
1403
1.04M
    b->idx = 0;
1404
1.04M
    b->m = NULL;
1405
1406
1.04M
    return b;
1407
1.04M
}
1408
1409
/*
1410
 * Reads a block from a cram file.
1411
 * Returns cram_block pointer on success.
1412
 *         NULL on failure
1413
 */
1414
919
cram_block *cram_read_block(cram_fd *fd) {
1415
919
    cram_block *b = malloc(sizeof(*b));
1416
919
    unsigned char c;
1417
919
    uint32_t crc = 0;
1418
919
    if (!b)
1419
0
        return NULL;
1420
1421
    //fprintf(stderr, "Block at %d\n", (int)ftell(fd->fp));
1422
1423
919
    if (-1 == (b->method      = hgetc(fd->fp))) { free(b); return NULL; }
1424
919
    if (b->method > TOK3) {
1425
11
        hts_log_error("Unknown block compression method %d", (int) b->method);
1426
11
        free(b); return NULL;
1427
11
    }
1428
908
    c = b->method; crc = crc32(crc, &c, 1);
1429
908
    if (-1 == (b->content_type= hgetc(fd->fp))) { free(b); return NULL; }
1430
908
    c = b->content_type; crc = crc32(crc, &c, 1);
1431
908
    if (-1 == fd->vv.varint_decode32_crc(fd, &b->content_id, &crc))  { free(b); return NULL; }
1432
908
    if (-1 == fd->vv.varint_decode32_crc(fd, &b->comp_size, &crc))   { free(b); return NULL; }
1433
908
    if (-1 == fd->vv.varint_decode32_crc(fd, &b->uncomp_size, &crc)) { free(b); return NULL; }
1434
1435
    //fprintf(stderr, "  method %d, ctype %d, cid %d, csize %d, ucsize %d\n",
1436
    //      b->method, b->content_type, b->content_id, b->comp_size, b->uncomp_size);
1437
1438
908
    if (b->method == RAW) {
1439
551
        if (b->uncomp_size < 0 || b->comp_size != b->uncomp_size) {
1440
3
            free(b);
1441
3
            return NULL;
1442
3
        }
1443
548
        b->alloc = b->uncomp_size;
1444
548
        if (!(b->data = malloc(b->uncomp_size))){ free(b); return NULL; }
1445
548
        if (b->uncomp_size != hread(fd->fp, b->data, b->uncomp_size)) {
1446
2
            free(b->data);
1447
2
            free(b);
1448
2
            return NULL;
1449
2
        }
1450
548
    } else {
1451
357
        if (b->comp_size < 0 || b->uncomp_size < 0) {
1452
0
            free(b);
1453
0
            return NULL;
1454
0
        }
1455
357
        b->alloc = b->comp_size;
1456
357
        if (!(b->data = malloc(b->comp_size)))  { free(b); return NULL; }
1457
357
        if (b->comp_size != hread(fd->fp, b->data, b->comp_size)) {
1458
0
            free(b->data);
1459
0
            free(b);
1460
0
            return NULL;
1461
0
        }
1462
357
    }
1463
1464
903
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
1465
122
        if (-1 == int32_decode(fd, (int32_t *)&b->crc32)) {
1466
0
            free(b->data);
1467
0
            free(b);
1468
0
            return NULL;
1469
0
        }
1470
1471
122
        b->crc32_checked = fd->ignore_md5;
1472
122
        b->crc_part = crc;
1473
781
    } else {
1474
781
        b->crc32_checked = 1; // CRC not present
1475
781
    }
1476
1477
903
    b->orig_method = b->method;
1478
903
    b->idx = 0;
1479
903
    b->byte = 0;
1480
903
    b->bit = 7; // MSB
1481
1482
903
    return b;
1483
903
}
1484
1485
1486
/*
1487
 * Computes the size of a cram block, including the block
1488
 * header itself.
1489
 */
1490
0
uint32_t cram_block_size(cram_block *b) {
1491
0
    unsigned char dat[100], *cp = dat;;
1492
0
    uint32_t sz;
1493
1494
0
    *cp++ = b->method;
1495
0
    *cp++ = b->content_type;
1496
0
    cp += itf8_put((char*)cp, b->content_id);
1497
0
    cp += itf8_put((char*)cp, b->comp_size);
1498
0
    cp += itf8_put((char*)cp, b->uncomp_size);
1499
1500
0
    sz = cp-dat + 4;
1501
0
    sz += b->method == RAW ? b->uncomp_size : b->comp_size;
1502
1503
0
    return sz;
1504
0
}
1505
1506
/*
1507
 * Writes a CRAM block.
1508
 * Returns 0 on success
1509
 *        -1 on failure
1510
 */
1511
362k
int cram_write_block(cram_fd *fd, cram_block *b) {
1512
362k
    char vardata[100];
1513
362k
    int vardata_o = 0;
1514
1515
362k
    assert(b->method != RAW || (b->comp_size == b->uncomp_size));
1516
1517
362k
    if (hputc(b->method,       fd->fp)  == EOF) return -1;
1518
362k
    if (hputc(b->content_type, fd->fp)  == EOF) return -1;
1519
362k
    vardata_o += fd->vv.varint_put32(vardata          , vardata+100, b->content_id);
1520
362k
    vardata_o += fd->vv.varint_put32(vardata+vardata_o, vardata+100, b->comp_size);
1521
362k
    vardata_o += fd->vv.varint_put32(vardata+vardata_o, vardata+100, b->uncomp_size);
1522
362k
    if (vardata_o != hwrite(fd->fp, vardata, vardata_o))
1523
0
        return -1;
1524
1525
362k
    if (b->data) {
1526
323k
        if (b->method == RAW) {
1527
281k
            if (b->uncomp_size != hwrite(fd->fp, b->data, b->uncomp_size))
1528
0
                return -1;
1529
281k
        } else {
1530
41.8k
            if (b->comp_size != hwrite(fd->fp, b->data, b->comp_size))
1531
0
                return -1;
1532
41.8k
        }
1533
323k
    } else {
1534
        // Absent blocks should be size 0
1535
39.6k
        assert(b->method == RAW && b->uncomp_size == 0);
1536
39.6k
    }
1537
1538
362k
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
1539
362k
        char dat[100], *cp = (char *)dat;
1540
362k
        uint32_t crc;
1541
1542
362k
        *cp++ = b->method;
1543
362k
        *cp++ = b->content_type;
1544
362k
        cp += fd->vv.varint_put32(cp, dat+100, b->content_id);
1545
362k
        cp += fd->vv.varint_put32(cp, dat+100, b->comp_size);
1546
362k
        cp += fd->vv.varint_put32(cp, dat+100, b->uncomp_size);
1547
362k
        crc = crc32(0L, (uc *)dat, cp-dat);
1548
1549
362k
        if (b->method == RAW) {
1550
321k
            b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->uncomp_size);
1551
321k
        } else {
1552
41.8k
            b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->comp_size);
1553
41.8k
        }
1554
1555
362k
        if (-1 == int32_encode(fd, b->crc32))
1556
0
            return -1;
1557
362k
    }
1558
1559
362k
    return 0;
1560
362k
}
1561
1562
/*
1563
 * Frees a CRAM block, deallocating internal data too.
1564
 */
1565
1.41M
void cram_free_block(cram_block *b) {
1566
1.41M
    if (!b)
1567
371k
        return;
1568
1.04M
    if (b->data)
1569
691k
        free(b->data);
1570
1.04M
    free(b);
1571
1.04M
}
1572
1573
/*
1574
 * Uncompresses a CRAM block, if compressed.
1575
 */
1576
371
int cram_uncompress_block(cram_block *b) {
1577
371
    char *uncomp;
1578
371
    size_t uncomp_size = 0;
1579
1580
371
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
1581
    // Pretend the CRC was OK so the fuzzer doesn't have to get it right
1582
371
    b->crc32_checked = 1;
1583
371
#endif
1584
1585
371
    if (b->crc32_checked == 0) {
1586
0
        uint32_t crc = crc32(b->crc_part, b->data ? b->data : (uc *)"", b->alloc);
1587
0
        b->crc32_checked = 1;
1588
0
        if (crc != b->crc32) {
1589
0
            hts_log_error("Block CRC32 failure");
1590
0
            return -1;
1591
0
        }
1592
0
    }
1593
1594
371
    if (b->uncomp_size == 0) {
1595
        // blank block
1596
0
        b->method = RAW;
1597
0
        return 0;
1598
0
    }
1599
371
    assert(b->uncomp_size >= 0); // cram_read_block should ensure this
1600
1601
371
    switch (b->method) {
1602
14
    case RAW:
1603
14
        return 0;
1604
1605
0
    case GZIP:
1606
0
        uncomp_size = b->uncomp_size;
1607
0
        uncomp = zlib_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
1608
1609
0
        if (!uncomp)
1610
0
            return -1;
1611
0
        if (uncomp_size != b->uncomp_size) {
1612
0
            free(uncomp);
1613
0
            return -1;
1614
0
        }
1615
0
        free(b->data);
1616
0
        b->data = (unsigned char *)uncomp;
1617
0
        b->alloc = uncomp_size;
1618
0
        b->method = RAW;
1619
0
        break;
1620
1621
0
#ifdef HAVE_LIBBZ2
1622
0
    case BZIP2: {
1623
0
        unsigned int usize = b->uncomp_size;
1624
0
        if (!(uncomp = malloc(usize)))
1625
0
            return -1;
1626
0
        if (BZ_OK != BZ2_bzBuffToBuffDecompress(uncomp, &usize,
1627
0
                                                (char *)b->data, b->comp_size,
1628
0
                                                0, 0)) {
1629
0
            free(uncomp);
1630
0
            return -1;
1631
0
        }
1632
0
        free(b->data);
1633
0
        b->data = (unsigned char *)uncomp;
1634
0
        b->alloc = usize;
1635
0
        b->method = RAW;
1636
0
        b->uncomp_size = usize; // Just in case it differs
1637
0
        break;
1638
0
    }
1639
#else
1640
    case BZIP2:
1641
        hts_log_error("Bzip2 compression is not compiled into this version. Please rebuild and try again");
1642
        return -1;
1643
#endif
1644
1645
0
#ifdef HAVE_LIBLZMA
1646
0
    case LZMA:
1647
0
        uncomp = lzma_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
1648
0
        if (!uncomp)
1649
0
            return -1;
1650
0
        if (uncomp_size != b->uncomp_size) {
1651
0
            free(uncomp);
1652
0
            return -1;
1653
0
        }
1654
0
        free(b->data);
1655
0
        b->data = (unsigned char *)uncomp;
1656
0
        b->alloc = uncomp_size;
1657
0
        b->method = RAW;
1658
0
        break;
1659
#else
1660
    case LZMA:
1661
        hts_log_error("Lzma compression is not compiled into this version. Please rebuild and try again");
1662
        return -1;
1663
        break;
1664
#endif
1665
1666
2
    case RANS: {
1667
2
        unsigned int usize = b->uncomp_size, usize2;
1668
2
        uncomp = (char *)rans_uncompress(b->data, b->comp_size, &usize2);
1669
2
        if (!uncomp)
1670
2
            return -1;
1671
0
        if (usize != usize2) {
1672
0
            free(uncomp);
1673
0
            return -1;
1674
0
        }
1675
0
        free(b->data);
1676
0
        b->data = (unsigned char *)uncomp;
1677
0
        b->alloc = usize2;
1678
0
        b->method = RAW;
1679
0
        b->uncomp_size = usize2; // Just in case it differs
1680
        //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1681
0
        break;
1682
0
    }
1683
1684
73
    case FQZ: {
1685
73
        uncomp_size = b->uncomp_size;
1686
73
        uncomp = fqz_decompress((char *)b->data, b->comp_size, &uncomp_size, NULL, 0);
1687
73
        if (!uncomp)
1688
55
            return -1;
1689
18
        free(b->data);
1690
18
        b->data = (unsigned char *)uncomp;
1691
18
        b->alloc = uncomp_size;
1692
18
        b->method = RAW;
1693
18
        b->uncomp_size = uncomp_size;
1694
18
        break;
1695
73
    }
1696
1697
4
    case RANS_PR0: {
1698
4
        unsigned int usize = b->uncomp_size, usize2;
1699
4
        uncomp = (char *)rans_uncompress_4x16(b->data, b->comp_size, &usize2);
1700
4
        if (!uncomp)
1701
1
            return -1;
1702
3
        if (usize != usize2) {
1703
3
            free(uncomp);
1704
3
            return -1;
1705
3
        }
1706
0
        b->orig_method = RANSPR;
1707
0
        free(b->data);
1708
0
        b->data = (unsigned char *)uncomp;
1709
0
        b->alloc = usize2;
1710
0
        b->method = RAW;
1711
0
        b->uncomp_size = usize2; // Just incase it differs
1712
        //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1713
0
        break;
1714
3
    }
1715
1716
0
    case ARITH_PR0: {
1717
0
        unsigned int usize = b->uncomp_size, usize2;
1718
0
        uncomp = (char *)arith_uncompress_to(b->data, b->comp_size, NULL, &usize2);
1719
0
        if (!uncomp)
1720
0
            return -1;
1721
0
        if (usize != usize2) {
1722
0
            free(uncomp);
1723
0
            return -1;
1724
0
        }
1725
0
        b->orig_method = ARITH;
1726
0
        free(b->data);
1727
0
        b->data = (unsigned char *)uncomp;
1728
0
        b->alloc = usize2;
1729
0
        b->method = RAW;
1730
0
        b->uncomp_size = usize2; // Just incase it differs
1731
        //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1732
0
        break;
1733
0
    }
1734
1735
278
    case TOK3: {
1736
278
        uint32_t out_len;
1737
278
        uint8_t *cp = tok3_decode_names(b->data, b->comp_size, &out_len);
1738
278
        if (!cp)
1739
278
            return -1;
1740
0
        b->orig_method = TOK3;
1741
0
        b->method = RAW;
1742
0
        free(b->data);
1743
0
        b->data = cp;
1744
0
        b->alloc = out_len;
1745
0
        b->uncomp_size = out_len;
1746
0
        break;
1747
278
    }
1748
1749
0
    default:
1750
0
        return -1;
1751
371
    }
1752
1753
18
    return 0;
1754
371
}
1755
1756
static char *cram_compress_by_method(cram_slice *s, char *in, size_t in_size,
1757
                                     int content_id, size_t *out_size,
1758
                                     enum cram_block_method_int method,
1759
579k
                                     int level, int strat) {
1760
579k
    switch (method) {
1761
49.2k
    case GZIP:
1762
83.6k
    case GZIP_RLE:
1763
201k
    case GZIP_1:
1764
        // Read names bizarrely benefit from zlib over libdeflate for
1765
        // mid-range compression levels.  Focusing purely of ratio or
1766
        // speed, libdeflate still wins.  It also seems to win for
1767
        // other data series too.
1768
        //
1769
        // Eg RN at level 5;  libdeflate=55.9MB  zlib=51.6MB
1770
#ifdef HAVE_LIBDEFLATE
1771
#  if (LIBDEFLATE_VERSION_MAJOR < 1 || (LIBDEFLATE_VERSION_MAJOR == 1 && LIBDEFLATE_VERSION_MINOR <= 8))
1772
        if (content_id == DS_RN && level >= 4 && level <= 7)
1773
            return zlib_mem_deflate(in, in_size, out_size, level, strat);
1774
        else
1775
#  endif
1776
            return libdeflate_deflate(in, in_size, out_size, level, strat);
1777
#else
1778
201k
        return zlib_mem_deflate(in, in_size, out_size, level, strat);
1779
0
#endif
1780
1781
0
    case BZIP2: {
1782
0
#ifdef HAVE_LIBBZ2
1783
0
        unsigned int comp_size = in_size*1.01 + 600;
1784
0
        char *comp = malloc(comp_size);
1785
0
        if (!comp)
1786
0
            return NULL;
1787
1788
0
        if (BZ_OK != BZ2_bzBuffToBuffCompress(comp, &comp_size,
1789
0
                                              in, in_size,
1790
0
                                              level, 0, 30)) {
1791
0
            free(comp);
1792
0
            return NULL;
1793
0
        }
1794
0
        *out_size = comp_size;
1795
0
        return comp;
1796
#else
1797
        return NULL;
1798
#endif
1799
0
    }
1800
1801
0
    case FQZ:
1802
0
    case FQZ_b:
1803
0
    case FQZ_c:
1804
0
    case FQZ_d: {
1805
        // Extract the necessary portion of the slice into an fqz_slice struct.
1806
        // These previously were the same thing, but this permits us to detach
1807
        // the codec from the rest of this CRAM implementation.
1808
0
        fqz_slice *f = malloc(2*s->hdr->num_records * sizeof(uint32_t) + sizeof(fqz_slice));
1809
0
        if (!f)
1810
0
            return NULL;
1811
0
        f->num_records = s->hdr->num_records;
1812
0
        f->len = (uint32_t *)(((char *)f) + sizeof(fqz_slice));
1813
0
        f->flags = f->len + s->hdr->num_records;
1814
0
        int i;
1815
0
        for (i = 0; i < s->hdr->num_records; i++) {
1816
0
            f->flags[i] = s->crecs[i].flags;
1817
0
            f->len[i] = (i+1 < s->hdr->num_records
1818
0
                         ? s->crecs[i+1].qual - s->crecs[i].qual
1819
0
                         : s->block[DS_QS]->uncomp_size - s->crecs[i].qual);
1820
0
        }
1821
0
        char *comp = fqz_compress(strat & 0xff /* cram vers */, f,
1822
0
                                  in, in_size, out_size, strat >> 8, NULL);
1823
0
        free(f);
1824
0
        return comp;
1825
0
    }
1826
1827
0
    case LZMA:
1828
0
#ifdef HAVE_LIBLZMA
1829
0
        return lzma_mem_deflate(in, in_size, out_size, level);
1830
#else
1831
        return NULL;
1832
#endif
1833
1834
0
    case RANS0:
1835
0
    case RANS1: {
1836
0
        unsigned int out_size_i;
1837
0
        unsigned char *cp;
1838
0
        cp = rans_compress((unsigned char *)in, in_size, &out_size_i,
1839
0
                           method == RANS0 ? 0 : 1);
1840
0
        *out_size = out_size_i;
1841
0
        return (char *)cp;
1842
0
    }
1843
1844
162k
    case RANS_PR0:
1845
196k
    case RANS_PR1:
1846
251k
    case RANS_PR64:
1847
285k
    case RANS_PR9:
1848
332k
    case RANS_PR128:
1849
332k
    case RANS_PR129:
1850
332k
    case RANS_PR192:
1851
367k
    case RANS_PR193: {
1852
367k
        unsigned int out_size_i;
1853
367k
        unsigned char *cp;
1854
1855
        // see enum cram_block. We map RANS_* methods to order bit-fields
1856
367k
        static int methmap[] = { 1, 64,9, 128,129, 192,193 };
1857
1858
367k
        int m = method == RANS_PR0 ? 0 : methmap[method - RANS_PR1];
1859
367k
        cp = rans_compress_4x16((unsigned char *)in, in_size, &out_size_i,
1860
367k
                                m | RANS_ORDER_SIMD_AUTO);
1861
367k
        *out_size = out_size_i;
1862
367k
        return (char *)cp;
1863
332k
    }
1864
1865
0
    case ARITH_PR0:
1866
0
    case ARITH_PR1:
1867
0
    case ARITH_PR64:
1868
0
    case ARITH_PR9:
1869
0
    case ARITH_PR128:
1870
0
    case ARITH_PR129:
1871
0
    case ARITH_PR192:
1872
0
    case ARITH_PR193: {
1873
0
        unsigned int out_size_i;
1874
0
        unsigned char *cp;
1875
1876
        // see enum cram_block. We map ARITH_* methods to order bit-fields
1877
0
        static int methmap[] = { 1, 64,9, 128,129, 192,193 };
1878
1879
0
        cp = arith_compress_to((unsigned char *)in, in_size, NULL, &out_size_i,
1880
0
                               method == ARITH_PR0 ? 0 : methmap[method - ARITH_PR1]);
1881
0
        *out_size = out_size_i;
1882
0
        return (char *)cp;
1883
0
    }
1884
1885
10.7k
    case TOK3:
1886
10.7k
    case TOKA: {
1887
10.7k
        int out_len;
1888
10.7k
        int lev = level;
1889
10.7k
        if (method == TOK3 && lev > 3)
1890
10.7k
            lev = 3;
1891
10.7k
        uint8_t *cp = tok3_encode_names(in, in_size, lev, strat, &out_len, NULL);
1892
10.7k
        *out_size = out_len;
1893
10.7k
        return (char *)cp;
1894
10.7k
    }
1895
1896
0
    case RAW:
1897
0
        break;
1898
1899
0
    default:
1900
0
        return NULL;
1901
579k
    }
1902
1903
0
    return NULL;
1904
579k
}
1905
1906
/*
1907
 * A copy of cram_compress_block2 with added recursion detection.
1908
 * This is only called for error handling where the auto-tuning has failed.
1909
 * The simplest way of doing this is recusion + an additional argument, but
1910
 * we didn't want to complicate the existing code hence this is static.
1911
 */
1912
static int cram_compress_block3(cram_fd *fd, cram_slice *s,
1913
                                cram_block *b, cram_metrics *metrics,
1914
                                int method, int level,
1915
635k
                                int recurse) {
1916
1917
635k
    if (!b)
1918
39.9k
        return 0;
1919
1920
595k
    int orig_method = method;
1921
595k
    char *comp = NULL;
1922
595k
    size_t comp_size = 0;
1923
595k
    int strat;
1924
1925
    // Internally we have parameterised methods that externally map
1926
    // to the same CRAM method value.
1927
    // See enum_cram_block_method_int in cram_structs.h.
1928
595k
    int methmap[] = {
1929
        // Externally defined values
1930
595k
        RAW, GZIP, BZIP2, LZMA, RANS, RANSPR, ARITH, FQZ, TOK3,
1931
1932
        // Reserved for possible expansion
1933
595k
        0, 0,
1934
1935
        // Internally parameterised versions matching back to above
1936
        // external values
1937
595k
        GZIP, GZIP,
1938
595k
        FQZ, FQZ, FQZ,
1939
595k
        RANS,
1940
595k
        RANSPR, RANSPR, RANSPR, RANSPR, RANSPR, RANSPR, RANSPR,
1941
595k
        TOK3,
1942
595k
        ARITH,  ARITH,  ARITH,  ARITH,  ARITH,  ARITH,  ARITH,
1943
595k
    };
1944
1945
595k
    if (b->method != RAW) {
1946
        // Maybe already compressed if s->block[0] was compressed and
1947
        // we have e.g. s->block[DS_BA] set to s->block[0] due to only
1948
        // one base type present and hence using E_HUFFMAN on block 0.
1949
        // A second explicit attempt to compress the same block then
1950
        // occurs.
1951
0
        return 0;
1952
0
    }
1953
1954
595k
    if (method == -1) {
1955
1.76k
        method = 1<<GZIP;
1956
1.76k
        if (fd->use_bz2)
1957
0
            method |= 1<<BZIP2;
1958
1.76k
        if (fd->use_lzma)
1959
0
            method |= 1<<LZMA;
1960
1.76k
    }
1961
1962
595k
    if (level == -1)
1963
1.76k
        level = fd->level;
1964
1965
    //fprintf(stderr, "IN: block %d, sz %d\n", b->content_id, b->uncomp_size);
1966
1967
595k
    if (method == RAW || level == 0 || b->uncomp_size == 0) {
1968
312k
        b->method = RAW;
1969
312k
        b->comp_size = b->uncomp_size;
1970
        //fprintf(stderr, "Skip block id %d\n", b->content_id);
1971
312k
        return 0;
1972
312k
    }
1973
1974
283k
#ifndef ABS
1975
283k
#    define ABS(a) ((a)>=0?(a):-(a))
1976
283k
#endif
1977
1978
283k
    if (metrics) {
1979
278k
        pthread_mutex_lock(&fd->metrics_lock);
1980
        // Sudden changes in size trigger a retrial.  These are mainly
1981
        // triggered when switching to sorted / unsorted, where the number
1982
        // of elements in a slice radically changes.
1983
        //
1984
        // We also get large fluctuations based on genome coordinate for
1985
        // e.g. SA:Z and SC series, but we consider the typical scale of
1986
        // delta between blocks and use this to look for abnormality.
1987
1988
        // Equivalent to (but minus possible integer overflow)
1989
        //   (b->uncomp_size + 1000)/4 > metrics->input_avg_sz+1000 ||
1990
        //    b->uncomp_size + 1000    < (metrics->input_avg_sz+1000)/4)
1991
278k
        if (metrics->input_avg_sz &&
1992
87.4k
            (b->uncomp_size/4 - 750 > metrics->input_avg_sz ||
1993
86.7k
             b->uncomp_size         < metrics->input_avg_sz/4 - 750) &&
1994
1.36k
            ABS(b->uncomp_size-metrics->input_avg_sz)/10
1995
1.36k
                > metrics->input_avg_delta) {
1996
82
            metrics->next_trial = 0;
1997
82
        }
1998
1999
278k
        if (metrics->trial > 0 || --metrics->next_trial <= 0) {
2000
45.5k
            int m, unpackable = metrics->unpackable;
2001
45.5k
            size_t sz_best = b->uncomp_size;
2002
45.5k
            size_t sz[CRAM_MAX_METHOD] = {0};
2003
45.5k
            int method_best = 0; // RAW
2004
45.5k
            char *c_best = NULL, *c = NULL;
2005
2006
45.5k
            metrics->input_avg_delta =
2007
45.5k
                0.9 * (metrics->input_avg_delta +
2008
45.5k
                       ABS(b->uncomp_size - metrics->input_avg_sz));
2009
2010
45.5k
            metrics->input_avg_sz += b->uncomp_size*.2;
2011
45.5k
            metrics->input_avg_sz *= 0.8;
2012
2013
45.5k
            if (metrics->revised_method)
2014
23.0k
                method = metrics->revised_method;
2015
22.4k
            else
2016
22.4k
                metrics->revised_method = method;
2017
2018
45.5k
            if (metrics->next_trial <= 0) {
2019
2.32k
                metrics->next_trial = TRIAL_SPAN;
2020
2.32k
                metrics->trial = NTRIALS;
2021
76.8k
                for (m = 0; m < CRAM_MAX_METHOD; m++)
2022
74.4k
                    metrics->sz[m] /= 2;
2023
2.32k
                metrics->unpackable = 0;
2024
2.32k
            }
2025
2026
            // Compress this block using the best method
2027
45.5k
            if (unpackable && CRAM_MAJOR_VERS(fd->version) > 3) {
2028
                // No point trying bit-pack if 17+ symbols.
2029
0
                if (method & (1<<RANS_PR128))
2030
0
                    method = (method|(1<<RANS_PR0))&~(1<<RANS_PR128);
2031
0
                if (method & (1<<RANS_PR129))
2032
0
                    method = (method|(1<<RANS_PR1))&~(1<<RANS_PR129);
2033
0
                if (method & (1<<RANS_PR192))
2034
0
                    method = (method|(1<<RANS_PR64))&~(1<<RANS_PR192);
2035
0
                if (method & (1<<RANS_PR193))
2036
0
                    method = (method|(1<<RANS_PR64)|(1<<RANS_PR1))&~(1<<RANS_PR193);
2037
2038
0
                if (method & (1<<ARITH_PR128))
2039
0
                    method = (method|(1<<ARITH_PR0))&~(1<<ARITH_PR128);
2040
0
                if (method & (1<<ARITH_PR129))
2041
0
                    method = (method|(1<<ARITH_PR1))&~(1<<ARITH_PR129);
2042
0
                if (method & (1<<ARITH_PR192))
2043
0
                    method = (method|(1<<ARITH_PR64))&~(1<<ARITH_PR192);
2044
0
                if (method & (1u<<ARITH_PR193))
2045
0
                    method = (method|(1<<ARITH_PR64)|(1<<ARITH_PR1))&~(1u<<ARITH_PR193);
2046
0
            }
2047
2048
            // Libdeflate doesn't have a Z_RLE strategy.
2049
            // We treat it as level 1, but iff we haven't also
2050
            // explicitly listed that in the method list.
2051
#ifdef HAVE_LIBDEFLATE
2052
            if ((method & (1<<GZIP_RLE)) && (method & (1<<GZIP_1)))
2053
                method &= ~(1<<GZIP_RLE);
2054
#endif
2055
2056
45.5k
            pthread_mutex_unlock(&fd->metrics_lock);
2057
2058
1.50M
            for (m = 0; m < CRAM_MAX_METHOD; m++) {
2059
1.45M
                if (method & (1u<<m)) {
2060
342k
                    int lvl = level;
2061
342k
                    switch (m) {
2062
44.8k
                    case GZIP:     strat = Z_FILTERED; break;
2063
45.5k
                    case GZIP_1:   strat = Z_DEFAULT_STRATEGY; lvl = 1; break;
2064
34.4k
                    case GZIP_RLE: strat = Z_RLE; break;
2065
0
                    case FQZ:      strat = CRAM_MAJOR_VERS(fd->version); break;
2066
0
                    case FQZ_b:    strat = CRAM_MAJOR_VERS(fd->version)+256; break;
2067
0
                    case FQZ_c:    strat = CRAM_MAJOR_VERS(fd->version)+2*256; break;
2068
0
                    case FQZ_d:    strat = CRAM_MAJOR_VERS(fd->version)+3*256; break;
2069
10.5k
                    case TOK3:     strat = 0; break;
2070
0
                    case TOKA:     strat = 1; break;
2071
206k
                    default:       strat = 0;
2072
342k
                    }
2073
2074
342k
                    c = cram_compress_by_method(s, (char *)b->data, b->uncomp_size,
2075
342k
                                                b->content_id, &sz[m], m, lvl, strat);
2076
2077
342k
                    if (c && sz_best > sz[m]) {
2078
28.2k
                        sz_best = sz[m];
2079
28.2k
                        method_best = m;
2080
28.2k
                        if (c_best)
2081
14.1k
                            free(c_best);
2082
28.2k
                        c_best = c;
2083
313k
                    } else if (c) {
2084
312k
                        free(c);
2085
312k
                    } else {
2086
1.03k
                        sz[m] = UINT_MAX; // arbitrarily worse than raw
2087
1.03k
                    }
2088
1.11M
                } else {
2089
1.11M
                    sz[m] = UINT_MAX; // arbitrarily worse than raw
2090
1.11M
                }
2091
1.45M
            }
2092
2093
45.5k
            if (c_best) {
2094
14.1k
                free(b->data);
2095
14.1k
                b->data = (unsigned char *)c_best;
2096
14.1k
                b->method = method_best; // adjusted to methmap[method_best] later
2097
14.1k
                b->comp_size = sz_best;
2098
14.1k
            }
2099
2100
            // Accumulate stats for all methods tried
2101
45.5k
            pthread_mutex_lock(&fd->metrics_lock);
2102
1.50M
            for (m = 0; m < CRAM_MAX_METHOD; m++)
2103
                // don't be overly sure on small blocks.
2104
                // +2000 means eg bzip2 vs gzip (1.07 to 1.04) or gz vs rans1
2105
                // needs to be at least 60 bytes smaller to overcome the
2106
                // fixed size addition.
2107
1.45M
                metrics->sz[m] += sz[m]+2000;
2108
2109
            // When enough trials performed, find the best on average
2110
45.5k
            if (--metrics->trial == 0) {
2111
11.9k
                int best_method = RAW;
2112
11.9k
                int best_sz = INT_MAX;
2113
2114
                // Relative costs of methods. See enum_cram_block_method_int
2115
                // and methmap
2116
11.9k
                double meth_cost[32] = {
2117
                    // Externally defined methods
2118
11.9k
                    1,    // 0  raw
2119
11.9k
                    1.04, // 1  gzip (Z_FILTERED)
2120
11.9k
                    1.07, // 2  bzip2
2121
11.9k
                    1.08, // 3  lzma
2122
11.9k
                    1.00, // 4  rans    (O0)
2123
11.9k
                    1.00, // 5  ranspr  (O0)
2124
11.9k
                    1.04, // 6  arithpr (O0)
2125
11.9k
                    1.05, // 7  fqz
2126
11.9k
                    1.05, // 8  tok3 (rans)
2127
11.9k
                    1.00, 1.00, // 9,10 reserved
2128
2129
                    // Paramterised versions of above
2130
11.9k
                    1.01, // gzip rle
2131
11.9k
                    1.01, // gzip -1
2132
2133
11.9k
                    1.05, 1.05, 1.05, // FQZ_b,c,d
2134
2135
11.9k
                    1.01, // rans O1
2136
2137
11.9k
                    1.01, // rans_pr1
2138
11.9k
                    1.00, // rans_pr64; if smaller, usually fast
2139
11.9k
                    1.03, // rans_pr65/9
2140
11.9k
                    1.00, // rans_pr128
2141
11.9k
                    1.01, // rans_pr129
2142
11.9k
                    1.00, // rans_pr192
2143
11.9k
                    1.01, // rans_pr193
2144
2145
11.9k
                    1.07, // tok3 arith
2146
2147
11.9k
                    1.04, // arith_pr1
2148
11.9k
                    1.04, // arith_pr64
2149
11.9k
                    1.04, // arith_pr9
2150
11.9k
                    1.03, // arith_pr128
2151
11.9k
                    1.04, // arith_pr129
2152
11.9k
                    1.04, // arith_pr192
2153
11.9k
                    1.04, // arith_pr193
2154
11.9k
                };
2155
2156
                // Scale methods by cost based on compression level
2157
11.9k
                if (fd->level <= 1) {
2158
0
                    for (m = 0; m < CRAM_MAX_METHOD; m++)
2159
0
                        metrics->sz[m] *= 1+(meth_cost[m]-1)*4;
2160
11.9k
                } else if (fd->level <= 3) {
2161
0
                    for (m = 0; m < CRAM_MAX_METHOD; m++)
2162
0
                        metrics->sz[m] *= 1+(meth_cost[m]-1);
2163
11.9k
                } else if (fd->level <= 6) {
2164
394k
                    for (m = 0; m < CRAM_MAX_METHOD; m++)
2165
382k
                        metrics->sz[m] *= 1+(meth_cost[m]-1)/2;
2166
11.9k
                } else if (fd->level <= 7) {
2167
0
                    for (m = 0; m < CRAM_MAX_METHOD; m++)
2168
0
                        metrics->sz[m] *= 1+(meth_cost[m]-1)/3;
2169
0
                } // else cost is ignored
2170
2171
                // Ensure these are never used; BSC and ZSTD
2172
11.9k
                metrics->sz[9] = metrics->sz[10] = INT_MAX;
2173
2174
394k
                for (m = 0; m < CRAM_MAX_METHOD; m++) {
2175
382k
                    if ((!metrics->sz[m]) || (!(method & (1u<<m))))
2176
297k
                        continue;
2177
2178
84.2k
                    if (best_sz > metrics->sz[m])
2179
27.6k
                        best_sz = metrics->sz[m], best_method = m;
2180
84.2k
                }
2181
2182
11.9k
                if (best_method != metrics->method) {
2183
                    //metrics->trial = (NTRIALS+1)/2; // be sure
2184
                    //metrics->next_trial /= 1.5;
2185
5.81k
                    metrics->consistency = 0;
2186
6.13k
                } else {
2187
6.13k
                    metrics->next_trial *= MIN(2, 1+metrics->consistency/4.0);
2188
6.13k
                    metrics->consistency++;
2189
6.13k
                }
2190
2191
11.9k
                metrics->method = best_method;
2192
11.9k
                switch (best_method) {
2193
122
                case GZIP:     strat = Z_FILTERED; break;
2194
3.71k
                case GZIP_1:   strat = Z_DEFAULT_STRATEGY; break;
2195
0
                case GZIP_RLE: strat = Z_RLE; break;
2196
0
                case FQZ:      strat = CRAM_MAJOR_VERS(fd->version); break;
2197
0
                case FQZ_b:    strat = CRAM_MAJOR_VERS(fd->version)+256; break;
2198
0
                case FQZ_c:    strat = CRAM_MAJOR_VERS(fd->version)+2*256; break;
2199
0
                case FQZ_d:    strat = CRAM_MAJOR_VERS(fd->version)+3*256; break;
2200
228
                case TOK3:     strat = 0; break;
2201
0
                case TOKA:     strat = 1; break;
2202
7.87k
                default:       strat = 0;
2203
11.9k
                }
2204
11.9k
                metrics->strat  = strat;
2205
2206
                // If we see at least MAXFAIL trials in a row for a specific
2207
                // compression method with more than MAXDELTA aggregate
2208
                // size then we drop this from the list of methods used
2209
                // for this block type.
2210
105k
#define MAXDELTA 0.20
2211
274k
#define MAXFAILS 4
2212
394k
                for (m = 0; m < CRAM_MAX_METHOD; m++) {
2213
382k
                    if (best_method == m) {
2214
11.9k
                        metrics->cnt[m] = 0;
2215
11.9k
                        metrics->extra[m] = 0;
2216
370k
                    } else if (best_sz < metrics->sz[m]) {
2217
274k
                        double r = (double)metrics->sz[m] / best_sz - 1;
2218
274k
                        int mul = 1+(fd->level>=7);
2219
274k
                        if (++metrics->cnt[m] >= MAXFAILS*mul &&
2220
105k
                            (metrics->extra[m] += r) >= MAXDELTA*mul)
2221
44.7k
                            method &= ~(1u<<m);
2222
2223
                        // Special case for fqzcomp as it rarely changes
2224
274k
                        if (m == FQZ || m == FQZ_b || m == FQZ_c || m == FQZ_d) {
2225
44.0k
                            if (metrics->sz[m] > best_sz)
2226
44.0k
                                method &= ~(1u<<m);
2227
44.0k
                        }
2228
274k
                    }
2229
382k
                }
2230
2231
                //if (fd->verbose > 1 && method != metrics->revised_method)
2232
                //    fprintf(stderr, "%d: revising method from %x to %x\n",
2233
                //          b->content_id, metrics->revised_method, method);
2234
11.9k
                metrics->revised_method = method;
2235
11.9k
            }
2236
45.5k
            pthread_mutex_unlock(&fd->metrics_lock);
2237
233k
        } else {
2238
233k
            metrics->input_avg_delta =
2239
233k
                0.9 * (metrics->input_avg_delta +
2240
233k
                       ABS(b->uncomp_size - metrics->input_avg_sz));
2241
2242
233k
            metrics->input_avg_sz += b->uncomp_size*.2;
2243
233k
            metrics->input_avg_sz *= 0.8;
2244
2245
233k
            strat = metrics->strat;
2246
233k
            method = metrics->method;
2247
2248
233k
            pthread_mutex_unlock(&fd->metrics_lock);
2249
233k
            comp = cram_compress_by_method(s, (char *)b->data, b->uncomp_size,
2250
233k
                                           b->content_id, &comp_size, method,
2251
233k
                                           method == GZIP_1 ? 1 : level,
2252
233k
                                           strat);
2253
233k
            if (!comp) {
2254
                // Our cached best method failed, but maybe another works?
2255
                // Rerun with trial mode engaged again.
2256
168
                if (!recurse) {
2257
168
                    hts_log_warning("Compressed block ID %d method %s failed, "
2258
168
                                    "redoing trial", b->content_id,
2259
168
                                    cram_block_method2str(method));
2260
168
                    pthread_mutex_lock(&fd->metrics_lock);
2261
168
                    metrics->trial = NTRIALS;
2262
168
                    metrics->next_trial = TRIAL_SPAN;
2263
168
                    metrics->revised_method = orig_method;
2264
168
                    pthread_mutex_unlock(&fd->metrics_lock);
2265
168
                    return cram_compress_block3(fd, s, b, metrics, method,
2266
168
                                                level, 1);
2267
168
                }
2268
0
                return -1;
2269
168
            }
2270
2271
233k
            if (comp_size < b->uncomp_size) {
2272
26.5k
                free(b->data);
2273
26.5k
                b->data = (unsigned char *)comp;
2274
26.5k
                b->comp_size = comp_size;
2275
26.5k
                b->method = method;
2276
206k
            } else {
2277
206k
                free(comp);
2278
206k
            }
2279
233k
        }
2280
2281
278k
    } else {
2282
        // no cached metrics, so just do zlib?
2283
4.25k
        comp = cram_compress_by_method(s, (char *)b->data, b->uncomp_size,
2284
4.25k
                                       b->content_id, &comp_size, GZIP, level, Z_FILTERED);
2285
4.25k
        if (!comp) {
2286
0
            hts_log_error("Compression failed!");
2287
0
            return -1;
2288
0
        }
2289
2290
4.25k
        if (comp_size < b->uncomp_size) {
2291
1.21k
            free(b->data);
2292
1.21k
            b->data = (unsigned char *)comp;
2293
1.21k
            b->comp_size = comp_size;
2294
1.21k
            b->method = GZIP;
2295
3.03k
        } else {
2296
3.03k
            free(comp);
2297
3.03k
        }
2298
4.25k
        strat = Z_FILTERED;
2299
4.25k
    }
2300
2301
282k
    hts_log_info("Compressed block ID %d from %d to %d by method %s",
2302
282k
                 b->content_id, b->uncomp_size, b->comp_size,
2303
282k
                 cram_block_method2str(b->method));
2304
2305
282k
    b->method = methmap[b->method];
2306
2307
282k
    return 0;
2308
283k
}
2309
2310
/*
2311
 * Compresses a block using a selection of compression codecs and options.
2312
 * The best is learnt and used for subsequent slices, periodically resampling.
2313
 *
2314
 * Method and level -1 implies defaults, as specified in cram_fd.
2315
 */
2316
int cram_compress_block2(cram_fd *fd, cram_slice *s,
2317
                         cram_block *b, cram_metrics *metrics,
2318
635k
                         int method, int level) {
2319
635k
    return cram_compress_block3(fd, s, b, metrics, method, level, 0);
2320
635k
}
2321
2322
int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics,
2323
1.76k
                        int method, int level) {
2324
1.76k
    return cram_compress_block2(fd, NULL, b, metrics, method, level);
2325
1.76k
}
2326
2327
150k
cram_metrics *cram_new_metrics(void) {
2328
150k
    cram_metrics *m = calloc(1, sizeof(*m));
2329
150k
    if (!m)
2330
0
        return NULL;
2331
150k
    m->trial = NTRIALS-1;
2332
150k
    m->next_trial = TRIAL_SPAN/2; // learn quicker at start
2333
150k
    m->method = RAW;
2334
150k
    m->strat = 0;
2335
150k
    m->revised_method = 0;
2336
150k
    m->unpackable = 0;
2337
2338
150k
    return m;
2339
150k
}
2340
2341
283k
char *cram_block_method2str(enum cram_block_method_int m) {
2342
283k
    switch(m) {
2343
241k
    case RAW:         return "RAW";
2344
3.69k
    case GZIP:        return "GZIP";
2345
0
    case BZIP2:       return "BZIP2";
2346
0
    case LZMA:        return "LZMA";
2347
0
    case RANS0:       return "RANS0";
2348
0
    case RANS1:       return "RANS1";
2349
11
    case GZIP_RLE:    return "GZIP_RLE";
2350
4.11k
    case GZIP_1:      return "GZIP_1";
2351
0
    case FQZ:         return "FQZ";
2352
0
    case FQZ_b:       return "FQZ_b";
2353
0
    case FQZ_c:       return "FQZ_c";
2354
0
    case FQZ_d:       return "FQZ_d";
2355
1.21k
    case RANS_PR0:    return "RANS_PR0";
2356
440
    case RANS_PR1:    return "RANS_PR1";
2357
18.5k
    case RANS_PR64:   return "RANS_PR64";
2358
1
    case RANS_PR9:    return "RANS_PR9";
2359
13.6k
    case RANS_PR128:  return "RANS_PR128";
2360
0
    case RANS_PR129:  return "RANS_PR129";
2361
0
    case RANS_PR192:  return "RANS_PR192";
2362
245
    case RANS_PR193:  return "RANS_PR193";
2363
168
    case TOK3:        return "TOK3_R";
2364
0
    case TOKA:        return "TOK3_A";
2365
0
    case ARITH_PR0:   return "ARITH_PR0";
2366
0
    case ARITH_PR1:   return "ARITH_PR1";
2367
0
    case ARITH_PR64:  return "ARITH_PR64";
2368
0
    case ARITH_PR9:   return "ARITH_PR9";
2369
0
    case ARITH_PR128: return "ARITH_PR128";
2370
0
    case ARITH_PR129: return "ARITH_PR129";
2371
0
    case ARITH_PR192: return "ARITH_PR192";
2372
0
    case ARITH_PR193: return "ARITH_PR193";
2373
0
    case BM_ERROR: break;
2374
283k
    }
2375
0
    return "?";
2376
283k
}
2377
2378
0
char *cram_content_type2str(enum cram_content_type t) {
2379
0
    switch (t) {
2380
0
    case FILE_HEADER:         return "FILE_HEADER";
2381
0
    case COMPRESSION_HEADER:  return "COMPRESSION_HEADER";
2382
0
    case MAPPED_SLICE:        return "MAPPED_SLICE";
2383
0
    case UNMAPPED_SLICE:      return "UNMAPPED_SLICE";
2384
0
    case EXTERNAL:            return "EXTERNAL";
2385
0
    case CORE:                return "CORE";
2386
0
    case CT_ERROR:            break;
2387
0
    }
2388
0
    return "?";
2389
0
}
2390
2391
/* ----------------------------------------------------------------------
2392
 * Reference sequence handling
2393
 *
2394
 * These revolve around the refs_t structure, which may potentially be
2395
 * shared between multiple cram_fd.
2396
 *
2397
 * We start with refs_create() to allocate an empty refs_t and then
2398
 * populate it with @SQ line data using refs_from_header(). This is done on
2399
 * cram_open().  Also at start up we can call cram_load_reference() which
2400
 * is used with "scramble -r foo.fa". This replaces the fd->refs with the
2401
 * new one specified. In either case refs2id() is then called which
2402
 * maps ref_entry names to @SQ ids (refs_t->ref_id[]).
2403
 *
2404
 * Later, possibly within a thread, we will want to know the actual ref
2405
 * seq itself, obtained by calling cram_get_ref().  This may use the
2406
 * UR: or M5: fields or the filename specified in the original
2407
 * cram_load_reference() call.
2408
 *
2409
 * Given the potential for multi-threaded reference usage, we have
2410
 * reference counting (sorry for the confusing double use of "ref") to
2411
 * track the number of callers interested in any specific reference.
2412
 */
2413
2414
/*
2415
 * Frees/unmaps a reference sequence and associated file handles.
2416
 */
2417
1.88k
static void ref_entry_free_seq(ref_entry *e) {
2418
1.88k
    if (e->mf)
2419
0
        mfclose(e->mf);
2420
1.88k
    if (e->seq && !e->mf)
2421
0
        free(e->seq);
2422
2423
1.88k
    e->seq = NULL;
2424
1.88k
    e->mf = NULL;
2425
1.88k
}
2426
2427
3.09k
void refs_free(refs_t *r) {
2428
3.09k
    RP("refs_free()\n");
2429
2430
3.09k
    if (!r)
2431
0
        return;
2432
2433
3.09k
    if (--r->count > 0)
2434
0
        return;
2435
2436
3.09k
    if (r->pool)
2437
3.09k
        string_pool_destroy(r->pool);
2438
2439
3.09k
    if (r->h_meta) {
2440
3.09k
        khint_t k;
2441
2442
8.57k
        for (k = kh_begin(r->h_meta); k != kh_end(r->h_meta); k++) {
2443
5.48k
            ref_entry *e;
2444
2445
5.48k
            if (!kh_exist(r->h_meta, k))
2446
3.60k
                continue;
2447
1.88k
            if (!(e = kh_val(r->h_meta, k)))
2448
0
                continue;
2449
1.88k
            ref_entry_free_seq(e);
2450
1.88k
            free(e);
2451
1.88k
        }
2452
2453
3.09k
        kh_destroy(refs, r->h_meta);
2454
3.09k
    }
2455
2456
3.09k
    if (r->ref_id)
2457
2.06k
        free(r->ref_id);
2458
2459
3.09k
    if (r->fp)
2460
0
        bgzf_close(r->fp);
2461
2462
3.09k
    pthread_mutex_destroy(&r->lock);
2463
2464
3.09k
    free(r);
2465
3.09k
}
2466
2467
3.09k
static refs_t *refs_create(void) {
2468
3.09k
    refs_t *r = calloc(1, sizeof(*r));
2469
2470
3.09k
    RP("refs_create()\n");
2471
2472
3.09k
    if (!r)
2473
0
        return NULL;
2474
2475
3.09k
    if (!(r->pool = string_pool_create(8192)))
2476
0
        goto err;
2477
2478
3.09k
    r->ref_id = NULL; // see refs2id() to populate.
2479
3.09k
    r->count = 1;
2480
3.09k
    r->last = NULL;
2481
3.09k
    r->last_id = -1;
2482
2483
3.09k
    if (!(r->h_meta = kh_init(refs)))
2484
0
        goto err;
2485
2486
3.09k
    pthread_mutex_init(&r->lock, NULL);
2487
2488
3.09k
    return r;
2489
2490
0
 err:
2491
0
    refs_free(r);
2492
0
    return NULL;
2493
3.09k
}
2494
2495
/*
2496
 * Opens a reference fasta file as a BGZF stream, allowing for
2497
 * compressed files.  It automatically builds a .fai file if
2498
 * required and if compressed a .gzi bgzf index too.
2499
 *
2500
 * Returns a BGZF handle on success;
2501
 *         NULL on failure.
2502
 */
2503
628
static BGZF *bgzf_open_ref(char *fn, char *mode, int is_md5) {
2504
628
    BGZF *fp;
2505
2506
628
    if (strncmp(fn, "file://", 7) == 0)
2507
0
        fn += 7;
2508
2509
628
    if (!is_md5 && !hisremote(fn)) {
2510
384
        char fai_file[PATH_MAX];
2511
2512
384
        snprintf(fai_file, PATH_MAX, "%s.fai", fn);
2513
384
        if (access(fai_file, R_OK) != 0)
2514
384
            if (fai_build(fn) != 0)
2515
384
                return NULL;
2516
384
    }
2517
2518
244
    if (!(fp = bgzf_open(fn, mode))) {
2519
244
        perror(fn);
2520
244
        return NULL;
2521
244
    }
2522
2523
0
    if (fp->is_compressed == 1 && bgzf_index_load(fp, fn, ".gzi") < 0) {
2524
0
        hts_log_error("Unable to load .gzi index '%s.gzi'", fn);
2525
0
        bgzf_close(fp);
2526
0
        return NULL;
2527
0
    }
2528
2529
0
    return fp;
2530
0
}
2531
2532
/*
2533
 * Loads a FAI file for a reference.fasta.
2534
 * "is_err" indicates whether failure to load is worthy of emitting an
2535
 * error message. In some cases (eg with embedded references) we
2536
 * speculatively load, just in case, and silently ignore errors.
2537
 *
2538
 * Returns the refs_t struct on success (maybe newly allocated);
2539
 *         NULL on failure
2540
 */
2541
628
static refs_t *refs_load_fai(refs_t *r_orig, const char *fn, int is_err) {
2542
628
    hFILE *fp = NULL;
2543
628
    char fai_fn[PATH_MAX];
2544
628
    char line[8192];
2545
628
    refs_t *r = r_orig;
2546
628
    size_t fn_l = strlen(fn);
2547
628
    int id = 0, id_alloc = 0;
2548
2549
628
    RP("refs_load_fai %s\n", fn);
2550
2551
628
    if (!r)
2552
0
        if (!(r = refs_create()))
2553
0
            goto err;
2554
2555
628
    if (r->fp)
2556
0
        if (bgzf_close(r->fp) != 0)
2557
0
            goto err;
2558
628
    r->fp = NULL;
2559
2560
    /* Look for a FASTA##idx##FAI format */
2561
628
    char *fn_delim = strstr(fn, HTS_IDX_DELIM);
2562
628
    if (fn_delim) {
2563
11
        if (!(r->fn = string_ndup(r->pool, fn, fn_delim - fn)))
2564
0
            goto err;
2565
11
        fn_delim += strlen(HTS_IDX_DELIM);
2566
11
        snprintf(fai_fn, PATH_MAX, "%s", fn_delim);
2567
617
    } else {
2568
        /* An index file was provided, instead of the actual reference file */
2569
617
        if (fn_l > 4 && strcmp(&fn[fn_l-4], ".fai") == 0) {
2570
7
            if (!r->fn) {
2571
2
                if (!(r->fn = string_ndup(r->pool, fn, fn_l-4)))
2572
0
                    goto err;
2573
2
            }
2574
7
            snprintf(fai_fn, PATH_MAX, "%s", fn);
2575
610
        } else {
2576
        /* Only the reference file provided. Get the index file name from it */
2577
610
            if (!(r->fn = string_dup(r->pool, fn)))
2578
0
                goto err;
2579
610
            snprintf(fai_fn, PATH_MAX, "%.*s.fai", PATH_MAX-5, fn);
2580
610
        }
2581
617
    }
2582
2583
628
    if (!(r->fp = bgzf_open_ref(r->fn, "r", 0))) {
2584
628
        hts_log_error("Failed to open reference file '%s'", r->fn);
2585
628
        goto err;
2586
628
    }
2587
2588
0
    if (!(fp = hopen(fai_fn, "r"))) {
2589
0
        hts_log_error("Failed to open index file '%s'", fai_fn);
2590
0
        if (is_err)
2591
0
            perror(fai_fn);
2592
0
        goto err;
2593
0
    }
2594
0
    while (hgets(line, 8192, fp) != NULL) {
2595
0
        ref_entry *e = malloc(sizeof(*e));
2596
0
        char *cp;
2597
0
        int n;
2598
0
        khint_t k;
2599
2600
0
        if (!e)
2601
0
            return NULL;
2602
2603
        // id
2604
0
        for (cp = line; *cp && !isspace_c(*cp); cp++)
2605
0
            ;
2606
0
        *cp++ = 0;
2607
0
        e->name = string_dup(r->pool, line);
2608
2609
        // length
2610
0
        while (*cp && isspace_c(*cp))
2611
0
            cp++;
2612
0
        e->length = strtoll(cp, &cp, 10);
2613
2614
        // offset
2615
0
        while (*cp && isspace_c(*cp))
2616
0
            cp++;
2617
0
        e->offset = strtoll(cp, &cp, 10);
2618
2619
        // bases per line
2620
0
        while (*cp && isspace_c(*cp))
2621
0
            cp++;
2622
0
        e->bases_per_line = strtol(cp, &cp, 10);
2623
2624
        // line length
2625
0
        while (*cp && isspace_c(*cp))
2626
0
            cp++;
2627
0
        e->line_length = strtol(cp, &cp, 10);
2628
2629
        // filename
2630
0
        e->fn = r->fn;
2631
2632
0
        e->count = 0;
2633
0
        e->seq = NULL;
2634
0
        e->mf = NULL;
2635
0
        e->is_md5 = 0;
2636
0
        e->validated_md5 = 0;
2637
2638
0
        k = kh_put(refs, r->h_meta, e->name, &n);
2639
0
        if (-1 == n)  {
2640
0
            free(e);
2641
0
            return NULL;
2642
0
        }
2643
2644
0
        if (n) {
2645
0
            kh_val(r->h_meta, k) = e;
2646
0
        } else {
2647
0
            ref_entry *re = kh_val(r->h_meta, k);
2648
0
            if (re && (re->count != 0 || re->length != 0)) {
2649
                /* Keep old */
2650
0
                free(e);
2651
0
            } else {
2652
                /* Replace old */
2653
0
                if (re)
2654
0
                    free(re);
2655
0
                kh_val(r->h_meta, k) = e;
2656
0
            }
2657
0
        }
2658
2659
0
        if (id >= id_alloc) {
2660
0
            ref_entry **new_refs;
2661
0
            int x;
2662
2663
0
            id_alloc = id_alloc ?id_alloc*2 : 16;
2664
0
            new_refs = realloc(r->ref_id, id_alloc * sizeof(*r->ref_id));
2665
0
            if (!new_refs)
2666
0
                goto err;
2667
0
            r->ref_id = new_refs;
2668
2669
0
            for (x = id; x < id_alloc; x++)
2670
0
                r->ref_id[x] = NULL;
2671
0
        }
2672
0
        r->ref_id[id] = e;
2673
0
        r->nref = ++id;
2674
0
    }
2675
2676
0
    if(hclose(fp) < 0)
2677
0
        goto err;
2678
0
    return r;
2679
2680
628
 err:
2681
628
    if (fp)
2682
0
        hclose_abruptly(fp);
2683
2684
628
    if (!r_orig)
2685
0
        refs_free(r);
2686
2687
628
    return NULL;
2688
0
}
2689
2690
/*
2691
 * Verifies that the CRAM @SQ lines and .fai files match.
2692
 */
2693
0
static void sanitise_SQ_lines(cram_fd *fd) {
2694
0
    int i;
2695
2696
0
    if (!fd->header || !fd->header->hrecs)
2697
0
        return;
2698
2699
0
    if (!fd->refs || !fd->refs->h_meta)
2700
0
        return;
2701
2702
0
    for (i = 0; i < fd->header->hrecs->nref; i++) {
2703
0
        const char *name = fd->header->hrecs->ref[i].name;
2704
0
        khint_t k = kh_get(refs, fd->refs->h_meta, name);
2705
0
        ref_entry *r;
2706
2707
        // We may have @SQ lines which have no known .fai, but do not
2708
        // in themselves pose a problem because they are unused in the file.
2709
0
        if (k == kh_end(fd->refs->h_meta))
2710
0
            continue;
2711
2712
0
        if (!(r = (ref_entry *)kh_val(fd->refs->h_meta, k)))
2713
0
            continue;
2714
2715
0
        if (r->length && r->length != fd->header->hrecs->ref[i].len) {
2716
0
            assert(strcmp(r->name, fd->header->hrecs->ref[i].name) == 0);
2717
2718
            // Should we also check MD5sums here to ensure the correct
2719
            // reference was given?
2720
0
            hts_log_warning("Header @SQ length mismatch for ref %s, %"PRIhts_pos" vs %d",
2721
0
                            r->name, fd->header->hrecs->ref[i].len, (int)r->length);
2722
2723
            // Fixing the parsed @SQ header will make MD:Z: strings work
2724
            // and also stop it producing N for the sequence.
2725
0
            fd->header->hrecs->ref[i].len = r->length;
2726
0
        }
2727
0
    }
2728
0
}
2729
2730
/*
2731
 * Indexes references by the order they appear in a BAM file. This may not
2732
 * necessarily be the same order they appear in the fasta reference file.
2733
 *
2734
 * Returns 0 on success
2735
 *        -1 on failure
2736
 */
2737
1.76k
int refs2id(refs_t *r, sam_hdr_t *hdr) {
2738
1.76k
    int i;
2739
1.76k
    sam_hrecs_t *h = hdr->hrecs;
2740
2741
1.76k
    if (r->ref_id)
2742
973
        free(r->ref_id);
2743
1.76k
    if (r->last)
2744
0
        r->last = NULL;
2745
2746
1.76k
    r->ref_id = calloc(h->nref, sizeof(*r->ref_id));
2747
1.76k
    if (!r->ref_id)
2748
0
        return -1;
2749
2750
1.76k
    r->nref = h->nref;
2751
3.06k
    for (i = 0; i < h->nref; i++) {
2752
1.30k
        khint_t k = kh_get(refs, r->h_meta, h->ref[i].name);
2753
1.30k
        if (k != kh_end(r->h_meta)) {
2754
1.30k
            r->ref_id[i] = kh_val(r->h_meta, k);
2755
1.30k
        } else {
2756
0
            hts_log_warning("Unable to find ref name '%s'", h->ref[i].name);
2757
0
        }
2758
1.30k
    }
2759
2760
1.76k
    return 0;
2761
1.76k
}
2762
2763
/*
2764
 * Generates refs_t entries based on @SQ lines in the header.
2765
 * Returns 0 on success
2766
 *         -1 on failure
2767
 */
2768
6.69k
static int refs_from_header(cram_fd *fd) {
2769
6.69k
    if (!fd)
2770
0
        return -1;
2771
2772
6.69k
    refs_t *r = fd->refs;
2773
6.69k
    if (!r)
2774
0
        return -1;
2775
2776
6.69k
    sam_hdr_t *h = fd->header;
2777
6.69k
    if (!h)
2778
1.93k
        return 0;
2779
2780
4.76k
    if (!h->hrecs) {
2781
2.64k
        if (-1 == sam_hdr_fill_hrecs(h))
2782
71
            return -1;
2783
2.64k
    }
2784
2785
4.69k
    if (h->hrecs->nref == 0)
2786
2.45k
        return 0;
2787
2788
    //fprintf(stderr, "refs_from_header for %p mode %c\n", fd, fd->mode);
2789
2790
    /* Existing refs are fine, as long as they're compatible with the hdr. */
2791
2.24k
    ref_entry **new_ref_id = realloc(r->ref_id, (r->nref + h->hrecs->nref) * sizeof(*r->ref_id));
2792
2.24k
    if (!new_ref_id)
2793
0
        return -1;
2794
2.24k
    r->ref_id = new_ref_id;
2795
2796
2.24k
    int i, j;
2797
    /* Copy info from h->ref[i] over to r */
2798
5.42k
    for (i = 0, j = r->nref; i < h->hrecs->nref; i++) {
2799
3.18k
        sam_hrec_type_t *ty;
2800
3.18k
        sam_hrec_tag_t *tag;
2801
3.18k
        khint_t k;
2802
3.18k
        int n;
2803
2804
3.18k
        k = kh_get(refs, r->h_meta, h->hrecs->ref[i].name);
2805
3.18k
        if (k != kh_end(r->h_meta))
2806
            // Ref already known about
2807
1.30k
            continue;
2808
2809
1.88k
        if (!(r->ref_id[j] = calloc(1, sizeof(ref_entry))))
2810
0
            return -1;
2811
2812
1.88k
        if (!h->hrecs->ref[i].name)
2813
0
            return -1;
2814
2815
1.88k
        r->ref_id[j]->name = string_dup(r->pool, h->hrecs->ref[i].name);
2816
1.88k
        if (!r->ref_id[j]->name) return -1;
2817
1.88k
        r->ref_id[j]->length = 0; // marker for not yet loaded
2818
2819
        /* Initialise likely filename if known */
2820
1.88k
        if ((ty = sam_hrecs_find_type_id(h->hrecs, "SQ", "SN", h->hrecs->ref[i].name))) {
2821
1.88k
            if ((tag = sam_hrecs_find_key(ty, "M5", NULL)))
2822
142
                r->ref_id[j]->fn = string_dup(r->pool, tag->str+3);
2823
2824
1.88k
            if ((tag = sam_hrecs_find_key(ty, "LN", NULL))) {
2825
                // LN tag used when constructing consensus reference
2826
1.88k
                r->ref_id[j]->LN_length = strtoll(tag->str+3, NULL, 0);
2827
                // See fuzz 382922241
2828
1.88k
                if (r->ref_id[j]->LN_length < 0)
2829
276
                    r->ref_id[j]->LN_length = 0;
2830
1.88k
            }
2831
1.88k
        }
2832
2833
1.88k
        k = kh_put(refs, r->h_meta, r->ref_id[j]->name, &n);
2834
1.88k
        if (n <= 0) // already exists or error
2835
0
            return -1;
2836
1.88k
        kh_val(r->h_meta, k) = r->ref_id[j];
2837
2838
1.88k
        j++;
2839
1.88k
    }
2840
2.24k
    r->nref = j;
2841
2842
2.24k
    return 0;
2843
2.24k
}
2844
2845
/*
2846
 * Attaches a header to a cram_fd.
2847
 *
2848
 * This should be used when creating a new cram_fd for writing where
2849
 * we have a header already constructed (eg from a file we've read
2850
 * in).
2851
 */
2852
1.83k
int cram_set_header2(cram_fd *fd, const sam_hdr_t *hdr) {
2853
1.83k
    if (!fd || !hdr )
2854
0
        return -1;
2855
2856
1.83k
    if (fd->header != hdr) {
2857
1.83k
        if (fd->header)
2858
0
            sam_hdr_destroy(fd->header);
2859
1.83k
        fd->header = sam_hdr_dup(hdr);
2860
1.83k
        if (!fd->header)
2861
0
            return -1;
2862
1.83k
    }
2863
1.83k
    return refs_from_header(fd);
2864
1.83k
}
2865
2866
0
int cram_set_header(cram_fd *fd, sam_hdr_t *hdr) {
2867
0
    return cram_set_header2(fd, hdr);
2868
0
}
2869
2870
/*
2871
 * Returns whether the path refers to a directory.
2872
 */
2873
0
static int is_directory(char *fn) {
2874
0
    struct stat buf;
2875
0
    if ( stat(fn,&buf) ) return 0;
2876
0
    return S_ISDIR(buf.st_mode);
2877
0
}
2878
2879
/*
2880
 * Converts a directory and a filename into an expanded path, replacing %s
2881
 * in directory with the filename and %[0-9]+s with portions of the filename
2882
 * Any remaining parts of filename are added to the end with /%s.
2883
 */
2884
0
static int expand_cache_path(char *path, char *dir, const char *fn) {
2885
0
    char *cp, *start = path;
2886
0
    size_t len;
2887
0
    size_t sz = PATH_MAX;
2888
2889
0
    while ((cp = strchr(dir, '%'))) {
2890
0
        if (cp-dir >= sz) return -1;
2891
0
        strncpy(path, dir, cp-dir);
2892
0
        path += cp-dir;
2893
0
        sz -= cp-dir;
2894
2895
0
        if (*++cp == 's') {
2896
0
            len = strlen(fn);
2897
0
            if (len >= sz) return -1;
2898
0
            strcpy(path, fn);
2899
0
            path += len;
2900
0
            sz -= len;
2901
0
            fn += len;
2902
0
            cp++;
2903
0
        } else if (*cp >= '0' && *cp <= '9') {
2904
0
            char *endp;
2905
0
            long l;
2906
2907
0
            l = strtol(cp, &endp, 10);
2908
0
            l = MIN(l, strlen(fn));
2909
0
            if (*endp == 's') {
2910
0
                if (l >= sz) return -1;
2911
0
                strncpy(path, fn, l);
2912
0
                path += l;
2913
0
                fn += l;
2914
0
                sz -= l;
2915
0
                *path = 0;
2916
0
                cp = endp+1;
2917
0
            } else {
2918
0
                if (sz < 3) return -1;
2919
0
                *path++ = '%';
2920
0
                *path++ = *cp++;
2921
0
            }
2922
0
        } else {
2923
0
            if (sz < 3) return -1;
2924
0
            *path++ = '%';
2925
0
            *path++ = *cp++;
2926
0
        }
2927
0
        dir = cp;
2928
0
    }
2929
2930
0
    len = strlen(dir);
2931
0
    if (len >= sz) return -1;
2932
0
    strcpy(path, dir);
2933
0
    path += len;
2934
0
    sz -= len;
2935
2936
0
    len = strlen(fn) + ((*fn && path > start && path[-1] != '/') ? 1 : 0);
2937
0
    if (len >= sz) return -1;
2938
0
    if (*fn && path > start && path[-1] != '/')
2939
0
        *path++ = '/';
2940
0
    strcpy(path, fn);
2941
0
    return 0;
2942
0
}
2943
2944
/*
2945
 * Make the directory containing path and any prefix directories.
2946
 */
2947
0
static void mkdir_prefix(char *path, int mode) {
2948
0
    char *cp = strrchr(path, '/');
2949
0
    if (!cp)
2950
0
        return;
2951
2952
0
    *cp = 0;
2953
0
    if (is_directory(path)) {
2954
0
        *cp = '/';
2955
0
        return;
2956
0
    }
2957
2958
0
    if (mkdir(path, mode) == 0) {
2959
0
        chmod(path, mode);
2960
0
        *cp = '/';
2961
0
        return;
2962
0
    }
2963
2964
0
    mkdir_prefix(path, mode);
2965
0
    mkdir(path, mode);
2966
0
    chmod(path, mode);
2967
0
    *cp = '/';
2968
0
}
2969
2970
/*
2971
 * Queries the M5 string from the header and attempts to populate the
2972
 * reference from this using the REF_PATH environment.
2973
 *
2974
 * Returns 0 on success
2975
 *        -1 on failure
2976
 */
2977
1.10k
static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
2978
1.10k
    char *ref_path = getenv("REF_PATH");
2979
1.10k
    sam_hrec_type_t *ty;
2980
1.10k
    sam_hrec_tag_t *tag;
2981
1.10k
    char path[PATH_MAX];
2982
1.10k
    kstring_t path_tmp = KS_INITIALIZE;
2983
1.10k
    char *local_cache = getenv("REF_CACHE");
2984
1.10k
    mFILE *mf;
2985
1.10k
    int local_path = 0;
2986
2987
1.10k
    hts_log_info("Running cram_populate_ref on fd %p, id %d", (void *)fd, id);
2988
2989
1.10k
    if (!r->name)
2990
0
        return -1;
2991
2992
1.10k
    if (!(ty = sam_hrecs_find_type_id(fd->header->hrecs, "SQ", "SN", r->name)))
2993
0
        return -1;
2994
2995
1.10k
    if (!(tag = sam_hrecs_find_key(ty, "M5", NULL)))
2996
1.10k
        goto no_M5;
2997
2998
5
    hts_log_info("Querying ref %s", tag->str+3);
2999
3000
    /* Use cache if available */
3001
5
    if (local_cache && *local_cache) {
3002
0
        struct stat sb;
3003
0
        if (expand_cache_path(path, local_cache, tag->str+3) == 0 &&
3004
0
            stat(path, &sb) == 0)
3005
            // Found it in the local cache
3006
0
            local_path = 1;
3007
0
    }
3008
3009
#ifndef HAVE_MMAP
3010
    char *path2;
3011
    /* Search local files in REF_PATH; we can open them and return as above */
3012
    if (!local_path && (path2 = find_path(tag->str+3, ref_path))) {
3013
        int len = snprintf(path, PATH_MAX, "%s", path2);
3014
        free(path2);
3015
        if (len > 0 && len < PATH_MAX) // in case it's too long
3016
            local_path = 1;
3017
    }
3018
#endif
3019
3020
    /* Found via REF_CACHE or local REF_PATH file */
3021
5
    if (local_path) {
3022
0
        struct stat sb;
3023
0
        BGZF *fp;
3024
3025
0
        if (0 == stat(path, &sb)
3026
0
            && S_ISREG(sb.st_mode)
3027
0
            && (fp = bgzf_open(path, "r"))) {
3028
0
            r->length = sb.st_size;
3029
0
            r->offset = r->line_length = r->bases_per_line = 0;
3030
3031
0
            r->fn = string_dup(fd->refs->pool, path);
3032
3033
0
            if (fd->refs->fp)
3034
0
                if (bgzf_close(fd->refs->fp) != 0)
3035
0
                    return -1;
3036
0
            fd->refs->fp = fp;
3037
0
            fd->refs->fn = r->fn;
3038
0
            r->is_md5 = 1;
3039
0
            r->validated_md5 = 1;
3040
3041
            // Fall back to cram_get_ref() where it'll do the actual
3042
            // reading of the file.
3043
0
            return 0;
3044
0
        }
3045
0
    }
3046
3047
3048
    /* Otherwise search full REF_PATH; slower as loads entire file */
3049
5
    int is_local = 0;
3050
5
    if ((mf = open_path_mfile(tag->str+3, ref_path, NULL, &is_local))) {
3051
0
        size_t sz;
3052
0
        r->seq = mfsteal(mf, &sz);
3053
0
        if (r->seq) {
3054
0
            r->mf = NULL;
3055
0
        } else {
3056
            // keep mf around as we couldn't detach
3057
0
            r->seq = mf->data;
3058
0
            r->mf = mf;
3059
0
        }
3060
0
        r->length = sz;
3061
0
        r->is_md5 = 1;
3062
0
        r->validated_md5 = 1;
3063
5
    } else {
3064
5
        refs_t *refs;
3065
5
        const char *fn;
3066
5
        sam_hrec_tag_t *UR_tag;
3067
3068
1.10k
    no_M5:
3069
        /* Failed to find in search path or M5 cache, see if @SQ UR: tag? */
3070
1.10k
        if (!(UR_tag = sam_hrecs_find_key(ty, "UR", NULL)))
3071
463
            return -1;
3072
3073
646
        if (strstr(UR_tag->str+3, "://") &&
3074
34
            strncmp(UR_tag->str+3, "file:", 5) != 0) {
3075
            // Documented as omitted, but accidentally supported until now
3076
18
            hts_log_error("UR tags pointing to remote files are not supported");
3077
18
            return -1;
3078
18
        }
3079
3080
628
        fn = (strncmp(UR_tag->str+3, "file:", 5) == 0)
3081
628
            ? UR_tag->str+8
3082
628
            : UR_tag->str+3;
3083
3084
628
        if (fd->refs->fp) {
3085
0
            if (bgzf_close(fd->refs->fp) != 0)
3086
0
                return -1;
3087
0
            fd->refs->fp = NULL;
3088
0
        }
3089
628
        if (!(refs = refs_load_fai(fd->refs, fn, 0)))
3090
628
            return -1;
3091
0
        sanitise_SQ_lines(fd);
3092
3093
0
        fd->refs = refs;
3094
0
        if (fd->refs->fp) {
3095
0
            if (bgzf_close(fd->refs->fp) != 0)
3096
0
                return -1;
3097
0
            fd->refs->fp = NULL;
3098
0
        }
3099
3100
0
        if (!fd->refs->fn)
3101
0
            return -1;
3102
3103
0
        if (-1 == refs2id(fd->refs, fd->header))
3104
0
            return -1;
3105
0
        if (!fd->refs->ref_id || !fd->refs->ref_id[id])
3106
0
            return -1;
3107
3108
        // Local copy already, so fall back to cram_get_ref().
3109
0
        return 0;
3110
0
    }
3111
3112
    /* Populate the local disk cache if required */
3113
0
    if (!is_local && local_cache && *local_cache) {
3114
0
        hFILE *fp;
3115
3116
0
        if (expand_cache_path(path, local_cache, tag->str+3) < 0) {
3117
0
            return 0; // Not fatal - we have the data already so keep going.
3118
0
        }
3119
0
        hts_log_info("Writing cache file '%s'", path);
3120
0
        mkdir_prefix(path, 01777);
3121
3122
0
        fp = hts_open_tmpfile(path, "wx", &path_tmp);
3123
0
        if (!fp) {
3124
0
            perror(path_tmp.s);
3125
0
            free(path_tmp.s);
3126
3127
            // Not fatal - we have the data already so keep going.
3128
0
            return 0;
3129
0
        }
3130
3131
        // Check md5sum
3132
0
        hts_md5_context *md5;
3133
0
        char unsigned md5_buf1[16];
3134
0
        char md5_buf2[33];
3135
3136
0
        if (!(md5 = hts_md5_init())) {
3137
0
            hclose_abruptly(fp);
3138
0
            unlink(path_tmp.s);
3139
0
            free(path_tmp.s);
3140
0
            return -1;
3141
0
        }
3142
0
        hts_md5_update(md5, r->seq, r->length);
3143
0
        hts_md5_final(md5_buf1, md5);
3144
0
        hts_md5_destroy(md5);
3145
0
        hts_md5_hex(md5_buf2, md5_buf1);
3146
3147
0
        if (strncmp(tag->str+3, md5_buf2, 32) != 0) {
3148
0
            hts_log_error("Mismatching md5sum for downloaded reference");
3149
0
            hclose_abruptly(fp);
3150
0
            unlink(path_tmp.s);
3151
0
            free(path_tmp.s);
3152
0
            return -1;
3153
0
        }
3154
3155
0
        ssize_t length_written = hwrite(fp, r->seq, r->length);
3156
0
        if (hclose(fp) < 0 || length_written != r->length ||
3157
0
            chmod(path_tmp.s, 0444) < 0 ||
3158
0
            rename(path_tmp.s, path) < 0) {
3159
0
            hts_log_error("Creating reference at %s failed: %s",
3160
0
                          path, strerror(errno));
3161
0
            unlink(path_tmp.s);
3162
0
        }
3163
0
    }
3164
3165
0
    free(path_tmp.s);
3166
0
    return 0;
3167
0
}
3168
3169
641
static void cram_ref_incr_locked(refs_t *r, int id) {
3170
641
    RP("%d INC REF %d, %d %p\n", gettid(), id,
3171
641
       (int)(id>=0 && r->ref_id[id]?r->ref_id[id]->count+1:-999),
3172
641
       id>=0 && r->ref_id[id]?r->ref_id[id]->seq:(char *)1);
3173
3174
641
    if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq)
3175
641
        return;
3176
3177
0
    if (r->last_id == id)
3178
0
        r->last_id = -1;
3179
3180
0
    ++r->ref_id[id]->count;
3181
0
}
3182
3183
641
void cram_ref_incr(refs_t *r, int id) {
3184
641
    pthread_mutex_lock(&r->lock);
3185
641
    cram_ref_incr_locked(r, id);
3186
641
    pthread_mutex_unlock(&r->lock);
3187
641
}
3188
3189
63
static void cram_ref_decr_locked(refs_t *r, int id) {
3190
63
    RP("%d DEC REF %d, %d %p\n", gettid(), id,
3191
63
       (int)(id>=0 && r->ref_id[id]?r->ref_id[id]->count-1:-999),
3192
63
       id>=0 && r->ref_id[id]?r->ref_id[id]->seq:(char *)1);
3193
3194
63
    if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq) {
3195
63
        return;
3196
63
    }
3197
3198
0
    if (--r->ref_id[id]->count <= 0) {
3199
0
        assert(r->ref_id[id]->count == 0);
3200
0
        if (r->last_id >= 0) {
3201
0
            if (r->ref_id[r->last_id]->count <= 0 &&
3202
0
                r->ref_id[r->last_id]->seq) {
3203
0
                RP("%d FREE REF %d (%p)\n", gettid(),
3204
0
                   r->last_id, r->ref_id[r->last_id]->seq);
3205
0
                ref_entry_free_seq(r->ref_id[r->last_id]);
3206
0
                if (r->ref_id[r->last_id]->is_md5) r->ref_id[r->last_id]->length = 0;
3207
0
            }
3208
0
        }
3209
0
        r->last_id = id;
3210
0
    }
3211
0
}
3212
3213
63
void cram_ref_decr(refs_t *r, int id) {
3214
63
    pthread_mutex_lock(&r->lock);
3215
63
    cram_ref_decr_locked(r, id);
3216
63
    pthread_mutex_unlock(&r->lock);
3217
63
}
3218
3219
/*
3220
 * Used by cram_ref_load and cram_get_ref. The file handle will have
3221
 * already been opened, so we can catch it. The ref_entry *e informs us
3222
 * of whether this is a multi-line fasta file or a raw MD5 style file.
3223
 * Either way we create a single contiguous sequence.
3224
 *
3225
 * Returns all or part of a reference sequence on success (malloced);
3226
 *         NULL on failure.
3227
 */
3228
static char *load_ref_portion(BGZF *fp, ref_entry *e,
3229
0
                              hts_pos_t start, hts_pos_t end) {
3230
0
    off_t offset, len;
3231
0
    char *seq;
3232
3233
0
    if (end < start)
3234
0
        end = start;
3235
3236
    /*
3237
     * Compute locations in file. This is trivial for the MD5 files, but
3238
     * is still necessary for the fasta variants.
3239
     *
3240
     * Note the offset here, as with faidx, has the assumption that white-
3241
     * space (the diff between line_length and bases_per_line) only occurs
3242
     * at the end of a line of text.
3243
     */
3244
0
    offset = e->line_length
3245
0
        ? e->offset + (start-1)/e->bases_per_line * e->line_length +
3246
0
          (start-1) % e->bases_per_line
3247
0
        : start-1;
3248
3249
0
    len = (e->line_length
3250
0
           ? e->offset + (end-1)/e->bases_per_line * e->line_length +
3251
0
             (end-1) % e->bases_per_line
3252
0
           : end-1) - offset + 1;
3253
3254
0
    if (bgzf_useek(fp, offset, SEEK_SET) < 0) {
3255
0
        perror("bgzf_useek() on reference file");
3256
0
        return NULL;
3257
0
    }
3258
3259
0
    if (len == 0 || !(seq = malloc(len))) {
3260
0
        return NULL;
3261
0
    }
3262
3263
0
    if (len != bgzf_read(fp, seq, len)) {
3264
0
        perror("bgzf_read() on reference file");
3265
0
        free(seq);
3266
0
        return NULL;
3267
0
    }
3268
3269
    /* Strip white-space if required. */
3270
0
    if (len != end-start+1) {
3271
0
        hts_pos_t i, j;
3272
0
        char *cp = seq;
3273
0
        char *cp_to;
3274
3275
        // Copy up to the first white-space, and then repeatedly just copy
3276
        // bases_per_line verbatim, and use the slow method to end again.
3277
        //
3278
        // This may seem excessive, but this code can be a significant
3279
        // portion of total CRAM decode CPU time for shallow data sets.
3280
0
        for (i = j = 0; i < len; i++) {
3281
0
            if (!isspace_c(cp[i]))
3282
0
                cp[j++] = cp[i] & ~0x20;
3283
0
            else
3284
0
                break;
3285
0
        }
3286
0
        while (i < len && isspace_c(cp[i]))
3287
0
            i++;
3288
0
        while (i < len - e->line_length) {
3289
0
            hts_pos_t j_end = j + e->bases_per_line;
3290
0
            while (j < j_end)
3291
0
                cp[j++] = cp[i++] & ~0x20; // toupper equiv
3292
0
            i += e->line_length - e->bases_per_line;
3293
0
        }
3294
0
        for (; i < len; i++) {
3295
0
            if (!isspace_c(cp[i]))
3296
0
                cp[j++] = cp[i] & ~0x20;
3297
0
        }
3298
3299
0
        cp_to = cp+j;
3300
3301
0
        if (cp_to - seq != end-start+1) {
3302
0
            hts_log_error("Malformed reference file");
3303
0
            free(seq);
3304
0
            return NULL;
3305
0
        }
3306
0
    } else {
3307
0
        int i;
3308
0
        for (i = 0; i < len; i++) {
3309
0
            seq[i] = toupper_c(seq[i]);
3310
0
        }
3311
0
    }
3312
3313
0
    return seq;
3314
0
}
3315
3316
/*
3317
 * Load the entire reference 'id'.
3318
 * This also increments the reference count by 1.
3319
 *
3320
 * Returns ref_entry on success;
3321
 *         NULL on failure
3322
 */
3323
0
ref_entry *cram_ref_load(refs_t *r, int id, int is_md5) {
3324
0
    ref_entry *e = r->ref_id[id];
3325
0
    hts_pos_t start = 1, end = e->length;
3326
0
    char *seq;
3327
3328
0
    if (e->seq) {
3329
0
        return e;
3330
0
    }
3331
3332
0
    assert(e->count == 0);
3333
3334
0
    if (r->last) {
3335
#ifdef REF_DEBUG
3336
        int idx = 0;
3337
        for (idx = 0; idx < r->nref; idx++)
3338
            if (r->last == r->ref_id[idx])
3339
                break;
3340
        RP("%d cram_ref_load DECR %d\n", gettid(), idx);
3341
#endif
3342
0
        assert(r->last->count > 0);
3343
0
        if (--r->last->count <= 0) {
3344
0
            RP("%d FREE REF %d (%p)\n", gettid(), id, r->ref_id[id]->seq);
3345
0
            if (r->last->seq)
3346
0
                ref_entry_free_seq(r->last);
3347
0
        }
3348
0
    }
3349
3350
0
    if (!r->fn)
3351
0
        return NULL;
3352
3353
    /* Open file if it's not already the current open reference */
3354
0
    if (strcmp(r->fn, e->fn) || r->fp == NULL) {
3355
0
        if (r->fp)
3356
0
            if (bgzf_close(r->fp) != 0)
3357
0
                return NULL;
3358
0
        r->fn = e->fn;
3359
0
        if (!(r->fp = bgzf_open_ref(r->fn, "r", is_md5)))
3360
0
            return NULL;
3361
0
    }
3362
3363
0
    RP("%d Loading ref %d (%d..%d)\n", gettid(), id, start, end);
3364
3365
0
    if (!(seq = load_ref_portion(r->fp, e, start, end))) {
3366
0
        return NULL;
3367
0
    }
3368
3369
0
    RP("%d Loaded ref %d (%d..%d) = %p\n", gettid(), id, start, end, seq);
3370
3371
0
    RP("%d INC REF %d, %"PRId64"\n", gettid(), id, (e->count+1));
3372
0
    e->seq = seq;
3373
0
    e->mf = NULL;
3374
0
    e->count++;
3375
3376
    /*
3377
     * Also keep track of last used ref so incr/decr loops on the same
3378
     * sequence don't cause load/free loops.
3379
     */
3380
0
    RP("%d cram_ref_load INCR %d => %"PRId64"\n", gettid(), id, e->count+1);
3381
0
    r->last = e;
3382
0
    e->count++;
3383
3384
0
    return e;
3385
0
}
3386
3387
/*
3388
 * Returns a portion of a reference sequence from start to end inclusive.
3389
 * The returned pointer is owned by either the cram_file fd or by the
3390
 * internal refs_t structure and should not be freed  by the caller.
3391
 *
3392
 * The difference is whether or not this refs_t is in use by just the one
3393
 * cram_fd or by multiples, or whether we have multiple threads accessing
3394
 * references. In either case fd->shared will be true and we start using
3395
 * reference counting to track the number of users of a specific reference
3396
 * sequence.
3397
 *
3398
 * Otherwise the ref seq returned is allocated as part of cram_fd itself
3399
 * and will be freed up on the next call to cram_get_ref or cram_close.
3400
 *
3401
 * To return the entire reference sequence, specify start as 1 and end
3402
 * as 0.
3403
 *
3404
 * To cease using a reference, call cram_ref_decr().
3405
 *
3406
 * Returns reference on success,
3407
 *         NULL on failure
3408
 */
3409
2.82k
char *cram_get_ref(cram_fd *fd, int id, hts_pos_t start, hts_pos_t end) {
3410
2.82k
    ref_entry *r;
3411
2.82k
    char *seq;
3412
2.82k
    int ostart = start;
3413
3414
2.82k
    if (id == -1 || start < 1)
3415
1.72k
        return NULL;
3416
3417
    /* FIXME: axiomatic query of r->seq being true?
3418
     * Or shortcut for unsorted data where we load once and never free?
3419
     */
3420
3421
    //fd->shared_ref = 1; // hard code for now to simplify things
3422
3423
1.10k
    pthread_mutex_lock(&fd->ref_lock);
3424
3425
1.10k
    RP("%d cram_get_ref on fd %p, id %d, range %d..%d\n", gettid(), fd, id, start, end);
3426
3427
    /*
3428
     * Unsorted data implies we want to fetch an entire reference at a time.
3429
     * We just deal with this at the moment by claiming we're sharing
3430
     * references instead, which has the same requirement.
3431
     */
3432
1.10k
    if (fd->unsorted)
3433
0
        fd->shared_ref = 1;
3434
3435
3436
    /* Sanity checking: does this ID exist? */
3437
1.10k
    if (!fd->refs || id < 0 || id >= fd->refs->nref || !fd->refs->ref_id[id]) {
3438
0
        hts_log_error("No reference found for id %d", id);
3439
0
        pthread_mutex_unlock(&fd->ref_lock);
3440
0
        return NULL;
3441
0
    }
3442
3443
1.10k
    r = fd->refs->ref_id[id];
3444
3445
    /*
3446
     * It has an entry, but may not have been populated yet.
3447
     * Any manually loaded .fai files have their lengths known.
3448
     * A ref entry computed from @SQ lines (M5 or UR field) will have
3449
     * r->length == 0 unless it's been loaded once and verified that we have
3450
     * an on-disk filename for it.
3451
     *
3452
     * 19 Sep 2013: Moved the lock here as the cram_populate_ref code calls
3453
     * open_path_mfile and libcurl, which isn't multi-thread safe unless I
3454
     * rewrite my code to have one curl handle per thread.
3455
     */
3456
1.10k
    pthread_mutex_lock(&fd->refs->lock);
3457
1.10k
    if (r->length == 0) {
3458
1.10k
        if (fd->ref_fn)
3459
0
            hts_log_warning("Reference file given, but ref '%s' not present",
3460
1.10k
                            r->name);
3461
1.10k
        if (cram_populate_ref(fd, id, r) == -1) {
3462
1.10k
            hts_log_warning("Failed to populate reference \"%s\"",
3463
1.10k
                            r->name);
3464
1.10k
            hts_log_warning("See https://www.htslib.org/doc/reference_seqs.html for further suggestions");
3465
1.10k
            pthread_mutex_unlock(&fd->refs->lock);
3466
1.10k
            pthread_mutex_unlock(&fd->ref_lock);
3467
1.10k
            return NULL;
3468
1.10k
        }
3469
0
        r = fd->refs->ref_id[id];
3470
0
        if (fd->unsorted)
3471
0
            cram_ref_incr_locked(fd->refs, id);
3472
0
    }
3473
3474
3475
    /*
3476
     * We now know that we the filename containing the reference, so check
3477
     * for limits. If it's over half the reference we'll load all of it in
3478
     * memory as this will speed up subsequent calls.
3479
     */
3480
0
    if (end < 1)
3481
0
        end = r->length;
3482
0
    if (end >= r->length)
3483
0
        end  = r->length;
3484
3485
0
    if (end - start >= 0.5*r->length || fd->shared_ref) {
3486
0
        start = 1;
3487
0
        end = r->length;
3488
0
    }
3489
3490
    /*
3491
     * Maybe we have it cached already? If so use it.
3492
     *
3493
     * Alternatively if we don't have the sequence but we're sharing
3494
     * references and/or are asking for the entire length of it, then
3495
     * load the full reference into the refs structure and return
3496
     * a pointer to that one instead.
3497
     */
3498
0
    if (fd->shared_ref || r->seq || (start == 1 && end == r->length)) {
3499
0
        char *cp;
3500
3501
0
        if (id >= 0) {
3502
0
            if (r->seq) {
3503
0
                cram_ref_incr_locked(fd->refs, id);
3504
0
            } else {
3505
0
                ref_entry *e;
3506
0
                if (!(e = cram_ref_load(fd->refs, id, r->is_md5))) {
3507
0
                    pthread_mutex_unlock(&fd->refs->lock);
3508
0
                    pthread_mutex_unlock(&fd->ref_lock);
3509
0
                    return NULL;
3510
0
                }
3511
3512
                /* unsorted data implies cache ref indefinitely, to avoid
3513
                 * continually loading and unloading.
3514
                 */
3515
0
                if (fd->unsorted)
3516
0
                    cram_ref_incr_locked(fd->refs, id);
3517
0
            }
3518
3519
0
            fd->ref = NULL; /* We never access it directly */
3520
0
            fd->ref_start = 1;
3521
0
            fd->ref_end   = r->length;
3522
0
            fd->ref_id    = id;
3523
3524
0
            cp = fd->refs->ref_id[id]->seq + ostart-1;
3525
0
        } else {
3526
0
            fd->ref = NULL;
3527
0
            cp = NULL;
3528
0
        }
3529
3530
0
        RP("%d cram_get_ref returning for id %d, count %d\n", gettid(), id, (int)r->count);
3531
3532
0
        pthread_mutex_unlock(&fd->refs->lock);
3533
0
        pthread_mutex_unlock(&fd->ref_lock);
3534
0
        return cp;
3535
0
    }
3536
3537
    /*
3538
     * Otherwise we're not sharing, we don't have a copy of it already and
3539
     * we're only asking for a small portion of it.
3540
     *
3541
     * In this case load up just that segment ourselves, freeing any old
3542
     * small segments in the process.
3543
     */
3544
3545
    /* Unmapped ref ID */
3546
0
    if (id < 0 || !fd->refs->fn) {
3547
0
        if (fd->ref_free) {
3548
0
            free(fd->ref_free);
3549
0
            fd->ref_free = NULL;
3550
0
        }
3551
0
        fd->ref = NULL;
3552
0
        fd->ref_id = id;
3553
0
        pthread_mutex_unlock(&fd->refs->lock);
3554
0
        pthread_mutex_unlock(&fd->ref_lock);
3555
0
        return NULL;
3556
0
    }
3557
3558
    /* Open file if it's not already the current open reference */
3559
0
    if (strcmp(fd->refs->fn, r->fn) || fd->refs->fp == NULL) {
3560
0
        if (fd->refs->fp)
3561
0
            if (bgzf_close(fd->refs->fp) != 0)
3562
0
                return NULL;
3563
0
        fd->refs->fn = r->fn;
3564
0
        if (!(fd->refs->fp = bgzf_open_ref(fd->refs->fn, "r", r->is_md5))) {
3565
0
            pthread_mutex_unlock(&fd->refs->lock);
3566
0
            pthread_mutex_unlock(&fd->ref_lock);
3567
0
            return NULL;
3568
0
        }
3569
0
    }
3570
3571
0
    if (!(fd->ref = load_ref_portion(fd->refs->fp, r, start, end))) {
3572
0
        pthread_mutex_unlock(&fd->refs->lock);
3573
0
        pthread_mutex_unlock(&fd->ref_lock);
3574
0
        return NULL;
3575
0
    }
3576
3577
0
    if (fd->ref_free)
3578
0
        free(fd->ref_free);
3579
3580
0
    fd->ref_id    = id;
3581
0
    fd->ref_start = start;
3582
0
    fd->ref_end   = end;
3583
0
    fd->ref_free = fd->ref;
3584
0
    seq = fd->ref;
3585
3586
0
    pthread_mutex_unlock(&fd->refs->lock);
3587
0
    pthread_mutex_unlock(&fd->ref_lock);
3588
3589
0
    return seq ? seq + ostart - start : NULL;
3590
0
}
3591
3592
/*
3593
 * If fd has been opened for reading, it may be permitted to specify 'fn'
3594
 * as NULL and let the code auto-detect the reference by parsing the
3595
 * SAM header @SQ lines.
3596
 */
3597
0
int cram_load_reference(cram_fd *fd, char *fn) {
3598
0
    int ret = 0;
3599
3600
0
    if (fn) {
3601
0
        fd->refs = refs_load_fai(fd->refs, fn,
3602
0
                                 !(fd->embed_ref>0 && fd->mode == 'r'));
3603
0
        fn = fd->refs ? fd->refs->fn : NULL;
3604
0
        if (!fn)
3605
0
            ret = -1;
3606
0
        sanitise_SQ_lines(fd);
3607
0
    }
3608
0
    fd->ref_fn = fn;
3609
3610
0
    if ((!fd->refs || (fd->refs->nref == 0 && !fn)) && fd->header) {
3611
0
        if (fd->refs)
3612
0
            refs_free(fd->refs);
3613
0
        if (!(fd->refs = refs_create()))
3614
0
            return -1;
3615
0
        if (-1 == refs_from_header(fd))
3616
0
            return -1;
3617
0
    }
3618
3619
0
    if (fd->header)
3620
0
        if (-1 == refs2id(fd->refs, fd->header))
3621
0
            return -1;
3622
3623
0
    return ret;
3624
0
}
3625
3626
/* ----------------------------------------------------------------------
3627
 * Containers
3628
 */
3629
3630
/*
3631
 * Creates a new container, specifying the maximum number of slices
3632
 * and records permitted.
3633
 *
3634
 * Returns cram_container ptr on success
3635
 *         NULL on failure
3636
 */
3637
42.6k
cram_container *cram_new_container(int nrec, int nslice) {
3638
42.6k
    cram_container *c = calloc(1, sizeof(*c));
3639
42.6k
    enum cram_DS_ID id;
3640
3641
42.6k
    if (!c)
3642
0
        return NULL;
3643
3644
42.6k
    c->curr_ref = -2;
3645
3646
42.6k
    c->max_c_rec = nrec * nslice;
3647
42.6k
    c->curr_c_rec = 0;
3648
3649
42.6k
    c->max_rec = nrec;
3650
42.6k
    c->record_counter = 0;
3651
42.6k
    c->num_bases = 0;
3652
42.6k
    c->s_num_bases = 0;
3653
3654
42.6k
    c->max_slice = nslice;
3655
42.6k
    c->curr_slice = 0;
3656
3657
42.6k
    c->pos_sorted = 1;
3658
42.6k
    c->max_apos   = 0;
3659
42.6k
    c->multi_seq  = 0;
3660
42.6k
    c->qs_seq_orient = 1;
3661
42.6k
    c->no_ref = 0;
3662
42.6k
    c->embed_ref = -1; // automatic selection
3663
3664
42.6k
    c->bams = NULL;
3665
3666
42.6k
    if (!(c->slices = calloc(nslice != 0 ? nslice : 1, sizeof(cram_slice *))))
3667
0
        goto err;
3668
42.6k
    c->slice = NULL;
3669
3670
42.6k
    if (!(c->comp_hdr = cram_new_compression_header()))
3671
0
        goto err;
3672
42.6k
    c->comp_hdr_block = NULL;
3673
3674
1.23M
    for (id = DS_RN; id < DS_TN; id++)
3675
1.19M
        if (!(c->stats[id] = cram_stats_create())) goto err;
3676
3677
    //c->aux_B_stats = cram_stats_create();
3678
3679
42.6k
    if (!(c->tags_used = kh_init(m_tagmap)))
3680
0
        goto err;
3681
42.6k
    c->refs_used = 0;
3682
42.6k
    c->ref_free = 0;
3683
3684
42.6k
    return c;
3685
3686
0
 err:
3687
0
    if (c) {
3688
0
        if (c->slices)
3689
0
            free(c->slices);
3690
0
        free(c);
3691
0
    }
3692
0
    return NULL;
3693
42.6k
}
3694
3695
1.21k
static void free_bam_list(bam_seq_t **bams, int max_rec) {
3696
1.21k
    int i;
3697
12.1M
    for (i = 0; i < max_rec; i++)
3698
12.1M
        bam_free(bams[i]);
3699
3700
1.21k
    free(bams);
3701
1.21k
}
3702
3703
50.2k
void cram_free_container(cram_container *c) {
3704
50.2k
    enum cram_DS_ID id;
3705
50.2k
    int i;
3706
3707
50.2k
    if (!c)
3708
0
        return;
3709
3710
50.2k
    if (c->refs_used)
3711
675
        free(c->refs_used);
3712
3713
50.2k
    if (c->landmark)
3714
42.8k
        free(c->landmark);
3715
3716
50.2k
    if (c->comp_hdr)
3717
42.7k
        cram_free_compression_header(c->comp_hdr);
3718
3719
50.2k
    if (c->comp_hdr_block)
3720
41.5k
        cram_free_block(c->comp_hdr_block);
3721
3722
    // Free the slices; filled out by encoder only
3723
50.2k
    if (c->slices) {
3724
83.6k
        for (i = 0; i < c->max_slice; i++) {
3725
40.9k
            if (c->slices[i])
3726
1.21k
                cram_free_slice(c->slices[i]);
3727
40.9k
            if (c->slices[i] == c->slice)
3728
40.9k
                c->slice = NULL;
3729
40.9k
        }
3730
42.6k
        free(c->slices);
3731
42.6k
    }
3732
3733
    // Free the current slice; set by both encoder & decoder
3734
50.2k
    if (c->slice) {
3735
0
        cram_free_slice(c->slice);
3736
0
        c->slice = NULL;
3737
0
    }
3738
3739
1.45M
    for (id = DS_RN; id < DS_TN; id++)
3740
1.40M
        if (c->stats[id]) cram_stats_free(c->stats[id]);
3741
3742
    //if (c->aux_B_stats) cram_stats_free(c->aux_B_stats);
3743
3744
50.2k
    if (c->tags_used) {
3745
42.6k
        khint_t k;
3746
3747
295k
        for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
3748
252k
            if (!kh_exist(c->tags_used, k))
3749
128k
                continue;
3750
3751
123k
            cram_tag_map *tm = (cram_tag_map *)kh_val(c->tags_used, k);
3752
123k
            if (tm) {
3753
123k
                cram_codec *c = tm->codec;
3754
3755
123k
                if (c) c->free(c);
3756
3757
                // If tm->blk or tm->blk2 is set, then we haven't yet got to
3758
                // cram_encode_container which copies the blocks to s->aux_block
3759
                // and NULLifies tm->blk*.  In this case we failed to complete
3760
                // the container construction, so we have to free up our partially
3761
                // converted CRAM.
3762
123k
                cram_free_block(tm->blk);
3763
123k
                cram_free_block(tm->blk2);
3764
123k
                free(tm);
3765
123k
            }
3766
123k
        }
3767
3768
42.6k
        kh_destroy(m_tagmap, c->tags_used);
3769
42.6k
    }
3770
3771
50.2k
    if (c->ref_free)
3772
19.6k
        free(c->ref);
3773
3774
50.2k
    if (c->bams)
3775
145
        free_bam_list(c->bams, c->max_c_rec);
3776
3777
50.2k
    free(c);
3778
50.2k
}
3779
3780
/*
3781
 * Reads a container header.
3782
 *
3783
 * Returns cram_container on success
3784
 *         NULL on failure or no container left (fd->err == 0).
3785
 */
3786
7.56k
cram_container *cram_read_container(cram_fd *fd) {
3787
7.56k
    cram_container c2, *c;
3788
7.56k
    int i, s;
3789
7.56k
    size_t rd = 0;
3790
7.56k
    uint32_t crc = 0;
3791
3792
7.56k
    fd->err = 0;
3793
7.56k
    fd->eof = 0;
3794
3795
7.56k
    memset(&c2, 0, sizeof(c2));
3796
7.56k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
3797
5.47k
        if ((s = fd->vv.varint_decode32_crc(fd, &c2.length, &crc)) == -1) {
3798
0
            fd->eof = fd->empty_container ? 1 : 2;
3799
0
            return NULL;
3800
5.47k
        } else {
3801
5.47k
            rd+=s;
3802
5.47k
        }
3803
5.47k
    } else if (CRAM_MAJOR_VERS(fd->version) < 4) {
3804
538
        uint32_t len;
3805
538
        if ((s = int32_decode(fd, &c2.length)) == -1) {
3806
3
            if (CRAM_MAJOR_VERS(fd->version) == 2 &&
3807
0
                CRAM_MINOR_VERS(fd->version) == 0)
3808
0
                fd->eof = 1; // EOF blocks arrived in v2.1
3809
3
            else
3810
3
                fd->eof = fd->empty_container ? 1 : 2;
3811
3
            return NULL;
3812
535
        } else {
3813
535
            rd+=s;
3814
535
        }
3815
535
        len = le_int4(c2.length);
3816
535
        crc = crc32(0L, (unsigned char *)&len, 4);
3817
1.55k
    } else {
3818
1.55k
        if ((s = fd->vv.varint_decode32_crc(fd, &c2.length, &crc))   == -1) {
3819
0
            fd->eof = fd->empty_container ? 1 : 2;
3820
0
            return NULL;
3821
1.55k
        } else {
3822
1.55k
            rd+=s;
3823
1.55k
        }
3824
1.55k
    }
3825
7.56k
    if ((s = fd->vv.varint_decode32s_crc(fd, &c2.ref_seq_id, &crc))   == -1) return NULL; else rd+=s;
3826
7.56k
    if (CRAM_MAJOR_VERS(fd->version) >= 4) {
3827
1.55k
        int64_t i64;
3828
1.55k
        if ((s = fd->vv.varint_decode64_crc(fd, &i64, &crc))== -1) return NULL; else rd+=s;
3829
1.55k
        c2.ref_seq_start = i64;
3830
1.55k
        if ((s = fd->vv.varint_decode64_crc(fd, &i64, &crc)) == -1) return NULL; else rd+=s;
3831
1.55k
        c2.ref_seq_span = i64;
3832
6.01k
    } else {
3833
6.01k
        int32_t i32;
3834
6.01k
        if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc))== -1) return NULL; else rd+=s;
3835
6.01k
        c2.ref_seq_start = i32;
3836
6.01k
        if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc)) == -1) return NULL; else rd+=s;
3837
6.01k
        c2.ref_seq_span = i32;
3838
6.01k
    }
3839
7.56k
    if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_records, &crc))  == -1) return NULL; else rd+=s;
3840
3841
7.56k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
3842
5.47k
        c2.record_counter = 0;
3843
5.47k
        c2.num_bases = 0;
3844
5.47k
    } else {
3845
2.08k
        if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3846
2.08k
            if ((s = fd->vv.varint_decode64_crc(fd, &c2.record_counter, &crc)) == -1)
3847
0
                return NULL;
3848
2.08k
            else
3849
2.08k
                rd += s;
3850
2.08k
        } else {
3851
2
            int32_t i32;
3852
2
            if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc)) == -1)
3853
0
                return NULL;
3854
2
            else
3855
2
                rd += s;
3856
2
            c2.record_counter = i32;
3857
2
        }
3858
3859
2.08k
        if ((s = fd->vv.varint_decode64_crc(fd, &c2.num_bases, &crc))== -1)
3860
0
            return NULL;
3861
2.08k
        else
3862
2.08k
            rd += s;
3863
2.08k
    }
3864
7.56k
    if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_blocks, &crc))   == -1)
3865
3
        return NULL;
3866
7.56k
    else
3867
7.56k
        rd+=s;
3868
7.56k
    if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_landmarks, &crc))== -1)
3869
0
        return NULL;
3870
7.56k
    else
3871
7.56k
        rd+=s;
3872
3873
7.56k
    if (c2.num_landmarks < 0 || c2.num_landmarks >= SIZE_MAX / sizeof(int32_t))
3874
12
        return NULL;
3875
3876
7.54k
    if (!(c = calloc(1, sizeof(*c))))
3877
0
        return NULL;
3878
3879
7.54k
    *c = c2;
3880
7.54k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
3881
7.54k
    if (c->num_landmarks > FUZZ_ALLOC_LIMIT/sizeof(int32_t)) {
3882
0
        fd->err = errno = ENOMEM;
3883
0
        cram_free_container(c);
3884
0
        return NULL;
3885
0
    }
3886
7.54k
#endif
3887
7.54k
    if (c->num_landmarks && !(c->landmark = malloc(c->num_landmarks * sizeof(int32_t)))) {
3888
0
        fd->err = errno;
3889
0
        cram_free_container(c);
3890
0
        return NULL;
3891
0
    }
3892
34.3k
    for (i = 0; i < c->num_landmarks; i++) {
3893
26.7k
        if ((s = fd->vv.varint_decode32_crc(fd, &c->landmark[i], &crc)) == -1) {
3894
6
            cram_free_container(c);
3895
6
            return NULL;
3896
26.7k
        } else {
3897
26.7k
            rd += s;
3898
26.7k
        }
3899
26.7k
    }
3900
3901
7.54k
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3902
2.07k
        if (-1 == int32_decode(fd, (int32_t *)&c->crc32)) {
3903
0
            cram_free_container(c);
3904
0
            return NULL;
3905
2.07k
        } else {
3906
2.07k
            rd+=4;
3907
2.07k
        }
3908
3909
2.07k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
3910
        // Pretend the CRC was OK so the fuzzer doesn't have to get it right
3911
2.07k
        crc = c->crc32;
3912
2.07k
#endif
3913
3914
2.07k
        if (crc != c->crc32) {
3915
0
            hts_log_error("Container header CRC32 failure");
3916
0
            cram_free_container(c);
3917
0
            return NULL;
3918
0
        }
3919
2.07k
    }
3920
3921
7.54k
    c->offset = rd;
3922
7.54k
    c->slices = NULL;
3923
7.54k
    c->slice = NULL;
3924
7.54k
    c->curr_slice = 0;
3925
7.54k
    c->max_slice = c->num_landmarks;
3926
7.54k
    c->slice_rec = 0;
3927
7.54k
    c->curr_rec = 0;
3928
7.54k
    c->max_rec = 0;
3929
3930
7.54k
    if (c->ref_seq_id == -2) {
3931
0
        c->multi_seq = 1;
3932
0
        fd->multi_seq = 1;
3933
0
    }
3934
3935
7.54k
    fd->empty_container =
3936
7.54k
        (c->num_records == 0 &&
3937
6.85k
         c->ref_seq_id == -1 &&
3938
7.54k
         c->ref_seq_start == 0x454f46 /* EOF */) ? 1 : 0;
3939
3940
7.54k
    return c;
3941
7.54k
}
3942
3943
3944
/* MAXIMUM storage size needed for the container. */
3945
0
int cram_container_size(cram_container *c) {
3946
0
    return 55 + 5*c->num_landmarks;
3947
0
}
3948
3949
3950
/*
3951
 * Stores the container structure in dat and returns *size as the
3952
 * number of bytes written to dat[].  The input size of dat is also
3953
 * held in *size and should be initialised to cram_container_size(c).
3954
 *
3955
 * Returns 0 on success;
3956
 *        -1 on failure
3957
 */
3958
int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size)
3959
0
{
3960
0
    char *cp = (char *)dat;
3961
0
    int i;
3962
3963
    // Check the input buffer is large enough according to our stated
3964
    // requirements. (NOTE: it may actually take less.)
3965
0
    if (cram_container_size(c) > *size)
3966
0
        return -1;
3967
3968
0
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
3969
0
        cp += itf8_put(cp, c->length);
3970
0
    } else {
3971
0
        *(int32_t *)cp = le_int4(c->length);
3972
0
        cp += 4;
3973
0
    }
3974
0
    if (c->multi_seq) {
3975
0
        cp += fd->vv.varint_put32(cp, NULL, -2);
3976
0
        cp += fd->vv.varint_put32(cp, NULL, 0);
3977
0
        cp += fd->vv.varint_put32(cp, NULL, 0);
3978
0
    } else {
3979
0
        cp += fd->vv.varint_put32s(cp, NULL, c->ref_seq_id);
3980
0
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
3981
0
            cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_start);
3982
0
            cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_span);
3983
0
        } else {
3984
0
            cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_start);
3985
0
            cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_span);
3986
0
        }
3987
0
    }
3988
0
    cp += fd->vv.varint_put32(cp, NULL, c->num_records);
3989
0
    if (CRAM_MAJOR_VERS(fd->version) == 2) {
3990
0
        cp += fd->vv.varint_put64(cp, NULL, c->record_counter);
3991
0
    } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3992
0
        cp += fd->vv.varint_put32(cp, NULL, c->record_counter);
3993
0
    }
3994
0
    cp += fd->vv.varint_put64(cp, NULL, c->num_bases);
3995
0
    cp += fd->vv.varint_put32(cp, NULL, c->num_blocks);
3996
0
    cp += fd->vv.varint_put32(cp, NULL, c->num_landmarks);
3997
0
    for (i = 0; i < c->num_landmarks; i++)
3998
0
        cp += fd->vv.varint_put32(cp, NULL, c->landmark[i]);
3999
4000
0
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
4001
0
        c->crc32 = crc32(0L, (uc *)dat, cp-dat);
4002
0
        cp[0] =  c->crc32        & 0xff;
4003
0
        cp[1] = (c->crc32 >>  8) & 0xff;
4004
0
        cp[2] = (c->crc32 >> 16) & 0xff;
4005
0
        cp[3] = (c->crc32 >> 24) & 0xff;
4006
0
        cp += 4;
4007
0
    }
4008
4009
0
    *size = cp-dat; // actual used size
4010
4011
0
    return 0;
4012
0
}
4013
4014
4015
/*
4016
 * Writes a container structure.
4017
 *
4018
 * Returns 0 on success
4019
 *        -1 on failure
4020
 */
4021
44.3k
int cram_write_container(cram_fd *fd, cram_container *c) {
4022
44.3k
    char buf_a[1024], *buf = buf_a, *cp;
4023
44.3k
    int i;
4024
4025
44.3k
    if (61 + c->num_landmarks * 10 >= 1024) {
4026
0
        buf = malloc(61 + c->num_landmarks * 10);
4027
0
        if (!buf)
4028
0
            return -1;
4029
0
    }
4030
44.3k
    cp = buf;
4031
4032
44.3k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
4033
0
        cp += itf8_put(cp, c->length);
4034
44.3k
    } else if (CRAM_MAJOR_VERS(fd->version) <= 3) {
4035
44.3k
        *(int32_t *)cp = le_int4(c->length);
4036
44.3k
        cp += 4;
4037
44.3k
    } else {
4038
0
        cp += fd->vv.varint_put32(cp, NULL, c->length);
4039
0
    }
4040
44.3k
    if (c->multi_seq) {
4041
675
        cp += fd->vv.varint_put32(cp, NULL, (uint32_t)-2);
4042
675
        cp += fd->vv.varint_put32(cp, NULL, 0);
4043
675
        cp += fd->vv.varint_put32(cp, NULL, 0);
4044
43.6k
    } else {
4045
43.6k
        cp += fd->vv.varint_put32s(cp, NULL, c->ref_seq_id);
4046
43.6k
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
4047
0
            cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_start);
4048
0
            cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_span);
4049
43.6k
        } else {
4050
43.6k
            cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_start);
4051
43.6k
            cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_span);
4052
43.6k
        }
4053
43.6k
    }
4054
44.3k
    cp += fd->vv.varint_put32(cp, NULL, c->num_records);
4055
44.3k
    if (CRAM_MAJOR_VERS(fd->version) >= 3)
4056
44.3k
        cp += fd->vv.varint_put64(cp, NULL, c->record_counter);
4057
0
    else
4058
0
        cp += fd->vv.varint_put32(cp, NULL, c->record_counter);
4059
44.3k
    cp += fd->vv.varint_put64(cp, NULL, c->num_bases);
4060
44.3k
    cp += fd->vv.varint_put32(cp, NULL, c->num_blocks);
4061
44.3k
    cp += fd->vv.varint_put32(cp, NULL, c->num_landmarks);
4062
88.5k
    for (i = 0; i < c->num_landmarks; i++)
4063
44.2k
        cp += fd->vv.varint_put32(cp, NULL, c->landmark[i]);
4064
4065
44.3k
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
4066
44.3k
        c->crc32 = crc32(0L, (uc *)buf, cp-buf);
4067
44.3k
        cp[0] =  c->crc32        & 0xff;
4068
44.3k
        cp[1] = (c->crc32 >>  8) & 0xff;
4069
44.3k
        cp[2] = (c->crc32 >> 16) & 0xff;
4070
44.3k
        cp[3] = (c->crc32 >> 24) & 0xff;
4071
44.3k
        cp += 4;
4072
44.3k
    }
4073
4074
44.3k
    if (cp-buf != hwrite(fd->fp, buf, cp-buf)) {
4075
0
        if (buf != buf_a)
4076
0
            free(buf);
4077
0
        return -1;
4078
0
    }
4079
4080
44.3k
    if (buf != buf_a)
4081
0
        free(buf);
4082
4083
44.3k
    return 0;
4084
44.3k
}
4085
4086
// common component shared by cram_flush_container{,_mt}
4087
40.7k
static int cram_flush_container2(cram_fd *fd, cram_container *c) {
4088
40.7k
    int i, j;
4089
4090
40.7k
    if (c->curr_slice > 0 && !c->slices)
4091
0
        return -1;
4092
4093
    //fprintf(stderr, "Writing container %d, sum %u\n", c->record_counter, sum);
4094
4095
40.7k
    off_t c_offset = htell(fd->fp); // File offset of container
4096
4097
    /* Write the container struct itself */
4098
40.7k
    if (0 != cram_write_container(fd, c))
4099
0
        return -1;
4100
4101
40.7k
    off_t hdr_size = htell(fd->fp) - c_offset;
4102
4103
    /* And the compression header */
4104
40.7k
    if (0 != cram_write_block(fd, c->comp_hdr_block))
4105
0
        return -1;
4106
4107
    /* Followed by the slice blocks */
4108
40.7k
    off_t file_offset = htell(fd->fp);
4109
81.5k
    for (i = 0; i < c->curr_slice; i++) {
4110
40.7k
        cram_slice *s = c->slices[i];
4111
40.7k
        off_t spos = file_offset - c_offset - hdr_size;
4112
4113
40.7k
        if (0 != cram_write_block(fd, s->hdr_block))
4114
0
            return -1;
4115
4116
316k
        for (j = 0; j < s->hdr->num_blocks; j++) {
4117
276k
            if (0 != cram_write_block(fd, s->block[j]))
4118
0
                return -1;
4119
276k
        }
4120
4121
40.7k
        file_offset = htell(fd->fp);
4122
40.7k
        off_t sz = file_offset - c_offset - hdr_size - spos;
4123
4124
40.7k
        if (fd->idxfp) {
4125
0
            if (cram_index_slice(fd, c, s, fd->idxfp, c_offset, spos, sz) < 0)
4126
0
                return -1;
4127
0
        }
4128
40.7k
    }
4129
4130
40.7k
    return 0;
4131
40.7k
}
4132
4133
/*
4134
 * Flushes a completely or partially full container to disk, writing
4135
 * container structure, header and blocks. This also calls the encoder
4136
 * functions.
4137
 *
4138
 * Returns 0 on success
4139
 *        -1 on failure
4140
 */
4141
40.9k
int cram_flush_container(cram_fd *fd, cram_container *c) {
4142
    /* Encode the container blocks and generate compression header */
4143
40.9k
    if (0 != cram_encode_container(fd, c))
4144
171
        return -1;
4145
4146
40.7k
    return cram_flush_container2(fd, c);
4147
40.9k
}
4148
4149
typedef struct {
4150
    cram_fd *fd;
4151
    cram_container *c;
4152
} cram_job;
4153
4154
0
void *cram_flush_thread(void *arg) {
4155
0
    cram_job *j = (cram_job *)arg;
4156
4157
    /* Encode the container blocks and generate compression header */
4158
0
    if (0 != cram_encode_container(j->fd, j->c)) {
4159
0
        hts_log_error("Call to cram_encode_container failed");
4160
0
        return NULL;
4161
0
    }
4162
4163
0
    return arg;
4164
0
}
4165
4166
0
static int cram_flush_result(cram_fd *fd) {
4167
0
    int i, ret = 0;
4168
0
    hts_tpool_result *r;
4169
0
    cram_container *lc = NULL;
4170
4171
    // NB: we can have one result per slice, not per container,
4172
    // so we need to free the container only after all slices
4173
    // within it have been freed.  (Automatic via reference counting.)
4174
0
    while ((r = hts_tpool_next_result(fd->rqueue))) {
4175
0
        cram_job *j = (cram_job *)hts_tpool_result_data(r);
4176
0
        cram_container *c;
4177
4178
0
        if (!j) {
4179
0
            hts_tpool_delete_result(r, 0);
4180
0
            return -1;
4181
0
        }
4182
4183
0
        fd = j->fd;
4184
0
        c = j->c;
4185
4186
0
        if (fd->mode == 'w')
4187
0
            if (0 != cram_flush_container2(fd, c))
4188
0
                return -1;
4189
4190
        // Free the slices; filled out by encoder only
4191
0
        if (c->slices) {
4192
0
            for (i = 0; i < c->max_slice; i++) {
4193
0
                if (c->slices[i])
4194
0
                    cram_free_slice(c->slices[i]);
4195
0
                if (c->slices[i] == c->slice)
4196
0
                    c->slice = NULL;
4197
0
                c->slices[i] = NULL;
4198
0
            }
4199
0
        }
4200
4201
        // Free the current slice; set by both encoder & decoder
4202
0
        if (c->slice) {
4203
0
            cram_free_slice(c->slice);
4204
0
            c->slice = NULL;
4205
0
        }
4206
0
        c->curr_slice = 0;
4207
4208
        // Our jobs will be in order, so we free the last
4209
        // container when our job has switched to a new one.
4210
0
        if (c != lc) {
4211
0
            if (lc) {
4212
0
                if (fd->ctr == lc)
4213
0
                    fd->ctr = NULL;
4214
0
                if (fd->ctr_mt == lc)
4215
0
                    fd->ctr_mt = NULL;
4216
0
                cram_free_container(lc);
4217
0
            }
4218
0
            lc = c;
4219
0
        }
4220
4221
0
        hts_tpool_delete_result(r, 1);
4222
0
    }
4223
0
    if (lc) {
4224
0
        if (fd->ctr == lc)
4225
0
            fd->ctr = NULL;
4226
0
        if (fd->ctr_mt == lc)
4227
0
            fd->ctr_mt = NULL;
4228
0
        cram_free_container(lc);
4229
0
    }
4230
4231
0
    return ret;
4232
0
}
4233
4234
// Note: called while metrics_lock is held.
4235
// Will be left in this state too, but may temporarily unlock.
4236
2.44k
void reset_metrics(cram_fd *fd) {
4237
2.44k
    int i;
4238
4239
2.44k
    if (fd->pool) {
4240
        // If multi-threaded we have multiple blocks being
4241
        // compressed already and several on the to-do list
4242
        // (fd->rqueue->pending).  It's tricky to reset the
4243
        // metrics exactly the correct point, so instead we
4244
        // just flush the pool, reset, and then continue again.
4245
4246
        // Don't bother starting a new trial before then though.
4247
0
        for (i = 0; i < DS_END; i++) {
4248
0
            cram_metrics *m = fd->m[i];
4249
0
            if (!m)
4250
0
                continue;
4251
0
            m->next_trial = 999;
4252
0
        }
4253
4254
0
        pthread_mutex_unlock(&fd->metrics_lock);
4255
0
        hts_tpool_process_flush(fd->rqueue);
4256
0
        pthread_mutex_lock(&fd->metrics_lock);
4257
0
    }
4258
4259
117k
    for (i = 0; i < DS_END; i++) {
4260
115k
        cram_metrics *m = fd->m[i];
4261
115k
        if (!m)
4262
0
            continue;
4263
4264
115k
        m->trial = NTRIALS;
4265
115k
        m->next_trial = TRIAL_SPAN;
4266
115k
        m->revised_method = 0;
4267
115k
        m->unpackable = 0;
4268
4269
115k
        memset(m->sz, 0, sizeof(m->sz));
4270
115k
    }
4271
2.44k
}
4272
4273
40.9k
int cram_flush_container_mt(cram_fd *fd, cram_container *c) {
4274
40.9k
    cram_job *j;
4275
4276
    // At the junction of mapped to unmapped data the compression
4277
    // methods may need to change due to very different statistical
4278
    // properties; particularly BA if minhash sorted.
4279
    //
4280
    // However with threading we'll have several in-flight blocks
4281
    // arriving out of order.
4282
    //
4283
    // So we do one trial reset of NThreads to last for NThreads
4284
    // duration to get us over this transition period, followed
4285
    // by another retrial of the usual ntrials & trial span.
4286
40.9k
    pthread_mutex_lock(&fd->metrics_lock);
4287
40.9k
    if (c->n_mapped < 0.3*c->curr_rec &&
4288
21.8k
        fd->last_mapped > 0.7*c->max_rec) {
4289
2.44k
        reset_metrics(fd);
4290
2.44k
    }
4291
40.9k
    fd->last_mapped = c->n_mapped * (c->max_rec+1)/(c->curr_rec+1) ;
4292
40.9k
    pthread_mutex_unlock(&fd->metrics_lock);
4293
4294
40.9k
    if (!fd->pool)
4295
40.9k
        return cram_flush_container(fd, c);
4296
4297
0
    if (!(j = malloc(sizeof(*j))))
4298
0
        return -1;
4299
0
    j->fd = fd;
4300
0
    j->c = c;
4301
4302
    // Flush the job.  Note our encoder queue may be full, so we
4303
    // either have to keep trying in non-blocking mode (what we do) or
4304
    // use a dedicated separate thread for draining the queue.
4305
0
    for (;;) {
4306
0
        errno = 0;
4307
0
        hts_tpool_dispatch2(fd->pool, fd->rqueue, cram_flush_thread, j, 1);
4308
0
        int pending = (errno == EAGAIN);
4309
0
        if (cram_flush_result(fd) != 0)
4310
0
            return -1;
4311
0
        if (!pending)
4312
0
            break;
4313
4314
0
        hts_usleep(1000);
4315
0
    }
4316
4317
0
    return 0;
4318
0
}
4319
4320
/* ----------------------------------------------------------------------
4321
 * Compression headers; the first part of the container
4322
 */
4323
4324
/*
4325
 * Creates a new blank container compression header
4326
 *
4327
 * Returns header ptr on success
4328
 *         NULL on failure
4329
 */
4330
42.6k
cram_block_compression_hdr *cram_new_compression_header(void) {
4331
42.6k
    cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr));
4332
42.6k
    if (!hdr)
4333
0
        return NULL;
4334
4335
42.6k
    if (!(hdr->TD_blk = cram_new_block(CORE, 0))) {
4336
0
        free(hdr);
4337
0
        return NULL;
4338
0
    }
4339
4340
42.6k
    if (!(hdr->TD_hash = kh_init(m_s2i))) {
4341
0
        cram_free_block(hdr->TD_blk);
4342
0
        free(hdr);
4343
0
        return NULL;
4344
0
    }
4345
4346
42.6k
    if (!(hdr->TD_keys = string_pool_create(8192))) {
4347
0
        kh_destroy(m_s2i, hdr->TD_hash);
4348
0
        cram_free_block(hdr->TD_blk);
4349
0
        free(hdr);
4350
0
        return NULL;
4351
0
    }
4352
4353
42.6k
    return hdr;
4354
42.6k
}
4355
4356
43.1k
void cram_free_compression_header(cram_block_compression_hdr *hdr) {
4357
43.1k
    int i;
4358
4359
43.1k
    if (hdr->landmark)
4360
453
        free(hdr->landmark);
4361
4362
43.1k
    if (hdr->preservation_map)
4363
41.2k
        kh_destroy(map, hdr->preservation_map);
4364
4365
1.42M
    for (i = 0; i < CRAM_MAP_HASH; i++) {
4366
1.38M
        cram_map *m, *m2;
4367
1.39M
        for (m = hdr->rec_encoding_map[i]; m; m = m2) {
4368
19.0k
            m2 = m->next;
4369
19.0k
            if (m->codec)
4370
0
                m->codec->free(m->codec);
4371
19.0k
            free(m);
4372
19.0k
        }
4373
1.38M
    }
4374
4375
1.42M
    for (i = 0; i < CRAM_MAP_HASH; i++) {
4376
1.38M
        cram_map *m, *m2;
4377
1.38M
        for (m = hdr->tag_encoding_map[i]; m; m = m2) {
4378
564
            m2 = m->next;
4379
564
            if (m->codec)
4380
564
                m->codec->free(m->codec);
4381
564
            free(m);
4382
564
        }
4383
1.38M
    }
4384
4385
2.07M
    for (i = 0; i < DS_END; i++) {
4386
2.02M
        if (hdr->codecs[i])
4387
759k
            hdr->codecs[i]->free(hdr->codecs[i]);
4388
2.02M
    }
4389
4390
43.1k
    if (hdr->TL)
4391
36
        free(hdr->TL);
4392
43.1k
    if (hdr->TD_blk)
4393
42.7k
        cram_free_block(hdr->TD_blk);
4394
43.1k
    if (hdr->TD_hash)
4395
42.6k
        kh_destroy(m_s2i, hdr->TD_hash);
4396
43.1k
    if (hdr->TD_keys)
4397
42.6k
        string_pool_destroy(hdr->TD_keys);
4398
4399
43.1k
    free(hdr);
4400
43.1k
}
4401
4402
4403
/* ----------------------------------------------------------------------
4404
 * Slices and slice headers
4405
 */
4406
4407
40.9k
void cram_free_slice_header(cram_block_slice_hdr *hdr) {
4408
40.9k
    if (!hdr)
4409
0
        return;
4410
4411
40.9k
    if (hdr->block_content_ids)
4412
40.7k
        free(hdr->block_content_ids);
4413
4414
40.9k
    free(hdr);
4415
4416
40.9k
    return;
4417
40.9k
}
4418
4419
40.9k
void cram_free_slice(cram_slice *s) {
4420
40.9k
    if (!s)
4421
0
        return;
4422
4423
40.9k
    if (s->hdr_block)
4424
40.7k
        cram_free_block(s->hdr_block);
4425
4426
40.9k
    if (s->block) {
4427
40.7k
        int i;
4428
4429
40.7k
        if (s->hdr) {
4430
316k
            for (i = 0; i < s->hdr->num_blocks; i++) {
4431
276k
                if (i > 0 && s->block[i] == s->block[0])
4432
24
                    continue;
4433
276k
                cram_free_block(s->block[i]);
4434
276k
            }
4435
40.7k
        }
4436
40.7k
        free(s->block);
4437
40.7k
    }
4438
4439
40.9k
    {
4440
        // Normally already copied into s->block[], but potentially still
4441
        // here if we error part way through cram_encode_slice.
4442
40.9k
        int i;
4443
164k
        for (i = 0; i < s->naux_block; i++)
4444
123k
            cram_free_block(s->aux_block[i]);
4445
40.9k
    }
4446
4447
40.9k
    if (s->block_by_id)
4448
0
        free(s->block_by_id);
4449
4450
40.9k
    if (s->hdr)
4451
40.9k
        cram_free_slice_header(s->hdr);
4452
4453
40.9k
    if (s->seqs_blk)
4454
40.9k
        cram_free_block(s->seqs_blk);
4455
4456
40.9k
    if (s->qual_blk)
4457
169
        cram_free_block(s->qual_blk);
4458
4459
40.9k
    if (s->name_blk)
4460
169
        cram_free_block(s->name_blk);
4461
4462
40.9k
    if (s->aux_blk)
4463
40.9k
        cram_free_block(s->aux_blk);
4464
4465
40.9k
    if (s->base_blk)
4466
169
        cram_free_block(s->base_blk);
4467
4468
40.9k
    if (s->soft_blk)
4469
169
        cram_free_block(s->soft_blk);
4470
4471
40.9k
    if (s->cigar)
4472
40.9k
        free(s->cigar);
4473
4474
40.9k
    if (s->crecs)
4475
40.9k
        free(s->crecs);
4476
4477
40.9k
    if (s->features)
4478
19.0k
        free(s->features);
4479
4480
40.9k
    if (s->TN)
4481
0
        free(s->TN);
4482
4483
40.9k
    if (s->pair_keys)
4484
40.9k
        string_pool_destroy(s->pair_keys);
4485
4486
40.9k
    if (s->pair[0])
4487
40.9k
        kh_destroy(m_s2i, s->pair[0]);
4488
40.9k
    if (s->pair[1])
4489
40.9k
        kh_destroy(m_s2i, s->pair[1]);
4490
4491
40.9k
    if (s->aux_block)
4492
36.9k
        free(s->aux_block);
4493
4494
40.9k
    free(s);
4495
40.9k
}
4496
4497
/*
4498
 * Creates a new empty slice in memory, for subsequent writing to
4499
 * disk.
4500
 *
4501
 * Returns cram_slice ptr on success
4502
 *         NULL on failure
4503
 */
4504
40.9k
cram_slice *cram_new_slice(enum cram_content_type type, int nrecs) {
4505
40.9k
    cram_slice *s = calloc(1, sizeof(*s));
4506
40.9k
    if (!s)
4507
0
        return NULL;
4508
4509
40.9k
    if (!(s->hdr = (cram_block_slice_hdr *)calloc(1, sizeof(*s->hdr))))
4510
0
        goto err;
4511
40.9k
    s->hdr->content_type = type;
4512
4513
40.9k
    s->hdr_block = NULL;
4514
40.9k
    s->block = NULL;
4515
40.9k
    s->block_by_id = NULL;
4516
40.9k
    s->last_apos = 0;
4517
40.9k
    if (!(s->crecs = malloc(nrecs * sizeof(cram_record))))  goto err;
4518
40.9k
    s->cigar_alloc = 1024;
4519
40.9k
    if (!(s->cigar = malloc(s->cigar_alloc * sizeof(*s->cigar)))) goto err;
4520
40.9k
    s->ncigar = 0;
4521
4522
40.9k
    if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0)))       goto err;
4523
40.9k
    if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS)))   goto err;
4524
40.9k
    if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN)))   goto err;
4525
40.9k
    if (!(s->aux_blk  = cram_new_block(EXTERNAL, DS_aux)))  goto err;
4526
40.9k
    if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN)))   goto err;
4527
40.9k
    if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC)))   goto err;
4528
4529
40.9k
    s->features = NULL;
4530
40.9k
    s->nfeatures = s->afeatures = 0;
4531
4532
40.9k
#ifndef TN_external
4533
40.9k
    s->TN = NULL;
4534
40.9k
    s->nTN = s->aTN = 0;
4535
40.9k
#endif
4536
4537
    // Volatile keys as we do realloc in dstring
4538
40.9k
    if (!(s->pair_keys = string_pool_create(8192))) goto err;
4539
40.9k
    if (!(s->pair[0] = kh_init(m_s2i)))             goto err;
4540
40.9k
    if (!(s->pair[1] = kh_init(m_s2i)))             goto err;
4541
4542
#ifdef BA_external
4543
    s->BA_len = 0;
4544
#endif
4545
4546
40.9k
    return s;
4547
4548
0
 err:
4549
0
    if (s)
4550
0
        cram_free_slice(s);
4551
4552
0
    return NULL;
4553
40.9k
}
4554
4555
/*
4556
 * Loads an entire slice.
4557
 * FIXME: In 1.0 the native unit of slices within CRAM is broken
4558
 * as slices contain references to objects in other slices.
4559
 * To work around this while keeping the slice oriented outer loop
4560
 * we read all slices and stitch them together into a fake large
4561
 * slice instead.
4562
 *
4563
 * Returns cram_slice ptr on success
4564
 *         NULL on failure
4565
 */
4566
10
cram_slice *cram_read_slice(cram_fd *fd) {
4567
10
    cram_block *b = cram_read_block(fd);
4568
10
    cram_slice *s = calloc(1, sizeof(*s));
4569
10
    int i, n, max_id, min_id;
4570
4571
10
    if (!b || !s)
4572
0
        goto err;
4573
4574
10
    s->hdr_block = b;
4575
10
    switch (b->content_type) {
4576
10
    case MAPPED_SLICE:
4577
10
    case UNMAPPED_SLICE:
4578
10
        if (!(s->hdr = cram_decode_slice_header(fd, b)))
4579
0
            goto err;
4580
10
        break;
4581
4582
10
    default:
4583
0
        hts_log_error("Unexpected block of type %s",
4584
0
                      cram_content_type2str(b->content_type));
4585
0
        goto err;
4586
10
    }
4587
4588
10
    if (s->hdr->num_blocks < 1) {
4589
2
        hts_log_error("Slice does not include any data blocks");
4590
2
        goto err;
4591
2
    }
4592
4593
8
    s->block = calloc(n = s->hdr->num_blocks, sizeof(*s->block));
4594
8
    if (!s->block)
4595
0
        goto err;
4596
4597
8
    for (max_id = i = 0, min_id = INT_MAX; i < n; i++) {
4598
8
        if (!(s->block[i] = cram_read_block(fd)))
4599
8
            goto err;
4600
4601
0
        if (s->block[i]->content_type == EXTERNAL) {
4602
0
            if (max_id < s->block[i]->content_id)
4603
0
                max_id = s->block[i]->content_id;
4604
0
            if (min_id > s->block[i]->content_id)
4605
0
                min_id = s->block[i]->content_id;
4606
0
        }
4607
0
    }
4608
4609
0
    if (!(s->block_by_id = calloc(512, sizeof(s->block[0]))))
4610
0
        goto err;
4611
4612
0
    for (i = 0; i < n; i++) {
4613
0
        if (s->block[i]->content_type != EXTERNAL)
4614
0
            continue;
4615
0
        uint32_t v = s->block[i]->content_id;
4616
0
        if (v >= 256)
4617
0
            v = 256 + v % 251;
4618
0
        s->block_by_id[v] = s->block[i];
4619
0
    }
4620
4621
    /* Initialise encoding/decoding tables */
4622
0
    s->cigar_alloc = 1024;
4623
0
    if (!(s->cigar = malloc(s->cigar_alloc * sizeof(*s->cigar)))) goto err;
4624
0
    s->ncigar = 0;
4625
4626
0
    if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0)))      goto err;
4627
0
    if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS)))  goto err;
4628
0
    if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN)))  goto err;
4629
0
    if (!(s->aux_blk  = cram_new_block(EXTERNAL, DS_aux))) goto err;
4630
0
    if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN)))  goto err;
4631
0
    if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC)))  goto err;
4632
4633
0
    s->crecs = NULL;
4634
4635
0
    s->last_apos = s->hdr->ref_seq_start;
4636
0
    s->decode_md = fd->decode_md;
4637
4638
0
    return s;
4639
4640
10
 err:
4641
10
    if (b)
4642
10
        cram_free_block(b);
4643
10
    if (s) {
4644
10
        s->hdr_block = NULL;
4645
10
        cram_free_slice(s);
4646
10
    }
4647
10
    return NULL;
4648
0
}
4649
4650
4651
/* ----------------------------------------------------------------------
4652
 * CRAM file definition (header)
4653
 */
4654
4655
/*
4656
 * Reads a CRAM file definition structure.
4657
 * Returns file_def ptr on success
4658
 *         NULL on failure
4659
 */
4660
1.19k
cram_file_def *cram_read_file_def(cram_fd *fd) {
4661
1.19k
    cram_file_def *def = malloc(sizeof(*def));
4662
1.19k
    if (!def)
4663
0
        return NULL;
4664
4665
1.19k
    if (26 != hread(fd->fp, &def->magic[0], 26)) {
4666
0
        free(def);
4667
0
        return NULL;
4668
0
    }
4669
4670
1.19k
    if (memcmp(def->magic, "CRAM", 4) != 0) {
4671
0
        free(def);
4672
0
        return NULL;
4673
0
    }
4674
4675
1.19k
    if (def->major_version > 4) {
4676
0
        hts_log_error("CRAM version number mismatch. Expected 1.x, 2.x, 3.x or 4.x, got %d.%d",
4677
0
                      def->major_version, def->minor_version);
4678
0
        free(def);
4679
0
        return NULL;
4680
0
    }
4681
4682
1.19k
    fd->first_container += 26;
4683
1.19k
    fd->curr_position = fd->first_container;
4684
1.19k
    fd->last_slice = 0;
4685
4686
1.19k
    return def;
4687
1.19k
}
4688
4689
/*
4690
 * Writes a cram_file_def structure to cram_fd.
4691
 * Returns 0 on success
4692
 *        -1 on failure
4693
 */
4694
1.76k
int cram_write_file_def(cram_fd *fd, cram_file_def *def) {
4695
1.76k
    return (hwrite(fd->fp, &def->magic[0], 26) == 26) ? 0 : -1;
4696
1.76k
}
4697
4698
3.12k
void cram_free_file_def(cram_file_def *def) {
4699
3.12k
    if (def) free(def);
4700
3.12k
}
4701
4702
/* ----------------------------------------------------------------------
4703
 * SAM header I/O
4704
 */
4705
4706
4707
/*
4708
 * Reads the SAM header from the first CRAM data block.
4709
 * Also performs minimal parsing to extract read-group
4710
 * and sample information.
4711
4712
 * Returns SAM hdr ptr on success
4713
 *         NULL on failure
4714
 */
4715
1.19k
sam_hdr_t *cram_read_SAM_hdr(cram_fd *fd) {
4716
1.19k
    int32_t header_len;
4717
1.19k
    char *header;
4718
1.19k
    sam_hdr_t *hdr;
4719
4720
    /* 1.1 onwards stores the header in the first block of a container */
4721
1.19k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
4722
        /* Length */
4723
1.13k
        if (-1 == int32_decode(fd, &header_len))
4724
0
            return NULL;
4725
4726
1.13k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
4727
1.13k
        if (header_len > FUZZ_ALLOC_LIMIT)
4728
0
            return NULL;
4729
1.13k
#endif
4730
4731
        /* Alloc and read */
4732
1.13k
        if (header_len < 0 || NULL == (header = malloc((size_t) header_len+1)))
4733
0
            return NULL;
4734
4735
1.13k
        if (header_len != hread(fd->fp, header, header_len)) {
4736
0
            free(header);
4737
0
            return NULL;
4738
0
        }
4739
1.13k
        header[header_len] = '\0';
4740
4741
1.13k
        fd->first_container += 4 + header_len;
4742
1.13k
    } else {
4743
57
        cram_container *c = cram_read_container(fd);
4744
57
        cram_block *b;
4745
57
        int i;
4746
57
        int64_t len;
4747
4748
57
        if (!c)
4749
0
            return NULL;
4750
4751
57
        fd->first_container += c->length + c->offset;
4752
57
        fd->curr_position = fd->first_container;
4753
4754
57
        if (c->num_blocks < 1) {
4755
0
            cram_free_container(c);
4756
0
            return NULL;
4757
0
        }
4758
4759
57
        if (!(b = cram_read_block(fd))) {
4760
0
            cram_free_container(c);
4761
0
            return NULL;
4762
0
        }
4763
57
        if (cram_uncompress_block(b) != 0) {
4764
27
            cram_free_container(c);
4765
27
            cram_free_block(b);
4766
27
            return NULL;
4767
27
        }
4768
4769
30
        len = b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
4770
30
            fd->vv.varint_size(b->content_id) +
4771
30
            fd->vv.varint_size(b->uncomp_size) +
4772
30
            fd->vv.varint_size(b->comp_size);
4773
4774
        /* Extract header from 1st block */
4775
30
        if (-1 == int32_get_blk(b, &header_len) ||
4776
30
            header_len < 0 || /* Spec. says signed...  why? */
4777
30
            b->uncomp_size - 4 < header_len) {
4778
0
            cram_free_container(c);
4779
0
            cram_free_block(b);
4780
0
            return NULL;
4781
0
        }
4782
30
        if (NULL == (header = malloc((size_t) header_len+1))) {
4783
0
            cram_free_container(c);
4784
0
            cram_free_block(b);
4785
0
            return NULL;
4786
0
        }
4787
30
        memcpy(header, BLOCK_END(b), header_len);
4788
30
        header[header_len] = '\0';
4789
30
        cram_free_block(b);
4790
4791
        /* Consume any remaining blocks */
4792
94
        for (i = 1; i < c->num_blocks; i++) {
4793
64
            if (!(b = cram_read_block(fd))) {
4794
0
                cram_free_container(c);
4795
0
                free(header);
4796
0
                return NULL;
4797
0
            }
4798
64
            len += b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
4799
64
                fd->vv.varint_size(b->content_id) +
4800
64
                fd->vv.varint_size(b->uncomp_size) +
4801
64
                fd->vv.varint_size(b->comp_size);
4802
64
            cram_free_block(b);
4803
64
        }
4804
4805
30
        if (c->length > 0 && len > 0 && c->length > len) {
4806
            // Consume padding
4807
2
            char *pads = malloc(c->length - len);
4808
2
            if (!pads) {
4809
0
                cram_free_container(c);
4810
0
                free(header);
4811
0
                return NULL;
4812
0
            }
4813
4814
2
            if (c->length - len != hread(fd->fp, pads, c->length - len)) {
4815
2
                cram_free_container(c);
4816
2
                free(header);
4817
2
                free(pads);
4818
2
                return NULL;
4819
2
            }
4820
0
            free(pads);
4821
0
        }
4822
4823
28
        cram_free_container(c);
4824
28
    }
4825
4826
    /* Parse */
4827
1.16k
    hdr = sam_hdr_init();
4828
1.16k
    if (!hdr) {
4829
0
        free(header);
4830
0
        return NULL;
4831
0
    }
4832
4833
1.16k
    if (-1 == sam_hdr_add_lines(hdr, header, header_len)) {
4834
2
        free(header);
4835
2
        sam_hdr_destroy(hdr);
4836
2
        return NULL;
4837
2
    }
4838
4839
1.16k
    hdr->l_text = header_len;
4840
1.16k
    hdr->text = header;
4841
4842
1.16k
    return hdr;
4843
4844
1.16k
}
4845
4846
/*
4847
 * Converts 'in' to a full pathname to store in out.
4848
 * Out must be at least PATH_MAX bytes long.
4849
 */
4850
0
static void full_path(char *out, char *in) {
4851
0
    size_t in_l = strlen(in);
4852
0
    if (hisremote(in)) {
4853
0
        if (in_l > PATH_MAX) {
4854
0
            hts_log_error("Reference path is longer than %d", PATH_MAX);
4855
0
            return;
4856
0
        }
4857
0
        strncpy(out, in, PATH_MAX-1);
4858
0
        out[PATH_MAX-1] = 0;
4859
0
        return;
4860
0
    }
4861
0
    if (*in == '/' ||
4862
        // Windows paths
4863
0
        (in_l > 3 && toupper_c(*in) >= 'A'  && toupper_c(*in) <= 'Z' &&
4864
0
         in[1] == ':' && (in[2] == '/' || in[2] == '\\'))) {
4865
0
        strncpy(out, in, PATH_MAX-1);
4866
0
        out[PATH_MAX-1] = 0;
4867
0
    } else {
4868
0
        size_t len;
4869
4870
        // unable to get dir or out+in is too long
4871
0
        if (!getcwd(out, PATH_MAX) ||
4872
0
            (len = strlen(out))+1+strlen(in) >= PATH_MAX) {
4873
0
            strncpy(out, in, PATH_MAX-1);
4874
0
            out[PATH_MAX-1] = 0;
4875
0
            return;
4876
0
        }
4877
4878
0
        snprintf(out+len, PATH_MAX - len, "/%s", in);
4879
4880
        // FIXME: cope with `pwd`/../../../foo.fa ?
4881
0
    }
4882
0
}
4883
4884
/*
4885
 * Writes a CRAM SAM header.
4886
 * Returns 0 on success
4887
 *        -1 on failure
4888
 */
4889
1.76k
int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) {
4890
1.76k
    size_t header_len;
4891
1.76k
    int blank_block = (CRAM_MAJOR_VERS(fd->version) >= 3);
4892
4893
    /* Write CRAM MAGIC if not yet written. */
4894
1.76k
    if (fd->file_def->major_version == 0) {
4895
1.76k
        fd->file_def->major_version = CRAM_MAJOR_VERS(fd->version);
4896
1.76k
        fd->file_def->minor_version = CRAM_MINOR_VERS(fd->version);
4897
1.76k
        if (0 != cram_write_file_def(fd, fd->file_def))
4898
0
            return -1;
4899
1.76k
    }
4900
4901
    /* 1.0 requires an UNKNOWN read-group */
4902
1.76k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
4903
0
        if (!sam_hrecs_find_rg(hdr->hrecs, "UNKNOWN"))
4904
0
            if (sam_hdr_add_line(hdr, "RG",
4905
0
                            "ID", "UNKNOWN", "SM", "UNKNOWN", NULL))
4906
0
                return -1;
4907
0
    }
4908
4909
1.76k
    if (-1 == refs_from_header(fd))
4910
0
        return -1;
4911
1.76k
    if (-1 == refs2id(fd->refs, fd->header))
4912
0
        return -1;
4913
4914
    /* Fix M5 strings */
4915
1.76k
    if (fd->refs && !fd->no_ref && fd->embed_ref <= 1) {
4916
1.76k
        int i;
4917
1.86k
        for (i = 0; i < hdr->hrecs->nref; i++) {
4918
1.04k
            sam_hrec_type_t *ty;
4919
1.04k
            char *ref;
4920
4921
1.04k
            if (!(ty = sam_hrecs_find_type_id(hdr->hrecs, "SQ", "SN", hdr->hrecs->ref[i].name)))
4922
0
                return -1;
4923
4924
1.04k
            if (!sam_hrecs_find_key(ty, "M5", NULL)) {
4925
946
                char unsigned buf[16];
4926
946
                char buf2[33];
4927
946
                hts_pos_t rlen;
4928
946
                hts_md5_context *md5;
4929
4930
946
                if (!fd->refs ||
4931
946
                    !fd->refs->ref_id ||
4932
946
                    !fd->refs->ref_id[i]) {
4933
0
                    return -1;
4934
0
                }
4935
946
                rlen = fd->refs->ref_id[i]->length;
4936
946
                ref = cram_get_ref(fd, i, 1, rlen);
4937
946
                if (NULL == ref) {
4938
946
                    if (fd->embed_ref == -1) {
4939
                        // auto embed-ref
4940
946
                        hts_log_warning("No M5 tags present and could not "
4941
946
                                        "find reference");
4942
946
                        hts_log_warning("Enabling embed_ref=2 option");
4943
946
                        hts_log_warning("NOTE: the CRAM file will be bigger "
4944
946
                                        "than using an external reference");
4945
946
                        pthread_mutex_lock(&fd->ref_lock);
4946
                        // Best guess.  It may be unmapped data with broken
4947
                        // headers, in which case this will get ignored.
4948
946
                        fd->embed_ref = 2;
4949
946
                        pthread_mutex_unlock(&fd->ref_lock);
4950
946
                        break;
4951
946
                    }
4952
0
                    return -1;
4953
946
                }
4954
0
                rlen = fd->refs->ref_id[i]->length; /* In case it just loaded */
4955
0
                if (!(md5 = hts_md5_init()))
4956
0
                    return -1;
4957
0
                if (HTS_POS_MAX <= ULONG_MAX) {
4958
                    // Platforms with 64-bit unsigned long update in one go
4959
0
                    hts_md5_update(md5, ref, rlen);
4960
0
                } else {
4961
                    // Those with 32-bit ulong (Windows) may have to loop
4962
                    // over epic references
4963
0
                    hts_pos_t pos = 0;
4964
0
                    while (rlen - pos > ULONG_MAX) {
4965
0
                        hts_md5_update(md5, ref + pos, ULONG_MAX);
4966
0
                        pos += ULONG_MAX;
4967
0
                    }
4968
0
                    hts_md5_update(md5, ref + pos, (unsigned long)(rlen - pos));
4969
0
                }
4970
0
                hts_md5_final(buf, md5);
4971
0
                hts_md5_destroy(md5);
4972
0
                cram_ref_decr(fd->refs, i);
4973
4974
0
                hts_md5_hex(buf2, buf);
4975
0
                fd->refs->ref_id[i]->validated_md5 = 1;
4976
0
                if (sam_hdr_update_line(hdr, "SQ", "SN", hdr->hrecs->ref[i].name, "M5", buf2, NULL))
4977
0
                    return -1;
4978
0
            }
4979
4980
97
            if (fd->ref_fn) {
4981
0
                char ref_fn[PATH_MAX];
4982
0
                full_path(ref_fn, fd->ref_fn);
4983
0
                if (sam_hdr_update_line(hdr, "SQ", "SN", hdr->hrecs->ref[i].name, "UR", ref_fn, NULL))
4984
0
                    return -1;
4985
0
            }
4986
97
        }
4987
1.76k
    }
4988
4989
    /* Length */
4990
1.76k
    header_len = sam_hdr_length(hdr);
4991
1.76k
    if (header_len > INT32_MAX) {
4992
0
        hts_log_error("Header is too long for CRAM format");
4993
0
        return -1;
4994
0
    }
4995
1.76k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
4996
0
        if (-1 == int32_encode(fd, header_len))
4997
0
            return -1;
4998
4999
        /* Text data */
5000
0
        if (header_len != hwrite(fd->fp, sam_hdr_str(hdr), header_len))
5001
0
            return -1;
5002
1.76k
    } else {
5003
        /* Create block(s) inside a container */
5004
1.76k
        cram_block *b = cram_new_block(FILE_HEADER, 0);
5005
1.76k
        cram_container *c = cram_new_container(0, 0);
5006
1.76k
        int padded_length;
5007
1.76k
        char *pads;
5008
1.76k
        int is_cram_3 = (CRAM_MAJOR_VERS(fd->version) >= 3);
5009
5010
1.76k
        if (!b || !c) {
5011
0
            if (b) cram_free_block(b);
5012
0
            if (c) cram_free_container(c);
5013
0
            return -1;
5014
0
        }
5015
5016
1.76k
        if (int32_put_blk(b, header_len) < 0)
5017
0
            return -1;
5018
1.76k
        if (header_len)
5019
1.11k
            BLOCK_APPEND(b, sam_hdr_str(hdr), header_len);
5020
1.76k
        BLOCK_UPLEN(b);
5021
5022
        // Compress header block if V3.0 and above
5023
1.76k
        if (CRAM_MAJOR_VERS(fd->version) >= 3)
5024
1.76k
            if (cram_compress_block(fd, b, NULL, -1, -1) < 0)
5025
0
                return -1;
5026
5027
1.76k
        if (blank_block) {
5028
1.76k
            c->length = b->comp_size + 2 + 4*is_cram_3 +
5029
1.76k
                fd->vv.varint_size(b->content_id)   +
5030
1.76k
                fd->vv.varint_size(b->uncomp_size)  +
5031
1.76k
                fd->vv.varint_size(b->comp_size);
5032
5033
1.76k
            c->num_blocks = 2;
5034
1.76k
            c->num_landmarks = 2;
5035
1.76k
            if (!(c->landmark = malloc(2*sizeof(*c->landmark)))) {
5036
0
                cram_free_block(b);
5037
0
                cram_free_container(c);
5038
0
                return -1;
5039
0
            }
5040
1.76k
            c->landmark[0] = 0;
5041
1.76k
            c->landmark[1] = c->length;
5042
5043
            // Plus extra storage for uncompressed secondary blank block
5044
1.76k
            padded_length = MIN(c->length*.5, 10000);
5045
1.76k
            c->length += padded_length + 2 + 4*is_cram_3 +
5046
1.76k
                fd->vv.varint_size(b->content_id) +
5047
1.76k
                fd->vv.varint_size(padded_length)*2;
5048
1.76k
        } else {
5049
            // Pad the block instead.
5050
0
            c->num_blocks = 1;
5051
0
            c->num_landmarks = 1;
5052
0
            if (!(c->landmark = malloc(sizeof(*c->landmark))))
5053
0
                return -1;
5054
0
            c->landmark[0] = 0;
5055
5056
0
            padded_length = MAX(c->length*1.5, 10000) - c->length;
5057
5058
0
            c->length = b->comp_size + padded_length +
5059
0
                2 + 4*is_cram_3 +
5060
0
                fd->vv.varint_size(b->content_id)   +
5061
0
                fd->vv.varint_size(b->uncomp_size)  +
5062
0
                fd->vv.varint_size(b->comp_size);
5063
5064
0
            if (NULL == (pads = calloc(1, padded_length))) {
5065
0
                cram_free_block(b);
5066
0
                cram_free_container(c);
5067
0
                return -1;
5068
0
            }
5069
0
            BLOCK_APPEND(b, pads, padded_length);
5070
0
            BLOCK_UPLEN(b);
5071
0
            free(pads);
5072
0
        }
5073
5074
1.76k
        if (-1 == cram_write_container(fd, c)) {
5075
0
            cram_free_block(b);
5076
0
            cram_free_container(c);
5077
0
            return -1;
5078
0
        }
5079
5080
1.76k
        if (-1 == cram_write_block(fd, b)) {
5081
0
            cram_free_block(b);
5082
0
            cram_free_container(c);
5083
0
            return -1;
5084
0
        }
5085
5086
1.76k
        if (blank_block) {
5087
1.76k
            BLOCK_RESIZE(b, padded_length);
5088
1.76k
            memset(BLOCK_DATA(b), 0, padded_length);
5089
1.76k
            BLOCK_SIZE(b) = padded_length;
5090
1.76k
            BLOCK_UPLEN(b);
5091
1.76k
            b->method = RAW;
5092
1.76k
            if (-1 == cram_write_block(fd, b)) {
5093
0
                cram_free_block(b);
5094
0
                cram_free_container(c);
5095
0
                return -1;
5096
0
            }
5097
1.76k
        }
5098
5099
1.76k
        cram_free_block(b);
5100
1.76k
        cram_free_container(c);
5101
1.76k
    }
5102
5103
1.76k
    if (0 != hflush(fd->fp))
5104
0
        return -1;
5105
5106
1.76k
    RP("=== Finishing saving header ===\n");
5107
5108
1.76k
    return 0;
5109
5110
0
 block_err:
5111
0
    return -1;
5112
1.76k
}
5113
5114
/* ----------------------------------------------------------------------
5115
 * The top-level cram opening, closing and option handling
5116
 */
5117
5118
/*
5119
 * Sets CRAM variable sized integer decode function tables.
5120
 * CRAM 1, 2, and 3.x all used ITF8 for uint32 and UTF8 for uint64.
5121
 * CRAM 4.x uses the same encoding mechanism for 32-bit and 64-bit
5122
 * (or anything inbetween), but also now supports signed values.
5123
 *
5124
 * Version is the CRAM major version number.
5125
 * vv is the vector table (probably &cram_fd->vv)
5126
 */
5127
3.12k
static void cram_init_varint(varint_vec *vv, int version) {
5128
3.12k
    if (version >= 4) {
5129
29
        vv->varint_get32 = uint7_get_32; // FIXME: varint.h API should be size agnostic
5130
29
        vv->varint_get32s = sint7_get_32;
5131
29
        vv->varint_get64 = uint7_get_64;
5132
29
        vv->varint_get64s = sint7_get_64;
5133
29
        vv->varint_put32 = uint7_put_32;
5134
29
        vv->varint_put32s = sint7_put_32;
5135
29
        vv->varint_put64 = uint7_put_64;
5136
29
        vv->varint_put64s = sint7_put_64;
5137
29
        vv->varint_put32_blk = uint7_put_blk_32;
5138
29
        vv->varint_put32s_blk = sint7_put_blk_32;
5139
29
        vv->varint_put64_blk = uint7_put_blk_64;
5140
29
        vv->varint_put64s_blk = sint7_put_blk_64;
5141
29
        vv->varint_size = uint7_size;
5142
29
        vv->varint_decode32_crc = uint7_decode_crc32;
5143
29
        vv->varint_decode32s_crc = sint7_decode_crc32;
5144
29
        vv->varint_decode64_crc = uint7_decode_crc64;
5145
3.09k
    } else {
5146
3.09k
        vv->varint_get32 = safe_itf8_get;
5147
3.09k
        vv->varint_get32s = safe_itf8_get;
5148
3.09k
        vv->varint_get64 = safe_ltf8_get;
5149
3.09k
        vv->varint_get64s = safe_ltf8_get;
5150
3.09k
        vv->varint_put32 = safe_itf8_put;
5151
3.09k
        vv->varint_put32s = safe_itf8_put;
5152
3.09k
        vv->varint_put64 = safe_ltf8_put;
5153
3.09k
        vv->varint_put64s = safe_ltf8_put;
5154
3.09k
        vv->varint_put32_blk = itf8_put_blk;
5155
3.09k
        vv->varint_put32s_blk = itf8_put_blk;
5156
3.09k
        vv->varint_put64_blk = ltf8_put_blk;
5157
3.09k
        vv->varint_put64s_blk = ltf8_put_blk;
5158
3.09k
        vv->varint_size = itf8_size;
5159
3.09k
        vv->varint_decode32_crc = itf8_decode_crc;
5160
3.09k
        vv->varint_decode32s_crc = itf8_decode_crc;
5161
3.09k
        vv->varint_decode64_crc = ltf8_decode_crc;
5162
3.09k
    }
5163
3.12k
}
5164
5165
/*
5166
 * Initialises the lookup tables. These could be global statics, but they're
5167
 * clumsy to setup in a multi-threaded environment unless we generate
5168
 * verbatim code and include that.
5169
 */
5170
3.12k
static void cram_init_tables(cram_fd *fd) {
5171
3.12k
    int i;
5172
5173
3.12k
    memset(fd->L1, 4, 256);
5174
3.12k
    fd->L1['A'] = 0; fd->L1['a'] = 0;
5175
3.12k
    fd->L1['C'] = 1; fd->L1['c'] = 1;
5176
3.12k
    fd->L1['G'] = 2; fd->L1['g'] = 2;
5177
3.12k
    fd->L1['T'] = 3; fd->L1['t'] = 3;
5178
5179
3.12k
    memset(fd->L2, 5, 256);
5180
3.12k
    fd->L2['A'] = 0; fd->L2['a'] = 0;
5181
3.12k
    fd->L2['C'] = 1; fd->L2['c'] = 1;
5182
3.12k
    fd->L2['G'] = 2; fd->L2['g'] = 2;
5183
3.12k
    fd->L2['T'] = 3; fd->L2['t'] = 3;
5184
3.12k
    fd->L2['N'] = 4; fd->L2['n'] = 4;
5185
5186
3.12k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
5187
581k
        for (i = 0; i < 0x200; i++) {
5188
580k
            int f = 0;
5189
5190
580k
            if (i & CRAM_FPAIRED)      f |= BAM_FPAIRED;
5191
580k
            if (i & CRAM_FPROPER_PAIR) f |= BAM_FPROPER_PAIR;
5192
580k
            if (i & CRAM_FUNMAP)       f |= BAM_FUNMAP;
5193
580k
            if (i & CRAM_FREVERSE)     f |= BAM_FREVERSE;
5194
580k
            if (i & CRAM_FREAD1)       f |= BAM_FREAD1;
5195
580k
            if (i & CRAM_FREAD2)       f |= BAM_FREAD2;
5196
580k
            if (i & CRAM_FSECONDARY)   f |= BAM_FSECONDARY;
5197
580k
            if (i & CRAM_FQCFAIL)      f |= BAM_FQCFAIL;
5198
580k
            if (i & CRAM_FDUP)         f |= BAM_FDUP;
5199
5200
580k
            fd->bam_flag_swap[i]  = f;
5201
580k
        }
5202
5203
4.64M
        for (i = 0; i < 0x1000; i++) {
5204
4.64M
            int g = 0;
5205
5206
4.64M
            if (i & BAM_FPAIRED)           g |= CRAM_FPAIRED;
5207
4.64M
            if (i & BAM_FPROPER_PAIR)  g |= CRAM_FPROPER_PAIR;
5208
4.64M
            if (i & BAM_FUNMAP)        g |= CRAM_FUNMAP;
5209
4.64M
            if (i & BAM_FREVERSE)      g |= CRAM_FREVERSE;
5210
4.64M
            if (i & BAM_FREAD1)        g |= CRAM_FREAD1;
5211
4.64M
            if (i & BAM_FREAD2)        g |= CRAM_FREAD2;
5212
4.64M
            if (i & BAM_FSECONDARY)    g |= CRAM_FSECONDARY;
5213
4.64M
            if (i & BAM_FQCFAIL)       g |= CRAM_FQCFAIL;
5214
4.64M
            if (i & BAM_FDUP)          g |= CRAM_FDUP;
5215
5216
4.64M
            fd->cram_flag_swap[i] = g;
5217
4.64M
        }
5218
1.98k
    } else {
5219
        /* NOP */
5220
8.14M
        for (i = 0; i < 0x1000; i++)
5221
8.14M
            fd->bam_flag_swap[i] = i;
5222
8.14M
        for (i = 0; i < 0x1000; i++)
5223
8.14M
            fd->cram_flag_swap[i] = i;
5224
1.98k
    }
5225
5226
3.12k
    memset(fd->cram_sub_matrix, 4, 32*32);
5227
103k
    for (i = 0; i < 32; i++) {
5228
99.9k
        fd->cram_sub_matrix[i]['A'&0x1f]=0;
5229
99.9k
        fd->cram_sub_matrix[i]['C'&0x1f]=1;
5230
99.9k
        fd->cram_sub_matrix[i]['G'&0x1f]=2;
5231
99.9k
        fd->cram_sub_matrix[i]['T'&0x1f]=3;
5232
99.9k
        fd->cram_sub_matrix[i]['N'&0x1f]=4;
5233
99.9k
    }
5234
18.7k
    for (i = 0; i < 20; i+=4) {
5235
15.6k
        int j;
5236
327k
        for (j = 0; j < 20; j++) {
5237
312k
            fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
5238
312k
            fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
5239
312k
            fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
5240
312k
            fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
5241
312k
        }
5242
15.6k
        fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+0]&0x1f]=0;
5243
15.6k
        fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+1]&0x1f]=1;
5244
15.6k
        fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+2]&0x1f]=2;
5245
15.6k
        fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+3]&0x1f]=3;
5246
15.6k
    }
5247
5248
3.12k
    cram_init_varint(&fd->vv, CRAM_MAJOR_VERS(fd->version));
5249
3.12k
}
5250
5251
// Default version numbers for CRAM
5252
static int major_version = 3;
5253
static int minor_version = 1;
5254
5255
/*
5256
 * Opens a CRAM file for read (mode "rb") or write ("wb").
5257
 * The filename may be "-" to indicate stdin or stdout.
5258
 *
5259
 * Returns file handle on success
5260
 *         NULL on failure.
5261
 */
5262
0
cram_fd *cram_open(const char *filename, const char *mode) {
5263
0
    hFILE *fp;
5264
0
    cram_fd *fd;
5265
0
    char fmode[3]= { mode[0], '\0', '\0' };
5266
5267
0
    if (strlen(mode) > 1 && (mode[1] == 'b' || mode[1] == 'c')) {
5268
0
        fmode[1] = 'b';
5269
0
    }
5270
5271
0
    fp = hopen(filename, fmode);
5272
0
    if (!fp)
5273
0
        return NULL;
5274
5275
0
    fd = cram_dopen(fp, filename, mode);
5276
0
    if (!fd)
5277
0
        hclose_abruptly(fp);
5278
5279
0
    return fd;
5280
0
}
5281
5282
/* Opens an existing stream for reading or writing.
5283
 *
5284
 * Returns file handle on success;
5285
 *         NULL on failure.
5286
 */
5287
3.12k
cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) {
5288
3.12k
    int i;
5289
3.12k
    char *cp;
5290
3.12k
    cram_fd *fd = calloc(1, sizeof(*fd));
5291
3.12k
    if (!fd)
5292
0
        return NULL;
5293
5294
3.12k
    fd->level = CRAM_DEFAULT_LEVEL;
5295
9.36k
    for (i = 0; mode[i]; i++) {
5296
6.24k
        if (mode[i] >= '0' && mode[i] <= '9') {
5297
0
            fd->level = mode[i] - '0';
5298
0
            break;
5299
0
        }
5300
6.24k
    }
5301
5302
3.12k
    fd->fp = fp;
5303
3.12k
    fd->mode = *mode;
5304
3.12k
    fd->first_container = 0;
5305
3.12k
    fd->curr_position = 0;
5306
5307
3.12k
    if (fd->mode == 'r') {
5308
        /* Reader */
5309
5310
1.19k
        if (!(fd->file_def = cram_read_file_def(fd)))
5311
0
            goto err;
5312
5313
1.19k
        fd->version = fd->file_def->major_version * 256 +
5314
1.19k
            fd->file_def->minor_version;
5315
5316
1.19k
        cram_init_tables(fd);
5317
5318
1.19k
        if (!(fd->header = cram_read_SAM_hdr(fd))) {
5319
31
            cram_free_file_def(fd->file_def);
5320
31
            goto err;
5321
31
        }
5322
5323
1.93k
    } else {
5324
        /* Writer */
5325
1.93k
        cram_file_def *def = calloc(1, sizeof(*def));
5326
1.93k
        if (!def)
5327
0
            return NULL;
5328
5329
1.93k
        fd->file_def = def;
5330
5331
1.93k
        def->magic[0] = 'C';
5332
1.93k
        def->magic[1] = 'R';
5333
1.93k
        def->magic[2] = 'A';
5334
1.93k
        def->magic[3] = 'M';
5335
1.93k
        def->major_version = 0; // Indicator to write file def later.
5336
1.93k
        def->minor_version = 0;
5337
1.93k
        memset(def->file_id, 0, 20);
5338
1.93k
        strncpy(def->file_id, filename, 20);
5339
5340
1.93k
        fd->version = major_version * 256 + minor_version;
5341
1.93k
        cram_init_tables(fd);
5342
5343
        /* SAM header written later along with this file_def */
5344
1.93k
    }
5345
5346
3.09k
    fd->prefix = strdup((cp = strrchr(filename, '/')) ? cp+1 : filename);
5347
3.09k
    if (!fd->prefix)
5348
0
        goto err;
5349
3.09k
    fd->first_base = fd->last_base = -1;
5350
3.09k
    fd->record_counter = 0;
5351
5352
3.09k
    fd->ctr = NULL;
5353
3.09k
    fd->ctr_mt = NULL;
5354
3.09k
    fd->refs  = refs_create();
5355
3.09k
    if (!fd->refs)
5356
0
        goto err;
5357
3.09k
    fd->ref_id = -2;
5358
3.09k
    fd->ref = NULL;
5359
5360
3.09k
    fd->decode_md = 0;
5361
3.09k
    fd->seqs_per_slice = SEQS_PER_SLICE;
5362
3.09k
    fd->bases_per_slice = BASES_PER_SLICE;
5363
3.09k
    fd->slices_per_container = SLICE_PER_CNT;
5364
3.09k
    fd->embed_ref = -1; // automatic selection
5365
3.09k
    fd->no_ref = 0;
5366
3.09k
    fd->no_ref_counter = 0;
5367
3.09k
    fd->ap_delta = 0;
5368
3.09k
    fd->ignore_md5 = 0;
5369
3.09k
    fd->lossy_read_names = 0;
5370
3.09k
    fd->use_bz2 = 0;
5371
3.09k
    fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3);
5372
3.09k
    fd->use_tok = (CRAM_MAJOR_VERS(fd->version) >= 3) && (CRAM_MINOR_VERS(fd->version) >= 1);
5373
3.09k
    fd->use_lzma = 0;
5374
3.09k
    fd->multi_seq = -1;
5375
3.09k
    fd->multi_seq_user = -1;
5376
3.09k
    fd->unsorted   = 0;
5377
3.09k
    fd->shared_ref = 0;
5378
3.09k
    fd->store_md = 0;
5379
3.09k
    fd->store_nm = 0;
5380
3.09k
    fd->last_RI_count = 0;
5381
5382
3.09k
    fd->index       = NULL;
5383
3.09k
    fd->own_pool    = 0;
5384
3.09k
    fd->pool        = NULL;
5385
3.09k
    fd->rqueue      = NULL;
5386
3.09k
    fd->job_pending = NULL;
5387
3.09k
    fd->ooc         = 0;
5388
3.09k
    fd->required_fields = INT_MAX;
5389
5390
3.09k
    pthread_mutex_init(&fd->metrics_lock, NULL);
5391
3.09k
    pthread_mutex_init(&fd->ref_lock, NULL);
5392
3.09k
    pthread_mutex_init(&fd->range_lock, NULL);
5393
3.09k
    pthread_mutex_init(&fd->bam_list_lock, NULL);
5394
5395
148k
    for (i = 0; i < DS_END; i++) {
5396
145k
        fd->m[i] = cram_new_metrics();
5397
145k
        if (!fd->m[i])
5398
0
            goto err;
5399
145k
    }
5400
5401
3.09k
    if (!(fd->tags_used = kh_init(m_metrics)))
5402
0
        goto err;
5403
5404
3.09k
    fd->range.refid = -2; // no ref.
5405
3.09k
    fd->eof = 1;          // See samtools issue #150
5406
3.09k
    fd->ref_fn = NULL;
5407
5408
3.09k
    fd->bl = NULL;
5409
5410
    /* Initialise dummy refs from the @SQ headers */
5411
3.09k
    if (-1 == refs_from_header(fd))
5412
0
        goto err;
5413
5414
3.09k
    return fd;
5415
5416
31
 err:
5417
31
    if (fd)
5418
31
        free(fd);
5419
5420
31
    return NULL;
5421
3.09k
}
5422
5423
/*
5424
 * Seek within a CRAM file.
5425
 *
5426
 * Returns 0 on success
5427
 *        -1 on failure
5428
 */
5429
0
int cram_seek(cram_fd *fd, off_t offset, int whence) {
5430
0
    fd->ooc = 0;
5431
5432
0
    cram_drain_rqueue(fd);
5433
5434
0
    return hseek(fd->fp, offset, whence) >= 0 ? 0 : -1;
5435
0
}
5436
5437
/*
5438
 * Flushes a CRAM file.
5439
 * Useful for when writing to stdout without wishing to close the stream.
5440
 *
5441
 * Returns 0 on success
5442
 *        -1 on failure
5443
 */
5444
0
int cram_flush(cram_fd *fd) {
5445
0
    if (!fd)
5446
0
        return -1;
5447
5448
0
    int ret = 0;
5449
5450
0
    if (fd->mode == 'w' && fd->ctr) {
5451
0
        if(fd->ctr->slice)
5452
0
            cram_update_curr_slice(fd->ctr, fd->version);
5453
5454
0
        if (-1 == cram_flush_container_mt(fd, fd->ctr))
5455
0
            ret = -1;
5456
5457
0
        cram_free_container(fd->ctr);
5458
0
        if (fd->ctr_mt == fd->ctr)
5459
0
            fd->ctr_mt = NULL;
5460
0
        fd->ctr = NULL;
5461
0
    }
5462
5463
0
    return ret;
5464
0
}
5465
5466
/*
5467
 * Writes an EOF block to a CRAM file.
5468
 *
5469
 * Returns 0 on success
5470
 *        -1 on failure
5471
 */
5472
1.79k
int cram_write_eof_block(cram_fd *fd) {
5473
    // EOF block is a container with special values to aid detection
5474
1.79k
    if (CRAM_MAJOR_VERS(fd->version) >= 2) {
5475
        // Empty container with
5476
        //   ref_seq_id -1
5477
        //   start pos 0x454f46 ("EOF")
5478
        //   span 0
5479
        //   nrec 0
5480
        //   counter 0
5481
        //   nbases 0
5482
        //   1 block (landmark 0)
5483
        //   (CRC32)
5484
1.79k
        cram_container c;
5485
1.79k
        memset(&c, 0, sizeof(c));
5486
1.79k
        c.ref_seq_id = -1;
5487
1.79k
        c.ref_seq_start = 0x454f46; // "EOF"
5488
1.79k
        c.ref_seq_span = 0;
5489
1.79k
        c.record_counter = 0;
5490
1.79k
        c.num_bases = 0;
5491
1.79k
        c.num_blocks = 1;
5492
1.79k
        int32_t land[1] = {0};
5493
1.79k
        c.landmark = land;
5494
5495
        // An empty compression header block with
5496
        //   method raw (0)
5497
        //   type comp header (1)
5498
        //   content id 0
5499
        //   block contents size 6
5500
        //   raw size 6
5501
        //     empty preservation map (01 00)
5502
        //     empty data series map (01 00)
5503
        //     empty tag map (01 00)
5504
        //   block CRC
5505
1.79k
        cram_block_compression_hdr ch;
5506
1.79k
        memset(&ch, 0, sizeof(ch));
5507
1.79k
        c.comp_hdr_block = cram_encode_compression_header(fd, &c, &ch, 0);
5508
5509
1.79k
        c.length = c.comp_hdr_block->byte            // Landmark[0]
5510
1.79k
            + 5                                      // block struct
5511
1.79k
            + 4*(CRAM_MAJOR_VERS(fd->version) >= 3); // CRC
5512
1.79k
        if (cram_write_container(fd, &c) < 0 ||
5513
1.79k
            cram_write_block(fd, c.comp_hdr_block) < 0) {
5514
0
            cram_close(fd);
5515
0
            cram_free_block(c.comp_hdr_block);
5516
0
            return -1;
5517
0
        }
5518
1.79k
        if (ch.preservation_map)
5519
0
            kh_destroy(map, ch.preservation_map);
5520
1.79k
        cram_free_block(c.comp_hdr_block);
5521
5522
        // V2.1 bytes
5523
        // 0b 00 00 00 ff ff ff ff 0f // Cont HDR: size, ref seq id
5524
        // e0 45 4f 46 00 00 00       // Cont HDR: pos, span, nrec, counter
5525
        // 00 01 00                   // Cont HDR: nbase, nblk, landmark
5526
        // 00 01 00 06 06             // Comp.HDR blk
5527
        // 01 00 01 00 01 00          // Comp.HDR blk
5528
5529
        // V3.0 bytes:
5530
        // 0f 00 00 00 ff ff ff ff 0f // Cont HDR: size, ref seq id
5531
        // e0 45 4f 46 00 00 00       // Cont HDR: pos, span, nrec, counter
5532
        // 00 01 00                   // Cont HDR: nbase, nblk, landmark
5533
        // 05 bd d9 4f                // CRC32
5534
        // 00 01 00 06 06             // Comp.HDR blk
5535
        // 01 00 01 00 01 00          // Comp.HDR blk
5536
        // ee 63 01 4b                // CRC32
5537
5538
        // V4.0 bytes:
5539
        // 0f 00 00 00 8f ff ff ff    // Cont HDR: size, ref seq id
5540
        // 82 95 9e 46 00 00 00       // Cont HDR: pos, span, nrec, counter
5541
        // 00 01 00                   // Cont HDR: nbase, nblk, landmark
5542
        // ac d6 05 bc                // CRC32
5543
        // 00 01 00 06 06             // Comp.HDR blk
5544
        // 01 00 01 00 01 00          // Comp.HDR blk
5545
        // ee 63 01 4b                // CRC32
5546
1.79k
    }
5547
5548
1.79k
    return 0;
5549
1.79k
}
5550
5551
/*
5552
 * Closes a CRAM file.
5553
 * Returns 0 on success
5554
 *        -1 on failure
5555
 */
5556
3.09k
int cram_close(cram_fd *fd) {
5557
3.09k
    spare_bams *bl, *next;
5558
3.09k
    int i, ret = 0;
5559
5560
3.09k
    if (!fd)
5561
0
        return -1;
5562
5563
3.09k
    if (fd->mode == 'w' && fd->ctr) {
5564
1.18k
        if(fd->ctr->slice)
5565
1.18k
            cram_update_curr_slice(fd->ctr, fd->version);
5566
5567
1.18k
        if (-1 == cram_flush_container_mt(fd, fd->ctr))
5568
139
            ret = -1;
5569
1.18k
    }
5570
5571
3.09k
    if (fd->mode != 'w')
5572
1.16k
        cram_drain_rqueue(fd);
5573
5574
3.09k
    if (fd->pool && fd->eof >= 0 && fd->rqueue) {
5575
0
        hts_tpool_process_flush(fd->rqueue);
5576
5577
0
        if (0 != cram_flush_result(fd))
5578
0
            ret = -1;
5579
5580
0
        if (fd->mode == 'w')
5581
0
            fd->ctr = NULL; // prevent double freeing
5582
5583
        //fprintf(stderr, "CRAM: destroy queue %p\n", fd->rqueue);
5584
5585
0
        hts_tpool_process_destroy(fd->rqueue);
5586
0
    }
5587
5588
3.09k
    pthread_mutex_destroy(&fd->metrics_lock);
5589
3.09k
    pthread_mutex_destroy(&fd->ref_lock);
5590
3.09k
    pthread_mutex_destroy(&fd->range_lock);
5591
3.09k
    pthread_mutex_destroy(&fd->bam_list_lock);
5592
5593
3.09k
    if (ret == 0 && fd->mode == 'w') {
5594
        /* Write EOF block */
5595
1.79k
        if (0 != cram_write_eof_block(fd))
5596
0
            ret = -1;
5597
1.79k
    }
5598
5599
4.16k
    for (bl = fd->bl; bl; bl = next) {
5600
1.07k
        int max_rec = fd->seqs_per_slice * fd->slices_per_container;
5601
5602
1.07k
        next = bl->next;
5603
1.07k
        free_bam_list(bl->bams, max_rec);
5604
1.07k
        free(bl);
5605
1.07k
    }
5606
5607
3.09k
    if (hclose(fd->fp) != 0)
5608
0
        ret = -1;
5609
5610
3.09k
    if (fd->file_def)
5611
3.09k
        cram_free_file_def(fd->file_def);
5612
5613
3.09k
    if (fd->header)
5614
2.99k
        sam_hdr_destroy(fd->header);
5615
5616
3.09k
    free(fd->prefix);
5617
5618
3.09k
    if (fd->ctr)
5619
1.96k
        cram_free_container(fd->ctr);
5620
5621
3.09k
    if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
5622
34
        cram_free_container(fd->ctr_mt);
5623
5624
3.09k
    if (fd->refs)
5625
3.09k
        refs_free(fd->refs);
5626
3.09k
    if (fd->ref_free)
5627
0
        free(fd->ref_free);
5628
5629
148k
    for (i = 0; i < DS_END; i++)
5630
145k
        if (fd->m[i])
5631
145k
            free(fd->m[i]);
5632
5633
3.09k
    if (fd->tags_used) {
5634
3.09k
        khint_t k;
5635
5636
12.6k
        for (k = kh_begin(fd->tags_used); k != kh_end(fd->tags_used); k++) {
5637
9.56k
            if (kh_exist(fd->tags_used, k))
5638
4.99k
                free(kh_val(fd->tags_used, k));
5639
9.56k
        }
5640
5641
3.09k
        kh_destroy(m_metrics, fd->tags_used);
5642
3.09k
    }
5643
5644
3.09k
    if (fd->index)
5645
0
        cram_index_free(fd);
5646
5647
3.09k
    if (fd->own_pool && fd->pool)
5648
0
        hts_tpool_destroy(fd->pool);
5649
5650
3.09k
    if (fd->idxfp)
5651
0
        if (bgzf_close(fd->idxfp) < 0)
5652
0
            ret = -1;
5653
5654
3.09k
    free(fd);
5655
5656
3.09k
    return ret;
5657
3.09k
}
5658
5659
/*
5660
 * Returns 1 if we hit an EOF while reading.
5661
 */
5662
1.96k
int cram_eof(cram_fd *fd) {
5663
1.96k
    return fd->eof;
5664
1.96k
}
5665
5666
5667
/*
5668
 * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
5669
 * Use this immediately after opening.
5670
 *
5671
 * Returns 0 on success
5672
 *        -1 on failure
5673
 */
5674
1.16k
int cram_set_option(cram_fd *fd, enum hts_fmt_option opt, ...) {
5675
1.16k
    int r;
5676
1.16k
    va_list args;
5677
5678
1.16k
    va_start(args, opt);
5679
1.16k
    r = cram_set_voption(fd, opt, args);
5680
1.16k
    va_end(args);
5681
5682
1.16k
    return r;
5683
1.16k
}
5684
5685
/*
5686
 * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
5687
 * Use this immediately after opening.
5688
 *
5689
 * Returns 0 on success
5690
 *        -1 on failure
5691
 */
5692
1.16k
int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) {
5693
1.16k
    refs_t *refs;
5694
5695
1.16k
    if (!fd) {
5696
0
        errno = EBADF;
5697
0
        return -1;
5698
0
    }
5699
5700
1.16k
    switch (opt) {
5701
1.16k
    case CRAM_OPT_DECODE_MD:
5702
1.16k
        fd->decode_md = va_arg(args, int);
5703
1.16k
        break;
5704
5705
0
    case CRAM_OPT_PREFIX:
5706
0
        if (fd->prefix)
5707
0
            free(fd->prefix);
5708
0
        if (!(fd->prefix = strdup(va_arg(args, char *))))
5709
0
            return -1;
5710
0
        break;
5711
5712
0
    case CRAM_OPT_VERBOSITY:
5713
0
        break;
5714
5715
0
    case CRAM_OPT_SEQS_PER_SLICE:
5716
0
        fd->seqs_per_slice = va_arg(args, int);
5717
0
        if (fd->bases_per_slice == BASES_PER_SLICE)
5718
0
            fd->bases_per_slice = fd->seqs_per_slice * 500;
5719
0
        break;
5720
5721
0
    case CRAM_OPT_BASES_PER_SLICE:
5722
0
        fd->bases_per_slice = va_arg(args, int);
5723
0
        break;
5724
5725
0
    case CRAM_OPT_SLICES_PER_CONTAINER:
5726
0
        fd->slices_per_container = va_arg(args, int);
5727
0
        break;
5728
5729
0
    case CRAM_OPT_EMBED_REF:
5730
0
        fd->embed_ref = va_arg(args, int);
5731
0
        break;
5732
5733
0
    case CRAM_OPT_NO_REF:
5734
0
        fd->no_ref = va_arg(args, int);
5735
0
        break;
5736
5737
0
    case CRAM_OPT_POS_DELTA:
5738
0
        fd->ap_delta = va_arg(args, int);
5739
0
        break;
5740
5741
0
    case CRAM_OPT_IGNORE_MD5:
5742
0
        fd->ignore_md5 = va_arg(args, int);
5743
0
        break;
5744
5745
0
    case CRAM_OPT_LOSSY_NAMES:
5746
0
        fd->lossy_read_names = va_arg(args, int);
5747
        // Currently lossy read names required paired (attached) reads.
5748
        // TLEN 0 or being 1 out causes read pairs to be detached, breaking
5749
        // the lossy read name compression, so we have extra options to
5750
        // slacken the exact TLEN round-trip checks.
5751
0
        fd->tlen_approx = fd->lossy_read_names;
5752
0
        fd->tlen_zero = fd->lossy_read_names;
5753
0
        break;
5754
5755
0
    case CRAM_OPT_USE_BZIP2:
5756
0
        fd->use_bz2 = va_arg(args, int);
5757
0
        break;
5758
5759
0
    case CRAM_OPT_USE_RANS:
5760
0
        fd->use_rans = va_arg(args, int);
5761
0
        break;
5762
5763
0
    case CRAM_OPT_USE_TOK:
5764
0
        fd->use_tok = va_arg(args, int);
5765
0
        break;
5766
5767
0
    case CRAM_OPT_USE_FQZ:
5768
0
        fd->use_fqz = va_arg(args, int);
5769
0
        break;
5770
5771
0
    case CRAM_OPT_USE_ARITH:
5772
0
        fd->use_arith = va_arg(args, int);
5773
0
        break;
5774
5775
0
    case CRAM_OPT_USE_LZMA:
5776
0
        fd->use_lzma = va_arg(args, int);
5777
0
        break;
5778
5779
0
    case CRAM_OPT_SHARED_REF:
5780
0
        fd->shared_ref = 1;
5781
0
        refs = va_arg(args, refs_t *);
5782
0
        if (refs != fd->refs) {
5783
0
            if (fd->refs)
5784
0
                refs_free(fd->refs);
5785
0
            fd->refs = refs;
5786
0
            fd->refs->count++;
5787
0
        }
5788
0
        break;
5789
5790
0
    case CRAM_OPT_RANGE: {
5791
0
        int r = cram_seek_to_refpos(fd, va_arg(args, cram_range *));
5792
0
        pthread_mutex_lock(&fd->range_lock);
5793
//        printf("opt range noseek to %p %d:%ld-%ld\n",
5794
//               fd, fd->range.refid, fd->range.start, fd->range.end);
5795
0
        if (fd->range.refid != -2)
5796
0
            fd->required_fields |= SAM_POS;
5797
0
        pthread_mutex_unlock(&fd->range_lock);
5798
0
        return r;
5799
0
    }
5800
5801
0
    case CRAM_OPT_RANGE_NOSEEK: {
5802
        // As per CRAM_OPT_RANGE, but no seeking
5803
0
        pthread_mutex_lock(&fd->range_lock);
5804
0
        cram_range *r = va_arg(args, cram_range *);
5805
0
        fd->range = *r;
5806
0
        if (r->refid == HTS_IDX_NOCOOR) {
5807
0
            fd->range.refid = -1;
5808
0
            fd->range.start = 0;
5809
0
        } else if (r->refid == HTS_IDX_START || r->refid == HTS_IDX_REST) {
5810
0
            fd->range.refid = -2; // special case in cram_next_slice
5811
0
        }
5812
0
        if (fd->range.refid != -2)
5813
0
            fd->required_fields |= SAM_POS;
5814
0
        fd->ooc = 0;
5815
0
        fd->eof = 0;
5816
0
        pthread_mutex_unlock(&fd->range_lock);
5817
0
        return 0;
5818
0
    }
5819
5820
0
    case CRAM_OPT_REFERENCE:
5821
0
        return cram_load_reference(fd, va_arg(args, char *));
5822
5823
0
    case CRAM_OPT_VERSION: {
5824
0
        int major, minor;
5825
0
        char *s = va_arg(args, char *);
5826
0
        if (2 != sscanf(s, "%d.%d", &major, &minor)) {
5827
0
            hts_log_error("Malformed version string %s", s);
5828
0
            return -1;
5829
0
        }
5830
0
        if (!((major == 1 &&  minor == 0) ||
5831
0
              (major == 2 && (minor == 0 || minor == 1)) ||
5832
0
              (major == 3 && (minor == 0 || minor == 1)) ||
5833
0
              (major == 4 &&  minor == 0))) {
5834
0
            hts_log_error("Unknown version string; use 1.0, 2.0, 2.1, 3.0, 3.1 or 4.0");
5835
0
            errno = EINVAL;
5836
0
            return -1;
5837
0
        }
5838
5839
0
        if (major > 3 || (major == 3 && minor > 1)) {
5840
0
            hts_log_warning(
5841
0
                "CRAM version %s is still a draft and subject to change.\n"
5842
0
                "This is a technology demonstration that should not be "
5843
0
                "used for archival data.", s);
5844
0
        }
5845
5846
0
        fd->version = major*256 + minor;
5847
5848
0
        fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3) ? 1 : 0;
5849
5850
0
        fd->use_tok = ((CRAM_MAJOR_VERS(fd->version) == 3 &&
5851
0
                        CRAM_MINOR_VERS(fd->version) >= 1) ||
5852
0
                        CRAM_MAJOR_VERS(fd->version) >= 4) ? 1 : 0;
5853
0
        cram_init_tables(fd);
5854
5855
0
        break;
5856
0
    }
5857
5858
0
    case CRAM_OPT_MULTI_SEQ_PER_SLICE:
5859
0
        fd->multi_seq_user = fd->multi_seq = va_arg(args, int);
5860
0
        break;
5861
5862
0
    case CRAM_OPT_NTHREADS: {
5863
0
        int nthreads =  va_arg(args, int);
5864
0
        if (fd->pool)
5865
0
            return -2;  //already exists!
5866
0
        if (nthreads >= 1) {
5867
0
            if (!(fd->pool = hts_tpool_init(nthreads)))
5868
0
                return -1;
5869
5870
0
            fd->rqueue = hts_tpool_process_init(fd->pool, nthreads*2, 0);
5871
0
            fd->shared_ref = 1;
5872
0
            fd->own_pool = 1;
5873
0
        }
5874
0
        break;
5875
0
    }
5876
5877
0
    case CRAM_OPT_THREAD_POOL: {
5878
0
        htsThreadPool *p = va_arg(args, htsThreadPool *);
5879
0
        if (fd->pool)
5880
0
            return -2;  //already exists!
5881
0
        fd->pool = p ? p->pool : NULL;
5882
0
        if (fd->pool) {
5883
0
            fd->rqueue = hts_tpool_process_init(fd->pool,
5884
0
                                                p->qsize ? p->qsize : hts_tpool_size(fd->pool)*2,
5885
0
                                                0);
5886
0
        }
5887
0
        fd->shared_ref = 1; // Needed to avoid clobbering ref between threads
5888
0
        fd->own_pool = 0;
5889
5890
        //fd->qsize = 1;
5891
        //fd->decoded = calloc(fd->qsize, sizeof(cram_container *));
5892
        //hts_tpool_dispatch(fd->pool, cram_decoder_thread, fd);
5893
0
        break;
5894
0
    }
5895
5896
0
    case CRAM_OPT_REQUIRED_FIELDS:
5897
0
        fd->required_fields = va_arg(args, int);
5898
0
        if (fd->range.refid != -2)
5899
0
            fd->required_fields |= SAM_POS;
5900
0
        break;
5901
5902
0
    case CRAM_OPT_STORE_MD:
5903
0
        fd->store_md = va_arg(args, int);
5904
0
        break;
5905
5906
0
    case CRAM_OPT_STORE_NM:
5907
0
        fd->store_nm = va_arg(args, int);
5908
0
        break;
5909
5910
0
    case HTS_OPT_COMPRESSION_LEVEL:
5911
0
        fd->level = va_arg(args, int);
5912
0
        break;
5913
5914
0
    case HTS_OPT_PROFILE: {
5915
0
        enum hts_profile_option prof = va_arg(args, int);
5916
0
        switch (prof) {
5917
0
        case HTS_PROFILE_FAST:
5918
0
            if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 1;
5919
0
            fd->use_tok = 0;
5920
0
            fd->seqs_per_slice = 10000;
5921
0
            break;
5922
5923
0
        case HTS_PROFILE_NORMAL:
5924
0
            break;
5925
5926
0
        case HTS_PROFILE_SMALL:
5927
0
            if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 6;
5928
0
            fd->use_bz2 = 1;
5929
0
            fd->use_fqz = 1;
5930
0
            fd->seqs_per_slice = 25000;
5931
0
            break;
5932
5933
0
        case HTS_PROFILE_ARCHIVE:
5934
0
            if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 7;
5935
0
            fd->use_bz2 = 1;
5936
0
            fd->use_fqz = 1;
5937
0
            fd->use_arith = 1;
5938
0
            if (fd->level > 7)
5939
0
                fd->use_lzma = 1;
5940
0
            fd->seqs_per_slice = 100000;
5941
0
            break;
5942
0
        }
5943
5944
0
        if (fd->bases_per_slice == BASES_PER_SLICE)
5945
0
            fd->bases_per_slice = fd->seqs_per_slice * 500;
5946
0
        break;
5947
0
    }
5948
5949
0
    default:
5950
0
        hts_log_error("Unknown CRAM option code %d", opt);
5951
0
        errno = EINVAL;
5952
0
        return -1;
5953
1.16k
    }
5954
5955
1.16k
    return 0;
5956
1.16k
}
5957
5958
int cram_check_EOF(cram_fd *fd)
5959
0
{
5960
    // Byte 9 in these templates is & with 0x0f to resolve differences
5961
    // between ITF-8 interpretations between early Java and C
5962
    // implementations of CRAM
5963
0
    static const unsigned char TEMPLATE_2_1[30] = {
5964
0
        0x0b, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0f, 0xe0,
5965
0
        0x45, 0x4f, 0x46, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
5966
0
        0x01, 0x00, 0x06, 0x06, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00
5967
0
    };
5968
0
    static const unsigned char TEMPLATE_3[38] = {
5969
0
        0x0f, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0f, 0xe0,
5970
0
        0x45, 0x4f, 0x46, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x05,
5971
0
        0xbd, 0xd9, 0x4f, 0x00, 0x01, 0x00, 0x06, 0x06, 0x01, 0x00,
5972
0
        0x01, 0x00, 0x01, 0x00, 0xee, 0x63, 0x01, 0x4b
5973
0
    };
5974
5975
0
    unsigned char buf[38]; // max(sizeof TEMPLATE_*)
5976
5977
0
    uint8_t major = CRAM_MAJOR_VERS(fd->version);
5978
0
    uint8_t minor = CRAM_MINOR_VERS(fd->version);
5979
5980
0
    const unsigned char *template;
5981
0
    ssize_t template_len;
5982
0
    if ((major < 2) ||
5983
0
        (major == 2 && minor == 0)) {
5984
0
        return 3; // No EOF support in cram versions less than 2.1
5985
0
    } else if (major == 2 && minor == 1) {
5986
0
        template = TEMPLATE_2_1;
5987
0
        template_len = sizeof TEMPLATE_2_1;
5988
0
    } else {
5989
0
        template = TEMPLATE_3;
5990
0
        template_len = sizeof TEMPLATE_3;
5991
0
    }
5992
5993
0
    off_t offset = htell(fd->fp);
5994
0
    if (hseek(fd->fp, -template_len, SEEK_END) < 0) {
5995
0
        if (errno == ESPIPE) {
5996
0
            hclearerr(fd->fp);
5997
0
            return 2;
5998
0
        }
5999
0
        else {
6000
0
            return -1;
6001
0
        }
6002
0
    }
6003
0
    if (hread(fd->fp, buf, template_len) != template_len) return -1;
6004
0
    if (hseek(fd->fp, offset, SEEK_SET) < 0) return -1;
6005
0
    buf[8] &= 0x0f;
6006
0
    return (memcmp(template, buf, template_len) == 0)? 1 : 0;
6007
0
}